def add_bs_xml_content(text: str, f: Parsed, lang: str):
    """
    Feed text obtained from the BeautifulSoup XML parser into a Parsed object.

    The text is first cut at every '.', '!' or '?' and each piece is stored
    sentence-wise in four variants (cleaned, stemmed, filtered,
    filtered + stemmed). Afterwards the same four variants are computed for
    the text as a whole and stored as well.

    Pieces whose cleaned form holds at most one item are skipped entirely;
    the filtered variants are skipped when at most one item survives
    filtering.
    """
    # Sentence-level variants.
    for piece in re.split("[.!?]", text):
        tokens = clean_text(piece)
        if len(tokens) <= 1:
            continue

        f.add_content_sent(" ".join(tokens))
        stemmed_tokens = stem_text(tokens, lang)
        f.add_stemmed_sent(" ".join(stemmed_tokens))

        kept_tokens = filter_text(tokens, lang)
        if len(kept_tokens) <= 1:
            continue

        f.add_filtered_sent(" ".join(kept_tokens))
        kept_stemmed_tokens = stem_text(kept_tokens, lang)
        f.add_filtered_stemmed_sent(" ".join(kept_stemmed_tokens))

    # Whole-text variants (stored as returned by the helpers, not joined).
    all_tokens = clean_text(text)
    f.add_content(all_tokens)

    all_stemmed = stem_text(all_tokens, lang)
    f.add_stemmed(all_stemmed)

    all_kept = filter_text(all_tokens, lang)
    f.add_filtered(all_kept)

    all_kept_stemmed = stem_text(all_kept, lang)
    f.add_filtered_stemmed(all_kept_stemmed)
def add_xml_content(root, file: Parsed, language: str):
    """
    Transform the text of an XML node into raw/filtered/stemmed forms and
    add them to a file object.

    The node's ``text`` and ``tail`` attributes are combined (the tail is
    prefixed with a single space). If the combined text is empty, nothing is
    added to ``file``.

    :param root: object exposing ``text`` and ``tail`` attributes, each
        either a string or None (presumably an ElementTree-style element;
        verify against the caller).
    :param file: Parsed object that receives the sentence-level and
        full-text variants.
    :param language: language identifier forwarded to ``stem_text`` and
        ``filter_text``.
    """
    # Compare against None directly: the previous ``str(x) != 'None'`` test
    # also discarded nodes whose text was literally the string "None".
    text = ''
    if root.text is not None:
        text += root.text
    if root.tail is not None:
        text += ' ' + root.tail

    if not text:
        return

    # Sentence-level variants: split after '.', '!' or '?' followed by
    # one or more spaces, keeping the punctuation with its sentence.
    for sentence in re.split('(?<=[.!?]) +', text):
        sentence = clean_text(sentence)
        # Sentences that clean down to one item or fewer are skipped.
        if len(sentence) <= 1:
            continue

        file.add_content_sent(" ".join(sentence))

        sentence_stemmed = stem_text(sentence, language)
        file.add_stemmed_sent(" ".join(sentence_stemmed))

        sentence_filtered = filter_text(sentence, language)
        # Same threshold applies after filtering.
        if len(sentence_filtered) <= 1:
            continue

        file.add_filtered_sent(" ".join(sentence_filtered))
        sentence_filtered_stemmed = stem_text(sentence_filtered, language)
        file.add_filtered_stemmed_sent(" ".join(sentence_filtered_stemmed))

    # Full-text variants, stored as returned by the helpers (not joined).
    text_list = clean_text(text)
    file.add_content(text_list)

    stemmed = stem_text(text_list, language)
    file.add_stemmed(stemmed)

    # Filtering is applied to the unstemmed text, then stemmed separately.
    filtered = filter_text(text_list, language)
    file.add_filtered(filtered)

    filtered_stemmed = stem_text(filtered, language)
    file.add_filtered_stemmed(filtered_stemmed)
def add_content(text: str, file: Parsed, language: str):
    """
    Derive raw/stemmed/filtered/filtered-stemmed forms of a text and store
    them on a file object, first per sentence and then for the whole text.

    Sentences are obtained by splitting after '.', '!' or '?' followed by
    one or more spaces. A sentence is stored only if its cleaned form has
    more than one item; its filtered forms are stored only if more than one
    item remains after filtering.
    """
    for raw_sentence in re.split('(?<=[.!?]) +', text):
        words = clean_text(raw_sentence)
        if len(words) <= 1:
            # nothing worth storing for this sentence
            continue

        # cleaned sentence
        file.add_content_sent(" ".join(words))

        # stemmed sentence
        words_stemmed = stem_text(words, language)
        file.add_stemmed_sent(" ".join(words_stemmed))

        # filtered sentence, and its stemmed counterpart
        words_filtered = filter_text(words, language)
        if len(words_filtered) > 1:
            file.add_filtered_sent(" ".join(words_filtered))
            words_filtered_stemmed = stem_text(words_filtered, language)
            file.add_filtered_stemmed_sent(" ".join(words_filtered_stemmed))

    # whole text, cleaned
    whole = clean_text(text)
    file.add_content(whole)

    # whole text, stemmed
    whole_stemmed = stem_text(whole, language)
    file.add_stemmed(whole_stemmed)

    # whole text, filtered (from the unstemmed form)
    whole_filtered = filter_text(whole, language)
    file.add_filtered(whole_filtered)

    # whole text, filtered then stemmed
    whole_filtered_stemmed = stem_text(whole_filtered, language)
    file.add_filtered_stemmed(whole_filtered_stemmed)