def baseline(filecontent):
    """Use baseline extraction function targeting JSON metadata and/or text paragraphs"""
    tree = load_html(filecontent)
    body = etree.Element('body')
    # stage 1: JSON-LD metadata — look for an articleBody field in embedded scripts
    for script in tree.xpath('//script[@type="application/ld+json"]'):
        if not script.text or '"articleBody":' not in script.text:
            continue
        jsonmatch = re.search(r'"articleBody":"(.+?)","', script.text)
        if jsonmatch is None:
            continue
        candidate = jsonmatch.group(1).replace('\\"', '"')
        # temp_text = trim(temp_text)
        body = etree.Element('body')
        paragraph = etree.Element('p')
        paragraph.text = candidate
        body.append(paragraph)
        return body, len(candidate), candidate
    # stage 2: first <article> element, if any
    articles = tree.xpath('//article')  # |//main
    if articles:
        candidate = sanitize(articles[0].text_content())
        if len(candidate) > 0:
            paragraph = etree.Element('p')
            paragraph.text = candidate
            body.append(paragraph)
            return body, len(candidate), candidate
    # stage 3: collect de-duplicated text paragraphs in document order
    #search_tree = discard_unwanted(tree)
    # search_tree = prune_html(tree)
    seen = set()
    for node in tree.iter('blockquote', 'code', 'p', 'pre', 'q', 'quote'):
        snippet = node.text_content()
        if snippet in seen:
            continue
        seen.add(snippet)
        paragraph = etree.Element('p')
        paragraph.text = snippet
        body.append(paragraph)
    fulltext = sanitize('\n'.join(body.itertext()))
    return body, len(fulltext), fulltext
def test_trim():
    '''test string trimming'''
    # surrounding whitespace is stripped
    assert trim(' Test ') == 'Test'
    assert trim('\t\tTest Test\r\n') == 'Test Test'
    # element-level text filtering
    elem = etree.Element('body')
    elem.text = 'Test Text'
    assert textfilter(elem) is False
    # my_elem.text = 'Tags: Arbeit, Urlaub'
    elem.text = 'Instagram'
    assert textfilter(elem) is True
    elem.text = '\t\t'
    assert textfilter(elem) is True
    # sanitize logic
    assert utils.sanitize(None) is None
    # non-breaking spaces
    print(utils.sanitize('Test Text'))
    assert utils.sanitize('Test Text') == 'Test Text'
def sanitize_tree(tree):
    '''Sanitize the output from the generic algorithm'''
    # flatten leftover div wrappers, then discard unwanted subtrees
    etree.strip_tags(tree, 'div')
    pruned = prune_html(tree)
    converted = convert_tags(pruned)
    # normalize whitespace/control characters in every text and tail node
    for node in converted.iter():
        if node.text is not None:
            node.text = sanitize(node.text)
        if node.tail is not None:
            node.tail = sanitize(node.tail)
    return converted
def run_jparser(htmlstring):
    '''Try text extraction with jparser.

    Builds a jparser PageModel from the HTML string, keeps the textual
    content nodes, strips whitespace before punctuation and returns the
    sanitized result (empty string if the page cannot be modelled).
    '''
    try:
        pm = PageModel(htmlstring)
    except ValueError:
        return ''
    result = pm.extract()
    # keep only the textual payloads from jparser's content list
    mylist = [str(x['data']) for x in result['content'] if x['type'] in ('text', 'html')]
    returnstring = ' '.join(mylist)
    # returnstring = re.sub(r'\s+', ' ', returnstring)
    # bugfix: the old pattern r'\s+(p{P}+)' matched a literal "p{P}" sequence
    # (\p{P} is unsupported by the stdlib `re` module) and the non-raw
    # replacement '\1' inserted the control character \x01 instead of the
    # captured group. Drop whitespace before common punctuation instead.
    returnstring = re.sub(r'\s+([.,;:!?])', r'\1', returnstring)
    return sanitize(returnstring)
def extract(filecontent, url=None, record_id='0001', no_fallback=False,
            include_comments=False, csv_output=False, xml_output=False,
            tei_output=False, tei_validation=False, target_language=None,
            include_tables=True, include_formatting=False):
    '''Main process for text extraction.

    Args:
        filecontent: HTML document (whatever load_html accepts).
        url: original URL, used for metadata extraction and logging.
        record_id: identifier included in log messages.
        no_fallback: if True, skip the backup/comparison extraction and
            only fall back to the baseline on unsure short results.
        include_comments: also extract the comments section.
        csv_output, xml_output, tei_output: select the serialization
            format (default: plain text).
        tei_validation: validate the TEI result (only with tei_output).
        target_language: language code used by the language filter.
        include_tables: keep table elements during manual cleaning.
        include_formatting: keep 'hi' elements in XML/TEI output.

    Returns:
        The extracted document serialized in the chosen format, or None
        if parsing failed or the result did not pass the sanity checks
        (minimum size, language filter, duplicate test).
    '''
    # init
    tree = load_html(filecontent)
    if tree is None:
        return None
    # Metadata here: only needed for the structured output formats
    if csv_output is True or xml_output is True or tei_output is True:
        docmeta = extract_metadata(tree, default_url=url)
    else:
        docmeta = None
    # backup (or not) for further processing — the fallback comparison
    # needs a pristine copy before cleaning mutates the tree
    if no_fallback is False:
        backup_tree = deepcopy(tree)
    else:
        backup_tree = None
    # clean
    cleaned_tree = manual_cleaning(tree, include_tables)
    # save space and processing time
    cleaned_tree = prune_html(cleaned_tree)
    # use LXML cleaner
    cleaned_tree = HTML_CLEANER.clean_html(cleaned_tree)
    # tree_cache[cleaned_tree] = list(cleaned_tree.iter())
    # convert tags, the rest does not work without conversion
    cleaned_tree = convert_tags(cleaned_tree)
    # remove hi-element to avoid tail bug
    if (xml_output is False and tei_output is False) or include_formatting is False:
        etree.strip_tags(cleaned_tree, 'hi')
    # comments first, then remove
    if include_comments is True:
        commentsbody, temp_comments, len_comments, cleaned_tree = extract_comments(cleaned_tree)
    else:
        commentsbody, temp_comments, len_comments = None, '', 0
    # extract content
    postbody, temp_text, len_text, sure_thing = extract_content(cleaned_tree, include_tables)
    # compare if necessary
    if no_fallback is False:  # and sure_thing is False:
        postbody, temp_text, len_text = compare_extraction(backup_tree, url, postbody, temp_text, len_text)
        # try with justext
        if len_text < MIN_EXTRACTED_SIZE:
            LOGGER.error('not enough text %s %s', record_id, url)
            postbody, len_text, temp_text = justext_rescue(tree, url, target_language, postbody, len_text, temp_text)
            LOGGER.error('justext length %s', len_text)
            # second backup
            # if len_text < MIN_EXTRACTED_SIZE:
            #     postbody, len_text, temp_text = baseline(filecontent)
    else:
        # rescue: try to use original/dirty tree
        if sure_thing is False and len_text < MIN_EXTRACTED_SIZE:
            postbody, len_text, temp_text = baseline(filecontent)
            #tree = load_html(filecontent)
            #tree = convert_tags(tree)
            #postbody, temp_text, len_text, sure_thing = extract_content(tree)
            LOGGER.debug('non-clean extracted length: %s (extraction)', len_text)
    if len_comments < MIN_EXTRACTED_COMM_SIZE:
        LOGGER.info('not enough comments %s %s', record_id, url)
    # reject documents where both text and comments are too short
    if len_text < MIN_OUTPUT_SIZE and len_comments < MIN_OUTPUT_COMM_SIZE:
        LOGGER.info('text and comments not long enough: %s %s', len_text, len_comments)
        return None
    # sanity check on language
    if language_filter(temp_text, temp_comments, target_language, record_id, url) is True:
        return None
    # check duplicates at body level
    if duplicate_test(postbody) is True:
        return None
    # cache elements
    put_in_cache(postbody)
    if commentsbody is not None:
        put_in_cache(commentsbody)
    # XML (TEI) steps
    if xml_output is True or tei_output is True:
        if xml_output is True:
            output = build_xml_output(postbody, commentsbody)
            output = add_xml_meta(output, docmeta)
        elif tei_output is True:
            output = build_tei_output(postbody, commentsbody, docmeta)
        # can be improved: serialize, sanitize, then re-parse the result
        control_string = etree.tostring(output, encoding='unicode')
        control_string = sanitize(control_string)
        # necessary for cleaning
        control_parser = etree.XMLParser(remove_blank_text=True)
        output_tree = etree.fromstring(control_string, control_parser)
        # validate
        if tei_output is True and tei_validation is True:
            result = validate_tei(output_tree)
            LOGGER.info('TEI validation result: %s %s %s', result, record_id, docmeta.url)
        returnstring = etree.tostring(output_tree, pretty_print=True, encoding='unicode').strip()
    # CSV + TXT output
    else:
        if csv_output is True:
            posttext = xmltotxt(postbody)
            commentstext = xmltotxt(commentsbody)
            returnstring = txttocsv(posttext, commentstext, docmeta)
        else:
            output = build_xml_output(postbody, commentsbody)
            returnstring = xmltotxt(output)
    return returnstring