def custom_justext(tree, stoplist):
    """Run the customized jusText pipeline on ``tree`` and return its paragraphs.

    The tree is preprocessed, split into paragraphs, classified against
    ``stoplist`` and then revised; the resulting paragraph list is returned.
    """
    # Alternative left disabled by the original author: tree_cleaning(tree, True)
    cleaned_dom = preprocessor(tree)
    segments = ParagraphMaker.make_paragraphs(cleaned_dom)
    # NOTE(review): the positional values are presumably length_low, length_high,
    # stopwords_low, stopwords_high, max_link_density and a headings flag --
    # confirm against the classify_paragraphs signature before renaming them.
    classify_paragraphs(segments, stoplist, 50, 200, 0.1, 0.2, 0.2, True)
    revise_paragraph_classification(segments, 200)
    return segments
def test_length_low(self):
    # Two paragraphs share the same text and differ only in how many of their
    # characters are reported as link characters (0 vs. 20). With length_low
    # raised to 1000, the link-free one must be classified "short" and the
    # other "bad".
    sample_text = "0 1 2 3 4 5 6 7 8 9" * 2
    paragraphs = [
        self._paragraph(text=sample_text, chars_count_in_links=link_chars)
        for link_chars in (0, 20)
    ]

    classify_paragraphs(paragraphs, (), max_link_density=1, length_low=1000)

    for paragraph, expected_class in zip(paragraphs, ("short", "bad")):
        tools.assert_equal(paragraph.cf_class, expected_class)
def test_stopwords_high(self):
    # One paragraph holds the digit sequence once (19 characters), the other
    # holds it twice (38 characters). With stopwords_high=0 and length_high=20
    # the first must come out "neargood" and the second "good".
    digits = "0 1 2 3 4 5 6 7 8 9"
    paragraphs = [
        self._paragraph(text=content) for content in (digits, digits * 2)
    ]

    classify_paragraphs(
        paragraphs,
        ("0",),
        max_link_density=1,
        length_low=0,
        stopwords_high=0,
        length_high=20,
    )

    for paragraph, expected_class in zip(paragraphs, ("neargood", "good")):
        tools.assert_equal(paragraph.cf_class, expected_class)
def test_stopwords_low(self):
    # Three paragraphs contain a decreasing share of the stoplist tokens
    # "0" and "1". With stopwords_low=0.2 and an unreachable stopwords_high,
    # the first two must be "neargood" and the last one "bad".
    # NOTE(review): assuming whitespace tokenisation the shares are 5/13, 2/10
    # and 1/9 -- confirm against the classifier's word splitting.
    texts = (
        "0 0 0 0 1 2 3 4 5 6 7 8 9",
        "0 1 2 3 4 5 6 7 8 9",
        "1 2 3 4 5 6 7 8 9",
    )
    paragraphs = [self._paragraph(text=content) for content in texts]

    classify_paragraphs(
        paragraphs,
        ("0", "1",),
        max_link_density=1,
        length_low=0,
        stopwords_high=1000,
        stopwords_low=0.2,
    )

    expected_classes = ("neargood", "neargood", "bad")
    for paragraph, expected_class in zip(paragraphs, expected_classes):
        tools.assert_equal(paragraph.cf_class, expected_class)
def html2prevert(s, justext_wordlist, justext_level='strict', allowshort=False):
    """Extract the jusText-approved paragraphs of an HTML string as prevertical.

    Args:
        s: HTML text handed to ``justext.preprocess`` with ``encoding='utf-8'``.
        justext_wordlist: Stoplist passed to ``justext.classify_paragraphs``.
        justext_level: Key into ``JUSTEXT_PARAMS_BY_LEVEL`` selecting the
            threshold set.
        allowshort: When true, the two length thresholds and the maximum
            heading distance are each divided by 3.

    Returns:
        A ``(prevert, paragraph_count, plaintext_len)`` tuple: the kept
        paragraphs wrapped in ``<p>`` elements and joined by newlines, how
        many were kept, and the summed length of their escaped text.
        ``('', 0, 0)`` is returned when parsing raises ``ParserError`` or
        ``XMLSyntaxError``.
    """
    # TODO: preclean HTML (remove tables, infoboxes, References, TOC, ...)
    try:
        html_root = justext.preprocess(html_text=s, encoding='utf-8')
        paragraphs = justext.make_paragraphs(html_root)
    except (ParserError, XMLSyntaxError):
        return ('', 0, 0)

    # Use jusText to classify paragraphs with the thresholds of the chosen level.
    level_params = JUSTEXT_PARAMS_BY_LEVEL[justext_level]
    length_low = level_params['length_low']
    length_high = level_params['length_high']
    heading_distance = level_params['max_heading_distance']
    if allowshort:
        length_low, length_high, heading_distance = (
            length_low / 3,
            length_high / 3,
            heading_distance / 3,
        )

    justext.classify_paragraphs(
        paragraphs=paragraphs,
        stoplist=justext_wordlist,
        # character count < length_low => bad or short
        length_low=length_low,
        # character count > length_high => good
        length_high=length_high,
        # number of words frequent in the language >= stopwords_low => neargood
        stopwords_low=level_params['stopwords_low'],
        # number of words frequent in the language >= stopwords_high
        # => good or neargood
        stopwords_high=level_params['stopwords_high'],
        # density of link words (words inside the <a> tag) > max_link_density
        # => bad
        max_link_density=level_params['max_link_density'],
    )
    justext.revise_paragraph_classification(
        paragraphs=paragraphs,
        # Short/near-good heads in the distance [chars] before a good par => good
        max_heading_distance=heading_distance,
    )

    # Extract good paragraphs.
    # TODO find why filtering on p['class'] == 'good' does not produce a good
    # result; the context-free class is used instead.
    kept_classes = ('good', 'neargood')  # of 'good', 'neargood', 'short', 'bad'
    prevert_chunks = []
    plaintext_len = 0
    for par in paragraphs:
        if par['cfclass'] not in kept_classes:
            continue
        escaped = justext.html_escape(par['text']).strip()
        if not escaped:
            # Nothing left after escaping and stripping; do not count it.
            continue
        plaintext_len += len(escaped)
        heading_attr = ' heading="1"' if par['heading'] else ''
        prevert_chunks.append('<p%s>\n%s\n</p>' % (heading_attr, escaped))

    # Exactly one chunk is appended per kept paragraph, so the list length is
    # the paragraph count.
    return ('\n'.join(prevert_chunks), len(prevert_chunks), plaintext_len)
def test_max_link_density(self):
    # Each fixture is (text repetitions, characters reported as link text);
    # the texts are 20 or 80 characters long. With max_link_density=0.5 and an
    # empty stoplist only the link-free 20-character paragraph may be "short";
    # every other paragraph must be "bad".
    # NOTE(review): the 39/80 and 40/80 cases do not exceed 0.5, so they are
    # presumably rejected by a different default threshold -- confirm against
    # classify_paragraphs.
    unit = "0123456789"
    fixtures = ((2, 0), (2, 20), (8, 40), (8, 39), (8, 41))
    paragraphs = [
        self._paragraph(text=unit * repeats, chars_count_in_links=link_chars)
        for repeats, link_chars in fixtures
    ]

    classify_paragraphs(paragraphs, (), max_link_density=0.5)

    expected_classes = ("short", "bad", "bad", "bad", "bad")
    for paragraph, expected_class in zip(paragraphs, expected_classes):
        tools.assert_equal(paragraph.cf_class, expected_class)