Example #1
from justext.core import (ParagraphMaker, classify_paragraphs, preprocessor,
                          revise_paragraph_classification)

def custom_justext(tree, stoplist):
    """Customized version of jusText processing."""
    dom = preprocessor(tree)  # tree_cleaning(tree, True)
    paragraphs = ParagraphMaker.make_paragraphs(dom)
    # positional thresholds: length_low=50, length_high=200, stopwords_low=0.1,
    # stopwords_high=0.2, max_link_density=0.2, no_headings=True
    classify_paragraphs(paragraphs, stoplist, 50, 200, 0.1, 0.2, 0.2, True)
    revise_paragraph_classification(paragraphs, 200)  # max_heading_distance=200
    return paragraphs
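A minimal driver for this excerpt might look like the following sketch; the sample HTML, the lxml parsing step, and the use of justext.get_stoplist are assumptions added for illustration, not part of the original code.

import justext
from lxml import html

# Hypothetical usage: parse raw HTML, load a stoplist, run the pipeline above.
raw_html = '<html><body><p>Some article text long enough to be classified.</p></body></html>'
tree = html.fromstring(raw_html)
stoplist = justext.get_stoplist('English')  # frozenset of frequent English words

for paragraph in custom_justext(tree, stoplist):
    # cf_class is the context-free label; class_type is the label after revision
    print(paragraph.class_type, paragraph.cf_class, paragraph.text[:60])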
Example #2
    def test_length_low(self):
        # With length_low=1000 both paragraphs fall below the low threshold:
        # the one containing link characters becomes "bad", the other only "short".
        paragraphs = [
            self._paragraph(text="0 1 2 3 4 5 6 7 8 9"*2, chars_count_in_links=0),
            self._paragraph(text="0 1 2 3 4 5 6 7 8 9"*2, chars_count_in_links=20),
        ]

        classify_paragraphs(paragraphs, (), max_link_density=1, length_low=1000)

        tools.assert_equal(paragraphs[0].cf_class, "short")
        tools.assert_equal(paragraphs[1].cf_class, "bad")
Example #3
    def test_stopwords_high(self):
        # stopwords_high=0 is always reached, so the label depends on length_high=20:
        # 19 chars <= 20 => "neargood", 38 chars > 20 => "good".
        paragraphs = [
            self._paragraph(text="0 1 2 3 4 5 6 7 8 9"),
            self._paragraph(text="0 1 2 3 4 5 6 7 8 9"*2),
        ]

        classify_paragraphs(paragraphs, ("0",), max_link_density=1, length_low=0,
            stopwords_high=0, length_high=20)

        tools.assert_equal(paragraphs[0].cf_class, "neargood")
        tools.assert_equal(paragraphs[1].cf_class, "good")
Example #4
    def test_stopwords_low(self):
        # Stopword densities: 5/13 ~ 0.38 and 2/10 = 0.2 reach stopwords_low=0.2
        # => "neargood"; 1/9 ~ 0.11 stays below it => "bad".
        paragraphs = [
            self._paragraph(text="0 0 0 0 1 2 3 4 5 6 7 8 9"),
            self._paragraph(text="0 1 2 3 4 5 6 7 8 9"),
            self._paragraph(text="1 2 3 4 5 6 7 8 9"),
        ]

        classify_paragraphs(paragraphs, ("0", "1",), max_link_density=1,
            length_low=0, stopwords_high=1000, stopwords_low=0.2)

        tools.assert_equal(paragraphs[0].cf_class, "neargood")
        tools.assert_equal(paragraphs[1].cf_class, "neargood")
        tools.assert_equal(paragraphs[2].cf_class, "bad")
Example #5
import justext
from lxml.etree import ParserError, XMLSyntaxError


def html2prevert(s, justext_wordlist, justext_level='strict',
                 allowshort=False):
    # TODO: preclean HTML (remove tables, infoboxes, References, TOC, ...)
    try:
        html_root = justext.preprocess(html_text=s, encoding='utf-8')
        paragraphs = justext.make_paragraphs(html_root)
    except (ParserError, XMLSyntaxError):
        return ('', 0, 0)
    # use jusText to classify paragraphs
    params = JUSTEXT_PARAMS_BY_LEVEL[justext_level]  # module-level config, not shown in this excerpt
    j_length_low = params['length_low']
    j_length_high = params['length_high']
    j_max_heading_distance = params['max_heading_distance']
    if allowshort:
        j_length_low = j_length_low / 3
        j_length_high = j_length_high / 3
        j_max_heading_distance = j_max_heading_distance / 3
    justext.classify_paragraphs(
        paragraphs=paragraphs,
        stoplist=justext_wordlist,
        length_low=j_length_low,    # character count < length_low => bad or short
        length_high=j_length_high,  # character count > length_high => good
        # share of words frequent in the language >= stopwords_low => neargood
        stopwords_low=params['stopwords_low'],
        # share of words frequent in the language >= stopwords_high => good or neargood
        stopwords_high=params['stopwords_high'],
        # density of link words (words inside an <a> tag) > max_link_density => bad
        max_link_density=params['max_link_density'],
    )
    justext.revise_paragraph_classification(
        paragraphs=paragraphs,
        # short/neargood headings within this distance (in chars) before a good paragraph => good
        max_heading_distance=j_max_heading_distance,
    )
    # extract good paragraphs
    prevert_paragraphs, paragraph_count, plaintext_len = [], 0, 0
    for p in paragraphs:
        # if p['class'] == 'good':  # TODO find why this does not produce a good result
        if p['cfclass'] in ('good', 'neargood'):  # 'good', 'neargood', 'short', 'bad'
            p_text = justext.html_escape(p['text']).strip()
            if p_text:
                paragraph_count += 1
                plaintext_len += len(p_text)
                heading = ' heading="1"' if p['heading'] else ''
                prevert_paragraphs.append('<p%s>\n%s\n</p>' % (heading, p_text))
    return ('\n'.join(prevert_paragraphs), paragraph_count, plaintext_len)
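The function above relies on a module-level JUSTEXT_PARAMS_BY_LEVEL mapping that the excerpt does not include. The sketch below shows one plausible shape for that configuration and a call to html2prevert; the threshold values (borrowed from jusText's documented defaults), the wordlist file name, and the sample HTML are illustrative assumptions, not the original tool's settings.

# Hypothetical configuration; the real mapping lives outside this excerpt and
# may use different values per level.
JUSTEXT_PARAMS_BY_LEVEL = {
    'strict': {
        'length_low': 70,
        'length_high': 200,
        'stopwords_low': 0.30,
        'stopwords_high': 0.32,
        'max_link_density': 0.2,
        'max_heading_distance': 200,
    },
}

# Hypothetical wordlist file with one frequent word per line.
with open('english_wordlist.txt', encoding='utf-8') as f:
    wordlist = frozenset(line.strip() for line in f if line.strip())

raw_html = '<html><body><p>Example article body text long enough to keep.</p></body></html>'
prevert, n_paragraphs, n_chars = html2prevert(raw_html, wordlist)
print(n_paragraphs, n_chars)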
Example #6
    def test_max_link_density(self):
        # Link density = chars_count_in_links / length; only densities above 0.5
        # trip the link check here, but the longer paragraphs still end up "bad"
        # because the empty stoplist leaves their stopword density at zero.
        paragraphs = [
            self._paragraph(text="0123456789"*2, chars_count_in_links=0),
            self._paragraph(text="0123456789"*2, chars_count_in_links=20),
            self._paragraph(text="0123456789"*8, chars_count_in_links=40),
            self._paragraph(text="0123456789"*8, chars_count_in_links=39),
            self._paragraph(text="0123456789"*8, chars_count_in_links=41),
        ]

        classify_paragraphs(paragraphs, (), max_link_density=0.5)

        tools.assert_equal(paragraphs[0].cf_class, "short")
        tools.assert_equal(paragraphs[1].cf_class, "bad")
        tools.assert_equal(paragraphs[2].cf_class, "bad")
        tools.assert_equal(paragraphs[3].cf_class, "bad")
        tools.assert_equal(paragraphs[4].cf_class, "bad")
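Taken together, these tests exercise jusText's context-free decision order. The sketch below is a simplified, illustrative reimplementation of that order, not the library's actual code; it omits jusText's copyright-symbol and select-box checks, and the default thresholds shown are assumptions based on jusText's documented defaults.

def classify_cf_sketch(length, chars_count_in_links, stopword_density,
                       length_low=70, length_high=200,
                       stopwords_low=0.30, stopwords_high=0.32,
                       max_link_density=0.2):
    """Simplified sketch of the context-free classification the tests expect."""
    link_density = chars_count_in_links / length if length else 0.0
    if link_density > max_link_density:
        return 'bad'
    if length < length_low:
        # short paragraphs that contain link characters count as boilerplate
        return 'bad' if chars_count_in_links > 0 else 'short'
    if stopword_density >= stopwords_high:
        return 'good' if length > length_high else 'neargood'
    if stopword_density >= stopwords_low:
        return 'neargood'
    return 'bad'

For instance, classify_cf_sketch(38, 20, 0.0, length_low=1000, max_link_density=1) returns 'bad' and classify_cf_sketch(38, 0, 0.0, length_low=1000, max_link_density=1) returns 'short', matching the expectations in Example #2.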