Exemplo n.º 1
0
def try_justext(tree, url, target_language):
    '''Fallback extraction relying on the generic justext algorithm.

    The stoplist registered for target_language in JUSTEXT_LANGUAGES is
    used when there is one; JT_STOPLIST is used when the language is None
    or not listed. custom_justext is then run on the tree.

    Returns a 'body' element holding one 'p' child for each paragraph not
    flagged as boilerplate, or None if custom_justext raises ValueError
    (the error is logged along with the url).
    '''
    # pick the stoplist
    if target_language is None or target_language not in JUSTEXT_LANGUAGES:
        stoplist = JT_STOPLIST
    else:
        stoplist = get_stoplist(JUSTEXT_LANGUAGES[target_language])
    # run the extraction
    try:
        paragraphs = custom_justext(tree, stoplist)
    except ValueError as err:  # not an XML element: HtmlComment
        LOGGER.error('justext %s %s', err, url)
        return None
    # copy the text of the remaining paragraphs (no duplicate filtering here)
    result_body = etree.Element('body')
    for paragraph in paragraphs:
        if paragraph.is_boilerplate:
            continue
        elem = etree.Element('p')
        elem.text = paragraph.text
        result_body.append(elem)
    return result_body
Exemplo n.º 2
0
    def test_get_real_stoplist(self):
        # The stoplist fetched for an existing language must not be empty.
        slovak_words = get_stoplist("Slovak")
        word_count = len(slovak_words)

        tools.assert_true(word_count > 0)
Exemplo n.º 3
0
def jt_stoplist_init():
    '''Merge the entries of every available JusText stoplist into one set.'''
    return {
        entry
        for language in get_stoplists()
        for entry in get_stoplist(language)
    }
Exemplo n.º 4
0
 def test_get_missing_stoplist(self):
     # Asking for a language without a stoplist is expected to raise ValueError.
     unknown_language = "Klingon"
     with pytest.raises(ValueError):
         get_stoplist(unknown_language)
Exemplo n.º 5
0
    def test_get_real_stoplist(self):
        # The stoplist fetched for an existing language must not be empty.
        slovak_words = get_stoplist("Slovak")
        word_count = len(slovak_words)

        assert word_count > 0
Exemplo n.º 6
0
 def test_get_missing_stoplist(self):
     # Asking for a language without a stoplist is expected to raise ValueError.
     unknown_language = "Klingon"
     with pytest.raises(ValueError):
         get_stoplist(unknown_language)
Exemplo n.º 7
0
    def test_get_real_stoplist(self):
        # The stoplist fetched for an existing language must not be empty.
        slovak_words = get_stoplist("Slovak")
        word_count = len(slovak_words)

        assert word_count > 0
Exemplo n.º 8
0
    def test_get_real_stoplist(self):
        # The stoplist fetched for an existing language must not be empty.
        slovak_words = get_stoplist("Slovak")
        word_count = len(slovak_words)

        tools.assert_true(word_count > 0)