예제 #1
0
def try_justext(tree, url, target_language):
    '''Fallback extraction using the generic justext algorithm.

    Builds a fresh ``body`` element holding one ``p`` child per paragraph
    that justext does not flag as boilerplate.

    Args:
        tree: parsed document handed over to ``custom_justext``.
        url: source URL, only used in the error log message.
        target_language: key looked up in ``JUSTEXT_LANGUAGES``; when it is
            ``None`` or unknown, the catch-all ``JT_STOPLIST`` is used.

    Returns:
        The populated ``body`` element, or ``None`` if ``custom_justext``
        raised a ``ValueError``.
    '''
    body = etree.Element('body')
    # choose the stoplist matching the requested language, if there is one
    if target_language is None or target_language not in JUSTEXT_LANGUAGES:
        stoplist = JT_STOPLIST
    else:
        stoplist = get_stoplist(JUSTEXT_LANGUAGES[target_language])
    # run the extraction
    try:
        paragraphs = custom_justext(tree, stoplist)
    except ValueError as err:  # not an XML element: HtmlComment
        LOGGER.error('justext %s %s', err, url)
        return None
    # keep only the paragraphs carrying actual content
    for paragraph in paragraphs:
        if paragraph.is_boilerplate:
            continue
        node = etree.Element('p')
        node.text = paragraph.text
        body.append(node)
    return body
예제 #2
0
    def test_get_real_stoplist(self):
        """The stoplist returned for "Slovak" must contain at least one entry."""
        slovak_words = get_stoplist("Slovak")
        word_count = len(slovak_words)

        tools.assert_true(word_count > 0)
예제 #3
0
def jt_stoplist_init():
    '''Merge the entries of every available JusText stoplist into one set.

    Returns:
        A ``set`` holding the union of ``get_stoplist(language)`` for each
        language yielded by ``get_stoplists()``.
    '''
    return {
        word
        for language in get_stoplists()
        for word in get_stoplist(language)
    }
예제 #4
0
 def test_get_missing_stoplist(self):
     """Asking for the stoplist of an unknown language must raise ValueError."""
     unknown_language = "Klingon"
     with pytest.raises(ValueError):
         get_stoplist(unknown_language)
예제 #5
0
    def test_get_real_stoplist(self):
        """The stoplist returned for "Slovak" must contain at least one entry."""
        slovak_words = get_stoplist("Slovak")
        word_count = len(slovak_words)

        assert word_count > 0
예제 #6
0
 def test_get_missing_stoplist(self):
     """Asking for the stoplist of an unknown language must raise ValueError."""
     unknown_language = "Klingon"
     with pytest.raises(ValueError):
         get_stoplist(unknown_language)
예제 #7
0
    def test_get_real_stoplist(self):
        """The stoplist returned for "Slovak" must contain at least one entry."""
        slovak_words = get_stoplist("Slovak")
        word_count = len(slovak_words)

        assert word_count > 0
예제 #8
0
파일: test_utils.py 프로젝트: Almad/jusText
    def test_get_real_stoplist(self):
        """The stoplist returned for "Slovak" must contain at least one entry."""
        slovak_words = get_stoplist("Slovak")
        word_count = len(slovak_words)

        tools.assert_true(word_count > 0)