def test_corpus_split_by_paragraphs():
    """Check that splitting a corpus by paragraphs keeps every document's text.

    Loads the corpus from ``tests/data/gutenberg``, splits it by paragraphs
    and verifies that, for each original document, the paragraph documents
    whose label starts with the original label match the result of calling
    ``paragraphs_from_lines`` on the original text.
    """
    expected_labels = ('goethe_werther1', 'goethe_werther2', 'kafka_verwandlung')

    c = Corpus.from_folder('tests/data/gutenberg', doc_label_fmt='{basename}')

    # Grab the pre-split state first so it can be compared after the split.
    orig_docs = c.docs
    orig_doc_paths = c.doc_paths
    c.split_by_paragraphs()
    par_docs = c.docs

    # Splitting can only keep or increase the number of documents.
    assert len(par_docs) >= len(orig_docs)

    # The number of distinct source paths must be unchanged by the split.
    n_paths_before = len(set(orig_doc_paths.values()))
    n_paths_after = len(set(c.doc_paths.values()))
    assert n_paths_before == n_paths_after

    for label, text in orig_docs.items():
        assert label in expected_labels

        # Collect all paragraph documents derived from this original label.
        matching = []
        for par_label in sorted(par_docs.keys()):
            if par_label.startswith(label):
                matching.append(par_docs[par_label])
        assert len(matching) > 0

        expected = paragraphs_from_lines(text)
        assert len(expected) == len(matching)
        assert set(expected) == set(matching)
# Example #2
0
def test_paragraphs_from_lines_already_split_hypothesis(lines):
    """Property check for ``paragraphs_from_lines`` with ``splitchar=None``.

    For the given ``lines`` the result must contain no more paragraphs than
    there are input lines, and no paragraph may be empty.

    NOTE(review): ``lines`` is presumably generated by a hypothesis strategy
    producing a sequence of strings -- the decorator is not visible here.
    """
    result = paragraphs_from_lines(lines, splitchar=None)

    n_paragraphs = len(result)
    assert n_paragraphs <= len(lines)

    # Every returned paragraph must have non-zero length.
    assert not any(len(par) == 0 for par in result)
# Example #3
0
def test_paragraphs_from_lines_hypothesis(lines):
    """Property check for ``paragraphs_from_lines`` with default arguments.

    The number of paragraphs must not exceed ``len(lines)`` and no returned
    paragraph may be empty.

    NOTE(review): ``lines`` is presumably a hypothesis-generated string --
    the decorator supplying it is not visible here.
    """
    result = paragraphs_from_lines(lines)

    n_paragraphs = len(result)
    assert n_paragraphs <= len(lines)

    # Every returned paragraph must have non-zero length.
    assert not any(len(par) == 0 for par in result)
# Example #4
0
def test_paragraphs_from_lines():
    """Exercise ``paragraphs_from_lines`` on fixed inputs.

    Covers: a plain string combined with ``splitchar=None`` raising
    ``ValueError``; blank inputs yielding no paragraphs; a single-word input;
    and a multi-paragraph text split with ``break_on_num_newlines`` of
    2 (default), 1 and 3.
    """
    # A string that is not pre-split cannot be combined with splitchar=None.
    with pytest.raises(ValueError):
        paragraphs_from_lines("foo", splitchar=None)

    # Inputs consisting only of whitespace / newlines produce no paragraphs.
    for blank in ('', ' ', '\n', '\n\n', '\n\n\n'):
        assert len(paragraphs_from_lines(blank)) == 0

    single = paragraphs_from_lines("foo")
    assert len(single) == 1
    assert single[0] == "foo"

    testlines1 = u"""

par1 lorem
ipsum

par2 lorem


par3 ipsum
lorem
dorem


par4

"""

    # Default behaviour (break_on_num_newlines=2).
    default_pars = paragraphs_from_lines(testlines1)
    expected_default = (
        'par1 lorem ipsum',
        'par2 lorem',
        'par3 ipsum lorem dorem',
        'par4',
    )
    assert len(default_pars) == 4
    for idx, expected in enumerate(expected_default):
        assert default_pars[idx] == expected

    # Passing the already split lines must give the same paragraphs.
    presplit = testlines1.split('\n')
    assert paragraphs_from_lines(presplit, splitchar=None) == default_pars

    # A single newline already starts a new paragraph.
    single_break = paragraphs_from_lines(testlines1, break_on_num_newlines=1)
    assert len(single_break) == 7
    for idx, expected in ((0, 'par1 lorem'), (1, 'ipsum'), (6, 'par4')):
        assert single_break[idx] == expected

    # Three newlines are required, so par1 and par2 are merged.
    triple_break = paragraphs_from_lines(testlines1, break_on_num_newlines=3)
    expected_triple = (
        'par1 lorem ipsum par2 lorem',
        'par3 ipsum lorem dorem',
        'par4',
    )
    assert len(triple_break) == 3
    for idx, expected in enumerate(expected_triple):
        assert triple_break[idx] == expected