예제 #1
0
def test_corpus_apply(texts):
    """Check that ``Corpus.apply(str.upper)`` upper-cases every document.

    Builds a corpus whose labels are the stringified positions of ``texts``,
    applies ``str.upper`` and verifies that

    * the call returns a ``Corpus``,
    * the document labels are unchanged,
    * the document lengths match the lengths of the upper-cased originals,
    * every document equals the upper-cased original document.

    :param texts: iterable of document strings (supplied by the test setup)
    """
    c = Corpus({str(i): t for i, t in enumerate(texts)})
    c_orig = c.copy()   # untouched reference to compare against
    orig_doc_labels = c.doc_labels

    assert isinstance(c.apply(str.upper), Corpus)

    assert c.doc_labels == orig_doc_labels

    # str.upper() is not length-preserving for every character (for example
    # 'ß'.upper() == 'SS'), so the expected lengths are derived from the
    # upper-cased originals rather than from the pre-apply lengths. For
    # length-preserving input both expectations are identical.
    expected_doc_lengths = {dl: len(c_orig[dl].upper()) for dl in orig_doc_labels}
    assert c.doc_lengths == expected_doc_lengths

    for dl, dt in c.items():
        assert c_orig[dl].upper() == dt
예제 #2
0
def test_corpus_copy(texts):
    """Check that ``Corpus.copy`` yields an equal but independent corpus.

    The ``docs`` and ``doc_paths`` attributes of the copy must be distinct
    objects with equal content; ``doc_labels``, ``doc_lengths`` and
    ``unique_characters`` must compare equal.

    :param texts: iterable of document strings (supplied by the test setup)
    """
    original = Corpus({str(pos): doc for pos, doc in enumerate(texts)})
    duplicate = original.copy()

    # Container attributes: not the same object, but equal in content.
    for attr in ('docs', 'doc_paths'):
        assert getattr(original, attr) is not getattr(duplicate, attr)
        assert getattr(original, attr) == getattr(duplicate, attr)

    # Derived attributes: only equality is required.
    for attr in ('doc_labels', 'doc_lengths', 'unique_characters'):
        assert getattr(original, attr) == getattr(duplicate, attr)
예제 #3
0
def test_corpus_filter_characters(texts):
    """Exercise ``Corpus.filter_characters`` with several character sets.

    Scenarios, in order:

    1. Filtering with the corpus' own unique characters returns a ``Corpus``
       and leaves labels, lengths and unique characters unchanged.
    2. Filtering with printable characters that do not occur in the corpus
       (skipped if there are none) leaves every document with length 0.
    3. Filtering a fresh copy with an empty set gives the same empty result.
    4. If the corpus has more than three unique characters, filtering fresh
       copies with a random sample of three of them -- passed once as a set
       and once as a string -- changes the document lengths and leaves
       exactly the sampled characters.

    :param texts: iterable of document strings (supplied by the test setup)
    """
    corpus = Corpus({str(pos): doc for pos, doc in enumerate(texts)})
    pristine = corpus.copy()   # source for fresh copies in later scenarios

    labels_before = corpus.doc_labels
    lengths_before = corpus.doc_lengths
    chars_before = corpus.unique_characters

    def check_emptied(filtered):
        # All labels survive, but every document has been reduced to nothing.
        assert filtered.doc_labels == labels_before
        assert filtered.doc_lengths == dict.fromkeys(filtered.doc_labels, 0)
        assert filtered.unique_characters == set()

    def check_reduced_to(filtered, kept):
        # Labels survive, lengths differ and only the kept characters remain.
        assert filtered.doc_labels == labels_before
        assert filtered.doc_lengths != lengths_before
        assert filtered.unique_characters == kept

    # 1. keeping every character that occurs changes nothing
    result = corpus.filter_characters(chars_before)
    assert isinstance(result, Corpus)
    assert corpus.doc_labels == labels_before
    assert corpus.doc_lengths == lengths_before
    assert corpus.unique_characters == chars_before

    # 2. keeping only characters that never occur empties all documents
    absent_chars = set(string.printable) - chars_before
    if absent_chars:
        corpus.filter_characters(absent_chars)
        check_emptied(corpus)

    # 3. keeping nothing at all empties all documents, too
    corpus = pristine.copy()
    corpus.filter_characters(set())
    check_emptied(corpus)

    # 4. keeping a random three-character subset, as a set and as a string
    if len(chars_before) > 3:
        for as_argument in (lambda chars: chars, ''.join):
            corpus = pristine.copy()
            kept = set(sample(list(chars_before), 3))
            corpus.filter_characters(as_argument(kept))
            check_reduced_to(corpus, kept)