예제 #1
0
def test1():
    """Check that the vocabulary length equals the requested ``n_max_vocab``.

    Two sample strings are shingled with ``k=9``; for every cap from 1 to 14
    the list returned by ``ks.identify_vocab`` must have exactly that many
    entries.
    """
    corpus = ['abc d abc de abc def', 'abc defg abc def gh abc def ghi']

    shingle_sets = []
    for text in corpus:
        shingle_sets.append(ks.shingleseqs_k(text, k=9))

    # range(1, 15) -> caps 1 through 14 inclusive
    for cap in range(1, 15):
        vocab = ks.identify_vocab(shingle_sets, n_max_vocab=cap)
        assert len(vocab) == cap
예제 #2
0
def test5():
    """Pin the vocabulary order produced by ``sortmode='log-x-length'``.

    The inputs ``'a'`` and ``'ab'`` are shingled with ``k=2``; with no cap on
    the vocabulary size and ``n_min_count=1`` the expected result is
    ``['ab', 'a', 'b']``.
    """
    texts = ['a', 'ab']
    shingle_sets = list(map(lambda txt: ks.shingleseqs_k(txt, k=2), texts))

    options = {
        'sortmode': 'log-x-length',
        'n_min_count': 1,
        'n_max_vocab': None,
    }
    vocab = ks.identify_vocab(shingle_sets, **options)

    expected = ['ab', 'a', 'b']
    assert vocab == expected
예제 #3
0
def test2():
    """Pin the vocabulary order produced by ``sortmode='prefer-shorter'``.

    The inputs ``'a'`` and ``'ab'`` are shingled with ``k=2``; with no cap on
    the vocabulary size and ``n_min_count=1`` the expected result lists the
    single characters before ``'ab'``: ``['a', 'b', 'ab']``.
    """
    samples = ('a', 'ab')
    shingle_sets = list(ks.shingleseqs_k(sample, k=2) for sample in samples)

    vocab = ks.identify_vocab(
        shingle_sets,
        n_max_vocab=None,
        n_min_count=1,
        sortmode='prefer-shorter',
    )

    assert vocab == ['a', 'b', 'ab']