Exemplo n.º 1
0
def test_doc2weight():
    from microtc.textmodel import TextModel
    from microtc.weighting import TFIDF
    from microtc.utils import tweet_iterator
    import os
    fname = join(os.path.dirname(__file__), 'text.json')
    tw = list(tweet_iterator(fname))
    docs = [x['text'] for x in tw]
    text = TextModel(docs, token_list=[-1, 3])
    # print(text['buenos dias'])
    docs = [text.tokenize(d) for d in docs]
    sp = TFIDF(docs)
    assert len(sp.doc2weight(text.tokenize('odio odio los los'))) == 3
Exemplo n.º 2
0
def test_getitem():
    from microtc.textmodel import TextModel
    from microtc.weighting import TFIDF
    from microtc.utils import tweet_iterator
    import os
    fname = join(os.path.dirname(__file__), 'text.json')
    tw = list(tweet_iterator(fname))
    docs = [x['text'] for x in tw]
    text = TextModel(docs, token_list=[-1, 3])
    # print(text['buenos dias'])
    docs = [text.tokenize(d) for d in docs]
    sp = TFIDF(docs)
    tok = text.tokenize('buenos dias')
    bow = sp.doc2weight(tok)
    ids = bow[0]
    assert len(ids) == len(sp[tok])