Пример #1
0
def test_load_nips_raw():
    # Each case pairs the extra keyword arguments handed to load_nips with
    # the number of documents the call is expected to return. The empty
    # dict is the call without a `years` argument.
    cases = (
        ({'years': [2008]}, 250),
        ({}, 2261),
    )
    for extra_kwargs, expected_count in cases:
        corpus = data.load_nips(raw=True, **extra_kwargs)
        assert_equal(len(corpus), expected_count)
        # With raw=True the first document must be a string.
        first_doc = corpus[0]
        assert_true(isinstance(first_doc, basestring))
Пример #2
0
def test_load_nips_tokenized():
    # Each case pairs the extra keyword arguments handed to load_nips with
    # the number of documents the call is expected to return. The empty
    # dict is the call without a `years` argument.
    cases = (
        ({'years': [2008]}, 250),
        ({}, 2261),
    )
    for extra_kwargs, expected_count in cases:
        corpus = data.load_nips(raw=False, **extra_kwargs)
        assert_equal(len(corpus), expected_count)
        # With raw=False the first document must be a list.
        first_doc = corpus[0]
        assert_true(isinstance(first_doc, list))
Пример #3
0
def test_label_finder():
    # Checks the five labels returned for the 2009 documents when the
    # finder is built with measure='pmi' and no POS constraint (pos=None).
    bigram_finder = BigramLabelFinder(measure='pmi', pos=None)
    corpus = load_nips(years=[2009])
    found = bigram_finder.find(corpus, top_n=5)

    expected = [
        (u'monte', u'carlo'),
        (u'high', u'dimensional'),
        (u'does', u'not'),  # not so good
        (u'experimental', u'results'),
        (u'nonparametric', u'bayesian'),
    ]
    assert_equal(found, expected)
Пример #4
0
def test_label_finder():
    # Checks the labels found with measure='pmi' and pos=None on the 2009
    # documents; top_n=5 is requested and five pairs are expected.
    label_finder = BigramLabelFinder(measure='pmi', pos=None)
    docs_2009 = load_nips(years=[2009])
    top_labels = label_finder.find(docs_2009, top_n=5)

    expected_labels = [
        (u'monte', u'carlo'),
        (u'high', u'dimensional'),
        (u'does', u'not'),  # not so good
        (u'experimental', u'results'),
        (u'nonparametric', u'bayesian'),
    ]
    assert_equal(top_labels, expected_labels)
Пример #5
0
def test_label_finder_with_pos():
    # The finder is restricted to these two tag pairs.
    allowed_tag_pairs = [('NN', 'NN'), ('JJ', 'NN')]

    pos_tagger = CorpusPOSTagger()
    label_finder = BigramLabelFinder(measure='pmi', pos=allowed_tag_pairs)

    # The 2009 documents are run through the tagger before label search.
    tagged_docs = pos_tagger.transform(load_nips(years=[2009]))

    # With strip_tags=False each label element is a (word, tag) pair.
    expected_tagged = [
        ((u'monte', 'NN'), (u'carlo', 'NN')),
        ((u'nonparametric', 'JJ'), (u'bayesian', 'NN')),
        ((u'active', 'JJ'), (u'learning', 'NN')),
        ((u'machine', 'NN'), (u'learning', 'NN')),
        ((u'semi-supervised', 'JJ'), (u'learning', 'NN')),
    ]
    tagged_labels = label_finder.find(tagged_docs, top_n=5, strip_tags=False)
    assert_equal(tagged_labels, expected_tagged)

    # Without strip_tags the expected labels are bare word pairs.
    expected_plain = [
        (u'monte', u'carlo'),
        (u'nonparametric', u'bayesian'),
        (u'active', u'learning'),
        (u'machine', u'learning'),
        (u'semi-supervised', u'learning'),
    ]
    plain_labels = label_finder.find(tagged_docs, top_n=5)
    assert_equal(plain_labels, expected_plain)
Пример #6
0
def test_label_finder_with_pos():
    # Tag-pair patterns passed to the finder as its `pos` argument.
    tag_patterns = [
        ('NN', 'NN'),
        ('JJ', 'NN'),
    ]
    tagger = CorpusPOSTagger()
    finder = BigramLabelFinder(measure='pmi', pos=tag_patterns)

    # Load the 2009 documents, then pass them through the tagger.
    raw_docs = load_nips(years=[2009])
    tagged = tagger.transform(raw_docs)

    # First query: strip_tags=False, so (word, tag) pairs are expected.
    with_tags = finder.find(tagged, top_n=5, strip_tags=False)
    want_with_tags = [
        ((u'monte', 'NN'), (u'carlo', 'NN')),
        ((u'nonparametric', 'JJ'), (u'bayesian', 'NN')),
        ((u'active', 'JJ'), (u'learning', 'NN')),
        ((u'machine', 'NN'), (u'learning', 'NN')),
        ((u'semi-supervised', 'JJ'), (u'learning', 'NN')),
    ]
    assert_equal(with_tags, want_with_tags)

    # Second query: strip_tags left at its default, so word-only pairs
    # are expected.
    without_tags = finder.find(tagged, top_n=5)
    want_without_tags = [
        (u'monte', u'carlo'),
        (u'nonparametric', u'bayesian'),
        (u'active', u'learning'),
        (u'machine', u'learning'),
        (u'semi-supervised', u'learning'),
    ]
    assert_equal(without_tags, want_without_tags)