Пример #1
0
def test_corpus_builtin_corpora():
    """Check the list of built-in corpora and that each entry loads non-empty.

    This variant pins the number of entries returned by
    ``Corpus.builtin_corpora()`` to exactly 2.
    """
    corpus_names = Corpus.builtin_corpora()
    assert len(corpus_names) == 2

    # Each advertised name must be loadable and yield at least one item.
    for name in corpus_names:
        loaded = Corpus.from_builtin_corpus(name)
        assert len(loaded) > 0
Пример #2
0
def test_corpus_builtin_corpora():
    """Check the list of built-in corpora and that each entry loads non-empty.

    The names returned by ``Corpus.builtin_corpora()`` must match, ignoring
    order, the keys of ``Corpus._BUILTIN_CORPORA_LOAD_KWARGS``.
    """
    corpus_names = Corpus.builtin_corpora()
    listed = sorted(corpus_names)
    expected = sorted(Corpus._BUILTIN_CORPORA_LOAD_KWARGS.keys())
    assert listed == expected

    # Each advertised name must be loadable and yield at least one item.
    for name in corpus_names:
        loaded = Corpus.from_builtin_corpus(name)
        assert len(loaded) > 0
Пример #3
0
def load_corpus_bg_en(sample_n):
    """Load the built-in 'en-NewsArticles' corpus and return a sample of it.

    :param sample_n: passed unchanged to ``Corpus.sample``; presumably the
                     number of documents to draw -- confirm against the
                     tmtoolkit API
    :return: whatever ``Corpus.sample(sample_n)`` returns for that corpus
    """
    # tmtoolkit is imported at call time rather than at module import time.
    from tmtoolkit.corpus import Corpus

    return Corpus.from_builtin_corpus('en-NewsArticles').sample(sample_n)
Пример #4
0
# Standard-library names used below (`logging`, `random`, `cpu_count`) are
# imported here so that this script is runnable on its own.
import logging
import random
from multiprocessing import cpu_count

from tmtoolkit.corpus import Corpus
from tmtoolkit.preprocess import TMPreproc

from examples._benchmarktools import add_timing, print_timings

# Send INFO-level log records to the root handler and let the 'tmtoolkit'
# logger emit INFO records that propagate up to it.
logging.basicConfig(level=logging.INFO)
tmtoolkit_log = logging.getLogger('tmtoolkit')
tmtoolkit_log.setLevel(logging.INFO)
tmtoolkit_log.propagate = True

# Fixed seed for Python's global RNG so repeated runs are comparable.
# NOTE(review): presumably this makes the `.sample(1000)` draw below
# reproducible -- confirm that Corpus.sample uses the `random` module.
random.seed(20200320)

#%%

# Work on a 1000-item sample of the built-in English news corpus.
corpus = Corpus.from_builtin_corpus('en-NewsArticles').sample(1000)

print('%d documents' % len(corpus))

#%%

# Each add_timing(label) call marks the end of the step named by `label`
# (presumably it records a timestamp for later reporting -- see
# examples._benchmarktools).
add_timing('start')

# Use as many worker processes as the machine reports CPUs.
preproc = TMPreproc(corpus, language='en', n_max_processes=cpu_count())
add_timing('load and tokenize')

preproc.expand_compound_tokens()
add_timing('expand_compound_tokens')

preproc.pos_tag()
add_timing('pos_tag')
import logging
from multiprocessing import cpu_count

from tmtoolkit.corpus import Corpus
from tmtoolkit.preprocess import TMPreproc

from examples._benchmarktools import add_timing, print_timings

# Send INFO-level log records to the root handler and let the 'tmtoolkit'
# logger emit INFO records that propagate up to it.
logging.basicConfig(level=logging.INFO)
tmtoolkit_log = logging.getLogger('tmtoolkit')
tmtoolkit_log.setLevel(logging.INFO)
tmtoolkit_log.propagate = True

#%%

# Load the complete built-in English news corpus (no sampling in this script).
corpus = Corpus.from_builtin_corpus('en-NewsArticles')

print('%d documents' % len(corpus))

#%%

# Each add_timing(label) call marks the end of the step named by `label`
# (presumably it records a timestamp for later reporting -- see
# examples._benchmarktools).
add_timing('start')

# The worker-process limit is fixed at 4 here; `cpu_count` is imported above
# but not used in the visible part of this script.
preproc = TMPreproc(corpus, language='en', n_max_processes=4)
add_timing('load and tokenize')

preproc.pos_tag()
add_timing('pos_tag')

preproc.lemmatize()
add_timing('lemmatize')