Example #1
import matplotlib.pyplot as plt

from tmtoolkit.corpus import Corpus
from tmtoolkit.preprocess import TMPreproc
# assumption: evaluate_topic_models comes from the LDA backend, matching the metrics used below
from tmtoolkit.topicmod.tm_lda import evaluate_topic_models
from tmtoolkit.topicmod.evaluate import results_by_parameter
from tmtoolkit.topicmod.visualize import plot_eval_results


def evaluate_model(file_name, date, n_iter, scope, lang, n_eval=5):
    # load the raw documents into a corpus
    corpus = Corpus()
    corpus.add_files(file_name, encoding='utf8')

    # preprocess the corpus and obtain its document-term matrix
    preproc = TMPreproc(corpus)
    dtm_bg = preproc.dtm

    # candidate numbers of topics to evaluate
    var_params = [{'n_topics': k} for k in range(5, int(n_eval*10), n_eval)]

    # parameters kept constant across all candidate models
    const_params = {
        'n_iter': n_iter,
        'random_state': 20200713  # to make results reproducible
    }
    eval_results = evaluate_topic_models(dtm_bg,
                                         varying_parameters=var_params,
                                         constant_parameters=const_params,
                                         metric=['loglikelihood', 'cao_juan_2009', 'arun_2010']#,
                                         #return_models=True
                                         )

    # order the evaluation results by the varying parameter, the number of topics
    eval_results_by_topics = results_by_parameter(eval_results, 'n_topics')

    # plot the evaluation metrics and save the figure
    name = "evaluate_model_{}_{}iter_{}eval_{}_{}.png".format(date, n_iter, n_eval, scope, lang)
    plot_eval_results(eval_results_by_topics, figsize=(8, 6), metric_direction_font_size='x-small',
                      title_fontsize='small', axes_title_fontsize='x-small')
    plt.tight_layout()
    plt.savefig('out/' + name)
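
# example call (hypothetical arguments, not from the original source); with these
# values the function would save a plot to out/evaluate_model_20200713_1000iter_5eval_full_de.png
evaluate_model('data/speeches.txt', '20200713', n_iter=1000, scope='full', lang='de')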
Example #2
def test_corpus_replace_characters_simple():
    c = Corpus({'doc1': 'ABC', 'doc2': 'abcDeF'})
    c.replace_characters({'a': None, 'C': 'c', 'e': ord('X')})

    assert c.docs == {
        'doc1': 'ABc',
        'doc2': 'bcDXF',
    }

    c.replace_characters({ord('A'): None})

    assert c.docs == {
        'doc1': 'Bc',
        'doc2': 'bcDXF',
    }

    c.replace_characters(str.maketrans('DXFY', '1234'))

    assert c.docs == {
        'doc1': 'Bc',
        'doc2': 'bc123',
    }

    c.replace_characters({})

    assert c.docs == {
        'doc1': 'Bc',
        'doc2': 'bc123',
    }
Example #3
def test_corpus_unique_characters(texts):
    all_chars = set(''.join(texts))

    c = Corpus({str(i): t for i, t in enumerate(texts)})
    res_chars = c.unique_characters
    assert isinstance(res_chars, set)
    assert res_chars == all_chars
Example #4
def test_corpus_from_pickle():
    c1 = Corpus({'a': '1', 'b': '22', 'c': '333'})

    with tempfile.TemporaryFile(suffix='.pickle') as f:
        c1.to_pickle(f)
        f.seek(0)
        c2 = Corpus.from_pickle(f)

    assert c1.docs == c2.docs
Example #5
def test_corpus_pass_tmpreproc():
    c = Corpus()
    c['doc1'] = 'A simple example in simple English.'
    c['doc2'] = 'It contains only three very simple documents.'
    c['doc3'] = 'Simply written documents are very brief.'

    preproc = TMPreproc(c)
    tok = preproc.tokenize().tokens
    assert set(tok.keys()) == set(c.keys())
    assert len(tok['doc1']) == 7
Example #6
def test_corpus_apply(texts):
    c = Corpus({str(i): t for i, t in enumerate(texts)})
    c_orig = c.copy()
    orig_doc_labels = c.doc_labels
    orig_doc_lengths = c.doc_lengths

    assert isinstance(c.apply(str.upper), Corpus)

    assert c.doc_labels == orig_doc_labels
    assert c.doc_lengths == orig_doc_lengths

    for dl, dt in c.items():
        assert c_orig[dl].upper() == dt
Example #7
def test_corpus_copy(texts):
    c1 = Corpus({str(i): t for i, t in enumerate(texts)})
    c2 = c1.copy()

    assert c1.docs is not c2.docs
    assert c1.docs == c2.docs

    assert c1.doc_paths is not c2.doc_paths
    assert c1.doc_paths == c2.doc_paths

    assert c1.doc_labels == c2.doc_labels
    assert c1.doc_lengths == c2.doc_lengths
    assert c1.unique_characters == c2.unique_characters
Example #8
def test_corpus_add_doc():
    c = Corpus()
    with pytest.raises(ValueError):
        c.add_doc('', 'x')
    with pytest.raises(ValueError):
        c.add_doc(123, 'x')
    with pytest.raises(ValueError):
        c.add_doc('d1', None)

    c.add_doc('d1', 'd1 text')
    with pytest.raises(ValueError):
        c.add_doc('d1', 'd1 text')

    c.add_doc('d2', '')

    assert set(c.keys()) == {'d1', 'd2'}
Example #9
def test_corpus_from_files():
    doc_path = 'examples/data/gutenberg/kafka_verwandlung.txt'
    c1 = Corpus.from_files([doc_path])
    c2 = Corpus().add_files([doc_path])

    assert len(c1.docs) == len(c1.doc_paths) == 1
    assert len(c2.docs) == len(c2.doc_paths) == 1
    assert (c1.docs.keys() == c2.docs.keys()
            == c1.doc_paths.keys() == c2.doc_paths.keys())

    only_doc_label = next(iter(c1.docs.keys()))
    assert only_doc_label.endswith('kafka_verwandlung')

    only_doc = c1.docs[only_doc_label]
    assert len(only_doc) > 0

    assert c1.doc_paths[only_doc_label] == doc_path
Example #10
def test_corpus_dict_methods():
    c = Corpus()
    assert len(c) == 0
    with pytest.raises(KeyError):
        x = c['x']

    with pytest.raises(KeyError):
        c[1] = 'abc'

    with pytest.raises(KeyError):
        c[''] = 'abc'

    with pytest.raises(ValueError):
        c['d1'] = None

    c['d1'] = 'd1 text'
    assert len(c) == 1
    assert 'd1' in c
    assert set(c.keys()) == {'d1'}
    assert c['d1'] == 'd1 text'

    c['d2'] = 'd2 text'
    assert len(c) == 2
    for dl in c:
        assert dl in {'d1', 'd2'}
    assert set(c.keys()) == {'d1', 'd2'}

    for dl, dt in c.items():
        assert dl in {'d1', 'd2'}
        assert c[dl] == dt

    with pytest.raises(KeyError):
        del c['d3']

    del c['d1']
    assert len(c) == 1
    assert set(c.keys()) == {'d2'}

    del c['d2']
    assert len(c) == 0
    assert set(c.keys()) == set()
Example #11
def test_corpus_filter_characters(texts):
    c = Corpus({str(i): t for i, t in enumerate(texts)})
    c_orig = c.copy()

    orig_doc_labels = c.doc_labels
    orig_doc_lengths = c.doc_lengths
    orig_uniq_chars = c.unique_characters

    assert isinstance(c.filter_characters(orig_uniq_chars), Corpus)
    assert c.doc_labels == orig_doc_labels
    assert c.doc_lengths == orig_doc_lengths
    assert c.unique_characters == orig_uniq_chars

    not_in_corpus_chars = set(string.printable) - orig_uniq_chars
    if len(not_in_corpus_chars) > 0:
        c.filter_characters(not_in_corpus_chars)
        assert c.doc_labels == orig_doc_labels
        assert c.doc_lengths == {dl: 0 for dl in c.doc_labels}
        assert c.unique_characters == set()

    c = c_orig.copy()
    c.filter_characters(set())
    assert c.doc_labels == orig_doc_labels
    assert c.doc_lengths == {dl: 0 for dl in c.doc_labels}
    assert c.unique_characters == set()

    if len(orig_uniq_chars) > 3:
        c = c_orig.copy()
        only_chars = set(sample(list(orig_uniq_chars), 3))
        c.filter_characters(only_chars)
        assert c.doc_labels == orig_doc_labels
        assert c.doc_lengths != orig_doc_lengths
        assert c.unique_characters == only_chars

        c = c_orig.copy()
        only_chars = set(sample(list(orig_uniq_chars), 3))
        c.filter_characters(''.join(only_chars))  # as char sequence
        assert c.doc_labels == orig_doc_labels
        assert c.doc_lengths != orig_doc_lengths
        assert c.unique_characters == only_chars
Example #12
# shared test setup: build an English sample corpus from NLTK's Gutenberg texts
# and a German one from the example data folder

MAX_DOC_LEN = 5000
N_DOCS_EN = 7
N_DOCS_DE = 3  # given from corpus size

all_docs_en = {
    f_id: nltk.corpus.gutenberg.raw(f_id)
    for f_id in nltk.corpus.gutenberg.fileids()
}
# truncate each document to at most MAX_DOC_LEN characters
smaller_docs_en = [(dl, txt[:MAX_DOC_LEN]) for dl, txt in all_docs_en.items()]

corpus_en = Corpus(
    dict(
        sample([(dl, txt) for dl, txt in smaller_docs_en
                if dl != u'melville-moby_dick.txt'], N_DOCS_EN - 2)))
corpus_en.docs['empty_doc'] = ''  # additionally test empty document
# make sure we always have Moby Dick in the corpus
corpus_en.docs[u'melville-moby_dick.txt'] = dict(smaller_docs_en)[u'melville-moby_dick.txt']
#corpus_en = Corpus(dict(smaller_docs_en))
corpus_de = Corpus.from_folder('examples/data/gutenberg',
                               read_size=MAX_DOC_LEN)


@pytest.fixture
def tmpreproc_en():
    return TMPreproc(corpus_en.docs, language='english')


@pytest.fixture
def tmpreproc_de():
    # assumed counterpart fixture for the German corpus; the excerpt is truncated here
    return TMPreproc(corpus_de.docs, language='german')
Example #13
def test_empty_corpora():
    c1 = Corpus()
    c2 = Corpus.from_files([])
    c3 = Corpus.from_files([]).add_files([])
    assert c1.docs == c2.docs == c3.docs == {}
Example #14
import logging

import pandas as pd
import gensim

from tmtoolkit.corpus import Corpus
from tmtoolkit.preprocess import TMPreproc
from tmtoolkit.utils import pickle_data
from tmtoolkit.topicmod.evaluate import results_by_parameter
from tmtoolkit.topicmod.visualize import plot_eval_results

logging.basicConfig(level=logging.INFO)
tmtoolkit_log = logging.getLogger('tmtoolkit')
tmtoolkit_log.setLevel(logging.INFO)
tmtoolkit_log.propagate = True

print('loading data...')
bt18 = pd.read_pickle('data/bt18_sample_1000.pickle')
print('loaded %d documents' % len(bt18))
doc_labels = [u'%s_%s' % info for info in zip(bt18.sitzung, bt18.sequence)]

print('preprocessing data...')
bt18corp = Corpus(dict(zip(doc_labels, bt18.text)))
preproc = TMPreproc(bt18corp, language='german')
preproc.tokenize().stem().clean_tokens()

doc_labels = list(preproc.tokens.keys())
texts = list(preproc.tokens.values())

print('creating gensim corpus...')
gnsm_dict = gensim.corpora.Dictionary.from_documents(texts)
gnsm_corpus = [gnsm_dict.doc2bow(text) for text in texts]

# evaluate topic models with different parameters
const_params = dict(update_every=0, passes=10)
ks = list(range(10, 140, 10)) + list(range(140, 200, 20))
varying_params = [dict(num_topics=k, alpha=1.0 / k) for k in ks]
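
# a sketch of the evaluation step this excerpt leads up to (assumed API: the gensim
# backend tmtoolkit.topicmod.tm_gensim; argument form and available metrics may
# differ between tmtoolkit versions)
from tmtoolkit.topicmod import tm_gensim

print('evaluating %d topic models' % len(varying_params))
eval_results = tm_gensim.evaluate_topic_models((gnsm_dict, gnsm_corpus),
                                               varying_params, const_params)
eval_results_by_n_topics = results_by_parameter(eval_results, 'num_topics')
plot_eval_results(eval_results_by_n_topics)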
Example #15
logging.basicConfig(level=logging.DEBUG)
tmtoolkit_log = logging.getLogger('tmtoolkit')
tmtoolkit_log.setLevel(logging.DEBUG)
tmtoolkit_log.propagate = True

if __name__ == '__main__':  # this is necessary for multiprocessing on Windows!
    if os.path.exists(DTM_PICKLE):
        print("loading DTM data from pickle file '%s'..." % DTM_PICKLE)

        dtm, vocab, doc_labels = load_dtm_from_pickle(DTM_PICKLE)
    else:
        europarl = nltk.corpus.util.LazyCorpusLoader(
            'europarl_raw', nltk.corpus.EuroparlCorpusReader, fileids=FILEIDS)

        corpus = Corpus(
            {f: europarl.raw(f_id)
             for f, f_id in zip(FILES, FILEIDS)})

        print("all loaded documents:")
        for dl, text in corpus.docs.items():
            print("%s: %d chars" % (dl, len(text)))
        print("-----")

        start_time = time.time()
        preproc = TMPreproc(corpus.docs, language=u'german')
        print('tokenizing...')
        preproc.tokenize()
        print('POS tagging...')
        preproc.pos_tag()
        print('lemmatization...')
        preproc.lemmatize()
Example #16
del bt18pickle, bt18zip

#%% Generate document labels

# format of the document labels: <session_number>_<speech_number>
bt18_data['doc_label'] = ['%s_%s' % (str(sitzung).zfill(3), str(seq).zfill(5))
                          for sitzung, seq in zip(bt18_data.sitzung, bt18_data.sequence)]

print('loaded data frame with %d rows:' % bt18_data.shape[0])
print(bt18_data.head())


#%% Generate a Corpus object to preprocess the raw, untokenized text data

# we use the column "doc_label" as document labels and "text" as raw text
corpus = Corpus(dict(zip(bt18_data.doc_label, bt18_data.text)))
print('created corpus')

print('document lengths in number of characters:')
pprint(corpus.doc_lengths)

# we don't need this anymore, remove it to free memory
del bt18_data

#%% Investigate the set of characters used in the whole corpus

# we can see that there are several "strange" characters and unprintable Unicode
# characters which may later cause trouble
print('set of characters used in the whole corpus:')
pprint(corpus.unique_characters)
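
#%% Remove problematic characters (a possible follow-up, not part of the original excerpt)

# drop the unprintable characters by mapping each of them to None, using the
# replace_characters() API shown in example #2 above; restricting the removal to
# non-whitespace unprintables is a hypothetical choice that keeps newlines intact
chars_to_remove = {c for c in corpus.unique_characters
                   if not (c.isprintable() or c.isspace())}
corpus.replace_characters({c: None for c in chars_to_remove})
pprint(corpus.unique_characters)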
Example #17
def test_corpus_doc_lengths():
    c = Corpus({'a': '1', 'b': '22', 'c': '333'})
    assert isinstance(c.doc_lengths, dict)
    assert c.doc_lengths == {'a': 1, 'b': 2, 'c': 3}
Example #18
def test_corpus_doc_labels():
    c = Corpus({'a': 'Doc A text', 'b': 'Doc B text', 'c': 'Doc C text'})
    assert isinstance(c.doc_labels, list)
    assert list(c.doc_labels) == c.get_doc_labels(sort=True) == list('abc')
Example #19
def test_corpus_n_docs():
    c = Corpus({'a': 'Doc A text', 'b': 'Doc B text', 'c': 'Doc C text'})
    assert c.n_docs == len(c) == 3
Example #20
from tmtoolkit.corpus import Corpus
from tmtoolkit.preprocess import TMPreproc

logging.basicConfig(level=logging.INFO)
tmtoolkit_log = logging.getLogger('tmtoolkit')
tmtoolkit_log.setLevel(logging.INFO)
tmtoolkit_log.propagate = True

#%%

use_paragraphs = len(sys.argv) > 1 and sys.argv[1] == 'paragraphs'

#%%

corpus = Corpus({
    f_id: nltk.corpus.gutenberg.raw(f_id)
    for f_id in nltk.corpus.gutenberg.fileids() if f_id != 'bible-kjv.txt'
})

if use_paragraphs:
    print('using paragraphs as documents')
    corpus.split_by_paragraphs()

print('%d documents' % len(corpus))

#%%

timings = []
timing_labels = []


def add_timing(label):
    # record the current timestamp under the given label
    # (assumed body, requiring `import time`; the excerpt is truncated at this point)
    timings.append(time.time())
    timing_labels.append(label)