# Build the preprocessor from the corpus documents, configured for German,
# then run the preprocessing steps one after another, announcing each step
# on stdout before it starts.
preproc = TMPreproc(corpus.docs, language=u'german')
        print('tokenizing...')
        preproc.tokenize()
        # POS tagging runs before lemmatization; presumably the lemmatizer
        # relies on the POS tags -- confirm against the TMPreproc documentation.
        print('POS tagging...')
        preproc.pos_tag()
        print('lemmatization...')
        preproc.lemmatize()
        print('lowercase transform...')
        preproc.tokens_to_lowercase()
        print('cleaning...')
        preproc.clean_tokens()

        # Report the wall-clock time elapsed since `start_time` (set before
        # this section, outside the lines shown here).
        proc_time = time.time() - start_time
        print('-- processing took %f sec. so far' % proc_time)

        # Persist the preprocessor state to disk so the steps above do not
        # have to be repeated (the path suggests a pickle file).
        preproc.save_state('data/read_preproc_lda_de_state.pickle')

        # Sanity check: print up to 10 randomly chosen entries of
        # `tokens_with_pos_tags` for every document label.
        print('token samples:')
        for dl, tokens in preproc.tokens_with_pos_tags.items():
            print("> %s:" % dl)
            # `sample` (presumably random.sample) raises ValueError when asked
            # for more items than the population holds, so cap the sample size
            # for documents with fewer than 10 entries.
            print(">>", sample(tokens, min(10, len(tokens))))

        # Build the document-term matrix; get_dtm() is unpacked as
        # (document labels, vocabulary, matrix) in that order.
        print('generating DTM...')
        doc_labels, vocab, dtm = preproc.get_dtm()

        # Store the DTM data at DTM_PICKLE. Note that the argument order here
        # (dtm, vocab, doc_labels) differs from get_dtm()'s return order.
        print("saving DTM data to pickle file '%s'..." % DTM_PICKLE)
        save_dtm_to_pickle(dtm, vocab, doc_labels, DTM_PICKLE)

    # Back at the outer indentation level: fit an LDA model with 30 topics
    # and 500 iterations on the DTM.
    # NOTE(review): `dtm` must be bound on every path that reaches this point;
    # presumably another branch loads it from DTM_PICKLE -- confirm.
    print("running LDA...")
    model = lda.LDA(n_topics=30, n_iter=500)
    model.fit(dtm)
예제 #2
0
# Benchmark steps. Every operation is followed by add_timing(<label>), which
# presumably records the time elapsed since the previous call under that
# label -- confirm against its definition.
preproc.expand_compound_tokens()
add_timing('expand_compound_tokens')

preproc.pos_tag()
add_timing('pos_tag')

preproc.lemmatize()
add_timing('lemmatize')

# Copy the preprocessor, then shut down the copy's workers and drop the
# reference before the timing call, so the 'copy' step includes the teardown.
preproc_copy = preproc.copy()
preproc_copy.shutdown_workers()
del preproc_copy
add_timing('copy')

# mkstemp() returns an *open* OS-level file descriptor together with the
# path. Only the path is needed here, so close the descriptor right away
# instead of leaking it for the rest of the run.
import os  # needed only for closing the descriptor below

state_fd, statepickle = mkstemp('.pickle')
os.close(state_fd)
preproc.save_state(statepickle)
add_timing('save_state')

# Restore a preprocessor from the state file written above.
preproc_copy = TMPreproc.from_state(statepickle)
preproc_copy.shutdown_workers()
del preproc_copy
add_timing('from_state')

# Build a preprocessor directly from already-processed tokens with metadata.
preproc_copy = TMPreproc.from_tokens(preproc.tokens_with_metadata,
                                     language='en')
preproc_copy.shutdown_workers()
del preproc_copy
add_timing('from_tokens')

# Same as above, but starting from the datatable representation of the tokens.
preproc_copy = TMPreproc.from_tokens_datatable(preproc.tokens_datatable,
                                               language='en')