def test_corpus_pass_tmpreproc():
    c = Corpus()
    c['doc1'] = 'A simple example in simple English.'
    c['doc2'] = 'It contains only three very simple documents.'
    c['doc3'] = 'Simply written documents are very brief.'

    preproc = TMPreproc(c)
    tok = preproc.tokenize().tokens

    assert set(tok.keys()) == set(c.keys())
    assert len(tok['doc1']) == 7
def evaluate_model(file_name, date, n_iter, scope, lang, n_eval=5):
    # load the corpus from the given file
    corpus = Corpus()
    corpus.add_files(file_name, encoding='utf8')

    # preprocess and build the document-term matrix
    preproc = TMPreproc(corpus)
    dtm_bg = preproc.dtm

    # topic counts to evaluate
    var_params = [{'n_topics': k} for k in range(5, int(n_eval * 10), n_eval)]

    # parameters that stay constant across all evaluated models
    const_params = {
        'n_iter': n_iter,
        'random_state': 20200713  # to make results reproducible
    }

    eval_results = evaluate_topic_models(dtm_bg,
                                         varying_parameters=var_params,
                                         constant_parameters=const_params,
                                         metric=['loglikelihood', 'cao_juan_2009', 'arun_2010']
                                         # return_models=True
                                         )

    # collect the results by number of topics
    eval_results_by_topics = results_by_parameter(eval_results, 'n_topics')

    # plot the evaluation metrics and save the figure
    name = "evaluate_model_{}_{}iter_{}eval_{}_{}.png".format(date, n_iter, n_eval, scope, lang)
    plot_eval_results(eval_results_by_topics,
                      figsize=(8, 6),
                      metric_direction_font_size='x-small',
                      title_fontsize='small',
                      axes_title_fontsize='x-small')
    plt.tight_layout()
    plt.savefig('out/' + name)
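
# A hedged usage sketch (not part of the original script): the corpus file path, date tag and
# scope label below are hypothetical placeholder values for illustration only.
if __name__ == '__main__':
    evaluate_model('data/example_corpus.txt', date='20210101', n_iter=1000,
                   scope='example', lang='de', n_eval=5)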
def _check_save_load_state(preproc, repeat=1, recreate_from_state=False):
    # copy simple attribute states
    simple_state_attrs = ('language', 'stopwords', 'punctuation', 'special_chars', 'n_workers',
                          'tokenized', 'pos_tagged', 'ngrams_generated', 'ngrams_as_tokens')
    pre_state = {attr: deepcopy(getattr(preproc, attr)) for attr in simple_state_attrs}

    # copy complex attribute states
    pre_state['docs'] = deepcopy(preproc.docs)

    if preproc.tokenized:
        pre_state['tokens'] = preproc.tokens
        pre_state['vocabulary'] = preproc.vocabulary

    if preproc.pos_tagged:
        pre_state['tokens_with_pos_tags'] = preproc.tokens_with_pos_tags

    if preproc.ngrams_generated:
        pre_state['ngrams'] = preproc.ngrams

    # save and then load the same state
    for _ in range(repeat):
        if recreate_from_state:
            preproc.save_state(TMPREPROC_TEMP_STATE_FILE)
            preproc = TMPreproc.from_state(TMPREPROC_TEMP_STATE_FILE)
        else:
            preproc.save_state(TMPREPROC_TEMP_STATE_FILE).load_state(TMPREPROC_TEMP_STATE_FILE)

    # check if states are the same now
    for attr in simple_state_attrs:
        assert pre_state[attr] == getattr(preproc, attr)

    assert set(pre_state['docs'].keys()) == set(preproc.docs.keys())
    assert preproc.n_docs == len(pre_state['docs'])
    assert all(pre_state['docs'][k] == preproc.docs[k] for k in preproc.docs.keys())

    if preproc.tokenized:
        assert set(pre_state['tokens'].keys()) == set(preproc.tokens.keys())
        assert all(pre_state['tokens'][k] == preproc.tokens[k] for k in preproc.tokens.keys())
        assert pre_state['vocabulary'] == preproc.vocabulary

    if preproc.pos_tagged:
        assert set(pre_state['tokens_with_pos_tags'].keys()) == set(preproc.tokens_with_pos_tags.keys())
        assert all(pre_state['tokens_with_pos_tags'][k] == preproc.tokens_with_pos_tags[k]
                   for k in preproc.tokens_with_pos_tags.keys())

    if preproc.ngrams_generated:
        assert set(pre_state['ngrams'].keys()) == set(preproc.ngrams.keys())
        assert all(pre_state['ngrams'][k] == preproc.ngrams[k] for k in preproc.ngrams.keys())
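
# A hedged usage sketch: a test might pass a freshly tokenized TMPreproc instance through both
# save/load code paths of the helper above. The test name and the tiny one-document corpus are
# made-up placeholders for illustration only.
def test_save_load_state_roundtrip():
    preproc = TMPreproc({'doc1': 'A tiny example document.'}, language='english')
    preproc.tokenize()
    _check_save_load_state(preproc)
    _check_save_load_state(preproc, recreate_from_state=True)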
An example for preprocessing documents in the German language and generating a document-term matrix (DTM).
"""

from pprint import pprint

import pandas as pd

from tmtoolkit.preprocess import TMPreproc
from tmtoolkit.utils import pickle_data


if __name__ == '__main__':   # this is necessary for multiprocessing on Windows!
    corpus = {
        u'doc1': u'Ein einfaches Beispiel in einfachem Deutsch.',
        u'doc2': u'Es enthält nur drei sehr einfache Dokumente.',
        u'doc3': u'Die Dokumente sind sehr kurz.',
    }

    preproc = TMPreproc(corpus, language='german')

    print('tokenized:')
    preproc.tokenize()
    pprint(preproc.tokens)

    # preproc.stem()
    # pprint(preproc.tokens)

    print('POS tagged:')
    preproc.pos_tag()
    pprint(preproc.tokens_with_pos_tags)

    print('lemmatized:')
    preproc.lemmatize()
    pprint(preproc.tokens_with_pos_tags)
print(corpus.docs.keys())
print("-----")

corpus.split_by_paragraphs()

print("documents split into paragraphs")
print(corpus.docs.keys())
print("-----")

print("first 5 paragraphs of Werther:")
for par_num in range(1, 6):
    doclabel = u'werther-goethe_werther1-%d' % par_num
    print(u"par%d (document label '%s'):" % (par_num, doclabel))
    print(corpus.docs[doclabel])
print("-----")

preproc = TMPreproc(corpus.docs, language=u'german')
preproc.tokenize().tokens_to_lowercase()

print("tokenized first 5 paragraphs of Werther:")
for par_num in range(1, 6):
    doclabel = u'werther-goethe_werther1-%d' % par_num
    print(u"par%d (document label '%s'):" % (par_num, doclabel))
    print(preproc.tokens[doclabel])

preproc.generate_ngrams(2, join=False).use_ngrams_as_tokens(join=True)

print("bigrams from first 5 paragraphs of Werther:")
for par_num in range(1, 6):
    doclabel = u'werther-goethe_werther1-%d' % par_num
    print(u"par%d (document label '%s'):" % (par_num, doclabel))
    print(preproc.tokens[doclabel])
#%%

timings = []
timing_labels = []


def add_timing(label):
    timings.append(datetime.today())
    timing_labels.append(label)


#%%

add_timing('start')

preproc = TMPreproc(corpus, n_max_processes=cpu_count())
add_timing('load')

preproc.tokenize()
add_timing('tokenize')

preproc.expand_compound_tokens()
add_timing('expand_compound_tokens')

preproc.pos_tag()
add_timing('pos_tag')

preproc.lemmatize()
add_timing('lemmatize')

preproc.remove_special_chars_in_tokens()
""" Script that generates "eval_table/eval_table.csv" from text samples in folder "eval_texts". This table is later used to manually add correct lemmata. Markus Konrad <*****@*****.**>, Wissenschaftszentrum Berlin für Sozialforschung January 2019 """ import pandas as pd from tmtoolkit.corpus import Corpus from tmtoolkit.preprocess import TMPreproc corpus = Corpus.from_folder('eval_texts') preproc = TMPreproc(corpus.docs, language='german') postagged = preproc.tokenize().pos_tag() postagged = postagged.filter_for_pos({'N', 'V', 'ADJ', 'ADV'}) tok_pos_df = pd.DataFrame() for doc_id, tok_pos in postagged.tokens_with_pos_tags.items(): tok, pos = zip(*tok_pos) tok_pos_df = tok_pos_df.append(pd.DataFrame({ 'doc_id': doc_id, 'token': tok, 'pos': pos }), ignore_index=True) tok_pos_df.drop_duplicates(['token', 'pos'], inplace=True) tok_pos_df.to_csv('eval_table/eval_table.csv')
    dtm, vocab, doc_labels = load_dtm_from_pickle(DTM_PICKLE)
else:
    europarl = nltk.corpus.util.LazyCorpusLoader('europarl_raw', nltk.corpus.EuroparlCorpusReader,
                                                 fileids=FILEIDS)

    corpus = Corpus({f: europarl.raw(f_id) for f, f_id in zip(FILES, FILEIDS)})

    print("all loaded documents:")
    for dl, text in corpus.docs.items():
        print("%s: %d chars" % (dl, len(text)))
    print("-----")

    start_time = time.time()

    preproc = TMPreproc(corpus.docs, language=u'german')

    print('tokenizing...')
    preproc.tokenize()

    print('POS tagging...')
    preproc.pos_tag()

    print('lemmatization...')
    preproc.lemmatize()

    print('lowercase transform...')
    preproc.tokens_to_lowercase()

    print('cleaning...')
    preproc.clean_tokens()

    proc_time = time.time() - start_time
    print('-- processing took %f sec. so far' % proc_time)

    preproc.save_state('data/read_preproc_lda_de_state.pickle')
from tmtoolkit.topicmod.evaluate import results_by_parameter
from tmtoolkit.topicmod.visualize import plot_eval_results

logging.basicConfig(level=logging.INFO)
tmtoolkit_log = logging.getLogger('tmtoolkit')
tmtoolkit_log.setLevel(logging.INFO)
tmtoolkit_log.propagate = True


print('loading data...')
bt18 = pd.read_pickle('data/bt18_sample_1000.pickle')
print('loaded %d documents' % len(bt18))

doc_labels = [u'%s_%s' % info for info in zip(bt18.sitzung, bt18.sequence)]

print('preprocessing data...')
bt18corp = Corpus(dict(zip(doc_labels, bt18.text)))
preproc = TMPreproc(bt18corp, language='german')
preproc.tokenize().stem().clean_tokens()

doc_labels = list(preproc.tokens.keys())
texts = list(preproc.tokens.values())

print('creating gensim corpus...')
gnsm_dict = gensim.corpora.Dictionary.from_documents(texts)
gnsm_corpus = [gnsm_dict.doc2bow(text) for text in texts]

# evaluate topic models with different parameters
const_params = dict(update_every=0, passes=10)
ks = list(range(10, 140, 10)) + list(range(140, 200, 20))
varying_params = [dict(num_topics=k, alpha=1.0 / k) for k in ks]

print('evaluating %d topic models' % len(varying_params))
tmtoolkit_log.setLevel(logging.INFO)
tmtoolkit_log.propagate = True

random.seed(20200320)

#%%

corpus = Corpus.from_builtin_corpus('en-NewsArticles').sample(1000)
print('%d documents' % len(corpus))

#%%

add_timing('start')

preproc = TMPreproc(corpus, language='en', n_max_processes=cpu_count())
add_timing('load and tokenize')

preproc.expand_compound_tokens()
add_timing('expand_compound_tokens')

preproc.pos_tag()
add_timing('pos_tag')

preproc.lemmatize()
add_timing('lemmatize')

preproc_copy = preproc.copy()
preproc_copy.shutdown_workers()
del preproc_copy
add_timing('copy')
An example for preprocessing documents in the English language and generating a document-term matrix (DTM).
"""

from pprint import pprint

import pandas as pd

from tmtoolkit.preprocess import TMPreproc


if __name__ == '__main__':   # this is necessary for multiprocessing on Windows!
    corpus = {
        'doc1': u'A simple example in simple English.',
        'doc2': u'It contains only three very simple documents.',
        'doc3': u'Simply written documents are very brief.',
    }

    preproc = TMPreproc(corpus, language='english')

    print('input corpus:')
    pprint(corpus)

    print('running preprocessing pipeline...')
    preproc.tokenize().pos_tag().lemmatize().tokens_to_lowercase().clean_tokens()

    print('final tokens:')
    pprint(preproc.tokens)

    print('DTM:')
    doc_labels, vocab, dtm = preproc.get_dtm()

    # using pandas just for a nice tabular output
    print(pd.DataFrame(dtm.todense(), columns=vocab, index=doc_labels))
#%% Correct contractions

# some contractions have a stray space in between, like "EU -Hilfen" where it should be "EU-Hilfen";
# correct this by applying a custom function with a regular expression (RE) to each document in the corpus
pttrn_contraction_ws = re.compile(r'(\w+)(\s+)(-\w+)')

print('correcting wrong contractions')

# in each document text `t`, remove the RE group 2 (the stray white space "(\s+)") for each match `m`
corpus.apply(lambda t: pttrn_contraction_ws.sub(lambda m: m.group(1) + m.group(3), t))

#%% Create a TMPreproc object for token processing

# this takes some time because the documents are directly tokenized
print('creating TMPreproc object from corpus')
preproc = TMPreproc(corpus, language='german')
print('created: %s' % preproc)

# we don't need this anymore, remove it to free memory
del corpus

#%% Calculate the total number of tokens in the whole corpus

print('total number of tokens in the whole corpus:')
print(sum(preproc.doc_lengths.values()))

#%% Have a glimpse at the tokens

# Note that "preproc.tokens_datatable" (*table* instead of *frame*) is much faster, but the "datatable" package is
# still in early development stages. If you'd like to have a pandas dataframe instead, use the property
# "tokens_dataframe".
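
# A minimal sketch of this "glimpse" step, assuming the optional "datatable" package is installed;
# otherwise the pandas-based property "tokens_dataframe" can be printed in the same way.
print(preproc.tokens_datatable)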
def tmpreproc_de():
    return TMPreproc(corpus_de.docs, language='german')


def tmpreproc_en():
    return TMPreproc(corpus_en.docs, language='english')
from tmtoolkit.corpus import Corpus
from tmtoolkit.preprocess import TMPreproc

from examples._benchmarktools import add_timing, print_timings

logging.basicConfig(level=logging.INFO)
tmtoolkit_log = logging.getLogger('tmtoolkit')
tmtoolkit_log.setLevel(logging.INFO)
tmtoolkit_log.propagate = True

#%%

corpus = Corpus.from_builtin_corpus('en-NewsArticles')
print('%d documents' % len(corpus))

#%%

add_timing('start')

preproc = TMPreproc(corpus, language='en', n_max_processes=4)
add_timing('load and tokenize')

preproc.pos_tag()
add_timing('pos_tag')

preproc.lemmatize()
add_timing('lemmatize')

print_timings()
    u'einzig', u'beim', u'darin', u'innerhalb', u'daraus', u'dadurch', u'allerdings'
]

print('preparing corpus...')

corpus = {}
for speech_id, speech in speeches_df.iterrows():
    doc_label = '%d_sess%d_top%d_spk_%s_seq%d' % (speech_id, speech.sitzung, speech.top_id,
                                                  speech.speaker_fp, speech.sequence)
    corpus[doc_label] = speech.text

assert len(corpus) == len(speeches_df)

print('starting preprocessing...')

preproc = TMPreproc(corpus, language='german')
preproc.add_stopwords(CUSTOM_STOPWORDS)
preproc.add_special_chars(CUSTOM_SPECIALCHARS)

print('tokenizing...')
preproc.tokenize()

vocab = preproc.vocabulary

pttrn_token_w_specialchar = re.compile(u'[^A-Za-z0-9ÄÖÜäöüß' + re.escape(string.punctuation) + u']', re.UNICODE)
pttrn_token_w_specialchar_inv = re.compile(u'[A-Za-z0-9ÄÖÜäöüß' + re.escape(string.punctuation) + u']', re.UNICODE)

tokens_w_specialchars = [t for t in vocab if pttrn_token_w_specialchar.search(t)]

uncommon_special_chars = set(