preproc.pos_tag()
add_timing('pos_tag')

preproc.lemmatize()
add_timing('lemmatize')

preproc.remove_special_chars_in_tokens()
add_timing('remove_special_chars_in_tokens')

preproc.tokens_to_lowercase()
add_timing('tokens_to_lowercase')

preproc.clean_tokens()
add_timing('clean_tokens')

preproc.remove_common_tokens(0.9)
preproc.remove_uncommon_tokens(0.05)
add_timing('remove_common_tokens / remove_uncommon_tokens')

vocab = preproc.vocabulary
add_timing('get vocab')

tokens = preproc.tokens
add_timing('get tokens')

tokens_tagged = preproc.get_tokens(with_metadata=True, as_datatables=False)
add_timing('get tagged tokens')

dtm = preproc.get_dtm()
add_timing('get dtm')
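#%% Quick sanity check of the pipeline output
# This cell is a sketch added for illustration, not part of the original
# benchmark: `tokens` maps document labels to token sequences and `dtm` is a
# (sparse) document-term matrix, so the DTM shape should equal
# (number of documents, vocabulary size).
print('vocabulary size:', len(vocab))
print('number of documents:', len(tokens))
print('DTM shape: %d documents x %d vocabulary terms' % dtm.shape)
assert dtm.shape == (len(tokens), len(vocab))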
vocab_doc_freq_df = pd.DataFrame({'token': list(vocab_doc_freq.keys()),
                                  'freq': list(vocab_doc_freq.values())})

print('top 50 tokens by relative document frequency:')
vocab_top = vocab_doc_freq_df.sort_values('freq', ascending=False).head(50)
print(vocab_top)

# plot this
plt.figure()
vocab_top.plot(x='token', y='freq', kind='bar')
plt.show()

#%% Further token cleanup

# we can remove tokens above a certain threshold of (relative or absolute) document frequency
preproc.remove_common_tokens(0.8)   # this will only remove "müssen"

# since we'll later use tf-idf, common words don't have much influence on the result and can
# remain (a short tf-idf sketch follows after the next cell)

#%% Document lengths (number of tokens per document)

doc_labels = np.array(list(preproc.doc_lengths.keys()))
doc_lengths = np.array(list(preproc.doc_lengths.values()))

print('range of document lengths: %d tokens minimum, %d tokens maximum'
      % (np.min(doc_lengths), np.max(doc_lengths)))
print('mean document length:', np.mean(doc_lengths))
print('median document length:', np.median(doc_lengths))

plt.figure()
plt.hist(doc_lengths, bins=100)
plt.title('Histogram of document lengths')
plt.show()
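#%% (Sketch) tf-idf weighting of the DTM
# The cleanup cell above notes that tf-idf will be applied later. This is a
# minimal sketch of that step using scikit-learn's TfidfTransformer; it is an
# assumption for illustration, since the original script may use a different
# tf-idf implementation. The DTM is fetched again (into the hypothetical name
# `dtm_cleaned`) because remove_common_tokens(0.8) above changed the vocabulary.
from sklearn.feature_extraction.text import TfidfTransformer

dtm_cleaned = preproc.get_dtm()
tfidf_dtm = TfidfTransformer().fit_transform(dtm_cleaned)
print('tf-idf weighted DTM shape: %d documents x %d vocabulary terms' % tfidf_dtm.shape)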