# Step-by-step preprocessing walkthrough: every stage mutates `preproc` in
# place and the intermediate result is printed so it can be inspected.
preproc.tokenize()
pprint(preproc.tokens)

# Stemming is left disabled here; uncomment to inspect stemmed tokens.
# preproc.stem()
# pprint(preproc.tokens)

print('POS tagged:')
preproc.pos_tag()
pprint(preproc.tokens_with_pos_tags)

print('lemmatized:')
preproc.lemmatize()
pprint(preproc.tokens_with_pos_tags)

print('lowercase:')
preproc.tokens_to_lowercase()
pprint(preproc.tokens)

# After cleaning, show both views (with and without POS tags).
print('cleaned:')
preproc.clean_tokens()
pprint(preproc.tokens_with_pos_tags)
pprint(preproc.tokens)

# Two filters are applied in sequence: first on the token 'einfach', then on
# the POS tag 'N'.
# NOTE(review): presumably the first keeps only documents containing the
# token (dropping the token itself) and the second keeps only nouns --
# confirm against the TMPreproc documentation.
# The u'' prefix is a no-op on Python 3; kept for Python 2 compatibility.
print('filtered:')
preproc.filter_for_token(u'einfach', remove_found_token=True)
preproc.filter_for_pos('N')
pprint(preproc.tokens_with_pos_tags)

# Persist the final tokens so later scripts can load them from the pickle
# file.
print('saving tokens as pickle...')
pickle_data(preproc.tokens, 'data/preproc_gen_dtm_de_tokens.pickle')
#%% Saving / loading state

# At any time you can save the current processing state to disk via
# `save_state(<path to file>)` and later restore it via
# `from_state(<path to file>)`.
# This is extremely useful for long-running computations: it creates
# "save points" from which you can reload the state and keep experimenting
# with the data without re-running the whole processing pipeline.

# preproc.save_state('data/bt18_tagged_lemmatized_state.pickle')
# preproc = TMPreproc.from_state('data/bt18_tagged_lemmatized_state.pickle')

#%% Further token normalization

# Convert all tokens to lowercase and apply several "cleaning" methods (see
# `clean_tokens` for details). The regex additionally removes every token
# that starts with a hyphen followed by at least one more character.
print('applying further token normalization')
preproc.tokens_to_lowercase().clean_tokens().remove_tokens(r'^-.+', match_type='regex')

print('vocabulary:')
pprint(preproc.vocabulary)
print('\nvocabulary contains %d tokens' % len(preproc.vocabulary))

# There are still some stray tokens which should be removed:
preproc.remove_tokens(['#en', "''", "'s", '+++', '+40', ',50', '...', '.plädieren'])

#%% Let's have a look at the most frequent tokens

print('retrieving document frequencies for all tokens in the vocabulary')
# Mapping of token -> document frequency.
# NOTE(review): the property name suggests a *relative* frequency (share of
# documents containing the token) -- confirm against the tmtoolkit docs.
vocab_doc_freq = preproc.vocabulary_rel_doc_frequency
# Two-column table (token, freq); keys() and values() of the same mapping
# iterate in matching order, so the rows stay aligned.
vocab_doc_freq_df = pd.DataFrame({
    'token': list(vocab_doc_freq.keys()),
    'freq': list(vocab_doc_freq.values()),
})