def timelines(data, args):
    """Plot the importance of each topic over time and save the figures."""
    print('timelines')
    data.date = data.date.apply(lambda x: pd.to_datetime(x, errors='coerce'))
    data = pvtm_utils.extract_time_info(data, 'date')
    print('Extracted datetime information, starting topic importance aggregation..')

    topic_importance_df = pvtm_utils.get_topic_importance_df(args['agg_lvl'], data)
    topic_importance_df.to_csv('{}/timelines_df.csv'.format(args['path']))
    mean_of_means = topic_importance_df.mean().mean()

    savepath = '{}/topics/timelines'.format(args['path'])
    print('Store timelines in folder:', savepath)
    pvtm_utils.check_path(savepath)

    for topic in data.gmm_top_topic.unique():
        # smoothed topic importance (exponentially weighted moving average)
        topic_importance_df.loc[:, topic].ewm(span=3).mean().plot(label='Probability', linestyle=':')
        plt.axhline(topic_importance_df[topic].mean(), c='b',
                    label='Mean Probability current Topic', linestyle=':')
        plt.axhline(mean_of_means, c='r', label='Mean Probability all Topics', linestyle='--')
        plt.plot(np.nan, '-g', label='Number of Documents (right)')  # proxy artist for the twinx axis legend
        plt.legend(loc='best', prop={'size': 8})
        plt.title('Topic importance over time. Topic: {}'.format(topic))
        plt.grid()

        # document counts per time step for the current topic, plotted on a second y-axis
        timesteps = data.sort_values('date')[args['agg_lvl']].unique()
        _list = [pvtm_utils.show_topics_per_choosen_granularity(
                     data, 'gmm_top_topic', [topic], args['agg_lvl'], granular)
                 for granular in timesteps]
        df = pd.concat(_list).fillna(0)
        ax2 = plt.twinx()
        ax2.plot(df.ewm(span=3).mean(), c='g', label='Number of Documents', linestyle='--')
        plt.xlim(topic_importance_df.index[0], topic_importance_df.index[-1])

        file_name = 'timeline_Topic_{}'.format(topic)
        plt.savefig('{}/topics/timelines/{}.pdf'.format(args['path'], file_name), bbox_inches='tight')
        plt.savefig('{}/topics/timelines/{}.png'.format(args['path'], file_name), bbox_inches='tight')
        plt.close()
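# Hedged usage sketch (not part of the original script): `timelines` appears to expect
# a dataframe with 'date' and 'gmm_top_topic' columns and an args dict with 'path' and
# 'agg_lvl' keys. The example values below are assumptions inferred from the attribute
# accesses above, not documented behaviour.
#
# example_args = {'path': 'output', 'agg_lvl': 'year'}
# example_data = pd.DataFrame({
#     'date': ['2015-03-01', '2016-07-12', '2016-09-30'],
#     'gmm_top_topic': [0, 1, 0],
# })
# timelines(example_data, example_args)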
def wordclouds(data, args):
    print('wordclouds..')
    pvtm_utils.check_path('{}/wordclouds'.format(args['path']))

    print('get tf-idf vocabulary..')
    vocabulary = get_vocabulary_from_tfidf(data.text.values,
                                           args['vectorizermin'],
                                           args['vectorizermax'])

    # stopwords
    stopwords = pvtm_utils.get_all_stopwords()
    print('# stopwords:', len(stopwords))

    # popularity based pre-filtering: ignore rare and common words,
    # and drop stopwords and digits.
    print('start pop based prefiltering')
    pp = []
    for i, line in enumerate(data.text.values):
        rare_removed = list(filter(lambda word: word in vocabulary, line.split()))
        stops_removed = [word.strip() for word in rare_removed
                         if word not in stopwords and not word.isdigit()]
        pp.append(stops_removed)
    print('finished pop based prefiltering')
    data['data_clean'] = pp

    # write one word list per topic; transliterate German umlauts for the R wordcloud script
    topicgroup = data.groupby('gmm_top_topic')
    for i, group in topicgroup:
        cc = [word.lower().strip().replace('ä', 'ae').replace('ü', 'ue')
              .replace('ö', 'oe').replace('ß', 'ss')
              for _list in group['data_clean'].values for word in _list]
        with open('{}/wordclouds/topic_{}.txt'.format(args['path'], i), 'w',
                  encoding='utf-8') as textfile:
            textfile.write('\n'.join(cc))

    # create pdf wordclouds
    commands = ["RScript", "wordclouds.R", args['path']]
    subprocess.call(commands)

    # convert the pdfs with Inkscape (Windows cmd `FOR` loop syntax)
    print('Wordclouds to svg..')
    command = 'FOR %A IN ({}\\wordclouds\\*.pdf) DO inkscape %A --export-plain-svg=%A.svg --export-area-drawing'.format(args['path'])
    os.system(command)

    print('Wordclouds to png..')
    command = 'FOR %A IN ({}\\wordclouds\\*.pdf) DO inkscape %A --export-png=%A.png --export-area-drawing -b "white" -d 800'.format(args['path'])
    os.system(command)
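# `get_vocabulary_from_tfidf` is called above but not defined in this section. A minimal
# sketch of what it plausibly does, assuming sklearn's TfidfVectorizer with min_df/max_df
# as the popularity cutoffs (the name and argument order are taken from the call sites
# above; the body is an assumption, not the repo's implementation):
#
# from sklearn.feature_extraction.text import TfidfVectorizer
#
# def get_vocabulary_from_tfidf(texts, min_df, max_df):
#     """Return the set of terms that survive min_df/max_df popularity filtering."""
#     vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df)
#     vectorizer.fit(texts)
#     return set(vectorizer.vocabulary_.keys())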
def similarity_wordclouds(data, args):
    print('similarity wordclouds..')
    pvtm_utils.check_path('{}/wordclouds'.format(args['path']))

    print('get tf-idf vocabulary..')
    vocabulary = get_vocabulary_from_tfidf(data.text.values,
                                           args['vectorizermin'],
                                           args['vectorizermax'])

    # stopwords
    stopwords = pvtm_utils.get_all_stopwords()
    print('# stopwords:', len(stopwords))

    # popularity based pre-filtering: ignore rare and common words,
    # and drop stopwords and digits.
    print('start pop based prefiltering')
    topn = 5000
    topics_numbers = data.gmm_top_topic.unique()
    print(topics_numbers)
    for topic in topics_numbers:
        print(topic)
        path = '{}/wordclouds/topic_{}.txt'.format(args['path'], topic)
        # note: `model` (Doc2Vec) and `center` (GMM cluster centers) are expected
        # to be defined in the enclosing scope
        wordlist_df, words_df = similarity_wordclouds_to_text(
            model, center, topic, path, topn, vocabulary, stopwords)
    print('finished pop based prefiltering')

    # create pdf wordclouds
    commands = ["RScript", "wordclouds.R", args['path']]
    subprocess.call(commands)

    # convert the pdfs with Inkscape (Windows cmd `FOR` loop syntax)
    print('Wordclouds to pdf..')
    command = 'FOR %A IN ({}\\wordclouds\\*.pdf) DO inkscape %A --export-pdf=%A --export-area-drawing -b "white" '.format(args['path'])
    os.system(command)

    print('Wordclouds to svg..')
    command = 'FOR %A IN ({}\\wordclouds\\*.pdf) DO inkscape %A --export-plain-svg=%A.svg --export-area-drawing'.format(args['path'])
    os.system(command)

    print('Wordclouds to png..')
    command = 'FOR %A IN ({}\\wordclouds\\*.pdf) DO inkscape %A --export-png=%A.png --export-area-drawing -b "white" -d 800'.format(args['path'])
    os.system(command)
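# The `FOR %A IN (...) DO inkscape ...` commands above only work in the Windows cmd
# shell. A hedged, cross-platform alternative using glob + subprocess with the same
# Inkscape (pre-1.0) flags; `convert_wordclouds` is a hypothetical helper, not part
# of the original script:
#
# import glob
#
# def convert_wordclouds(path):
#     for pdf in glob.glob('{}/wordclouds/*.pdf'.format(path)):
#         # pdf -> svg
#         subprocess.call(['inkscape', pdf,
#                          '--export-plain-svg={}.svg'.format(pdf),
#                          '--export-area-drawing'])
#         # pdf -> png with white background at 800 dpi
#         subprocess.call(['inkscape', pdf,
#                          '--export-png={}.png'.format(pdf),
#                          '--export-area-drawing', '-b', 'white', '-d', '800'])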
# script-level variant of `timelines` (see above); `out` is the loaded dataframe
# and `agg_lvl` the chosen aggregation level
print('Extracted datetime information, starting topic importance aggregation..')
topic_importance_df = pvtm_utils.get_topic_importance_df(agg_lvl, out)

# reindex to proper datetime timestamps (one per year)
new_index = pd.DatetimeIndex([pd.to_datetime('{}-01-01'.format(t))
                              for t in topic_importance_df.index.values])
topic_importance_df.index = new_index

top_n_trending_topics = pvtm_utils.get_top_n_trending_topics(topic_importance_df, 1, 'gmm_top_topic')

imp_per_my = topic_importance_df.copy()
mean_of_means = imp_per_my.mean().mean()
pvtm_utils.check_path('{}/topics/timelines'.format(args['path']))

for topic in out.gmm_top_topic.unique():
    plt.axhline(imp_per_my[topic].mean(), c='b', label='Mean Probability current Topic', linestyle=':')
    plt.axhline(mean_of_means, c='r', label='Mean Probability all Topics', linestyle='--')
    imp_per_my[topic].plot(label='Probability', linestyle=':')
    plt.plot(np.nan, '-g', label='Number of Documents (right)')  # proxy artist for the twinx axis legend
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import os
import re, json, random
import subprocess
from sklearn.externals import joblib
from sklearn import mixture

# custom functions
import pvtm_utils
import clustering
import doc2vec
import stopwords_generator

pvtm_utils.check_path(args["output"])

# store settings to file for later reference
with open('{}/file.txt'.format(args['output']), 'w') as file:
    file.write(json.dumps(args))  # use `json.loads` to do the reverse

# expand the [start, stop, step] triple into the range of GMM component counts to try
args['gmmrange'] = range(args['gmmrange'][0], args['gmmrange'][1], args['gmmrange'][2])

###################################
#
# Load Model, Data and Stopwords
###################################

# Load the specified data into a dataframe, 'out',
# and load the trained Doc2Vec model (or train a new one, if NEW_Doc2Vec = 1).

# train a new model if specified, otherwise load the pretrained model
if args['d2v_model'] == "":
    print('Training New Doc2Vec Model.')