Example #1
def timelines(data, args):
    print('timelines')
    # Parse dates; unparseable values become NaT.
    data['date'] = pd.to_datetime(data['date'], errors='coerce')
    data = pvtm_utils.extract_time_info(data, 'date')

    print(
        'Extracted datetime information, starting topic importance aggregation..'
    )
    topic_importance_df = pvtm_utils.get_topic_importance_df(
        args['agg_lvl'], data)
    topic_importance_df.to_csv('{}/timelines_df.csv'.format(args['path']))

    # Global baseline: average probability across all topics and timesteps.
    mean_of_means = topic_importance_df.mean().mean()

    savepath = '{}/topics/timelines'.format(args['path'])
    print('Store timelines in folder:', savepath)
    pvtm_utils.check_path(savepath)

    for topic in data.gmm_top_topic.unique():
        topic_importance_df.loc[:, topic].ewm(span=3).mean().plot(
            label='Probability', linestyle=':')
        plt.axhline(topic_importance_df[topic].mean(),
                    c='b',
                    label='Mean Probability current Topic',
                    linestyle=':')
        plt.axhline(mean_of_means,
                    c='r',
                    label='Mean Probability all Topics',
                    linestyle='--')

        plt.plot(np.nan, '-g', label='Number of Documents (right)'
                 )  # proxy artist so the twinx series appears in this legend
        plt.legend(loc='best', prop={'size': 8})
        plt.title('Topic importance over time. Topic: {}'.format(topic))
        plt.grid()

        timesteps = data.sort_values('date')[args['agg_lvl']].unique()
        _list = [
            pvtm_utils.show_topics_per_choosen_granularity(
                data, 'gmm_top_topic', [topic], args['agg_lvl'], granular)
            for granular in timesteps
        ]
        df = pd.concat(_list).fillna(0)

        ax2 = plt.twinx()
        ax2.plot(df.ewm(span=3).mean(),
                 c='g',
                 label='Number of Documents',
                 linestyle='--')
        plt.xlim(topic_importance_df.index[0], topic_importance_df.index[-1])

        file_name = 'timeline_Topic_{}'.format(topic)
        plt.savefig('{}/{}.pdf'.format(savepath, file_name),
                    bbox_inches='tight')
        plt.savefig('{}/{}.png'.format(savepath, file_name),
                    bbox_inches='tight')
        plt.close()
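
The `args` dictionary is never shown in these excerpts. A hypothetical call site inferred from the keys the function reads (the values are illustrative assumptions only):

# Hypothetical usage of timelines(); key names come from the lookups above,
# the values are assumptions.
args = {
    'path': 'output',    # base output directory (assumed)
    'agg_lvl': 'year',   # aggregation column added by pvtm_utils.extract_time_info (assumed)
}
timelines(data, args)    # `data` needs 'date' and 'gmm_top_topic' columns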
Example #2
def wordclouds(data, args):
    print('wordclouds..')
    pvtm_utils.check_path('{}/wordclouds'.format(args['path']))

    print('get tf-idf vocabulary..')
    vocabulary = get_vocabulary_from_tfidf(data.text.values,
                                           args['vectorizermin'],
                                           args['vectorizermax'])
    stopwords = pvtm_utils.get_all_stopwords()
    print('# stopwords:', len(stopwords))

    # Popularity-based pre-filtering: drop rare and common words, stopwords, and digits.
    print('start pop based prefiltering')
    pp = []
    for i, line in enumerate(data.text.values):
        rare_removed = [word for word in line.split() if word in vocabulary]

        stops_removed = [
            word.strip() for word in rare_removed
            if word not in stopwords and not word.isdigit()
        ]
        pp.append(stops_removed)

    print('finished pop based prefiltering')
    data['data_clean'] = pp
    topicgroup = data.groupby('gmm_top_topic')

    for i, group in topicgroup:
        cc = [
            word.lower().strip().replace('ä', 'ae').replace('ü', 'ue').replace(
                'ö', 'oe').replace('ß', 'ss')
            for _list in group['data_clean'].values for word in _list
        ]

        with open('{}/wordclouds/topic_{}.txt'.format(args['path'], i),
                  'w',
                  encoding='utf-8') as textfile:
            textfile.write('\n'.join(cc))

    # create pdf wordclouds
    commands = ["RScript", "wordclouds.R", args['path']]
    subprocess.call(commands)

    print('Wordclouds to svg..')
    # Windows cmd loop; requires Inkscape on PATH.
    command = r'FOR %A IN ({}\wordclouds\*.pdf) DO inkscape %A --export-plain-svg=%A.svg --export-area-drawing'.format(
        args['path'])
    os.system(command=command)

    print('Wordclouds to png..')
    # Windows cmd loop; requires Inkscape on PATH.
    command = r'FOR %A IN ({}\wordclouds\*.pdf) DO inkscape %A --export-png=%A.png --export-area-drawing -b "white" -d 800'.format(
        args['path'])
    os.system(command=command)
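
`get_vocabulary_from_tfidf` is not defined in these excerpts. A minimal sketch of what it plausibly does, assuming it wraps scikit-learn's `TfidfVectorizer` and that `vectorizermin`/`vectorizermax` map to the `min_df`/`max_df` document-frequency cutoffs:

from sklearn.feature_extraction.text import TfidfVectorizer

def get_vocabulary_from_tfidf(texts, min_df, max_df):
    # Fit tf-idf with document-frequency cutoffs and return the surviving
    # terms as a set, so the `word in vocabulary` checks above are O(1).
    vectorizer = TfidfVectorizer(min_df=min_df, max_df=max_df)
    vectorizer.fit(texts)
    return set(vectorizer.vocabulary_)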
Example #3
def similarity_wordclouds(data, args):
    print('similarity wordclouds..')
    pvtm_utils.check_path('{}/wordclouds'.format(args['path']))

    print('get tf-idf vocabulary..')
    vocabulary = get_vocabulary_from_tfidf(data.text.values,
                                           args['vectorizermin'],
                                           args['vectorizermax'])
    stopwords = pvtm_utils.get_all_stopwords()
    print('# stopwords:', len(stopwords))

    # Similarity-based word selection: for each topic, pick the words whose
    # vectors are closest to the topic's cluster center.
    print('start similarity-based word selection')

    topn = 5000
    topics_numbers = data.gmm_top_topic.unique()
    print(topics_numbers)
    for topic in topics_numbers:
        print(topic)
        path = '{}/wordclouds/topic_{}.txt'.format(args['path'], topic)
        # NOTE: `model` (the trained Doc2Vec model) and `center` (the GMM
        # cluster centers) are not defined in this function and must be
        # available in the enclosing scope.
        wordlist_df, words_df = similarity_wordclouds_to_text(
            model, center, topic, path, topn, vocabulary, stopwords)

    print('finished similarity-based word selection')

    # create pdf wordclouds
    commands = ["RScript", "wordclouds.R", args['path']]
    subprocess.call(commands)

    print('Cropping wordcloud PDFs..')
    # Re-export each PDF in place, cropped to the drawing area on a white background.
    command = r'FOR %A IN ({}\wordclouds\*.pdf) DO inkscape %A --export-pdf=%A --export-area-drawing -b "white" '.format(
        args['path'])
    os.system(command=command)

    print('Wordclouds to svg..')
    command = r'FOR %A IN ({}\wordclouds\*.pdf) DO inkscape %A --export-plain-svg=%A.svg --export-area-drawing'.format(
        args['path'])
    os.system(command=command)

    print('Wordclouds to png..')
    command = r'FOR %A IN ({}\wordclouds\*.pdf) DO inkscape %A --export-png=%A.png --export-area-drawing -b "white" -d 800'.format(
        args['path'])
    os.system(command=command)
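
`similarity_wordclouds_to_text` is likewise not shown. One plausible shape, assuming a gensim `Doc2Vec` model and per-topic GMM mean vectors in `center` (every name and call here is an assumption, not the project's confirmed code):

import pandas as pd

def similarity_wordclouds_to_text(model, center, topic, path, topn,
                                  vocabulary, stopwords):
    # Rank the topn word vectors closest to this topic's cluster center.
    similar = model.wv.similar_by_vector(center[topic], topn=topn)
    words_df = pd.DataFrame(similar, columns=['word', 'similarity'])
    # Drop out-of-vocabulary terms, stopwords, and pure digits.
    keep = (words_df.word.isin(vocabulary)
            & ~words_df.word.isin(stopwords)
            & ~words_df.word.str.isdigit())
    wordlist_df = words_df[keep]
    with open(path, 'w', encoding='utf-8') as textfile:
        textfile.write('\n'.join(wordlist_df.word))
    return wordlist_df, words_df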
Example #4
print(
    'Extracted datetime information, starting topic importance aggregation..')
topic_importance_df = pvtm_utils.get_topic_importance_df(agg_lvl, out)
# Rebuild the index as proper datetimes (assumes yearly aggregation).
new_index = pd.DatetimeIndex([
    pd.to_datetime('{}-01-01'.format(t))
    for t in topic_importance_df.index.values
])
topic_importance_df.index = new_index

top_n_trending_topics = pvtm_utils.get_top_n_trending_topics(
    topic_importance_df, 1, 'gmm_top_topic')

imp_per_my = topic_importance_df.copy()
mean_of_means = imp_per_my.mean().mean()

pvtm_utils.check_path('{}/topics/timelines'.format(args['path']))

for topic in out.gmm_top_topic.unique():
    plt.axhline(imp_per_my[topic].mean(),
                c='b',
                label='Mean Probability current Topic',
                linestyle=':')
    plt.axhline(mean_of_means,
                c='r',
                label='Mean Probability all Topics',
                linestyle='--')

    imp_per_my[topic].plot(label='Probability', linestyle=':')

    plt.plot(np.nan, '-g', label='Number of Documents (right)'
             )  # proxy artist so the twinx series appears in this legend
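
The `np.nan` plot is the usual matplotlib workaround for twinx legends: a line drawn on the secondary axis does not appear in the primary axis legend, so an invisible proxy line with matching style is added to the primary axis. A self-contained illustration:

import numpy as np
import matplotlib.pyplot as plt

fig, ax1 = plt.subplots()
ax1.plot([0.2, 0.5, 0.3], ':', label='Probability')
ax1.plot(np.nan, '-g', label='Number of Documents (right)')  # invisible proxy
ax2 = ax1.twinx()
ax2.plot([30, 20, 10], '-g')  # the real right-axis series
ax1.legend(loc='best')
plt.show()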
Example #5
    import pandas as pd
    import matplotlib.pyplot as plt
    import numpy as np
    import re, json, random
    import subprocess
    from sklearn.externals import joblib  # deprecated; in modern scikit-learn: `import joblib`
    from sklearn import mixture

    # custom functions
    import pvtm_utils
    import clustering
    import doc2vec

    import stopwords_generator

    pvtm_utils.check_path(args["output"])

    # store settings to file for later reference
    with open('{}/file.txt'.format(args['output']), 'w') as file:
        file.write(json.dumps(args))  # use `json.loads` to do the reverse

    # Expand [start, stop, step] into the range of GMM component counts to try.
    args['gmmrange'] = range(args['gmmrange'][0], args['gmmrange'][1],
                             args['gmmrange'][2])
    ###################################
    # # Load Model, Data and Stopwords
    ###################################
    # Load the specified data into a dataframe, 'out', and load the trained
    # Doc2Vec model (or train a new one if NEW_Doc2Vec = 1).
    if args['d2v_model'] == "":
        print('Training New Doc2Vec Model.')
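
The excerpt breaks off inside the `if`. For reference, loading a pretrained model in the `else` branch would plausibly use gensim's loader (an assumption; the project's actual loading code is not shown):

from gensim.models.doc2vec import Doc2Vec

def load_pretrained_doc2vec(path):
    # Loads a Doc2Vec model previously persisted with model.save(path).
    return Doc2Vec.load(path)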