Example #1
def change_data_set_format():
    raw_dataset_path = './data/all-comments.csv'
    df = pandas.read_csv(raw_dataset_path, header=0, encoding="ISO-8859-1")

    pandas.set_option('display.max_colwidth', 1000)  # Fully qualified option name
    submission_texts_df = df.Submission_Text
    submission_numbers_df = df.Submission_Number

    cv = CountVectorizer(
        stop_words=stopwords.words('english'))  # Remove some basic stop words (note: `cv` is not used below)
    combined_data = get_combined_data(submission_numbers_df,
                                      submission_texts_df)
    data_df = pandas.DataFrame.from_dict(combined_data).transpose()
    data_df.columns = ['Submission_Text']
    data_df.drop_duplicates(subset='Submission_Text',
                            keep='first',
                            inplace=True)  # Remove duplicated comments
    data_df = data_df.sort_index()
    data_df['Submission_Num'] = data_df.index  # Keep the submission numbers (the index) as an explicit column

    csv_file_result_path = './dataset.csv'
    create_file(csv_file_result_path)
    data_df.to_csv(csv_file_result_path)
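The helpers create_file and get_combined_data are assumed by this example but not shown in the listing. A minimal sketch of what they would need to do for the code above to run (the project's actual versions may differ):

import os


def create_file(path):  # Hypothetical reconstruction: make sure the parent folder exists, then touch the file
    directory = os.path.dirname(path)
    if directory and not os.path.exists(directory):
        os.makedirs(directory)
    open(path, 'a').close()


def get_combined_data(submission_numbers_df, submission_texts_df):
    # Hypothetical reconstruction: map each submission number to its text so that
    # pandas.DataFrame.from_dict(...).transpose() yields one row per submission number.
    return {num: [text] for num, text in zip(submission_numbers_df, submission_texts_df)}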
Example #2
def graph_coherence_scores_alpha(coherence_values, graph_path, fr, to, step):  # This method is not being used
    plt.plot(coherence_values)
    plt.xlabel("Alpha")
    plt.ylabel("Coherence score")
    plt.legend(["coherence_values"], loc='best')  # Pass a list; a bare string would be split into per-character labels
    create_file(graph_path)
    plt.savefig(graph_path)
Example #3
def generate_topic_words():  # output to csv files.
    pbar = tqdm.tqdm(total=len(models_path))
    for i, model_path in enumerate(models_path):
        df = pd.DataFrame()
        lda_model = LdaMallet.load(model_path)
        lda_model = models.wrappers.ldamallet.malletmodel2ldamodel(
            lda_model, iterations=iteration)
        topics_dictionary = lda_model.show_topics(num_topics=num_topics[i],
                                                  num_words=30,
                                                  formatted=False)

        for topic in topics_dictionary:
            topic_num = topic[0] + 1
            terms_string = ''
            for term in topic[1]:
                terms_string += term[0] + ', '
            df = df.append(pd.Series([topic_num, terms_string[:-2]]),
                           ignore_index=True)

        output_path = f'./turn-in/{bigram_threshold}/topic_terms/{num_topics[i]}.csv'
        create_file(output_path)
        df.columns = ['Topic', 'Terms']
        df.sort_values(by=['Topic'], ascending=True,
                       inplace=True)  # Sort rows by topic number in ascending order
        df.to_csv(output_path, index=False)
        pbar.update(1)
    pbar.close()
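This function leans on module-level globals (models_path, num_topics, iteration, bigram_threshold). A hedged sketch of how such a configuration might look; the paths and values below are placeholders, not taken from the project:

# Illustrative module-level configuration (names match the globals used above; values are placeholders)
bigram_threshold = 100
iteration = 1000
num_topics = [10, 15, 20]
models_path = [
    f'./output/models/{bigram_threshold}/mallet_{k}_topics.gensim' for k in num_topics
]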
Example #4
def graph_coherence_scores_num_topics(coherence_values, graph_path, fr, to, step):
    x = range(fr, to, step)
    plt.plot(x, coherence_values)
    plt.xlabel("Num Topics")
    plt.ylabel("Coherence score")
    plt.legend(["coherence_values"], loc='best')  # Pass a list; a bare string would be split into per-character labels
    create_file(graph_path)
    plt.savefig(graph_path)
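A possible call site, assuming one coherence score has already been computed for each topic count in range(fr, to, step); the scores below are placeholders:

coherence_values = [-1.92, -1.85, -1.88, -1.81]  # placeholder scores, one per topic count
graph_coherence_scores_num_topics(coherence_values,
                                  graph_path='./evaluations/coherence_10-22.pdf',
                                  fr=10, to=26, step=4)  # x-axis: 10, 14, 18, 22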
Example #5
def generate_topic_proportion_terms():  # This calculation is based on the dominant topic assigned to each document.
    with open(corpus_path, 'rb') as f:
        corpus = pickle.load(f)

    pbar = tqdm.tqdm(total=len(models_path))
    for i, model_path in enumerate(models_path):
        lda_model = LdaMallet.load(model_path)
        lda_model = models.wrappers.ldamallet.malletmodel2ldamodel(
            lda_model, iterations=iteration)

        df_dominant_topic_document = dominant_topics(lda_model=lda_model,
                                                     corpus=corpus)
        # Number of Documents for Each Topic
        topic_counts = df_dominant_topic_document[
            'Dominant_Topic'].value_counts()

        # Percentage of Documents for Each Topic
        topic_contribution = round(topic_counts / topic_counts.sum(), 4)

        # Topic Nums
        topic_nums = pd.Series(topic_contribution.index,
                               topic_contribution.index)

        topic_terms = pd.Series()
        # Topic Terms
        topics_dictionary = lda_model.show_topics(num_topics=num_topics[i],
                                                  num_words=30,
                                                  formatted=False)
        for topic in topics_dictionary:
            topic_num = topic[0] + 1
            terms_string = ''
            for term in topic[1]:
                terms_string += term[0] + ', '
            topic_terms = topic_terms.append(
                pd.Series(terms_string[:-2], index=[topic_num * 1.0]))

        # Concatenate Column wise
        df_dominant_topics = pd.concat(
            [topic_nums, topic_counts, topic_contribution, topic_terms],
            axis=1)

        # Change Column names
        df_dominant_topics.columns = [
            'Topic', 'Count_Documents', 'Proportion_Over_Documents', 'Terms'
        ]
        df_dominant_topics.sort_values(
            by=['Topic'], ascending=True,
            inplace=True)  # Sort rows by topic number in ascending order

        output_path = f'./turn-in/{bigram_threshold}/topic_proportion_terms/{num_topics[i]}.csv'
        create_file(output_path)
        df_dominant_topics.to_csv(output_path, index=False)
        pbar.update(1)
    pbar.close()
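The dominant_topics helper is not part of this listing. A minimal reconstruction consistent with how it is used above (one row per document, with a 1-based Dominant_Topic value that lines up with the topic_num * 1.0 index used for topic_terms):

def dominant_topics(lda_model, corpus):  # Hypothetical reconstruction, not the project's exact code
    rows = []
    for doc_topics in lda_model[corpus]:
        # Keep the single topic with the highest probability for this document
        best_topic, best_prob = max(doc_topics, key=lambda t: t[1])
        rows.append({'Dominant_Topic': best_topic + 1.0,
                     'Topic_Perc_Contribution': round(best_prob, 4)})
    return pd.DataFrame(rows)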
Example #6
def get_models_log_perplexity_covergence_coherence(paths, num_topics,
                                                   iterations, passes,
                                                   log_path):
    create_file(log_path)
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s',
                        filename=log_path,
                        filemode='a',
                        level=logging.NOTSET)

    with open(paths[1], 'rb') as f:
        corpus = pickle.load(f)  # bag-of-words

    dictionary = Dictionary.load(paths[9])
    temp = dictionary[0]  # This is only to "load" the dictionary.
    id2word = dictionary.id2token

    perplexity_logger = PerplexityMetric(corpus=corpus, logger='shell')
    convergence_logger = ConvergenceMetric(logger='shell')
    coherence_cv_logger = CoherenceMetric(corpus=corpus,
                                          logger='shell',
                                          coherence='u_mass')  # Note: u_mass coherence, despite the "_cv_" in the name

    for iteration in tqdm(iterations):
        logging.debug(f'Start of model: {iteration} iterations')

        # Create model with callbacks argument uses list of created callback loggers
        model = models.ldamodel.LdaModel(corpus=corpus,
                                         id2word=dictionary,
                                         num_topics=num_topics,
                                         eval_every=1,
                                         chunksize=5932,
                                         passes=passes,
                                         random_state=100,
                                         iterations=iteration,
                                         callbacks=[
                                             convergence_logger,
                                             perplexity_logger,
                                             coherence_cv_logger
                                         ])

        logging.debug(f'End of model: {iteration} iterations')
        model_dir = (f"./evaluations/graph_scores/{num_topics}_topics_"
                     f"{iterations[-1]}_iterations_{passes}_passes/models/"
                     f"lda_{iteration}i{passes}p_models/")
        os.makedirs(model_dir, exist_ok=True)
        model.save(model_dir + f"lda_{iteration}i{passes}p.gensim")
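If the installed gensim keeps the callback values on the trained model (its callbacks tutorial stores them under model.metrics), the per-pass scores could be plotted right after the model is saved inside the loop above; a sketch under that assumption:

import matplotlib.pyplot as plt

for metric_name, values in model.metrics.items():  # one list of per-pass values per metric
    plt.figure()
    plt.plot(range(1, len(values) + 1), values)
    plt.xlabel('Pass')
    plt.ylabel(str(metric_name))
    plt.savefig(f'./evaluations/{metric_name}_{iteration}i_{passes}p.pdf')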
Example #7
def save_lda_model(paths,
                   lda_model,
                   num_topics,
                   passes,
                   alpha=None,
                   beta=None):
    # save lda model
    if alpha is not None and beta is not None:
        lda_model_path = f'{paths[16]}{alpha}_alpha_{beta}_beta_{num_topics}-{passes}.gensim'
    else:
        lda_model_path = f'{paths[16]}{num_topics}-{passes}.gensim'
    create_file(lda_model_path)
    lda_model.save(lda_model_path)
Example #8
def print_topic_coherence_to_text_file(text_file_path, num_topics, lda_model, corpus, path_dict):
    create_file(text_file_path)
    # top_topics = lda_model.top_topics(corpus)
    # avg_topic_coherence = sum([topic[1] for topic in top_topics]) / num_topics
    # append_row_to_text_file(
    #     str('Average topic coherence: %.9f\n' % avg_topic_coherence),
    #     text_file_path
    # )

    dictionary = Dictionary.load(path_dict)
    temp = dictionary[0]  # This is only to "load" the dictionary.
    coherence_model_lda = CoherenceModel(model=lda_model,
                                         corpus=corpus,
                                         dictionary=dictionary,  # Pass the Dictionary itself, not its id2token dict
                                         coherence='u_mass')
    coherence_lda = coherence_model_lda.get_coherence()
    append_row_to_text_file(str('Coherence Score: %.9f\n' % coherence_lda),
                            text_file_path)
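u_mass coherence only needs the bag-of-words corpus; the commonly reported c_v variant needs the tokenized texts instead. A sketch, assuming texts holds the same tokenized documents the dictionary was built from:

coherence_model_cv = CoherenceModel(model=lda_model,
                                    texts=texts,  # assumption: list of token lists, not the BoW corpus
                                    dictionary=dictionary,
                                    coherence='c_v')
print('c_v coherence: %.9f' % coherence_model_cv.get_coherence())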
Example #9
def generate_bigram_list():  # output to csv files.
    dictionary = Dictionary.load(get_paths_without_reset('all')[9])
    temp = dictionary[0]  # This is only to "load" the dictionary.
    id2word = dictionary.id2token
    df = pd.DataFrame()
    pbar = tqdm.tqdm(total=len(dictionary.cfs))

    for word_id, count in dictionary.cfs.items():
        if '_' in id2word[word_id]:  # gensim's Phrases joins bigram tokens with an underscore
            df = df.append(pd.Series([id2word[word_id], count]), ignore_index=True)
        pbar.update(1)

    output_path = f'./turn-in/{bigram_threshold}/bigram-list.csv'
    create_file(output_path)
    df.columns = ['Bigram', 'Count']
    df.sort_values(by=['Count', 'Bigram'], ascending=False,
                   inplace=True)  # Sort rows by count (then bigram) in descending order
    df.to_csv(output_path, index=False)
    pbar.close()
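The underscore check works because gensim's Phrases joins bigram tokens with '_'. A minimal sketch of how the dictionary's bigrams were presumably produced, assuming tokenized_docs is the cleaned, tokenized corpus and bigram_threshold is the same module-level value used above:

from gensim.models.phrases import Phrases, Phraser

bigram_model = Phraser(Phrases(tokenized_docs, min_count=5, threshold=bigram_threshold))
docs_with_bigrams = [bigram_model[doc] for doc in tokenized_docs]  # e.g. ['topic', 'model'] -> ['topic_model']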
Example #10
def calculate_mean_std_deviation2(raw_dataset_csv_path, corpus_path,
                                  dictionary_path,
                                  output_path):  # output to csv files.
    with open(corpus_path, 'rb') as f:
        corpus = pickle.load(f)  # Bag of words
    # dictionary = Dictionary.load(dictionary_path)
    raw_dataset = pandas.read_csv(raw_dataset_csv_path)

    raw_word_counts_list = list()
    for doc in raw_dataset['Submission_Text']:  # Get word counts from the raw data
        raw_word_counts_list.append(len(doc.split()))
    raw_word_count_series = pandas.Series(data=raw_word_counts_list)
    # Raw-data statistics (computed here but not written to the CSV below)
    raw_mean = raw_word_count_series.mean()
    raw_std_deviation = raw_word_count_series.std()

    word_counts_list = list()
    for doc in corpus:
        x = 0
        for word in doc:
            x += word[1]
        word_counts_list.append(x)
    word_counts_series = pandas.Series(data=word_counts_list)
    mean = word_counts_series.mean()
    std_deviation = word_counts_series.std()

    if not os.path.exists(output_path):
        create_file(output_path)
        df = pd.DataFrame()
        df = df.append(pd.Series([bigram_threshold, mean, std_deviation]),
                       ignore_index=True)
        df.columns = ['Bigram_Threshold', 'Mean', 'Standard_Deviation']
    else:
        df = pandas.read_csv(output_path, header=0)
        df = df.append(
            {
                'Bigram_Threshold': bigram_threshold,
                'Mean': mean,
                'Standard_Deviation': std_deviation
            },
            ignore_index=True)
    df.to_csv(output_path, index=False)
Example #11
def show_docs_has_entropy_threshold(threshold, num_topics):
    folder_path = f'./turn-in/{bigram_threshold}/model_entropy/'
    dataset = pandas.read_csv(dataset_csv_path)  # Load dataset.csv file

    csv_paths = [folder_path + str(k) + '.csv' for k in num_topics]

    index = 0
    pbar = tqdm.tqdm(total=len(csv_paths))
    while index < len(csv_paths):
        # Note: the folder name hard-codes a 0.2 cut-off regardless of the `threshold` argument
        output_path = f'./turn-in/{bigram_threshold}/docs_entropy_less_than_0.2/{num_topics[index]}.csv'
        create_file(output_path)
        data_df = pd.read_csv(csv_paths[index])
        output_df = pd.DataFrame()
        doc_id = 0
        for entropy_value in data_df.Entropy:
            if entropy_value < threshold:  # Apply threshold
                output_df = output_df.append(pd.Series([
                    str(doc_id), dataset['Submission_Num'][doc_id],
                    data_df.Probabilities[doc_id],
                    str(entropy_value), dataset['Submission_Text'][doc_id]
                ]),
                                             ignore_index=True)
            doc_id = doc_id + 1
        column_names = [
            'Document_No', 'Submission_Num', 'Probabilities', 'Entropy',
            'Submission_Text'
        ]
        if output_df.empty:
            output_df = pd.DataFrame(columns=column_names)
        else:
            output_df.columns = column_names
        output_df.to_csv(output_path, index=False)  # Write the header row in both cases
        index = index + 1
        pbar.update(1)
    pbar.close()
Example #12
def visualize_LDA(paths, num_topics, passes, alpha=None, beta=None):
    dictionary = Dictionary.load(paths[9])
    with open(paths[1], 'rb') as f:
        corpus = pickle.load(f)
    temp = dictionary[0]  # This is only to "load" the dictionary.

    if alpha is not None and beta is not None:
        model_path = f'{paths[16]}{alpha}_alpha_{beta}_beta_{num_topics}-{passes}.gensim'
        html_path = f'{paths[15]}{alpha}_alpha_{beta}_beta_{num_topics}-{passes}.html'
    else:
        model_path = f'{paths[16]}{num_topics}-{passes}.gensim'
        html_path = f'{paths[15]}{num_topics}-{passes}.html'
    lda_model = models.ldamodel.LdaModel.load(model_path)
    lda_display = pyLDAvis.gensim.prepare(
        lda_model, corpus,
        dictionary, sort_topics=True, mds='mmds'
    )

    create_file(html_path)
    pyLDAvis.save_html(lda_display, html_path)
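In pyLDAvis 3.x the gensim helper module was renamed to pyLDAvis.gensim_models; a version-tolerant import that keeps the prepare call above working could look like this:

try:
    import pyLDAvis.gensim as gensimvis  # pyLDAvis < 3.x
except ImportError:
    import pyLDAvis.gensim_models as gensimvis  # pyLDAvis >= 3.x

lda_display = gensimvis.prepare(lda_model, corpus, dictionary, sort_topics=True, mds='mmds')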
Example #13
def calculate_mean_std_deviation(raw_dataset_csv_path, corpus_path,
                                 dictionary_path, output_path):
    with open(corpus_path, 'rb') as f:
        corpus = pickle.load(f)  # Bag of words
    dictionary = Dictionary.load(dictionary_path)

    raw_dataset = pandas.read_csv(raw_dataset_csv_path)
    raw_words_count = 0
    for text in raw_dataset['Submission_Text']:  # Get word counts from the raw data
        raw_words_count += len(text.split())
    raw_mean = raw_words_count / len(raw_dataset['Submission_Text'])

    words_count = 0
    for word_id, word_count in dictionary.cfs.items():  # Get word counts from the cleaned data
        words_count += word_count
    mean = words_count / len(corpus)

    raw_std_deviation = 0  # standard deviation (from raw data)
    for text in raw_dataset['Submission_Text']:
        x = len(text.split())
        raw_std_deviation += (x - raw_mean) * (x - raw_mean)
    raw_std_deviation /= len(raw_dataset['Submission_Text'])
    raw_std_deviation = math.sqrt(raw_std_deviation)

    std_deviation = 0  # standard deviation (from cleaned data)
    for doc in corpus:
        x = 0  # word count for doc
        for word in doc:
            x += word[1]
        std_deviation += (x - mean) * (x - mean)
    std_deviation /= len(corpus)
    std_deviation = math.sqrt(std_deviation)

    if not os.path.exists(output_path):
        create_file(output_path)
    rs_text = f'raw_mean = {raw_mean}\traw_stdDeviation = {raw_std_deviation}\n' \
              f'mean = {mean}\tstd_deviation = {std_deviation}'
    append_row_to_text_file(string=rs_text,
                            path=output_path)  # Output to txt file
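The hand-rolled loops above compute a population standard deviation (divide by N). As a sanity check, the raw-data half is equivalent to this shorter pandas version (ddof=0 matches the divide-by-N above):

lengths = raw_dataset['Submission_Text'].str.split().str.len()
raw_mean_check = lengths.mean()
raw_std_check = lengths.std(ddof=0)  # population standard deviation, as in the loop above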
Example #14
def generate_pyLDAvis_with_models(models_path, num_topics):
    with open(corpus_path, 'rb') as f:
        corpus = pickle.load(f)
    dict_path = './output/dictionary/all.gensim'
    dictionary = Dictionary.load(dict_path)
    tmp = dictionary[0]

    for index, model_path in enumerate(models_path):
        lda_model = models.ldamodel.LdaModel.load(model_path)
        lda_display = pyLDAvis.gensim.prepare(
            lda_model,
            corpus,
            dictionary,
            sort_topics=True,
            mds='mmds'  # , R = 50
        )
        # Note: the trailing '-' yields file names like '20-.html'
        html_path = './turn-in/' + str(num_topics[index]) + '-' + '.html'
        create_file(html_path)
        pyLDAvis.save_html(lda_display, html_path)
Example #15
def generate_topic_weight_terms():
    with open(corpus_path, 'rb') as f:
        corpus = pickle.load(f)

    pbar = tqdm.tqdm(total=len(models_path))
    for i, model_path in enumerate(models_path):
        lda_model = LdaMallet.load(model_path)
        lda_model = models.wrappers.ldamallet.malletmodel2ldamodel(
            lda_model, iterations=iteration)
        df = topics_proportion(lda_model=lda_model,
                               corpus=corpus,
                               num_topics=num_topics[i])
        df.sort_values(by=['Topic'], ascending=True,
                       inplace=True)  # Sort rows by topic number in ascending order

        output_path = f'./turn-in/{bigram_threshold}/topic_proportion_terms/{num_topics[i]}.csv'
        create_file(output_path)
        df.to_csv(output_path, index=False)
        pbar.update(1)
    pbar.close()
Example #16
def coherence_scores_to_csv(models_path, num_topics):
    with open(corpus_path, 'rb') as f:
        corpus = pickle.load(f)

    df = pd.DataFrame()
    output_path = f'./turn-in/{bigram_threshold}/topic_coherence.csv'
    create_file(output_path)

    for index, model_path in enumerate(models_path):
        lda_model = LdaModel.load(model_path)
        if isinstance(lda_model, LdaMallet):
            lda_model = models.wrappers.ldamallet.malletmodel2ldamodel(
                lda_model, iterations=iteration)
        top_topics = lda_model.top_topics(corpus)
        avg_topic_coherence = sum([topic[1] for topic in top_topics]) \
                              / num_topics[index]
        df = df.append(pd.Series([num_topics[index], avg_topic_coherence]),
                       ignore_index=True)
    df.columns = ['Num_Topics', 'Coherence_Score']
    df.to_csv(output_path, index=False)
Example #17
def calculate_entropy_mallet_models():  # output to csv files.
    with open(corpus_path, 'rb') as f:
        corpus = pickle.load(f)

    dataset = pandas.read_csv(dataset_csv_path)
    for index, model_path in enumerate(models_path):
        lda_model = LdaMallet.load(model_path)
        lda_model = models.wrappers.ldamallet.malletmodel2ldamodel(
            lda_model, iterations=iteration)

        df = pd.DataFrame()
        pbar = tqdm.tqdm(total=len(lda_model[corpus]))

        for i, row in enumerate(lda_model[corpus]):
            topic_dist = sorted(row, key=lambda x: (x[1]), reverse=True)
            rs_string = ''
            topic_entropy = 0
            for topic in topic_dist:
                rs_string += f'Topic {topic[0] + 1}: {topic[1]}; '
                # Shannon entropy: -sum(p * log2(p)) over the document's topic distribution
                topic_entropy += -topic[1] * math.log2(topic[1])
            df = df.append(pd.Series([
                str(i), dataset['Submission_Num'][i], rs_string,
                str(topic_entropy), dataset['Submission_Text'][i]
            ]),
                           ignore_index=True)
            pbar.update(1)
        df.columns = [
            'Document_No', 'Submission_Num', 'Probabilities', 'Entropy',
            'Submission_Text'
        ]

        csv_file_result_path = f'./turn-in/{bigram_threshold}/model_entropy/{num_topics[index]}.csv'
        create_file(csv_file_result_path)
        df.to_csv(csv_file_result_path, index=False)
        pbar.close()
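The intended per-document score here appears to be the base-2 Shannon entropy of the topic distribution; scipy offers an equivalent one-liner that could serve as a cross-check, assuming row is the list of (topic_id, probability) pairs for one document:

from scipy.stats import entropy

probs = [p for _, p in row]  # row: (topic_id, probability) pairs for one document
doc_entropy = entropy(probs, base=2)  # -sum(p * log2(p))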
Example #18
def evaluate_LDA_models_multicores(paths, scores_path, corpus, id2word, fr, to,
                                   step, passes):
    alpha, beta = get_alpha_beta_lists()
    models_list = []
    corpus_sets = [  # ClippedCp(corpus, num_of_docs * 0.25), ClippedCp(corpus, num_of_docs * 0.5), ClippedCp(corpus, num_of_docs * 0.75),
        corpus
    ]
    corpus_title = [  #'25% Corpus', '50% Corpus', '75% Corpus',
        '100% Corpus'
    ]  # This matches the corpus_sets list above
    grid = {'Validation_Set': {}}
    model_results = {
        'Validation_Set': [],
        'Topics': [],
        'Alpha': [],
        'Beta': [],
        'Coherence_Score': []
    }
    topics_range = range(fr, to, step)

    if 1 == 1:  # Always-true placeholder condition
        pbar = tqdm.tqdm(
            total=(len(beta) * len(alpha) * len(topics_range) *
                   len(corpus_title)))  # Progress bar to keep track
        for i in range(
                len(corpus_sets)):  # iterate through validation corpora
            for a in alpha:  # iterate through alpha values
                for b in beta:  # iterate through beta values
                    # coherence_values = []  # Empty coherence_values every time for graphing
                    for num_topics in topics_range:  # iterate through number of topics
                        lda_model = get_LDA_model_multi_cores(
                            paths,
                            corpus_sets[i],  # use the selected validation corpus slice
                            id2word,
                            num_topics=num_topics,
                            passes=passes,
                            a=a,
                            b=b)
                        models_list.append(lda_model)

                        text_file_path = paths[13] + str(
                            num_topics) + '-' + str(passes) + '.txt'

                        # visualize to html file
                        visualize_LDA(paths,
                                      num_topics,
                                      passes,
                                      alpha=a,
                                      beta=b)
                        print_topic_coherence_to_text_file(
                            text_file_path, num_topics, lda_model, corpus,
                            paths[9])
                        print_perplexity_to_text_file(text_file_path,
                                                      lda_model, corpus)
                        coherence_model_lda = CoherenceModel(
                            model=lda_model,
                            corpus=corpus,
                            dictionary=id2word,
                            coherence='u_mass')
                        # coherence_values.append(coherence_model_lda.get_coherence())

                        # Save the model results with alpha, beta and coherence score
                        model_results['Validation_Set'].append(corpus_title[i])
                        model_results['Topics'].append(num_topics)
                        model_results['Alpha'].append(a)
                        model_results['Beta'].append(b)
                        model_results['Coherence_Score'].append(
                            coherence_model_lda.get_coherence())
                        pbar.update(1)

                    # Save coherence scores graph; this currently crashes  # FIXME
                    # graph_path = scores_path + str(a) + '_alpha_' + str(b) + '_beta_' + str(fr) + '-' + str(to-1) + '-' + str(passes) + '.pdf'
                    # graph_coherence_scores_num_topics(coherence_values, graph_path, fr, to, step)
        csv_results_path = paths[14] + str(fr) + '-' + str(to - 1) + '-' + str(
            passes) + '.csv'
        create_file(csv_results_path)
        rs = pandas.DataFrame(model_results)
        rs.sort_values(by=['Coherence_Score'], inplace=True, ascending=False)
        rs.to_csv(csv_results_path, index=False)
        pbar.close()
    return models_list
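get_alpha_beta_lists is not shown here; a hypothetical reconstruction, following the usual small grid of document-topic (alpha) and topic-word (beta/eta) priors plus gensim's built-in settings (the exact values are assumptions):

import numpy as np


def get_alpha_beta_lists():  # Hypothetical reconstruction of the helper used above
    alpha = list(np.arange(0.01, 1, 0.3)) + ['symmetric', 'asymmetric']
    beta = list(np.arange(0.01, 1, 0.3)) + ['symmetric']
    return alpha, beta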