Example No. 1
from gensim.models.wrappers import DtmModel  # gensim < 4.0; the wrapper was removed in gensim 4.x


def DTM(path, time_slices, num_topics, corpus, top_words=10):

    """Returns the results of the dynamic topic model and the document-topic matrix.

        Arguments:

            path: The path to the DTM binary.
            time_slices: A sequence giving the number of documents in each time slice.
            num_topics: The number of topics.
            corpus: A collection of texts in BoW format, exposing a .dictionary attribute.
            top_words: The number of top words to keep per topic and time slice.

        Returns:

            dtm_results: A list of lists of lists containing the top words per topic over the time slices.
            doc_topic_matrix: The proportion of the topics for each document.

    """

    # Set up the DTM model (initialized with LDA)
    model = DtmModel(dtm_path=path, corpus=corpus, time_slices=time_slices,
                     num_topics=num_topics, id2word=corpus.dictionary,
                     top_chain_var=0.01, alpha=50 / num_topics, rng_seed=101,
                     initialize_lda=True)

    # Save the DTM model for later use
    model.save('DTM_model')

    # Build a list of lists of lists with the top words for each topic
    dtm_results = []

    for topic in range(num_topics):
        dtm_results.append([[model.show_topic(topicid=topic, time=i, topn=top_words)[j][1]
                             for j in range(top_words)]
                            for i in range(len(time_slices))])

    # Generate the document-topic matrix
    doc_topic_matrix = model.dtm_vis(corpus, time=0)[0]

    return dtm_results, doc_topic_matrix
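
A minimal, hypothetical usage sketch for DTM() follows; the binary path, the toy texts, and the BowCorpus wrapper (which supplies the .dictionary attribute the function expects) are all illustrative assumptions, not part of the original example.

# Hypothetical usage sketch; the path, texts, and BowCorpus are assumptions.
from gensim import corpora

class BowCorpus:
    """Tiny corpus wrapper exposing the .dictionary attribute DTM() expects."""
    def __init__(self, texts):
        self.dictionary = corpora.Dictionary(texts)
        self._bow = [self.dictionary.doc2bow(t) for t in texts]
    def __iter__(self):
        return iter(self._bow)
    def __len__(self):
        return len(self._bow)

texts = [["economy", "growth"], ["growth", "policy"],
         ["policy", "reform"], ["reform", "economy"]]

# Two time slices with two documents each, 2 topics, top 3 words per topic
dtm_results, doc_topic_matrix = DTM(path="/path/to/dtm-linux64",
                                    time_slices=[2, 2], num_topics=2,
                                    corpus=BowCorpus(texts), top_words=3)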
Example No. 2
import os

def dtm_run(data, times, dtm_path, **kw):
    """Run a DTM model, caching the trained model on disk.

    `lda_get_dictionary`, `pickle_load`, `ncomps`, `dtm_defaults`, and
    `PKLDIR` are assumed to be defined at module level.
    """
    sname = kw.pop("name", '_temp_')
    save = kw.pop('save', True)
    d, bow = lda_get_dictionary(data, save=save, name=sname)

    key = f"lda_dtm_{ncomps}_" + sname
    if os.path.exists(os.path.join(PKLDIR, key)):
        return pickle_load(key)
    else:
        mod = DtmModel(dtm_path=dtm_path, corpus=bow, id2word=d,
                       time_slices=times, **dtm_defaults)
        mod.save(os.path.join(PKLDIR, key))
        return mod
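
A plausible shape for the module-level names dtm_run() relies on; every value below is an illustrative assumption, not the original configuration.

# Illustrative module-level configuration assumed by dtm_run(); the values
# here are guesses, not the original settings.
PKLDIR = "./pickles"
ncomps = 20  # topic count used in the cache key
dtm_defaults = {
    "num_topics": ncomps,
    "alpha": 0.01,           # gensim DtmModel defaults shown for reference
    "top_chain_var": 0.005,
    "rng_seed": 0,
    "initialize_lda": True,
}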
Example No. 3
from gensim.models.wrappers import DtmModel
from gensim.models import CoherenceModel

def getCoherenceScores(nTopics):
    """Train a DTM with nTopics topics and return one u_mass coherence score
    per time slice. `path_to_dtm_binary`, `corpus`, `dictionary`, and
    `timeSlice` are assumed to be defined at module level."""
    model = DtmModel(path_to_dtm_binary,
                     corpus=corpus,
                     num_topics=nTopics,
                     id2word=dictionary,
                     time_slices=timeSlice)
    model.save(f'./Models/model{nTopics}Topics')
    # Top words per topic at each time slice, in the format CoherenceModel expects
    wordRepresentationTopics = [
        model.dtm_coherence(time=time) for time in range(len(timeSlice))
    ]
    coherenceModels = [
        CoherenceModel(topics=wordRepresentationTopics[time],
                       corpus=corpus,
                       dictionary=dictionary,
                       coherence='u_mass')
        for time in range(len(timeSlice))
    ]
    coherenceScores = [
        coherenceModels[time].get_coherence()
        for time in range(len(timeSlice))
    ]
    return coherenceScores
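
A hedged sketch of how getCoherenceScores() might drive model selection; the candidate topic counts below are assumptions.

# Hypothetical model-selection loop: average the per-slice u_mass scores
# for each candidate topic count (u_mass is negative; closer to zero is
# generally better) and keep the best one.
candidate_topic_counts = [5, 10, 15, 20]  # illustrative values
mean_scores = {
    n: sum(getCoherenceScores(n)) / len(timeSlice)
    for n in candidate_topic_counts
}
best_n = max(mean_scores, key=mean_scores.get)
print(f"Best topic count by mean u_mass coherence: {best_n}")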
Example No. 4
import datetime
import re

import numpy as np
import pandas as pd
from gensim.models.wrappers import DtmModel as DtmModelClass  # assumed import (gensim < 4.0)


class DtmModel:
    def __init__(self, date_col, time_ref_col, path_to_dtm_binary, dictionary,
                 doc_term_matrix, seed, num_topics, output_file_path, files):

        self.date_col = date_col
        self.time_ref_col = time_ref_col
        self.path_to_dtm_binary = path_to_dtm_binary
        self.dictionary = dictionary
        self.doc_term_matrix = doc_term_matrix
        self.seed = seed
        self.num_topics = num_topics
        self.output_file_path = output_file_path
        self.files = files

        self.time_slice_labels = None
        self.time_slices = None
        self.model = None
        self.topic_df_list = None

    def prepare_data(self, df):

        # Add year column to data frame
        df["year"] = df[self.date_col].apply(lambda x: x.year)

        # Get the time slice labels and the number of documents per slice
        # (note that groupby sorts by key, so slices come out in sorted order)
        self.time_slice_labels = df[self.time_ref_col].unique()
        self.time_slices = df.groupby(self.time_ref_col).size()

        print("Time_slices\n", self.time_slices)

        return df

    def train_model(self):

        # train DTM model
        print("Start time of DTM training: {}".format(datetime.datetime.now()))

        self.model = DtmModelClass(self.path_to_dtm_binary,
                                   corpus=self.doc_term_matrix,
                                   id2word=self.dictionary,
                                   time_slices=self.time_slices,
                                   num_topics=self.num_topics,
                                   rng_seed=self.seed)

        print("End time of DTM training: {}".format(datetime.datetime.now()))

    def save_model(self):

        # Save to file
        self.model.save(self.output_file_path)

        print(f"Dynamic topic model saved to {self.output_file_path}")

    def load_model(self):

        # Load model
        self.model = DtmModelClass.load(self.output_file_path)

        print(f"Model loaded from {self.output_file_path}")

    def top_term_table(self, topic, slices, topn=10):
        """Returns a dataframe with the top n terms in the topic for each of
        the given time slices."""

        data = {
            "Topic_ID": [topic] * topn,
            "Word_Rank": list(range(topn))
        }
        for time_slice in slices:
            time = np.where(self.time_slice_labels == time_slice)[0][0]
            data[time_slice] = [
                term for p, term in self.model.show_topic(
                    topic, time=time, topn=topn)
            ]
        df = pd.DataFrame(data)
        return df

    def get_doc_topics(self, doc_term_matrix, df_agg):

        # Get the topic distribution of each document (at time slice 0)
        doc_topic, topic_term, doc_lengths, term_frequency, vocab = self.model.dtm_vis(
            doc_term_matrix, 0)

        # Most probable topic per document
        doc_topic_no = [np.argmax(array) for array in doc_topic]

        # Create the document-topic matrix; derive the column names from
        # self.num_topics instead of hard-coding a list of 20 topics
        topic_cols = [f"topic_{i}" for i in range(self.num_topics)]

        df_doc_topic = pd.DataFrame(doc_topic, columns=topic_cols)
        df_doc_topic["topic_no"] = doc_topic_no

        df_output = pd.concat([df_agg, df_doc_topic], axis=1)

        return df_output

    def generate_topic_tables(self):
        """
        Generate a list with a data frame for each topic, where rows denote a word and columns a time slice.
        :param files: Needed for the column names of the data frames
        :return: List of data frames for each topic
        """

        time_slices = self.files

        # topic_df_list = []

        # Gather data for each words in each topic in each time slice
        all_topics = []

        # For each time slice
        for time_id in range(len(time_slices)):

            def safe_div(x, y):
                if y == 0:
                    return 0
                return x / y

            time = time_slices[time_id]

            # Create data frame with dummy column having the length of the vocab
            # df_topic = pd.DataFrame([0] * len(vocab))

            # Get all topic-word distributions for time slice i
            _, topic_term, _, _, vocab = self.model.dtm_vis(
                self.doc_term_matrix, time_id)

            for topic_id in range(len(topic_term)):

                # Topic-word distribution for one topic at this time slice
                topic_at_time_slice = topic_term[topic_id]

                # For each word in this topic
                for word_id in range(len(topic_at_time_slice)):

                    # Gather all data records
                    data_word = vocab[word_id]
                    data_topic = topic_id
                    data_time = time
                    data_time_no = time_id
                    data_load = topic_at_time_slice[word_id]

                    # Difference of the word's load relative to the previous
                    # time slice; the record one full slice back (all topics
                    # times all words) holds the previous load at index 4
                    if data_time == time_slices[0]:
                        data_dif = 0
                        data_dif_big = 0
                        data_dif_fraq = 0
                    else:
                        data_load_prev = all_topics[len(all_topics) -
                                                    (len(topic_at_time_slice) *
                                                     len(topic_term))][4]
                        data_dif = data_load - data_load_prev
                        data_dif_fraq = safe_div(data_dif, data_load_prev)

                        data_dif_big = data_dif * 100000

                    data = [
                        data_word, data_topic, data_time, data_time_no,
                        data_load, data_dif_big, data_dif_fraq
                    ]
                    all_topics.append(data)

            print(f"Finished gathering data from time slice {time}\n")

        df_output = pd.DataFrame(all_topics,
                                 columns=[
                                     "word", "topic", "time", "time_no",
                                     "load", "dif_e5", "dif_fraq"
                                 ])

        return df_output
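
    # A hedged usage note: the long-format frame returned above can be
    # pivoted into one word-by-time matrix per topic, e.g. (names assumed):
    #
    #   df_long = dtm.generate_topic_tables()
    #   wide = df_long[df_long["topic"] == 0].pivot_table(
    #       index="word", columns="time_no", values="load")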

    def generate_topic_detail_tables(self):
        """
        Generate a list with a data frame for each topic, where rows denote a word and columns a time slice.
        :param files: Needed for the column names of the data frames
        :return: List of data frames for each topic
        """

        time_slices = self.files

        topic_df_list = []

        # Gather data for each words in each topic in each time slice

        _, topic_term, _, _, vocab = self.model.dtm_vis(
            self.doc_term_matrix, 0)

        for topic_id in range(len(topic_term)):

            # Create a data frame with a dummy column of vocabulary length
            df_topic = pd.DataFrame([0] * len(vocab))

            # For each time slice
            for time_id in range(len(time_slices)):

                # Get all topic-word distributions for this time slice
                _, topic_term, _, _, vocab = self.model.dtm_vis(
                    self.doc_term_matrix, time_id)

                # Topic-word distribution for one topic at this time slice
                topic_at_time_slice = topic_term[topic_id]

                df_topic[time_slices[time_id]] = topic_at_time_slice

            df_topic.index = vocab
            df_topic = df_topic.drop(columns=[0])
            df_topic["topic"] = topic_id

            print(f"Finished gathering data for topic {topic_id}")

            file_path = f"output/topics/topic_{topic_id}.csv"
            df_topic.to_csv(file_path)

            print(f"Topic detail data frame written to {file_path}")

            topic_df_list.append(df_topic)

        self.topic_df_list = topic_df_list

    # def write_topic_df_to_excel(self, file_path):
    #
    #     # Create a Pandas Excel writer using XlsxWriter as the engine.
    #     writer = pd.ExcelWriter(file_path, engine='xlsxwriter')
    #
    #     # Write each topic dataframe to a different worksheet.
    #     for i in range(len(self.topic_df_list)):
    #         self.topic_df_list[i].to_excel(writer, sheet_name=f'topic_{i}')
    #
    #         print(f"Topic {i} written to excel sheet")
    #
    #     # Close the Pandas Excel writer and output the Excel file.
    #     writer.save()
    #
    #     print(f"Topic dataframes written to excel file under {file_path}")

    def calculate_word_dif(self, folder_path):
        """Read the per-topic CSVs, compute first and second differences of
        the word probabilities between consecutive time slices, and write
        topic_dif_<id>.csv files."""

        for topic_id in range(self.num_topics):
            df = pd.read_csv(folder_path + f"topic_{topic_id}.csv",
                             index_col=0)

            # Difference of the word probabilities to the previous time slice
            for i in range(len(self.files) - 1):
                df[f"dif_{self.files[i + 1]}"] = df[self.files[i + 1]] - df[
                    self.files[i]]

                # Difference of those differences to the previous time slice
                if i > 0:
                    df[f"dif_dif_{self.files[i + 1]}"] = df[
                        f"dif_{self.files[i + 1]}"] - df[f"dif_{self.files[i]}"]

            # Reorder the columns (assumes the time slices in self.files are
            # the framework programme names below)
            columns = [
                "fp1_projects", "fp2_projects", "fp3_projects", "fp4_projects",
                "fp5_projects", "fp6_projects", "fp7_projects",
                "h2020_projects", "dif_fp2_projects", "dif_fp3_projects",
                "dif_fp4_projects", "dif_fp5_projects", "dif_fp6_projects",
                "dif_fp7_projects", "dif_h2020_projects",
                "dif_dif_fp3_projects", "dif_dif_fp4_projects",
                "dif_dif_fp5_projects", "dif_dif_fp6_projects",
                "dif_dif_fp7_projects", "dif_dif_h2020_projects", "topic"
            ]
            df = df[columns]

            output_file_path = folder_path + f"topic_dif_{topic_id}.csv"
            df.to_csv(output_file_path)

            print(
                f"Finished calculating differences and created file {output_file_path}"
            )

    def construct_final_topic_data(self, folder_path):
        """Combine the per-topic difference CSVs into one long-format
        all_topics.csv."""

        df_ref = pd.read_csv(folder_path + "topic_dif_0.csv", index_col=0)

        vocab = df_ref.index

        time_slices = self.files

        topic_list = []

        for topic_id in range(self.num_topics):

            df = pd.read_csv(folder_path + f"topic_dif_{topic_id}.csv",
                             index_col=0)
            df = df.dropna()

            for word in vocab:
                # Account for the nan word, which is dropped above and
                # would raise a KeyError on lookup
                try:
                    for i in range(len(time_slices)):
                        time = time_slices[i]

                        load = df.loc[word, time]

                        if i == 0:
                            dif = 0
                        else:
                            dif = df.loc[word, "dif_" + time]

                        data = [word, topic_id, time, i, load, dif]
                        topic_list.append(data)
                except KeyError:
                    print(f"Error at topic {topic_id} and word {word}")

            print(f"Finished reformatting topic {topic_id}")

        df_output = pd.DataFrame(
            topic_list,
            columns=["word", "topic", "time", "time_no", "load", "dif"])

        df_output.to_csv(folder_path + "all_topics.csv")

    def generate_project_topic_table(self, df_raw):

        # Derive the topic column names from self.num_topics instead of
        # hard-coding a list of 20 topics
        columns = [f"topic_{i}" for i in range(self.num_topics)]
        multicol1 = pd.MultiIndex.from_tuples([("topic", col)
                                               for col in columns])

        # Set rcn as index
        df_raw = df_raw.set_index("rcn")

        # Only keep topic columns
        df_flat = df_raw[columns]

        # Create multi index data frame
        df_flat_multi = pd.DataFrame(df_flat.values,
                                     index=df_flat.index,
                                     columns=multicol1)

        # Stack data frame
        df_stacked = df_flat_multi.stack()

        # Set rcn as single index
        df_stacked = df_stacked.reset_index().set_index("rcn")

        # Rename columns
        df_stacked = df_stacked.rename(columns={
            "level_1": "topic",
            "topic": "load"
        })

        # Remove the "topic_" prefix from the topic column
        def remove_prefix(text):
            return re.sub("topic_", "", text)

        df_stacked["topic"] = df_stacked["topic"].apply(remove_prefix)

        # Join project information and make rcn normal column again
        df_project_info = df_raw[["startDate", "fp", "fp_no", "title"]]
        df_project_topics = df_stacked.join(df_project_info,
                                            how="left").reset_index()

        return df_project_topics
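
Finally, a hedged end-to-end sketch of driving the wrapper class above; the toy data frame, the dictionary/corpus construction, and all paths are illustrative assumptions, not the original pipeline.

# Hypothetical end-to-end usage of the wrapper class; data, paths, and
# column values are assumptions.
from gensim import corpora

df = pd.DataFrame({
    "startDate": pd.to_datetime(["1999-03-01", "2004-06-01"]),
    "fp": ["fp5_projects", "fp6_projects"],
    "text": [["energy", "policy", "grid"], ["climate", "policy", "adaptation"]],
})

dictionary = corpora.Dictionary(df["text"])
doc_term_matrix = [dictionary.doc2bow(doc) for doc in df["text"]]

dtm = DtmModel(date_col="startDate", time_ref_col="fp",
               path_to_dtm_binary="/path/to/dtm-linux64",
               dictionary=dictionary, doc_term_matrix=doc_term_matrix,
               seed=42, num_topics=2, output_file_path="./dtm_model",
               files=["fp5_projects", "fp6_projects"])

df = dtm.prepare_data(df)
dtm.train_model()
dtm.save_model()
print(dtm.top_term_table(topic=0, slices=["fp5_projects"], topn=3))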