Example #1
def main():
    trace('---train topics---', config.log_file)
    model = DtmModel(dtm_path,
                     corpus=gensim_data.corpus,
                     id2word=gensim_data.dictionary,
                     time_slices=train_set.time_slices[:-1],
                     num_topics=config.z_dim,
                     lda_sequence_min_iter=50,
                     lda_sequence_max_iter=config.epochs)
    trace('---model trained---', config.log_file)
    # Inspect a sample topic at the first time slice
    sample_topic = model.dtm_coherence(time=0, num_words=10)
    trace('sample topic is like: {}'.format(' '.join(sample_topic[0])),
          config.log_file)

    # Topic-word distributions for every topic and time slice
    tw_nps = model.show_topics(num_topics=config.z_dim,
                               times=-1,
                               num_words=train_set.vocab_size(),
                               formatted=False)

    T = len(train_set.time_slices[:-1])  # number of training time slices
    TWmatrix, TWlist, COHs, PPLs, avg_COHs = [], [], [], [], []
    for t in range(T):
        # topics in time t
        tw_np = tw_nps[t * config.z_dim:(t + 1) * config.z_dim]

        tw_np = get_topic_np(tw_np, config.z_dim,
                             gensim_data.dictionary.token2id)
        tw_tensor = torch.from_numpy(tw_np)
        tw_list_t = get_tw_list(tw_tensor, gensim_data.dictionary)

        # coh
        cohs_t = get_cohs(tw_list_t)
        p = ppl(gensim_data.test, tw_tensor)

        TWmatrix.append(tw_np)
        TWlist.append(tw_list_t)
        COHs.append(cohs_t)
        PPLs.append(p)

        avg_COHs.append(sum(cohs_t) / len(cohs_t))

        seg = '---------- topics in time {}/{} ----------'.format(t + 1, T)
        display_topics(tw_list=tw_list_t,
                       cohs=cohs_t,
                       head='topics',
                       seg=seg,
                       file=config.topic_file)
        trace('topic results (coherence) written.', file=config.log_file)

    p_file = os.path.join(config.output_path, 'ppl.jpg')
    draw_ppl(PPLs, title='perplexities over time', file=p_file)
    a_file = os.path.join(config.output_path, 'avg_coh.jpg')
    draw_ppl(avg_COHs, title='avg coherence over time', file=a_file)
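
This example leans on several project-specific helpers (trace, get_topic_np, get_tw_list, get_cohs, ppl, display_topics, draw_ppl) that are not shown. A minimal sketch of trace, assuming it just echoes a message to stdout and appends it to a log file:

def trace(msg, file=None):
    # Hypothetical helper: print the message and append it to a log file.
    print(msg)
    if file is not None:
        with open(file, 'a') as f:
            f.write(str(msg) + '\n')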
Example #2
def dtm_run(data, times, dtm_path, **kw):
    """Run DTM model."""
    sname = kw.pop("name", '_temp_')
    save = kw.pop('save', True)
    d, bow = lda_get_dictionary(data, save=save, name=sname)

    key = f"lda_dtm_{ncomps}_" + sname
    if os.path.exists(os.path.join(PKLDIR, key)):
        return pickle_load(key)
    else:
        mod = DtmModel(dtm_path=dtm_path, corpus=bow, id2word=d,
                       time_slices=times, **dtm_defaults)
        mod.save(os.path.join(PKLDIR, key))
        return mod
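
The snippet depends on module-level globals (PKLDIR, ncomps, dtm_defaults, pickle_load) defined elsewhere in the repository; Example #12 shows ncomps = dtm_defaults['num_topics']. A plausible shape, inferred from the filename lda_dtm_15_title_abstract loaded in Example #12:

dtm_defaults = {'num_topics': 15}  # assumption; the real dict may carry more DtmModel kwargs
ncomps = dtm_defaults['num_topics']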
Example #3
def DTM(path, time_slices, num_topics, corpus):
    """Returns the results of the dynamic topic model and the document-topic matrix.

        Arguments:

            path: The path to the binary dtm.
            time_slices: A sequence of timestamps.
            num_topics: The number of topics.
            corpus: A collection of texts in bow format.

        Returns:

            dtm_results: A list of lists of lists containing the results over the time slices.
            doc_topic_matrix: The proportion of the topics for each document.

    """

    # Set up the DTM model, initializing it with LDA
    model = DtmModel(dtm_path=path, corpus=corpus, time_slices=time_slices,
                     num_topics=num_topics, id2word=corpus.dictionary,
                     top_chain_var=0.01, alpha=50 / num_topics, rng_seed=101,
                     initialize_lda=True)
    # Save the DTM model for later use
    model.save('DTM_model')

    # Create a list of lists of lists of the top words for each topic
    top_words = 10  # assumed value; top_words was not defined in the original snippet
    dtm_results = []

    for topic in range(num_topics):
        dtm_results.append([[model.show_topic(topicid=topic, time=i, topn=top_words)[j][1]
                             for j in range(top_words)]
                            for i in range(len(time_slices))])

    # Generate the document-topic matrix (topic proportions per document at time 0)
    doc_topic_matrix = model.dtm_vis(corpus, time=0)[0]

    return dtm_results, doc_topic_matrix
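
Hypothetical usage, assuming tokenized documents and gensim's DTMcorpus (which provides the .dictionary attribute the function relies on; see also Example #8):

from gensim.models.wrappers.dtmmodel import DTMcorpus

docs = [['topic', 'models', 'evolve'], ['dynamic', 'topic', 'models']]
corpus = DTMcorpus(docs)
dtm_results, doc_topic_matrix = DTM('/path/to/dtm/main', time_slices=[1, 1],
                                    num_topics=2, corpus=corpus)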
Example #4
    def train_model(self):

        # train DTM model
        print("Start time of DTM training: {}".format(datetime.datetime.now()))

        self.model = DtmModelClass(self.path_to_dtm_binary,
                                   corpus=self.doc_term_matrix,
                                   id2word=self.dictionary,
                                   time_slices=self.time_slices,
                                   num_topics=self.num_topics,
                                   rng_seed=self.seed)

        print("End time of DTM training: {}".format(datetime.datetime.now()))
Example #5
def getCoherenceScores(nTopics):
    model = DtmModel(path_to_dtm_binary,
                     corpus=corpus,
                     num_topics=nTopics,
                     id2word=dictionary,
                     time_slices=timeSlice)
    model.save(f'./Models/model{nTopics}Topics')
    wordRepresentationTopics = [
        model.dtm_coherence(time=time) for time in range(len(timeSlice))
    ]
    coherenceModels = [
        CoherenceModel(topics=wordRepresentationTopics[time],
                       corpus=corpus,
                       dictionary=dictionary,
                       coherence='u_mass')
        for time in range(len(timeSlice))
    ]
    coherenceScores = [
        coherenceModels[time].get_coherence()
        for time in range(len(timeSlice))
    ]
    return coherenceScores
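
One way this helper might be used: sweep candidate topic counts and compare mean u_mass coherence across time slices (a sketch; names as in the snippet):

for nTopics in (5, 10, 15):
    scores = getCoherenceScores(nTopics)
    print(nTopics, sum(scores) / len(scores))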
Example #6
    def load_model(self):

        # Load model
        self.model = DtmModelClass.load(self.output_file_path)

        print(f"Model loaded from {self.output_file_path}")
Example #7
class DtmModel:
    def __init__(self, date_col, time_ref_col, path_to_dtm_binary, dictionary,
                 doc_term_matrix, seed, num_topics, output_file_path, files):

        self.date_col = date_col
        self.time_ref_col = time_ref_col
        self.path_to_dtm_binary = path_to_dtm_binary
        self.dictionary = dictionary
        self.doc_term_matrix = doc_term_matrix
        self.seed = seed
        self.num_topics = num_topics
        self.output_file_path = output_file_path
        self.files = files

        self.time_slice_labels = None
        self.time_slices = None
        self.model = None
        self.topic_df_list = None

    def prepare_data(self, df):

        # Add year column to data frame
        df["year"] = df[self.date_col].apply(lambda x: x.year)

        # Get time slice labels
        self.time_slice_labels = df[self.time_ref_col].unique()
        self.time_slices = df.groupby(self.time_ref_col).size()

        print("Time_slices\n", self.time_slices)

        return df

    def train_model(self):

        # train DTM model
        print("Start time of DTM training: {}".format(datetime.datetime.now()))

        self.model = DtmModelClass(self.path_to_dtm_binary,
                                   corpus=self.doc_term_matrix,
                                   id2word=self.dictionary,
                                   time_slices=self.time_slices,
                                   num_topics=self.num_topics,
                                   rng_seed=self.seed)

        print("End time of DTM training: {}".format(datetime.datetime.now()))

    def save_model(self):

        # Save to file
        self.model.save(self.output_file_path)

        print(f"Dynamic topic model saved to {self.output_file_path}")

    def load_model(self):

        # Load model
        self.model = DtmModelClass.load(self.output_file_path)

        print(f"Model loaded from {self.output_file_path}")

    def top_term_table(self, topic, slices, topn=10):
        """Returns a dataframe with the top n terms in the topic for each of
        the given time slices."""

        data = {
            "Topic_ID": [topic] * topn,
            "Word_Rank": list(range(topn))
        }
        for time_slice in slices:
            time = np.where(self.time_slice_labels == time_slice)[0][0]
            data[time_slice] = [
                term for p, term in self.model.show_topic(
                    topic, time=time, topn=topn)
            ]
        df = pd.DataFrame(data)
        return df

    def get_doc_topics(self, doc_term_matrix, df_agg):

        # Get topic assignment for each document
        doc_topic, topic_term, doc_lengths, term_frequency, vocab = self.model.dtm_vis(
            doc_term_matrix, 0)

        # Create topic label vector
        doc_topic_no = [np.argmax(array) for array in doc_topic]

        # Create document-topic matrix column names, one per topic
        topic_cols = [f"topic_{i}" for i in range(self.num_topics)]

        df_doc_topic = pd.DataFrame(doc_topic, columns=topic_cols)
        df_doc_topic["topic_no"] = doc_topic_no

        df_output = pd.concat([df_agg, df_doc_topic], axis=1)

        return df_output

    def generate_topic_tables(self):
        """
        Generate a list with a data frame for each topic, where rows denote a word and columns a time slice.
        :param files: Needed for the column names of the data frames
        :return: List of data frames for each topic
        """

        time_slices = self.files

        # topic_df_list = []

        # Gather data for each word in each topic in each time slice
        all_topics = []

        def safe_div(x, y):
            return 0 if y == 0 else x / y

        # For each time slice
        for time_id in range(len(time_slices)):

            time = time_slices[time_id]

            # Create data frame with dummy column having the length of the vocab
            # df_topic = pd.DataFrame([0] * len(vocab))

            # Get all topic-word distributions for time slice i
            _, topic_term, _, _, vocab = self.model.dtm_vis(
                self.doc_term_matrix, time_id)

            for topic_id in range(len(topic_term)):

                # Topic-word distribution for one topic at time slice i
                topic_at_time_slice = topic_term[topic_id]

                # For each word in this topic
                for word_id in range(len(topic_at_time_slice)):

                    # Gather all data records
                    data_word = vocab[word_id]
                    data_topic = topic_id
                    data_time = time
                    data_time_no = time_id
                    data_load = topic_at_time_slice[word_id]

                    # Difference of the word load relative to the previous time slice
                    if data_time == time_slices[0]:
                        data_dif = 0
                        data_dif_big = 0
                        data_dif_fraq = 0
                    else:
                        # The record for the same word and topic in the previous
                        # slice sits exactly vocab_size * num_topics rows back;
                        # index 4 is its "load" field
                        data_load_prev = all_topics[len(all_topics) -
                                                    (len(topic_at_time_slice) *
                                                     len(topic_term))][4]
                        data_dif = data_load - data_load_prev
                        data_dif_fraq = safe_div(data_dif, data_load_prev)

                        data_dif_big = data_dif * 100000

                    data = [
                        data_word, data_topic, data_time, data_time_no,
                        data_load, data_dif_big, data_dif_fraq
                    ]
                    all_topics.append(data)

            print(f"Finished gathering data from time slice {time}\n")

        df_output = pd.DataFrame(all_topics,
                                 columns=[
                                     "word", "topic", "time", "time_no",
                                     "load", "dif_e5", "dif_fraq"
                                 ])

        return df_output

    def generate_topic_detail_tables(self):
        """
        Generate a list with a data frame for each topic, where rows denote a word and columns a time slice.
        :param files: Needed for the column names of the data frames
        :return: List of data frames for each topic
        """

        time_slices = self.files

        topic_df_list = []

        # Gather data for each word in each topic in each time slice

        _, topic_term, _, _, vocab = self.model.dtm_vis(
            self.doc_term_matrix, 0)

        for topic_id in range(len(topic_term)):

            # Create data frame with dummy column having the length of the vocab
            df_topic = pd.DataFrame([0] * len(vocab))

            # For each time slice
            for time_id in range(len(time_slices)):

                # Get all topic-word distributions for time slice i
                _, topic_term, _, _, vocab = self.model.dtm_vis(
                    self.doc_term_matrix, time_id)

                # Topic-word distribution for one topic at time slice i
                topic_at_time_slice = topic_term[topic_id]

                df_topic[time_slices[time_id]] = topic_at_time_slice

            df_topic.index = vocab
            df_topic = df_topic.drop(columns=[0])
            df_topic["topic"] = topic_id

            print(f"Finished gathering data for topic {topic_id}")

            file_path = f"output/topics/topic_{topic_id}.csv"
            df_topic.to_csv(file_path)

            print(f"Topic detail data frame written to {file_path}")

            topic_df_list.append(df_topic)

        self.topic_df_list = topic_df_list

    # def write_topic_df_to_excel(self, file_path):
    #
    #     # Create a Pandas Excel writer using XlsxWriter as the engine.
    #     writer = pd.ExcelWriter(file_path, engine='xlsxwriter')
    #
    #     # Write each topic dataframe to a different worksheet.
    #     for i in range(len(self.topic_df_list)):
    #         self.topic_df_list[i].to_excel(writer, sheet_name=f'topic_{i}')
    #
    #         print(f"Topic {i} written to excel sheet")
    #
    #     # Close the Pandas Excel writer and output the Excel file.
    #     writer.save()
    #
    #     print(f"Topic dataframes written to excel file under {file_path}")

    def calculate_word_dif(self, folder_path):

        for topic_id in range(self.num_topics):
            df = pd.read_csv(folder_path + f"topic_{topic_id}.csv",
                             index_col=0)

            # Difference of word probabilities relative to the previous time slice
            for i in range(len(self.files) - 1):
                df[f"dif_{self.files[i + 1]}"] = (df[self.files[i + 1]]
                                                  - df[self.files[i]])

                # Difference of those differences relative to the previous time slice
                if i > 0:
                    df[f"dif_dif_{self.files[i + 1]}"] = (df[f"dif_{self.files[i + 1]}"]
                                                          - df[f"dif_{self.files[i]}"])

            # Change order of columns
            columns = [
                "fp1_projects", "fp2_projects", "fp3_projects", "fp4_projects",
                "fp5_projects", "fp6_projects", "fp7_projects",
                "h2020_projects", "dif_fp2_projects", "dif_fp3_projects",
                "dif_fp4_projects", "dif_fp5_projects", "dif_fp6_projects",
                "dif_fp7_projects", "dif_h2020_projects",
                "dif_dif_fp3_projects", "dif_dif_fp4_projects",
                "dif_dif_fp5_projects", "dif_dif_fp6_projects",
                "dif_dif_fp7_projects", "dif_dif_h2020_projects", "topic"
            ]
            df = df[columns]

            output_file_path = folder_path + f"topic_dif_{topic_id}.csv"
            df.to_csv(output_file_path)

            print(
                f"Finished calculating differences and created file {output_file_path}"
            )

    def construct_final_topic_data(self, folder_path):

        df_ref = pd.read_csv(folder_path + "topic_dif_0.csv", index_col=0)

        vocab = df_ref.index

        topic_list = []

        for topic_id in range(self.num_topics):

            df = pd.read_csv(folder_path + f"topic_dif_{topic_id}.csv",
                             index_col=0)
            df = df.dropna()

            time_slices = self.files

            for word in vocab:
                # Account for the nan word
                try:
                    for i in range(len(time_slices)):
                        time = time_slices[i]

                        load = df.loc[word, time]

                        if i == 0:
                            dif = 0
                        else:
                            dif = df.loc[word, "dif_" + time]

                        data = [word, topic_id, time, i, load, dif]
                        topic_list.append(data)
                except KeyError:
                    print(f"Error at topic {topic_id} and word {word}")

            print(f"Finished reformating topic {topic_id}")

        df_output = pd.DataFrame(
            topic_list,
            columns=["word", "topic", "time", "time_no", "load", "dif"])

        df_output.to_csv(folder_path + "all_topics.csv")

    def generate_project_topic_table(self, df_raw):

        multicol1 = pd.MultiIndex.from_tuples(
            [("topic", f"topic_{i}") for i in range(self.num_topics)])

        columns = [f"topic_{i}" for i in range(self.num_topics)]

        # Set rcn as index
        df_raw = df_raw.set_index("rcn")

        # Only keep topic columns
        df_flat = df_raw[columns]

        # Create multi index data frame
        df_flat_multi = pd.DataFrame(df_flat.values,
                                     index=df_flat.index,
                                     columns=multicol1)

        # Stack data frame
        df_stacked = df_flat_multi.stack()

        # Set rcn as single index
        df_stacked = df_stacked.reset_index().set_index("rcn")

        # Rename columns
        df_stacked = df_stacked.rename(columns={
            "level_1": "topic",
            "topic": "load"
        })

        # Strip the "topic_" prefix from the topic column
        df_stacked["topic"] = df_stacked["topic"].str.replace(
            "topic_", "", regex=False)

        # Join project information and make rcn normal column again
        df_project_info = df_raw[["startDate", "fp", "fp_no", "title"]]
        df_project_topics = df_stacked.join(df_project_info,
                                            how="left").reset_index()

        return df_project_topics
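
A hedged end-to-end sketch of how this wrapper class might be driven (argument values are illustrative; the column and file names follow those used inside the class):

dtm = DtmModel(date_col='startDate', time_ref_col='fp',
               path_to_dtm_binary='/path/to/dtm/main',
               dictionary=dictionary, doc_term_matrix=doc_term_matrix,
               seed=101, num_topics=20, output_file_path='output/dtm_model',
               files=['fp1_projects', 'fp2_projects'])
df = dtm.prepare_data(df_raw)
dtm.train_model()
dtm.save_model()
df_topics = dtm.generate_topic_tables()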
Example #8
corpus = DTMcorpus(docs_lem)

# create a decade index that serves as the time frame for DTM
data['decade'] = (data['year'] - 1850) // 10

# documents per decade, in chronological order
time_slices = data['decade'].value_counts().sort_index().tolist()

# dtm path
dtm_path = "/home/sami/dtm/dtm/main"

# estimating DTM with 2 topics per decade
model = DtmModel(dtm_path,
                 corpus,
                 time_slices,
                 num_topics=2,
                 id2word=corpus.dictionary)

# displaying top 10 words in topic number 1 during the second decade
print(model.show_topic(topicid=1, time=1, topn=10))

doc_number = 0
num_topics = 2

# topic distribution of document 0, which falls in the first decade
for i in range(num_topics):
    print("Distribution of topic %d: %f" % (i, model.gamma_[doc_number, i]))

results = pd.DataFrame()
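
A plausible continuation (assumption): fill results with the per-document topic proportions from gamma_, keyed by decade:

results = pd.DataFrame(model.gamma_, columns=['topic_0', 'topic_1'])
results['decade'] = data['decade'].values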
Example #9
from collections import defaultdict

# Count token frequencies (the head of this loop was cut off in the original;
# the standard gensim preprocessing pattern is assumed)
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# A threshold of 0 keeps every token; Example #11 uses > 1 to drop hapaxes
texts = [[token for token in text if frequency[token] > 0] for text in texts]

dictionary = corpora.Dictionary(texts)
dictionary.save('dict.pickle')  # store the dictionary, for future reference

#Corpus Created
corpus = [dictionary.doc2bow(text) for text in texts]

print("Corpus Created")

num_topics = 30
path_to_dtm_binary = "/home/khan/DTM/dtm/dtm/main"
model = DtmModel(path_to_dtm_binary,
                 corpus=corpus, time_slices=ts,
                 mode='fit', model='dtm', num_topics=num_topics)

training_time = time.time() - start_time

print("Model fitted")
id2token = id2TokenFn()
topics = []
for t in range(len(ts)):
    for j in range(num_topics):
        topic = model.show_topic(topicid=j, time=t, topn=50)
        new_topic = []
        info = ("Time,", t, "TopicID,", j)
        # show_topic yields (weight, word_id_string) pairs here because no
        # id2word mapping was passed to DtmModel
        for weight, word_id in topic:
            new_topic.append((id2token[int(word_id)], round(weight, 4)))
        new_topic.append(info)
        topics.append(new_topic)
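
id2TokenFn is not shown; a plausible implementation, assuming it inverts the dictionary saved as dict.pickle earlier in this example:

def id2TokenFn():
    # Hypothetical helper: map token ids back to token strings
    d = corpora.Dictionary.load('dict.pickle')
    return {token_id: token for token, token_id in d.token2id.items()}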
Example #10
def _dtm(table, input_col, topic_name='topic', num_topic=5, num_topic_word=10, max_iter=20, time_slice=None,
         coherence='u_mass', vis_time=0, seed=None):
    running_os = platform.system()
    is_os_64bit = platform.machine().endswith('64')
    if running_os == 'Linux':
        if is_os_64bit:
            dtm_filename = 'dtm-linux64'
        else:
            dtm_filename = 'dtm-linux32'
    elif running_os == 'Windows':
        if is_os_64bit:
            dtm_filename = 'dtm-win64.exe'
        else:
            dtm_filename = 'dtm-win32.exe'
    else:  # Mac
        dtm_filename = 'dtm-darwin64'
    dtm_path = os.path.join(str(pathlib.Path(__file__).parent.absolute()), 'dtm', dtm_filename)
    if running_os != 'Windows':
        bash_command = "chmod +x {}".format(dtm_path)
        os.system(bash_command)
    tokenized_doc = np.array(table[input_col])
    num_doc = len(tokenized_doc)
    if time_slice is None:
        time_slice = [num_doc]
    elif sum(time_slice) != num_doc:
        raise_runtime_error("The sum of the time slice list does not match the number of documents.")
    if vis_time < 0 or vis_time >= len(time_slice):
        raise_runtime_error("Invalid time parameter: {}".format(vis_time))
    dictionary = corpora.Dictionary(tokenized_doc)
    corpus = [dictionary.doc2bow(text) for text in tokenized_doc]
    dtm_params = {"corpus": corpus,
                  "id2word": dictionary,
                  "time_slices": time_slice,
                  "num_topics": num_topic,
                  "lda_sequence_max_iter": max_iter,
                  "model": 'dtm'}
    if seed is not None:
        dtm_params["rng_seed"] = seed
    dtm_model = DtmModel(dtm_path, **dtm_params)

    topic_time = [[dtm_model.show_topic(topicid=topic_id, time=t, topn=num_topic_word)
                   for topic_id in range(num_topic)]
                  for t in range(len(time_slice))]
    topic_time = [[["{}: {}".format(tup[1], tup[0]) for tup in topic] for topic in time] for time in topic_time]
    timeline = ["{} ({} docs)".format(ind, t) for ind, t in enumerate(time_slice)]
    columns = ["topic_{}".format(i + 1) for i in range(num_topic)]
    topic_table = pd.DataFrame(topic_time, columns=columns)
    topic_table['time'] = timeline
    topic_table = topic_table[['time'] + columns]

    prop_arr = dtm_model.gamma_
    out_table = pd.DataFrame.copy(table, deep=True)
    if topic_name in table.columns:
        raise BrighticsFunctionException.from_errors(
            [{'0100': "Existing table contains Topic Column Name. Please choose again."}])
    out_table[topic_name] = [item.argmax() + 1 for item in prop_arr]
    out_table['topic_distribution'] = prop_arr.tolist()

    coherence_topic_arr = [dtm_model.dtm_coherence(time) for time in range(len(time_slice))]
    if coherence == 'u_mass':
        coh_arr = [CoherenceModel(topics=item, dictionary=dictionary, corpus=corpus, coherence='u_mass').get_coherence()
                   for item in coherence_topic_arr]
    else:
        coh_arr = [CoherenceModel(topics=item, dictionary=dictionary, corpus=corpus, texts=tokenized_doc,
                                  coherence='c_v').get_coherence() for item in coherence_topic_arr]

    doc_topic, topic_term, doc_lengths, term_frequency, vocab = dtm_model.dtm_vis(corpus, vis_time)
    prepared_data = plv.prepare(topic_term, doc_topic, doc_lengths, vocab, term_frequency, sort_topics=False)
    html_result = plv.prepared_data_to_html(prepared_data)

    params = {'Input column': input_col,
              'Topic column name': topic_name,
              'Number of topics': num_topic,
              'Number of words for each topic': num_topic_word,
              'Maximum number of iterations': max_iter,
              'Time slice': time_slice,
              'Coherence measure': coherence,
              'Time to visualize': vis_time}
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    | ## Dynamic Topic Modeling Result
    | ### Summary
    |
    """))
    rb.addHTML(html_result)
    rb.addMD(strip_margin("""
    | ### Coherence for each period
    | {coh_arr}
    |
    | ### Parameters
    | {params}
    """.format(coh_arr=coh_arr, params=dict2MD(params))))

    model = _model_dict('dtm_model')
    model['params'] = params
    model['dtm_model'] = dtm_model
    model['coherences'] = coh_arr
    model['corpus'] = corpus
    model['_repr_brtc_'] = rb.get()

    return {'out_table': out_table, 'topic_table': topic_table, 'model': model}
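
A hypothetical call, assuming a pandas DataFrame whose input column holds tokenized documents:

result = _dtm(table=df, input_col='tokens', num_topic=5, time_slice=[40, 60], seed=101)
out_table = result['out_table']      # per-document topic assignment and full distribution
topic_table = result['topic_table']  # top words per topic for each time slice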
Example #11
texts = [[token for token in text if frequency[token] > 1] for text in texts]

dictionary = corpora.Dictionary(texts)
dictionary.save(
    'deerwester.dict')  # store the dictionary, for future reference

#Corpus Created
corpus = [dictionary.doc2bow(text) for text in texts]

print("Corpus Created")
path_to_dtm_binary = "/home/khan/DTM/dtm/dtm/main"

model = DtmModel(path_to_dtm_binary,
                 corpus,
                 time_slices=[1] * len(corpus),
                 mode='fit',
                 model='dtm',
                 num_topics=20)

print("Model fitted")
topics = model.show_topic(topicid=1, time=1, topn=10)
print(topics)
print("Topics finding")
training_time = time.time() - start_time

# Converting the topic into an Excel file
print("Putting in DataFrame started")
df = pd.DataFrame(topics)
writer = pd.ExcelWriter("DTM_topics.xlsx")
df.to_excel(writer, 'Sheet1')
writer.close()  # flush the workbook to disk
Example #12

# DTM Analysis
nips = NipsData()
dat = nips.load_data(sample_frac=1)
sections = ['title', 'abstract']
ncomps = dtm_defaults['num_topics']
dat = nips.combined_sections(sections, data=dat)
yrs, cnts = papers_per_year(nips.raw)
sname = get_save_name(sections)
docs = lda_get_corpus(dat, name=sname, save=True)
d, bow = lda_get_dictionary(dat, name=sname, save=True)
## mod = dtm_run(dat, cnts, dtm_path, name=sname, save=True)
mod = DtmModel.load(os.path.join(PKLDIR, 'lda_dtm_15_title_abstract'))


def print_dtm_top_words_for_year(model, years, n_topics, n_words):
    """Print top n_words from top n_topics for year in years."""
    print(f"Top {n_words} from top {n_topics} for year(s) {years}:")
    yrs = enumerate(range(1987, 2018))
    inds = [(i, yr) for i, yr in yrs if yr in years]

    for i, yr in inds:
        print(f"Year {yr}:")
        for topic, words in enumerate(model.dtm_coherence(i, n_words)[:n_topics]):
            print(f"  Topic #{topic}: " + ', '.join(words))

def dtm_coherence(model, corpus, d, year):
    """Get coherence for DTM model at year."""
Example #13
import gensim.models
from gensim.models.wrappers import DtmModel
import gensim.corpora
import gensim.matutils
import numpy as np
import pickle
def _load(name):
    # Read a pickled object, closing the file handle properly
    with open(f"collections/tmp/{name}", "rb") as f:
        return pickle.load(f)

corpus = _load("test_corpus")
dictionary = _load("test_dictionary")
num_topics = _load("num_topics")
top_chain_var = _load("top_chain_variance")
time_slices = _load("time_slices")
alpha = _load("alpha")
path_to_dtm_binary = "global/dtm-linux64"
model = DtmModel(path_to_dtm_binary,
                 corpus=corpus,
                 id2word=dictionary,
                 time_slices=time_slices,
                 num_topics=num_topics,
                 mode="fit",
                 top_chain_var=top_chain_var,
                 initialize_lda=True,
                 alpha=alpha)
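
Once fitted, the model can be persisted for reuse, mirroring Examples #3, #5, and #14 (path is illustrative):

model.save('collections/tmp/dtm_model')
# later: model = DtmModel.load('collections/tmp/dtm_model')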
Example #14
def loadDTM(amounttopics):
    modelAmountTopics = DtmModel.load(f'./Models/model{amounttopics}Topics')
    return modelAmountTopics