Example #1
    def test_from_corpus(self):
        """build `Dictionary` from an existing corpus"""

        documents = [
            "Human machine interface for lab abc computer applications",
            "A survey of user opinion of computer system response time",
            "The EPS user interface management system",
            "System and human system engineering testing of EPS",
            "Relation of user perceived response time to error measurement",
            "The generation of random binary unordered trees",
            "The intersection graph of paths in trees",
            "Graph minors IV Widths of trees and well quasi ordering",
            "Graph minors A survey"
        ]
        stoplist = set('for a of the and to in'.split())
        texts = [[
            word for word in document.lower().split() if word not in stoplist
        ] for document in documents]

        # remove words that appear only once
        all_tokens = sum(texts, [])
        tokens_once = set(word for word in set(all_tokens)
                          if all_tokens.count(word) == 1)
        texts = [[word for word in text if word not in tokens_once]
                 for text in texts]

        dictionary = Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]

        # Create dictionary from corpus without a token map
        dictionary_from_corpus = Dictionary.from_corpus(corpus)

        dict_token2id_vals = sorted(dictionary.token2id.values())
        dict_from_corpus_vals = sorted(
            dictionary_from_corpus.token2id.values())
        self.assertEqual(dict_token2id_vals, dict_from_corpus_vals)
        self.assertEqual(dictionary.dfs, dictionary_from_corpus.dfs)
        self.assertEqual(dictionary.num_docs, dictionary_from_corpus.num_docs)
        self.assertEqual(dictionary.num_pos, dictionary_from_corpus.num_pos)
        self.assertEqual(dictionary.num_nnz, dictionary_from_corpus.num_nnz)

        # Create dictionary from corpus with an id=>token map
        dictionary_from_corpus_2 = Dictionary.from_corpus(corpus,
                                                          id2word=dictionary)

        self.assertEqual(dictionary.token2id,
                         dictionary_from_corpus_2.token2id)
        self.assertEqual(dictionary.dfs, dictionary_from_corpus_2.dfs)
        self.assertEqual(dictionary.num_docs,
                         dictionary_from_corpus_2.num_docs)
        self.assertEqual(dictionary.num_pos, dictionary_from_corpus_2.num_pos)
        self.assertEqual(dictionary.num_nnz, dictionary_from_corpus_2.num_nnz)

        # Ensure Sparse2Corpus is compatible with from_corpus
        bow = gensim.matutils.Sparse2Corpus(scipy.sparse.rand(10, 100))
        dictionary = Dictionary.from_corpus(bow)
        self.assertEqual(dictionary.num_docs, 100)
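The assertions above compare the sorted id values rather than token2id itself because, without an id=>token map, from_corpus can only invent placeholder tokens. A minimal standalone sketch of that behaviour (not taken from any of the projects listed here):

from gensim.corpora import Dictionary

texts = [["human", "interface"], ["interface", "computer"]]
original = Dictionary(texts)
bow_corpus = [original.doc2bow(text) for text in texts]

# rebuilt from the bag-of-words alone: ids, document frequencies and corpus
# statistics survive, but token2id only holds placeholder keys such as '0', '1'
rebuilt = Dictionary.from_corpus(bow_corpus)
print(sorted(rebuilt.token2id.values()) == sorted(original.token2id.values()))  # True
print(rebuilt.dfs == original.dfs)                                              # True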
Example #2
    def test_from_corpus(self):
        """build `Dictionary` from an existing corpus"""

        documents = [
            "Human machine interface for lab abc computer applications",
            "A survey of user opinion of computer system response time",
            "The EPS user interface management system",
            "System and human system engineering testing of EPS",
            "Relation of user perceived response time to error measurement",
            "The generation of random binary unordered trees",
            "The intersection graph of paths in trees",
            "Graph minors IV Widths of trees and well quasi ordering",
            "Graph minors A survey"
        ]
        stoplist = set('for a of the and to in'.split())
        texts = [
            [word for word in document.lower().split() if word not in stoplist]
            for document in documents]

        # remove words that appear only once
        all_tokens = sum(texts, [])
        tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
        texts = [[word for word in text if word not in tokens_once] for text in texts]

        dictionary = Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]

        # Create dictionary from corpus without a token map
        dictionary_from_corpus = Dictionary.from_corpus(corpus)

        dict_token2id_vals = sorted(dictionary.token2id.values())
        dict_from_corpus_vals = sorted(dictionary_from_corpus.token2id.values())
        self.assertEqual(dict_token2id_vals, dict_from_corpus_vals)
        self.assertEqual(dictionary.dfs, dictionary_from_corpus.dfs)
        self.assertEqual(dictionary.num_docs, dictionary_from_corpus.num_docs)
        self.assertEqual(dictionary.num_pos, dictionary_from_corpus.num_pos)
        self.assertEqual(dictionary.num_nnz, dictionary_from_corpus.num_nnz)

        # Create dictionary from corpus with an id=>token map
        dictionary_from_corpus_2 = Dictionary.from_corpus(corpus, id2word=dictionary)

        self.assertEqual(dictionary.token2id, dictionary_from_corpus_2.token2id)
        self.assertEqual(dictionary.dfs, dictionary_from_corpus_2.dfs)
        self.assertEqual(dictionary.num_docs, dictionary_from_corpus_2.num_docs)
        self.assertEqual(dictionary.num_pos, dictionary_from_corpus_2.num_pos)
        self.assertEqual(dictionary.num_nnz, dictionary_from_corpus_2.num_nnz)

        # Ensure Sparse2Corpus is compatible with from_corpus
        bow = gensim.matutils.Sparse2Corpus(scipy.sparse.rand(10, 100))
        dictionary = Dictionary.from_corpus(bow)
        self.assertEqual(dictionary.num_docs, 100)
Example #3
    def test_from_corpus(self):
        """build `Dictionary` from an existing corpus"""

        documents = ["Human machine interface for lab abc computer applications",
                "A survey of user opinion of computer system response time",
                "The EPS user interface management system",
                "System and human system engineering testing of EPS",
                "Relation of user perceived response time to error measurement",
                "The generation of random binary unordered trees",
                "The intersection graph of paths in trees",
                "Graph minors IV Widths of trees and well quasi ordering",
                "Graph minors A survey"]
        stoplist = set('for a of the and to in'.split())
        texts = [[word for word in document.lower().split() if word not in stoplist]
                for document in documents]

        # remove words that appear only once
        all_tokens = sum(texts, [])
        tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
        texts = [[word for word in text if word not in tokens_once]
                for text in texts]
        dictionary = Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]
        dictionary_from_corpus = Dictionary.from_corpus(corpus)

        # We have to compare values because, when a dictionary is created from
        # a corpus, information about the actual words is lost.
        dict_token2id_vals = sorted(dictionary.token2id.values())
        dict_from_corpus_vals = sorted(dictionary_from_corpus.token2id.values())
        self.assertEqual(dict_token2id_vals, dict_from_corpus_vals)
        self.assertEqual(dictionary.dfs, dictionary_from_corpus.dfs)
        self.assertEqual(dictionary.num_docs, dictionary_from_corpus.num_docs)
        self.assertEqual(dictionary.num_pos, dictionary_from_corpus.num_pos)
        self.assertEqual(dictionary.num_nnz, dictionary_from_corpus.num_nnz)
Example #4
def getLDAvis(topics, min_df, max_features):
    ldavis_key = f'{int(min_df*1000):d}_{max_features}_{topics}'
    ldavis_path = Path('./pyldavis') / f'{ldavis_key}_tsne.html'
    if not ldavis_path.exists():
        key = f'{max_features}'
        dtm_path = corpus_path / f'dtm_{key}.npz'
        dtm = sparse.load_npz(dtm_path)
        token_path = corpus_path / f'tokens_{key}.csv'
        tokens = pd.read_csv(token_path,
                             header=None,
                             squeeze=True,
                             na_values=[],
                             keep_default_na=False)
        model_file = datapath(
            (experiment_path / 'models' / f'{key}_{topics}').resolve())
        lda_model = LdaModel.load(model_file)
        id2word = tokens.to_dict()
        corpus = Sparse2Corpus(dtm, documents_columns=False)
        dictionary = Dictionary.from_corpus(corpus, id2word)
        vis = prepare(lda_model, corpus, dictionary, mds='tsne')
        kwargs = {"ldavis_url": "/static/ldavis.js"}
        pyLDAvis.save_html(vis, str(ldavis_path), **kwargs)
    with open(str(ldavis_path), 'r') as myfile:
        data = myfile.read()
    return data


#getLDAvis(5, 0.001, 10000)
#getLDAvis(5, 0.001, 25000)
#getLDAvis(10, 0.001, 10000)
#getLDAvis(10, 0.001, 25000)
#getLDAvis(20, 0.001, 10000)
#getLDAvis(20, 0.001, 25000)
Example #5
def graphLDA(name):

    embedFile = 'backendOutput/embeddings-' + name + '.pkl'
    bow, tfidf, _, id2word = loadData(embedFile)

    for (docRep, docRepName) in [(bow, 'bow'), (tfidf, 'tfidf')]:
        ldamodel = loadData('backendOutput/ldamodel-' + name + "-" +
                            docRepName + '.pkl')
        corpus = Sparse2Corpus(docRep, documents_columns=False)
        dictionary = Dictionary.from_corpus(corpus, id2word)
        #This could be more descriptive if we wanted
        document_labels = ["Document " + str(i) for i in range(len(corpus))]

        grapher = LDAGrapher(docRepName, corpus, dictionary, ldamodel,
                             document_labels, name)

        print("Graphing t-SNE for " + docRepName + "...")
        grapher.graphTSNE(perplexity=30)
        print("Graphing pyLDAvis for " + docRepName + "...")
        grapher.graphPyLDAvis()
        print("Creating word cloud for " + docRepName + "...")
        grapher.graphWordCloud()
        print("Graphing word weights for " + docRepName + "...")
        grapher.graphWordWeight()

    print("Done graphing!")
Example #6
    def test_from_corpus(self):
        """build `Dictionary` from an existing corpus"""

        documents = ["Human machine interface for lab abc computer applications",
                "A survey of user opinion of computer system response time",
                "The EPS user interface management system",
                "System and human system engineering testing of EPS",
                "Relation of user perceived response time to error measurement",
                "The generation of random binary unordered trees",
                "The intersection graph of paths in trees",
                "Graph minors IV Widths of trees and well quasi ordering",
                "Graph minors A survey"]
        stoplist = set('for a of the and to in'.split())
        texts = [[word for word in document.lower().split() if word not in stoplist]
                for document in documents]

        # remove words that appear only once
        all_tokens = sum(texts, [])
        tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
        texts = [[word for word in text if word not in tokens_once]
                for text in texts]
        dictionary = Dictionary(texts)
        corpus = [dictionary.doc2bow(text) for text in texts]
        dictionary_from_corpus = Dictionary.from_corpus(corpus)

        # We have to compare values because, when a dictionary is created from
        # a corpus, information about the actual words is lost.
        dict_token2id_vals = sorted(dictionary.token2id.values())
        dict_from_corpus_vals = sorted(dictionary_from_corpus.token2id.values())
        self.assertEqual(dict_token2id_vals, dict_from_corpus_vals)
        self.assertEqual(dictionary.dfs, dictionary_from_corpus.dfs)
        self.assertEqual(dictionary.num_docs, dictionary_from_corpus.num_docs)
        self.assertEqual(dictionary.num_pos, dictionary_from_corpus.num_pos)
        self.assertEqual(dictionary.num_nnz, dictionary_from_corpus.num_nnz)
Example #7
def main():
    from_http = bool(int(sys.argv[1]))
    file_name = str(sys.argv[2])
    data = read_data(file_name=file_name, from_http=from_http)
    data = process_data(data)
    save_name = str(sys.argv[3])
    if save_name == 'harmonized_shipper_sym':
        bag_of_words = create_BoW_harmonized_shipper(data)
        alpha = 'symmetric'
    elif save_name == 'harmonized_shipper_asym':
        bag_of_words = create_BoW_harmonized_shipper(data)
        alpha = 'asymmetric'
    elif save_name == 'shipper_harmonized_sym':
        bag_of_words = create_BoW_shipper_harmonized(data)
        alpha = 'symmetric'
    elif save_name == 'shipper_harmonized_asym':
        bag_of_words = create_BoW_shipper_harmonized(data)
        alpha = 'asymmetric'
    else:
        raise ValueError(f"unrecognized save_name: {save_name}")
    corpus = create_corpus(bag_of_words, save_name, save=True)
    id2word = create_id2word(bag_of_words, save_name, save=True)
    num_topics = int(sys.argv[4])
    model = compute_lda(save_name, corpus, num_topics, id2word, alpha=alpha)

    # For visualization
    dictionary = Dictionary.from_corpus(corpus, id2word=id2word)
    save_pyldavis2html(model, corpus, dictionary, save_name, num_topics)
    # For document_topic_distribution
    document_topic_distribution(corpus,
                                bag_of_words,
                                model,
                                save_name,
                                num_topics,
                                minimum_probability=0.10)
Example #8
def main():
    model_name = str(sys.argv[1])
    num_topics = int(sys.argv[2])
    # For visualization
    corpus = load_corpus(model_name)
    id2word = load_id2word(model_name)
    dictionary = Dictionary.from_corpus(corpus, id2word=id2word)
    # Load LDAModel
    model = load_model(model_name, num_topics)
    save_pyldavis2html(model, corpus, dictionary, model_name, num_topics)

    # Load data to calculate matrix
    data = load_data()
    if (model_name == 'Dc_v1'):
        matrix_object = compute_Dc_v1(data)
    if (model_name == 'Dc_v2'):
        matrix_object = compute_Dc_v2(data)
    if (model_name == 'Dc_v3'):
        matrix_object = compute_Dc_v3(data)
    if (model_name == 'Dc_v4'):
        matrix_object = compute_Dc_v4(data)
    if (model_name == 'Tc_v1'):
        matrix_object = compute_Tc_v1(data)
    # Save document_topic_distribution
    document_topic_distribution(corpus, matrix_object, model, model_name,
                                num_topics)
Example #9
def transform_sklearn_to_gensim(corpus_vect):
    # transform sparse matrix into gensim corpus
    corpus_vect_gensim = matutils.Sparse2Corpus(corpus_vect,
                                                documents_columns=False)
    dictionary = Dictionary.from_corpus(
        corpus_vect_gensim,
        id2word=dict(
            (id, word) for word, id in corpus_vect.vocabulary_.items()))
    return corpus_vect_gensim, dictionary
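Note that the helper above reads `vocabulary_` off the object it is given, so it expects something that carries both the sparse document-term matrix and the fitted vocabulary. With a plain scikit-learn workflow the vocabulary lives on the vectorizer, so the same pattern, as a rough standalone sketch (`raw_docs` and `vectorizer` are illustrative names, not taken from the project above), looks like:

import gensim
from gensim.corpora import Dictionary
from sklearn.feature_extraction.text import CountVectorizer

raw_docs = ["human machine interface", "graph of trees", "graph minors survey"]
vectorizer = CountVectorizer()
dtm = vectorizer.fit_transform(raw_docs)  # documents are rows of the sparse matrix

# documents_columns=False because documents are rows here
corpus_gensim = gensim.matutils.Sparse2Corpus(dtm, documents_columns=False)
id2word = {idx: word for word, idx in vectorizer.vocabulary_.items()}
dictionary = Dictionary.from_corpus(corpus_gensim, id2word=id2word)
print(dictionary.num_docs, len(dictionary.token2id))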
Example #10
def main():
    model_name = str(sys.argv[1])
    num_topics = int(sys.argv[2])
    # For visualization
    corpus = load_corpus(model_name)
    id2word = load_id2word(model_name)
    dictionary = Dictionary.from_corpus(corpus, id2word=id2word)
    # Load LDAModel
    model = load_model(model_name, num_topics)
    save_pyldavis2html(model, corpus, dictionary, model_name, num_topics)
Example #11
def main():
    # ---------------- Set MKL Environment Variables for better Gensim LDA performance ----------------
    os.environ["MKL_NUM_THREADS"] = "1"
    os.environ["NUMEXPR_NUM_THREADS"] = "1"
    os.environ["OMP_NUM_THREADS"] = "1"
    # ---------------- Prepare LDA Inputs & Run LDA ----------------
    # Parse command line args
    save_name = str(sys.argv[1])
    cap = str(sys.argv[2])
    num_topics = int(sys.argv[3])
    # Load data
    data = load_pickle("FINRA_TRACE_2015.pkl.zip")
    data = data.append(load_pickle("FINRA_TRACE_2014.pkl.zip"),
                       ignore_index=True)
    #data = data.append(load_pickle("FINRA_TRACE_2013.pkl.zip"),ignore_index=True)
    #data = data.append(load_pickle("FINRA_TRACE_2012.pkl.zip"),ignore_index=True)
    # Compute a version of bag_of_words given the save_name
    if save_name == "trade_vol_BoW":
        bag_of_words = trade_vol_BoW(data, cap)
        del data
        save_name = save_name + "_" + cap
    elif save_name == "trade_vol_BoW_norm":
        bag_of_words = trade_vol_BoW_norm(data, cap)
        del data
        save_name = save_name + "_" + cap
    elif save_name == "Dc_v4":
        bag_of_words = compute_Dc_v4(data)
        del data
    else:
        print("the save_name does not have a corresponding bag_of_words")
    # Compute input for gensim LDA
    corpus = compute_corpus(bag_of_words, save_name)
    id2word = compute_id2word(bag_of_words, save_name)
    # Run Gensim LDA
    lda = compute_topic(save_name,
                        corpus,
                        num_topics,
                        id2word,
                        workers=11,
                        chunksize=12500,
                        passes=40,
                        iterations=600)
    # ---------------- LDA Analysis  ----------------
    #os.environ["MKL_NUM_THREADS"] = "4"
    #os.environ["NUMEXPR_NUM_THREADS"] = "4"
    #os.environ["OMP_NUM_THREADS"] = "4"
    # Run PyLDAvis
    dictionary = Dictionary.from_corpus(corpus, id2word=id2word)
    save_pyldavis2html(lda, corpus, dictionary, save_name, num_topics)
    # Save document X topic matrix to csv
    document_topic_distribution(corpus, bag_of_words, lda, save_name,
                                num_topics)
Example #12
def createTopicDistribution(topics, min_df, max_features):
    topic_dist_key = f'{int(min_df*1000):d}_{max_features}_{topics}'
    topic_dist_path = Path(
        '../data/topic_distribution') / f'{topic_dist_key}.csv'
    key = f'{max_features}'
    dtm_path = corpus_path / f'dtm_{key}.npz'
    dtm = sparse.load_npz(dtm_path)
    token_path = corpus_path / f'tokens_{key}.csv'
    tokens = pd.read_csv(token_path,
                         header=None,
                         squeeze=True,
                         na_values=[],
                         keep_default_na=False)
    model_file = datapath(
        (experiment_path / 'models' / f'{key}_{topics}').resolve())
    lda_model = LdaModel.load(model_file)
    id2word = tokens.to_dict()
    corpus = Sparse2Corpus(dtm, documents_columns=False)
    dictionary = Dictionary.from_corpus(corpus, id2word)

    text_path = Path('../data/clean_stop')
    text_files = text_path.glob('*.txt')
    docs = [(f.name, f.read_text()) for f in text_files]

    topic_labels = [f'Topic {i}' for i in range(1, topics + 1)]
    document_topics = pd.DataFrame(index=topic_labels)

    for i, doc in enumerate(docs):
        bow = dictionary.doc2bow(doc[1].split())
        document_topics[doc[0]] = pd.Series({
            f'Topic {k+1}': v
            for k, v in lda_model.get_document_topics(bow=bow,
                                                      minimum_probability=1e-3)
        })

    document_topics.to_csv(topic_dist_path)
Example #13
def main():
    # ---------------- Set MKL Environment Variables for better Gensim LDA performance ----------------
    os.environ["MKL_NUM_THREADS"] = "1"
    os.environ["NUMEXPR_NUM_THREADS"] = "1"
    os.environ["OMP_NUM_THREADS"] = "1"
    # ---------------- Prepare LDA Inputs & Run LDA ----------------
    # Parse command line args
    save_name = str(sys.argv[1])
    cap = str(sys.argv[2])
    num_topics = int(sys.argv[3])
    # Load data
    bow_matrix_train_path = save_name + "_train_sparse.npz"
    bow_matrix_test_path = save_name + "_test_sparse.npz"

    if False: #os.path.exists(bow_matrix_train_path):
        #X_train = scipy.sparse.load_npz(bow_matrix_train_path)
        #X_test = scipy.sparse.load_npz(bow_matrix_test_path)
        pass
    else: 
        data = load_pickle("FINRA_TRACE_2014.pkl.zip")
        #data = data.append(load_pickle("FINRA_TRACE_2014.pkl.zip"),ignore_index=True)
        #data = data.append(load_pickle("FINRA_TRACE_2013.pkl.zip"),ignore_index=True)
        #data = data.append(load_pickle("FINRA_TRACE_2012.pkl.zip"),ignore_index=True)
        # Compute a version of bag_of_words given the save_name
        if save_name=="trade_frac_out":
            bag_of_words = trade_frac_out(data)
            del data
            save_name = save_name
        elif save_name=="trade_vol_BoW":
            bag_of_words = trade_vol_BoW(data,cap)
            del data
            save_name = save_name + "_" + cap
        elif save_name=="trade_vol_BoW_norm":
            bag_of_words = trade_vol_BoW_norm(data,cap)
            del data
            save_name = save_name + "_" + cap
        elif save_name=="trade_count":
            bag_of_words = compute_count(data)
            del data
        else:
            raise Exception("the save_name does not have a corresponding bag_of_words")
            
        dtype = pd.SparseDtype(float, fill_value=0)
        X = scipy.sparse.csr_matrix(bag_of_words.astype(dtype).sparse.to_coo()) 
        #X = bag_of_words.astype(dtype)
        #cutoff = int(X.shape[0]*0.9)
        #X_train = X[:cutoff]
        #X_test = X[cutoff:]
        X_train, X_test, train_idx, test_idx = train_test_split(X, np.arange(X.shape[0]), test_size=0.1, random_state=42)
        scipy.sparse.save_npz(save_name + "_train_sparse.npz", X_train) 
        scipy.sparse.save_npz(save_name + "_test_sparse.npz", X_test)
        # slice our matrix to be just the training data
        #bag_of_words = bag_of_words.iloc[train_idx]
        train_index = bag_of_words.index[train_idx]

    # Compute input for gensim LDA
    corpus = compute_corpus(X_train, save_name)
    test_corpus = compute_corpus(X_test, save_name + "_test")
    id2word = compute_id2word(bag_of_words, save_name)
    # Run Gensim LDA
    start = time.time()
    lda = compute_topic(save_name, corpus, num_topics, id2word,
                        workers=11, chunksize=12500, passes=10, iterations=600)
    
    lda_time = time.time()-start
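    # log_perplexity returns the per-word likelihood bound (a negative number);
    # gensim reports the corresponding perplexity as 2 ** (-bound)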
    train_perplex = lda.log_perplexity(corpus)
    test_perplex = lda.log_perplexity(test_corpus)
    
    print("perplexity scores: ", train_perplex, test_perplex)
    with open("perplex_scores.csv","a+") as f:
        writer = csv.writer(f)
        writer.writerow([save_name,num_topics,train_perplex,test_perplex,lda_time])


    # ---------------- LDA Analysis  ----------------
    #os.environ["MKL_NUM_THREADS"] = "4"
    #os.environ["NUMEXPR_NUM_THREADS"] = "4"
    #os.environ["OMP_NUM_THREADS"] = "4"
    # Run PyLDAvis
    dictionary = Dictionary.from_corpus(corpus, id2word=id2word)
    save_pyldavis2html(lda, corpus, dictionary, save_name, num_topics)
    # Save document X topic matrix to csv
    document_topic_distribution(corpus, train_index, lda, save_name, num_topics)
Example #14
passes = 1
start = time()
for i, (min_df, max_df, binary) in enumerate(dtm_params, 1):

    print(min_df, max_df, binary)

    vocab_path = experiment_path / str(min_df) / str(max_df) / str(int(binary))
    try:
        dtm = sparse.load_npz(vocab_path / f'dtm.npz')
        tokens = pd.read_csv(vocab_path / f'tokens.csv', header=None, squeeze=True)
    except FileNotFoundError:
        print('missing')
        continue
    corpus = Sparse2Corpus(dtm, documents_columns=False)
    id2word = tokens.to_dict()
    dictionary = Dictionary.from_corpus(corpus, id2word)

    for num_topics in topics:
        print(num_topics, end=' ')
        model_path = vocab_path / str(num_topics) / str(passes) / 'lda'
        if model_path.exists():
            lda = LdaModel.load(model_path.as_posix())
        else:
            continue
        start = time()
        vis = prepare(lda, corpus, dictionary, mds='tsne')
        terms = vis.topic_info
        terms = terms[terms.Category != 'Default']
        pyLDAvis.save_html(vis, (model_path / 'ldavis.html').as_posix())
        terms.to_csv(model_path / 'relevant_terms.csv', index=False)
        duration = time() - start
Example #15
def get_lists(rows):
    x = literal_eval(rows.values[0])
    combined = list(itertools.chain.from_iterable(x))

    model_lists.append(combined)
    return x


print(len(model_texts))
df_grouped1 = df.groupby(['year'])['token_lists'].apply(get_lists)

tfidf_model = TfidfVectorizer(lowercase=False)
tfidf = tfidf_model.fit_transform(model_texts)

corpus = gensim.matutils.Sparse2Corpus(tfidf, documents_columns=False)
dictionary = Dictionary.from_corpus(
    corpus,
    id2word=dict((id, word) for word, id in tfidf_model.vocabulary_.items()))
model = LdaModel(corpus=corpus,
                 id2word=dictionary,
                 iterations=50,
                 num_topics=80)
model.save('lda_congo.model')

# def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
#     """
#     Compute c_v coherence for various number of topics

#     Parameters:
#     ----------
#     dictionary : Gensim dictionary
#     corpus : Gensim corpus
Example #16
tfidf_model_three = TfidfVectorizer(min_df=0.05, max_df=0.30)
three = tfidf_model_three.fit_transform(
    literal_eval(df_1963.token_texts.values[0]))
tfidf_model_four = TfidfVectorizer(min_df=0.05, max_df=0.30)
four = tfidf_model_four.fit_transform(
    literal_eval(df_1964.token_texts.values[0]))
tfidf_model_five = TfidfVectorizer(min_df=0.05, max_df=0.30)
five = tfidf_model_five.fit_transform(
    literal_eval(df_1965.token_texts.values[0]))
tfidf_model_six = TfidfVectorizer(min_df=0.05, max_df=0.30)
six = tfidf_model_six.fit_transform(literal_eval(
    df_1966.token_texts.values[0]))

corpus_zero = gensim.matutils.Sparse2Corpus(zero, documents_columns=False)
dictionary_zero = Dictionary.from_corpus(
    corpus_zero,
    id2word=dict(
        (id, word) for word, id in tfidf_model_zero.vocabulary_.items()))
model_zero = LdaModel(corpus=corpus_zero,
                      id2word=dictionary_zero,
                      iterations=50,
                      num_topics=40)

corpus_one = gensim.matutils.Sparse2Corpus(one, documents_columns=False)
dictionary_one = Dictionary.from_corpus(
    corpus_one,
    id2word=dict(
        (id, word) for word, id in tfidf_model_one.vocabulary_.items()))
model_one = LdaModel(corpus=corpus_one,
                     id2word=dictionary_one,
                     iterations=50,
                     num_topics=40)
Example #17
    genes = pd.read_csv(fname + '_genes.csv', header=0, index_col=0)

    # training set
    rowsT = np.where(ind != i)
    X = np.asarray(phi.iloc[rowsT])
    cols = np.logical_and(fmin < X.sum(axis=0), X.sum(axis=0) < fmax)
    X = X[:, cols]
    X_corp = Dense2Corpus(np.array(X), documents_columns=False)

    # valid set
    rowsV = np.where(ind == i)
    X_test = np.asarray(phi.iloc[rowsV])
    X_test = X_test[:, cols]
    X_testcorp = Dense2Corpus(np.array(X_test), documents_columns=False)

    dic = Dictionary.from_corpus(X_corp)
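    # without an id2word map, from_corpus fills token2id with placeholder
    # tokens (the string form of each integer id)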

    model = LdaModel.load(fname + '_model')  # load model

    cm = CoherenceModel(model=model,
                        corpus=X_corp,
                        dictionary=dic,
                        coherence=coh_meas)
    coh_tr[i - 1] = cm.get_coherence()
    cm = CoherenceModel(model=model,
                        corpus=X_testcorp,
                        dictionary=dic,
                        coherence=coh_meas)
    coh_te[i - 1] = cm.get_coherence()

print("Mean Train Coherence: " + str(np.mean(coh_tr)))