def test_from_corpus(self):
    """build `Dictionary` from an existing corpus"""
    documents = [
        "Human machine interface for lab abc computer applications",
        "A survey of user opinion of computer system response time",
        "The EPS user interface management system",
        "System and human system engineering testing of EPS",
        "Relation of user perceived response time to error measurement",
        "The generation of random binary unordered trees",
        "The intersection graph of paths in trees",
        "Graph minors IV Widths of trees and well quasi ordering",
        "Graph minors A survey",
    ]
    stoplist = set('for a of the and to in'.split())
    texts = [
        [word for word in document.lower().split() if word not in stoplist]
        for document in documents
    ]

    # remove words that appear only once
    all_tokens = sum(texts, [])
    tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
    texts = [[word for word in text if word not in tokens_once] for text in texts]

    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    # Create dictionary from corpus without a token map
    dictionary_from_corpus = Dictionary.from_corpus(corpus)

    dict_token2id_vals = sorted(dictionary.token2id.values())
    dict_from_corpus_vals = sorted(dictionary_from_corpus.token2id.values())
    self.assertEqual(dict_token2id_vals, dict_from_corpus_vals)
    self.assertEqual(dictionary.dfs, dictionary_from_corpus.dfs)
    self.assertEqual(dictionary.num_docs, dictionary_from_corpus.num_docs)
    self.assertEqual(dictionary.num_pos, dictionary_from_corpus.num_pos)
    self.assertEqual(dictionary.num_nnz, dictionary_from_corpus.num_nnz)

    # Create dictionary from corpus with an id=>token map
    dictionary_from_corpus_2 = Dictionary.from_corpus(corpus, id2word=dictionary)

    self.assertEqual(dictionary.token2id, dictionary_from_corpus_2.token2id)
    self.assertEqual(dictionary.dfs, dictionary_from_corpus_2.dfs)
    self.assertEqual(dictionary.num_docs, dictionary_from_corpus_2.num_docs)
    self.assertEqual(dictionary.num_pos, dictionary_from_corpus_2.num_pos)
    self.assertEqual(dictionary.num_nnz, dictionary_from_corpus_2.num_nnz)

    # Ensure Sparse2Corpus is compatible with from_corpus
    bow = gensim.matutils.Sparse2Corpus(scipy.sparse.rand(10, 100))
    dictionary = Dictionary.from_corpus(bow)
    self.assertEqual(dictionary.num_docs, 100)
def test_from_corpus(self):
    """build `Dictionary` from an existing corpus"""
    documents = [
        "Human machine interface for lab abc computer applications",
        "A survey of user opinion of computer system response time",
        "The EPS user interface management system",
        "System and human system engineering testing of EPS",
        "Relation of user perceived response time to error measurement",
        "The generation of random binary unordered trees",
        "The intersection graph of paths in trees",
        "Graph minors IV Widths of trees and well quasi ordering",
        "Graph minors A survey",
    ]
    stoplist = set('for a of the and to in'.split())
    texts = [
        [word for word in document.lower().split() if word not in stoplist]
        for document in documents
    ]

    # remove words that appear only once
    all_tokens = sum(texts, [])
    tokens_once = set(word for word in set(all_tokens) if all_tokens.count(word) == 1)
    texts = [[word for word in text if word not in tokens_once] for text in texts]

    dictionary = Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    dictionary_from_corpus = Dictionary.from_corpus(corpus)

    # we have to compare values, because when creating a dictionary from a
    # corpus, information about the words themselves is lost
    dict_token2id_vals = sorted(dictionary.token2id.values())
    dict_from_corpus_vals = sorted(dictionary_from_corpus.token2id.values())
    self.assertEqual(dict_token2id_vals, dict_from_corpus_vals)
    self.assertEqual(dictionary.dfs, dictionary_from_corpus.dfs)
    self.assertEqual(dictionary.num_docs, dictionary_from_corpus.num_docs)
    self.assertEqual(dictionary.num_pos, dictionary_from_corpus.num_pos)
    self.assertEqual(dictionary.num_nnz, dictionary_from_corpus.num_nnz)
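# Why the tests above compare sorted id values rather than token2id dicts:
# a minimal sketch (assuming recent gensim behavior, where from_corpus
# without an id2word map falls back to stringified ids as placeholder
# tokens; the word strings themselves are unrecoverable from a BoW corpus).
from gensim.corpora import Dictionary

tiny_corpus = [[(0, 1), (1, 2)], [(1, 1)]]  # two docs over ids {0, 1}
d = Dictionary.from_corpus(tiny_corpus)
print(d.token2id)  # e.g. {'0': 0, '1': 1}: ids and stats survive, words do not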
def getLDAvis(topics, min_df, max_features):
    ldavis_key = f'{int(min_df * 1000):d}_{max_features}_{topics}'
    ldavis_path = Path('./pyldavis') / f'{ldavis_key}_tsne.html'
    if not ldavis_path.exists():
        key = f'{max_features}'
        dtm_path = corpus_path / f'dtm_{key}.npz'
        dtm = sparse.load_npz(dtm_path)
        token_path = corpus_path / f'tokens_{key}.csv'
        tokens = pd.read_csv(token_path, header=None, squeeze=True,
                             na_values=[], keep_default_na=False)
        model_file = datapath((experiment_path / 'models' / f'{key}_{topics}').resolve())
        lda_model = LdaModel.load(model_file)
        id2word = tokens.to_dict()
        corpus = Sparse2Corpus(dtm, documents_columns=False)
        dictionary = Dictionary.from_corpus(corpus, id2word)
        vis = prepare(lda_model, corpus, dictionary, mds='tsne')
        kwargs = {"ldavis_url": "/static/ldavis.js"}
        pyLDAvis.save_html(vis, str(ldavis_path), **kwargs)
    with open(str(ldavis_path), 'r') as myfile:
        data = myfile.read()
    return data

#getLDAvis(5, 0.001, 10000)
#getLDAvis(5, 0.001, 25000)
#getLDAvis(10, 0.001, 10000)
#getLDAvis(10, 0.001, 25000)
#getLDAvis(20, 0.001, 10000)
#getLDAvis(20, 0.001, 25000)
def graphLDA(name):
    embedFile = 'backendOutput/embeddings-' + name + '.pkl'
    bow, tfidf, _, id2word = loadData(embedFile)
    for (docRep, docRepName) in [(bow, 'bow'), (tfidf, 'tfidf')]:
        ldamodel = loadData('backendOutput/ldamodel-' + name + "-" + docRepName + '.pkl')
        corpus = Sparse2Corpus(docRep, documents_columns=False)
        dictionary = Dictionary.from_corpus(corpus, id2word)
        # This could be more descriptive if we wanted
        document_labels = ["Document " + str(i) for i in range(len(corpus))]
        grapher = LDAGrapher(docRepName, corpus, dictionary, ldamodel,
                             document_labels, name)
        print("Graphing t-SNE for " + docRepName + "...")
        grapher.graphTSNE(perplexity=30)
        print("Graphing pyLDAvis for " + docRepName + "...")
        grapher.graphPyLDAvis()
        print("Creating word cloud for " + docRepName + "...")
        grapher.graphWordCloud()
        print("Graphing word weights for " + docRepName + "...")
        grapher.graphWordWeight()
    print("Done graphing!")
def main():
    from_http = bool(int(sys.argv[1]))
    file_name = str(sys.argv[2])
    data = read_data(file_name=file_name, from_http=from_http)
    data = process_data(data)

    save_name = str(sys.argv[3])
    if save_name == 'harmonized_shipper_sym':
        bag_of_words = create_BoW_harmonized_shipper(data)
        alpha = 'symmetric'
    elif save_name == 'harmonized_shipper_asym':
        bag_of_words = create_BoW_harmonized_shipper(data)
        alpha = 'asymmetric'
    elif save_name == 'shipper_harmonized_sym':
        bag_of_words = create_BoW_shipper_harmonized(data)
        alpha = 'symmetric'
    elif save_name == 'shipper_harmonized_asym':
        bag_of_words = create_BoW_shipper_harmonized(data)
        alpha = 'asymmetric'
    else:
        # fail fast: without a recognized save_name there is no bag_of_words
        raise ValueError(f'save_name {save_name!r} not recognized')

    corpus = create_corpus(bag_of_words, save_name, save=True)
    id2word = create_id2word(bag_of_words, save_name, save=True)

    num_topics = int(sys.argv[4])
    model = compute_lda(save_name, corpus, num_topics, id2word, alpha=alpha)

    # For visualization
    dictionary = Dictionary.from_corpus(corpus, id2word=id2word)
    save_pyldavis2html(model, corpus, dictionary, save_name, num_topics)

    # For document_topic_distribution
    document_topic_distribution(corpus, bag_of_words, model, save_name,
                                num_topics, minimum_probability=0.10)
def main():
    model_name = str(sys.argv[1])
    num_topics = int(sys.argv[2])

    # For visualization
    corpus = load_corpus(model_name)
    id2word = load_id2word(model_name)
    dictionary = Dictionary.from_corpus(corpus, id2word=id2word)

    # Load LDAModel
    model = load_model(model_name, num_topics)
    save_pyldavis2html(model, corpus, dictionary, model_name, num_topics)

    # Load data to calculate matrix
    data = load_data()
    if model_name == 'Dc_v1':
        matrix_object = compute_Dc_v1(data)
    elif model_name == 'Dc_v2':
        matrix_object = compute_Dc_v2(data)
    elif model_name == 'Dc_v3':
        matrix_object = compute_Dc_v3(data)
    elif model_name == 'Dc_v4':
        matrix_object = compute_Dc_v4(data)
    elif model_name == 'Tc_v1':
        matrix_object = compute_Tc_v1(data)

    # Save document_topic_distribution
    document_topic_distribution(corpus, matrix_object, model, model_name, num_topics)
def transform_sklearn_to_gensim(corpus_vect, vectorizer):
    # transform sparse document-term matrix into a gensim corpus
    # (documents are rows, so documents_columns=False)
    corpus_vect_gensim = matutils.Sparse2Corpus(corpus_vect, documents_columns=False)
    # the word->id vocabulary lives on the fitted vectorizer, not on the
    # sparse matrix itself; invert it into the id->word map from_corpus expects
    dictionary = Dictionary.from_corpus(
        corpus_vect_gensim,
        id2word=dict((id, word) for word, id in vectorizer.vocabulary_.items()))
    return corpus_vect_gensim, dictionary
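# A minimal usage sketch for transform_sklearn_to_gensim, assuming
# scikit-learn's CountVectorizer; the sample documents are illustrative only.
from sklearn.feature_extraction.text import CountVectorizer

docs = ["human machine interface", "graph minors survey", "graph trees"]
vect = CountVectorizer()
dtm = vect.fit_transform(docs)  # scipy sparse matrix, one document per row
gensim_corpus, gensim_dict = transform_sklearn_to_gensim(dtm, vect)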
def main():
    model_name = str(sys.argv[1])
    num_topics = int(sys.argv[2])

    # For visualization
    corpus = load_corpus(model_name)
    id2word = load_id2word(model_name)
    dictionary = Dictionary.from_corpus(corpus, id2word=id2word)

    # Load LDAModel
    model = load_model(model_name, num_topics)
    save_pyldavis2html(model, corpus, dictionary, model_name, num_topics)
def main():
    # ---------------- Set MKL environment variables for better Gensim LDA performance ----------------
    os.environ["MKL_NUM_THREADS"] = "1"
    os.environ["NUMEXPR_NUM_THREADS"] = "1"
    os.environ["OMP_NUM_THREADS"] = "1"

    # ---------------- Prepare LDA Inputs & Run LDA ----------------
    # Parse command line args
    save_name = str(sys.argv[1])
    cap = str(sys.argv[2])
    num_topics = int(sys.argv[3])

    # Load data
    data = load_pickle("FINRA_TRACE_2015.pkl.zip")
    data = data.append(load_pickle("FINRA_TRACE_2014.pkl.zip"), ignore_index=True)
    #data = data.append(load_pickle("FINRA_TRACE_2013.pkl.zip"), ignore_index=True)
    #data = data.append(load_pickle("FINRA_TRACE_2012.pkl.zip"), ignore_index=True)

    # Compute a version of bag_of_words given the save_name
    if save_name == "trade_vol_BoW":
        bag_of_words = trade_vol_BoW(data, cap)
        del data
        save_name = save_name + "_" + cap
    elif save_name == "trade_vol_BoW_norm":
        bag_of_words = trade_vol_BoW_norm(data, cap)
        del data
        save_name = save_name + "_" + cap
    elif save_name == "Dc_v4":
        bag_of_words = compute_Dc_v4(data)
        del data
    else:
        print("the save_name does not have a corresponding bag_of_words")

    # Compute input for gensim LDA
    corpus = compute_corpus(bag_of_words, save_name)
    id2word = compute_id2word(bag_of_words, save_name)

    # Run Gensim LDA
    lda = compute_topic(save_name, corpus, num_topics, id2word, workers=11,
                        chunksize=12500, passes=40, iterations=600)

    # ---------------- LDA Analysis ----------------
    #os.environ["MKL_NUM_THREADS"] = "4"
    #os.environ["NUMEXPR_NUM_THREADS"] = "4"
    #os.environ["OMP_NUM_THREADS"] = "4"

    # Run PyLDAvis
    dictionary = Dictionary.from_corpus(corpus, id2word=id2word)
    save_pyldavis2html(lda, corpus, dictionary, save_name, num_topics)

    # Save document X topic matrix to csv
    document_topic_distribution(corpus, bag_of_words, lda, save_name, num_topics)
def createTopicDistribution(topics, min_df, max_features):
    topic_dist_key = f'{int(min_df * 1000):d}_{max_features}_{topics}'
    topic_dist_path = Path('../data/topic_distribution') / f'{topic_dist_key}.csv'

    key = f'{max_features}'
    dtm_path = corpus_path / f'dtm_{key}.npz'
    dtm = sparse.load_npz(dtm_path)
    token_path = corpus_path / f'tokens_{key}.csv'
    tokens = pd.read_csv(token_path, header=None, squeeze=True,
                         na_values=[], keep_default_na=False)
    model_file = datapath((experiment_path / 'models' / f'{key}_{topics}').resolve())
    lda_model = LdaModel.load(model_file)

    id2word = tokens.to_dict()
    corpus = Sparse2Corpus(dtm, documents_columns=False)
    dictionary = Dictionary.from_corpus(corpus, id2word)

    text_path = Path('../data/clean_stop')
    text_files = text_path.glob('*.txt')
    docs = [(f.name, f.read_text()) for f in text_files]

    topic_labels = [f'Topic {i}' for i in range(1, topics + 1)]
    document_topics = pd.DataFrame(index=topic_labels)
    for i, doc in enumerate(docs):
        bow = dictionary.doc2bow(doc[1].split())
        document_topics[doc[0]] = pd.Series({
            f'Topic {k + 1}': v
            for k, v in lda_model.get_document_topics(bow=bow, minimum_probability=1e-3)
        })
    document_topics.to_csv(topic_dist_path)
def main():
    # ---------------- Set MKL environment variables for better Gensim LDA performance ----------------
    os.environ["MKL_NUM_THREADS"] = "1"
    os.environ["NUMEXPR_NUM_THREADS"] = "1"
    os.environ["OMP_NUM_THREADS"] = "1"

    # ---------------- Prepare LDA Inputs & Run LDA ----------------
    # Parse command line args
    save_name = str(sys.argv[1])
    cap = str(sys.argv[2])
    num_topics = int(sys.argv[3])

    # Load data
    bow_matrix_train_path = save_name + "_train_sparse.npz"
    bow_matrix_test_path = save_name + "_test_sparse.npz"
    if False:  #os.path.exists(bow_matrix_train_path):
        #X_train = scipy.sparse.load_npz(bow_matrix_train_path)
        #X_test = scipy.sparse.load_npz(bow_matrix_test_path)
        pass
    else:
        data = load_pickle("FINRA_TRACE_2014.pkl.zip")
        #data = data.append(load_pickle("FINRA_TRACE_2014.pkl.zip"), ignore_index=True)
        #data = data.append(load_pickle("FINRA_TRACE_2013.pkl.zip"), ignore_index=True)
        #data = data.append(load_pickle("FINRA_TRACE_2012.pkl.zip"), ignore_index=True)

        # Compute a version of bag_of_words given the save_name
        if save_name == "trade_frac_out":
            bag_of_words = trade_frac_out(data)
            del data
        elif save_name == "trade_vol_BoW":
            bag_of_words = trade_vol_BoW(data, cap)
            del data
            save_name = save_name + "_" + cap
        elif save_name == "trade_vol_BoW_norm":
            bag_of_words = trade_vol_BoW_norm(data, cap)
            del data
            save_name = save_name + "_" + cap
        elif save_name == "trade_count":
            bag_of_words = compute_count(data)
            del data
        else:
            raise Exception("the save_name does not have a corresponding bag_of_words")

        dtype = pd.SparseDtype(float, fill_value=0)
        X = scipy.sparse.csr_matrix(bag_of_words.astype(dtype).sparse.to_coo())
        #X = bag_of_words.astype(dtype)
        #cutoff = int(X.shape[0] * 0.9)
        #X_train = X[:cutoff]
        #X_test = X[cutoff:]
        X_train, X_test, train_idx, test_idx = train_test_split(
            X, np.arange(X.shape[0]), test_size=0.1, random_state=42)
        scipy.sparse.save_npz(save_name + "_train_sparse.npz", X_train)
        scipy.sparse.save_npz(save_name + "_test_sparse.npz", X_test)

    # slice our matrix to be just the training data
    #bag_of_words = bag_of_words.iloc[train_idx]
    train_index = bag_of_words.index[train_idx]

    # Compute input for gensim LDA
    corpus = compute_corpus(X_train, save_name)
    test_corpus = compute_corpus(X_test, save_name + "_test")
    id2word = compute_id2word(bag_of_words, save_name)

    # Run Gensim LDA
    start = time.time()
    lda = compute_topic(save_name, corpus, num_topics, id2word, workers=11,
                        chunksize=12500, passes=10, iterations=600)
    lda_time = time.time() - start

    train_perplex = lda.log_perplexity(corpus)
    test_perplex = lda.log_perplexity(test_corpus)
    print("perplexity scores: ", train_perplex, test_perplex)
    with open("perplex_scores.csv", "a+") as f:
        writer = csv.writer(f)
        writer.writerow([save_name, num_topics, train_perplex, test_perplex, lda_time])

    # ---------------- LDA Analysis ----------------
    #os.environ["MKL_NUM_THREADS"] = "4"
    #os.environ["NUMEXPR_NUM_THREADS"] = "4"
    #os.environ["OMP_NUM_THREADS"] = "4"

    # Run PyLDAvis
    dictionary = Dictionary.from_corpus(corpus, id2word=id2word)
    save_pyldavis2html(lda, corpus, dictionary, save_name, num_topics)

    # Save document X topic matrix to csv
    document_topic_distribution(corpus, train_index, lda, save_name, num_topics)
passes = 1
start = time()
for i, (min_df, max_df, binary) in enumerate(dtm_params, 1):
    print(min_df, max_df, binary)
    vocab_path = experiment_path / str(min_df) / str(max_df) / str(int(binary))
    try:
        dtm = sparse.load_npz(vocab_path / 'dtm.npz')
        tokens = pd.read_csv(vocab_path / 'tokens.csv', header=None, squeeze=True)
    except FileNotFoundError:
        print('missing')
        continue
    corpus = Sparse2Corpus(dtm, documents_columns=False)
    id2word = tokens.to_dict()
    dictionary = Dictionary.from_corpus(corpus, id2word)
    for num_topics in topics:
        print(num_topics, end=' ')
        model_path = vocab_path / str(num_topics) / str(passes) / 'lda'
        if model_path.exists():
            lda = LdaModel.load(model_path.as_posix())
        else:
            continue
        start = time()
        vis = prepare(lda, corpus, dictionary, mds='tsne')
        terms = vis.topic_info
        terms = terms[terms.Category != 'Default']
        pyLDAvis.save_html(vis, (model_path / 'ldavis.html').as_posix())
        terms.to_csv(model_path / 'relevant_terms.csv', index=False)
        duration = time() - start
def get_lists(rows):
    x = literal_eval(rows.values[0])
    combined = list(itertools.chain.from_iterable(x))
    model_lists.append(combined)
    return x

print(len(model_texts))
df_grouped1 = df.groupby(['year'])['token_lists'].apply(get_lists)

tfidf_model = TfidfVectorizer(lowercase=False)
tfidf = tfidf_model.fit_transform(model_texts)
corpus = gensim.matutils.Sparse2Corpus(tfidf, documents_columns=False)
dictionary = Dictionary.from_corpus(
    corpus,
    id2word=dict((id, word) for word, id in tfidf_model.vocabulary_.items()))

model = LdaModel(corpus=corpus, id2word=dictionary, iterations=50, num_topics=80)
model.save('lda_congo.model')

# def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3):
#     """
#     Compute c_v coherence for various number of topics
#
#     Parameters:
#     ----------
#     dictionary : Gensim dictionary
#     corpus : Gensim corpus
tfidf_model_three = TfidfVectorizer(min_df=0.05, max_df=0.30)
three = tfidf_model_three.fit_transform(literal_eval(df_1963.token_texts.values[0]))
tfidf_model_four = TfidfVectorizer(min_df=0.05, max_df=0.30)
four = tfidf_model_four.fit_transform(literal_eval(df_1964.token_texts.values[0]))
tfidf_model_five = TfidfVectorizer(min_df=0.05, max_df=0.30)
five = tfidf_model_five.fit_transform(literal_eval(df_1965.token_texts.values[0]))
tfidf_model_six = TfidfVectorizer(min_df=0.05, max_df=0.30)
six = tfidf_model_six.fit_transform(literal_eval(df_1966.token_texts.values[0]))

corpus_zero = gensim.matutils.Sparse2Corpus(zero, documents_columns=False)
dictionary_zero = Dictionary.from_corpus(
    corpus_zero,
    id2word=dict((id, word) for word, id in tfidf_model_zero.vocabulary_.items()))
model_zero = LdaModel(corpus=corpus_zero, id2word=dictionary_zero,
                      iterations=50, num_topics=40)

corpus_one = gensim.matutils.Sparse2Corpus(one, documents_columns=False)
dictionary_one = Dictionary.from_corpus(
    corpus_one,
    id2word=dict((id, word) for word, id in tfidf_model_one.vocabulary_.items()))
model_one = LdaModel(corpus=corpus_one, id2word=dictionary_one,
                     iterations=50, num_topics=40)
genes = pd.read_csv(fname + '_genes.csv', header=0, index_col=0)

# training set
rowsT = np.where(ind != i)
X = np.asarray(phi.iloc[rowsT])
cols = np.logical_and(fmin < X.sum(axis=0), X.sum(axis=0) < fmax)
X = X[:, cols]
X_corp = Dense2Corpus(np.array(X), documents_columns=False)

# valid set
rowsV = np.where(ind == i)
X_test = np.asarray(phi.iloc[rowsV])
X_test = X_test[:, cols]
X_testcorp = Dense2Corpus(np.array(X_test), documents_columns=False)

dic = Dictionary.from_corpus(X_corp)
model = LdaModel.load(fname + '_model')  # load model
cm = CoherenceModel(model=model, corpus=X_corp, dictionary=dic, coherence=coh_meas)
coh_tr[i - 1] = cm.get_coherence()
cm = CoherenceModel(model=model, corpus=X_testcorp, dictionary=dic, coherence=coh_meas)
coh_te[i - 1] = cm.get_coherence()

print("Mean Train Coherence: " + str(np.mean(coh_tr)))
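# Note on the coherence measure above (gensim behavior, stated as an
# assumption about this snippet's context): a CoherenceModel built from only
# a BoW corpus and dictionary supports the 'u_mass' measure; sliding-window
# measures such as 'c_v' or 'c_npmi' require the raw `texts` argument
# instead, so coh_meas here is presumably 'u_mass'.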