def extractTopicModelData(articleList, commentList, commentCount, set_tag, tag):
    processed_comment_list = extract_global_bag_of_words_processed(commentList)
    print len(processed_comment_list)

    # Re-use the train/test index split saved by the main script below.
    train_v, test_v = np.load('train_vect.npy'), np.load('test_vect.npy')
    train_list = []
    test_list = []
    for v in train_v:
        train_list.append(processed_comment_list[v])
    for v in test_v:
        test_list.append(processed_comment_list[v])

    lda = models.LdaModel.load(model_path + set_tag.replace("_", "") + "_lda_model")
    dictionary = corpora.Dictionary.load(model_path + set_tag.replace("_", "") + "_dictionary")

    train = [dictionary.doc2bow(text) for text in train_list]
    test = [dictionary.doc2bow(text) for text in test_list]

    docTopicProbMat_train = lda[train]
    docTopicProbMat_test = lda[test]

    # corpus2dense requires num_terms; here each "term" is one of the LDA
    # topics, and the result is a dense (num_topics x num_docs) matrix.
    train_lda = matutils.corpus2dense(docTopicProbMat_train, num_terms=lda.num_topics)
    test_lda = matutils.corpus2dense(docTopicProbMat_test, num_terms=lda.num_topics)

    print train_lda.shape
    print test_lda.shape

    save_sparse_csr(feature_set_path + set_tag + "lda" + tag + "_train", train_lda)
    save_sparse_csr(feature_set_path + set_tag + "lda" + tag + "_test", test_lda)
    print "DONE LDA"
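# save_sparse_csr / load_sparse_csr are used throughout this file but defined
# elsewhere in the repo. The pair below is a minimal sketch of the usual
# numpy .npz recipe for CSR matrices -- an assumption about their behaviour,
# not the repo's actual helpers.
from scipy.sparse import csr_matrix

def save_sparse_csr(filename, array):
    # Coerce to CSR so dense inputs (e.g. the LDA matrices above) also work.
    array = csr_matrix(array)
    np.savez(filename, data=array.data, indices=array.indices,
             indptr=array.indptr, shape=array.shape)

def load_sparse_csr(filename):
    loader = np.load(filename + '.npz')
    return csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                      shape=loader['shape'])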
def extractWordData(df_comments, tag):
    processed_comment_list = extract_global_bag_of_words_processed(df_comments)
    print len(processed_comment_list)

    train_v, test_v = np.load('train_vect.npy'), np.load('test_vect.npy')
    train_list = []
    test_list = []
    for v in train_v:
        train_list.append(processed_comment_list[v])
    for v in test_v:
        test_list.append(processed_comment_list[v])

    # train_list = [' '.join(sent) for sent in train_list]
    # test_list = [' '.join(sent) for sent in test_list]

    print len(train_list)
    print len(test_list)

    print 'Unigram Binary'
    bwd_train, bwd_test = extract_words(CountVectorizer(analyzer=UnigramAnalyzer(), binary=True, dtype=float), train_list, test_list)
    print 'Unigram Frequency'
    fwd_train, fwd_test = extract_words(CountVectorizer(analyzer=UnigramAnalyzer(), dtype=float), train_list, test_list)
    print 'Unigram TFIDF'
    twd_train, twd_test = extract_words(TfidfVectorizer(analyzer=UnigramAnalyzer(), use_idf=True, smooth_idf=True, dtype=float), train_list, test_list)
    print 'Bigram Binary'
    bbwd_train, bbwd_test = extract_words(CountVectorizer(analyzer=UnigramBigramAnalyzer(), binary=True, dtype=float), train_list, test_list)
    print 'Bigram TFIDF'
    btwd_train, btwd_test = extract_words(TfidfVectorizer(analyzer=UnigramBigramAnalyzer(), use_idf=True, smooth_idf=True, dtype=float), train_list, test_list)
    print 'Trigram Binary'
    tbwd_train, tbwd_test = extract_words(CountVectorizer(analyzer=UnigramBigramTrigramAnalyzer(), binary=True, dtype=float), train_list, test_list)
    print 'Trigram TFIDF'
    ttwd_train, ttwd_test = extract_words(TfidfVectorizer(analyzer=UnigramBigramTrigramAnalyzer(), use_idf=True, smooth_idf=True, dtype=float), train_list, test_list)
    print 'Bigram Only Binary'
    bowd_train, bowd_test = extract_words(CountVectorizer(analyzer=BigramAnalyzer(), binary=True, dtype=float), train_list, test_list)
    print 'Bigram Only TFIDF'
    bowd2_train, bowd2_test = extract_words(TfidfVectorizer(analyzer=BigramAnalyzer(), use_idf=True, smooth_idf=True, dtype=float), train_list, test_list)
    print 'Trigram Only Binary'
    towd_train, towd_test = extract_words(CountVectorizer(analyzer=TrigramAnalyzer(), binary=True, dtype=float), train_list, test_list)
    print 'Trigram Only TFIDF'
    towd2_train, towd2_test = extract_words(TfidfVectorizer(analyzer=TrigramAnalyzer(), use_idf=True, smooth_idf=True, dtype=float), train_list, test_list)

    # Debug output: spot-check row 123 of each training matrix.
    print(feature_set_path + "binaryWordData_train", bwd_train[123, :])
    print(feature_set_path + "freqWordData_train", fwd_train[123, :])
    print(feature_set_path + "tfidfWordData_train", twd_train[123, :])
    print(feature_set_path + "bigramBinaryWordData_train", bbwd_train[123, :])
    print(feature_set_path + "bigramTfidfWordData_train", btwd_train[123, :])
    print(feature_set_path + "trigramBinaryWordData_train", tbwd_train[123, :])
    print(feature_set_path + "trigramTfidfWordData_train", ttwd_train[123, :])
    print(feature_set_path + "bigramOnlyBinaryWordData_train", bowd_train[123, :])
    print(feature_set_path + "bigramOnlyTfidfWordData_train", bowd2_train[123, :])
    print(feature_set_path + "trigramOnlyBinaryWordData_train", towd_train[123, :])
    print(feature_set_path + "trigramOnlyTfidfWordData_train", towd2_train[123, :])

    save_sparse_csr(feature_set_path + "binaryWordData" + tag + "_train", bwd_train)
    save_sparse_csr(feature_set_path + "freqWordData" + tag + "_train", fwd_train)
    save_sparse_csr(feature_set_path + "tfidfWordData" + tag + "_train", twd_train)
    save_sparse_csr(feature_set_path + "bigramBinaryWordData" + tag + "_train", bbwd_train)
    save_sparse_csr(feature_set_path + "bigramTfidfWordData" + tag + "_train", btwd_train)
    save_sparse_csr(feature_set_path + "trigramBinaryWordData" + tag + "_train", tbwd_train)
    save_sparse_csr(feature_set_path + "trigramTfidfWordData" + tag + "_train", ttwd_train)
    save_sparse_csr(feature_set_path + "bigramOnlyBinaryWordData" + tag + "_train", bowd_train)
    save_sparse_csr(feature_set_path + "bigramOnlyTfidfWordData" + tag + "_train", bowd2_train)
    save_sparse_csr(feature_set_path + "trigramOnlyBinaryWordData" + tag + "_train", towd_train)
    save_sparse_csr(feature_set_path + "trigramOnlyTfidfWordData" + tag + "_train", towd2_train)

    save_sparse_csr(feature_set_path + "binaryWordData" + tag + "_test", bwd_test)
    save_sparse_csr(feature_set_path + "freqWordData" + tag + "_test", fwd_test)
    save_sparse_csr(feature_set_path + "tfidfWordData" + tag + "_test", twd_test)
    save_sparse_csr(feature_set_path + "bigramBinaryWordData" + tag + "_test", bbwd_test)
    save_sparse_csr(feature_set_path + "bigramTfidfWordData" + tag + "_test", btwd_test)
    save_sparse_csr(feature_set_path + "trigramBinaryWordData" + tag + "_test", tbwd_test)
    save_sparse_csr(feature_set_path + "trigramTfidfWordData" + tag + "_test", ttwd_test)
    save_sparse_csr(feature_set_path + "bigramOnlyBinaryWordData" + tag + "_test", bowd_test)
    save_sparse_csr(feature_set_path + "bigramOnlyTfidfWordData" + tag + "_test", bowd2_test)
    save_sparse_csr(feature_set_path + "trigramOnlyBinaryWordData" + tag + "_test", towd_test)
    save_sparse_csr(feature_set_path + "trigramOnlyTfidfWordData" + tag + "_test", towd2_test)
    articleList, commentList, parentList, commentCount = read_toy_comments(
        comment_data_path + 'trainTestDataSet.txt',
        comment_data_path + 'toyComments.csv')
elif set == 3:
    articleList, commentList, commentCount = read_slashdot_comments(
        comment_data_path + 'slashdotDataSet.txt', limit=100000)

# Values
y = extract_values(articleList, commentList, commentCount, set)

sss = StratifiedShuffleSplit(y, 1, test_size=0.95, random_state=42)
y_train = []
y_test = []
for train, test in sss:
    np.save('train_vect', train)
    np.save('test_vect', test)
    y_train = y[train]
    y_test = y[test]

processed_comment_list = extract_global_bag_of_words_processed(commentList)
train_v, test_v = np.load('train_vect.npy'), np.load('test_vect.npy')
train_list = []
test_list = []
for v in train_v:
    train_list.append(processed_comment_list[v])
for v in test_v:
    test_list.append(processed_comment_list[v])

train, test, terms = extract_words(
    CountVectorizer(analyzer=UnigramAnalyzer(), dtype=float), train_list, test_list)
print train.shape

model = LDA()
model.fit(train.toarray(), y_train)
values = []
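# Note: StratifiedShuffleSplit(y, 1, ...) above is the pre-0.18 scikit-learn
# API, where the splitter is constructed with the labels and iterated
# directly. For reference, a sketch of the same split with the modern API:
#
#     from sklearn.model_selection import StratifiedShuffleSplit
#     sss = StratifiedShuffleSplit(n_splits=1, test_size=0.95, random_state=42)
#     for train, test in sss.split(np.zeros(len(y)), y):
#         np.save('train_vect', train)
#         np.save('test_vect', test)
#         y_train = y[train]
#         y_test = y[test]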