def extractCharacterData(df_comments, tag):
    """Compute character n-gram and skip-gram features and hand them to the saver.

    The sequence returned by ``extract_global_bag_of_words(df_comments)`` is
    split into a train and a test list using the index arrays loaded from
    ``train_vect.npy`` and ``test_vect.npy``.  Four vectorizer configurations
    are then run through ``extract_words``; each resulting train/test pair is
    passed to ``save_sparse_csr`` with a path built from ``feature_set_path``,
    the feature-set name, ``tag`` and a ``_train`` / ``_test`` suffix.

    :param df_comments: comment data forwarded unchanged to
        ``extract_global_bag_of_words``.
    :param tag: string inserted into every output path, between the
        feature-set name and the ``_train`` / ``_test`` suffix.
    """
    all_comments = extract_global_bag_of_words(df_comments)
    train_idx, test_idx = np.load('train_vect.npy'), np.load('test_vect.npy')

    train_list = [all_comments[i] for i in train_idx]
    test_list = [all_comments[i] for i in test_idx]
    print(len(train_list))
    print(len(test_list))

    # (progress label, vectorizer factory, output file stem).
    # Factories keep construction lazy: each vectorizer is only built right
    # before it is used, after its progress label has been printed.
    feature_sets = (
        (
            'Character Ngrams Binary',
            lambda: CountVectorizer(analyzer=CharacterAnalyzer(), binary=True, dtype=float),
            "binaryCharacterData",
        ),
        (
            'Character Ngrams tfidf',
            lambda: TfidfVectorizer(analyzer=CharacterAnalyzer(), use_idf=True,
                                    smooth_idf=True, dtype=float),
            "tfidfCharacterData",
        ),
        (
            'Character skipgrams Binary',
            lambda: CountVectorizer(analyzer=CharacterSkipGramAnalyzer(), binary=True,
                                    dtype=float),
            "binaryCharacterSkipgramData",
        ),
        (
            'Character skipgrams TFIDF',
            lambda: TfidfVectorizer(analyzer=CharacterSkipGramAnalyzer(), use_idf=True,
                                    smooth_idf=True, dtype=float),
            "tfidfCharacterSkipgramData",
        ),
    )

    for label, make_vectorizer, stem in feature_sets:
        print(label)
        train_matrix, test_matrix = extract_words(make_vectorizer(), train_list, test_list)
        save_sparse_csr(feature_set_path + stem + tag + "_train", train_matrix)
        save_sparse_csr(feature_set_path + stem + tag + "_test", test_matrix)
def extractLexicalBigramData(articleList, commentList, commentCount):
    """Compute lexical unigram+bigram features and hand them to the saver.

    The sequence returned by ``extract_global_bag_of_words(commentList)`` is
    split into train and test lists using the index arrays loaded from
    ``train_vect.npy`` and ``test_vect.npy``.  A binary count vectorizer and a
    TF-IDF vectorizer, both using ``LexicalBigramUnigramAnalyzer``, are run
    through ``extract_words`` and each train/test result is passed to
    ``save_sparse_csr`` with a path under ``feature_set_path``.

    NOTE(review): this file defines ``extractLexicalBigramData`` a second time
    right after this definition; the later one is what ends up bound to the
    name.

    :param articleList: not used by this function.
    :param commentList: comment data forwarded to
        ``extract_global_bag_of_words``.
    :param commentCount: not used by this function.
    """
    all_comments = extract_global_bag_of_words(commentList)
    train_idx, test_idx = np.load('train_vect.npy'), np.load('test_vect.npy')

    train_list = [all_comments[i] for i in train_idx]
    test_list = [all_comments[i] for i in test_idx]
    print(len(train_list))
    print(len(test_list))

    # (progress label, vectorizer factory, train file name, test file name).
    # Factories defer vectorizer construction until after the label is printed.
    feature_sets = (
        (
            'Lexical Ngrams Binary',
            lambda: CountVectorizer(analyzer=LexicalBigramUnigramAnalyzer(), binary=True,
                                    dtype=float),
            "binaryLexicalBigramsData_train",
            "binaryLexicalBigramsData_test",
        ),
        (
            'Lexical Ngrams tfidf',
            lambda: TfidfVectorizer(analyzer=LexicalBigramUnigramAnalyzer(), use_idf=True,
                                    smooth_idf=True, dtype=float),
            "tfidfLexicalBigramsData_train",
            "tfidfLexicalBigramsData_test",
        ),
    )

    for label, make_vectorizer, train_name, test_name in feature_sets:
        print(label)
        train_matrix, test_matrix = extract_words(make_vectorizer(), train_list, test_list)
        save_sparse_csr(feature_set_path + train_name, train_matrix)
        save_sparse_csr(feature_set_path + test_name, test_matrix)
def extractLexicalBigramData(articleList, commentList, commentCount):
    """Compute lexical unigram+bigram features and hand them to the saver.

    Comments come from ``extract_global_bag_of_words(commentList)`` and are
    partitioned with the index arrays stored in ``train_vect.npy`` and
    ``test_vect.npy``.  Two feature sets are produced through
    ``extract_words`` (binary counts and TF-IDF, both with
    ``LexicalBigramUnigramAnalyzer``); every resulting matrix is passed to
    ``save_sparse_csr`` with a path under ``feature_set_path``.

    NOTE(review): an earlier definition with the same name appears just above
    this one in the file; this later definition replaces it.

    :param articleList: not used by this function.
    :param commentList: comment data forwarded to
        ``extract_global_bag_of_words``.
    :param commentCount: not used by this function.
    """
    corpus = extract_global_bag_of_words(commentList)
    train_positions, test_positions = np.load('train_vect.npy'), np.load('test_vect.npy')

    train_list = [corpus[pos] for pos in train_positions]
    test_list = [corpus[pos] for pos in test_positions]
    print(len(train_list))
    print(len(test_list))

    def store(train_name, test_name, matrices):
        # Unpack the (train, test) pair and save train first, then test.
        fitted_train, fitted_test = matrices
        save_sparse_csr(feature_set_path + train_name, fitted_train)
        save_sparse_csr(feature_set_path + test_name, fitted_test)

    print('Lexical Ngrams Binary')
    store(
        "binaryLexicalBigramsData_train",
        "binaryLexicalBigramsData_test",
        extract_words(
            CountVectorizer(analyzer=LexicalBigramUnigramAnalyzer(), binary=True, dtype=float),
            train_list,
            test_list,
        ),
    )

    print('Lexical Ngrams tfidf')
    store(
        "tfidfLexicalBigramsData_train",
        "tfidfLexicalBigramsData_test",
        extract_words(
            TfidfVectorizer(analyzer=LexicalBigramUnigramAnalyzer(), use_idf=True,
                            smooth_idf=True, dtype=float),
            train_list,
            test_list,
        ),
    )
def extractWordData(df_comments, tag):
    """Compute eleven word n-gram feature sets and hand them to the saver.

    The sequence returned by
    ``extract_global_bag_of_words_processed(df_comments)`` is split into train
    and test lists using the index arrays loaded from ``train_vect.npy`` and
    ``test_vect.npy``; the entries are passed to the analyzers as they are.
    Each configuration in the table below is run through ``extract_words``.
    Once all of them are done, one sample row per training matrix is printed
    for inspection, then every train matrix and finally every test matrix is
    passed to ``save_sparse_csr`` with a path built from ``feature_set_path``,
    the feature-set name, ``tag`` and a ``_train`` / ``_test`` suffix.

    The sample-row dump is skipped for any training matrix that has too few
    rows, so a small training split no longer aborts the run with an
    ``IndexError`` before anything has been saved.

    :param df_comments: comment data forwarded unchanged to
        ``extract_global_bag_of_words_processed``.
    :param tag: string inserted into every saved path, between the
        feature-set name and the ``_train`` / ``_test`` suffix.  It is not
        part of the labels printed with the sample rows.
    """
    processed_comments = extract_global_bag_of_words_processed(df_comments)
    print(len(processed_comments))

    train_idx, test_idx = np.load('train_vect.npy'), np.load('test_vect.npy')
    train_list = [processed_comments[i] for i in train_idx]
    test_list = [processed_comments[i] for i in test_idx]
    print(len(train_list))
    print(len(test_list))

    # (progress label, vectorizer factory, output file stem).
    # Factories keep construction lazy: each vectorizer is only built right
    # before it is used, after its progress label has been printed.
    feature_sets = (
        (
            'Unigram Binary',
            lambda: CountVectorizer(analyzer=UnigramAnalyzer(), binary=True, dtype=float),
            "binaryWordData",
        ),
        (
            'Unigram Frequency',
            lambda: CountVectorizer(analyzer=UnigramAnalyzer(), dtype=float),
            "freqWordData",
        ),
        (
            'Unigram TFIDF',
            lambda: TfidfVectorizer(analyzer=UnigramAnalyzer(), use_idf=True,
                                    smooth_idf=True, dtype=float),
            "tfidfWordData",
        ),
        (
            'Bigram Binary',
            lambda: CountVectorizer(analyzer=UnigramBigramAnalyzer(), binary=True,
                                    dtype=float),
            "bigramBinaryWordData",
        ),
        (
            'Bigram TFIDF',
            lambda: TfidfVectorizer(analyzer=UnigramBigramAnalyzer(), use_idf=True,
                                    smooth_idf=True, dtype=float),
            "bigramTfidfWordData",
        ),
        (
            'Trigram Binary',
            lambda: CountVectorizer(analyzer=UnigramBigramTrigramAnalyzer(), binary=True,
                                    dtype=float),
            "trigramBinaryWordData",
        ),
        (
            'Trigram TFIDF',
            lambda: TfidfVectorizer(analyzer=UnigramBigramTrigramAnalyzer(), use_idf=True,
                                    smooth_idf=True, dtype=float),
            "trigramTfidfWordData",
        ),
        (
            'Bigram Only Binary',
            lambda: CountVectorizer(analyzer=BigramAnalyzer(), binary=True, dtype=float),
            "bigramOnlyBinaryWordData",
        ),
        (
            'Bigram Only TFIDF',
            lambda: TfidfVectorizer(analyzer=BigramAnalyzer(), use_idf=True,
                                    smooth_idf=True, dtype=float),
            "bigramOnlyTfidfWordData",
        ),
        (
            'Trigram Only Binary',
            lambda: CountVectorizer(analyzer=TrigramAnalyzer(), binary=True, dtype=float),
            "trigramOnlyBinaryWordData",
        ),
        (
            'Trigram Only TFIDF',
            lambda: TfidfVectorizer(analyzer=TrigramAnalyzer(), use_idf=True,
                                    smooth_idf=True, dtype=float),
            "trigramOnlyTfidfWordData",
        ),
    )

    # Phase 1: run every configuration before anything is printed or saved.
    results = []
    for label, make_vectorizer, stem in feature_sets:
        print(label)
        train_matrix, test_matrix = extract_words(make_vectorizer(), train_list, test_list)
        results.append((stem, train_matrix, test_matrix))

    # Phase 2: dump one sample training row per feature set for inspection.
    # The row index is fixed, so guard against training sets that are too
    # short instead of letting the lookup raise IndexError.
    sample_row = 123
    for stem, train_matrix, _ in results:
        if train_matrix.shape[0] > sample_row:
            # Printed as one explicit tuple so the output is the same under
            # the Python 2 print statement and the Python 3 print function.
            print((feature_set_path + stem + "_train", train_matrix[sample_row, :]))

    # Phase 3: save all train matrices, then all test matrices.
    for stem, train_matrix, _ in results:
        save_sparse_csr(feature_set_path + stem + tag + "_train", train_matrix)
    for stem, _, test_matrix in results:
        save_sparse_csr(feature_set_path + stem + tag + "_test", test_matrix)