class TF_Transformer(base.BaseEstimator, base.TransformerMixin):

    def __init__(self):
        self.cv_bi = CountVectorizer(min_df=2, max_df=0.7, ngram_range=(1, 2))
        self.tfidf_trans = TfidfTransformer()
        self.SVD_trans = TruncatedSVD(n_components=300)

    # X is a list of Fit_Review named tuples, y is None
    def fit(self, X, y=None):
        texts = [review.text for review in X]
        counts = self.cv_bi.fit_transform(texts)
        counts_tfidf = self.tfidf_trans.fit_transform(counts)
        self.SVD_trans.fit(counts_tfidf)
        return self

    # X is a list of either Fit_Review or Prod_Corpus named tuples
    def transform(self, X):
        texts = [review.text for review in X]
        counts = self.cv_bi.transform(texts)
        counts_tfidf = self.tfidf_trans.transform(counts)
        counts_trunc = self.SVD_trans.transform(counts_tfidf)
        return counts_trunc
def check_webshell(clf, dir):
    all = 0
    all_php = 0
    webshell = 0

    webshell_files_list = load_files_re(webshell_dir)
    CV = CountVectorizer(ngram_range=(3, 3), decode_error="ignore",
                         max_features=max_features,
                         token_pattern=r'\b\w+\b', min_df=1, max_df=1.0)
    x = CV.fit_transform(webshell_files_list).toarray()

    transformer = TfidfTransformer(smooth_idf=False)
    transformer.fit_transform(x)

    g = os.walk(dir)
    for path, d, filelist in g:
        for filename in filelist:
            fullpath = os.path.join(path, filename)
            t = load_file(fullpath)
            t_list = []
            t_list.append(t)
            x2 = CV.transform(t_list).toarray()
            x2 = transformer.transform(x2).toarray()
            y_pred = clf.predict(x2)
            all += 1
            if filename.endswith('.php'):
                all_php += 1
            if y_pred[0] == 1:
                print "%s is webshell" % fullpath
                webshell += 1

    print "Scan %d files(%d php files),%d files is webshell" % (all, all_php, webshell)
def Bags_Of_Words(train_fp, test_fp, freq_flag):
    # This function returns a dictionary of bags of words:
    # one will be the training bag of words and one will be the test bag of words
    # train_fp is the filepath to the training dataset
    # test_fp is the filepath to the test dataset
    # freq_flag determines whether to use raw occurrence counts or tf-idf frequencies
    # when making the bags of words, which could affect how accurate our model is
    count_vect = CountVectorizer()
    tfidf_transformer = TfidfTransformer()

    df_train = pd.read_csv(train_fp)
    df_test = pd.read_csv(test_fp)

    # Only keep parent comments
    # df = df[df.comment_under_post != False]

    X_train_counts = count_vect.fit_transform(df_train.body)
    X_test_counts = count_vect.transform(df_test.body)

    # Takes into account comment length: transforms the matrix into the frequency
    # of a particular word, not simply its occurrence
    if freq_flag:
        X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
        # transform (not fit_transform) so the test set reuses the idf weights
        # learned from the training set
        X_test_tfidf = tfidf_transformer.transform(X_test_counts)
        return {"train": X_train_tfidf, "test": X_test_tfidf}
    else:
        return {"train": X_train_counts, "test": X_test_counts}
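# Sketch (not from the snippet above): a middle ground between raw occurrence
# counts and full tf-idf is length-normalized term frequency, which scikit-learn's
# TfidfTransformer produces with use_idf=False. The documents below are made up
# purely for illustration.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

docs = ["short comment", "a much much longer comment about the same topic"]
counts = CountVectorizer().fit_transform(docs)
tf_only = TfidfTransformer(use_idf=False).fit_transform(counts)  # l2-normalized tf
tfidf = TfidfTransformer(use_idf=True).fit_transform(counts)     # tf weighted by idf
print(tf_only.toarray())
print(tfidf.toarray())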
def test_tfidf_no_smoothing():
    X = [[1, 1, 1],
         [1, 1, 0],
         [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=False, norm='l2')
    tfidf = tr.fit_transform(X).toarray()
    assert_true((tfidf >= 0).all())

    # check normalization
    assert_array_almost_equal((tfidf ** 2).sum(axis=1), [1., 1., 1.])

    # the lack of smoothing makes IDF fragile in the presence of a feature with
    # only zeros
    X = [[1, 1, 0],
         [1, 1, 0],
         [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=False, norm='l2')
    with warnings.catch_warnings(record=True) as w:
        tfidf = tr.fit_transform(X).toarray()
        assert_equal(len(w), 1)
        # For Python 3 compatibility
        if hasattr(w[0].message, 'args'):
            assert_true("divide by zero" in w[0].message.args[0])
        else:
            assert_true("divide by zero" in w[0].message)
def work_with_simple_bag_of_words():
    count = CountVectorizer()
    docs = np.array([
        'The sun is shining',
        'The weather is sweet',
        'The sun is shining and the weather is sweet',
    ])
    bag = count.fit_transform(docs)
    print(count.vocabulary_)
    print(bag.toarray())

    np.set_printoptions(precision=2)
    tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)
    print(tfidf.fit_transform(bag).toarray())

    # the term 'is' appears twice in the third document and in all 3 documents overall
    tf_is = 2
    n_docs = 3
    idf_is = np.log((n_docs + 1) / (3 + 1))
    tfidf_is = tf_is * (idf_is + 1)
    print("tf-idf of term 'is' = %.2f" % tfidf_is)

    tfidf = TfidfTransformer(use_idf=True, norm=None, smooth_idf=True)
    raw_tfidf = tfidf.fit_transform(bag).toarray()[-1]
    print(raw_tfidf)

    l2_tfidf = raw_tfidf / np.sqrt(np.sum(raw_tfidf**2))
    print(l2_tfidf)
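# Sketch (not part of the example above): with smooth_idf=True scikit-learn uses
# idf(t) = ln((1 + n_docs) / (1 + df(t))) + 1, which is what the manual calculation
# for 'is' reproduces. The formula can be checked against the fitted transformer's
# idf_ attribute:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

docs = ['The sun is shining',
        'The weather is sweet',
        'The sun is shining and the weather is sweet']
bag = CountVectorizer().fit_transform(docs)
tfidf = TfidfTransformer(smooth_idf=True, use_idf=True).fit(bag)

n_docs = bag.shape[0]
df = np.bincount(bag.nonzero()[1], minlength=bag.shape[1])  # document frequency per term
manual_idf = np.log((1 + n_docs) / (1 + df)) + 1
print(np.allclose(manual_idf, tfidf.idf_))  # expected: True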
def TFIDF():
    global segcont
    global weight
    global we

    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(segcont))
    word = vectorizer.get_feature_names()  # all keywords across the texts
    weight = tfidf.toarray()               # the corresponding tf-idf matrix
    del segcont

    seg = []
    for i in range(len(weight)):
        enstr = ""
        for j in range(len(word)):
            if weight[i][j] >= 0.1:  # keep only words whose tf-idf weight is at least 0.1
                enstr = enstr + " " + word[j]
        seg.append(enstr)

    del weight

    vec = CountVectorizer()
    tra = TfidfTransformer()
    tidf = tra.fit_transform(vec.fit_transform(seg))
    wo = vec.get_feature_names()
    we = tidf.toarray()
def runAnalysis(self):
    trainingData = np.loadtxt(open(self.training_file, 'rb'), delimiter=',', skiprows=0)
    testData = np.loadtxt(open(self.test_file, 'rb'), delimiter=',', skiprows=0)
    #trainingData = np.genfromtxt(open(self.training_file,'rb'),delimiter=',');
    #testData = np.genfromtxt(open(self.testData,'rb'),delimiter=',');

    xTrain = trainingData[:, :trainingData.shape[1]-1]
    yTrain = trainingData[:, trainingData.shape[1]-1]
    xTest = testData[:, :testData.shape[1]-1]
    yTest = testData[:, testData.shape[1]-1]

    #evaluateCorrelationResults(xTrain, yTrain)
    #xTrain,xTest = transform(xTrain,yTrain,xTest)

    #tf-idf transformation: fit on the training set, then reuse the learned
    #idf weights to transform the test set
    transformer = TfidfTransformer()
    xTrain = transformer.fit_transform(xTrain)
    xTest = transformer.transform(xTest)

    appendDataTofile("Training dimension -> ", xTrain.shape)
    appendDataTofile("Testing dimension -> ", xTest.shape)

    #MultinomialNB classification
    """appendDataTofile("MultiNB");
    yPred = classify(lambda:naive_bayes.MultinomialNB(),xTrain,xTest,yTrain,yTest)"""

    #Logistic Regression classification
    #penalty="l1",C=0.5,intercept_scaling=2
    """appendDataTofile("Log regression");
    yPred = classify(lambda:linear_model.LogisticRegression(),
                     xTrain,xTest,yTrain,yTest)"""

    #SVM based classification
    appendDataTofile("SVM")
    #C=8.0,gamma=0.10,kernel='rbf',probability=True,shrinking=True
    #yPred = classify(lambda:svm.SVC(),
    #                 xTrain,xTest,yTrain,yTest)

    #Grid search SVM
    yPred = gridSearchCVforSVM(xTrain, xTest, yTrain, yTest)

    #yPred = clusterify(lambda:KMeans(n_clusters=3),xTrain,xTest,yTrain,yTest)
    """yPred = classify(lambda:KNeighborsClassifier(),xTrain,xTest,yTrain,yTest)"""
    """yPred = classify(lambda: linear_model.RidgeClassifierCV,xTrain,xTest,yTrain,yTest)"""

    outputFile = open("../results.txt", 'w+')
    rows = len(yPred)
    #outputFile.write("ID\tSentiment\n");
    for i in range(0, rows):
        outputFile.write(str(yPred[i]) + "\n")
    outputFile.close()
def linearSVC_prediction(self):
    tfidf = TfidfTransformer()
    X = tfidf.fit_transform(self.dvec.fit_transform(self.words))

    c_tfidf = TfidfTransformer()
    c_X = c_tfidf.fit_transform(self.c_dvec.fit_transform(self.c_words))

    self.svc = LinearSVC()
    self.svc.fit(X, self.scores)

    self.c_svc = LinearSVC()
    self.c_svc.fit(c_X, self.c_scores)
def test_tf_idf_smoothing():
    X = [[1, 1, 1],
         [1, 1, 0],
         [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=True, norm="l2")
    tfidf = tr.fit_transform(X).toarray()
    assert_true((tfidf >= 0).all())

    # check normalization
    assert_array_almost_equal((tfidf ** 2).sum(axis=1), [1.0, 1.0, 1.0])

    # this is robust to features with only zeros
    X = [[1, 1, 0],
         [1, 1, 0],
         [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=True, norm="l2")
    tfidf = tr.fit_transform(X).toarray()
    assert_true((tfidf >= 0).all())
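# Sketch (not part of the test above): why smoothing matters. With
# smooth_idf=False scikit-learn uses idf(t) = ln(n / df(t)) + 1, so a feature
# column that is all zeros (df = 0) triggers a divide-by-zero, which is exactly
# what test_tfidf_no_smoothing checks for. With smooth_idf=True the denominator
# never reaches zero and the idf stays finite.
import numpy as np

n = 3
df = np.array([3, 2, 0])                      # document frequencies; last feature never occurs
with np.errstate(divide='ignore'):
    idf_raw = np.log(n / df) + 1              # inf where df == 0
idf_smooth = np.log((1 + n) / (1 + df)) + 1   # finite everywhere
print(idf_raw, idf_smooth)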
def handle_doc(word_set,rs_path): doc_dir = os.listdir(rs_path) doc_matrix = [] doc_cat = [] for docs in doc_dir: files = os.listdir(rs_path+docs) print "start to handle the --> "+docs for file_d in files: d_path = rs_path+docs+'/'+file_d #get the single file path with open(d_path,'rb') as text_file: str_tmp = '' file_lines = text_file.readlines() for line in file_lines: pattern = r'''[a-zA-Z]+''' tokens = nltk.regexp_tokenize(line,pattern) for t in tokens: if t.lower() in word_set: str_tmp += t.lower() str_tmp += ' ' doc_matrix.append(str_tmp) doc_cat.append(cat_dic[docs]) text_file.close() str_tmp = '' for sw in word_set: str_tmp += sw str_tmp += ' ' doc_matrix.append(str_tmp) doc_cat.append('NAN') vectorizer = CountVectorizer() doc_num = vectorizer.fit_transform(doc_matrix) tfidf = TfidfTransformer() doc_tfidf = tfidf.fit_transform(doc_num) return doc_tfidf[:-1,:],doc_cat[:-1]
def bayes_tfidf(prefix, sufix, dic_fn):
    """
    prefix example: ./data/single_label_sen/sen_spanish_protest
    sufix example: pop_cat
    """
    train_file = prefix + "_train.txt.tok"
    test_file = prefix + "_test.txt.tok"
    train_y_file = prefix + "_train." + sufix
    test_y_file = prefix + "_test." + sufix

    dic_cn = {k.strip(): i for i, k in enumerate(open(dic_fn))}

    word_train_set = [l.strip().lower() for l in open(train_file)]
    word_test_set = [l.strip().lower() for l in open(test_file)]

    train_y = [dic_cn[l.strip()] for l in open(train_y_file)]
    test_y = [dic_cn[l.strip()] for l in open(test_y_file)]

    # construct the word count matrix
    count_vect = CountVectorizer()
    train_set_count = count_vect.fit_transform(word_train_set)
    test_set_count = count_vect.transform(word_test_set)

    # construct the tfidf matrix
    tfidf_transformer = TfidfTransformer()
    train_set_x = tfidf_transformer.fit_transform(train_set_count)
    test_set_x = tfidf_transformer.transform(test_set_count)

    print "start the model"
    test_score = bayes_experiment([train_set_x, train_y], [test_set_x, test_y])
    return test_score
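# Sketch (an alternative to the two-step flow above, not from the original code):
# TfidfVectorizer bundles CountVectorizer and TfidfTransformer, so the same
# train/test handling becomes a single object fitted on the training split only.
# word_train_set and word_test_set refer to the lists built in the function above.
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
train_set_x = vectorizer.fit_transform(word_train_set)  # learn vocabulary + idf on train
test_set_x = vectorizer.transform(word_test_set)        # reuse them on the test split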
def test_tfidf_no_smoothing():
    X = [[1, 1, 1],
         [1, 1, 0],
         [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=False, norm='l2')
    tfidf = tr.fit_transform(X).toarray()
    assert_true((tfidf >= 0).all())

    # check normalization
    assert_array_almost_equal((tfidf ** 2).sum(axis=1), [1., 1., 1.])

    # the lack of smoothing makes IDF fragile in the presence of a feature with
    # only zeros
    X = [[1, 1, 0],
         [1, 1, 0],
         [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=False, norm='l2')

    clean_warning_registry()
    with warnings.catch_warnings(record=True) as w:
        1. / np.array([0.])
        numpy_provides_div0_warning = len(w) == 1

    in_warning_message = 'divide by zero'
    tfidf = assert_warns_message(RuntimeWarning, in_warning_message,
                                 tr.fit_transform, X).toarray()
    if not numpy_provides_div0_warning:
        raise SkipTest("Numpy does not provide div 0 warnings.")
def get_topic_tfidf(cor_list, topic_num, path_base):
    # CountVectorizer converts the texts into a term-frequency matrix:
    # element a[i][j] is the frequency of word j in document i
    vectorizer = CountVectorizer()
    # TfidfTransformer computes the tf-idf weight of every word
    transformer = TfidfTransformer()

    corpus_split = list()

    for i in range(topic_num):
        corpus_split.append(list())
        for j in topics_list[i]:
            # put each comment into the list of its topic
            corpus_split[i].append(cor_list[j])
        print('length of list', i, ':', len(corpus_split[i]))

        # the outer fit_transform computes tf-idf, the inner one converts
        # the texts into a term-frequency matrix
        tfidf = transformer.fit_transform(
            vectorizer.fit_transform(corpus_split[i]))
        # all words in the bag-of-words model
        word = vectorizer.get_feature_names()
        # extract the tf-idf matrix: weight[i][j] is the tf-idf weight of word j in document i
        weight = tfidf.toarray()

        path = path_base + str(i)
        with open(path, 'w') as f:
            for m in range(len(weight)):
                for n in range(len(word)):
                    # f.write(word[n])
                    # f.write(' ')
                    f.write(str(weight[m][n]))
                    f.write(' ')
                f.write('\n')
def tfidf(fileList):
    segPath = sys.path[0] + '/seg_result'
    corpus = []  # holds the word-segmentation result of every document
    for eachFile in fileList:
        fileName = segPath + '/' + eachFile
        f = open(fileName, 'r+')
        content = f.read()
        corpus.append(content)

    vectorizer = CountVectorizer()    # converts the texts into a term-frequency matrix: a[i][j] is the frequency of word j in document i
    transformer = TfidfTransformer()  # computes the tf-idf weight of every word
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))  # the outer fit_transform computes tf-idf, the inner one builds the term-frequency matrix
    word = vectorizer.get_feature_names()  # all words in the bag-of-words model
    weight = tfidf.toarray()  # extract the tf-idf matrix: a[i][j] is the tf-idf weight of word j in document i

    # create a tfidfFile folder to save the tf-idf results
    tfidfFilePath = os.getcwd() + '/tfidfFile'
    if not os.path.exists(tfidfFilePath):
        os.mkdir(tfidfFilePath)

    for i in range(len(weight)):
        print u"--------Writing all the tf-idf in the", i, u" file into ", tfidfFilePath + '/' + str(i) + '.txt', "--------"
        name = tfidfFilePath + '/' + string.zfill(i, 5) + '.txt'
        f = open(name, 'w+')
        for j in range(len(word)):
            #f.write(word[j] + " " + str(weight[i][j]) + "\n")
            #f.write(str(weight[i][j]) + "\n")
            f.write(word[j] + "\n")
        f.close()
def test_pickling_transformer():
    X = CountVectorizer().fit_transform(JUNK_FOOD_DOCS)
    orig = TfidfTransformer().fit(X)
    s = pickle.dumps(orig)
    copy = pickle.loads(s)
    assert_equal(type(copy), orig.__class__)
    assert_array_equal(
        copy.fit_transform(X).toarray(),
        orig.fit_transform(X).toarray())
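# Sketch (not part of the test above): the same pickle round-trip works for a
# whole fitted Pipeline, which is a convenient way to persist the vectorizer,
# the tf-idf transformer, and a classifier together. Corpus and labels below
# are made up for illustration.
import pickle
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

docs = ["good movie", "bad movie", "great plot", "terrible acting"]
labels = [1, 0, 1, 0]

clf = Pipeline([('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', MultinomialNB())]).fit(docs, labels)

restored = pickle.loads(pickle.dumps(clf))
print(restored.predict(["great movie"]))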
def get_feature_by_bag_tfidf():
    global white_count
    global black_count
    global max_features
    print "max_features=%d" % max_features
    x = []
    y = []

    webshell_files_list = load_files_re(webshell_dir)
    y1 = [1] * len(webshell_files_list)
    black_count = len(webshell_files_list)

    wp_files_list = load_files_re(whitefile_dir)
    y2 = [0] * len(wp_files_list)
    white_count = len(wp_files_list)

    x = webshell_files_list + wp_files_list
    y = y1 + y2

    CV = CountVectorizer(ngram_range=(2, 4), decode_error="ignore",
                         max_features=max_features,
                         token_pattern=r'\b\w+\b', min_df=1, max_df=1.0)
    x = CV.fit_transform(x).toarray()

    transformer = TfidfTransformer(smooth_idf=False)
    x_tfidf = transformer.fit_transform(x)
    x = x_tfidf.toarray()

    return x, y
def tf_idf(seg_files):
    seg_path = './segfile/'
    corpus = []
    for file in seg_files:
        fname = seg_path + file
        f = open(fname, 'r+')
        content = f.read()
        f.close()
        corpus.append(content)

    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))

    word = vectorizer.get_feature_names()
    weight = tfidf.toarray()

    save_path = './tfidffile'
    if not os.path.exists(save_path):
        os.mkdir(save_path)

    for i in range(len(weight)):
        file_name = save_path + '/' + str(i).zfill(5) + '.txt'
        print('--------Writing all the tf-idf in the', i, ' file into ', file_name, '--------')
        f = open(file_name, 'w+')
        for j in range(len(word)):
            f.write(word[j] + ' ' + str(weight[i][j]) + '\r\n')
        f.close()
def load_dataset(prefix, sufix, dic_fn,
                 vocab_fn='./data/english_review.trn-100000.vocab'):
    train_file = prefix + "_train.txt.tok"
    test_file = prefix + "_test.txt.tok"
    train_y_file = prefix + "_train." + sufix
    test_y_file = prefix + "_test." + sufix

    dic_cn = {k.strip(): i for i, k in enumerate(open(dic_fn))}

    word_train_set = [l.strip().lower() for l in open(train_file)]
    word_test_set = [l.strip().lower() for l in open(test_file)]

    train_y = [dic_cn[l.strip()] for l in open(train_y_file)]
    test_y = [dic_cn[l.strip()] for l in open(test_y_file)]

    vocab = [l.strip().lower().split("\t")[0] for l in open(vocab_fn)]

    count_vect = CountVectorizer(vocabulary=vocab)
    train_set_count = count_vect.fit_transform(word_train_set)
    test_set_count = count_vect.transform(word_test_set)

    tfidf_transformer = TfidfTransformer()
    train_set_x = tfidf_transformer.fit_transform(train_set_count).toarray()
    test_set_x = tfidf_transformer.transform(test_set_count).toarray()

    train_shared_x, train_shared_y = shared_dataset([train_set_x, train_y])
    test_shared_x, test_shared_y = shared_dataset([test_set_x, test_y])

    return [(train_shared_x, train_shared_y), (test_shared_x, test_shared_y)]
def getTfidfData(dataTrain, dataTest, dataHold): print dataTrain.target_names count_vect = CountVectorizer(strip_accents='ascii', stop_words='english', max_features=len(dataTrain.target) * 2) tfidf_transformer = TfidfTransformer(sublinear_tf=True) X_counts = count_vect.fit_transform(dataTrain.data) X_tfidf = tfidf_transformer.fit_transform(X_counts) print X_tfidf.shape Y_counts = count_vect.transform(dataTest.data) Y_tfidf = tfidf_transformer.transform(Y_counts) print Y_tfidf.shape H_counts = count_vect.transform(dataHold.data) H_tfidf = tfidf_transformer.transform(H_counts) print 'feature selection using chi square test', len(dataTrain.target) feature_names = count_vect.get_feature_names() ch2 = SelectKBest(chi2, k='all') X_tfidf = ch2.fit_transform(X_tfidf, dataTrain.target) Y_tfidf = ch2.transform(Y_tfidf) H_tfidf = ch2.transform(H_tfidf) if feature_names: # keep selected feature names feature_names = [feature_names[i] for i in ch2.get_support(indices=True)] if feature_names: feature_names = numpy.asarray(feature_names) print 'important features' print feature_names[:10] return X_tfidf, Y_tfidf, H_tfidf
def extract_text_features(train_data, test_data):
    """
    Returns one type of training and test data features.
        1) Term Frequency times Inverse Document Frequency (tf-idf): X_train_tfidf, X_test_tfidf

    Parameters
    ----------
    train_data : List[str]
        Training data in list. Will only take 30000 reviews for efficiency purposes
    test_data : List[str]
        Test data in list

    Returns
    -------
    Tuple(scipy.sparse.csr.csr_matrix,.., list)
        Returns X_train_tfidf, X_test_tfidf, vocab as a tuple.
    """
    # set up a count vectorizer that removes english stopwords when building a term-doc matrix
    count_vect = CountVectorizer(stop_words=set(stopwords.words('english')))

    # build the term frequency per document matrix from a random sublist of 30,000 documents
    train_counts = count_vect.fit_transform(random.sample(train_data, 30000))
    test_counts = count_vect.transform(test_data)

    tfidf_transformer = TfidfTransformer()
    train_tfidf = tfidf_transformer.fit_transform(train_counts)
    test_tfidf = tfidf_transformer.transform(test_counts)

    vocab = count_vect.get_feature_names()

    return (train_tfidf, test_tfidf, vocab)
class VectorModel(object): def __init__(self , list_of_comments=None): self.__list_of_comments = list_of_comments self.__vectorizer = [] self.__corpus_simple_vector = [] self.__transformer = [] self.__corpus_tf_idf = [] #self.prepare_models() def prepare_models(self): self.__vectorizer = CountVectorizer() vector = self.__vectorizer.fit_transform(self.__list_of_comments) self.__corpus_simple_vector = vector.toarray() self.__transformer = TfidfTransformer() tfidf = self.__transformer.fit_transform(self.__corpus_simple_vector) self.__corpus_tf_idf = tfidf.toarray() return [self.__vectorizer , self.__corpus_simple_vector , self.__transformer , self.__corpus_tf_idf] def set_models(self , vectorizer , transformer): self.__vectorizer = vectorizer self.__transformer = transformer def get_comment_frequency_vector(self , comments): vec_comments = [] for i in comments: vec_comments.append(i) vectores = self.__vectorizer.transform(vec_comments).toarray() return vectores def get_comment_tf_idf_vector(self , comments): vector = self.get_comment_frequency_vector(comments) result = self.__transformer.transform(vector).toarray() return result
def tfidf(corpus, word_category, file_to_write): vectorizer = CountVectorizer() transformer = TfidfTransformer() tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus)) weight = tfidf.toarray() sum_weight = np.sum(weight, axis=0) word = vectorizer.get_feature_names() word_and_weight = [] for i in range(len(sum_weight)): word_and_weight.append([word[i], sum_weight[i]]) word_and_weight.sort(key=lambda key: key[1], reverse=True) f = open(file_to_write, "w+") result = [] for j in range(len(word_and_weight)): try: f.write( word_and_weight[j][0] + " " + str(word_and_weight[j][1]) + " " + word_category[word_and_weight[j][0]] + "\n" ) result.append([word_and_weight[j][0], word_and_weight[j][1], word_category[word_and_weight[j][0]]]) except: continue f.close() return result
def LR_modeling(file_name, k, AUC=True, weight=False): raw_data = pd.read_csv(file_name) raw_data = raw_data.drop(['issue', 'field'], axis=1) X = raw_data.drop('panelvote', axis=1) y = raw_data['panelvote'] tfidf = TfidfTransformer(norm=u'l2', use_idf=True, smooth_idf=True, sublinear_tf=False) X = tfidf.fit_transform(X.values) X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.8, random_state=42) lr = LogisticRegression(C=1) lr.fit(X_train, y_train) auc = np.mean(cross_validation.cross_val_score(lr, X, y, scoring="roc_auc")) if AUC == True: print "AUC for %s on the test data = %.3f" % (file_name, auc) if weight == False: top_positive, top_negative = get_top_k_nocoeff(lr.coef_[0], k) return raw_data.columns[top_positive], raw_data.columns[top_negative] else: top_positive, top_negative = get_top_k(lr.coef_[0], k) final_pos = {} final_neg = {} for i in top_positive.keys(): final_pos[raw_data.columns[i]] = top_positive[i] for j in top_negative.keys(): final_neg[raw_data.columns[j]] = top_negative[j] pos = sorted(final_pos.items(), key=operator.itemgetter(1), reverse=True) neg = sorted(final_neg.items(), key=operator.itemgetter(1)) return pos, neg
def text_sentiment(docs_new):
    docs_new = [docs_new]
    twenty_train = load_files('./Sentiment')  # the complete data is in this directory; like comp.graphics etc.
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(twenty_train.data)

    tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
    X_train_tf = tf_transformer.transform(X_train_counts)

    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    # Fit a classifier on the training set
    #clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)
    #f = open('my_classifier.pickle', 'wb')
    #pickle.dump(clf, f)
    #f = open('my_classifier.pickle',)
    #clf = pickle.load(f)
    #f.close()

    # save the classifier
    #with open('my_sentiment.pkl', 'wb') as fid:
    #    cPickle.dump(clf, fid)

    # load it again
    with open('my_sentiment.pkl', 'rb') as fid:
        clf = cPickle.load(fid)

    X_new_counts = count_vect.transform(docs_new)
    X_new_tfidf = tfidf_transformer.transform(X_new_counts)

    predicted = clf.predict(X_new_tfidf)

    return twenty_train.target_names[predicted]
class TfIdfMixin(RawBOWMixin): def build(self): if not hasattr(self, "_tfidf_transformer"): self._tfidf_transformer = None if self._tfidf_transformer is None: self._tfidf_transformer = TfidfTransformer() #input=u"content", preprocessor=lambda x: x, #tokenizer=lambda x: x) def process(self, input_df, ndarray_data): X = self._tfidf_transformer.fit_transform( ndarray_data["RawBOWMatrix"]) ndarray_data["TfIdfMatrix"] = X return input_df, ndarray_data def requires(self): return [] def returns(self): return [] def ndarray_requires(self): return ["RawBOWMatrix",] def ndarray_returns(self): return ["TfIdfMatrix"] def name(self): return "TfIdfMixin"
def get_tf_idf(x_array):
    print('start get tf-idf array...')
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(x_array)
    tfidf_array = tfidf.toarray()
    print('ok...\n')
    return tfidf_array
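# Sketch (not from the snippet above): fit_transform returns a scipy sparse
# matrix, and converting it with toarray() can exhaust memory for large
# vocabularies. Most scikit-learn estimators accept the sparse matrix directly,
# so the dense conversion is often unnecessary.
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

docs = ["sparse matrices save memory", "dense arrays can be huge"]
counts = CountVectorizer().fit_transform(docs)
tfidf = TfidfTransformer().fit_transform(counts)
print(type(tfidf), tfidf.shape, tfidf.nnz)  # csr_matrix, its shape, number of stored values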
def classify(self): # 1. load Corpus from files #(corpus, labels) = self.loadFile("dataless/20NG/train_rho0.2_epsilon0.3_window_default") (corpus, labels) = self.loadFile("dataless/20NG/20ng-train-no-stop.txt") print set(labels) m = self.loadSenna("../senna/embeddings/embeddings.txt","../senna/hash/words.lst") #dict{str: np.array()} #m = Word2Vec.load_word2vec_format("vectors/whole/new_c_e_train_neg10size400min_count1", binary=True) #words = set(m.index2word) #words = set(m.keys()) #print corpus #print labels # 2. Encode Feature Matrix cv = CountVectorizer(min_df=1) X = cv.fit_transform(corpus) # Frequency #print "Frequency:",X #print cv.get_feature_names() transformer = TfidfTransformer() X = transformer.fit_transform(X) # TF-IDF weighted entities #print "Tf-idf:",X # 3. calculate final vectors to predict labels # print X[0]for x in X[0]: pre_vectors = self.pre_vectors(X, cv ,m) # 3. Encode label vector le = preprocessing.LabelEncoder() Y = le.fit_transform(labels)
def estimation(file='song_text.txt', separator=u'--text--'): arr = text_split_line(file, u'--text--') dvect = data_vector(arr) target = dvect[0] text = dvect[1] dic = dvect[2] # for converting target integer to artist name # print (target) # print (dic) count_vect = CountVectorizer() word_vect = count_vect.fit_transform(text) tfidf_transformer = TfidfTransformer() vect_tfidf = tfidf_transformer.fit_transform(word_vect) machine = svm.SVC(probability=True) # one of the best for text, see tutorial working with text machine.fit(vect_tfidf, target) print (machine.score(vect_tfidf, target)) prediction = machine.predict(vect_tfidf) # accuracy test (tutorial) print (u'model predictive accuracy: {:.1%}' .format(np.mean(prediction == target))) new_texts = [text[500], text[2345], text[-2], text[0], text[5893]] new_data = count_vect.transform(new_texts) new_tfidf = tfidf_transformer.transform(new_data) prediction = machine.predict(new_tfidf) for i in range(len(new_texts)): print (u'{}\t=> {}'.format(new_texts[i].splitlines()[:2], dic[prediction[i]])) return
def getContextFeature(self): import time print 'start to get Context Feature' start = time.time() from sklearn.feature_extraction.text import TfidfTransformer from sklearn.feature_extraction.text import CountVectorizer #when we meet the large corpus, need to input an iteration! corpus = self.getIterText() #transfer the text into word frequency matrix vectorizer = CountVectorizer() transformer = TfidfTransformer() tfidf=transformer.fit_transform(vectorizer.fit_transform(corpus)) print 'get word' word=vectorizer.get_feature_names() print 'get weight' weight=tfidf print 'weight type:', type(weight) #print weight end = time.time() print 'total time: \t', end-start return weight,word
def tf_idf(self, **kwargs):
    """Perform tf-idf transformation."""
    tfid = TfidfTransformer(**kwargs)
    tfidf_matrix = tfid.fit_transform(self.matrix)
    return Space(tfidf_matrix, self.row_labels, self.column_labels)
file_name = "segment.p" df = pickle.load(open(file_name, "rb")) df['new_words'] = df["words"].map(lambda x: ("".join(re.findall(ur'[\u4e00-\u9fff]+', x))).strip()) df['token_words'] = df['new_words'].map(lambda x: " ".join(jieba.lcut(x, cut_all=False))) df = df.reindex(np.random.permutation(df.index)) corpus = [] label = [] org_word = [] for i in range(len(df)): corpus = corpus + [df['token_words'][i] for x in range(df['num_term'][i]+1)] label = label + [df['label'][i] for x in range(df['num_term'][i]+1)] org_word = org_word + [df['new_words'][i] for x in range(df['num_term'][i]+1)] vectorizer=CountVectorizer() transformer=TfidfTransformer() tfidf=transformer.fit_transform(vectorizer.fit_transform(corpus)) word=vectorizer.get_feature_names() weight=tfidf.toarray() label = np.asarray(label) idf = transformer.idf_ # non_word_value = 9.2564775671942705 clf = svm.SVC(gamma=0.001, C=100.) clf.fit(weight, label) # final_results = clf.predict(weight) test_files=os.listdir(test_dir) alldict=[] for j1,i1 in enumerate(test_files):
print(" . Most correlated unigrams:\n . {}".format( '\n . '.join(unigrams[-N:]))) print(" . Most correlated bigrams:\n . {}".format( '\n . '.join(bigrams[-N:]))) # splitting to training and test set X_train, X_test, y_train, y_test = train_test_split( dataset['consumer_complaint_narrative'], dataset['product'], random_state=0) # fitting the countvectorizer and tfidf transformer to the x_train dataset count_vect = CountVectorizer() X_train_count_vect = count_vect.fit_transform(X_train) tfidf_transformer = TfidfTransformer() X_train_tfidf = tfidf_transformer.fit_transform(X_train_count_vect) #Fitting the dataset to the Naive bayes classifier classifier = MultinomialNB().fit(X_train_tfidf, y_train) # sample predictions classifier.predict( count_vect.transform([ 'This company refuses to provide me verification and validation of debt per my right under the FDCPA. I do not believe this debt is mine.' ])) classifier.predict( count_vect.transform([ "I am disputing the inaccurate information the Chex-Systems has on my credit report. I initially submitted a police report on XXXX/XXXX/16 and Chex Systems only deleted the items that I mentioned in the letter and not all the items that were actually listed on the police report. In other words they wanted me to say word for word to them what items were fraudulent. The total disregard of the police report and what accounts that it states that are fraudulent. If they just had paid a little closer attention to the police report I would not been in this position now and they would n't have to research once again. I would like the reported information to be removed : XXXX XXXX XXXX" ])) dataset[
infile = open(wordEmbeddingModel,'rb') results = pickle.load(infile) #infile = open('../wordEmbeddings/wikiVectors','rb') print('is creating feature matrix...') proto_matrix = append_features(results) fea = np.matrix(proto_matrix) fea = np.nan_to_num(fea) y = getLabel(results) y = np.array(y) print('tifidf word vectors') tfidf_transformer = TfidfTransformer() X_vec = tfidf_transformer.fit_transform(fea).toarray() #reduce dimension reducer = TruncatedSVD(n_components=5, n_iter=7, random_state=42) reducer.fit(X_vec) X_vec = reducer.transform(X_vec) print('load LIWC data...') text_liwc = pd.read_csv('../data/LIWC_self_label_valence.csv') liwc = text_liwc.loc[:,'function':'OtherP'].values ####combine with liwc X = np.concatenate((X_vec, liwc), axis=1) #Normalize data, convert it to unit vectors
def IncreasingFIT1(): global recordAccuracy recordAccuracy = [] vectorizer = CountVectorizer(stop_words=stopwordslist) transformer = TfidfTransformer() global total_vect_time, parsing_time, vectorizing_time, oldVocubularysave, newVocubularysave # for T in range(TrainDataSize): global T for T in range(updatesize): tick = time.time() # X_train = vectorizer.transform(xtrain[i]) count = vectorizer.fit_transform(xtrain[T]) X_train = transformer.fit_transform(count) # ---------------------------------------- VocubularyList = vectorizer.get_feature_names() # vectorizer = CountVectorizer(stop_words=None,vocabulary=VocubularyList) # tfidf = X_train.toarray().T print(X_train.shape) model1 = SelectKBest(chi2, k=1) X_chi2 = model1.fit_transform(X_train, ytrain[T]) print(X_chi2.shape) print(model1.scores_.shape) j = 0 for i in VocubularyList: # print(i,",",vectorizer2.vocabulary_[i],",",max(tfidf[vectorizer2.vocabulary_[i]])) newVocubularysave.append({ "name": i, 'numb': vectorizer.vocabulary_[i], 'value': model1.scores_[j] }) j = j + 1 print("get newVocubularysave!") # newVocubularysave=oldVocubularysave newVocubularysave = oldVocubularysave + newVocubularysave newVocubularysave = sortbyword(newVocubularysave, FeatureSpaceSize, 'value') # print(newVocubularysave) l = [] for numV in newVocubularysave: l.append(numV['name']) # print(l) print("========================================================") print("========================================================") oldVocubularysave = newVocubularysave # /----------------------------------------- total_vect_time += time.time() - tick # 测试集的处理----------------- tick = time.time() parsing_time = time.time() - tick tick = time.time() vectorizing_time = time.time() - tick test_stats['n_test'] += len(ytest) test_stats['n_test_pos'] += sum(ytest) # end 数据集文本向量化 (哈希技巧) ------------------------------------------------------- print(len(newVocubularysave)) joblib.dump(newVocubularysave, "VocubularySave.v") print('开始增量训练...') IncreasingFIT() # IncreasingFIT() print('已完成...')
def IncreasingFIT(): global total_vect_time classifiers = { 'SGD': SGDClassifier(), 'Perceptron': Perceptron(), 'NB Multinomial': MultinomialNB(alpha=0.01), 'Passive-Aggressive': PassiveAggressiveClassifier(), } Vocubularysave = [] if os.path.exists("VocubularySave.v"): Vocubularysave = joblib.load("VocubularySave.v") VocubularyList = [] for numV in Vocubularysave: VocubularyList.append(numV['name']) vectorizer = CountVectorizer(stop_words=None, vocabulary=VocubularyList) transformer = TfidfTransformer() count = vectorizer.fit_transform(xtest) X_test = transformer.fit_transform(count) for i in range(TrainDataSize): tick = time.time() # X_train = vectorizer.transform(xtrain[i]) count = vectorizer.fit_transform(xtrain[i]) X_train = transformer.fit_transform(count) total_vect_time += time.time() - tick for cls_name, cls_useless in partial_fit_classifiers.items(): cls = classifiers[cls_name] tick = time.time() # update estimator with examples in the current mini-batch # 使用当前最小批次中的示例更新估算器 # print(X_train) cls.partial_fit(X_train, ytrain[i], classes=all_classes) # if i % printjumpsize == 0: if i == (TrainDataSize - 1): # accumulate test accuracy stats # 累积测试准确度统计 cls_stats[cls_name]['total_fit_time'] += time.time() - tick cls_stats[cls_name]['n_train'] += X_train.shape[0] cls_stats[cls_name]['n_train_pos'] += sum(ytrain[i]) tick = time.time() # 测试准确性函数 cls_stats[cls_name]['accuracy'] = cls.score(X_test, ytest) cls_stats[cls_name]['prediction_time'] = time.time() - tick acc_history = (cls_stats[cls_name]['accuracy'], cls_stats[cls_name]['n_train']) cls_stats[cls_name]['accuracy_history'].append(acc_history) run_history = (cls_stats[cls_name]['accuracy'], total_vect_time + cls_stats[cls_name]['total_fit_time']) cls_stats[cls_name]['runtime_history'].append(run_history) # accumulate test accuracy stats # 累积测试准确度统计 if T == 0: print(progress(cls_name, cls_stats[cls_name])) if T != 0: AccuracyAverage[cls_name]['total_fit_time'] += time.time( ) - tick AccuracyAverage[cls_name]['n_train'] += X_train.shape[0] AccuracyAverage[cls_name]['n_train_pos'] += sum(ytrain[i]) tick = time.time() # 测试准确性函数 AccuracyAverage[cls_name]['accuracy'] += cls.score( X_test, ytest) RecordOneAccuracy[cls_name]['accuracy'] += cls.score( X_test, ytest) acc_history = (AccuracyAverage[cls_name]['accuracy'], AccuracyAverage[cls_name]['n_train']) AccuracyAverage[cls_name]['accuracy_history'].append( acc_history) run_history += ( AccuracyAverage[cls_name]['accuracy'], total_vect_time + AccuracyAverage[cls_name]['total_fit_time']) AccuracyAverage[cls_name]['runtime_history'].append( run_history) recordAccuracy.append(RecordOneAccuracy) print(progress2(cls_name, AccuracyAverage[cls_name], T))
print(y[:250]) X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.02,random_state=0) print(99999999999999999999) vectorizer=CountVectorizer(ngram_range=(1,2)) training_features=vectorizer.fit_transform(X_train) print(8888888888888888888888) np.asarray(training_features) #now to create the tfidf thing tfidf_vec=TfidfTransformer() X_train_tfidfvec=tfidf_vec.fit_transform(training_features) print(77777777777777777777777777) #classifier=svm.SVC(probability=True) #classifier = KNeighborsClassifier(n_neighbors=5) classifier= MultinomialNB() print(666666666666666666666666) classifier.fit(X_train_tfidfvec,y_train) print(5555555555555555) testing_features=vectorizer.transform(X_test)
def main(_): np.random.seed(3) #固定seed让每次的random都一样 TIME_STEPS = FLAGS.N IMPUT_SIZE = 625 BATCH_SIZE = 30 BATCH_INDEX = 0 OUTPUT_SIZE = 2 CELL_SIZE = 175 LR = 0.001 totalData = [] totalDataLabel = [] counter = 0 totalDoc = 0 totalpost = 0 tdlist1 = 0 Pos = 0 Neg = 0 maxpost = 0 minpost = 62827 thulac_pip = thulac.thulac(seg_only=True) #只进行分词,不进行词性标注 EventList = GetEventList() print("Generating BlackList with N = ", TIME_STEPS, " ...") for event in EventList: totalDoc += 1 Eid = event["eid"] Label = event["label"] # print("Eid : ", Eid, "Label: ", Label) WeiboPostIdList = event["posts"] if len(WeiboPostIdList) == 1: tdlist1 += 1 continue if len(WeiboPostIdList) >= maxpost: maxpost = len(WeiboPostIdList) if len(WeiboPostIdList) <= minpost: minpost = len(WeiboPostIdList) event_file_path = os.path.join(Weibo_Json_Dir, Eid + ".json") event_file = open(event_file_path, "r") event_json = json.load(event_file) WeiboPostList = [] index = 0 for WeiboPostId in WeiboPostIdList: totalpost += 1 WeiboJson = event_json[index] index += 1 WeiboText = WeiboJson["text"] Time = WeiboJson["t"] WeiboPost = {"text" : WeiboText, "time" : Time} WeiboPostList.append(WeiboPost) if Label == "0": Pos += 1 else: Neg += 1 #Sort by time WeiboPostList = sorted(WeiboPostList, key=lambda k: k['time']) #find Time Invertal of weibo TotalTimeLine = WeiboPostList[-1]['time']-WeiboPostList[0]['time'] IntervalTime = TotalTimeLine/TIME_STEPS k = 0 PreConInt = [] while True: k += 1 WeiboIndex = 0 output = [] if TotalTimeLine == 0: for weibo in WeiboPostList: weibo_text = thulac_pip.cut(weibo["text"], text=True) output.append(weibo_text) break Start = WeiboPostList[0]['time'] Interval = int(TotalTimeLine/IntervalTime) Intset = [] for inter in range(0,Interval): empty = 0 interval = [] for q in range(WeiboIndex,len(WeiboPostList)): if WeiboPostList[q]['time'] >= Start and WeiboPostList[q]['time'] < Start+IntervalTime: empty += 1 weibo_text = thulac_pip.cut(WeiboPostList[q]["text"], text=True) interval.append(weibo_text) #记录超出interval的weibo位置,下次可直接从此开始 elif WeiboPostList[q]['time'] >= Start+IntervalTime: WeiboIndex = q-1 break # empty interval if empty == 0: output.append([]) else: #add the last weibo if WeiboPostList[-1]['time'] == Start+IntervalTime: weibo_text = thulac_pip.cut(WeiboPostList[-1]["text"], text=True) interval.append(weibo_text) Intset.append(inter) output.append(interval) Start = Start+IntervalTime ConInt = ContinuousInterval(Intset) if len(ConInt)<TIME_STEPS and len(ConInt) > len(PreConInt): IntervalTime = int(IntervalTime*0.5) PreConInt = ConInt if IntervalTime == 0: output = output[ConInt[0]:ConInt[-1]+1] break else: # print(len(ConInt)) output = output[ConInt[0]:ConInt[-1]+1] break counter+=1 event_file.close() # print (counter) # 把Interval的所有字都串在一起 for q in range(0,len(output)): output[q] = ''.join(s for s in output[q]) try: #Caculate Tfidf vectorizer = CountVectorizer() transformer = TfidfTransformer() #print(output) tf = vectorizer.fit_transform(output) tfidf = transformer.fit_transform(tf) # Debug # print(tfidf.toarray()) Allvocabulary = vectorizer.get_feature_names() except ValueError: BlackList.append(Eid) continue # print(vectorizer.get_feature_names()) Input = [] for interval in tfidf.toarray(): interval = sorted(interval,reverse=True) while len(interval) < IMPUT_SIZE: interval.append(0.0) Input.append(interval[:IMPUT_SIZE]) if len(Input) < TIME_STEPS: for q in range(0,TIME_STEPS-len(Input)): Input.insert(0,[0.0] * IMPUT_SIZE) totalData.append(Input[:TIME_STEPS]) totalDataLabel.append(Label) file_name = 
CX_WORD_DIR + "/BlackLists/BlackList" + str(FLAGS.N) + ".txt" f = open(file_name,'w') f.write(str(BlackList)) f.close() print("Generating BlackList with N = ", TIME_STEPS, " done.")
def tfidfTransform(matrix):
    tfidf_transformer = TfidfTransformer()
    tfidf_matrix = tfidf_transformer.fit_transform(matrix)
    return tfidf_matrix
twenty_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True)
print(len(twenty_train.data))
print(len(twenty_test.data))
print(twenty_train.target_names)
print("\n".join(twenty_train.data[0].split("\n")))
print(twenty_train.target[0])

from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_tf = count_vect.fit_transform(twenty_train.data)

from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_tf)
X_train_tfidf.shape

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

mod = MultinomialNB()
mod.fit(X_train_tfidf, twenty_train.target)

X_test_tf = count_vect.transform(twenty_test.data)
X_test_tfidf = tfidf_transformer.transform(X_test_tf)

predicted = mod.predict(X_test_tfidf)

print("Accuracy:", accuracy_score(twenty_test.target, predicted))
print(classification_report(twenty_test.target, predicted,
                            target_names=twenty_test.target_names))
print("confusion matrix is \n", confusion_matrix(twenty_test.target, predicted))
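# Sketch (an optional follow-up, not part of the script above): the same
# vectorize -> tf-idf -> MultinomialNB chain expressed as a Pipeline, with a
# small grid search over tf-idf and n-gram settings. The parameter values are
# illustrative; twenty_train refers to the dataset loaded in the script above.
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())])

param_grid = {'vect__ngram_range': [(1, 1), (1, 2)],
              'tfidf__use_idf': [True, False],
              'clf__alpha': [1.0, 0.01]}

search = GridSearchCV(text_clf, param_grid, cv=3)
search.fit(twenty_train.data, twenty_train.target)
print(search.best_params_, search.best_score_)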
def cluster_rows(sliced_data, n_clusters=2, cluster_method='PDN', n_iters=100, n_restarts=3, cluster_prep_method=None, cluster_penalty=1.0, rand_gen=None, sklearn_args=None): """ A wrapper to abstract from the implemented clustering method cluster_method = GMM | DPGMM | HOEM """ clustering = None # # slicing the data # allIndexes = numpy.arange(0, sliced_data.shape[0]) # zerorowsIdx = allIndexes[numpy.sum(sliced_data, 1) == 0] # datarowsIdx = allIndexes[numpy.sum(sliced_data, 1) > 0] # clustering_data = numpy.delete(sliced_data, zerorowsIdx, 0) clustering_data = sliced_data if cluster_prep_method == "tf-idf": tfidf_transformer = TfidfTransformer() clustering_data = tfidf_transformer.fit_transform(clustering_data) elif cluster_prep_method == "log+1": clustering_data = numpy.log(clustering_data + 1) elif cluster_prep_method == "sqrt": clustering_data = numpy.sqrt(clustering_data) # clustering_data = clustering_data # row_sums = clustering_data.sum(axis=1) + 0.001 # clustering_data = clustering_data / row_sums[:, numpy.newaxis] # clustering_data = numpy.sqrt(clustering_data) # sliced_data_sum = numpy.sum(sliced_data, axis=1) # sliced_data = sliced_data / sliced_data_sum[:, numpy.newaxis] # sliced_data = numpy.sqrt(sliced_data) print("RUNNING CLUSTERING dims: " + str(sliced_data.shape) + " into: " + str(n_clusters) + " method: " + cluster_method + " pre: " + str(cluster_prep_method)) #if sliced_data.shape[1] == 1: # print("V" + str(data_slice.feature_ids)) start_t = perf_counter() if cluster_method == 'PDN': assert cluster_prep_method == None clustering = ABPDN.pdnClustering(clustering_data, nM=n_clusters, maxIters=n_iters, max_depth=5) elif cluster_method == 'GMM': clustering_data = numpy.log(clustering_data + 1) # # retrieving other properties cov_type = sklearn_args['covariance_type'] \ if 'covariance_type' in sklearn_args else 'diag' # # creating the cluster from sklearn gmm_c = sklearn.mixture.GMM(n_components=n_clusters, covariance_type=cov_type, random_state=rand_gen, n_iter=n_iters, n_init=n_restarts) # # fitting to training set try: gmm_c.fit(clustering_data) except Exception: pass # # getting the cluster assignment clustering = gmm_c.predict(clustering_data) elif cluster_method == "KMeans": clustering = KMeans(n_clusters=n_clusters, random_state=rand_gen, n_jobs=1).fit_predict(clustering_data) elif cluster_method == "RandomPartition": clustering = above(make_planes(1, clustering_data.shape[1]), clustering_data)[:, 0] elif cluster_method == 'DPGMM': # # retrieving other properties cov_type = sklearn_args['covariance_type'] \ if 'covariance_type' in sklearn_args else 'diag' verbose = sklearn_args['verbose']\ if 'verbose' in sklearn_args else False dpgmm_c = sklearn.mixture.DPGMM(n_components=n_clusters, covariance_type=cov_type, random_state=rand_gen, n_iter=n_iters, alpha=cluster_penalty, verbose=verbose) # # fitting to training set dpgmm_c.fit(clustering_data) # # getting the cluster assignment clustering = dpgmm_c.predict(clustering_data) elif cluster_method == 'HOEM': raise NotImplementedError('Hard Online EM is not implemented yet') else: raise Exception('Clustering method not valid') end_t = perf_counter() print('Clustering done in %f secs' % (end_t - start_t)) # nI = sliced_data.shape[0] # uniqueNi = len(set([tuple(x) for x in clustering_data])) # print(nI, uniqueNi, sum(clustering)) # guarantee that we have a partition #if sum(clustering) == 0: #split evenly in n clusters # clustering = numpy.asarray((list(range(n_clusters))*math.ceil(nI/n_clusters))[0:nI]) # print(sliced_data) 
print(list(map(lambda c: numpy.sum(clustering == c), range(n_clusters)))) # clusteringComplete = numpy.zeros(data_slice.instance_ids.shape) # clusteringComplete[zerorowsIdx] = n_clusters # clusteringComplete[datarowsIdx] = clustering # return retrieve_clustering(clustering, data_slice.instance_ids[datarowsIdx]) return clustering
def _tfidf(table, input_col, max_df=None, min_df=1, num_voca=1000, idf_weighting_scheme='inverseDocumentFrequency', norm='l2', smooth_idf=True, sublinear_tf=False, output_type=False): corpus = np.array(table[input_col]) if max_df == None: max_df = len(corpus) tf_vectorizer = CountVectorizer(stop_words='english', max_df=max_df, min_df=min_df, max_features=num_voca) tf_vectorizer.fit(corpus) csr_matrix_tf = tf_vectorizer.transform(corpus) tfidf_vectorizer = TfidfTransformer(norm=norm, use_idf=True, smooth_idf=smooth_idf, sublinear_tf=sublinear_tf) csr_matrix_tfidf = tfidf_vectorizer.fit_transform(csr_matrix_tf) voca_dict = sorted(tf_vectorizer.vocabulary_.items(), key=itemgetter(1)) len_voca = len(voca_dict) # tf-idf table tfidf_table = pd.DataFrame() document_list = [] docID_list = [] if output_type == False: vocabulary_list = [] label_table = pd.DataFrame() for doc in range(len(corpus)): docID_list += ['doc_{}'.format(doc) for _ in range(len_voca)] document_list += [str(corpus[doc]) for _ in range(len_voca)] vocabulary_list += [voca_dict[j][0] for j in range(len_voca)] label_table['document_id'] = docID_list label_table[input_col] = document_list label_table['vocabulary'] = vocabulary_list tfidf_table = label_table tfidf_table['frequency'] = np.ravel(csr_matrix_tf.todense()) if idf_weighting_scheme == 'inverseDocumentFrequency': tfidf_table['tfidf score'] = np.ravel(csr_matrix_tfidf.todense()) elif idf_weighting_scheme == 'unary': tfidf_table['tfidf score'] = list(map(float, np.array(tfidf_table['frequency']))) elif output_type == True: for doc in range(len(corpus)): docID_list += ['doc_{}'.format(doc) for _ in range(csr_matrix_tfidf.indptr[doc + 1] - csr_matrix_tfidf.indptr[doc])] document_list += [str(corpus[doc]) for _ in range(csr_matrix_tfidf.indptr[doc + 1] - csr_matrix_tfidf.indptr[doc])] tfidf_table['document_id'] = docID_list tfidf_table[input_col] = document_list tfidf_table['vocabulary'] = [voca_dict[i][0] for i in csr_matrix_tf.indices] tfidf_table['frequency'] = csr_matrix_tf.data data_list = [] for doc in range(len(corpus)): data_list += [csr_matrix_tfidf.data[i] for i in range(csr_matrix_tfidf.indptr[doc + 1] - csr_matrix_tfidf.indptr[doc])][::-1] if idf_weighting_scheme == 'inverseDocumentFrequency': tfidf_table['tfidf score'] = data_list elif idf_weighting_scheme == 'unary': tfidf_table['tfidf score'] = list(map(float, np.array(tfidf_table['frequency']))) else: raise_runtime_error("Please check 'output_type'.") # idf table idf_table = pd.DataFrame() idf_table['vocabulary'] = [voca_dict[j][0] for j in range(len(voca_dict))] if idf_weighting_scheme == 'inverseDocumentFrequency': idf_table['idf weight'] = tfidf_vectorizer.idf_.tolist() elif idf_weighting_scheme == 'unary': idf_table['idf weight'] = float(1) params = { 'Input Column': input_col, 'Max DF': max_df, 'Min DF': min_df, 'Number of Vocabularies': num_voca, 'IDF Weighting Scheme': idf_weighting_scheme, 'Norm': norm, 'Smooth IDF': smooth_idf, 'Sublinear TF': sublinear_tf, 'Remove Zero Counts': output_type } rb = BrtcReprBuilder() rb.addMD(strip_margin("""# TF-IDF Result""")) rb.addMD(strip_margin(""" | |### Parameters | |{display_params} | |### IDF Table | |{idf_table} | |### TFIDF Table | |{tfidf_table} | """.format(display_params=dict2MD(params), idf_table=pandasDF2MD(idf_table, num_rows=200), tfidf_table=pandasDF2MD(tfidf_table, num_rows=200)))) model = _model_dict('tfidf') model['csr_matrix_tf'] = csr_matrix_tf model['csr_matrix_tfidf'] = csr_matrix_tfidf model['parameter'] = params model['idf_table'] = idf_table 
model['tfidf_table'] = tfidf_table model['_repr_brtc_'] = rb.get() return {'model' : model}
feature_names = np.array(tfidf.get_feature_names())[indices] unigrams = [v for v in feature_names if len(v.split(' ')) == 1] bigrams = [v for v in feature_names if len(v.split(' ')) == 2] print("# '{}':".format(Variety)) print(" . Most correlated unigrams:\n. {}".format('\n. '.join( unigrams[-N:]))) print(" . Most correlated bigrams:\n. {}".format('\n. '.join( bigrams[-N:]))) X_train, X_test, y_train, y_test = train_test_split(df['Description'], df['Variety'], random_state=0) count_vect = CountVectorizer() X_train_counts = count_vect.fit_transform(X_train) tfidf_transformer = TfidfTransformer() X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts) clf = MultinomialNB().fit(X_train_tfidf, y_train) print(clf.predict(count_vect.transform(["Tannins and acidity"]))) #we get pinot noir as the prediction print( clf.predict( count_vect.transform(["A rich blend of blackberry, strong flavors"]))) #we get red blend X_train, X_test, y_train, y_test = train_test_split(df['Variety'], df['Description'], random_state=0) count_vect = CountVectorizer()
# from_data.append(1) from_data.append(0 if name == "sara" else 1) email.close() print "emails processed" from_sara.close() from_chris.close() pickle.dump(word_data, open("your_word_data.pkl", "w")) pickle.dump(from_data, open("your_email_authors.pkl", "w")) # The string that you get for word_data[152] word_data[152] ### in Part 4, do TfIdf vectorization here from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer, TfidfVectorizer vectorizer = TfidfVectorizer(stop_words="english") X = vectorizer.fit_transform(word_data) transformer = TfidfTransformer() tfidf = transformer.fit_transform(X) vector = vectorizer.get_feature_names() # How many unique words are there in your Tfldf? print len(vector) # What is word number 34597 in your TfIdf? vector[34597]
def remove_stopwords(text, stopwords):
    tokens = text.split()
    filtered_tokens = [word for word in tokens if word not in stopwords]
    return " ".join(filtered_tokens)


if __name__ == "__main__":
    corpus = [
        "我 来到 北京 清华大学",
        "他 来到 了 网易 杭研 大厦",
        "小明 硕士 毕业 与 中国 科学院",
        "我 爱 北京 天安门"
    ]
    # CountVectorizer converts the texts into a term-frequency matrix:
    # element a[i][j] is the frequency of word j in document i
    vectorizer = CountVectorizer()
    # TfidfTransformer computes the tf-idf weight of every word
    transformer = TfidfTransformer()
    # the outer fit_transform computes tf-idf, the inner one converts
    # the texts into a term-frequency matrix
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    word = vectorizer.get_feature_names()  # all words in the bag-of-words model
    print(word)
    print(len(word))
    weight = tfidf.toarray()  # the tf-idf matrix: a[i][j] is the tf-idf weight of word j in document i
    print(weight.shape)
    # print the tf-idf weights per document: the outer loop walks the documents,
    # the inner loop walks the words of each document
    for i in range(len(weight)):
        print("------- tf-idf weights of the words in document", i, "-------")
        for j in range(len(word)):
            print(word[j], weight[i][j])

    vectorizer = TfidfVectorizer(encoding="utf8")
    X = vectorizer.fit_transform(corpus)
    idf = vectorizer.idf_
    print(idf)
"Football": 2, "Film": 3, "Technology": 4 }) Y = df["Category"] count_vect = CountVectorizer(stop_words=sw) vectorizer = TfidfTransformer() # In[7]: test = pd.read_csv('test_set.csv', sep='\t') testX = test["Title"] + test["Content"] X_train_counts = count_vect.fit_transform(X) X_train_counts = vectorizer.fit_transform(X_train_counts) svd = TruncatedSVD(n_components=200) X_lsi = svd.fit_transform(X_train_counts) parameters = {'C': [1, 10]} svr = svm.LinearSVC() clf = GridSearchCV(svr, parameters) clf = clf.fit(X_lsi, Y) X_test_counts = count_vect.transform(testX) X_test_counts = vectorizer.transform(X_test_counts) X_test_counts = svd.transform(X_test_counts) predicted = clf.predict(X_test_counts) output = np.zeros((len(predicted), 2), dtype=object)
    news_train_data.append(x[2])

news_train_data_target = []
for x in news_train:
    news_train_data_target.append(x[0])

# build a dictionary of characteristic features and convert the documents into feature vectors
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(news_train_data)

# compute term frequency and inverse document frequency
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)  # fit the estimator on the data
X_train_tf = tf_transformer.transform(X_train_counts)  # convert the matrix to normalized tf form

tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)  # or do both steps at once, which is faster

# train the classifier
clf = MultinomialNB().fit(X_train_tfidf, news_train_data_target)  # multinomial Naive Bayes classifier

docs_new = ['В Ираке новые танки']
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

#######################
# to chain vectorizer => transformer => classifier, use a Pipeline
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
def docdir_handler_tfidf(dir_path, f, stop_word_list=stop_words, stop_word_pattern_list=stop_word_patterns, scale=0.9, frange=(0, )): ''' 对某一目录下的所有文档,进行遍历分词和对每篇执行f回调函数 先进行一遍tf-idf去掉 非重要词 默认阈值0.5 即在没文档的tfidf中去掉较小的50% :param dir_path: :param f: f(index, word),表示第几篇的什么单词,利用全局变量或闭包,引用等完成值传递或者操作 :return: 所有文件名 和 原始文档 这是f的一个例子,把每个文档的词连为一个字符串,同时存在列表里 corpus = [] def f(index, word): while(len(corpus) <= index): corpus.append('') corpus[index] += ' ' + word ''' filenames = [] docs = [] corpus = [] filtered_words = set() print('start cut....') print('start filter stopword...') for index, filename in enumerate(os.listdir(dir_path)): if (len(frange) > 0 and frange[0] > index) or (len(frange) > 1 and frange[1] <= index): continue filenames.append(filename) try: td_file = open(os.path.join(dir_path, filename)) td_content = td_file.read() finally: td_file.close() docs.append(td_content) seg_list = jieba.cut(td_content) for word in seg_list: word = word.strip() # 检查是否是停用词 if len(word) > 0 and word not in filtered_words and word not in stop_word_list and not pattern_check(word, stop_word_patterns) : while (len(corpus) <= index): corpus.append([]) corpus[index].append(word) else: # 备份被去掉的单词,加快匹配 filtered_words.add(word) vectorizer = CountVectorizer(dtype=np.int32) transformer = TfidfTransformer() tfidf = transformer.fit_transform( vectorizer.fit_transform([' '.join(doc) for doc in corpus])) vocas = vectorizer.get_feature_names() # weight = tfidf.toarray() weight = tfidf tfidf_filtered_count = 0 # 计算总文档的tf-idf col_weight = np.sum(weight, 0) ti = list(np.asarray(vocas)[np.argsort(col_weight)]) for index, row in enumerate(weight): # 计算该文档的tf-idf # ti = list(np.asarray(vocas)[np.argsort(row)]) for word in corpus[index]: try: if ti.index(word) >= len(vocas) * scale: f(index, word) except ValueError, e: f(index, word) else: tfidf_filtered_count += 1
def create_word_matrix(text, way=1):
    """
    For Chinese text, run word segmentation first;
    for lightweight use, call method 1, 2, or 3 as needed;
    for heavyweight use, call method 4.
    """
    if way in [1, 2, 3, 4]:
        # represent the review text as a bag of words
        if way == 1:
            from sklearn.feature_extraction.text import CountVectorizer
            vectorizer = CountVectorizer()
            X_tr_bow = vectorizer.fit_transform(text)
            X_te_bow = vectorizer.transform(text)  # split into training and test sets
            #word_llen = len(vectorizer.vocabulary_)
            #print(vectorizer.get_feature_names())
            word_matrix = X_tr_bow.toarray()
            return X_tr_bow, X_te_bow, word_matrix

        if way == 2:
            # build the bag of words, then create tf-idf from the bag-of-words matrix
            from sklearn.feature_extraction.text import CountVectorizer
            from sklearn.feature_extraction.text import TfidfTransformer
            vectorizer = CountVectorizer()
            X_tr_bow = vectorizer.fit_transform(text)
            X_te_bow = vectorizer.transform(text)
            tfidf_trfm = TfidfTransformer(norm=None)
            X_tr_tfidf = tfidf_trfm.fit_transform(X_tr_bow)
            X_te_tfidf = tfidf_trfm.transform(X_te_bow)
            return X_tr_tfidf, X_te_tfidf

        if way == 3:
            # normalize the tf-idf vectors
            from sklearn.feature_extraction.text import CountVectorizer
            vectorizer = CountVectorizer()
            X_tr_bow = vectorizer.fit_transform(text)
            X_te_bow = vectorizer.transform(text)  # split into training and test sets
            word_llen = len(vectorizer.vocabulary_)
            from sklearn.feature_extraction.text import TfidfTransformer
            from sklearn.preprocessing import StandardScaler
            tfidf_trfm = TfidfTransformer(norm=None)
            X_tr_tfidf = tfidf_trfm.fit_transform(X_tr_bow)
            X_te_tfidf = tfidf_trfm.transform(X_te_bow)
            from sklearn.preprocessing import Normalizer
            scaler = Normalizer().fit(X_tr_tfidf)
            normalized_X = scaler.transform(X_tr_tfidf)
            normalized_X_test = scaler.transform(X_te_tfidf)
            return normalized_X, normalized_X_test

        if way == 4:
            import jieba
            from sklearn.feature_extraction.text import TfidfVectorizer
            all_list = [" ".join(jieba.cut(s, cut_all=False)) for s in text]  # build the initial vocabulary
            stpwrdpath = r"E:\MyMySql\feature\data\stopword.txt"
            with open(stpwrdpath, 'rb') as fp:
                stopword = fp.read().decode('utf-8')  # load the stop words
            stopwordlist = stopword.splitlines()      # convert the stop words to a list
            tfidf = TfidfVectorizer(stop_words=stopwordlist)  # create the tfidf vectorizer
            X_tf = tfidf.fit_transform(all_list).toarray()
            X_tr_tfidf = X_tf[:-1]
            X_te_tfidf = X_tf[-1]
            return X_tr_tfidf, X_te_tfidf
count_vect = CountVectorizer(ngram_range=(1, 1), max_df=0.1) #print(count_vect) tfidf_transformer = TfidfTransformer(use_idf=True) X_train_counts = count_vect.fit_transform(data[:1050]) #print(X_train_counts) # X_train_counts2 = count_vect.transform(data[5000:]) testData = count_vect.transform(testData) #print(testData) # print (X_train_counts.shape ,X_train_counts2.shape ) # # X_train_counts = vstack([X_train_counts, X_train_counts2]).toarray() # x1 = X_train_counts.toarray().tolist() # x2 = X_train_counts2.toarray().tolist() # xf = x1 + x2 # train = np.asarray(xf) trainData = tfidf_transformer.fit_transform(X_train_counts) # trainData2 = tfidf_transformer.transform(X_train_counts[10000:]) # trainData = [*trainData , *trainData2] # print (trainData.shape , trainData2.shape) testData = tfidf_transformer.transform(testData) # print (trainData.shape) clf = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=0) clf.fit(trainData.toarray(), labels[:1050]) ans = clf.predict(testData.toarray()) for x in ans: print(x)
if word not in stopwords.words('english') ] #lowercase# labeldata1.data = labeldata1.data.str.lower() #remove punctuation# labeldata1.data = labeldata1.data.str.strip("?") # create count vector count_vect = CountVectorizer() counts = count_vect.fit_transform(labeldata1.data) # create tfidf matrix tfidf_transformer = TfidfTransformer() tfidf = tfidf_transformer.fit_transform(counts) # To check which model is more suitable to data,initialize different classification models. model = [] model.append(('SVM', SVC())) model.append(('LDA', LinearDiscriminantAnalysis())) model.append(('CART', DecisionTreeClassifier())) model.append(('NB', GaussianNB())) model.append(('RF', RandomForestClassifier())) # By cross validation using Kfold analysis identify model accuracy results = [] names = [] scoring = 'accuracy' for name, model in model:
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Sun Nov 18 09:08:44 2018
@author: xsxsz
"""
import numpy as np
from sklearn import datasets
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import GaussianNB

train = datasets.fetch_20newsgroups(subset='train')
test = datasets.fetch_20newsgroups(subset='test', shuffle=True, random_state=10)

# TfidfTransformer expects a term-count matrix, so vectorize the raw text first
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(train.data)
X_test_counts = count_vect.transform(test.data)

tfidf = TfidfTransformer()
X_train_tfidf = tfidf.fit_transform(X_train_counts)
X_test_tfidf = tfidf.transform(X_test_counts)

# GaussianNB requires dense input, so convert the sparse matrices
clf = GaussianNB()
clf.fit(X_train_tfidf.toarray(), train.target)
predict = clf.predict(X_test_tfidf.toarray())
print(metrics.classification_report(test.target, predict, target_names=test.target_names))
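Densifying the full 20-newsgroups tf-idf matrix for GaussianNB is memory-heavy; a sparse-friendly alternative (a sketch, not part of the original script) swaps in MultinomialNB, which accepts the CSR matrices directly:

from sklearn.naive_bayes import MultinomialNB

clf_sparse = MultinomialNB()
clf_sparse.fit(X_train_tfidf, train.target)        # no .toarray() needed
predict_sparse = clf_sparse.predict(X_test_tfidf)
print(metrics.classification_report(test.target, predict_sparse,
                                    target_names=test.target_names))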
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

corpus = [
    'This is the first document.',
    'This is the second second document.',
    'And the third one.',
    'Is this the first document?',
]

vector = CountVectorizer()
x = vector.fit_transform(corpus)
word = vector.get_feature_names()
print(x.toarray())
print(word)

transform = TfidfTransformer()
print(transform)
tfidf = transform.fit_transform(x)
print(tfidf.toarray())
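The CountVectorizer + TfidfTransformer pair above can also be collapsed into a single TfidfVectorizer; with default settings, the sketch below (not part of the original snippet) should produce the same weighted matrix:

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer()   # defaults match CountVectorizer + TfidfTransformer
print(tfidf_vect.fit_transform(corpus).toarray())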
def get_tf_idf(X_train_count):
    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_count)
    print "TF IDF Done", X_train_tfidf.shape
    return X_train_tfidf
def BOW_NN(directory):
    corpus, label = read_documents(directory)
    print("Corpus Size: " + str(len(corpus)))
    print("Label Size: " + str(len(label)))

    """
    Vectorize the corpus and convert it to a matrix.
    In the matrix, a row is a document and a column is a token (word).
    """
    vectorizer = CountVectorizer(min_df=1)
    sparse_matrix = vectorizer.fit_transform(corpus)
    print("Original BOW Matrix Shape: ")
    print(sparse_matrix.toarray().shape)

    """
    tf-idf weighting
    """
    transformer = TfidfTransformer(smooth_idf=True)
    tfidf = transformer.fit_transform(sparse_matrix)
    tfidf = np.array(tfidf.todense())

    """
    SVD for LSA
    """
    svd = TruncatedSVD(300)
    normalizer = Normalizer(copy=False)
    lsa = make_pipeline(svd, normalizer)
    reduced_matrix = lsa.fit_transform(tfidf)
    print("Reduced Matrix Shape: ")
    print(reduced_matrix.shape)
    # print("Tf-idf Matrix Shape: ")
    # print(tfidf.shape)

    # Generate a list of random fold indices for cross-validation use.
    cv_index = []
    for i in range(len(corpus)):
        cv_index.append(np.random.randint(0, 10))

    # Start 10-fold cross-validation:
    score_array = []
    max_score = 0.0
    for j in range(10):
        # datasets, labelsets = make_idx_data_cv(tfidf, label, cv_index, j)
        datasets, labelsets = make_idx_data_cv(reduced_matrix, label, cv_index, j)

        """
        Neural network classification
        """
        print("====================Start Neural Networks Classifier Training " + str(j + 1) + "==============================")
        clf = MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(500,), random_state=1)
        print("Training Dataset Shape: ")
        print(datasets[0].shape)
        clf.fit(datasets[0], labelsets[0])
        print("Training Complete.")

        # Predict on the held-out fold.
        print("Start Predicting...")
        score = clf.score(datasets[1], labelsets[1])
        score_array.append(score)
        # if score > max_score:
        #     max_score = score
        # Save the classifier with joblib; load it back with "clf = joblib.load('filename.pkl')".
        # joblib.dump(clf, 'E:/A1113/FYP/BloombergNews/BloombergNews/SVM+BOW.pkl')
        print("Testing data accuracy :")
        print(score)

    print("====================Cross Validation Complete==============================")
    # print("Highest accuracy is " + str(max_score))
    print("Average accuracy is " + str(np.mean(score_array)))
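For comparison, the same LSA-plus-MLP evaluation can be written more compactly with scikit-learn's own cross-validation utilities. This is an alternative sketch, not the original code; it assumes the dense tfidf array and the label list built inside BOW_NN.

from sklearn.pipeline import make_pipeline
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.neural_network import MLPClassifier

# One pipeline: SVD -> length normalization -> MLP, scored with 10-fold CV.
lsa_mlp = make_pipeline(
    TruncatedSVD(300),
    Normalizer(copy=False),
    MLPClassifier(solver='lbfgs', alpha=1e-5, hidden_layer_sizes=(500,), random_state=1),
)
scores = cross_val_score(lsa_mlp, tfidf, label, cv=10)
print("Average accuracy is " + str(scores.mean()))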
while os.path.exists(splitfilename):
    with open(splitfilename, 'r+', encoding='UTF-8-sig', errors='ignore') as wf:
        word_lst = []
        a = wf.read()
        # word_lst = list(a.split(','))
        a = "".join(a)
        c.append(a)
        # word_lst.append(a.split(' '))
    num += 1
    splitfilename = "split_text/split" + str(num) + ".txt"

vectorizer = CountVectorizer()
transformer = TfidfTransformer()
tfidf = transformer.fit_transform(vectorizer.fit_transform(c))
word = vectorizer.get_feature_names()   # keywords across all documents
weight = tfidf.toarray()                # the corresponding tf-idf matrix
# print(weight)

# n_clusters sets the number of clusters to produce (177 here)
kmeans = KMeans(n_clusters=177, random_state=0).fit(weight)

# cluster_centers_ holds the centroid of each cluster; keep them in the df_center DataFrame
center = kmeans.cluster_centers_
df_center = pd.DataFrame(center)

# labels_ gives the cluster assignment of every document
labels = kmeans.labels_
# print(labels)
# print(kmeans.labels_)
print(kmeans.cluster_centers_)
Vectorize with TF-IDF weighting
TfidfTransformer()
norm='l2'
    How each document's feature vector is normalized.
    L2: scale the vector so the sum of the squares of its elements is 1 (the default; Euclidean length)
    L1: scale the vector so the sum of the absolute values of its elements is 1 (Manhattan length)
smooth_idf=False
    Whether to add a small value (smoothing) for terms that would otherwise hit a zero count
    when building the features, or to build them as-is
sublinear_tf=False
use_idf=True
    Whether to build the features with TF-IDF or with raw term frequencies
'''
from sklearn.feature_extraction.text import TfidfTransformer

transformer = TfidfTransformer(smooth_idf=False)
# print(transformer)
# TfidfTransformer(norm='l2', smooth_idf=False, sublinear_tf=False, use_idf=True)
feature_tfidf = transformer.fit_transform(feature_vector)
'''
%%time
feature_tfidf.shape
'''
tfidf_freq = pd.DataFrame(feature_tfidf.toarray(), columns=vocab)
df_tfidf = pd.DataFrame(tfidf_freq.sum())
df_tfidf_top = df_tfidf.sort_values(by=0, ascending=False)
# print(df_tfidf_top.head())
'''
Clustering
- K-Means
- MiniBatchKMeans
https://scikit-learn.org/stable/auto_examples/cluster/plot_mini_batch_kmeans.html
'''
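To make the norm and smooth_idf options described above concrete, here is a minimal sketch (not from the original notebook) run on a tiny hand-made count matrix:

import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer

counts = np.array([[3, 0, 1],
                   [2, 0, 0],
                   [3, 0, 2]])

for norm in ('l1', 'l2', None):
    tfidf = TfidfTransformer(norm=norm, smooth_idf=True, use_idf=True)
    weighted = tfidf.fit_transform(counts).toarray()
    # with norm='l1' each row sums to 1; with norm='l2' each row has unit length;
    # with norm=None the raw tf * idf products are returned
    print(norm, np.round(weighted, 2))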
# print(df.head())
df = pd.read_csv('./movie_data.csv')

from sklearn.feature_extraction.text import CountVectorizer

count = CountVectorizer()
docs = np.array([
    'The sun is shining',
    'The weather is sweet',
    'The sun is shining and the weather is sweet'
])
bag = count.fit_transform(docs)

from sklearn.feature_extraction.text import TfidfTransformer

tfidf = TfidfTransformer()
np.set_printoptions(precision=2)
print(tfidf.fit_transform(bag).toarray())

# Clean the text with regular expressions
import re

def preprocessor(text):
    text = re.sub('<[^>]*>', '', text)                            # strip HTML tags
    emoticons = re.findall('(?::|;|=)(?:-)?(?:\)|\(|D|P)', text)  # keep emoticons
    text = re.sub('[\W]+', ' ', text.lower()) + ''.join(emoticons).replace('-', '')
    return text

df['review'] = df['review'].apply(preprocessor)
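A quick sanity check of the preprocessor (the sample string is made up for illustration): tags are stripped, the text is lower-cased and stripped of punctuation, and the emoticon is appended without its "nose":

sample = 'This movie is great!!!<br /><br />10/10 would watch again :-)'
print(preprocessor(sample))
# roughly: 'this movie is great 10 10 would watch again :)'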
for index, row in clickbait.iterrows():
    if isinstance(row["status_message_without_tags"], str):
        texts.append(row["status_message_without_tags"])
        y.append('c')

# min_df is the minimum document frequency a word needs in order to be kept
word_vectorizer = CountVectorizer(analyzer='word', min_df=4)
X = word_vectorizer.fit_transform(texts)
# print("Shape of X: ", X.shape)
# print("Vocabulary", word_vectorizer.vocabulary_)
# print("Vocabulary length = ", len(word_vectorizer.vocabulary_))

# 'Normalize' the count matrix X (more precisely, scale down the impact of high-frequency terms)
tfid = TfidfTransformer()
X_normalized = tfid.fit_transform(X)

#####################################################
#   3 CLASSIFICATION FOR RAW X AND X NORMALIZED     #
#####################################################
model(X, y)
# Normalization improves the measured precision and recall
model(X_normalized, y)

# def feature_extraction(text):
#     urls = re.findall(r'http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', text)
#     extracted_features = [len(text), text.count('!'), len(urls)]
#     return extracted_features
count_vect1 = CountVectorizer(min_df=3, stop_words='english')

# class_1_Count = count_vect1.fit_transform(class_1.data)
# class_1_tfidf = tfidf_transform.fit_transform(class_1_Count)
# print "Number of terms in class1 data TF-IDF representation:", class_1_tfidf.shape
# class_2_Count = count_vect1.fit_transform(class_2.data)
# class_2_tfidf = tfidf_transform.fit_transform(class_2_Count)
# print "Number of terms in class2 data TF-IDF representation:", class_2_tfidf.shape

totalCount = count_vect1.fit_transform(total.data)
totalData_tfidf = tfidf_transform.fit_transform(totalCount)
# print "Number of terms in combined data TFxIDF representation:", totalData_tfidf.shape

labels = [int(x / 4) for x in total.target]

svd = TruncatedSVD(n_components=2)
totalLSI = svd.fit_transform(totalData_tfidf)

kmeans = KMeans(n_clusters=2, n_init=30).fit(totalLSI)

x1 = totalLSI[kmeans.labels_ == 0][:, 0]
y1 = totalLSI[kmeans.labels_ == 0][:, 1]
plt.plot(x1, y1, 'r+', label='Computer Technology')
x2 = totalLSI[kmeans.labels_ == 1][:, 0]
def test_tfidf_no_smoothing():
    X = [[1, 1, 1],
         [1, 1, 0],
         [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=False, norm='l2')
    tfidf = tr.fit_transform(X).toarray()
    assert_true((tfidf >= 0).all())

    # check normalization
    assert_array_almost_equal((tfidf ** 2).sum(axis=1), [1., 1., 1.])

    # the lack of smoothing makes IDF fragile in the presence of a feature with
    # only zeros
    X = [[1, 1, 0],
         [1, 1, 0],
         [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=False, norm='l2')

    # First we need to verify that numpy here provides div 0 warnings
    with warnings.catch_warnings(record=True) as w:
        1. / np.array([0.])
        numpy_provides_div0_warning = len(w) == 1

    with warnings.catch_warnings(record=True) as w:
        tfidf = tr.fit_transform(X).toarray()
        if not numpy_provides_div0_warning:
            raise SkipTest("Numpy does not provide div 0 warnings.")
        assert_equal(len(w), 1)
        # For Python 3 compatibility
        if hasattr(w[0].message, 'args'):
            assert_true("divide by zero" in w[0].message.args[0])
        else:
            assert_true("divide by zero" in w[0].message)
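As a complementary check (a sketch, not part of the original test suite), enabling smooth_idf avoids the divide-by-zero path on the same all-zero feature while keeping the rows L2-normalized:

def test_tfidf_with_smoothing_sketch():
    # Same matrix as above: the third feature never occurs.
    X = [[1, 1, 0],
         [1, 1, 0],
         [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=True, norm='l2')
    with warnings.catch_warnings(record=True) as w:
        warnings.simplefilter("always")
        tfidf = tr.fit_transform(X).toarray()
    # no divide-by-zero warning should be raised when smoothing is enabled
    assert_true(not any("divide by zero" in str(warning.message) for warning in w))
    assert_array_almost_equal((tfidf ** 2).sum(axis=1), [1., 1., 1.])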