class TF_Transformer(base.BaseEstimator, base.TransformerMixin):
    def __init__(self):
        self.cv_bi = CountVectorizer(min_df=2, max_df=0.7, ngram_range=(1, 2))
        self.tfidf_trans = TfidfTransformer()
        self.SVD_trans = TruncatedSVD(n_components=300)

    # X is a list of Fit_Review named tuples, y is None
    def fit(self, X, y=None):
        texts = [review.text for review in X]
        counts = self.cv_bi.fit_transform(texts)
        counts_tfidf = self.tfidf_trans.fit_transform(counts)
        self.SVD_trans.fit(counts_tfidf)
        return self

    # X is a list of either Fit_Review or Prod_Corpus named tuples
    def transform(self, X):
        texts = [review.text for review in X]
        counts = self.cv_bi.transform(texts)
        counts_tfidf = self.tfidf_trans.transform(counts)
        counts_trunc = self.SVD_trans.transform(counts_tfidf)
        return counts_trunc
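# A minimal usage sketch for TF_Transformer above, assuming a Fit_Review named
# tuple with a `text` field; the sample reviews, the labels and the
# LogisticRegression step are illustrative assumptions, not part of the original code.
from collections import namedtuple

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

Fit_Review = namedtuple('Fit_Review', ['text'])

reviews = [Fit_Review(text='fits true to size'),
           Fit_Review(text='runs small through the shoulders')]
labels = [1, 0]

# TF_Transformer exposes fit/transform, so it can feed any downstream estimator.
model = Pipeline([('features', TF_Transformer()),
                  ('clf', LogisticRegression())])
# model.fit(reviews, labels)  # needs a corpus with more than 300 features for the SVD step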
def check_webshell(clf, dir):
    all = 0
    all_php = 0
    webshell = 0

    webshell_files_list = load_files_re(webshell_dir)
    CV = CountVectorizer(ngram_range=(3, 3), decode_error="ignore", max_features=max_features,
                         token_pattern=r'\b\w+\b', min_df=1, max_df=1.0)
    x = CV.fit_transform(webshell_files_list).toarray()
    transformer = TfidfTransformer(smooth_idf=False)
    transformer.fit_transform(x)

    g = os.walk(dir)
    for path, d, filelist in g:
        for filename in filelist:
            fulepath = os.path.join(path, filename)
            t = load_file(fulepath)
            t_list = []
            t_list.append(t)
            x2 = CV.transform(t_list).toarray()
            x2 = transformer.transform(x2).toarray()
            y_pred = clf.predict(x2)
            all += 1
            if filename.endswith('.php'):
                all_php += 1
            if y_pred[0] == 1:
                print "%s is webshell" % fulepath
                webshell += 1

    print "Scan %d files(%d php files),%d files is webshell" % (all, all_php, webshell)
def text_sentiment(docs_new):
    docs_new = [docs_new]
    twenty_train = load_files('./Sentiment')  # the complete data is in this directory; like comp.graphics etc
    count_vect = CountVectorizer()
    X_train_counts = count_vect.fit_transform(twenty_train.data)

    tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
    X_train_tf = tf_transformer.transform(X_train_counts)

    tfidf_transformer = TfidfTransformer()
    X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

    # Fit a classifier on the training set
    #clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)
    #f = open('my_classifier.pickle', 'wb')
    #pickle.dump(clf, f)
    #f = open('my_classifier.pickle',)
    #clf = pickle.load(f)
    #f.close()

    # save the classifier
    #with open('my_sentiment.pkl', 'wb') as fid:
    #    cPickle.dump(clf, fid)

    # load it again
    with open('my_sentiment.pkl', 'rb') as fid:
        clf = cPickle.load(fid)

    X_new_counts = count_vect.transform(docs_new)
    X_new_tfidf = tfidf_transformer.transform(X_new_counts)

    predicted = clf.predict(X_new_tfidf)
    # predict() returns an array, so index its single element before looking up the label
    return twenty_train.target_names[predicted[0]]
def handle_doc(word_set,rs_path): doc_dir = os.listdir(rs_path) doc_matrix = [] doc_cat = [] for docs in doc_dir: files = os.listdir(rs_path+docs) print "start to handle the --> "+docs for file_d in files: d_path = rs_path+docs+'/'+file_d #get the single file path with open(d_path,'rb') as text_file: str_tmp = '' file_lines = text_file.readlines() for line in file_lines: pattern = r'''[a-zA-Z]+''' tokens = nltk.regexp_tokenize(line,pattern) for t in tokens: if t.lower() in word_set: str_tmp += t.lower() str_tmp += ' ' doc_matrix.append(str_tmp) doc_cat.append(cat_dic[docs]) text_file.close() str_tmp = '' for sw in word_set: str_tmp += sw str_tmp += ' ' doc_matrix.append(str_tmp) doc_cat.append('NAN') vectorizer = CountVectorizer() doc_num = vectorizer.fit_transform(doc_matrix) tfidf = TfidfTransformer() doc_tfidf = tfidf.fit_transform(doc_num) return doc_tfidf[:-1,:],doc_cat[:-1]
def bayes_tfidf(prefix, sufix, dic_fn):
    """
    prefix example: ./data/single_label_sen/sen_spanish_protest
    sufix example: pop_cat
    """
    train_file = prefix + "_train.txt.tok"
    test_file = prefix + "_test.txt.tok"
    train_y_file = prefix + "_train." + sufix
    test_y_file = prefix + "_test." + sufix

    dic_cn = {k.strip(): i for i, k in enumerate(open(dic_fn))}

    word_train_set = [l.strip().lower() for l in open(train_file)]
    word_test_set = [l.strip().lower() for l in open(test_file)]

    train_y = [dic_cn[l.strip()] for l in open(train_y_file)]
    test_y = [dic_cn[l.strip()] for l in open(test_y_file)]

    # construct the word count matrix
    count_vect = CountVectorizer()
    train_set_count = count_vect.fit_transform(word_train_set)
    test_set_count = count_vect.transform(word_test_set)

    # construct tfidf matrix
    tfidf_transformer = TfidfTransformer()
    train_set_x = tfidf_transformer.fit_transform(train_set_count)
    test_set_x = tfidf_transformer.transform(test_set_count)

    print "start the model"
    test_score = bayes_experiment([train_set_x, train_y], [test_set_x, test_y])
    return test_score
def get_feature_by_bag_tfidf():
    global white_count
    global black_count
    global max_features
    print "max_features=%d" % max_features
    x = []
    y = []

    webshell_files_list = load_files_re(webshell_dir)
    y1 = [1] * len(webshell_files_list)
    black_count = len(webshell_files_list)

    wp_files_list = load_files_re(whitefile_dir)
    y2 = [0] * len(wp_files_list)
    white_count = len(wp_files_list)

    x = webshell_files_list + wp_files_list
    y = y1 + y2

    CV = CountVectorizer(ngram_range=(2, 4), decode_error="ignore", max_features=max_features,
                         token_pattern=r'\b\w+\b', min_df=1, max_df=1.0)
    x = CV.fit_transform(x).toarray()

    transformer = TfidfTransformer(smooth_idf=False)
    x_tfidf = transformer.fit_transform(x)
    x = x_tfidf.toarray()

    return x, y
def race_tfidf(data, can_be_noun_arg, stop_words):
    print
    data = data.groupby('race')['last']
    data = dict(list(data))
    docs = []
    for k in data:
        docs.append(' '.join(data[k]))
    count_vectorizer = CountVectorizer(stop_words='english')
    counts = count_vectorizer.fit_transform(docs)
    #print counts.todense().shape
    tfidf = TfidfTransformer(norm="l2", sublinear_tf=True)
    tfidf.fit(counts)
    #print "IDF:", tfidf.idf_.shape
    tf_idf_matrix = tfidf.transform(counts)
    freqs = {}
    sorted_voc = sorted(count_vectorizer.vocabulary_.iteritems(), key=operator.itemgetter(1))
    terms, _ = zip(*sorted_voc)
    for i, k in enumerate(data.keys()):
        # make list
        row = np.array(tf_idf_matrix.todense()[i, :])[0].tolist()
        freq = zip(terms, row)
        freqs[k] = sorted(freq, reverse=True, key=lambda x: x[1])
        print freqs[k][:5]
    #print tf_idf_matrix.todense().shape
    return freqs
def work_with_simple_bag_of_words():
    count = CountVectorizer()
    docs = np.array([
        'The sun is shining',
        'The weather is sweet',
        'The sun is shining and the weather is sweet',
    ])
    bag = count.fit_transform(docs)
    print(count.vocabulary_)
    print(bag.toarray())

    np.set_printoptions(precision=2)
    tfidf = TfidfTransformer(use_idf=True, norm='l2', smooth_idf=True)
    print(tfidf.fit_transform(bag).toarray())

    tf_is = 2
    n_docs = 3
    idf_is = np.log((n_docs + 1) / (3 + 1))
    tfidf_is = tf_is * (idf_is + 1)
    print("tf-idf of term 'is' = %.2f" % tfidf_is)

    tfidf = TfidfTransformer(use_idf=True, norm=None, smooth_idf=True)
    raw_tfidf = tfidf.fit_transform(bag).toarray()[-1]
    print(raw_tfidf)

    l2_tfidf = raw_tfidf / np.sqrt(np.sum(raw_tfidf**2))
    print(l2_tfidf)
def make_training_xy(self, data):
    X = self.vectorizer.fit_transform(data.body)
    if self.tfidf:
        X = TfidfTransformer().fit_transform(X)
    X = X.tocsc()
    Y = normalize_scores(data.net, data.subreddit[0])
    return X, Y
def test_tfidf_no_smoothing(): X = [[1, 1, 1], [1, 1, 0], [1, 0, 0]] tr = TfidfTransformer(smooth_idf=False, norm='l2') tfidf = tr.fit_transform(X).toarray() assert_true((tfidf >= 0).all()) # check normalization assert_array_almost_equal((tfidf ** 2).sum(axis=1), [1., 1., 1.]) # the lack of smoothing make IDF fragile in the presence of feature with # only zeros X = [[1, 1, 0], [1, 1, 0], [1, 0, 0]] tr = TfidfTransformer(smooth_idf=False, norm='l2') clean_warning_registry() with warnings.catch_warnings(record=True) as w: 1. / np.array([0.]) numpy_provides_div0_warning = len(w) == 1 in_warning_message = 'divide by zero' tfidf = assert_warns_message(RuntimeWarning, in_warning_message, tr.fit_transform, X).toarray() if not numpy_provides_div0_warning: raise SkipTest("Numpy does not provide div 0 warnings.")
def tfidf(fileList):
    segPath = sys.path[0] + '/seg_result'
    corpus = []  # holds the word-segmentation result of each document
    for eachFile in fileList:
        fileName = segPath + '/' + eachFile
        f = open(fileName, 'r+')
        content = f.read()
        corpus.append(content)

    vectorizer = CountVectorizer()  # converts the words in the texts into a term-frequency matrix; element a[i][j] is the frequency of word j in document i
    transformer = TfidfTransformer()  # computes the tf-idf weight of every word
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))  # the first fit_transform computes tf-idf, the second converts the texts into a term-frequency matrix
    word = vectorizer.get_feature_names()  # all the words in the bag-of-words model
    weight = tfidf.toarray()  # extract the tf-idf matrix; element a[i][j] is the tf-idf weight of word j in document i

    # create the tfidf folder and save the tf-idf results
    tfidfFilePath = os.getcwd() + '/tfidfFile'
    if not os.path.exists(tfidfFilePath):
        os.mkdir(tfidfFilePath)
    for i in range(len(weight)):
        print u"--------Writing all the tf-idf in the", i, u" file into ", tfidfFilePath + '/' + str(i) + '.txt', "--------"
        name = tfidfFilePath + '/' + string.zfill(i, 5) + '.txt'
        f = open(name, 'w+')
        for j in range(len(word)):
            #f.write(word[j] + " " + str(weight[i][j]) + "\n")
            #f.write(str(weight[i][j]) + "\n")
            f.write(word[j] + "\n")
        f.close()
def test_pickling_transformer():
    X = CountVectorizer().fit_transform(JUNK_FOOD_DOCS)
    orig = TfidfTransformer().fit(X)
    s = pickle.dumps(orig)
    copy = pickle.loads(s)
    assert_equal(type(copy), orig.__class__)
    assert_array_equal(
        copy.fit_transform(X).toarray(),
        orig.fit_transform(X).toarray())
def test_tfidf_no_smoothing(): X = [[1, 1, 1], [1, 1, 0], [1, 0, 0]] tr = TfidfTransformer(smooth_idf=False, norm='l2') tfidf = tr.fit_transform(X).toarray() assert_true((tfidf >= 0).all()) # check normalization assert_array_almost_equal((tfidf ** 2).sum(axis=1), [1., 1., 1.]) # the lack of smoothing make IDF fragile in the presence of feature with # only zeros X = [[1, 1, 0], [1, 1, 0], [1, 0, 0]] tr = TfidfTransformer(smooth_idf=False, norm='l2') with warnings.catch_warnings(record=True) as w: tfidf = tr.fit_transform(X).toarray() assert_equal(len(w), 1) # For Python 3 compatibility if hasattr(w[0].message,'args') : assert_true("divide by zero" in\ w[0].message.args[0]) else : assert_true("divide by zero" in\ w[0].message)
class VectorModel(object):

    def __init__(self, list_of_comments=None):
        self.__list_of_comments = list_of_comments
        self.__vectorizer = []
        self.__corpus_simple_vector = []
        self.__transformer = []
        self.__corpus_tf_idf = []
        #self.prepare_models()

    def prepare_models(self):
        self.__vectorizer = CountVectorizer()
        vector = self.__vectorizer.fit_transform(self.__list_of_comments)
        self.__corpus_simple_vector = vector.toarray()
        self.__transformer = TfidfTransformer()
        tfidf = self.__transformer.fit_transform(self.__corpus_simple_vector)
        self.__corpus_tf_idf = tfidf.toarray()
        return [self.__vectorizer, self.__corpus_simple_vector, self.__transformer, self.__corpus_tf_idf]

    def set_models(self, vectorizer, transformer):
        self.__vectorizer = vectorizer
        self.__transformer = transformer

    def get_comment_frequency_vector(self, comments):
        vec_comments = []
        for i in comments:
            vec_comments.append(i)
        vectores = self.__vectorizer.transform(vec_comments).toarray()
        return vectores

    def get_comment_tf_idf_vector(self, comments):
        vector = self.get_comment_frequency_vector(comments)
        result = self.__transformer.transform(vector).toarray()
        return result
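# A short, hypothetical usage sketch for VectorModel above; the comment strings
# are invented for illustration, and the class is assumed to be in scope together
# with the scikit-learn imports it relies on.
comments = ["great product, works as advertised",
            "terrible support, would not buy again",
            "works fine for the price"]

model = VectorModel(comments)
vectorizer, counts, transformer, tfidf = model.prepare_models()

# The fitted models can then be reused on unseen comments.
new_tfidf = model.get_comment_tf_idf_vector(["support was great"])
print(new_tfidf.shape)  # (1, vocabulary size)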
class UnitClassifier(Trainer): def __init__(self, x, y, train_ratio): super(UnitClassifier, self).__init__(x, y, train_ratio) self._count_vec = CountVectorizer() self._tfidf_transformer = TfidfTransformer() def Fit(self): x_count = self._count_vec.fit_transform(self._x_train) self._tfidf_transformer.fit(x_count) def Preprocess(self, x): return self._tfidf_transformer.transform(self._count_vec.transform(x)) def Learn(self, x_train, y_train): LOG.info('x_train.shape = %s', str(x_train.shape)) LOG.info('len(y_train) = %d', len(y_train)) clf = RandomForestClassifier(verbose=0, n_jobs=-1, n_estimators=20) LOG.info('Training...') clf.fit(x_train, y_train) LOG.info('Done...') return clf def Eval(self): LOG.info('Eval ...') y_pred = self.Predict(self._x_test) return { 'misclass': np.mean(y_pred != self._y_test), 'report': classification_report(self._y_test, y_pred, target_names=self._model.classes_) }
class CaloriesRegressor(Trainer): def __init__(self, x, y, train_ratio): super(CaloriesRegressor, self).__init__(x, y, train_ratio) self._count_vec = CountVectorizer() self._tfidf_transformer = TfidfTransformer() def Fit(self): x_count = self._count_vec.fit_transform(self._x_train) self._tfidf_transformer.fit(x_count) def Preprocess(self, x): return self._tfidf_transformer.transform(self._count_vec.transform(x)) def Learn(self, x_train, y_train): LOG.info('x_train.shape = %s', str(x_train.shape)) LOG.info('len(y_train) = %d', len(y_train)) clf = RandomForestRegressor(verbose=0, n_jobs=-1, n_estimators=100) LOG.info('Training...') clf.fit(x_train, y_train) LOG.info('Done...') return clf def Eval(self): LOG.info('Eval ...') y_pred = self.Predict(self._x_test) return { 'median_absolute_error': median_absolute_error(self._y_test, y_pred), 'mean_squared_error': mean_squared_error(self._y_test, y_pred), 'explained_variance_score': explained_variance_score(self._y_test, y_pred), }
def load_dataset(prefix, sufix, dic_fn, vocab_fn='./data/english_review.trn-100000.vocab'):
    train_file = prefix + "_train.txt.tok"
    test_file = prefix + "_test.txt.tok"
    train_y_file = prefix + "_train." + sufix
    test_y_file = prefix + "_test." + sufix

    dic_cn = {k.strip(): i for i, k in enumerate(open(dic_fn))}

    word_train_set = [l.strip().lower() for l in open(train_file)]
    word_test_set = [l.strip().lower() for l in open(test_file)]

    train_y = [dic_cn[l.strip()] for l in open(train_y_file)]
    test_y = [dic_cn[l.strip()] for l in open(test_y_file)]

    vocab = [l.strip().lower().split("\t")[0] for l in open(vocab_fn)]

    count_vect = CountVectorizer(vocabulary=vocab)
    train_set_count = count_vect.fit_transform(word_train_set)
    test_set_count = count_vect.transform(word_test_set)

    tfidf_transformer = TfidfTransformer()
    train_set_x = tfidf_transformer.fit_transform(train_set_count).toarray()
    test_set_x = tfidf_transformer.transform(test_set_count).toarray()

    train_shared_x, train_shared_y = shared_dataset([train_set_x, train_y])
    test_shared_x, test_shared_y = shared_dataset([test_set_x, test_y])
    return [(train_shared_x, train_shared_y), (test_shared_x, test_shared_y)]
def tfidf_score(train_set, test_set): stopwords = nltk.corpus.stopwords.words('english') vectorizer = TfidfVectorizer(min_df=1, stop_words=set(stopwords)) #Remove all the None Types from the input datasets train_set = filter(None, train_set) test_set = filter(None, test_set) vectorizer.fit_transform(train_set) #print "Word Index is {0} \n".format(vectorizer.vocabulary_) smatrix = vectorizer.transform(test_set) tfidf = TfidfTransformer(norm="l2") tfidf.fit(smatrix) #print "IDF scores:", tfidf.idf_ tf_idf_matrix = tfidf.transform(smatrix) pairwise_similarity = tf_idf_matrix * tf_idf_matrix.T msum = tf_idf_matrix.sum(axis=1) cos_sum = pairwise_similarity.sum(axis=1) mlist = msum.tolist() cos_sim = cos_sum.tolist() count = 0 tfidfscores = {} for s in train_set: tfidfscores[s] = [] tfidfscores[s].append(mlist[count][0]) tfidfscores[s].append(cos_sim[count][0]) count += 1 return tfidfscores
def getContextFeature(self):
    import time
    print 'start to get Context Feature'
    start = time.time()

    from sklearn.feature_extraction.text import TfidfTransformer
    from sklearn.feature_extraction.text import CountVectorizer

    # when we meet the large corpus, need to input an iteration!
    corpus = self.getIterText()

    # transfer the text into word frequency matrix
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))

    print 'get word'
    word = vectorizer.get_feature_names()
    print 'get weight'
    weight = tfidf
    print 'weight type:', type(weight)
    #print weight

    end = time.time()
    print 'total time: \t', end - start
    return weight, word
def getTfidfData(dataTrain, dataTest, dataHold): print dataTrain.target_names count_vect = CountVectorizer(strip_accents='ascii', stop_words='english', max_features=len(dataTrain.target) * 2) tfidf_transformer = TfidfTransformer(sublinear_tf=True) X_counts = count_vect.fit_transform(dataTrain.data) X_tfidf = tfidf_transformer.fit_transform(X_counts) print X_tfidf.shape Y_counts = count_vect.transform(dataTest.data) Y_tfidf = tfidf_transformer.transform(Y_counts) print Y_tfidf.shape H_counts = count_vect.transform(dataHold.data) H_tfidf = tfidf_transformer.transform(H_counts) print 'feature selection using chi square test', len(dataTrain.target) feature_names = count_vect.get_feature_names() ch2 = SelectKBest(chi2, k='all') X_tfidf = ch2.fit_transform(X_tfidf, dataTrain.target) Y_tfidf = ch2.transform(Y_tfidf) H_tfidf = ch2.transform(H_tfidf) if feature_names: # keep selected feature names feature_names = [feature_names[i] for i in ch2.get_support(indices=True)] if feature_names: feature_names = numpy.asarray(feature_names) print 'important features' print feature_names[:10] return X_tfidf, Y_tfidf, H_tfidf
def tf_idf(seg_files):
    seg_path = './segfile/'
    corpus = []
    for file in seg_files:
        fname = seg_path + file
        f = open(fname, 'r+')
        content = f.read()
        f.close()
        corpus.append(content)

    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfdif = transformer.fit_transform(vectorizer.fit_transform(corpus))

    word = vectorizer.get_feature_names()
    weight = tfdif.toarray()

    save_path = './tfidffile'
    # os.path.exists() is the public check; os._exists() is an internal helper
    if not os.path.exists(save_path):
        os.mkdir(save_path)

    for i in range(len(weight)):
        print('--------Writing all the tf-idf in the', i, u' file into ',
              save_path + '/' + string.zfill(i, 5) + '.txt', '--------')
        f = open(save_path + '/' + string.zfill(i, 5) + '.txt', 'w+')
        for j in range(len(word)):
            f.write(word[j] + ' ' + str(weight[i][j]) + '\r\n')
        f.close()
def test_classifiers():
    print "running bayes classifier.."
    # train_bayesian_classifier_from_scratch()

    dataset = get_thing_from_file("training_dataset.txt")
    print dataset.target_names

    bayes = get_thing_from_file("bayes.txt")
    bayes_model = bayes.fit(dataset.data, dataset.target)
    bayes_model = get_thing_from_file("bayes_model.txt")

    results = []
    count = 0
    url_arr = []

    bayes_predicted = bayes_model.predict(dataset)

    # for url in get_test_articles():
    #     url_arr.append(url)

    article_arr = get_article_array(url_arr)

    docs_new = ['God is love', 'OpenGL on the GPU is fast']

    # the vectorizer and transformer must be fitted on the training corpus
    # before they can transform new documents
    count_vect = CountVectorizer()
    tfidf_trans = TfidfTransformer()
    train_counts = count_vect.fit_transform(dataset.data)
    tfidf_trans.fit(train_counts)

    x_new_counts = count_vect.transform(docs_new)
    x_new_horse = tfidf_trans.transform(x_new_counts)
    predicted = bayes_model.predict(x_new_horse)

    for doc, category in zip(docs_new, predicted):
        print('%r => %s' % (doc, dataset.target_names[category]))
def tfidf(corpus, word_category, file_to_write):
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))
    weight = tfidf.toarray()
    sum_weight = np.sum(weight, axis=0)
    word = vectorizer.get_feature_names()
    word_and_weight = []
    for i in range(len(sum_weight)):
        word_and_weight.append([word[i], sum_weight[i]])
    word_and_weight.sort(key=lambda key: key[1], reverse=True)
    f = open(file_to_write, "w+")
    result = []
    for j in range(len(word_and_weight)):
        try:
            f.write(word_and_weight[j][0] + " " + str(word_and_weight[j][1]) + " " +
                    word_category[word_and_weight[j][0]] + "\n")
            result.append([word_and_weight[j][0], word_and_weight[j][1],
                           word_category[word_and_weight[j][0]]])
        except:
            continue
    f.close()
    return result
def extract_text_features(train_data, test_data):
    """
    Returns one type of training and test data features.
        1) Term Frequency times Inverse Document Frequency (tf-idf): X_train_tfidf, X_test_tfidf

    Parameters
    ----------
    train_data : List[str]
        Training data in list. Will only take 30000 reviews for efficiency purposes
    test_data : List[str]
        Test data in list

    Returns
    -------
    Tuple(scipy.sparse.csr.csr_matrix,.., list)
        Returns X_train_tfidf, X_test_tfidf, vocab as a tuple.
    """
    # set up a count vectorizer that removes english stopwords when building a term-doc matrix
    count_vect = CountVectorizer(stop_words=set(stopwords.words('english')))
    # build the term frequency per document matrix from a random sublist of 30,000 documents
    train_counts = count_vect.fit_transform(random.sample(train_data, 30000))
    test_counts = count_vect.transform(test_data)
    tfidf_transformer = TfidfTransformer()
    train_tfidf = tfidf_transformer.fit_transform(train_counts)
    test_tfidf = tfidf_transformer.transform(test_counts)
    vocab = count_vect.get_feature_names()
    return (train_tfidf, test_tfidf, vocab)
def cal_product_description_tfidf(): #PART II compute the tf-idf for product description print "\nBegins,compute the tf-idf for product description ..." product_description_data = pd.read_csv('product_descriptions.csv') print "\nMerge the product description into database..." AllSet = pd.merge( AllSet , product_description_data, how='left', on='product_uid') print "\nStemming the product description ..." AllSet['product_description'] = AllSet['product_description'].map(lambda x: stem_process(x)) product_description=AllSet['product_description'] print "\nGet the (product description vocabulary)-(search term) frequency matrix..." search_vect_descrip = CountVectorizer(stop_words='english', binary=True)# use binary value to indicate the frequency search_vect_descrip.fit(product_description)#learn the vocabulary search_descrip_fq_matrix = search_vect_descrip.transform(search_term) #get the (product description vocabulary)-(search term) frequency matrix print "\nGet the (product description vocabulary)-(product_description) frequency matrix..." description_vect = CountVectorizer(stop_words ='english') description_vect.fit_transform(product_description)#learn the vocabulary description_fq_matrix=description_vect.transform(product_description) #get the (product discription vocabulary)-(product_description) frequency matrix print "\nGet the idf matrix..." tfidf_transformer = TfidfTransformer(norm="l2",smooth_idf=True) tfidf_transformer.fit(description_fq_matrix) # get idf for each vocabulary tf_idf_descrip_matrix = tfidf_transformer.transform(description_fq_matrix) #get the idf matrix print "\nCompute the result of tf-idf for product description ..." tf_idf_descrip_result=[]#compute the result of tf-idf for product title for index in range(tf_idf_descrip_matrix.shape[0]): tf_idf_descrip_result.append((np.multiply(tf_idf_descrip_matrix[index], search_descrip_fq_matrix[index].transpose()))[0, 0]) pd.DataFrame({"id":AllSet['id'],"product_description_tfidf": tf_idf_descrip_result}).to_csv('product_description_tfidf.csv', index=False)
def LR_modeling(file_name, k, AUC=True, weight=False): raw_data = pd.read_csv(file_name) raw_data = raw_data.drop(['issue', 'field'], axis=1) X = raw_data.drop('panelvote', axis=1) y = raw_data['panelvote'] tfidf = TfidfTransformer(norm=u'l2', use_idf=True, smooth_idf=True, sublinear_tf=False) X = tfidf.fit_transform(X.values) X_train, X_test, y_train, y_test = cross_validation.train_test_split(X, y, test_size=0.8, random_state=42) lr = LogisticRegression(C=1) lr.fit(X_train, y_train) auc = np.mean(cross_validation.cross_val_score(lr, X, y, scoring="roc_auc")) if AUC == True: print "AUC for %s on the test data = %.3f" % (file_name, auc) if weight == False: top_positive, top_negative = get_top_k_nocoeff(lr.coef_[0], k) return raw_data.columns[top_positive], raw_data.columns[top_negative] else: top_positive, top_negative = get_top_k(lr.coef_[0], k) final_pos = {} final_neg = {} for i in top_positive.keys(): final_pos[raw_data.columns[i]] = top_positive[i] for j in top_negative.keys(): final_neg[raw_data.columns[j]] = top_negative[j] pos = sorted(final_pos.items(), key=operator.itemgetter(1), reverse=True) neg = sorted(final_neg.items(), key=operator.itemgetter(1)) return pos, neg
def cal_product_title_tfidf(): #PART I compute the tf-idf for product title print "\nBegins,compute the tf-idf for product title ..." print "\nStemming product_title..." AllSet['product_title'] = AllSet['product_title'].map(lambda x : stem_process(x)) product_title = AllSet['product_title'] print "\nGet the (product title vocabulary)-(search term) frequency matrix..." search_vect_tittle = CountVectorizer(stop_words='english', binary=True)# use binary value to indicate the frequency search_vect_tittle.fit(product_title)#learn the vocabulary search_tittle_fq_matrix = search_vect_tittle.transform(search_term) #get the (product title vocabulary)-(search term) frequency matrix print "\nGet the (product title vocabulary)-(product_title) frequency matrix" title_vect = CountVectorizer(stop_words='english') title_vect.fit_transform(product_title)#learn the vocabulary title_fq_matrix = title_vect.transform(product_title) #get the (product title vocabulary)-(product_title) frequency matrix print "\nGet the idf matrix" tfidf_transformer = TfidfTransformer(norm="l2", smooth_idf=True) tfidf_transformer.fit(title_fq_matrix) # get idf for each vocabulary tf_idf_title_matrix = tfidf_transformer.transform(title_fq_matrix) #get the idf matrix print "\nCompute the result of tf-idf for product title ..." tf_idf_title_result = [] #compute the result of tf-idf for product title for index in range(tf_idf_title_matrix.shape[0]): tf_idf_title_result.append((np.multiply(tf_idf_title_matrix[index], search_tittle_fq_matrix[index].transpose()))[0, 0]) pd.DataFrame({"id": AllSet['id'],"product_title_tfidf": tf_idf_title_result}).to_csv('product_title_tfidf.csv', index=False) return 0
def __init__(self, **kwargs): pickle_loader = PickleLoader() saved_pickles_path = kwargs.get("saved_pickles_path") self.pickle_paths = PicklePaths(parent_dir=saved_pickles_path) if self.pickle_paths.pickles_exist(): self.title_count_vectorizer = pickle_loader.load_pickle( self.pickle_paths.TITLE_COUNT_VECTORIZER_PICKLE_PATH ) self.title_tfidf_transformer = pickle_loader.load_pickle( self.pickle_paths.TITLE_TFIDF_TRANSFORMER_PICKLE_PATH ) self.title_classifier = pickle_loader.load_pickle(self.pickle_paths.TITLE_CLASSIFIER_PICKLE_PATH) self.desc_count_vectorizer = pickle_loader.load_pickle(self.pickle_paths.DESC_COUNT_VECTORIZER_PICKLE_PATH) self.desc_tfidf_transformer = pickle_loader.load_pickle( self.pickle_paths.DESC_TFIDF_TRANSFORMER_PICKLE_PATH ) self.desc_classifier = pickle_loader.load_pickle(self.pickle_paths.DESC_CLASSIFIER_PICKLE_PATH) else: # todo: don't forget lemmatization, stopwords. etc self.title_count_vectorizer = CountVectorizer() self.title_tfidf_transformer = TfidfTransformer() self.desc_count_vectorizer = CountVectorizer() self.desc_tfidf_transformer = TfidfTransformer() self.title_classifier = None self.desc_classifier = None
def estimation(file='song_text.txt', separator=u'--text--'): arr = text_split_line(file, u'--text--') dvect = data_vector(arr) target = dvect[0] text = dvect[1] dic = dvect[2] # for converting target integer to artist name # print (target) # print (dic) count_vect = CountVectorizer() word_vect = count_vect.fit_transform(text) tfidf_transformer = TfidfTransformer() vect_tfidf = tfidf_transformer.fit_transform(word_vect) machine = svm.SVC(probability=True) # one of the best for text, see tutorial working with text machine.fit(vect_tfidf, target) print (machine.score(vect_tfidf, target)) prediction = machine.predict(vect_tfidf) # accuracy test (tutorial) print (u'model predictive accuracy: {:.1%}' .format(np.mean(prediction == target))) new_texts = [text[500], text[2345], text[-2], text[0], text[5893]] new_data = count_vect.transform(new_texts) new_tfidf = tfidf_transformer.transform(new_data) prediction = machine.predict(new_tfidf) for i in range(len(new_texts)): print (u'{}\t=> {}'.format(new_texts[i].splitlines()[:2], dic[prediction[i]])) return
def use_pipeline_with_fs(self): ##################### #Build a vectorizer / classifier pipeline that filters out tokens that are too rare or too frequent ##################### pipeline = Pipeline([ ('vect', TfidfVectorizer(stop_words=stopwords, min_df=3, max_df=0.90)), ("selector", SelectPercentile()), ('clf', RandomForestClassifier()), ]) # Build a grid search to find the best parameter # Fit the pipeline on the training set using grid search for the parameters parameters = { 'vect__ngram_range': [(1, 1), (1, 2), (1, 3)], 'vect__use_idf': (True, False), 'clf__n_estimators': (10, 50, 100), 'clf__criterion': ("gini", "entropy"), 'clf__max_depth': (None, 2, 4), 'clf__min_samples_split': (2, 4, 6), 'selector__score_func': (chi2, f_classif), 'selector__percentile': (85, 95, 100), } ################# # Exhaustive search over specified parameter values for an estimator, use cv to generate data to be used # implements the usual estimator API: when “fitting” it on a dataset all the possible combinations of parameter values are evaluated and the best combination is retained. ################# cv = StratifiedShuffleSplit(y_train, n_iter=5, test_size=0.2, random_state=42) grid_search = GridSearchCV(pipeline, param_grid=parameters, cv=cv, n_jobs=-1) clf_gs = grid_search.fit(docs_train, y_train) ############### # print the cross-validated scores for the each parameters set explored by the grid search ############### best_parameters, score, _ = max(clf_gs.grid_scores_, key=lambda x: x[1]) for param_name in sorted(parameters.keys()): print("%s: %r" % (param_name, best_parameters[param_name])) print("Score for gridsearch is %0.2f" % score) #y_predicted = clf_gs.predict(docs_test) ############### # run the classifier again with the best parameters # in order to get 'clf' for get_important_feature function! 
############### ngram_range = best_parameters['vect__ngram_range'] use_idf = best_parameters['vect__use_idf'] score_func = best_parameters['selector__score_func'] percentile = best_parameters['selector__percentile'] # vectorisation count_vect = CountVectorizer(stop_words=stopwords, min_df=3, max_df=0.90, ngram_range=ngram_range) X_CV = count_vect.fit_transform(docs_train) # print number of unique words (n_features) print("Shape of train data is " + str(X_CV.shape)) # tfidf transformation tfidf_transformer = TfidfTransformer(use_idf=use_idf) X_tfidf = tfidf_transformer.fit_transform(X_CV) ################# # feature selection ################# selector = SelectPercentile(score_func=score_func, percentile=percentile) combined_features = Pipeline([("vect", count_vect), ("tfidf", tfidf_transformer), ("feat_select", selector)]) X_features = combined_features.fit_transform(docs_train, y_train) X_test_features = combined_features.transform(docs_test) print("Shape of train data after feature selection is " + str(X_features.shape)) print("Shape of test data after feature selection is " + str(X_test_features.shape)) # run classifier on selected features clf = RandomForestClassifier().fit(X_features, y_train) # get the features which are selected and write to file feature_boolean = selector.get_support(indices=False) f = open(path_to_store_feature_selection_boolean_file, 'w') for fb in feature_boolean: f.write(str(fb) + '\n') f.close() ################## # get cross validation score ################## scores = cross_val_score(clf, X_features, y_train, cv=10, scoring='f1_weighted') print("Cross validation score: " + str(scores)) # Get average performance of classifier on training data using 10-fold CV, along with standard deviation print("Cross validation accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) ################# # run classifier on test data ################# y_predicted = clf.predict(X_test_features) # print the mean accuracy on the given test data and labels print("Classifier score on test data is: %0.2f " % clf.score(X_test_features, y_test)) # Print and plot the confusion matrix print(metrics.classification_report(y_test, y_predicted)) cm = metrics.confusion_matrix(y_test, y_predicted) print(cm) # import matplotlib.pyplot as plt # plt.matshow(cm) # plt.show() return clf, count_vect
remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation) def LemNormalize(text): return LemTokens( nltk.word_tokenize(text.lower().translate(remove_punct_dict))) LemVectorizer = CountVectorizer(tokenizer=LemNormalize, stop_words='english') LemVectorizer.fit_transform(array) print(LemVectorizer.vocabulary_) tf_matrix = LemVectorizer.transform(array).toarray() tfidfTran = TfidfTransformer(norm="l2") tfidfTran.fit(tf_matrix) print tfidfTran.idf_ tfidf_matrix = tfidfTran.transform(tf_matrix) print("") print( "*************** printing tfidf_martrix ****************************************" ) print("") print tfidf_matrix.toarray() cos_similarity_matrix = tfidf_matrix * tfidf_matrix.T print("") print( "*************** printing cos_similarity_matrix *******************************" ) print("")
def train_classifier(self): # Get list of features count_vect = CountVectorizer(stop_words=stopwords, min_df=3, max_df=0.90, ngram_range=_ngram_range) X_CV = count_vect.fit_transform(docs_train) # print number of unique words (n_features) print("Shape of train data is " + str(X_CV.shape)) # tfidf transformation### tfidf_transformer = TfidfTransformer(use_idf=_use_idf) X_tfidf = tfidf_transformer.fit_transform(X_CV) # train the classifier print("Fitting data ...") clf = RandomForestClassifier(n_estimators=_n_estimators, criterion=_criterion, max_depth=_max_depth, min_samples_split=_min_samples_split).fit( X_tfidf, y_train) ################## # get cross validation score ################## scores = cross_val_score(clf, X_tfidf, y_train, cv=10, scoring='f1_weighted') print("Cross validation score: " + str(scores)) # Get average performance of classifier on training data using 10-fold CV, along with standard deviation print("Cross validation accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) ################## # run classifier on test data ################## X_test_CV = count_vect.transform(docs_test) print("Shape of test data is " + str(X_test_CV.shape)) X_test_tfidf = tfidf_transformer.transform(X_test_CV) y_predicted = clf.predict(X_test_tfidf) # print the mean accuracy on the given test data and labels print("Classifier score on test data is: %0.2f " % clf.score(X_test_tfidf, y_test)) print(metrics.classification_report(y_test, y_predicted)) cm = metrics.confusion_matrix(y_test, y_predicted) print(cm) return clf, count_vect
def make_reuters_data(data_dir): np.random.seed(1234) from sklearn.feature_extraction.text import CountVectorizer from os.path import join did_to_cat = {} cat_list = ['CCAT', 'GCAT', 'MCAT', 'ECAT'] with open(join(data_dir, 'rcv1-v2.topics.qrels')) as fin: for line in fin.readlines(): line = line.strip().split(' ') cat = line[0] did = int(line[1]) if cat in cat_list: did_to_cat[did] = did_to_cat.get(did, []) + [cat] for did in list(did_to_cat.keys()): if len(did_to_cat[did]) > 1: del did_to_cat[did] dat_list = [ 'lyrl2004_tokens_test_pt0.dat', 'lyrl2004_tokens_test_pt1.dat', 'lyrl2004_tokens_test_pt2.dat', 'lyrl2004_tokens_test_pt3.dat', 'lyrl2004_tokens_train.dat' ] data = [] target = [] cat_to_cid = {'CCAT': 0, 'GCAT': 1, 'MCAT': 2, 'ECAT': 3} del did for dat in dat_list: with open(join(data_dir, dat)) as fin: for line in fin.readlines(): if line.startswith('.I'): if 'did' in locals(): assert doc != '' if did in did_to_cat: data.append(doc) target.append(cat_to_cid[did_to_cat[did][0]]) did = int(line.strip().split(' ')[1]) doc = '' elif line.startswith('.W'): assert doc == '' else: doc += line assert len(data) == len(did_to_cat) x = CountVectorizer(dtype=np.float64, max_features=2000).fit_transform(data) y = np.asarray(target) from sklearn.feature_extraction.text import TfidfTransformer x = TfidfTransformer(norm='l2', sublinear_tf=True).fit_transform(x) x = x[:10000] y = y[:10000] x = np.asarray(x.todense()) * np.sqrt(x.shape[1]) print('todense succeed') p = np.random.permutation(x.shape[0]) x = x[p] y = y[p] print('permutation finished') assert x.shape[0] == y.shape[0] x = x.reshape((x.shape[0], x.size / x.shape[0])) np.save(join(data_dir, 'reutersidf10k.npy'), {'data': x, 'label': y})
def loaddata(data):
    data1 = jieba.cut(data)
    data11 = ""
    for item in data1:
        data11 += item + " "
    return data11

dataf = pda.read_sql(sql, conn)
dataf2 = dataf.T
title = dataf2.values[0]
content = dataf2.values[1]
train_text = []
for i in content:
    thisdata = loaddata(str(i))
    pat1 = "<[^>]*?>"
    thisdata = re.sub(pat1, "", thisdata)
    thisdata = thisdata.replace("\n", "").replace("\t", "")
    train_text.append(thisdata)

count_vect = CountVectorizer()
train_x_counts = count_vect.fit_transform(train_text)

# tf-idf model
from sklearn.feature_extraction.text import TfidfTransformer
tf_ts = TfidfTransformer(use_idf=True).fit(train_x_counts)
train_x_tf = tf_ts.transform(train_x_counts)

from sklearn.cluster import KMeans
kms = KMeans(n_clusters=3)
y = kms.fit_predict(train_x_tf)
dataf["type"] = y
dataf.to_csv("./01聚类归类法结果.csv")
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from skmultilearn.problem_transform import LabelPowerset, ClassifierChain
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn import svm

# LabelPowerset allows for multi-label classification
# Build a pipeline for multi-label classification with a linear SVM
text_clf = Pipeline([
    ('vect', CountVectorizer(stop_words="english", lowercase=True, ngram_range=(1, 1))),
    ('tfidf', TfidfTransformer(use_idf=True)),
    ('clf', LabelPowerset(svm.LinearSVC())),
])

voting_clf = Pipeline([
    ('vect', CountVectorizer(stop_words="english", lowercase=True, ngram_range=(1, 1))),
    ('tfidf', TfidfTransformer(use_idf=False)),
    ('clf', ClassifierChain(svm.LinearSVC())),
])

text_clf = text_clf.fit(X_train, y_train)
voting_clf = voting_clf.fit(X_train, y_train)
HashingVectorizer(ngram_range=(1, 3), n_features=2**27, dtype=np.float32, norm='l2', lowercase=False, stop_words=stopwords)), ('drop_cols', DropColumnsByDf(min_df=2)), ])) ], n_jobs=1) sparse_merge = vectorizer.fit_transform(merge) print(f'[{time() - start_time}] Merge vectorized') print(sparse_merge.shape) tfidf_transformer = TfidfTransformer() X = tfidf_transformer.fit_transform(sparse_merge) print(f'[{time() - start_time}] TF/IDF completed') X_train = X[:nrow_train] print(X_train.shape) X_test = X[nrow_train:] del merge del sparse_merge del vectorizer del tfidf_transformer gc.collect() X_train, X_test = intersect_drop_columns(X_train, X_test, min_df=1)
y = dataemp["label"]

# In[ ]:

# Vectorizing the text data

# In[40]:

cv = CountVectorizer()
X = cv.fit_transform(X)

# Feeding the output of vectors into TFIDF transformer

# In[41]:

tfidf = TfidfTransformer()
X = tfidf.fit_transform(X)  # keep the tf-idf matrix (the original call dropped this return value)

# Splitting the training and test data in the ratio of 70% training data and 30% test data

# In[42]:

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

# Naive Bayes Classification model

# In[43]:
print len(sent_list_old) print sent_list_old print sent_list_old[3] for sentence in sent_list_old: sent_list.append(clean_str(sentence)) print sent_list[3] print len(sent_list) vectorizer = CountVectorizer(min_df=1, stop_words='english', strip_accents='ascii') count_vectorizer = vectorizer.fit_transform(sent_list) transformer = TfidfTransformer(smooth_idf=True) tfidf = transformer.fit_transform(count_vectorizer) print count_vectorizer.shape mat = tfidf.toarray() km = SphericalKMeans(10) clusters = km.fit(mat) centroids = km.cluster_centers_ labels = km.labels_ summa_index = [] for c in range(10): simi_list = [] max_simi = -1 * float("inf") max_idx = -1 centroid = centroids[c, :]
@file: tf-idf-liiuxuejiang.py
@time: 18-3-28 6:43 PM
@contact: [email protected]
'''
import jieba
import jieba.posseg as pseg
import os
import sys
from sklearn import feature_extraction
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

if __name__ == "__main__":
    corpus = [
        "我 来到 北京 清华大学",  # tokenized text of the first document, words separated by spaces
        "他 来到 了 网易 杭研 大厦",  # tokenized text of the second document
        "小明 硕士 毕业 与 中国 科学院",  # tokenized text of the third document
        "我 爱 北京 天安门"]  # tokenized text of the fourth document
    vectorizer = CountVectorizer()  # converts the words into a term-frequency matrix; element a[i][j] is the frequency of word j in document i
    transformer = TfidfTransformer()  # computes the tf-idf weight of every word
    tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus))  # the first fit_transform computes tf-idf, the second converts the texts into a term-frequency matrix
    word = vectorizer.get_feature_names()  # all the words in the bag-of-words model
    weight = tfidf.toarray()  # the tf-idf matrix; element a[i][j] is the tf-idf weight of word j in document i
    for i in range(len(weight)):  # print the tf-idf weight of every word: the outer loop iterates over documents, the inner loop over the words of one document
        print(u"------- tf-idf weights of the words in document", i, u"------")
        for j in range(len(word)):
            print(word[j], weight[i][j])
def predict_posts(self): docs_train, docs_test, y_train, y_test = train_test_split( X, y, test_size=0.01, random_state=42) print("Number of data point is " + str(len(y))) ############### # uncomment either one of the below # predict unlabelled tweet OR test classifier on gold standard ############### # dataset_topredict = pd.read_csv(path_to_file_to_be_predicted, header=0, names=['tweets']) dataset_topredict = pd.read_csv(path_to_gold_standard_file, header=0, names=['tweets', 'class']) X_topredict = dataset_topredict['tweets'] y_goldstandard = dataset_topredict['class'] ############### # train classifier ############### # Get list of features count_vect = CountVectorizer(stop_words=stopwords, min_df=3, max_df=0.90, ngram_range=_ngram_range) X_CV = count_vect.fit_transform(docs_train) # print number of unique words (n_features) print("Shape of train data is " + str(X_CV.shape)) # tfidf transformation### tfidf_transformer = TfidfTransformer(use_idf=_use_idf) X_tfidf = tfidf_transformer.fit_transform(X_CV) # train the classifier print("Fitting data ...") clf = RandomForestClassifier().fit(X_tfidf, y_train) ################## # get cross validation score ################## scores = cross_val_score(clf, X_tfidf, y_train, cv=10, scoring='f1_weighted') print("Cross validation score: " + str(scores)) # Get average performance of classifier on training data using 10-fold CV, along with standard deviation # the factor two is to signify 2 sigma, which is 95% confidence level print("Cross validation accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) ################## # run classifier to predict tweets ################## X_test_CV = count_vect.transform(X_topredict) print("Shape of test data is " + str(X_test_CV.shape)) X_test_tfidf = tfidf_transformer.transform(X_test_CV) y_predicted = clf.predict(X_test_tfidf) ################## # run classifier on gold standard (tweets that were labelled by twitter insight) ################## # print the mean accuracy on the given test data and labels print("Classifier score on test data is: %0.2f " % clf.score(X_test_tfidf, y_goldstandard)) print(metrics.classification_report(y_goldstandard, y_predicted)) cm = metrics.confusion_matrix(y_goldstandard, y_predicted) print(cm) ################## # write prediction results to file ################## f = open(path_to_store_predicted_results, 'w') for yp in y_predicted: f.write(yp + '\n') f.close()
from sklearn.datasets import fetch_20newsgroups

Training_Data = fetch_20newsgroups(subset='train', shuffle=True)

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

classificationText = Pipeline([('vect', CountVectorizer()),
                               ('tfidf', TfidfTransformer()),
                               ('clf', MultinomialNB())])

classificationText = classificationText.fit(Training_Data.data, Training_Data.target)

# Performance measurement of NB Classifier
import numpy as np

Testing_Data = fetch_20newsgroups(subset='test', shuffle=True)
prediction_target = classificationText.predict(Testing_Data.data)
print("Accuracy in Categorization in percentage : ",
      (np.mean(prediction_target == Testing_Data.target)) * 100)
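# A small follow-up sketch, assuming the pipeline fitted above; the example
# documents and the model.joblib file name are placeholders, not part of the original code.
import joblib

docs_new = ['God is love', 'OpenGL on the GPU is fast']
predicted = classificationText.predict(docs_new)
for doc, category in zip(docs_new, predicted):
    print('%r => %s' % (doc, Training_Data.target_names[category]))

# Persisting the whole pipeline (vectorizer + tf-idf + classifier) keeps the
# fitted vocabulary and idf weights together with the classifier.
joblib.dump(classificationText, 'model.joblib')
# classificationText = joblib.load('model.joblib')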
def train_classifier_use_feature_selection(self): # Get list of features count_vect = CountVectorizer(stop_words=stopwords, min_df=3, max_df=0.90, ngram_range=_ngram_range) X_CV = count_vect.fit_transform(docs_train) # print number of unique words (n_features) print("Shape of train data is " + str(X_CV.shape)) # tfidf transformation### tfidf_transformer = TfidfTransformer(use_idf=_use_idf) X_tfidf = tfidf_transformer.fit_transform(X_CV) ################# # feature selection ################# selector = SelectPercentile(score_func=_score_func, percentile=_percentile) print("Fitting data with feature selection ...") selector.fit(X_tfidf, y_train) # get how many features are left after feature selection X_features = selector.transform(X_tfidf) print("Shape of array after feature selection is " + str(X_features.shape)) clf = RandomForestClassifier(n_estimators=_n_estimators, criterion=_criterion, max_depth=_max_depth, min_samples_split=_min_samples_split).fit( X_features, y_train) # get the features which are selected and write to file feature_boolean = selector.get_support(indices=False) f = open(path_to_store_feature_selection_boolean_file, 'w') for fb in feature_boolean: f.write(str(fb) + '\n') f.close() ################## # get cross validation score ################## scores = cross_val_score(clf, X_features, y_train, cv=10, scoring='f1_weighted') print("Cross validation score: " + str(scores)) # Get average performance of classifier on training data using 10-fold CV, along with standard deviation print("Cross validation accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)) #################### #test clf on test data #################### X_test_CV = count_vect.transform(docs_test) print("Shape of test data is " + str(X_test_CV.shape)) X_test_tfidf = tfidf_transformer.transform(X_test_CV) # apply feature selection on test data too X_test_selector = selector.transform(X_test_tfidf) print("Shape of array for test data after feature selection is " + str(X_test_selector.shape)) y_predicted = clf.predict(X_test_selector) # print the mean accuracy on the given test data and labels print("Classifier score on test data is: %0.2f " % clf.score(X_test_selector, y_test)) print(metrics.classification_report(y_test, y_predicted)) cm = metrics.confusion_matrix(y_test, y_predicted) print(cm) return clf, count_vect
pickle.dump(nb, f) f.close() # importing classification report and confussion matrix from sklearn.metrics import confusion_matrix,classification_report print(confusion_matrix(y_test,predictions)) print('\n') print(classification_report(y_test,predictions)) from sklearn.feature_extraction.text import TfidfTransformer from sklearn.pipeline import Pipeline pipeline = Pipeline([ ('bow', CountVectorizer()), # strings to token integer counts ('tfidf', TfidfTransformer()), # integer counts to weighted TF-IDF scores ('classifier', MultinomialNB()), # train on TF-IDF vectors w/ Naive Bayes classifier ]) X = text['review'] y = df['rating'] X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.3,random_state=101) pipeline.fit(X_train,y_train) predictions = pipeline.predict(X_test) print("Actual Ratings(rating): ",end = "") display(y_test[:15]) print("Predicted Ratings: ",end = "") # print(predictions[:15])
twenty_train = fetch_20newsgroups(subset='train', shuffle=True)
print("length of the twenty_train--------->", len(twenty_train))
#print(twenty_train.target_names)  # prints all the categories
print("***First Line of the First Data File***")
#print("\n".join(twenty_train.data[0].split("\n")[:5]))  # prints first line of the first data file

# 2 Extracting features from text files
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)
print('dim=', X_train_counts.shape)

# 3 TF-IDF
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
print(X_train_tfidf.shape)

# Machine Learning
# 4 Training Naive Bayes (NB) classifier on training data.
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, twenty_train.target)

# Building a pipeline: We can write less code and do all of the above, by building a pipeline as follows:
# The names 'vect', 'tfidf' and 'clf' are arbitrary but will be used later.
# We will be using the 'text_clf' going forward.
from sklearn.pipeline import Pipeline
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer()),
                     ('clf', MultinomialNB())])
text_clf = text_clf.fit(twenty_train.data, twenty_train.target)
def tfidf_transformer(bow_matrix):
    transformer = TfidfTransformer(norm='l2',
                                   smooth_idf=True,
                                   use_idf=True)
    tfidf_matrix = transformer.fit_transform(bow_matrix)
    return transformer, tfidf_matrix
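# A brief usage sketch for the tfidf_transformer() helper above; the corpus and
# variable names are made up for illustration.
from sklearn.feature_extraction.text import CountVectorizer

corpus = ['the sky is blue', 'the sun is bright', 'the sun in the sky is bright']
cv = CountVectorizer()
bow_matrix = cv.fit_transform(corpus)

transformer, tfidf_matrix = tfidf_transformer(bow_matrix)
print(tfidf_matrix.shape)  # (3, vocabulary size)

# The fitted transformer can be reused on new bag-of-words matrices.
new_tfidf = transformer.transform(cv.transform(['the bright blue sky']))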
def load_imdb(feature_type='tfidf'): """ Load IMDB data in several formats. :param feature_type: feature type, default is 'tfidf', others are 'origin', 'tfidf-seq' :return: """ data_dir = "../DeepForestTF_Data/" if (os.path.exists(data_dir + "imdb_x_train.npy") and os.path.exists(data_dir + "imdb_x_test.npy") and os.path.exists(data_dir + "imdb_y_train.npy") and os.path.exists(data_dir + "imdb_y_test.npy")): x_train = np.load(data_dir + "imdb_x_train.npy") x_test = np.load(data_dir + "imdb_x_test.npy") y_train = np.load(data_dir + "imdb_y_train.npy") y_test = np.load(data_dir + "imdb_y_test.npy") x_train = x_train.reshape((x_train.shape[0], -1)) x_test = x_test.reshape((x_test.shape[0], -1)) return x_train, x_test, y_train, y_test max_features = 0 if feature_type.startswith('tfidf'): max_features = 5000 (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features) else: (x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=None) if feature_type == 'origin': max_len = 400 x_train = sequence.pad_sequences(x_train, maxlen=max_len) x_test = sequence.pad_sequences(x_test, maxlen=max_len) elif feature_type == 'tfidf': from sklearn.feature_extraction.text import TfidfTransformer transformer = TfidfTransformer(smooth_idf=True) x_train_bin = np.zeros((len(x_train), max_features), dtype=np.int16) x_test_bin = np.zeros((len(x_test), max_features), dtype=np.int16) for i, x_i in enumerate(x_train): x_train_bin[i, :] = np.bincount(x_i, minlength=max_features) for i, x_i in enumerate(x_test): x_test_bin[i, :] = np.bincount(x_i, minlength=max_features) transformer.fit_transform(x_train_bin) x_train = transformer.transform(x_train_bin) x_test = transformer.transform(x_test_bin) x_train = np.asarray(x_train.todense()) x_test = np.asarray(x_test.todense()) elif feature_type == 'tfidf-seq': from sklearn.feature_extraction.text import TfidfTransformer transformer = TfidfTransformer(smooth_idf=True) transformer2 = TfidfTransformer(smooth_idf=True) max_len = 400 n_train = len(x_train) n_test = len(x_test) x_train_bin = np.zeros((n_train, max_features), dtype=np.int16) x_test_bin = np.zeros((n_test, max_features), dtype=np.int16) for i, x_i in enumerate(x_train): x_train_bin_i = np.bincount(x_i) x_train_bin[i, :len(x_train_bin_i)] = x_train_bin_i for i, x_i in enumerate(x_test): x_test_bin_i = np.bincount(x_i) x_test_bin[i, :len(x_test_bin_i)] = x_test_bin_i x_train_tfidf = transformer.fit_transform(x_train_bin) x_test_tfidf = transformer2.fit_transform(x_test_bin) x_train_tfidf = np.asarray(x_train_tfidf.todense()) x_test_tfidf = np.asarray(x_test_tfidf.todense()) x_train_id = sequence.pad_sequences(x_train, maxlen=max_len) x_test_id = sequence.pad_sequences(x_test, maxlen=max_len) x_train = np.zeros(x_train_id.shape, dtype=np.float32) x_test = np.zeros(x_test_id.shape, dtype=np.float32) for i in range(n_train): x_train[i, :] = x_train_tfidf[i][x_train_id[i]] for i in range(n_test): x_test[i, :] = x_test_tfidf[i][x_test_id[i]] else: raise ValueError('Unknown feature type: {}'.format(feature_type)) x_train = x_train[:, np.newaxis, :, np.newaxis].astype('float32') x_test = x_test[:, np.newaxis, :, np.newaxis].astype('float32') return x_train, x_test, y_train.astype('int8'), y_test.astype('int8')
file = os.path.join(args.train, "processed_data.csv") # Loading Data df = pd.read_csv(file, engine="python") # isolating the target column (label) y = df['LABEL'] X = df.drop(['LABEL'], axis=1) # Splitting into train and test set X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42) nb_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('nb', MultinomialNB())]) parameters_ = { 'vect__ngram_range': [(1, 1), (1, 2), (2, 2)], 'tfidf__use_idf': (True, False), 'tfidf__norm': ('l1', 'l2'), 'nb__alpha': [1, 1e-1, 1e-2] } #cls_naivebayes_ = MultinomialNB() clf = GridSearchCV(nb_clf, param_grid=parameters_, cv=5) clf.fit(X_train["PROCESSED_REVIEW"].values, y_train) # saving the model using joblib joblib.dump(clf, os.path.join(args.model_dir, 'model.joblib')) def input_fn(input_data, content_type='application/json'):
tf = cv.fit_transform(comments) terms = cv.get_feature_names() term_sums = tf.sum(axis=0) term_counts = [] for i in range(len(terms)): term_counts.append([terms[i], term_sums[0,i]]) def sortSecond(e): return e[1] term_counts.sort(key=sortSecond, reverse=True) print("\nTerms with Highest Frequency:") for i in range(50): print('{:<15s}{:>5d}'.format(term_counts[i][0], term_counts[i][1])) print("") # Modify tf, term frequencies, to TF/IDF matrix from the data print("Conducting Term/Frequency Matrix using TF-IDF") tfidf_vect = TfidfTransformer(norm=None, use_idf=True) #set norm=None tf = tfidf_vect.fit_transform(tf) term_idf_sums = tf.sum(axis=0) term_idf_scores = [] for i in range(len(terms)): term_idf_scores.append([terms[i], term_idf_sums[0,i]]) term_idf_scores.sort(key=sortSecond, reverse=True) # In sklearn, SVD is synonymous with LSA (Latent Semantic Analysis) lda = LatentDirichletAllocation(n_components=n_topics, max_iter=max_iter,\ learning_method=learning_method, \ learning_offset=learning_offset, \ random_state=12345) lda.fit_transform(tf)
# I want to find articles related to the demonstrations in Hong Kong #So i wanted originally to just make my query 'Hong Kong' but I landed on: # 'hong', 'protests' and 'extradition' terms = ['hong', 'protests', 'extradition'] # After several tries with versatile letters and "hong kong", i just settled for # 'hong'. It's not like 'hong' og 'kong' is seperate words in english anyway term_idxs = [model_vect.vocabulary_.get(term) for term in terms] term_counts = [counts[idx] for idx in term_idxs] print(term_counts) #Here we get the term counts for each of the three words in our query from sklearn.feature_extraction.text import TfidfTransformer model_tfidf = TfidfTransformer() data_tfidf = model_tfidf.fit_transform(data_vect) data_tfidf # And then here I transform the count to a tfidf representation idfs = model_tfidf.idf_ term_idfs = [idfs[idx] for idx in term_idxs] term_idfs # And I get the individual weights for my selected terms df = pd.DataFrame(columns=['count', 'idf'], index=terms, data=zip(term_counts,term_idfs)) df # Here i use pandas to make a dataframe, that will let me compare my term counts with # my term weights. The one that have the largest difference is 'extradition', which i # guess means that it has a higher semantic value compared to how often it is represented
neg_word2vec_scores[i] += neg_min #a_min = -min(diff) #diff = [i + a_min for i in diff] print X[0] #pos_word2vec_scores, neg_word2vec_scores = np.array(pos_word2vec_scores), np.array(neg_word2vec_scores) #diff = np.array(diff) sentiment_prob_list = np.array(sentiment_prob_list) X = np.array(X) Y = np.array(Y) print X.shape print Y.shape vectorizer = CountVectorizer(stop_words=stop) transformer = TfidfTransformer() #tfv = TfidfVectorizer(min_df=3, max_features=None, strip_accents='unicode', \ # analyzer='word',token_pattern=r'\w{1,}',ngram_range=(1, 2), use_idf=1,smooth_idf=1,sublinear_tf=1) trainVectorizerArray = vectorizer.fit_transform(X).toarray() transformer.fit(trainVectorizerArray) L = transformer.transform(trainVectorizerArray).toarray() print L.shape #tfv.fit(X) #Z = tfv.transform(X) #svd = TruncSVD(n_components = 25) #totalsvd = svd.fit(Z) #totalsvd = svd.fit_transform(Z) #totalsvd = totalsvd[:,:] + 1
    file_obj.close()
    return bunch

# write a bunch object to disk
def writebunchobj(path, bunchobj):
    file_obj = open(path, "wb")
    pickle.dump(bunchobj, file_obj)
    file_obj.close()

# 1. read the stop-word list
stopword_path = "train_word_bag/hlt_stop_words.txt"
stpwrdlst = readfile(stopword_path).splitlines()

# 2. import the tokenized word-vector bunch object
path = "train_word_bag/train_set.dat"  # path where the word-vector space is saved
bunch = readbunchobj(path)

# 3. build the tf-idf word-vector space object
tfidfspace = Bunch(target_name=bunch.target_name, label=bunch.label,
                   filenames=bunch.filenames, tdm=[], vocabulary={})

# 4. initialise the vector-space model with TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words=stpwrdlst, sublinear_tf=True, max_df=0.5)
transformer = TfidfTransformer()  # computes the tf-idf weight of every word

# convert the texts into a term-frequency matrix and save the vocabulary separately
tfidfspace.tdm = vectorizer.fit_transform(bunch.contents)
tfidfspace.vocabulary = vectorizer.vocabulary_

# persist the bag-of-words model
space_path = "train_word_bag/tfdifspace.dat"  # path where the word-vector space is saved
writebunchobj(space_path, tfidfspace)

print "tf-idf word-vector space created successfully!!!"
twenty_test = fetch_20newsgroups(subset='test', categories=categories, shuffle=True)
#twenty_train=fetch_20newsgroups(data_home='./scikit_learn_data',subset='train',shuffle=True)
#print(twenty_train)
#twenty_test=fetch_20newsgroups(data_home='./scikit_learn_data',subset='test',shuffle=True)
#print(twenty_test)
print("Number of Training Examples: ", len(twenty_train.data))
print("Number of Test Examples: ", len(twenty_test.data))
print(twenty_train.target_names)

from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(twenty_train.data)

from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn import metrics

mod = MultinomialNB()
mod.fit(X_train_tfidf, twenty_train.target)

X_test_counts = count_vect.transform(twenty_test.data)
X_test_tfidf = tfidf_transformer.transform(X_test_counts)
predicted = mod.predict(X_test_tfidf)

print("Accuracy: ", accuracy_score(twenty_test.target, predicted))
print(classification_report(twenty_test.target, predicted))
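# Equivalent, more compact sketch (not in the original): chain the vectorizer,
# tf-idf transformer and classifier in a sklearn Pipeline so the test data only
# needs a single predict call.
from sklearn.pipeline import Pipeline

text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
text_clf.fit(twenty_train.data, twenty_train.target)
pipeline_pred = text_clf.predict(twenty_test.data)
print("Pipeline accuracy: ", accuracy_score(twenty_test.target, pipeline_pred))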
# load data.npy and target.npy
data = sp.load('data.npy')
target = sp.load('target.npy')

# vectorize method step 1: count vectorizer (bag of words)
from sklearn.feature_extraction.text import CountVectorizer
countVector = CountVectorizer(stop_words=stopWords, decode_error='ignore')
trainCounts = countVector.fit_transform(data)
# .shape output format: (sample number, dict size)
# print(trainCounts.shape)

# vectorize method step 2: term-frequency transformer
# (use_idf=False, so this is plain TF rather than full TF-IDF)
from sklearn.feature_extraction.text import TfidfTransformer
tfTransformer = TfidfTransformer(use_idf=False).fit(trainCounts)
trainTF = tfTransformer.transform(trainCounts)
# print(trainTF.shape)

# build naive Bayes classifier
from sklearn.naive_bayes import MultinomialNB
# 1) use bag of words vector
naiveBayesClassifier = MultinomialNB().fit(trainCounts, target)
# 2) use TF vector
# naiveBayesClassifier = MultinomialNB().fit(trainTF, target)

# print test results
# the features used for prediction must match the ones used for fitting:
# the classifier above was fit on raw counts, so predict on counts as well
# (switch to tfTransformer.transform(...) if option 2 is used instead)
predicted = naiveBayesClassifier.predict(countVector.transform(data))
from sklearn import metrics
# print(metrics.classification_report(target, predicted))
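# Hedged sketch (not in the original script): the evaluation above predicts on
# the same data the classifier was fit on, so the numbers are optimistic. A
# held-out split gives a more honest estimate; `stopWords` is assumed to be the
# same list used above, and the split parameters are arbitrary.
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB

X_tr, X_te, y_tr, y_te = train_test_split(data, target, test_size=0.2, random_state=42)
pipe = Pipeline([
    ('vect', CountVectorizer(stop_words=stopWords, decode_error='ignore')),
    ('tf', TfidfTransformer(use_idf=False)),
    ('clf', MultinomialNB()),
])
pipe.fit(X_tr, y_tr)
print(metrics.classification_report(y_te, pipe.predict(X_te)))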
import numpy as np from numpy import * from numpy import arange from nltk.tokenize import word_tokenize from nltk.corpus import stopwords from sklearn import feature_extraction from sklearn.feature_extraction.text import TfidfTransformer from sklearn.feature_extraction.text import CountVectorizer from sklearn.feature_extraction.text import HashingVectorizer corpus = [] for line in open('text3000.txt', 'r').readlines(): line = line.split('\t') corpus.append(line[1].strip()) vectorizer = CountVectorizer() transformer = TfidfTransformer() tfidf = transformer.fit_transform(vectorizer.fit_transform(corpus)) word = vectorizer.get_feature_names() weight = tfidf.toarray() print type(weight) print len(weight) print len(weight[0]) size = 3000 length = 200 preciseAll = 0 # resName = "euc60.txt" # result = codecs.open(resName, 'w', 'utf-8') for i in xrange(0, len(weight)): d = dict() for j in xrange(0, len(weight)):
    # f.readline()
    # print(f)
    # np1 = np.loadtxt(f, dtype = 'str', delimiter = ',')
    # np1 = np.loadtxt(filecp, dtype = 'str', delimiter = ',', skiprows=5)
    df1 = pd.read_csv(filename, header=None, encoding="utf8")
    np1 = np.array(df1)
    data = np1[:, 0:-1]
    label = np1[:, -1]
    return data, label.astype('float32')


if __name__ == "__main__":
    data, label = read_data("../input/data_clean.csv")
    # stack questions and answers into one list so they share a single tf-idf vocabulary
    data_combine = np.hstack((data[:, 0], data[:, 1]))
    vectorizer = CountVectorizer()
    transformer = TfidfTransformer()
    label_up = np.hstack((label, label)).astype("float32")
    similarity = np.zeros((data.shape[0], 1))
    tfidf = transformer.fit_transform(vectorizer.fit_transform(data_combine))
    # first half of the rows are the questions, second half the paired answers
    question = csr_matrix(tfidf[0:data.shape[0]])
    answer = csr_matrix(tfidf[data.shape[0]:])
    # concatenate question and answer tf-idf features column-wise
    data_sparse = hstack([question, answer])
    plt.figure()
    for model in ['logistic', 'linear', 'GBDT', 'NN']:
        mse_train = []
        mse_test = []
        print(model)
        for i in range(1, 100):
            data_sparse_one = SelectKBest(chi2, k=i * 100).fit_transform(
                data_sparse, label)
            # print("process finished")
for i in range(len(data['body'])): if i in selected_indices: bodyCommentsTest.append(data['body'][i]) scoreTest.append(data['score'][i]) else: bodyCommentsTraining.append(data['body'][i]) scoreTraining.append(data['score'][i]) cv = CountVectorizer(stop_words='english', strip_accents='ascii', max_df=0.8, ngram_range=(1, 5)) #, ngram_range=(1, 2) bodyCommentsTraining = cv.fit_transform(bodyCommentsTraining) bodyCommentsTest = cv.transform(bodyCommentsTest) transformer = TfidfTransformer(norm='l2', use_idf=True, sublinear_tf=True) bodyCommentsTraining = transformer.fit_transform(bodyCommentsTraining) bodyCommentsTest = transformer.transform(bodyCommentsTest) y_pred = KMeans().fit_predict(bodyCommentsTraining) print y_pred values = set() for x in y_pred: values.add(x) temp = collections.Counter(y_pred) maxValue = 0
class BotServer:
    def __init__(self, file_path):
        """
        Initialize corpus, bag-of-words, and TFIDF from CSV file at argument file_path.
        """
        processing = Processing()

        # Read in FAQ data
        self.faq = pd.read_csv(file_path, keep_default_na=False)
        self.corpus = self.faq.question + ' ' + self.faq.answer

        # Create BOW transformer fitted on faq.question
        self.bow_transformer = CountVectorizer(
            analyzer=processing.text_process).fit(self.faq.question)

        # Transform faq.question itself into BOW
        self.corpus_bow = self.bow_transformer.transform(self.faq.question)

        # Create TFIDF transformer based on faq.question's BOW
        self.tfidf_transformer = TfidfTransformer().fit(self.corpus_bow)

        # Transform faq.question's BOW into TFIDF
        self.corpus_tfidf = self.tfidf_transformer.transform(self.corpus_bow)

        # Initialize search module
        encoder, decoder, decoder_n_layers, self.voc = buildModels()
        self.searcher = GreedySearchDecoder(encoder, decoder, decoder_n_layers)

        # Set upload folder and output records folder
        self.UPLOAD_FOLDER = '/app/records/in'
        self.REC_RES_FOLDER = '/app/records/out'
        # Set allowed extensions
        self.ALLOWED_EXTENSIONS = {'wav'}

    def tfidf_similarity(self, query):
        """
        Returns (index, similarity value) of string argument query's most similar
        match in the FAQ, determined by cosine similarity.
        """
        # Transform the query into BOW using the fitted BOW transformer
        query_bow = self.bow_transformer.transform([query])

        # Transform the query's BOW into TFIDF
        query_tfidf = self.tfidf_transformer.transform(query_bow)

        # Calculate cosine similarity and return the maximum value with its index
        similarities = np.transpose(
            cosine_similarity(query_tfidf, self.corpus_tfidf))
        max_similarity = similarities.max()
        max_index = np.argmax(similarities)
        return max_index, max_similarity

    def match_query(self, query):
        """
        Returns a response to the user query: the FAQ answer of the most similar
        question when the similarity exceeds 0.5, otherwise the reply generated
        by the seq2seq search module.
        """
        index, similarity = self.tfidf_similarity(query)
        if similarity > 0.5:
            response = self.faq.answer.iloc[index]
            print(similarity)
        else:
            query = normalizeString(query)
            output_words = evaluate(self.searcher, self.voc, query)
            output_words[:] = [
                x for x in output_words if not (x == 'EOS' or x == 'PAD')
            ]
            response = ' '.join(output_words)
        return response

    def allowed_file(self, filename):
        return '.' in filename and filename.rsplit(
            '.', 1)[1].lower() in self.ALLOWED_EXTENSIONS

    def get_duration(self, audio_name_only):
        fname = os.path.join(self.REC_RES_FOLDER, audio_name_only)
        audio = MP3(fname)
        return round(audio.info.length)

    def bot_dialog(self, request):
        """
        Given the argument POST request, parse it according to form data, and return
        a json response based on sklearn matching within the FAQ.
""" # Handle webhook request req = request.form msg_type = req.get('type') if msg_type == "Text": message = req.get('message') response_text = self.match_query(message) # Return json file as webhook response messages = [{ "type": "Text", "message": msg, "fromBot": True } for msg in response_text.split("\n\n")] elif msg_type == "Audio": respfilename = '' record = request.files['record'] if record and self.allowed_file(record.filename): filename = secure_filename(record.filename) record.save(os.path.join(self.UPLOAD_FOLDER, filename)) list_records = [] durations = [] try: r = spechrec.Recognizer() with spechrec.AudioFile( os.path.join(self.UPLOAD_FOLDER, filename)) as source: # listen for the data (load audio to memory) audio_data = r.record(source) # recognize (convert from speech to text) input_sentence = r.recognize_google(audio_data) #searcher = GreedySearchDecoder(encoder, decoder,decoder_n_layers) response_text = self.match_query(input_sentence) for msg in response_text.split("\n\n"): now = datetime.now() respfilename = now.strftime("%d-%m-%Y-%H:%M:%S") + ".mp3" engine = gTTS('' + response_text, lang='en') engine.save(os.path.join(self.REC_RES_FOLDER, respfilename)) list_records.append(respfilename) durations.append(self.get_duration(respfilename)) except: erreur = random.choice([ "Sorry, i did not understand you ,Please change the way you say it", "please be a little simple in your discussion i m not a human", "Sorry, get in mind that you are talking only with a computer " ]) print("" + erreur) now = datetime.now() respfilename = now.strftime("%d-%m-%Y-%H:%M:%S") + ".mp3" engine = gTTS('' + erreur, lang='en') engine.save(os.path.join(self.REC_RES_FOLDER, respfilename)) list_records.append(respfilename) durations.append(self.get_duration(respfilename)) #Return json file as webhook response messages = [{ "type": "Audio", "path": "https://coronafaqsbot.herokuapp.com/records/" + list_records[i], "isLocal": False, "duration": durations[i], "fromBot": True } for i in range(len(list_records))] return jsonify({"messages": messages})
# use all 25K words. Higher accuracy
movieVzer = CountVectorizer(min_df=2, tokenizer=nltk.word_tokenize)

# fit and transform using training text
docs_train_counts = movieVzer.fit_transform(docs_train)

# 'screen' is found in the corpus, mapped to index 2290
print(movieVzer.vocabulary_.get('screen'))

# Likewise, Mr. Steven Seagal is present...
print(movieVzer.vocabulary_.get('seagal'))

print(docs_train_counts.shape)

# Convert raw frequency counts into TF-IDF values
movieTfmer = TfidfTransformer()
docs_train_tfidf = movieTfmer.fit_transform(docs_train_counts)

# Using the fitted vectorizer and transformer, transform the test data
docs_test_counts = movieVzer.transform(docs_test)
docs_test_tfidf = movieTfmer.transform(docs_test_counts)

# Now ready to build a classifier.
# We will use Multinomial Naive Bayes as our model.

# Train a Multinomial Naive Bayes classifier. Again, we call it "fitting".
clf = MultinomialNB()
clf.fit(docs_train_tfidf, y_train)

# Predict the test set results, find accuracy
y_pred = clf.predict(docs_test_tfidf)
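# Hedged follow-up sketch (not in the original script): score a couple of
# made-up reviews with the fitted vectorizer, transformer and classifier from
# above. The review strings are illustrative only.
reviews_new = ['This movie was excellent, with great acting and a tight plot.',
               'A total waste of time, terrible pacing and wooden dialogue.']
reviews_new_counts = movieVzer.transform(reviews_new)
reviews_new_tfidf = movieTfmer.transform(reviews_new_counts)
print(clf.predict(reviews_new_tfidf))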
    def train(self, method = "xgboost"):
        # x_train, x_test, y_train, y_test = self.read_data()
        corpus, label, numclass = self.feature(feature=self.feature_)
        print "label: ", Counter(label)
        print "numclass: ", numclass
        print "------------------------------------------------"
        x_train, x_test, y_train, y_test = train_test_split(corpus, label, test_size=0.2)
        print "train num: ", len(y_train)
        print "test num: ", len(y_test)
        print "------------------------------------------------"
        vectorizer = CountVectorizer()
        tfidftransformer = TfidfTransformer()
        tfidf = tfidftransformer.fit_transform(vectorizer.fit_transform(x_train))
        weight = tfidf.toarray()
        print "weight.shape: ", weight.shape
        test_tfidf = tfidftransformer.transform(vectorizer.transform(x_test))
        test_weight = test_tfidf.toarray()
        print "test_weight.shape: ", test_weight.shape
        print "------------------------------------------------\n"

        # city count feature
        x_train_count = np.array([self.city_count_feature(x) for x in x_train])
        x_test_count = np.array([self.city_count_feature(x) for x in x_test])
        assert len(weight) == len(x_train_count)
        assert len(test_weight) == len(x_test_count)
        weight = np.concatenate((weight, x_train_count), axis=1)
        test_weight = np.concatenate((test_weight, x_test_count), axis=1)
        # for i in range(len(weight)):
        #     weight[i] = np.append(weight[i], x_train_count[i])
        #     test_weight[i] = np.append(test_weight[i], x_test_count[i])
        print "weight.shape: ", weight.shape
        print "test_weight.shape: ", test_weight.shape
        # sys.exit(0)

        print "---------------------- train --------------------------"
        xgbtrain = xgb.DMatrix(weight, label=y_train)
        xgbtest = xgb.DMatrix(test_weight, label=y_test)
        param = {'max_depth': 6, 'eta': 0.05, 'eval_metric': 'merror', 'silent': 1,
                 'objective': 'multi:softmax', 'num_class': numclass}  # xgboost parameters
        evallist = [(xgbtrain, 'train'), (xgbtest, 'test')]
        num_round = 100  # number of boosting rounds
        bst = xgb.train(param, xgbtrain, num_round, evallist)
        # save model
        bst.save_model("./../model/TC{}.model".format(self.feature_))
        preds = bst.predict(xgbtest)
        print "------------------------------------------------"
        count = 0
        for i in range(len(y_test)):
            if preds[i] == y_test[i]:
                count += 1
        print "accuracy: ", count * 1.0 / len(y_test)

        good_cases = []
        bad_cases = []
        valid_data = pd.read_csv(InputDataPath)
        label_mapping_convert = self.caseAnalysis()
        print "------------------ case analysis ------------------"
        for i in range(len(y_test)):
            # good case
            if y_test[i] == preds[i]:
                goodcase = {"case": "good"}
                index = 0
                while index < 399:
                    if valid_data["extract"][index].decode("utf-8") == x_test[i]:
                        break
                    index += 1
                goodcase["index"] = index
                for key, value in valid_data.iloc[index].items():
                    goodcase[key] = value
                goodcase["label"] = y_test[i]
                goodcase["predict"] = int(preds[i])
                goodcase["predict_text"] = label_mapping_convert[preds[i]]
                good_cases.append(goodcase)
            # bad case
            else:
                badcase = {"case": "bad"}
                index = 0
                while index < 399:
                    if valid_data["extract"][index].decode("utf-8") == x_test[i]:
                        break
                    index += 1
                badcase["index"] = index
                for key, value in valid_data.iloc[index].items():
                    badcase[key] = value
                badcase["label"] = y_test[i]
                badcase["predict"] = int(preds[i])
                badcase["predict_text"] = label_mapping_convert[preds[i]]
                bad_cases.append(badcase)
        print "------------------ good case ------------------"
        print json.dumps(good_cases, ensure_ascii=False, encoding="utf-8", indent=4)
        print "------------------ bad case ------------------"
        print json.dumps(bad_cases, ensure_ascii=False, encoding="utf-8", indent=4)
        print "------------------------------------------------"
        print "accuracy: ", count * 1.0 / len(y_test)
def test_tfidf_no_smoothing():
    X = [[1, 1, 1], [1, 1, 0], [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=False, norm='l2')
    tfidf = tr.fit_transform(X).toarray()
    assert_true((tfidf >= 0).all())

    # check normalization
    assert_array_almost_equal((tfidf ** 2).sum(axis=1), [1., 1., 1.])

    # the lack of smoothing makes IDF fragile in the presence of a feature with
    # only zeros
    X = [[1, 1, 0], [1, 1, 0], [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=False, norm='l2')

    # First we need to verify that numpy here provides div 0 warnings
    with warnings.catch_warnings(record=True) as w:
        1. / np.array([0.])
        numpy_provides_div0_warning = len(w) == 1

    with warnings.catch_warnings(record=True) as w:
        tfidf = tr.fit_transform(X).toarray()
        if not numpy_provides_div0_warning:
            raise SkipTest("Numpy does not provide div 0 warnings.")
        assert_equal(len(w), 1)
        # For Python 3 compatibility
        if hasattr(w[0].message, 'args'):
            assert_true("divide by zero" in w[0].message.args[0])
        else:
            assert_true("divide by zero" in w[0].message)
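# Hedged companion sketch (not part of the test above): compare the learned
# idf_ values with and without smoothing on a small count matrix, to make the
# effect of smooth_idf concrete.
import numpy as np
from sklearn.feature_extraction.text import TfidfTransformer

X_demo = [[1, 1, 1], [1, 1, 0], [1, 0, 0]]
idf_smooth = TfidfTransformer(smooth_idf=True).fit(X_demo).idf_
idf_raw = TfidfTransformer(smooth_idf=False).fit(X_demo).idf_
print(np.round(idf_smooth, 3), np.round(idf_raw, 3))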