def _mnb(t, min_freq, save=False):
    if save:
        clf = mnb().fit(records, labels)
        save_classifier(clf, t, 'mnb', min_freq)
        return ('mnb', clf)
    else:
        clf = load_classifier(t, 'mnb', min_freq)
        return ('mnb', clf)
def CrossValidate(X, Y, IDX, cl, **kwargs):
    '''
    Input
        X   - (N, K) array of features
        Y   - (N, 1) array of labels
        IDX - (N, 1) array of keys
        cl  - string naming the classifier method: 'svm', 'mnb', 'knn', etc.
        kwargs is passed to ShuffleSplit and should look like
        {'n_iter': 5, 'test_size': 0.80, 'random_state': 0}
    Returns:
        Predictions = dict(Key: log likelihood of label[0])
    '''
    print 'running cross-val'
    from sklearn import cross_validation as cv
    from sklearn import svm
    from sklearn.naive_bayes import MultinomialNB as mnb
    from sklearn import neighbors
    # prediction = cv.cross_val_predict(clf, X, Y, **kwargs)
    # cross_val_predict only exists in newer versions of sklearn.
    Res = {}
    print kwargs
    splits = cv.ShuffleSplit(X.shape[0], **kwargs)  # n_iter=5, test_size=0.80, random_state=0
    print 'running ', len(splits), ' splits in cross-validation'
    for trainidx, testidx in splits:
        # skip folds whose training set contains only one class
        if len(set(Y[trainidx])) == 1:
            continue
        trainL = Y[trainidx]
        trainT = X[trainidx]
        testL = Y[testidx]
        testT = X[testidx]
        testIDX = IDX[testidx]
        if cl == 'knn':
            cl = neighbors.KNeighborsClassifier()
        if cl == 'svm':
            cl = svm.SVC(C=1, kernel='linear', probability=True)
        if cl == 'mnb':
            cl = mnb()
        cl.fit(trainT, trainL)
        print 'accuracy of nth fold is ', cl.score(testT, testL)
        preds = cl.predict_proba(testT)
        # clip zero probabilities so the log-odds below stay finite
        if 0 in preds:
            for i, p in enumerate(preds):
                if p[0] == 0:
                    preds[i][0] = .01
                    preds[i][1] = .99
                if p[1] == 0:
                    preds[i][0] = .99
                    preds[i][1] = .01
        female = [np.log(p[0] / p[1]) for p in preds]
        res = dict(zip(testIDX, female))
        for k, v in res.iteritems():
            if k in Res:
                Res[k].append(v)
            else:
                Res[k] = [v]
    return Res
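# Usage sketch (illustrative, not from the original pipeline): CrossValidate
# returns {key: [per-fold log-odds]}, so a caller typically averages the list
# for each ID. Assumes the pre-0.18 sklearn.cross_validation API imported
# above; X, Y, and IDX here are toy stand-ins.
import numpy as np

X = np.random.randint(0, 5, size=(100, 20))    # MNB needs non-negative counts
Y = np.random.choice(['F', 'M'], size=100)     # 'F' sorts first, so p[0] is P(female)
IDX = np.arange(100)

CVargs = {'n_iter': 5, 'test_size': 0.80, 'random_state': 0}
Preds = CrossValidate(X, Y, IDX, 'mnb', **CVargs)

# collapse the per-fold log-odds into one score per ID
mean_logodds = dict((k, float(np.mean(v))) for k, v in Preds.items())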
def predictByMNB(features, classes, test):
    ## Why does MNB require non-negative features?
    ## MNB uses a multinomial distribution to model P(x_i|Y_j), the distribution
    ## of the i-th feature given class Y_j. Every value of a multinomial
    ## distribution must be >= 0, so MNB requires the same of its features.
    if features.min() < 0:
        raise ValueError("Features must be greater than or equal to 0 when using multinomial naive Bayes!")
    clf = mnb()
    clf.fit(features, classes)
    return clf.predict(test)
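# If a feature matrix does contain negative values, one common workaround is to
# rescale it into [0, 1] before calling predictByMNB. A minimal sketch using
# scikit-learn's MinMaxScaler; the arrays here are illustrative only.
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.naive_bayes import MultinomialNB as mnb

features = np.array([[-1.0, 2.0], [0.5, -3.0], [1.0, 1.0]])
classes = np.array([0, 1, 0])
test = np.array([[0.0, 0.5]])

# fit the scaler on training data only, then reuse it on the test rows
scaler = MinMaxScaler().fit(features)
print(predictByMNB(scaler.transform(features), classes, scaler.transform(test)))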
def multinomialNaiveBayesTrain(trainQuestions, tag, X, y, mnbd):
    clf = mnb()
    i = 0
    # build a binary label vector: 1 if the question carries this tag
    for qid in trainQuestions:
        if tag in trainQuestions[qid].tags:
            y[i] = 1
        else:
            y[i] = 0
        i += 1
    clf.fit(X, y)
    mnbd[tag] = clf
def RFETopWords(X, Y, n=20, clf=''):
    if clf == 'knn':
        # note: RFE needs an estimator exposing coef_ or feature_importances_,
        # so recursive elimination will fail with KNeighborsClassifier
        cl = neighbors.KNeighborsClassifier()
    if clf == 'svm':
        cl = svm.LinearSVC()
    if clf == 'mnb':
        cl = mnb()
    selector = RFE(cl, n, step=.05)
    selector = selector.fit(X, Y)
    # support_ is boolean, so a stable argsort puts the selected features last
    tops = np.argsort(selector.support_)[-n:]
    # words = [vectorizer.get_feature_names()[i] for i in tops]
    return selector, tops
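# Usage sketch for RFETopWords, recovering the selected vocabulary the way the
# commented-out line suggests. Toy documents; assumes svm, RFE, mnb, and np are
# already imported at module level as the function requires, and an older
# scikit-learn where RFE takes positional arguments and CountVectorizer still
# has get_feature_names().
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

docs = ['spam spam eggs', 'ham eggs toast', 'spam toast', 'ham ham eggs']
labels = np.array([1, 0, 1, 0])

vectorizer = CountVectorizer()
X = vectorizer.fit_transform(docs).toarray()

selector, tops = RFETopWords(X, labels, n=2, clf='svm')
words = [vectorizer.get_feature_names()[i] for i in tops]
print(words)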
def Classify(trainT, trainL, clf='knn'):
    '''Code to train and test classifiers.
    clf can be 'knn', 'mnb' or 'svm'
    returns the fitted classifier
    # a dictionary of {twitterID: likelihood ratio}'''
    from sklearn import svm
    from sklearn.naive_bayes import MultinomialNB as mnb
    from sklearn import neighbors
    print 'Running Classifier ' + clf
    if clf == 'knn':
        cl = neighbors.KNeighborsClassifier()
        cl.fit(trainT, trainL)
    if clf == 'svm':
        cl = svm.SVC(C=100, gamma=.1, probability=True)
        cl.fit(trainT, trainL)
    if clf == 'mnb':
        cl = mnb()
        cl.fit(trainT, trainL)
    return cl
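# Usage sketch (toy count data, not the original Twitter features): Classify
# returns the fitted estimator, so scoring and prediction happen at the caller.
import numpy as np

trainT = np.random.randint(0, 10, size=(50, 8))  # non-negative, so 'mnb' is valid
trainL = np.random.choice([0, 1], size=50)

cl = Classify(trainT, trainL, clf='mnb')
print(cl.predict(trainT[:5]))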
def naive_bayes_mnb(x_train, y_train, x_test, y_test):
    model = mnb()
    model.fit(x_train, y_train)
    expected = y_test
    predicted = model.predict(x_test)
    return expected, predicted
                       oob_score=False, random_state=13, verbose=0, warm_start=False)
model0.fit(train_x, train_y)
predicted = model0.predict(test_x)
model0.score(test_x, test_y)

# Just a little bit better...

# ## Bayes (model 1)

# In[34]:

from sklearn.naive_bayes import MultinomialNB as mnb

model1 = mnb()
model1.fit(train_x, train_y)

# In[35]:

predicted = model1.predict(test_x)
model1.score(test_x, test_y)

# ## SVM (model 2)

# In[36]:

from sklearn import svm
from sklearn.model_selection import GridSearchCV as gs

# In[37]:
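# The notebook fragment stops at In[37] before the grid search itself; a
# hypothetical follow-on cell using the gs and svm aliases imported above
# (the parameter grid is illustrative, not the notebook's actual values):

param_grid = {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
model2 = gs(svm.SVC(), param_grid, cv=5)
model2.fit(train_x, train_y)
model2.score(test_x, test_y)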
fig.autofmt_xdate()

from sklearn.metrics import *

y_dum = np.ones(len(data['rumorType'].values))
score1 = accuracy_score(y_dum, data['rumorType'])

from sklearn.feature_extraction.text import TfidfVectorizer

zhTokenizer = jieba.cut
v = TfidfVectorizer(token_pattern=r'(?u)\b\w+\b', tokenizer=zhTokenizer,
                    lowercase=False, stop_words=['是', '的'], max_features=250)
y = data['rumorType']
X_txt = data.drop(['rumorType', 'crawlTime', 'mainSummary'], axis=1)

from sklearn.model_selection import train_test_split, cross_val_score, cross_validate

X_tr, X_te, y_tr, y_te = train_test_split(X_txt, y, test_size=0.2, stratify=y)

# Convert X_train
X_tr_v = v.fit_transform(X_tr['title'])

from sklearn.naive_bayes import MultinomialNB as mnb

model_bl = mnb()
model_bl.fit(X_tr_v, y_tr.values)
X_te_v = v.transform(X_te['title'])
y_pred = model_bl.predict(X_te_v)
score2 = accuracy_score(y_pred, y_te)
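# The train_test_split import above also pulls in cross_val_score, which this
# fragment never calls; an illustrative use of it on the vectorized training
# titles (5-fold accuracy of the same MNB baseline):
cv_scores = cross_val_score(mnb(), X_tr_v, y_tr.values, cv=5, scoring='accuracy')
print(cv_scores.mean())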
    model.fit(X, y)
    X_test = [row[:-1] for row in test_data]
    y_real = [row[-1] for row in test_data]
    y_pred = model.predict(X_test)
    print report(y_real, y_pred)
    tp = lambda x: 1 if x == 'spam' else 0
    real = [tp(v) for v in y_real]
    pred = [tp(v) for v in y_pred]
    print mean_absolute_error(real, pred)
    print mean_squared_error(real, pred)


if __name__ == '__main__':
    if len(sys.argv) > 2:
        train_fpath, test_fpath = sys.argv[1:]
        train_data = import_csv(train_fpath)
        test_data = import_csv(test_fpath)
        ''' DECISION TREE '''
        cf = dtc(criterion='gini', max_depth=50)
        classify(cf, train_data, test_data, 'decision_tree')
        ''' NEAREST NEIGHBORS '''
        cf = knc(n_neighbors=1, metric='hamming')
        classify(cf, train_data, test_data, 'knearest_neighbors')
        ''' NAIVE BAYES '''
        cf = mnb(alpha=100.0)
        classify(cf, train_data, test_data, 'naive_bayes')
    else:
        print "Usage python %s [train_csv_file] [test_csv_file]" % sys.argv[0]
# predict classes
dt_predictions_tuned = pd.Series(dt_estimator_tuned.predict(post2000_exp))

# cross predicted vs actual
post2000_res.index = dt_predictions_tuned.index
dt_crosstab_tuned = pd.crosstab(
    post2000_res, dt_predictions_tuned,
    rownames=["Actual"], colnames=["Predicted"], margins=True
)
print dt_crosstab_tuned

# BUILD NAIVE BAYES MODEL (UNSCALED DATA)------------------------------------------------

# run model
nb = mnb()

# conduct recursive feature search
nb_rfe_cv = rfe(estimator=nb, step=1, cv=10, scoring="roc_auc", verbose=1)
nb_rfe_cv.fit(pre2000_exp, pre2000_res)

# identify and plot optimal number of features (d = 50). ROC_AUC=0.6391
print nb_rfe_cv.n_features_
print nb_rfe_cv.grid_scores_.max()
plt.figure()
plt.xlabel("NB: Number of Features selected")
plt.ylabel("NB: Cross Validation Score (ROC_AUC)")
plt.plot(range(1, len(nb_rfe_cv.grid_scores_) + 1), nb_rfe_cv.grid_scores_)
plt.show()
ftr2_ext_dat = filter_dataset_2(ext_dat[0], ext_dat[1], ext_dat[2], ext_dat[3], ext_dat[4])
sorted_sessions_ftr2 = construct_customer_view(ftr2_ext_dat)

ftr3_ext_dat = filter_dataset_3(ext_dat[0], ext_dat[1], ext_dat[2], ext_dat[3], ext_dat[4])
sorted_sessions_lng = construct_customer_view(ftr3_ext_dat)

# Classifier Classes
clf1 = svm.LinearSVC(C=0.05, penalty='l2')
clf2 = lm.ElasticNetCV(l1_ratio=0.3, n_jobs=1)
clf3 = lm.LogisticRegression(penalty='l1')
clf4 = lm.SGDClassifier(loss='hinge', n_jobs=1, n_iter=100, penalty='elasticnet')
clf5 = svm.SVC(C=4.0, kernel='rbf', degree=3, probability=True)
bclf1 = dtree(max_depth=10)
bclf2 = svm.SVC(C=4.0, kernel='rbf', degree=3, probability=True)
bclf3 = mnb(alpha=1.0, fit_prior=True, class_prior=None)
#clf6 = ensmbl.AdaBoostClassifier(base_estimator=bclf1, n_estimators=100, learning_rate=1.0)
clf7 = ensmbl.RandomForestClassifier(n_estimators=10, criterion='gini')
clf8 = gs.GridSearchCV(svm.LinearSVC(penalty='l2'), {'C': [0.005, 0.01, 0.05, 0.1, 0.2, 0.3]}, cv=3)
clf9 = gs.GridSearchCV(svm.LinearSVC(penalty='l2'), {'C': [0.005, 0.01, 0.05, 0.1, 0.2, 0.3]}, cv=3)
clf10 = gs.RandomizedSearchCV(svm.LinearSVC(penalty='l2'), {'C': [0.0001, 0.001, 0.01, 0.1, 0.25]}, cv=3)
clf11 = gs.RandomizedSearchCV(svm.LinearSVC(penalty='l2'), {'C': [1.0, 2.0, 5.0, 10.0, 20.0]}, cv=3)
#clf9 = gs.GridSearchCV(svm.SVC(kernel='poly', degree='3'), {'C': [0.005, 0.01, 0.05, 0.1, 0.2, 0.3]}, cv=3)
#clf9 = gs.GridSearchCV(svm.SVC(), {'C': [0.3, 0.5, 1.0, 2.0, 3.0]}, cv=3)
#clf9 = gs.GridSearchCV(lm.SGDClassifier(penalty='elasticnet', loss='log', n_iter=1000, n_jobs=-1, shuffle=True), {'l1_ratio': [0.1, 0.5, 0.7, 0.9]}, cv=3)
clf_ls = [gs.GridSearchCV(svm.LinearSVC(penalty='l2'), {'C': [0.005, 0.01, 0.05, 0.1, 0.2, 0.3]}, cv=3)
          for it in range(0, 4)]

# Feature Selection Classes
fs1 = fs.SelectKBest(chi2, k=100)
fs2 = fs.RFECV(clf1, step=1000, cv=5)
fs3 = fs.RFE(clf1)
def hybridTrial(metadata):
    '''
    This code takes the two feature sets above and tests whether they change
    their collective and individual predictability
    Raw * Raw Topics = ? * .72 = .71 (No change in nb score)
    Subtopics * raw topics = .65 * .72 = .69
    '''
    print 'import raw topic scores'
    filename = 'Twitter/Data/Raw_Topic_Scores.csv'
    data = ImportCSVFeatureData(filename, -1)
    print 'drawing samples'
    vec = np.array([[float(l) for l in line[1:]] for line in data])
    # exclude cases where sex is unknown
    labels = np.array([metadata[line[0]][0] for line in data])  # if 'age' not in line])
    IDX = np.array([line[0] for line in data])
    print 'CV for RAW TOPICS'
    CVargs = {'n_iter': 3, 'test_size': .9, 'random_state': 0}
    cl = mnb()
    #Preds = Classifiers.CrossValidate(vec, labels, IDX, cl, **CVargs)
    print 'importing subtopic scores'
    path = 'Twitter/Data/'
    preds = {}
    Data = []
    for cat in set([line[2] for line in metadata.values()]):
        if cat == 'category' or cat == 'party':
            continue
        print 'RUNNING ', cat, ' SUBTOPIC SCORES'
        f = 'Twitter_' + cat + '_Topic_Scores.csv'
        data = ImportCSVFeatureData(path + f, -1)
        Data.append(data)
        #for line in data:
        #    for idx in IDX:
        #        if line[0] == idx:
        #            rvec.append(line)
        #            break
        #vec = np.array([[float(l) for l in line[1:]] for line in data])
        ## exclude cases where sex is unknown
        #labels = np.array([metadata[line[0]][0] for line in data])  # if 'age' not in line])
        #IDX = np.array([line[0] for line in data])
    print 'resorting cases to align with labels'
    rvec = [[] for i in IDX]
    #rlabels = []
    for data in Data:
        for i, idx in enumerate(IDX):
            if idx in [line[0] for line in data]:
                for line in data:
                    if idx == line[0]:
                        rvec[i] += line[1:]
                        break  # each id appears once; stop at the first match
            else:
                # pad with zeros to stay aligned with the RAW features
                rvec[i] += [0 for _ in data[0][1:]]
                #for idx in IDX:
                #    if line[0] == idx:
                #        rvec.append(line[1:])
                #        break
                #rlabels.append(metadata[str(int(idx))][0])
    rvec = np.append(vec, np.array(rvec), axis=1)
    print 'crossvalidate testing COMBINATION'
    CVargs = {'n_iter': 3, 'test_size': .9, 'random_state': 0}
    cl = mnb()
    Preds = Classifiers.CrossValidate(rvec, labels, IDX, cl, **CVargs)  # use the combined matrix built above
    CVargs = {'n_iter': 3, 'test_size': .9, 'random_state': 0}
    cl = ensemble.AdaBoostClassifier(n_estimators=10)
    Preds = Classifiers.CrossValidate(rvec, labels, IDX, cl, **CVargs)
    return
'''
# Vectorize the text with TfidfVectorizer
zhTokenizer = jieba.cut
v = TfidfVectorizer(token_pattern=r"(?u)\b\w+\b", tokenizer=zhTokenizer,
                    lowercase=False, stop_words=['是', '的'], max_features=250)
y = data['rumorType']

# Join each rumor's summary and title into one text field
x_txt = data[['mainSummary', 'title']].apply(lambda x: ' '.join(x), axis=1)

# Split into training and test sets
x_tr, x_te, y_tr, y_te = train_test_split(x_txt, y, test_size=0.2, stratify=y)

# Build and train the model
x_tr_v = v.fit_transform(x_tr)
model_bl = mnb()
model_bl.fit(x_tr_v, y_tr.values)

x_te_v = v.transform(x_te)
y_pred = model_bl.predict(x_te_v)

# accuracy_score: the fraction of samples classified correctly
print("Multinomial Naive Bayes accuracy:", accuracy_score(y_te, y_pred))
# note: for single-label classification, micro-averaged precision,
# recall, and F1 all equal the accuracy
print("Multinomial Naive Bayes precision:", precision_score(y_te, y_pred, average="micro"))
print("Multinomial Naive Bayes recall:", recall_score(y_te, y_pred, average="micro"))
print("Multinomial Naive Bayes F1:", f1_score(y_te, y_pred, average="micro"))
print("Multinomial Naive Bayes Cohen's Kappa:", cohen_kappa_score(y_te, y_pred))
print("Multinomial Naive Bayes classification report:", '\n', classification_report(y_te, y_pred))
def main():
    priors = [.0369, .0162, .012, .0103, .0133, .0126, .0172, .0133, .5214,
              .0068, .1756, .0104, .1218, .0191, .013]

    ##########################
    ####System Counts#########
    ##########################
    # define global set for creating data frames
    # test_tree_list, test_classes, test_ids = extract_tree("test")
    # globalSetTest = set()
    # dictListTest = list()
    # for tree in test_tree_list:
    #     dictListTest.append(perSysCallCount(tree, globalSetTest))
    # train_tree_list, train_classes, train_ids = extract_tree("train")
    # dictListTrain = list()
    # for tree in train_tree_list:
    #     dictListTrain.append(perSysCallCount(tree, globalSetTest))
    # newPerSysCallCountFile(dictListTest, test_classes, test_ids, "perSysCountsTest.csv", globalSetTest)
    # newCountFile(test_tree_list, test_classes, test_ids, "choppyTest.csv")
    # del test_tree_list, test_classes, dictListTest, test_ids
    # newPerSysCallCountFile(dictListTrain, train_classes, train_ids, "perSysCountsTrain.csv", globalSetTest)
    # newCountFile(train_tree_list, train_classes, train_ids, "choppyTrain.csv")
    # del train_tree_list, train_classes, train_ids, dictListTrain

    ###############################################
    #######Per-Tree, Per-System Call Counts########
    ###############################################
    """ Read in train and test as Pandas DataFrames """
    # df_train = pd.read_csv("choppyTrain.csv")
    # df_test = pd.read_csv("choppyTest.csv")
    df_train = pd.read_csv("perSysCountsTrain.csv")
    df_test = pd.read_csv("perSysCountsTest.csv")

    # store class values
    Y_train = df_train.Class.values
    testID = df_test.Id.values

    # row where testing examples start
    test_idx = df_train.shape[0]
    df_all = pd.concat((df_train, df_test), axis=0)
    del df_train
    del df_test

    df_all = df_all.drop(['Id'], axis=1)
    df_all = df_all.drop(['Class'], axis=1)
    vals = df_all.values
    del df_all
    X_train = vals[:test_idx]
    X_test = vals[test_idx:]
    del vals

    # clf = bnb(class_prior=priors)
    # clf.fit(X_train, Y_train)
    clf = mnb(class_prior=priors)
    clf.fit(X_train, Y_train)
    del X_train
    del Y_train

    # bnb_predict = clf.predict(X_test)
    mnb_predict = clf.predict(X_test)
    # util.write_predictions(bnb_predict, test_ids, "ChoppySingleBNB.csv")
    util.write_predictions(mnb_predict, testID, "PerSysCallCountsBNB.csv")
import sys
import sklearn
from classifier_utils import *
from sklearn.naive_bayes import MultinomialNB as mnb

if __name__ == '__main__':
    if len(sys.argv) > 2:
        infilepath, alp = sys.argv[1:]
        data = import_csv(infilepath)
        cf = mnb(alpha=float(alp))
        stats = cross_validation(data, cf)
        print "PARAMS: alpha=%s" % alp
        print_stats(stats)
    else:
        print "Usage python %s [csv_file] [alpha]" % sys.argv[0]