def createNaiveBayesModel(feature_vector_data):
    """Build one Gaussian naive Bayes model per word type.

    Uses the dimensionally reduced feature vectors of each of the
    (instance, sense id) pairs to create a naive Bayes model.

    :param feature_vector_data: dict mapping word_type -> dict whose keys are
        tuples carrying the sense id at index 1 and the data-split tag at
        index 2, and whose values are the feature vectors.
    :return: dict mapping word_type -> fitted GaussianNB instance.
    """
    naive_bayes_model_word_type = {}
    for word_type, instance_sense_dict in feature_vector_data.iteritems():
        vectors = []
        senses = []
        # Iterate key/value pairs directly instead of the original's repeated
        # indexed .keys()/.values() calls, which rebuilt both lists on every
        # iteration (O(n^2) overall).
        for key, vector in instance_sense_dict.iteritems():
            sense = key[1]
            data_type = key[2]
            # Only the training split is used to fit the model; the
            # validation data is ignored here.
            if data_type == "training":
                vectors.append(vector)
                senses.append(sense)
        nb = GaussianNB()
        nb.fit(np.array(vectors), np.array(senses))
        naive_bayes_model_word_type[word_type] = nb
    return naive_bayes_model_word_type
def test_string_labels_refit_false():
    """EnsembleVoteClassifier with refit=False must accept string labels."""
    np.random.seed(123)
    base_estimators = [LogisticRegression(),
                       RandomForestClassifier(),
                       GaussianNB()]
    # Replace the numeric targets with three string classes.
    y_str = y.copy().astype(str)
    y_str[:50] = 'a'
    y_str[50:100] = 'b'
    y_str[100:150] = 'c'
    # Pre-fit every base estimator; refit=False reuses them as-is.
    for estimator in base_estimators:
        estimator.fit(X, y_str)
    # Both voting modes should reach the same reference accuracy.
    for vote_mode in ('hard', 'soft'):
        eclf = EnsembleVoteClassifier(clfs=base_estimators,
                                      voting=vote_mode,
                                      refit=False)
        eclf.fit(X, y_str)
        assert round(eclf.score(X, y_str), 2) == 0.97
def NBAccuracy(features_train, labels_train, features_test, labels_test):
    """Fit a Gaussian naive Bayes classifier and return its test accuracy.

    :param features_train: training feature vectors
    :param labels_train: training labels
    :param features_test: test feature vectors
    :param labels_test: test labels
    :return: fraction of test samples predicted correctly (float in [0, 1])
    """
    ### import the sklearn module for GaussianNB
    from sklearn.naive_bayes import GaussianNB

    ### create classifier and fit it on the training features and labels
    clf = GaussianNB()
    clf.fit(features_train, labels_train)

    ### use the trained classifier to predict labels for the test features
    pred = clf.predict(features_test)

    # Count matches directly instead of materialising an intermediate list
    # of the matching pairs (the original built one via zip + comprehension).
    matched = sum(1 for p, t in zip(pred, labels_test) if p == t)
    total = len(labels_test)
    accuracy = float(matched) / float(total)
    return accuracy
def selectKBest(previous_result, data):
    """Incrementally evaluate GaussianNB over the top-k ranked features.

    :param previous_result: ranked list of (feature_name, ...) tuples; the
        entries for 'restricted_stock_deferred' and 'director_fees' are
        removed below.
    :param data: unused -- it is immediately shadowed by the featureFormat()
        call inside the loop.  NOTE(review): confirm whether the parameter
        can be dropped at the call sites.
    :return: list of (k, score0, score1, score2) tuples, one per k in 1..10.
    """
    # remove 'restricted_stock_deferred' and 'director_fees'
    # (two pops at index 4 delete consecutive ranked entries)
    previous_result.pop(4)
    previous_result.pop(4)
    result = []
    _k = 10
    for k in range(0, _k):
        # feature list = target ('poi') plus the k+1 best-ranked features
        feature_list = ['poi']
        for n in range(0, k + 1):
            feature_list.append(previous_result[n][0])
        data = featureFormat(my_dataset, feature_list, sort_keys = True, remove_all_zeroes = False)
        labels, features = targetFeatureSplit(data)
        # NOTE(review): abs() applied per row -- assumes each row supports
        # abs() (e.g. a numpy array, applied elementwise); confirm.
        features = [abs(x) for x in features]
        from sklearn.cross_validation import StratifiedShuffleSplit
        cv = StratifiedShuffleSplit(labels, 1000, random_state = 42)
        features_train = []
        features_test = []
        labels_train = []
        labels_test = []
        # pool the train/test rows of all 1000 stratified splits
        for train_idx, test_idx in cv:
            for ii in train_idx:
                features_train.append( features[ii] )
                labels_train.append( labels[ii] )
            for jj in test_idx:
                features_test.append( features[jj] )
                labels_test.append( labels[jj] )
        from sklearn.naive_bayes import GaussianNB
        clf = GaussianNB()
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        # score_func is expected to return an indexable score tuple
        score = score_func(labels_test, predictions)
        result.append((k + 1, score[0], score[1], score[2]))
    return result
def scikitNBClassfier(self):
    """Fit sklearn's GaussianNB on a bag-of-words matrix built from the
    loaded corpus, classify one hard-coded sample text, then report the
    training-set error."""
    dataMat, labels = self.loadProcessedData()
    bayesian = Bayesian()
    myVocabList = bayesian.createVocabList(dataMat)
    # Build the bag-of-words matrix, one row per document.
    trainMat = []
    for postinDoc in dataMat:
        trainMat.append(bayesian.setOfWords2Vec(myVocabList, postinDoc))
    from sklearn.naive_bayes import GaussianNB
    gnb = GaussianNB()
    X = array(trainMat)
    y = labels
    # Sample text (military/sports themed) converted to the same vocabulary
    # vector as the training data.
    testText = "美国军队的军舰今天访问了巴西港口城市,并首次展示了核潜艇攻击能力,飞机,监听。他们表演了足球。"
    testEntry = self.testEntryProcess(testText)
    bayesian = Bayesian()
    thisDoc = array(bayesian.setOfWords2Vec(myVocabList, testEntry))
    # Fit and predict the single test document.
    # NOTE(review): this prediction is overwritten two lines below and never
    # used -- confirm whether it should be printed or returned.
    y_pred = gnb.fit(X, y).predict(thisDoc)
    clabels = ['军事', '体育']
    # Refit on identical data and predict the training set itself.
    y_pred = gnb.fit(X, y).predict(X)
    print("Number of mislabeled points : %d" % (labels != y_pred).sum())
def test_predict_on_toy_problem():
    """Manually check predicted class labels for toy dataset."""
    clf1 = LogisticRegression(random_state=123)
    clf2 = RandomForestClassifier(random_state=123)
    clf3 = GaussianNB()
    X = np.array([[-1.1, -1.5], [-1.2, -1.4], [-3.4, -2.2],
                  [1.1, 1.2], [2.1, 1.4], [3.1, 2.3]])
    y = np.array([1, 1, 1, 2, 2, 2])
    expected = [1, 1, 1, 2, 2, 2]

    # The original asserted all(pred) == all(expected), which only compares
    # the truthiness of the two sides (True == True) and can never fail.
    # Compare the actual label sequences instead.
    assert_equal(list(clf1.fit(X, y).predict(X)), expected)
    assert_equal(list(clf2.fit(X, y).predict(X)), expected)
    assert_equal(list(clf3.fit(X, y).predict(X)), expected)

    eclf = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
        voting='hard', weights=[1, 1, 1])
    assert_equal(list(eclf.fit(X, y).predict(X)), expected)

    eclf = VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('gnb', clf3)],
        voting='soft', weights=[1, 1, 1])
    assert_equal(list(eclf.fit(X, y).predict(X)), expected)
def test_gnb_sample_weight():
    """Test whether sample weights are properly used in GNB."""
    # Unit sample weights must leave the fitted parameters unchanged.
    unweighted = GaussianNB().fit(X, y)
    weighted = GaussianNB().fit(X, y, np.ones(6))
    assert_array_almost_equal(unweighted.theta_, weighted.theta_)
    assert_array_almost_equal(unweighted.sigma_, weighted.sigma_)

    # Two partial fits with half the weights must match one full-weight fit.
    sw = rng.rand(y.shape[0])
    clf1 = GaussianNB().fit(X, y, sample_weight=sw)
    clf2 = GaussianNB().partial_fit(X, y, classes=[1, 2], sample_weight=sw / 2)
    clf2.partial_fit(X, y, sample_weight=sw / 2)
    assert_array_almost_equal(clf1.theta_, clf2.theta_)
    assert_array_almost_equal(clf1.sigma_, clf2.sigma_)

    # Duplicated rows and correspondingly increased sample weights must
    # yield the same parameters.
    ind = rng.randint(0, X.shape[0], 20)
    sample_weight = np.bincount(ind, minlength=X.shape[0])
    clf_dupl = GaussianNB().fit(X[ind], y[ind])
    clf_sw = GaussianNB().fit(X, y, sample_weight)
    assert_array_almost_equal(clf_dupl.theta_, clf_sw.theta_)
    assert_array_almost_equal(clf_dupl.sigma_, clf_sw.sigma_)
def NBAccuracy(features_train, labels_train, features_test, labels_test):
    """Fit a Gaussian naive Bayes classifier and return its test accuracy.

    :param features_train: training feature vectors
    :param labels_train: training labels
    :param features_test: test feature vectors
    :param labels_test: test labels
    :return: accuracy of the predictions on the test set
    """
    ### import the sklearn module for GaussianNB
    from sklearn.naive_bayes import GaussianNB
    from sklearn.metrics import accuracy_score

    ### create classifier and fit it on the training features and labels
    clf = GaussianNB()
    clf.fit(features_train, labels_train)

    ### use the trained classifier to predict labels for the test features
    pred = clf.predict(features_test)

    # The original computed the accuracy by hand and then immediately
    # overwrote the result with accuracy_score; keep only the latter.
    accuracy = accuracy_score(labels_test, pred)
    return accuracy
def categorize(train_data, test_data, train_class, n_features):
    """Fit GaussianNB on the training data and return predictions for the
    test data.

    :param train_data: training feature matrix
    :param test_data: test feature matrix
    :param train_class: training labels
    :param n_features: unused; kept for interface compatibility (it fed the
        commented-out SelectKBest / RFE experiments)
    :return: predicted labels for test_data
    """
    # The original constructed many unused estimators (LogisticRegression,
    # SVC variants, DecisionTree, RandomForest, AdaBoost, ...) left over
    # from experiments; only the Gaussian naive Bayes model was ever fitted
    # and used, so the dead constructions are removed.
    gnb = GaussianNB()  # 1
    gnb.fit(train_data, train_class)
    return gnb.predict(test_data)
class GaussianColorClassifier(ContourClassifier):
    '''
    A contour classifier which classifies a contour based on it's mean color
    in BGR, HSV, and LAB colorspaces, using a Gaussian classifier for these
    features.

    For more usage info, see class ContourClassifier
    '''
    # Feature names: mean B, G, R (BGR) + H, S, V (HSV) + L, A, B (LAB);
    # the two 'B' entries are different channels (blue vs LAB b*).
    FEATURES = ['B', 'G', 'R', 'H', 'S', 'V', 'L', 'A', 'B']

    def __init__(self, classes, **kwargs):
        super(GaussianColorClassifier, self).__init__(classes, **kwargs)
        self.classifier = GaussianNB()

    def get_features(self, img, mask):
        """Return the 9-component mean-color feature vector of the masked
        region: concatenated BGR, HSV and LAB means."""
        mean = cv2.mean(img, mask)
        # cv2.mean returns 4 components; keep the first 3 and wrap them as a
        # 1x1 uint8 image so cvtColor can convert colorspaces.
        mean = np.array([[mean[:3]]], dtype=np.uint8)
        mean_hsv = cv2.cvtColor(mean, cv2.COLOR_BGR2HSV)
        mean_lab = cv2.cvtColor(mean, cv2.COLOR_BGR2LAB)
        features = np.hstack((mean.flatten(), mean_hsv.flatten(), mean_lab.flatten()))
        return features

    def classify_features(self, features):
        """Predict a class label for each feature vector."""
        return self.classifier.predict(features)

    def feature_probabilities(self, features):
        """Return per-class probability estimates for the feature vectors."""
        return self.classifier.predict_proba(features)

    def train(self, features, classes):
        """Fit the underlying GaussianNB model."""
        self.classifier.fit(features, classes)
def NB_experiment(data_fold, train, test, dumper): print "Ready to find the Best Parameters for Naive Bayes" print 'Gaussian Naive Bayes' nb = GNB() print "fitting NaiveBayes Experiment" dumper.write('Classifier: Naive Bayes\n') scores = cross_validation.cross_val_score(nb, train[0], train[1], cv = data_fold, score_func=accus) reports = "Accuracy on Train: %0.2f (+/- %0.2f)"%(scores.mean(), scores.std()/2) print reports dumper.write(reports+'\n') reports = " ".join(['%0.2f'%(item) for item in scores]) dumper.write(reports+'\n') nb = GNB() nb.fit(train[0], train[1]) pred = clf_test(nb, test) output_ranking(pred, codecs.open('nb.ranking', 'w', 'utf-8')) return None
def getGaussianPred(featureMatrix, labels, testSet, testSet_docIndex):
    """ All input arguments are return of getTrainTestData()
    :param featureMatrix:
    :param labels:
    :param testSet:
    :param testSet_docIndex:
    :return docIndexPred: dict{docid: [index1, index2, ...], ...}
            key is docid
            value is all cognates' index
    """
    model = GaussianNB()
    model.fit(featureMatrix, labels)
    predictions = model.predict(testSet)

    # Group the cognate indices of positive predictions by document id.
    docIndexPred = dict()
    for i, outcome in enumerate(predictions):
        if not outcome:
            continue
        docid = testSet_docIndex[i, 0]
        index = testSet_docIndex[i, 1]
        docIndexPred.setdefault(docid, []).append(index)
    return docIndexPred
def NBAccuracy(features_train, labels_train, features_test, labels_test): """ compute the accuracy of your Naive Bayes classifier """ ### import the sklearn module for GaussianNB from sklearn.naive_bayes import GaussianNB ### create classifier clf = GaussianNB() t0 = time() ### fit the classifier on the training features and labels clf.fit(features_train, labels_train) print "training time:", round(time()-t0, 3), "s" ### use the trained classifier to predict labels for the test features import numpy as np t1 = time() pred = clf.predict(features_test) print "predicting time:", round(time()-t1, 3), "s" ### calculate and return the accuracy on the test data ### this is slightly different than the example, ### where we just print the accuracy ### you might need to import an sklearn module accuracy = clf.score(features_test, labels_test) return accuracy
def gnbmodel(d, X_2, y_2, X_3, y_3, X_test, y_test):
    """Cross-validate and fit GaussianNB, print accuracies, and return the
    predicted positive-class probability for every row of X_3.

    :param d: label prefix used in the printed messages
    :param X_2, y_2: training data
    :param X_3, y_3: data to score (y_3 is unused here)
    :param X_test, y_test: held-out test data
    :return: pandas Series of P(class == 1) for the rows of X_3
    """
    X_3_copy = X_3.copy(deep=True)
    X_3_copy['chance'] = 0

    ########## 5-fold cross-validation on the training data ###########
    scores = cross_val_score(GaussianNB(), X_2, y_2, cv=5, scoring='accuracy')
    score_mean = scores.mean()
    print(d + '5折交互检验:' + str(score_mean))
    #################################################

    gnb = GaussianNB().fit(X_2, y_2)
    ################ accuracy on the held-out test set ################
    answer_gnb = gnb.predict(X_test)
    accuracy = metrics.accuracy_score(y_test, answer_gnb)
    print(d + '预测:' + str(accuracy))
    ###############################################

    # P(class == 1) for each row of X_3.  Assign the whole column at once
    # instead of the original's row-by-row iloc loop -- same positional
    # order, one vectorized write.
    chance = gnb.predict_proba(X_3)[:, 1]
    X_3_copy['chance'] = chance
    chance_que = X_3_copy.iloc[:, len(X_3_copy.columns) - 1]
    return chance_que
def performNB(trainingScores, trainingResults, testScores):
    """Train GaussianNB on per-mark score columns and return the predicted
    positive-class probability for each test row.

    trainingScores / testScores: dict mapping mark name -> list of scores
    (one per sample); marks whose name contains "Asym" are skipped.
    trainingResults: target labels, one per training sample.
    """
    print "->Gaussian NB"
    X = []
    # This loop body is `pass` on purpose: it only leaks its loop variable,
    # leaving currMark bound to some key of trainingScores, which is then
    # used to size the row lists below.
    for currMark in trainingScores:
        pass
    for idx in range(0, len(trainingScores[currMark])):
        X.append([])
    # Fill the training matrix column by column (one column per mark).
    for currMark in trainingScores:
        if "Asym" in currMark:
            continue
        print currMark,
        for idx in range(0, len(trainingScores[currMark])):
            X[idx].append(trainingScores[currMark][idx])
    X_test = []
    # NOTE(review): currMark here is whatever key the previous loop ended
    # on; this assumes every mark has the same number of test rows --
    # confirm with the callers.
    for idx in range(0, len(testScores[currMark])):
        X_test.append([])
    for currMark in trainingScores:
        if "Asym" in currMark:
            continue
        for idx in range(0, len(testScores[currMark])):
            X_test[idx].append(testScores[currMark][idx])
    gnb = GaussianNB()
    gnb.fit(X, np.array(trainingResults))
    # probability of the positive class for each test sample
    y_pred = gnb.predict_proba(X_test)[:, 1]
    print "->Gaussian NB"
    return y_pred
def main(argv): if len(argv) != 5: print "./NB_train_pred.py train.csv train_lable test.csv save_folder label_idx" sys.exit(1); output_folder = argv[3] label_idx = int(argv[4]) os.system("mkdir " + output_folder) print "Loading training data" train_array = np.load(argv[0]) print "Loading training label" train_label_array = np.load(argv[1]) print "Loading test data" test_array = np.load(argv[2]) print "building NB on label " + str(label_idx) gnb = GaussianNB() model = gnb.fit(train_array[:, 1:], train_label_array[1:, label_idx]) print "predicting label " + str(label_idx) nb_pred = gnb.predict(test_array[:,1:]) print "save the result" with open(output_folder + "/" + str(label_idx) + ".pred", 'w') as pred_file: pred_file.write("\n".join([ str(x) for x in nb_pred.tolist()])) with open(output_folder+"/"+str(label_idx) + ".npy", 'wb') as npy_file: np.save(npy_file, nb_pred)
def NBAccuracy(features_train, labels_train, features_test, labels_test): #Import sklearn modules for GaussianNB from sklearn.naive_bayes import GaussianNB from sklearn.metrics import accuracy_score #Create classifer classifer = GaussianNB(); #Timing fit algorithm t0 = time(); #Fit classier on the training features classifer.fit(features_train, labels_train); print "Training Time: ", round(time() - t0, 3), "s"; GaussianNB(); #Timing prediction algorithm t0=time(); #Use trained classifer to predict labels for test features pred = classifer.predict(features_test); print "Prediction Time: ", round(time() - t0, 3), "s"; #Calculate accuracy from features_test with answer in labels_test accuracy = accuracy_score(pred, labels_test); return accuracy;
def NB_predict(mtx_train, label_train, mtx_test, label_test):
    """Fit GaussianNB and evaluate its probabilistic predictions.

    :param mtx_train: training feature matrix
    :param label_train: training labels (any shape; flattened before fit)
    :param mtx_test: test feature matrix
    :param label_test: test labels
    :return: (pCVR, predict_CVR, auc_score, lg_rmse) where pCVR is the
        per-sample class-probability matrix, predict_CVR the mean predicted
        positive-class probability, auc_score the ROC AUC, and lg_rmse the
        RMSE of the positive-class probabilities against the true labels.
    """
    # (Large amounts of commented-out timing / ROC-curve scaffolding from the
    # original were removed.)
    G_NB = GaussianNB()
    label_train = np.ravel(label_train)
    clf_nb = G_NB.fit(mtx_train, label_train)
    pCVR = clf_nb.predict_proba(mtx_test)

    ####### Evaluation on the positive-class column
    positive = pCVR[:, 1]
    predict_CVR = np.mean(positive)
    auc_score = roc_auc_score(label_test, positive)
    lg_rmse = sqrt(mean_squared_error(label_test, positive))
    return pCVR, predict_CVR, auc_score, lg_rmse
def main(unused_argv):
    """Compare a GaussianNB baseline against a TF DNNClassifier on
    tokenised text data."""
    x, y = load_data()
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.4, random_state=0)
    # Map documents to fixed-length word-id sequences; the vocabulary is
    # built on the training split only.
    vp = learn.preprocessing.VocabularyProcessor(max_document_length=MAX_DOCUMENT_LENGTH, min_frequency=1)
    x_train = np.array(list(vp.fit_transform(x_train)))
    x_test = np.array(list(vp.transform(x_test)))
    n_words = len(vp.vocabulary_)
    print('Total words: %d' % n_words)
    # Baseline: Gaussian naive Bayes directly on the id matrix.
    gnb = GaussianNB()
    y_predict = gnb.fit(x_train, y_train).predict(x_test)
    score = metrics.accuracy_score(y_test, y_predict)
    print('NB Accuracy: {0:f}'.format(score))
    # DNN: two hidden layers (500, 10), binary output.
    feature_columns = tf.contrib.learn.infer_real_valued_columns_from_input(x_train)
    classifier = tf.contrib.learn.DNNClassifier(
        feature_columns=feature_columns, hidden_units=[500, 10], n_classes=2)
    classifier.fit(x_train, y_train, steps=5000, batch_size=10)
    y_predict = list(classifier.predict(x_test, as_iterable=True))
    score = metrics.accuracy_score(y_test, y_predict)
    print('DNN Accuracy: {0:f}'.format(score))
def test_gnb_priors():
    """Test whether the class prior override is properly used"""
    # Fit with an explicit, unbalanced prior.
    clf = GaussianNB(priors=np.array([0.3, 0.7]))
    clf.fit(X, y)
    # The posterior for a point must reflect the overridden prior ...
    expected_proba = np.array([[0.825303662161683, 0.174696337838317]])
    assert_array_almost_equal(clf.predict_proba([[-0.1, -0.1]]), expected_proba, 8)
    # ... and the stored prior must be exactly what was passed in.
    assert_array_equal(clf.class_prior_, np.array([0.3, 0.7]))
def classifyNB(): print 'Classify..' target_names = ['unacc', 'acc','good','v-good'] df = pd.read_csv("data/cars-cleaned.txt", delimiter=","); print df print df.dtypes df_y = df['accept'] df_x = df.ix[:,:-1] #print df_y #print df_x train_y, test_y, train_x, test_x = train_test_split(df_y, df_x, test_size = 0.3, random_state=33) clf = GaussianNB() tstart=time.time() model = clf.fit(train_x, train_y) print "training time:", round(time.time()-tstart, 3), "seconds" y_predictions = model.predict(test_x) print "Accuracy : " , model.score(test_x, test_y) #print y_predictions c_matrix = confusion_matrix(test_y,y_predictions) print "confusion matrix:" print c_matrix plt.matshow(c_matrix) plt.colorbar(); tick_marks = np.arange(len(target_names)) plt.xticks(tick_marks, target_names, rotation=45) plt.yticks(tick_marks, target_names) plt.ylabel('true label') plt.xlabel('predicted label') plt.show()
class GaussianNBClassifier: def __init__(self): """ This is the constructor responsible for initializing the classifier """ self.outputHeader = "#gnb" self.clf = None def buildModel(self): """ This builds the model of the Gaussian NB classifier """ self.clf = GaussianNB() def trainGaussianNB(self,X, Y): """ Training the Gaussian NB Classifier """ self.clf.fit(X, Y) def validateGaussianNB(self,X, Y): """ Validate the Gaussian NB Classifier """ YPred = self.clf.predict(X) print accuracy_score(Y, YPred) def testGaussianNB(self,X, Y): """ Test the Gaussian NB Classifier """ YPred = self.clf.predict(X) print accuracy_score(Y, YPred)
def classify(features_train, labels_train):
    """Fit and return a Gaussian naive Bayes classifier.

    :param features_train: training feature vectors
    :param labels_train: training labels
    :return: the fitted GaussianNB instance
    """
    # create the classifier (GaussianNB is imported at module level) and
    # fit it on the training features and labels
    clf = GaussianNB()
    clf.fit(features_train, labels_train)
    return clf
def test_gnb_prior():
    # Test whether class priors are properly set.
    # Balanced toy data: each of the two classes should get prior 3/6.
    model = GaussianNB().fit(X, y)
    assert_array_almost_equal(np.array([3, 3]) / 6.0, model.class_prior_, 8)
    # After refitting on a second dataset the learned priors must still
    # sum to one.
    model.fit(X1, y1)
    assert_array_almost_equal(model.class_prior_.sum(), 1)
def nb_names():
    """Build a shuffled, class-balanced names-vs-words dataset from the
    SQLite names database and fit a GaussianNB model on it."""
    # generate list of tuple names
    engine = create_engine('sqlite:///names.db')
    DBSession = sessionmaker(bind=engine)
    session = DBSession()
    db_names = names.Names.getAllNames(session)
    names_list = [(x, 'name') for x in db_names]
    words_list = generate_words()
    # Down-sample the names so both classes have the same size.
    sample_names = [names_list[i] for i in sorted(random.sample(xrange(len(names_list)), len(words_list)))]
    data = sample_names + words_list
    shuffled_data = np.random.permutation(data)
    strings = []
    classification = []
    # Split the shuffled (value, label) pairs into X rows and label strings.
    for item in shuffled_data:
        strings.append([item[0]])
        classification.append(str(item[1]))
    X = np.array(strings)
    Y = np.array(classification)
    print X, Y
    # NOTE(review): X contains raw strings, but GaussianNB expects numeric
    # features -- this fit likely raises unless the strings are numeric.
    # Confirm the intended feature extraction step.
    clf = GaussianNB()
    clf.fit(X, Y)
def trainNB():
    """Read the training CSV (skipping the header row), fit GaussianNB on
    the columns listed in activeFeatureIndex, and return the fitted model.

    :return: the fitted GaussianNB instance
    """
    featureVector = []
    classVector = []
    # `with` guarantees the file is closed even if parsing raises (the
    # original's open()/close() pair leaked the handle on error).
    with open(r'C:\Python34\alchemyapi_python\TrainingDataDummy.csv') as train:
        headerLine = True
        for line in train:
            if headerLine:
                # skip the CSV header row
                headerLine = False
            else:
                temp = line.split(",")
                # keep only the active feature columns
                x = [float(temp[i]) for i in activeFeatureIndex]
                featureVector.append(x)
                # the last column holds the integer class label
                classVector.append(int(line.split(",")[-1].rstrip("\n")))
    fVector = np.array(featureVector)
    cVector = np.array(classVector)
    print(fVector.shape)
    print(cVector.shape)
    clf = GaussianNB()
    clf.fit(fVector, cVector)
    return clf
class CruiseAlgorithm(object): # cruise algorithm is used to classify the cruise phase vs noncruise phase, it uses the differential change in data stream as the input matrix def __init__(self, testing=False): self.core = GaussianNB() self.scaler = RobustScaler() self.X_prev = None self.testing = testing def fit(self,X,Y): # Y should be the label of cruise or not X = self.prepare(X) self.core.fit(X,Y.ravel()) def predict(self, X): if self.testing: X_t = self.prepare(X) else: if self.X_prev: X_t = X - self.X_prev else: X_t = X self.X_prev = X print repr(X_t) prediction_result = self.core.predict(X_t) return np.asmatrix(prediction_result) def prepare(self,X): a = np.zeros((X.shape[0],X.shape[1])) for i in xrange(X.shape[0]-1): a[i+1,:] = X[i+1] - X[i] return a
def naive_bayes(features, labels):
    """Fit GaussianNB and print its 10-fold cross-validated
    precision/recall/F1/support averages."""
    model = GaussianNB()
    model.fit(features, labels)
    scores = cross_validation.cross_val_score(
        model,
        features,
        labels,
        cv=10,
        score_func=metrics.precision_recall_fscore_support
    )
    mean_scores = numpy.around(numpy.mean(scores, axis=0), 2)
    print_table("Naive Bayes", mean_scores)
def univariateFeatureSelection(f_list, my_dataset):
    """Score each candidate feature on its own with GaussianNB.

    For every feature in f_list: builds a ['poi', feature] dataset, pools a
    1000-split StratifiedShuffleSplit, fits GaussianNB and records the
    score_func() triple.  Returns the list sorted by the third score,
    best first.
    """
    result = []
    for feature in f_list:
        # Replace 'NaN' (and falsy placeholders) with 0
        for name in my_dataset:
            data_point = my_dataset[name]
            if not data_point[feature]:
                data_point[feature] = 0
            elif data_point[feature] == 'NaN':
                data_point[feature] =0
        data = featureFormat(my_dataset, ['poi',feature], sort_keys = True, remove_all_zeroes = False)
        labels, features = targetFeatureSplit(data)
        # NOTE(review): abs() applied per row -- assumes each row supports
        # abs() (e.g. a numpy array, applied elementwise); confirm.
        features = [abs(x) for x in features]
        from sklearn.cross_validation import StratifiedShuffleSplit
        cv = StratifiedShuffleSplit(labels, 1000, random_state = 42)
        features_train = []
        features_test = []
        labels_train = []
        labels_test = []
        # pool the train/test rows of all 1000 stratified splits
        for train_idx, test_idx in cv:
            for ii in train_idx:
                features_train.append( features[ii] )
                labels_train.append( labels[ii] )
            for jj in test_idx:
                features_test.append( features[jj] )
                labels_test.append( labels[jj] )
        from sklearn.naive_bayes import GaussianNB
        clf = GaussianNB()
        clf.fit(features_train, labels_train)
        predictions = clf.predict(features_test)
        # score_func is expected to return an indexable score tuple
        score = score_func(labels_test,predictions)
        result.append((feature,score[0],score[1],score[2]))
    # sort by the third score component, best first
    result = sorted(result, reverse=True, key=lambda x: x[3])
    return result
class PatternBasedDiagnosis:
    """
    Pattern-based diagnosis backed by a Gaussian naive Bayes model.

    NOTE(review): the original docstring said "Decision Tree", but the
    implementation below uses GaussianNB.
    """
    __slots__ = [
        "model"
    ]

    def __init__(self):
        # model is assigned lazily by train(); eval() before train() would
        # raise AttributeError.
        pass

    def train(self, data, labels):
        """
        Fit the GaussianNB model with the training data.
        :param data: training feature matrix
        :param labels: training labels
        :return:
        """
        print('Training Data: %s' % (data))
        print('Training Labels: %s' % (labels))
        self.model = GaussianNB()
        self.model = self.model.fit(data, labels)

    def eval(self, obs):
        """Print the model's prediction for the given observation(s)."""
        # print('Testing Result: %s; %s' % (self.model.predict(obs), self.model.predict_proba(obs)))
        print('Testing Result: %s' % self.model.predict(obs))
(usernum)) #dist为去重后的序列 # print ("该用户的去重向量表Dist:(%s)" % dist) user_cmd_feature = get_user_cmd_feature_all( user_cmd_list, dist) #150个向量,每个向量有len(dist)个分量,1或0表示 labels = get_label("D:/ml/用户异常行为检测/MasqueradeDat/label.txt", usernum - 1) y = [0] * 50 + labels #加上前50个正常的序列标签 x_train = user_cmd_feature[0:N] #取前N(100)个训练集(序列向量,样本特征集) y_train = y[0:N] #取前N个对应的样本特征标签 x_test = user_cmd_feature[N:150] #测试集特征集 y_test = y[N:150] #测试集特征标签 clf = GaussianNB().fit(x_train, y_train) y_predict = clf.predict(x_test) score = np.mean(y_test == y_predict) * 100 print('User%s实际的后50个操作序列特征标签是(0为正常):' % (usernum), y_test) print(' NB预测的后50个操作序列特征标签是(0为正常):', y_predict.tolist()) print('NB异常操作的预测准确率是:', score) target_name = ['正常', '异常'] print( classification_report(y_test, y_predict, target_names=target_name)) print( model_selection.cross_val_score(clf, user_cmd_feature,
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis

# Candidate models to be compared under a common train/test protocol.
classifiers = [
    KNeighborsClassifier(3),
    svm.SVC(probability=True),
    DecisionTreeClassifier(),
    XGBClassifier(),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    GaussianNB(),
    LinearDiscriminantAnalysis(),
    QuadraticDiscriminantAnalysis(),
    LogisticRegression()
]

# Accuracy log: one row per (classifier, split) result.
log_cols = ["Classifier", "Accuracy"]
log = pd.DataFrame(columns=log_cols)

# In[ ]:

from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, StratifiedShuffleSplit

# 70/30 stratified shuffle split; acc_dict accumulates per-classifier accuracy.
SSplit = StratifiedShuffleSplit(test_size=0.3, random_state=7)
acc_dict = {}
#KNN classifier from sklearn.neighbors import KNeighborsClassifier KNN_model = KNeighborsClassifier(n_neighbors=5) # Train the model usinfit(X_train, y_train)g the training sets KNN_model.fit(train_F_scaled,train_response) # In[79]: #naive bayes from sklearn.naive_bayes import GaussianNB naive_bayes_model = GaussianNB() naive_bayes_model .fit(train_predictor,train_response) # In[80]: y_pred = naive_bayes_model.predict(train_predictor) # Print results print("Number of mislabeled points out of a total {} points : {}, performance {:05.2f}%" .format( train_predictor.shape[0], (train_response != y_pred).sum(), 100*(1-(response != y_pred).sum()/train_predictor.shape[0])))
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns; sns.set()

# Gaussian naive Bayes: data from each label is drawn from a simple
# Gaussian distribution.
from sklearn.datasets import make_blobs
X, y = make_blobs(100, 2, centers=2, random_state=2, cluster_std=1.5)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='RdBu');

# Find mean and standard deviation of points within a label, which defines
# the distribution; the posterior ratio for a given point can then be
# computed.
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(X, y);

# Predict on 2000 random points spread over the plot area and overlay them
# (faint) on the training scatter.
rng = np.random.RandomState(0)
Xnew = [-6, -14] + [14, 18] * rng.rand(2000, 2)
ynew = model.predict(Xnew)
plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='RdBu')
lim = plt.axis()
plt.scatter(Xnew[:, 0], Xnew[:, 1], c=ynew, s=20, cmap='RdBu', alpha=0.1)
plt.axis(lim);

# In general, the boundary in Gaussian naive Bayes is quadratic;
# predict_proba allows for probabilistic classification.
yprob = model.predict_proba(Xnew)
def ModelParam_GridSearch(X_train, y_train, cv=4, scoreParam = 'f1'):
    '''
    Basic grid searchCV for multiple classifiers' perf & parameters.
    This is very limited and computationally expensive.
    Not guaranteed to reach even a local optima, but good to get a rough
    idea of parameters for the classifiers. (Does not address pre-processing)
    More classifiers can be added as desired, and parameters expanded.
    Later: Add options for RBM + Logit; PCA; ICA; LDA.
    See also http://scikit-learn-laboratory.readthedocs.org/en/latest/_modules/skll/learner.html
    TODO: Add parameters + put classifiers/"pipeline_#" in a list.
    (To allow checking only some params)
    '''
    # pipeline1 = Pipeline('clf', RandomForestClassifier() )
    #
    # pipeline2 = Pipeline(
    #     ('clf', KNeighborsClassifier()),)
    # Candidate estimators.  pipeline4 (GaussianNB) is constructed but is
    # excluded from the pars/pips lists below.
    pipeline1 = RandomForestClassifier(n_jobs=-1)
    pipeline2 = KNeighborsClassifier()
    pipeline3 = SVC(cache_size=1500)
    # pipeline3 = NuSVC(cache_size=1500)
    pipeline4 = GaussianNB()
    pipeline5 = GradientBoostingClassifier()
    pipeline6 = SGDClassifier()
    pipeline7 = LogisticRegression()
    # The bare string literals below are no-op statements the author used as
    # section headers; they are kept as-is.
    'RandomForestClassifier:'
    parameters1 = {
        'n_estimators': [150],
        'criterion': ['gini'],
        'max_features': ['auto',0.4],
        'max_depth': [8,None],
        'min_samples_leaf':[1,2],
        'min_samples_split':[2,4],
        'n_jobs':[-1]
    }  #, 'entropy'
    # 'n_jobs':[-1]
    'KNeighborsClassifier:'
    parameters2 = {
        'n_neighbors': [7],
        'weights': ['distance']
    }
    'SVC:'
    parameters3 = {
        'C': [0.01,0.1, 1,10,100],
        'kernel': ['linear','rbf'],
        'gamma': [0.1,0.0, 1.0,20],
        'cache_size':[1500],
        'class_weight':['auto'],
    }  # , 'poly','sigmoid']
    ## 'GaussianNB:'
    ## parameters4 = {}
    'GradientBoostingClassifier'
    parameters5 = {
        'max_depth':[3,5,8],
        'n_estimators': [100],
        'min_samples_leaf':[1,2],
        'learning_rate': [0.1, 0.01],
        'max_features': ['auto',0.4]
    }
    'SGDClassifier:'
    parameters6 = {
        'alpha': [0.00001,0.001,0.01],
        'penalty': ['l1','l2', 'elasticnet'],
        'n_iter': [300],
        'loss':['hinge'],
        'n_jobs':[-1],
        'class_weight':['auto']
    }  #, 'modified_huber','log'
    'LogisticRegression:'
    parameters7 = {
        'C': [0.001,0.01, 0.1, 1.0,10,100],
        'penalty': ['l1','l2'],
        'class_weight':['auto']
    }
    'TODO: make this into a seperate method, with pars, pips passed to it as params'
    pars = [parameters1, parameters2, parameters3,parameters5,parameters6,parameters7]  #parameters4
    pips = [pipeline1, pipeline2, pipeline3,pipeline5,pipeline6,pipeline7]  # pipeline4,
    print ("Starting Gridsearch To find each model's best parameters")
    for i in range(len(pars)):
        print(pips[i])
        # Grid-search each estimator with stratified CV and per-sample
        # weights to balance classes.
        # NOTE(review): fit_params uses balance_weights(y) -- `y` is not
        # defined in this function (y_train is); confirm it is a global or
        # a latent bug.
        gs = GridSearchCV(estimator=pips[i], param_grid=pars[i], verbose=0, refit=True, n_jobs=-1,iid=False,
                          pre_dispatch='2*n_jobs',scoring=scoreParam,
                          fit_params={'sample_weight': balance_weights(y)},
                          cv=StratifiedKFold(y_train,n_folds=cv,shuffle=True))
        #Valid scoring options: ['accuracy', 'average_precision', 'f1', 'precision', 'recall', 'roc_auc']
        # gs = gs.fit(X_train, y_train)
        'http://stackoverflow.com/questions/13051706/scikit-learn-using-sample-weight-in-grid-search?rq=1'
        'Note: Remove "class_weight=auto" from the autoweighting classifiers!!'
        "Set Class weights (then into sample weights: https://github.com/scikit-learn/scikit-learn/blob/8dab222cfe894126dfb67832da2f4e871b87bce7/sklearn/utils/class_weight.py"
        gs.fit(X_train, y_train)
        #print ("Finished Gridsearch")
        #print (gs.best_score_)
        report(gs.grid_scores_)
        # http://stackoverflow.com/questions/18210799/scikit-learn-sample-try-out-with-my-classifier-and-data
        'Get more exhaustive CV results with the best tuned parameters for the model'
        est = gs.best_estimator_
        scores = cross_val_score(est, X_train, y_train, cv=StratifiedShuffleSplit(y=y_train, n_iter=10, test_size=0.2),scoring=scoreParam,
                                 n_jobs=-1, pre_dispatch='1.8*n_jobs')
        print("Tuned Model's %s Score: %0.3f (+/- %0.3f)" % (scoreParam,scores.mean(), scores.std() * 2))
# scatter_matrix(dataset) # plt.show() array = dataset.values X = array[:, 0:4] Y = array[:, 4] validation_size = 0.20 seed = 7 scoring = 'accuracy' X_train, X_validation, Y_train, Y_validation = \ model_selection.train_test_split(X, Y, test_size=validation_size, random_state=seed) # Spot Checking models = [('LR', LogisticRegression()), ('LDA', LinearDiscriminantAnalysis()), ('KNN', KNeighborsClassifier()), ('CART', DecisionTreeClassifier()), ('NB', GaussianNB()), ('SVM', SVC())] results = [] names = [] # Shows KNN as the most accurate model for name, model in models: kfold = model_selection.KFold(n_splits=10, random_state=seed) cv_results = model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring) results.append(cv_results) names.append(name) msg = "{}: {} ({})".format(name, round(cv_results.mean(), 3),
# Fill remaining numeric gaps in the training frame with column means.
train.fillna(train.mean(),inplace=True)

# Apply the same cleaning to the test set: drop identifier-like columns and
# encode the categorical ones numerically.
test=test.drop(['Name','Ticket','Cabin','PassengerId'],axis='columns')
test['Sex']=test['Sex'].replace({'male':0,'female':1})
test['Embarked']=test['Embarked'].replace({'S':1,'Q':2,'C':3})
test.fillna(test.mean(),inplace=True)

# Candidate models, compared by 10-fold cross-validated accuracy.
models=[]
models.append(('CART',DecisionTreeClassifier()))
models.append(('RF',RandomForestClassifier()))
models.append(('LR',LogisticRegression()))
models.append(('PPN',Perceptron()))
models.append(('NB',GaussianNB()))
models.append(('SVM',SVC()))
results=[]
names=[]
for name,model in models:
    scores=cross_val_score(model,train,target,cv=10,scoring='accuracy')
    results.append(scores.mean())
    names.append(name)
print names
print results
#fig=plt.figure()
#fig.suptitle('Algorithm Comparison')
X = cv.fit_transform(corpus).toarray() # Step 10: Creating dependent Variable y = dataset.iloc[:, 1].values # Step 11: ***//Classification//*** # Based on experiance NLP best fits for //Naive Bayes, Decision Tree, Random Forest// from sklearn.cross_validation import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0) # Choose more Datasets for training and less to test,bcz of 1500 datasets # NO Need of Feature Scaling becz, most of them are Zero and 1 #Lets use Naive bayes from sklearn.naive_bayes import GaussianNB classifier = GaussianNB() classifier.fit(X_train, y_train) #predicting Test Results y_pred = classifier.predict(X_test) # confussion Matrix from sklearn.metrics import confusion_matrix cm = confusion_matrix(y_test, y_pred) #calculating Accuracy (55 + 91) / 200 = .73 (accuracy)
X = dataset.iloc[:, [2, 3]].values y = dataset.iloc[:, 4].values # Splitting the dataset into the Training set and Test set from sklearn.model_selection import train_test_split X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0) # Feature Scaling from sklearn.preprocessing import StandardScaler sc = StandardScaler() X_train = sc.fit_transform(X_train) X_test = sc.transform(X_test) # Fitting Naive Bayes to the Training set from sklearn.naive_bayes import GaussianNB classifier = GaussianNB() classifier.fit(X_train, y_train) # Predicting the Test set results y_pred = classifier.predict(X_test) # Making the Confusion Matrix from sklearn.metrics import confusion_matrix cm = confusion_matrix(y_test, y_pred) # Visualising the Training set results from matplotlib.colors import ListedColormap X_set, y_set = X_train, y_train X1, X2 = np.meshgrid(np.arange(start = X_set[:, 0].min() - 1, stop = X_set[:, 0].max() + 1, step = 0.01), np.arange(start = X_set[:, 1].min() - 1, stop = X_set[:, 1].max() + 1, step = 0.01)) plt.contourf(X1, X2, classifier.predict(np.array([X1.ravel(), X2.ravel()]).T).reshape(X1.shape),
shuffle=True) #Baseline - Nearest centroid tic = timeit.default_timer() nc = NearestCentroid() nc.fit(x_train, y_train) print('Train-acc:', nc.score(x_train, y_train)) print('Test-acc:', nc.score(x_test, y_test)) toc = timeit.default_timer() elapsed_time = toc - tic NC_time = elapsed_time print('Elapsed time: ', elapsed_time, 'seconds') #Gaussian Naive Bayes - Gaus eloszlást feltételezünk tic = timeit.default_timer() model = GaussianNB() model.fit(x_train, y_train) toc = timeit.default_timer() print('Train-acc:', model.score(x_train, y_train)) print('Test-acc:', model.score(x_test, y_test)) toc = timeit.default_timer() elapsed_time = toc - tic GNB_time = elapsed_time print('Elapsed time: ', elapsed_time, 'seconds') #Konfidencia intervallum - ellenőrizni predicted_test = model.predict(x_test) test_acc = accuracy_score(y_test, predicted_test) n_success = np.sum(y_test == predicted_test) p = 0.91
stemmed.append(stemmer.stem(item)) return stemmed def tokenize(text): text = "".join([ch for ch in text if ch not in string.punctuation]) tokens = nltk.word_tokenize(text) stems = stem_tokens(tokens, stemmer) return stems #obtain stop words stop_words = text.ENGLISH_STOP_WORDS #define pipeline for tokenizing, feature extraction, feature selection, and naïve Bayes algorithm text_clf = Pipeline([('vect', CountVectorizer(tokenizer=tokenize, stop_words=stop_words,analyzer='word')), ('tfidf', TfidfTransformer()), ('dimensionality_reduction',TruncatedSVD(n_components=50, random_state=42)), ('clf', GaussianNB()), ]) text_clf = text_clf.fit(x_train, y_train) #test data validation predicted = text_clf.predict(x_test) print np.mean(predicted == y_test) #print the statistic summary and confusion matrix names = ['comp.sys.ibm.pc.hardware' , 'comp.sys.mac.hardware', 'misc.forsale', 'soc.religion.christian'] print(metrics.classification_report(y_test, predicted, target_names = names)) print metrics.confusion_matrix(y_test, predicted)
# 3. data preprocessing: pick the predictor columns, then normalise them
feature_columns = [
    'ClumpTkns', 'UnofCSize', 'UnofCShape', 'MargAdh', 'SngEpiCSize',
    'BareNuc', 'BlandCrmtn', 'NrmlNuc', 'Mitoses',
]
x = Data.loc[:, feature_columns]
y = Data['Malignant']

# TRANSFROM GIVES 1 % LESS ACCURATE RESULT!!!!
min_max_scaler = preprocessing.MaxAbsScaler()
x = min_max_scaler.fit_transform(x)

# 4. train model and performed testing using logistic regression
# using SVM
print("\nSelected Algorithm: GaussianNB")
clf = GaussianNB()
scores = cross_val_score(clf, x, y, cv=5)
predictions = cross_val_predict(clf, x, y, cv=5)
# NOTE: r2_score is a regression metric; kept as-is to preserve behaviour.
accuracy = metrics.r2_score(y, predictions)
#print("\nCross-validation scores: {}".format(scores))
print("\nmean training result = {}".format(np.mean(scores)))
print("\nCross-predicted accuracy: {}\n".format(accuracy))

"""
#submission
print("Writing submission.csv file...")
index = [i for i in range(Data.shape[0])]
df2 = pd.DataFrame({'Predictions': predictions}, index=index)
submission = pd.concat([Data, df2], axis=1)
submission.to_csv('wresult.csv', index=False)
"""
# NOTE(review): tail of a ``load_dataset``-style helper whose header lies
# before this chunk.
    df = pd.read_table(file, header=None)
    label = np.concatenate((label, df))
    label = np.ravel(label)  # flatten the labels down to 1-D
    return feature, label


if __name__ == '__main__':
    feature_paths = [
        r'A.feature', r'B.feature', r'C.feature', r'D.feature', r'E.featurE'
    ]
    label_paths = [r'A.label', r'B.label', r'C.label', r'D.label', r'E.label']
    # Train on dataset A, evaluate on dataset B.
    x_train, y_train = load_dataset([feature_paths[0]], [label_paths[0]])
    x_test, y_test = load_dataset([feature_paths[1]], [label_paths[1]])
    # Because test_size=0 the x_/y_ outputs are empty; the split is used only
    # to shuffle the training data.
    x_train, x_, y_train, y_ = train_test_split(
        x_train, y_train, test_size=0.0)
    print('start traing knn')
    knn = KNeighborsClassifier().fit(x_train, y_train)
    a_knn = knn.predict(x_test)
    print('start traing dt')
    dt = DecisionTreeClassifier().fit(x_train, y_train)
    a_dt = dt.predict(x_test)
    print('start traing gusaanb')
    gnb = GaussianNB().fit(x_train, y_train)
    a_gnb = gnb.predict(x_test)
    # Per-classifier precision/recall/F1 reports on the held-out set.
    print('test knn')
    print(classification_report(y_test, a_knn))
    print('test db')
    print(classification_report(y_test, a_dt))
    print('test guassnb')
    print(classification_report(y_test, a_gnb))
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.datasets import load_wine
from sklearn.pipeline import make_pipeline
from matplotlib.font_manager import *

# NOTE(review): unescaped backslashes in this Windows path work only because
# \W, \F, \s are not recognised escapes — a raw string would be safer
# (string left untouched).
myfont = FontProperties(fname='C:\Windows\Fonts\simfang.ttf')

RANDOM_STATE = 42
FIG_SIZE = (10, 7)

features, target = load_wine(return_X_y=True)

# Make a train/test split using 30% test size
X_train, X_test, y_train, y_test = train_test_split(features, target,
                                                    test_size=0.30,
                                                    random_state=RANDOM_STATE)

# Fit to data and predict using pipelined GNB and PCA.
unscaled_clf = make_pipeline(PCA(n_components=2), GaussianNB())
unscaled_clf.fit(X_train, y_train)
pred_test = unscaled_clf.predict(X_test)

# Fit to data and predict using pipelined scaling, GNB and PCA.
std_clf = make_pipeline(StandardScaler(), PCA(n_components=2), GaussianNB())
std_clf.fit(X_train, y_train)
pred_test_std = std_clf.predict(X_test)

# Show prediction accuracies in scaled and unscaled data.
print('\nPrediction accuracy for the normal test dataset with PCA')
print('{:.2%}\n'.format(metrics.accuracy_score(y_test, pred_test)))

print('\nPrediction accuracy for the standardized test dataset with PCA')
print('{:.2%}\n'.format(metrics.accuracy_score(y_test, pred_test_std)))

# Extract PCA from pipeline
pca = unscaled_clf.named_steps['pca']
pca_std = std_clf.named_steps['pca']

# Show first principal components
def evaluate(model, data, alg = None, classifier="lr", fast=False, ratio = None,
             cv=10, normalize=False, random_state = None, return_y = False):
    """Evaluate embeddings with a downstream classifier over several train ratios.

    Parameters
    ----------
    model : feature matrix X (one row per sample).
    data : label vector Y aligned with ``model``.
    classifier : "lr", "svm", "mlp" or "nb" — which sklearn model to fit.
    ratio : list of training fractions to sweep (default 0.1..0.9).
    cv : number of random splits averaged per ratio.
    fast : if true, return the last ratio's (micro, macro) F1 means directly
        (plus the last split's labels/predictions when ``return_y``).

    Returns
    -------
    (micros, macros) [, Y_test, prediction] when ``fast``; otherwise a
    pandas DataFrame with columns ratio / micro / macro.
    """
    X = model
    Y = data
    micros = []
    macros = []
    print("len X: ", len(X))
    print("len Y: ", len(Y))
    if normalize:
        # L2-normalise rows, then standardise columns.
        X = sk_normalize(X)
        scaler = StandardScaler()
        X = scaler.fit_transform(X)
    df = defaultdict(list)
    if ratio is None:
        ratio = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    for r in ratio:
        if r <= 0:
            continue
        elif r >= 1:
            break
        micros = []
        macros = []
        for i in range(cv):
            # Fresh classifier per split; logistic regression is the default.
            clf = LogisticRegression()
            if classifier.lower() == "svm":
                clf = SVC(cache_size=5000)
            elif classifier.lower() == "mlp":
                clf = MLPClassifier()
            elif classifier.lower() == "nb":
                clf = GaussianNB()
            X_train, X_test, Y_train, Y_test = train_test_split(
                X, Y, test_size=1 - r, random_state=random_state)
            clf.fit(X_train, Y_train)
            prediction = clf.predict(X_test)
            micros.append(f1_score(Y_test, prediction, average='micro'))
            macros.append(f1_score(Y_test, prediction, average='macro'))
        # Cross-split means for this ratio.
        micros = np.mean(micros)
        macros = np.mean(macros)
        df["ratio"].append(r)
        # BUG FIX: the original appended np.mean(micro)/np.mean(macro) — the
        # LAST split's score — instead of the cross-split means computed above.
        df["micro"].append(micros)
        df["macro"].append(macros)
        logging.info("ratio: %.4f : f1_micro %.4f, f1_macro %.4f" % (r, micros, macros))
    if fast:
        if return_y:
            return micros, macros, Y_test, prediction
        return micros, macros
    else:
        return pd.DataFrame(df)
def save_data():
    """Write the global ``data`` rows to data.csv in random order, prefixing
    each row with its label; consumes ``data``/``labels`` as it goes."""
    # NOTE(review): 'wb' mode with csv.writer implies Python 2 semantics.
    with open('data.csv', 'wb') as csvfile:
        writer = csv.writer(csvfile, delimiter=' ')
        while len(data) != 0:
            # Pick a random remaining row, prepend its label, emit, discard.
            i = random.randint(0, len(data) - 1)
            data[i].insert(0, labels[i])
            writer.writerow(data[i])
            del data[i]
            del labels[i]


data_beg_len = len(data)
if data_beg_len != 0:
    # Fit both classifiers on whatever data remains.
    clf1 = SVC()
    clf1.fit(data, labels)
    clf2 = GaussianNB()
    clf2.fit(data, labels)

number = 20
new_input = []


def choose_ans(a):
    # Majority vote over digit predictions 0-9: count occurrences per digit.
    ans = [0 for i in range(10)]
    for n in a:
        ans[n] += 1
    num = 0
    for i in range(10):
        # NOTE(review): this compares a count to an index — likely intended
        # ``ans[i] > ans[num]``; the function is truncated here, left as-is.
        if ans[i] > num:
            num = i
# Report the previously-fitted LDA model on the scaled train/test splits.
print('Accuracy of LDA classifier on training set: {:.2f}'
      .format(lda.score(scaled_X_train, Y_train)))
print('Accuracy of LDA classifier on test set: {:.2f}'
      .format(lda.score(scaled_X_test, Y_test)))
pred_lda = lda.predict(scaled_X_test)
print(confusion_matrix(Y_test, pred_lda))
print(classification_report(Y_test, pred_lda))

# In[47]:

# fit a naive bayes model
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
gnb.fit(scaled_X_train, Y_train)
print('Accuracy of GNB classifier on training set: {:.2f}'
      .format(gnb.score(scaled_X_train, Y_train)))
print('Accuracy of GNB classifier on test set: {:.2f}'
      .format(gnb.score(scaled_X_test, Y_test)))
pred_gnb = gnb.predict(scaled_X_test)
print(confusion_matrix(Y_test, pred_gnb))
print(classification_report(Y_test, pred_gnb))

# In[48]:

# fit a svm classifier
def evaluate_multilabel(model, data, alg = None, classifier="lr", fast=False,
                        ratio = None, cv = 10, random_state = None, normalize=False):
    """Multi-label node-classification evaluation of ``model``'s embeddings.

    Builds X from model.word_embeddings and a binary indicator matrix Y from
    data.labels, then for each training ratio averages micro/macro F1 over
    ``cv`` random splits via evaluateNodeClassification.

    Returns (micros, macros) for the last ratio when ``fast`` is true,
    otherwise the defaultdict of result lists.
    NOTE(review): the sibling ``evaluate`` wraps its results in a DataFrame
    before returning; this one returns the raw defaultdict — confirm intended.
    """
    X = []
    Y = []
    # One embedding row per word id.
    for pid in range(len(model.word2id)):
        X.append(model.word_embeddings[pid])
    # Binary indicator matrix: Y[pid][y] = 1 iff node pid carries label y.
    Y = np.zeros((len(X), len(data.labels)))
    for y, key in enumerate(data.labels.keys()):
        for index, paper in enumerate(data.labels[key]):
            pid = model.word2id[paper]
            Y[pid][y] = 1
    if normalize:
        # L2-normalise rows, then standardise columns.
        X = sk_normalize(X)
        scaler = StandardScaler()
        X = scaler.fit_transform(X)
    df = defaultdict(list)
    if ratio is None:
        ratio = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    for r in ratio:
        if r <= 0:
            continue
        elif r >= 1:
            break
        # Choose the downstream classifier once per ratio.
        if classifier.lower() == 'lr':
            clf = LogisticRegression()
        elif classifier.lower() == "svm":
            clf = SVC(cache_size=5000)
        elif classifier.lower() == "mlp":
            clf = MLPClassifier()
        elif classifier.lower() == "nb":
            clf = GaussianNB()
        micros = []
        macros = []
        for i in range(cv):
            micro, macro = evaluateNodeClassification(
                X, Y, 1 - r, clf=clf, random_state = random_state)
            micros.append(micro)
            macros.append(macro)
        # Cross-split means for this ratio.
        micros = np.mean(micros)
        macros = np.mean(macros)
        df["ratio"].append(r)
        df["micro"].append(micros)
        df["macro"].append(macros)
        logging.info("ratio: %.4f : f1_micro %.4f, f1_macro %.4f" % (r, micros, macros))
    if fast:
        return micros, macros
    else:
        return df
# NOTE(review): tail of a score-to-bucket mapping chain whose start lies
# before this chunk (maps specific DB values onto small class ids).
        DB[i] = 2
    elif DB[i] == 66:
        DB[i] = 2
    elif DB[i] == 70:
        DB[i] = 3
    elif DB[i] == 74:
        DB[i] = 3
    elif DB[i] == 78:
        DB[i] = 3
    elif DB[i] == 82:
        DB[i] = 4
    elif DB[i] == 86:
        DB[i] = 4

from sklearn.naive_bayes import GaussianNB
clf = GaussianNB()
#from sklearn.naive_bayes import MultinomialNB
#clf = MultinomialNB()

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold

# NOTE(review): ``sum`` is presumably a feature matrix built before this
# chunk (it shadows the builtin) — confirm against the preceding code.
X = sum
y = DB

# Manual 20-fold cross-validation loop.
kf = KFold(n_splits=20)
acc = []
for train_index, test_index in kf.split(X):
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]
    clf.fit(X_train, y_train)
    y_test = y_test.ravel()
from email_preprocess import preprocess ### features_train and features_test are the features for the training ### and testing datasets, respectively ### labels_train and labels_test are the corresponding item labels features_train, features_test, labels_train, labels_test = preprocess() ######################################################### ### your code goes here ### from sklearn.naive_bayes import GaussianNB clf = GaussianNB() t0 = time() clf.fit(features_train, labels_train) print "training time:", round(time()-t0, 3), "s" t0 = time() pred = clf.predict(features_test) print "testing time:", round(time()-t0, 3), "s" accuracy = clf.score(features_test, labels_test) print "Accuracy: " print accuracy #########################################################
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import RandomForestClassifier
from mlxtend.classifier import StackingClassifier

################## load data #####################
iris = datasets.load_iris()
# Use only features 1 and 2 (sepal width / petal length).
x, y = iris.data[:, 1:3], iris.target

################## define classifier #####################
clf1 = KNeighborsClassifier(n_neighbors=1)
clf2 = RandomForestClassifier(random_state=1)
clf3 = GaussianNB()
lr = LogisticRegression()
# Stack the three base classifiers under a logistic-regression meta-learner.
sclf = StackingClassifier(classifiers=[clf1, clf2, clf3], meta_classifier=lr)

################## class result #####################
for clf, label in zip(
        [clf1, clf2, clf3, sclf],
        ['KNN', 'Random Forest', 'Naive Bayes', 'StackingClassifier']):
    # NOTE(review): the loop body continues past this chunk.
    scores = model_selection.cross_val_score(clf, x, y, cv=3, scoring='accuracy')
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
# NOTE(review): private module path, removed in modern scikit-learn — the
# public path is ``from sklearn.ensemble import GradientBoostingClassifier``.
from sklearn.ensemble.gradient_boosting import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.model_selection import cross_validate
from varname import nameof

# Candidate estimators compared with cross_validate below.
sv = SVC()
RFC = RandomForestClassifier()
GaussianN = GaussianNB()
KNC = KNeighborsClassifier(n_neighbors=7)
xgboost = XGBClassifier()
gradientboost = GradientBoostingClassifier()

df = pd.read_csv(r'dataframes/full_csv', index_col=[0])
# with open(r'objects/wektor_lst', 'rb') as f:
#     res_wek = np.load(f)
res_wek = np.load(r'objects/wektors.npy', allow_pickle=True)
# Keep the first 20 rows of every vector, then flatten to (7023, 2000).
res_wek = [wek[0:20] for wek in res_wek]
zzz = np.stack(res_wek)
res_wek = zzz.reshape([7023, 2000])

scoring = ['precision', 'recall', 'f1', 'accuracy']
# NOTE(review): this call is truncated here — it continues past this chunk.
sv_score_array = cross_validate(sv,
# Import libraries
from sklearn.naive_bayes import GaussianNB, MultinomialNB, BernoulliNB

# define data, create model and fit data
X = Variables
Y = Classes
# BUG FIX: GaussianNB's hyperparameters (priors, var_smoothing) are
# keyword-only, so the original positional ``GaussianNB(params)`` (with
# ``params`` undefined here) raised an error; use the defaults.
Model = GaussianNB().fit(X, Y)

# Score model: mean accuracy on the training data
Model.score(X, Y)

# Predict new classes
# BUG FIX: the estimator method is ``predict`` (lowercase); ``Model.Predict``
# raised AttributeError.
NewY = Model.predict(NewX)
def self_projection(
    X,
    cell_types,
    classifier="LR",
    penalty="l1",
    sparsity=0.5,
    fraction=0.5,
    solver="liblinear",
    n=0,
    cv=5,
    whole=False,
    n_jobs=None,
):  # n = 100 should be good.
    """
    This is the core function for running self-projection.

    Input
    -----
    X: `numpy.array` or sparse matrix
        the expression matrix, e.g. ad.raw.X.
    cell_types: `list of String/int`
        the cell clustering assignment
    classifier: `String` optional (default: 'LR')
        a machine learning model in "LR" (logistic regression), \
        "RF" (Random Forest), "GNB"(Gaussion Naive Bayes),
        "SVM" (Support Vector Machine) and "DT"(Decision Tree).
    penalty: `String` optional (default: 'l1')
        the regularization mode of logistic regression. Use 'l1' or 'l2'.
    sparsity: `float` optional (default: 0.5)
        The sparsity parameter (C in sklearn.linear_model.LogisticRegression)
        for the logistic regression model.
    fraction: `float` optional (default: 0.5)
        Used as the *test-set* fraction in the sklearn split below
        (test_size=fraction); when n > 0, the training fraction passed to
        train_test_split_per_type is (1 - fraction).
    n: `int` optional (default: 0)
        Maximum number of cells included in the training set for each cluster
        of cells. Only `fraction` is used to split the dataset if n is 0.
    cv: `int` optional (default: 5)
        fold for cross-validation on the training set.
        0 means no cross-validation.
    whole: `bool` optional (default: False)
        if measure the performance on the whole dataset (include training and test).
    n_jobs: `int` optional
        number of threads to use with the different classifiers
        (default: None - unlimited).

    return
    -----
    y_prob, y_pred, y_test, clf, cvsm, accuracy_test
        y_prob: `matrix of float`
            prediction probability (None for the "SH"/"PCP" classifiers)
        y_pred: `list of string/int`
            predicted clustering of the test set
        y_test: `list of string/int`
            real clustering of the test set
        clf: the fitted classifier model
        cvsm: mean cross-validation accuracy (0 when cv == 0)
        accuracy_test: accuracy on the hold-out set
    """
    # split the data into training and testing
    if n > 0:
        # Per-cluster capped sampling (at most n cells per cluster).
        X_train, X_test, y_train, y_test = train_test_split_per_type(
            X, cell_types, n=n, frac=(1 - fraction))
    else:
        X_train, X_test, y_train, y_test = train_test_split(
            X, cell_types, stratify=cell_types,
            test_size=fraction)  # fraction means test size
    # set the classifier
    if classifier == "LR":
        clf = LogisticRegression(
            random_state=1,
            penalty=penalty,
            C=sparsity,
            multi_class="ovr",
            solver=solver,
        )
    elif classifier == "RF":
        clf = RandomForestClassifier(random_state=1, n_jobs=n_jobs)
    elif classifier == "GNB":
        clf = GaussianNB()
    elif classifier == "GPC":
        clf = GaussianProcessClassifier(n_jobs=n_jobs)
    elif classifier == "SVM":
        clf = SVC(probability=True)
    elif classifier == "SH":
        clf = SGDClassifier(loss="squared_hinge", n_jobs=n_jobs)
    elif classifier == "PCP":
        clf = SGDClassifier(loss="perceptron", n_jobs=n_jobs)
    elif classifier == "DT":
        clf = DecisionTreeClassifier()
    # mean cross validation score
    cvsm = 0
    if cv > 0:
        cvs = cross_val_score(clf,
                              X_train,
                              np.array(y_train),
                              cv=cv,
                              scoring="accuracy",
                              n_jobs=n_jobs)
        cvsm = cvs.mean()
        print("Mean CV accuracy: %.4f" % cvsm)
    # accuracy on cross validation and on test set
    clf.fit(X_train, y_train)
    accuracy = clf.score(X_train, y_train)
    print("Accuracy on the training set: %.4f" % accuracy)
    accuracy_test = clf.score(X_test, y_test)
    print("Accuracy on the hold-out set: %.4f" % accuracy_test)
    # accuracy of the whole dataset
    if whole:
        accuracy = clf.score(X, cell_types)
        print("Accuracy on the whole set: %.4f" % accuracy)
    # get predicted probability on the test set
    # (SGD with squared-hinge/perceptron losses has no predict_proba)
    y_prob = None
    if not classifier in ["SH", "PCP"]:
        y_prob = clf.predict_proba(X_test)
    y_pred = clf.predict(X_test)

    return y_prob, y_pred, y_test, clf, cvsm, accuracy_test
# Confusion matrix for the Decision Tree predictions made earlier.
from sklearn.metrics import confusion_matrix
cm_Decision_Tree = confusion_matrix(Y_test, Y_pred_Decision_Tree)

# Accuracy score calculation for Decision Tree Model
from sklearn.metrics import accuracy_score
acc_decision_tree = accuracy_score(Y_test, Y_pred_Decision_Tree)
print(acc_decision_tree)

# Fitting Naive Bayes Algorithm to Training set
# Feature scaling — fitted on the training split only, reused for test
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train_naive_bayes = sc.fit_transform(X_train)
X_test_naive_bayes = sc.transform(X_test)

from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()
classifier.fit(X_train_naive_bayes, Y_train)

# Predicting naive_bayes Test set results
Y_pred_naive_bayes = classifier.predict(X_test_naive_bayes)

# Confusion Matrix for naive_bayes_model
from sklearn.metrics import confusion_matrix
cm_naive_bayes = confusion_matrix(Y_test, Y_pred_naive_bayes)

# Accuracy score calculation for naive_bayes_model
from sklearn.metrics import accuracy_score
acc_naives_bayes = accuracy_score(Y_test, Y_pred_naive_bayes)
print(acc_naives_bayes)

# Fitting Random Forest Classification to Training set
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB

# Load the glass dataset and separate features from the 'Type' target.
train_df = pd.read_csv('./glass.csv')
X_train = train_df.drop("Type", axis=1)
Y_train = train_df["Type"]
X_train, X_test, Y_train, y_test = train_test_split(X_train, Y_train,
                                                    test_size=0.4,
                                                    random_state=0)

gnb = GaussianNB()
# FIX: the original fitted the model twice and predicted twice
# (y_pred via fit().predict(), then Y_pred after a second fit);
# a single fit/predict is equivalent.
gnb.fit(X_train, Y_train)
Y_pred = gnb.predict(X_test)
y_pred = Y_pred  # keep the old alias for any downstream use

# NOTE(review): this is the accuracy on the TRAINING split, not on X_test.
acc_gnb = round(gnb.score(X_train, Y_train) * 100, 2)
print("GNB accuracy is:", acc_gnb)
# NOTE(review): tail of ``get_predictions(...)`` — its header lies before
# this chunk; it returns hard predictions plus class probabilities.
    return y_pred, y_pred_prob


## function to get classifiers score
def print_scores(y_test, y_pred, y_pred_prob):
    # Print the standard binary-classification metrics for one model.
    print('test-set confusion matrix:\n', confusion_matrix(y_test, y_pred))
    print("recall score: ", recall_score(y_test, y_pred))
    print("precision score: ", precision_score(y_test, y_pred))
    print("f1 score: ", f1_score(y_test, y_pred))
    print("accuracy score: ", accuracy_score(y_test, y_pred))
    # Column 1 of predict_proba output holds P(class == 1).
    print("ROC AUC: {}".format(roc_auc_score(y_test, y_pred_prob[:, 1])))


#%%
# training a naive bayes model for classification
y_pred, y_pred_prob = get_predictions(GaussianNB(), X_train, y_train, X_test)
print_scores(y_test, y_pred, y_pred_prob)
# Accuracy = 96.91 %
# hence we can see that the model has correclty classified all the 135 values as frauds/ shill bidders

#%%
# training a logistic regression model
# NOTE(review): penalty='l1' requires a compatible solver (liblinear/saga)
# in modern scikit-learn; the default lbfgs raises an error — confirm the
# sklearn version this targets.
y_pred, y_pred_prob = get_predictions(LogisticRegression(C = 0.01, penalty = 'l1'), X_train, y_train, X_test)
print_scores(y_test, y_pred, y_pred_prob)
# Accuracy = 96.28 %
# NOTE(review): tail of an ``experiment(...)``-style function whose header
# lies before this chunk. Feature importances are skipped under PCA and for
# models that do not expose them.
if not pca and estimator_name not in ['GaussianNB', 'NeuralNetwork']:
    process_feature_importances(model, estimator_name, pca, fine_tune)

# Shared GridSearchCV settings.
gridsearch_param = {'scoring': 'roc_auc', 'verbose': 2, 'n_jobs': -1, 'cv': 3}
# Hyper-parameter grids per estimator (GaussianNB has none to tune,
# hence fine_tune=False below).
estimators_params_grid = {
    'LogisticRegression': {'C': [10**i for i in range(-3, 4)], 'penalty': ['l2', 'l1']},
    'DecisionTreeClassifier': {'min_samples_split': [1600, 1800, 2000, 2200, 2400]},
    'RandomForestClassifier': {'n_estimators': [50, 100, 200, 300, 400],
                               'min_samples_split': [50, 100, 150, 200]},
    'LGBMClassifier': {'num_leaves': [500, 1000, 1500, 2000, 2500],
                       'n_estimators': [200, 400, 600, 800, 1000]},
}

print_info('Start experiments')
experiment(LogisticRegression(random_state=SEED, n_jobs=-1, solver='saga', max_iter=500),
           train_x, train_y, test_x, test_y, pca = False, fine_tune = True)
experiment(DecisionTreeClassifier(random_state=SEED),
           train_x, train_y, test_x, test_y, pca = False, fine_tune = True)
experiment(GaussianNB(), train_x, train_y, test_x, test_y, pca = False, fine_tune = False)
experiment(RandomForestClassifier(random_state=SEED, n_jobs=-1),
           train_x, train_y, test_x, test_y, pca = False, fine_tune = True)
lgbm = lgb.LGBMClassifier(objective='binary',
                          random_state = SEED,
                          feature_fraction=0.7,
                          learning_rate=0.05,
                          n_jobs=-1,
                          silent = False,
                          )
experiment(lgbm, train_x, train_y, test_x, test_y, pca = False, fine_tune = True)

""" Bagging with Lightgbm (Combine boosting and bagging)"""
print_info('Start Bagging with Lightgbm')
# NOTE(review): this call is truncated here — it continues past this chunk.
lgbm = lgb.LGBMClassifier(objective='binary', random_state = SEED,
# Sweep k = 1..10 for KNN and collect the test accuracy for each value.
a = pd.Series()
x = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
for i in list(range(1, 11)):
    model = KNeighborsClassifier(n_neighbors=i)
    model.fit(train_X, train_Y)
    prediction = model.predict(test_X)
    # NOTE(review): Series.append is removed in modern pandas (use pd.concat);
    # left as-is.
    a = a.append(pd.Series(metrics.accuracy_score(prediction, test_Y)))
# NOTE(review): ``a_index`` is not defined in this chunk — presumably the
# x-axis positions; confirm against earlier code.
plt.plot(a_index, a)
plt.xticks(x)
fig = plt.gcf()
fig.set_size_inches(12, 6)
plt.show()
print('Accuracies for different values of n are:', a.values,
      'with the max value as ', a.values.max())

model = GaussianNB()
model.fit(train_X, train_Y)
prediction6 = model.predict(test_X)
print('The accuracy of the NaiveBayes is',
      metrics.accuracy_score(prediction6, test_Y))

model = RandomForestClassifier(n_estimators=100)
model.fit(train_X, train_Y)
prediction7 = model.predict(test_X)
print('The accuracy of the Random Forests is',
      metrics.accuracy_score(prediction7, test_Y))

from sklearn.model_selection import KFold #for K-fold cross validation
from sklearn.model_selection import cross_val_score #score evaluation
from sklearn.model_selection import cross_val_predict #prediction
# NOTE(review): this call is truncated here — it continues past this chunk.
kfold = KFold(n_splits=10,
# Shuffle all rows, then take column 0 as the text and column 1 as the label
# (row 0 is skipped — presumably a header; confirm against data_file).
shuffled_data = data_file.sample(frac=1)
X = shuffled_data.iloc[1:, 0]  # Features
y = shuffled_data.iloc[1:, 1]  # Target variable

# vectorize and split data
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X)
# NOTE(review): get_feature_names was removed in scikit-learn 1.2
# (use get_feature_names_out).
feature_names = vectorizer.get_feature_names()
X_train, X_test, y_train, y_test = train_test_split(X.toarray(), y,
                                                    test_size=0.2,
                                                    random_state=0)
# Append the labels as a trailing column so rows can be split by class below.
tot_train = np.append(X_train, y_train[:, None], axis=1)

# train model
gnb = GaussianNB()
gnb.fit(X_train, y_train)
y_pred = gnb.predict(X_test)
print("P(Not Spam): " + str(gnb.class_prior_[0]))
print("P(Spam): " + str(gnb.class_prior_[1]) + "\n")

# separating spam and non-spam instances (drop the appended label column)
not_spam = tot_train[np.where(tot_train[:, -1] == 0), :-1][0]
spam = tot_train[np.where(tot_train[:, -1] == 1), :-1][0]

# smoothing probs — add-one (Laplace) smoothing of the count matrices
not_spam = not_spam + 1
spam = spam + 1
X_train_smooth = X_train + 1