def stochasticGD(input_file,Output):
    lvltrace.lvltrace("LVLEntree dans stochasticGD")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    clf = SGDClassifier(loss="hinge", penalty="l2")
    clf.fit(X,y)
    y_pred = clf.predict(X)
    print "#########################################################################################################\n"
    print "Stochastic Gradient Descent "
    print "classification accuracy:", metrics.accuracy_score(y, y_pred)
    print "precision:", metrics.precision_score(y, y_pred)
    print "recall:", metrics.recall_score(y, y_pred)
    print "f1 score:", metrics.f1_score(y, y_pred)
    print "\n"
    print "#########################################################################################################\n"
    results = Output+"Stochastic_GD_metrics.txt"
    file = open(results, "w")
    file.write("Stochastic Gradient Descent estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f,%f,%i\n"%(y[n],y_pred[n],(n+1)))
    file.close()
    title = "Stochastic Gradient Descent"
    save = Output + "Stochastic_GD_confusion_matrix.png"
    plot_confusion_matrix(y, y_pred,title,save)
    lvltrace.lvltrace("LVLSortie dans stochasticGD")
def SVC_linear(input_file,Output):
    lvltrace.lvltrace("LVLEntree dans SVC_linear")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    clf=svm.SVC(kernel='linear')
    clf.fit(X,y)
    y_pred = clf.predict(X)
    print "#########################################################################################################\n"
    print "C-Support Vector Classifcation (with linear kernel) "
    print "classification accuracy:", metrics.accuracy_score(y, y_pred)
    print "precision:", metrics.precision_score(y, y_pred)
    print "recall:", metrics.recall_score(y, y_pred)
    print "f1 score:", metrics.f1_score(y, y_pred)
    print "\n"
    print "#########################################################################################################\n"
    results = Output+"SVM_Linear_Kernel_metrics.txt"
    file = open(results, "w")
    file.write("Support Vector Machine with Linear Kernel estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f,%f,%i\n"%(y[n],y_pred[n],(n+1)))
    file.close()
    title = "SVC - linear Kernel"
    save = Output + "SVC_linear_confusion_matrix.png"
    plot_confusion_matrix(y, y_pred,title,save)
    lvltrace.lvltrace("LVLSortie dans SVC_linear")
def randomforest(input_file,Output):
    lvltrace.lvltrace("LVLEntree dans randomforest")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    clf = RandomForestClassifier(n_estimators=10)
    clf.fit(X,y)
    y_pred = clf.predict(X)
    print "#########################################################################################################\n"
    print "The Random forest algo "
    print "classification accuracy:", metrics.accuracy_score(y, y_pred)
    print "precision:", metrics.precision_score(y, y_pred)
    print "recall:", metrics.recall_score(y, y_pred)
    print "f1 score:", metrics.f1_score(y, y_pred)
    print "\n"
    print "#########################################################################################################\n"
    results = Output+"Random_Forest_metrics.txt"
    file = open(results, "w")
    file.write("Random Forest Classifier estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y)):
        file.write("%f,%f,%i\n"%(y[n],y_pred[n],(n+1)))
    file.close()
    title = "The Random forest"
    save = Output + "Random_Forest_confusion_matrix.png"
    plot_confusion_matrix(y, y_pred,title,save)
    lvltrace.lvltrace("LVLSortie dans randomforest")
def run_model(X_test, X_train, y_test, y_train, prob_threshold = 20, layers = 5, nodes = 64, dropout = 50):
    
    print "run_model RUNNING"
    # Grab the model 
    model = get_model(X_test, layers =layers, dropout = dropout)
    model.fit(X_train, y_train, nb_epoch=20, batch_size=16, verbose = 0)

    # Get the training and test predictions from our model fit. 
    train_predictions  = model.predict_proba(X_train)
    test_predictions = model.predict_proba(X_test)
    # Set these to either 0 or 1 based off the probability threshold we 
    # passed in (divide by 100 because we passed in integers). 
    train_preds = (train_predictions) >= prob_threshold / 100.0
    test_preds = (test_predictions) >= prob_threshold / 100.0

    # Calculate precision, recall, and accuracy for the train and test sets.
    precision_score_train = precision_score(y_train, train_preds)
    precision_score_test = precision_score(y_test, test_preds)
    acc_train = accuracy_score(y_train, train_preds)
    acc_test = accuracy_score(y_test, test_preds)

    recall_score_train = recall_score(y_train, train_preds)
    recall_score_test = recall_score(y_test, test_preds)

    return precision_score_train, precision_score_test, recall_score_train, recall_score_test, acc_train, acc_test, model
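# get_model() is assumed by run_model() above but not shown in this snippet. A minimal
# sketch of a compatible Keras helper (hypothetical; the layer widths, loss, and
# optimizer are assumptions, not the original implementation):
from keras.models import Sequential
from keras.layers import Dense, Dropout

def get_model(X, layers=5, nodes=64, dropout=50):
    model = Sequential()
    # First hidden layer needs the input dimension of the feature matrix.
    model.add(Dense(nodes, activation='relu', input_dim=X.shape[1]))
    model.add(Dropout(dropout / 100.0))
    # Remaining hidden layers share the same width and dropout rate.
    for _ in range(layers - 1):
        model.add(Dense(nodes, activation='relu'))
        model.add(Dropout(dropout / 100.0))
    # Single sigmoid output for the binary target used in run_model().
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam')
    return model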
 def on_epoch_end(self, batch, logs={}):
     # losses
     self.losses_train.append(self.model.evaluate(X_train, Y_train, batch_size=128,verbose =0))
     self.losses_val.append(self.model.evaluate(X_val, Y_val, batch_size=128,verbose = 0))
     
     # Roc train
     train_preds = self.model.predict_proba(X_train, verbose=0)
     train_preds = train_preds[:, 1]
     roc_train = metrics.roc_auc_score(y_train, train_preds)
     self.roc_train.append(roc_train)
     
     # Roc val
     val_preds = self.model.predict_proba(X_val, verbose=0)
     val_preds = val_preds[:, 1]
     roc_val = metrics.roc_auc_score(y_val, val_preds)
     self.roc_val.append(roc_val)
     
     # Metrics train
     y_preds = self.model.predict_classes(X_train,verbose = 0)
     self.f1_train.append(metrics.f1_score(y_train,y_preds))
     self.recal_train.append(metrics.recall_score(y_train,y_preds))
     self.preci_train.append(metrics.precision_score(y_train,y_preds))
     
     # Metrics val
     y_preds = self.model.predict_classes(X_val,verbose =0)
     self.f1_val.append(metrics.f1_score(y_val,y_preds))
     self.recal_val.append(metrics.recall_score(y_val,y_preds))
     self.preci_val.append(metrics.precision_score(y_val,y_preds))
def learning_curve_mod(data, labels, clf, percents, d=100, avg=3, test_size=.2):
    """
    This method calculates the performance of the training and cross validation test set as the training
    set size increases and returns the performance at each percent

    Args:
        :param data: (nd.array) The raw data to use for training and cross validation testing
        :param labels: (nd.array) the labels associated with the data
        :param clf: (sklearn classifier) the classifier to be used for training
        :param percents: (nd.array) a list of percent of training data to use
        :param d:  (int) The number of principal components to calculate
        :param avg: (int) The number of iterations to average when calculating performance
        :param test_size: (double [0,1]) The size of the testing set

    Return:
        :return: train_accuracies (list) performance on the training set
        :return: test_accuracies (list) performance on the testing set
    """
    # split into train and testing dataset
    x_train, x_test, y_train, y_test = train_test_split(data.T, labels, test_size=test_size, random_state=0)
    x_test = x_test.T
    train_accuracies = []
    test_accuracies = []
    for percent in percents:
        temp_train_accuracies = []
        temp_test_accuracies = []
        print percent
        for i in range(0, avg):
            x_train_2, x_test_2, y_train_2, y_test_2 = train_test_split(x_train, y_train, test_size=percent)
            x_train_2 = x_train_2.T

            # Subtract off the mean
            mean_face = np.mean(x_train_2, axis=1)
            x_train_2 = x_train_2 - mean_face

            # Find low dimensional subspace using PCA
            pca = PCA(n_components=d)
            pca.fit(x_train_2)
            model = pca.transform(x_train_2)

            # Project the known faces onto the face space
            label_map = np.dot(x_train_2.T, model)

            # Train the classifier on the projected training faces
            clf.fit(label_map, y_train_2)

            # project the unknown faces onto face space
            W_train = np.dot(x_train_2.T - mean_face.T, model)
            W_test = np.dot(x_test.T - mean_face.T, model)


            test_prediction = clf.predict(W_test)
            temp_test_accuracies.append(metrics.precision_score(y_test, test_prediction))
            train_prediction = clf.predict(W_train)
            temp_train_accuracies.append(metrics.precision_score(y_train_2, train_prediction))

        train_accuracies.append(np.mean(temp_train_accuracies))
        test_accuracies.append(np.mean(temp_test_accuracies))

    return train_accuracies, test_accuracies
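# A hedged usage sketch for learning_curve_mod(), assuming `faces` is an array of
# shape (n_pixels, n_images) and `ids` holds one label per image (both names are
# hypothetical, not from the original code):
from sklearn.neighbors import KNeighborsClassifier

percents = [0.9, 0.7, 0.5, 0.3, 0.1]          # inner-split sizes to sweep
knn = KNeighborsClassifier(n_neighbors=3)
train_acc, test_acc = learning_curve_mod(faces, ids, knn, percents, d=50, avg=3)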
def main():
	f = open("me.stdout", "r").read()

	print f
	
	(confusionMatrix, labels, ytrue, ypred, trueCount) = readConfusionMatrix.readText(f)
	for row in confusionMatrix:
		print row

	precisionMicro = np.float(metrics.precision_score(ytrue, ypred, average="micro"))
	recallMicro = np.float(metrics.recall_score(ytrue, ypred, average="micro"))
	f1Micro = np.float(metrics.f1_score(ytrue, ypred, average="micro"))
	f1Macro = np.float(metrics.f1_score(ytrue, ypred, pos_label=1, average="macro"))
	precisionMacro = np.float(metrics.precision_score(ytrue, ypred, average="macro"))
	recallMacro = np.float(metrics.recall_score(ytrue, ypred, average="macro"))

	mConf = metrics.confusion_matrix(ytrue, ypred)
	print mConf

	print labels
	print len(ytrue)
	print len(ypred)
	print trueCount

	print metrics.accuracy_score(ytrue, ypred)

	print precisionMicro
	print recallMicro
	print f1Micro
	print f1Macro
	print precisionMacro
	print recallMacro
Example #8
def calculate_f1_metrics(all_predicted, all_targets):
    first_class = first_meaningful_entity
    class_count = len(set(all_targets))
    filtered_true, filtered_predicted = [], []

    for i in range(len(all_targets)):
        if all_targets[i] > 0:
            filtered_true.append(all_targets[i])
            filtered_predicted.append(all_predicted[i])

    precision_separate_scores = metrics.precision_score(filtered_true, filtered_predicted,
                                                        labels=[i for i in range(first_class, class_count)],
                                                        average=None)
    precision_score = metrics.precision_score(filtered_true, filtered_predicted,
                                              labels=[i for i in range(first_class, class_count)], average='micro')
    recall_separate_scores = metrics.recall_score(filtered_true, filtered_predicted,
                                                  labels=[i for i in range(first_class, class_count)], average=None)
    recall_score = metrics.recall_score(filtered_true, filtered_predicted,
                                        labels=[i for i in range(first_class, class_count)], average='micro')
    f1_separate_scores = metrics.f1_score(filtered_true, filtered_predicted,
                                          labels=[i for i in range(first_class, class_count)], average=None)
    f1_score = metrics.f1_score(filtered_true, filtered_predicted,
                                labels=[i for i in range(first_class, class_count)], average='micro')

    return f1_separate_scores, f1_score, precision_separate_scores, precision_score, recall_separate_scores, recall_score
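# A hedged usage sketch for calculate_f1_metrics(); first_meaningful_entity is a
# module-level constant in the original code, assumed here to mark the first
# non-padding class id (the toy data below is made up):
first_meaningful_entity = 1
predicted = [0, 1, 2, 1, 2, 0]
targets = [0, 1, 2, 2, 2, 0]
f1_per_class, f1_micro, p_per_class, p_micro, r_per_class, r_micro = \
    calculate_f1_metrics(predicted, targets)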
Example #9
def stratified_k_fold(clf,features,labels):
    skf = StratifiedKFold( labels, n_folds=3 )
    precisions = []
    recalls = []
    for train_idx, test_idx in skf:
        features_train = []
        features_test  = []
        labels_train   = []
        labels_test    = []
        for ii in train_idx:
            features_train.append( features[ii] )
            labels_train.append( labels[ii] )
        for jj in test_idx:
            features_test.append( features[jj] )
            labels_test.append( labels[jj] )

        ### fit the classifier using training set, and test on test set
        clf.fit(features_train, labels_train)
        pred = clf.predict(features_test)


        ### for each fold, print some metrics
        print
        print "precision score: ", precision_score( labels_test, pred )
        print "recall score: ", recall_score( labels_test, pred )

        precisions.append( precision_score(labels_test, pred) )
        recalls.append( recall_score(labels_test, pred) )

    ### aggregate precision and recall over all folds
    print "average precision: ", sum(precisions)/2.
    print "average recall: ", sum(recalls)/2.
Example #10
def evaluate(ytest, ypred, filename='metrics.txt'):
    true_result = [1 if item > 0.5 else 0 for item in ytest]
    pred_result = [1 if item > 0.5 else 0 for item in ypred]
    
    cm = confusion_matrix(true_result, pred_result)
    print('\nConfusion matrix:')
    print(cm)
    print("\nLoss classified as loss", cm[0][0])
    print("Wins classified as wins", cm[1][1])
    print("Wins classified as loss", cm[1][0])
    print("Loss classified as wins", cm[0][1])
    print('\nAccuracy:\t', accuracy_score(true_result, pred_result))
    print('Precision:\t', precision_score(true_result, pred_result))
    print('Recall: \t', recall_score(true_result, pred_result))
    print('F1 score:\t', f1_score(true_result, pred_result))
    print('Mean absolute error:\t', mean_absolute_error(ytest, ypred))
    
    # print to file
    print("Loss classified as loss", cm[0][0], file=open(filename, "a"))
    print("Wins classified as wins", cm[1][1], file=open(filename, "a"))
    print("Wins classified as loss", cm[1][0], file=open(filename, "a"))
    print("Loss classified as wins", cm[0][1], file=open(filename, "a"))
    print('\nAccuracy:\t', accuracy_score(true_result, pred_result), file=open(filename, "a"))
    print('Precision:\t', precision_score(true_result, pred_result), file=open(filename, "a"))
    print('Recall: \t', recall_score(true_result, pred_result), file=open(filename, "a"))
    print('F1 score:\t', f1_score(true_result, pred_result), file=open(filename, "a"))
    print('Mean absolute error:\t', mean_absolute_error(ytest, ypred), file=open(filename, "a"))
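# A hedged usage sketch for evaluate(), assuming continuous scores in [0, 1]
# (the values below are made up):
ytest = [0, 1, 1, 0, 1]
ypred = [0.2, 0.8, 0.4, 0.1, 0.9]   # hypothetical predicted probabilities
evaluate(ytest, ypred, filename='metrics.txt')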
Example #11
	def do_cv(self):
		from sklearn.metrics import precision_score


		data  = util.get_x_y_for_cv()
		X = data['x']
		y = data['y']['Survived']
		skf = StratifiedKFold(y, n_folds=5)
		e_test = []
		e_train = []
		for train_idx, test_idx in skf:
			train_x = X.iloc[train_idx]
			train_y = y.iloc[train_idx]

			test_x = X.iloc[test_idx]
			test_y = y.iloc[test_idx]

			self.model.fit(train_x, train_y)
			yhat = self.model.predict(test_x)
			e_test.append(precision_score(test_y, yhat))



			yhat_train = self.model.predict(train_x)
			e_train.append(precision_score(train_y, yhat_train))


		print np.mean(e_train)
		print np.mean(e_test)
Example #12
def predictSVD(svd, row, column, d):
    # start = timeit.default_timer()
    u = svd[0] #clf.components_ 
    s = svd[1] #clf.explained_variance_
    vt = svd[2] #clf.fit_transform(X)
    # print "   fitting done.";
    # stop = timeit.default_timer()
    # print "   runtime: " + str(stop - start)
    # print "d:"
    # print d

    # matrixY = clf.components_ 
    probsY = []
    # print "dot products:"
    for i in range(len(row)):
        # print np.dot(u[:,column[i]], v[row[i],:])
        prob = np.sum(u[column[i],:]*s*vt[:,row[i]])
        if(prob < 0): prob = 0
        if(prob > 1): prob = 1
        probsY.append(prob)

    probsY = np.array(probsY)
    preds = np.zeros(shape=len(probsY))
    preds[probsY >= 0.5] = 1

    print "Precision"
    print precision_score(d, preds)
    print "Recall"
    print recall_score(d, preds)
    print "F-Score"
    print f1_score(d, preds)

    return probsY, preds
    def trainModel(self,folds):
        
        kf = cross_validation.StratifiedKFold(self.y_total,n_folds=folds,shuffle=True,random_state=random.randint(1,100))

        for (train_index,test_index) in (kf):
          
            self.X_train = [self.X_total[i] for i in train_index]
            self.X_test = [self.X_total[i] for i in test_index] 
            self.y_train = [self.y_total[i] for i in train_index]
            self.y_test = [self.y_total[i] for i in test_index] 

            print "################"
            print "Original"
            print np.array(self.y_test)
            print "################"
            self.clf = self.clf.fit(self.X_train,self.y_train)
            print "Predicted"
            y_pred = self.clf.predict(self.X_test)
            print y_pred
            print "################"
            print "Evaluation\n"           
            cm = confusion_matrix(self.y_test,y_pred)            
            print cm
            print "Precision Score:"
            print precision_score(self.y_test,y_pred,average="macro")
            print "Recall Score:"
            print recall_score(self.y_test,y_pred,average="macro") 
            print "Accuracy Score:"
            print accuracy_score(self.y_test,y_pred)
def nearest_centroid(input_file,Output,test_size):
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print X_train.shape, X_test.shape
    clf = NearestCentroid()
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print "Nearest Centroid Classifier "
    print "classification accuracy:", metrics.accuracy_score(y_test, y_pred)
    print "precision:", metrics.precision_score(y_test, y_pred)
    print "recall:", metrics.recall_score(y_test, y_pred)
    print "f1 score:", metrics.f1_score(y_test, y_pred)
    print "\n"
    results = Output+"Nearest_Centroid_metrics_test.txt"
    file = open(results, "w")
    file.write("Nearest Centroid Classifier estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y_test, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y_test, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y_test, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y_test, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y_test)):
        file.write("%f,%f,%i\n"%(y_test[n],y_pred[n],(n+1)))
    file.close()
    title = "Nearest Centroid %f"%test_size
    save = Output + "Nearest_Centroid_confusion_matrix"+"_%s.png"%test_size
    plot_confusion_matrix(y_test, y_pred,title,save)
    lvltrace.lvltrace("LVLSortie dans stochasticGD split_test")
def SVC_linear(input_file,Output,test_size):
    lvltrace.lvltrace("LVLEntree dans SVC_linear split_test")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print X_train.shape, X_test.shape
    clf=svm.SVC(kernel='linear')
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print "C-Support Vector Classifcation (with RBF linear) "
    print "y_test, y_pred, iteration"
    print "classification accuracy:", metrics.accuracy_score(y_test, y_pred)
    print "precision:", metrics.precision_score(y_test, y_pred)
    print "recall:", metrics.recall_score(y_test, y_pred)
    print "f1 score:", metrics.f1_score(y_test, y_pred)
    print "\n"
    results = Output+"SVM_Linear_Kernel_metrics_test.txt"
    file = open(results, "w")
    file.write("Support Vector Machine with Linear Kernel estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y_test, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y_test, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y_test, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y_test, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y_test)):
        file.write("%f,%f,%i\n"%(y_test[n],y_pred[n],(n+1)))
    file.close()
    title = "SVC linear %f"%test_size
    save = Output + "SVC_linear_confusion_matrix"+"_%s.png"%test_size
    plot_confusion_matrix(y_test, y_pred,title,save)
    lvltrace.lvltrace("LVLsortie dans SVC_linear split_test")
def extratreeclassifier(input_file,Output,test_size):
    lvltrace.lvltrace("LVLEntree dans extratreeclassifier split_test")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print X_train.shape, X_test.shape
    clf = ExtraTreesClassifier(n_estimators=10)
    clf.fit(X_train,y_train)
    y_pred = clf.predict(X_test)
    print "Extremely Randomized Trees"
    print "classification accuracy:", metrics.accuracy_score(y_test, y_pred)
    print "precision:", metrics.precision_score(y_test, y_pred)
    print "recall:", metrics.recall_score(y_test, y_pred)
    print "f1 score:", metrics.f1_score(y_test, y_pred)
    print "\n"
    results = Output+"_Extremely_Random_Forest_metrics_test.txt"
    file = open(results, "w")
    file.write("Extremely Random Forest Classifier estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y_test, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y_test, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y_test, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y_test, y_pred))
    file.write("\n")
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y_test)):
        file.write("%f,%f,%i\n"%(y_test[n],y_pred[n],(n+1)))
    file.close()
    title = "Extremely Randomized Trees %f"%test_size
    save = Output + "Extremely_Randomized_Trees_confusion_matrix"+"_%s.png"%test_size
    plot_confusion_matrix(y_test, y_pred,title,save)
    lvltrace.lvltrace("LVLSortie dans extratreeclassifier split_test")
def gaussianNB(input_file,Output,test_size):
    lvltrace.lvltrace("LVLEntree dans gaussianNB split_test")
    ncol=tools.file_col_coma(input_file)
    data = np.loadtxt(input_file, delimiter=',', usecols=range(ncol-1))
    X = data[:,1:]
    y = data[:,0]
    n_samples, n_features = X.shape
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size)
    print X_train.shape, X_test.shape
    # Instantiate the estimator
    clf = GaussianNB()
    # Fit the estimator to the data
    clf.fit(X_train, y_train)
    # Use the model to predict the last several labels
    y_pred = clf.predict(X_test)
    print "Gaussian Naive Bayes estimator accuracy "
    print "classification accuracy:", metrics.accuracy_score(y_test, y_pred)
    print "precision:", metrics.precision_score(y_test, y_pred)
    print "recall:", metrics.recall_score(y_test, y_pred)
    print "f1 score:", metrics.f1_score(y_test, y_pred)
    results = Output+"GaussianNB_metrics_test.txt"
    file = open(results, "w")
    file.write("Gaussian Naive Bayes estimator accuracy\n")
    file.write("Classification Accuracy Score: %f\n"%metrics.accuracy_score(y_test, y_pred))
    file.write("Precision Score: %f\n"%metrics.precision_score(y_test, y_pred))
    file.write("Recall Score: %f\n"%metrics.recall_score(y_test, y_pred))
    file.write("F1 Score: %f\n"%metrics.f1_score(y_test, y_pred))
    file.write("True Value, Predicted Value, Iteration\n")
    for n in xrange(len(y_test)):
        file.write("%f,%f,%i\n"%(y_test[n],y_pred[n],(n+1)))
    file.close()
    title = "Gaussian Naive Bayes %f"%test_size
    save = Output + "Gaussian_NB_confusion_matrix"+"_%s.png"%test_size
    plot_confusion_matrix(y_test, y_pred,title,save)
    lvltrace.lvltrace("LVLsortie dans gaussianNB split_test")
Example #18
def main():
    resize_shape = 64
    print "data is loading..."
    train_X, train_Y, test_X, test_Y = load_data(resize_shape)
    print "data is loaded"
    print "feature engineering..."
    learning_rate = 0.01
    training_iters = 100000
    batch_size = 128
    display_step = 10

    # Network Parameters
    n_input = resize_shape*resize_shape # input size (img shape: 64*64)
    n_classes = 62 # total number of classes (digits and letters)
    dropout = 0.5 # Dropout, probability to keep units

    with tf.Session() as sess:
        cnn = CNN(sess, learning_rate, training_iters, batch_size, display_step, n_input, n_classes, dropout,resize_shape)
        train_X = cnn.inference(train_X)
        test_X = cnn.inference(test_X)

    print "feature engineering is complete"

    print 'training phase'
    clf = svm.LinearSVC().fit(train_X, train_Y)
    print 'test phase'
    predicts = clf.predict(test_X)

    # measure function
    print 'measure phase'
    print confusion_matrix(test_Y, predicts)
    print f1_score(test_Y, predicts, average=None)
    print precision_score(test_Y, predicts, average=None)
    print recall_score(test_Y, predicts, average=None)
    print accuracy_score(test_Y, predicts)
Example #19
 def applyClassifier(self, clf, name, training_set, testing_set, y_train, y_test):
     print("\nMODEL " + name)
     
     t0 = time()
     classifier = clf.fit(training_set, y_train)
     train_time = time() - t0
     print("train time: %0.3fs" % train_time)
     
     t0 = time()       
     y_nb_predicted = classifier.predict(testing_set)
     test_time = time() - t0
     print("test time:  %0.3fs" % test_time)
     
     precision = metrics.precision_score(y_test, y_nb_predicted)
     recall = metrics.recall_score(y_test, y_nb_predicted)
     f1_score = metrics.f1_score(y_test, y_nb_predicted)
     accuracy = metrics.accuracy_score(y_test, y_nb_predicted)
     micro_recall = metrics.recall_score(y_test, y_nb_predicted, average="micro")
     macro_recall = metrics.recall_score(y_test, y_nb_predicted, average="macro")
     micro_precision = metrics.precision_score(y_test, y_nb_predicted, average="micro")
     macro_precision = metrics.precision_score(y_test, y_nb_predicted, average="macro")        
     print 'The precision for this classifier is ' + str(precision)
     print 'The micro averaged precision for this classifier is ' + str(micro_precision)
     print 'The macro averaged precision for this classifier is ' + str(macro_precision)
     print 'The recall for this classifier is ' + str(recall)
     print 'The micro averaged recall for this classifier is ' + str(micro_recall)
     print 'The macro averaged recall for this classifier is ' + str(macro_recall)        
     print 'The f1 for this classifier is ' + str(f1_score)
     print 'The accuracy for this classifier is ' + str(accuracy) 
     
     return name, accuracy, precision, recall, micro_precision, micro_recall, macro_precision, macro_recall, train_time, test_time
Example #20
def cross_val(data_x, data_y, classifier, kFold, b_cost=1, h_cost=1, w=0.5):
    e_h, e_b = 0, 0
    y_tests, pred_probas = [], []
    
    for train_index, test_index in kFold:
        data_x_, data_y_ = np.array(data_x), np.array(data_y)
        X_train, X_test = list(data_x_[train_index]), list(data_x_[test_index])
        y_train, y_test = list(data_y_[train_index]), list(data_y_[test_index])
        classifier.fit(X_train, y_train)
        pred_proba = [r[0] for r in classifier.predict_proba(X_test)]
        y_tests += y_test
        pred_probas += pred_proba
    
    predictions = [0 if p*b_cost > (1-p)*h_cost else 1 for p in pred_probas]
    roc_auc = roc_auc_score(y_tests, pred_probas)
    total_acc = accuracy_score(y_tests, predictions)
    precision, recall, thresholds = precision_recall_curve(y_tests, pred_probas, pos_label=0)
    fpr, tpr, thresholds = roc_curve(y_tests, pred_probas, pos_label=0)
    precision_bots = precision_score(y_tests, predictions, pos_label = 0)
    precision_humans = precision_score(y_tests, predictions, pos_label = 1)
    recall_bots = recall_score(y_tests, predictions, pos_label = 0)
    recall_humans = recall_score(y_tests, predictions, pos_label = 1)
    f1_bots = f1_score(y_tests, predictions, pos_label = 0)
    f1_humans = f1_score(y_tests, predictions, pos_label = 1)
    conf_matrix = np.matrix(list(confusion_matrix(y_tests, predictions)))
    
    #plot_curve(fpr, tpr, 'ROC', w)
    #plot_curve(recall, precision, 'PR', w)
    
    return [total_acc, precision_bots, precision_humans, recall_bots, recall_humans, f1_bots, f1_humans, roc_auc, conf_matrix]
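# A hedged usage sketch for cross_val(); kFold must be an iterable of
# (train_index, test_index) pairs, e.g. from sklearn's KFold (the toy data and
# classifier below are assumptions):
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold

data_x, data_y = make_classification(n_samples=200, random_state=0)
folds = KFold(n_splits=5, shuffle=True, random_state=0).split(data_x)
results = cross_val(list(data_x), list(data_y), LogisticRegression(), folds)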
Example #21
File: chroma.py  Project: agangzz/dl4mir
def score(y_true, y_pred):
    precision_weighted = metrics.precision_score(
        y_true, y_pred, average='weighted')
    precision_ave = np.mean(metrics.precision_score(
        y_true, y_pred, average=None)[::12])

    recall_weighted = metrics.recall_score(
        y_true, y_pred, average='weighted')
    recall_ave = np.mean(metrics.recall_score(
        y_true, y_pred, average=None)[::12])

    f1_weighted = metrics.f1_score(
        y_true, y_pred, average='weighted')
    f1_ave = np.mean(metrics.f1_score(
        y_true, y_pred, average=None)[::12])

    stat_line = "  Precision: %0.4f\t Recall: %0.4f\tf1: %0.4f"
    res1 = "Weighted: " + stat_line % (100*precision_weighted,
                                       100*recall_weighted,
                                       100*f1_weighted)

    res2 = "Averaged: " + stat_line % (100*precision_ave,
                                       100*recall_ave,
                                       100*f1_ave)
    res3 = "-"*72
    outputs = [res3, res1, res2, res3]
    return "\n".join(outputs)
Example #22
def confusion_matrix(true_y, pred_y, labels):
    c_matrix = metrics.confusion_matrix(true_y, pred_y)

    confusion_table = []
    first_row = ["C.Matrix"] + labels + ["ACTUAL"] + ["RECALL"]
    confusion_table.append(first_row)

    recall = metrics.recall_score(true_y, pred_y, average=None)
    for r, row in enumerate(c_matrix):
        new_row = [labels[r]]
        new_row.extend(row)
        new_row.append(sum(row))
        new_row.append(recall[r])
        confusion_table.append(new_row)

    new_row = ["PREDICTED"]
    for l in labels:
        new_row.append(len([t for t in pred_y if t == l]))
    new_row.append(len(true_y))
    new_row.append(metrics.recall_score(true_y, pred_y, average='macro'))
    confusion_table.append(new_row)

    new_row = ["PRECISION"]
    new_row.extend(metrics.precision_score(true_y, pred_y, average=None))
    new_row.append(metrics.precision_score(true_y, pred_y, average='macro'))
    new_row.append(metrics.f1_score(true_y, pred_y, average='macro'))
    confusion_table.append(new_row)

    confusion_table = pd.DataFrame(confusion_table)
    return confusion_table
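# A hedged usage sketch for the tabular confusion_matrix() helper above; `labels`
# must list the classes in the order sklearn sorts them (the toy labels are made up):
true_y = ['cat', 'dog', 'dog', 'cat', 'dog']
pred_y = ['cat', 'dog', 'cat', 'cat', 'dog']
table = confusion_matrix(true_y, pred_y, labels=['cat', 'dog'])
print(table)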
Example #23
def _clf_mlp(trX,teX,trY,teY):
	print "MLP"
	print trX.shape,"trX shape"
	print "Enter Layer for MLP"
	layer=input()
	# print "enter delIdx"
	# delIdx=input()
	# while(delIdx):
	# 	trX=np.delete(trX,-1,axis=0)
	# 	trY=np.delete(trY,-1,axis=0)
	# 	delIdx=delIdx-1
	print "factors",factors(trX.shape[0])	
	teY=teY.astype(np.int32)
	trY=trY.astype(np.int32)
	print trX.shape,"trX shape"
	print "enter no of mini batch"
	mini_batch=int(input())
	mlp = TfMultiLayerPerceptron(eta=0.01, 
                             epochs=100, 
                             hidden_layers=layer,
                             activations=['relu' for i in range(len(layer))],
                             print_progress=3, 
                             minibatches=mini_batch, 
                             optimizer='adam',
                             random_seed=1)
	mlp.fit(trX,trY)
	pred=mlp.predict(teX)
	print _f_count(teY),"test f count"
	pred=pred.astype(np.int32)
	print _f_count(pred),"pred f count"
	conf_mat=confusion_matrix(teY, pred)
	process_cm(conf_mat, to_print=True)
	print precision_score(teY,pred),"Precision Score"
	print recall_score(teY,pred),"Recall Score"
	print roc_auc_score(teY,pred), "ROC_AUC"
Example #24
    def get_scores(self, **kwargs):
        
        """
        Calculate scores.
        
        """
        
        data = kwargs['data']
        true_values = np.array(data['targets'])
        predicted_values = kwargs['predicted_values']
        le = kwargs["data_args"]["LabelEncoder"]

        out_args = {}
        scores = []

        sc = accuracy_score (true_values, predicted_values)
        score = {}
        score['name'] = 'Accuracy'
        score['value'] = sc
        scores.append(score)        
        
        sc = f1_score(true_values, predicted_values, average='weighted')
        score_by_class = f1_score(true_values, predicted_values, average=None)        
        score = {}
        score['name'] = 'F1 score'
        score['summary_name'] = 'Weighted average F1 score'
        score['summary_value'] = sc
        score['class_wise'] = {}
        score['class_wise']['names'] = list(le.classes_)
        score['class_wise']['values'] = list(score_by_class)
        scores.append(score)
        
        sc = precision_score(true_values, predicted_values, average='weighted')
        score_by_class = precision_score (true_values, predicted_values, average=None)        
        score = {}
        score['name'] = 'Precision'
        score['summary_name'] = 'Weighted average precision score'
        score['summary_value'] = sc
        score['class_wise'] = {}
        score['class_wise']['names'] = list(le.classes_)
        score['class_wise']['values'] = list(score_by_class)
        scores.append(score)
        
        sc = recall_score(true_values, predicted_values, average='weighted')
        score_by_class = recall_score(true_values, predicted_values, average=None)
        score = {}
        score['name'] = 'Recall'
        score['summary_name'] = 'Weighted average recall score'
        score['summary_value'] = sc
        score['class_wise'] = {}
        score['class_wise']['names'] = list(le.classes_)
        score['class_wise']['values'] = list(score_by_class)
        scores.append(score)
        
        scores_out = {}
        scores_out["scores"] = scores
        scores_out["schema_version"] = "0.02"

        return scores_out, out_args
Example #25
def gradient_boosting(X,y, nf = 2, lr = .1, ne = 100):
    col_names = X.columns
    y = y.astype(float)
    Xs = X.astype(float)
    Xs_t, Xs_holdout, y_t, y_holdout = train_test_split(Xs, y, train_size=.8)
    Xs_t = Xs_t.set_index([range(len(Xs_t))])
    Xs_holdout = Xs_holdout.set_index([range(len(Xs_holdout))])
    y_t = pd.DataFrame(y_t).set_index([range(len(y_t))])
    y_holdout = pd.DataFrame(y_holdout).set_index([range(len(y_holdout))])

    kf = KFold(len(Xs_t), nf)

    output_table = []
    precisions = []
    accuracies = []
    F1s = []
    fold_count = 1
    for train_index, test_index in kf:
        results = []
        Xs_train, Xs_test = Xs_t.iloc[train_index,:], Xs_t.iloc[test_index,:]
        y_train, y_test = y_t.iloc[train_index,:], y_t.iloc[test_index,:]
        y_train = np.array(y_train)
        y_test = np.array(y_test)
        Gboost = GradientBoostingRegressor(learning_rate=lr, loss='ls', n_estimators=ne)
        Gboost.fit(Xs_train, y_train)
        pred = Gboost.predict(Xs_test)
        pred = np.array(pred)
        pred = pred.round()
        output_table.append(' ')
        output_table.append("Fold "+ str(fold_count) + ':')
        output_table.append("Precision Score: "+str(precision_score(pred, y_test)))
        output_table.append("Accuracy Score: "+ str(accuracy_score(pred, y_test)))
        output_table.append("F1 Score: "+str(f1_score(pred, y_test)))
        precisions.append(precision_score(pred, y_test))
        accuracies.append(accuracy_score(pred, y_test))
        F1s.append(f1_score(pred, y_test))
        fold_count += 1
    pred_holdout = Gboost.predict(Xs_holdout)
    pred_holdout = np.array(pred_holdout)
    pred_holdout = pred_holdout.round()
    cm = confusion_matrix(y_holdout, pred_holdout)
    TN = cm[0][0]
    FP = cm[0][1]
    FN = cm[1][0]
    TP = cm[1][1]
    print "Mean Precision: ", np.mean(precisions)
    print "Mean F1s: ", np.mean(F1s)
    print "True Positive Rate (Sensitivity): ", TP*1./(TP+FN)#cm[1][1]*1./(cm[1][1]+cm[0][1])
    print "True Negative Rate (Specificity): ", TN*1./(TN+FP)#cm[0][0]*1./(cm[0][0]+cm[1][0])
    print "Precision: ", TP*1./(TP+FP), #precision_score(pred_holdout, y_holdout)
    print "Accuracy: ", (TP+TN)*1./(TP+TN+FP+FN), #accuracy_score(pred_holdout, y_holdout)
    indices = np.argsort(Gboost.feature_importances_)
    figure = plt.figure(figsize=(10,7))
    plt.barh(np.arange(len(col_names)), Gboost.feature_importances_[indices],
             align='center', alpha=.5)
    plt.yticks(np.arange(len(col_names)), np.array(col_names)[indices], fontsize=14)
    plt.xticks(fontsize=14)
    _ = plt.xlabel('Relative importance', fontsize=18)
    return Gboost
def DTree(X, Y, XTest, YTest):
    print '-----------------------------------------------------'
    # dot_data = StringIO()
    # tree.export_graphviz(dtree_model, out_file=dot_data)
    # graph = pydot.graph_from_dot_data(dot_data.getvalue())
    # graph.write_pdf("../dtree.pdf")

    # param_grid = {'max_depth': np.arange(1, 15)}

    # tree_grid = GridSearchCV(DecisionTreeClassifier(), param_grid)
    tree_grid = DecisionTreeClassifier(max_depth=3)
    tree_grid.fit(X, Y)
    dot_data = StringIO()  # StringIO import assumed from the original module
    export_graphviz(tree_grid, out_file=dot_data)
    graph = pydot.graph_from_dot_data(dot_data.getvalue())
    graph.write_pdf("dtreevis.pdf")

    # print("The best parameters are %s with a score of %0.2f"
    #       % (tree_grid.best_params_, tree_grid.best_score_))

    print "Computing training statistics"
    dtree_predict_time_training = time.time()
    Ypred_dtree_training = tree_grid.predict(X)
    dtree_predict_time_training = time.time() - dtree_predict_time_training

    dtree_accuracy_training = metrics.accuracy_score(Y, Ypred_dtree_training)
    dt_precision_training = metrics.precision_score(Y, Ypred_dtree_training,
                                                    average='binary')
    dtree_recall_training = metrics.recall_score(Y, Ypred_dtree_training,
                                                 average='binary')

    print "DT training prediction time: " + str(dtree_predict_time_training)
    print "DT training accuracy Score: " + str(dtree_accuracy_training)
    print "DT training precision Score: " + str(dt_precision_training)
    print "DT training recall Score: " + str(dtree_recall_training)

    print "Computing testing statistics"
    dtree_predict_time_test = time.time()
    Ypred_dtree_test = tree_grid.predict(XTest)
    dtree_predict_time_test = time.time() - dtree_predict_time_test

    dtree_accuracy_test = metrics.accuracy_score(YTest, Ypred_dtree_test)
    dt_precision_test = metrics.precision_score(YTest, Ypred_dtree_test,
                                                average='binary')
    dtree_recall_test = metrics.recall_score(YTest, Ypred_dtree_test,
                                             average='binary')

    print "DT test prediction time: " + str(dtree_predict_time_test)
    print "DT test accuracy Score: " + str(dtree_accuracy_test)
    print "DT test precision Score: " + str(dt_precision_test)
    print "DT test recall Score: " + str(dtree_recall_test)

    print "Creating ROC curve"
    y_true = YTest
    y_score = tree_grid.predict_proba(XTest)
    fprSVM, trpSVM, _ = metrics.roc_curve(y_true=y_true,
                                          y_score=y_score[:, 0],
                                          pos_label=0)
    plt.plot(fprSVM, trpSVM, 'r-', label='DT')
Example #27
	def cv(self, X, y, eval_size=.33, nfold=3):
		metrics=['roc_auc','f1','recall','precision']
		Xtrain, Xeval , ytrain, yeval = train_test_split(X,y,test_size=eval_size)
		Xtrain.reset_index(drop = True)
		Xeval.reset_index(drop = True)
		ytrain.reset_index(drop = True)
		yeval.reset_index(drop = True)
		self.fit(Xtrain, ytrain)

		ypred = self.predict(Xeval)
		yprob = self.predict_proba(Xeval)

		eroc = roc_auc_score(yeval, yprob)
		ef1 = f1_score(yeval, ypred)
		erecall = recall_score(yeval, ypred)
		eprecision = precision_score(yeval, ypred)

		# print confusion_matrix(yeval, ypred, labels = [0,1])
		# eroc = roc_auc_score(yeval, yprob, sample_weight=sw)
		# ef1 = f1_score(yeval, ypred,sample_weight=sw)
		# erecall = recall_score(yeval, ypred, sample_weight=sw)
		# eprecision = precision_score(yeval, ypred, sample_weight=sw)
		escores = [eroc, ef1, erecall, eprecision]

		
		skfscores = []

		skf = StratifiedKFold(ytrain,n_folds=nfold, random_state=2016)
		for trainIndex, testIndex in skf:
			skfxtrain, skfxtest = X.loc[trainIndex,:], X.loc[testIndex,:]
			skfytrain, skfytest = y.values[trainIndex], y.values[testIndex]
			self.fit(skfxtrain, skfytrain)
			ypred = self.predict(skfxtest)
			yprob = self.predict_proba(skfxtest)

			# roc = roc_auc_score(skfytest, yprob, sample_weight=sw)
			# f1 = f1_score(skfytest, ypred, sample_weight=sw)
			# recall = recall_score(skfytest, ypred,sample_weight=sw)
			# precision = precision_score(skfytest, ypred, sample_weight=sw)

			roc = roc_auc_score(skfytest, yprob)
			f1 = f1_score(skfytest, ypred)
			recall = recall_score(skfytest, ypred)
			precision = precision_score(skfytest, ypred)

			# print confusion_matrix(skfytest, ypred, labels=[0,1])

			scores = [roc, f1, recall, precision]
			print 'cv scores:'
			print scores
			skfscores.append(scores)

		skfscores = np.array(skfscores)	
		skfscores = skfscores.mean(0)		
		
		report = pd.DataFrame({'eval': escores, 'train': skfscores}, index=metrics)

		return report
Example #28
def print_scores(model, X_train, y_train, X_test, y_test):
    """
    Compute scores for given model with training and test sets
    
    Input:
        model (sklearn.linear_model): the model with which to calculate scores
        X_train (numpy_array): training design matrix X
        y_train (numpy_array): training labels y
        X_test (numpy_array): test design matrix X
        y_test (numpy_array): test labels y
        
    Output:
        F1-score in test set
    
    Side Effects:
        prints the scores
        
    Comments:
        model must be fitted before calling this function
    
    """
    
    y_train_predicted = model.predict(X_train)
    y_test_predicted = model.predict(X_test)

    
    # accuracy scores
    print("Accuracy")
    print("Train: ", model.score(X_train,y_train))
    print("Test: ", model.score(X_test, y_test))
    print("\n")

    # use precision and recall metrics
    from sklearn.metrics import precision_score, recall_score

    precision_train = precision_score(y_train, y_train_predicted)
    recall_train = recall_score(y_train, y_train_predicted)

    precision_test = precision_score(y_test, y_test_predicted)
    recall_test = recall_score(y_test, y_test_predicted)

    print("Precision and Recall")
    print ("Train: ", precision_train, recall_train)
    print ("Test: ", precision_test, recall_test)
    print("\n")


    # F1 score
    from tilestools import F1score
    f1_train = F1score(y_train, y_train_predicted)
    f1_test = F1score(y_test, y_test_predicted)

    print("F1 score")
    print ("Train: ", f1_train)
    print ("Test: ", f1_test)

    
    return f1_test
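# A hedged usage sketch for print_scores(); the toy dataset, split, and model below
# are assumptions (and tilestools.F1score must be importable, as in the function body):
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=300, random_state=0)     # toy binary data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
model = LogisticRegression().fit(X_train, y_train)            # must be fitted first
f1_test = print_scores(model, X_train, y_train, X_test, y_test)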
Example #29
def helpfulPrediction(y_actual,X,grid_search_best,model_name):
   #use grid_search.best_estimator_ (the best parameters found) to predict
   X_train, X_test, y_actual_train, y_actual_test = train_test_split(X, y_actual, test_size=0.15, random_state=0)
   y_true, y_pred = y_actual_test['is_helpful'].values, grid_search_best.predict(X_test.iloc[:,0:len(X.columns)-2])
   print confusion_matrix(y_true, y_pred)
   print "Training precision score:"+str(precision_score(y_actual_train, grid_search_best.predict(X_train.iloc[:,0:len(X.columns)-2])))
   print "Testing precision score:"+str(precision_score(y_true,y_pred))

   pd.DataFrame({'comment_id': X_test['id_x'].values,'users_count':X_test['users_count'].values,'comments_count':X_test['comments_count'].values,'contains_image':X_test['contains_image'].values,'thanking_in_reply':X_test['gratitude_in_reply'].values,'is_response_to_question':X_test['is_response_to_question'].values,'is_response_to_image':X_test['is_response_to_image'].values,'contains_link':X_test['contains_link'].values,'contains_question':X_test['contains_question'].values,'contains_hashtag':X_test['contains_hashtag'].values,'comment_order':X_test['comment_order'].values,'word_count':X_test['word_count'].values,'polarity':X_test['polarity'].values,'subjectivity':X_test['subjectivity'].values,'body': X_test['body'].values,'y_true':y_true,'y_pred':y_pred}).to_csv(model_name+"_pred_true.csv", index=False)
Example #30
    def test_precision_score(self):
        result = self.df.metrics.precision_score(average='weighted')
        expected = metrics.precision_score(self.target, self.pred, average='weighted')
        self.assertEqual(result, expected)

        result = self.df.metrics.precision_score(average=None)
        expected = metrics.precision_score(self.target, self.pred, average=None)
        self.assertTrue(isinstance(result, pdml.ModelSeries))
        self.assert_numpy_array_almost_equal(result.values, expected)
Example #31
        num_round,
        valid_sets=[trn_data, val_data],
        verbose_eval=500,
        early_stopping_rounds=200,
        categorical_feature=cat_cols,
    )
    feat_imp_df['imp'] += clf.feature_importance() / 5
    oof_lgb[val_idx] = clf.predict(X_train.iloc[val_idx][features],
                                   num_iteration=clf.best_iteration)
    predictions_lgb[:] += clf.predict(X_test[features],
                                      num_iteration=clf.best_iteration)
print("AUC score: {}".format(roc_auc_score(y, oof_lgb)))
print("F1 score: {}".format(
    f1_score(y, [1 if i >= 0.5 else 0 for i in oof_lgb])))
print("Precision score: {}".format(
    precision_score(y, [1 if i >= 0.5 else 0 for i in oof_lgb])))
print("Recall score: {}".format(
    recall_score(y, [1 if i >= 0.5 else 0 for i in oof_lgb])))

drop_f = feat_imp_df[feat_imp_df['imp'] < 0.2]['feat']
for f in drop_f:
    if f in cat_cols:
        cat_cols.remove(f)
features = features.drop(drop_f)

print("得到最终特征共计{}维".format(len(features)))

print("开始模型一的训练与预测")
# 模型训练、预测
KF = StratifiedKFold(n_splits=5, random_state=2020, shuffle=True)
X_train[cat_cols] = X_train[cat_cols].astype('category')
# As the output shows, the model used (Random Forest) has a mean accuracy of 94% with a standard
# deviation of 0.06%, which tells us how precise the estimates are. In other words, our model's
# accuracy can vary by +/- 0.06%. After this check the accuracy is still good, so in the
# following stages we will try to improve the Random Forest's performance further.
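# The commentary above refers to a cross-validation run not shown here; a hedged
# sketch of what it presumably looked like (mirroring the Decision Tree block below):
from sklearn.model_selection import cross_val_score
scores = cross_val_score(random_forest, X_train, Y_train, cv=10, scoring="accuracy")
print("Scores: ", scores)
print("Mean: ", scores.mean())
print("Standard deviation: ", scores.std())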


# Confusion matrix for the Random Forest
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
predictions = cross_val_predict(random_forest, X_train, Y_train, cv=3)
confusion_matrix(Y_train, predictions)

# PRECISION AND RECALL (for the Random Forest)
from sklearn.metrics import precision_score, recall_score
print("Precision: ",  precision_score(Y_train, predictions))
print("Recall: ", recall_score(Y_train, predictions))


"""
# Now CROSS VALIDATION for the Decision Tree instead
from sklearn.model_selection import cross_val_score
rf = DecisionTreeClassifier()
scores = cross_val_score(rf, X_train, Y_train, cv=10, scoring = "accuracy")
print("Scores: ", scores)
print("Mean: ", scores.mean())
print("Standard deviation: ", scores.std())

# We get roughly the same standard deviation as the Random Forest (0.04%), while the mean
# accuracy is about 96%, so it is much more accurate than the Random Forest.
"""
sorted_positive_features = np.argsort(positive_features)[::-1]

print("Top 20 Important Features and their log probabilities For Negative Class :\n\n")
for i in list(sorted_negative_features[0:20]):
    print("%s\t -->\t%f  "%(feature_names[i],negative_features[i]))
    
print("\n\nTop 20 Important Features and their log probabilities For Positive Class :\n\n")
for i in list(sorted_positive_features[0:20]):
    print("%s\t -->\t%f  "%(feature_names[i],positive_features[i]))
    
# evaluate accuracy
acc = accuracy_score(Y_test, predictions) * 100
print('\nThe Test Accuracy of the Bernoulli naive Bayes classifier for alpha = %.3f is %f%%' % (optimal_alpha, acc))

# evaluate precision
acc = precision_score(Y_test, predictions, pos_label = 'positive') 
print('\nThe Test Precision of the Bernoulli naive Bayes classifier for alpha = %.3f is %f' % (optimal_alpha, acc))

# evaluate recall
acc = recall_score(Y_test, predictions, pos_label = 'positive')
print('\nThe Test Recall of the Bernoulli naive Bayes classifier for alpha = %.3f is %f' % (optimal_alpha, acc))

# evaluate f1-score
acc = f1_score(Y_test, predictions, pos_label = 'positive')
print('\nThe Test F1-Score of the Bernoulli naive Bayes classifier for alpha = %.3f is %f' % (optimal_alpha, acc))

# Evaluate TPR , FPR , TNR , FNR
TrueNeg, FalsePos, FalseNeg, TruePos = confusion_matrix(Y_test, predictions).ravel()

# Evaluate TPR (TPR = TP/(FN+TP))
TPR = TruePos/(FalseNeg + TruePos)
Example #34
res = confusion_matrix(y_train_5, y_train_pred)
print(res)
'''
#p.87
print(res)
[[53124  1455] # images other than 5
 [  949  4472]] # images of 5
[[TN, FP]
[FN, TP]]
'''
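# The layout above can also be unpacked directly; for a binary problem sklearn's
# ravel() order is tn, fp, fn, tp:
tn, fp, fn, tp = confusion_matrix(y_train_5, y_train_pred).ravel()
print("TN:", tn, "FP:", fp, "FN:", fn, "TP:", tp)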

#Precision and Recall (p.88)
#F-value: the harmonic mean of precision and recall
from sklearn.metrics import precision_score, recall_score, f1_score

precision_rate = precision_score(y_train_5, y_train_pred)
recall_rate = recall_score(y_train_5, y_train_pred)
f_value = f1_score(y_train_5, y_train_pred)

# precision: 0.754513244474439  recall: 0.8249400479616307  f-value: 0.7881565033486078
print('precision:', precision_rate, ' recall:', recall_rate, ' f-value:',
      f_value)

#Decide on a decision threshold so that precision (or recall) fits the project's needs (precision and recall trade off against each other).
#For example, to raise precision:

#y_score = cross_val_predict(sgd_clf, X_train, y_train_5, cv=3, method="decision_function") #decision scores
'''
y_score_Threshold = 10000
y_train_pred_90 = (y_score > y_score_Threshold)
print(y_score, max(y_score), min(y_score))
'''
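# A hedged sketch of picking a threshold for a target precision (e.g. 90%), assuming
# y_score holds decision_function scores as in the commented-out line above:
import numpy as np
from sklearn.metrics import precision_recall_curve

precisions, recalls, thresholds = precision_recall_curve(y_train_5, y_score)
threshold_90 = thresholds[np.argmax(precisions >= 0.90)]   # first threshold reaching 90% precision
y_train_pred_90 = (y_score >= threshold_90)
print('precision:', precision_score(y_train_5, y_train_pred_90),
      ' recall:', recall_score(y_train_5, y_train_pred_90))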
def modelselection(X, y, modelname, featureExtractionVector):
    dictionarylist = {}
    accuracyscores = []
    precisionscores = []
    recallscores = []
    TP = []
    FP = []
    TN = []
    FN = []
    f1scores = []
    for modelname in modelname:
        if modelname == 'Naive Bayes':
            model = naive_bayes.MultinomialNB()
        if modelname == 'Logistic Regression':
            model = LogisticRegression(C=1., solver = 'lbfgs')
        if modelname == 'SVM':
            model = svm.LinearSVC()
        if modelname == 'K-NN':
            model = KNeighborsClassifier()
        if modelname == 'AdaBoost':
            model = AdaBoostClassifier()
        if modelname == 'Random Forest Classifier':
            model = RandomForestClassifier(n_estimators=100)
        if modelname == 'Gradient Boosting Classifier':
            model = GradientBoostingClassifier()

        kf = KFold(n_splits=10)
        for train_index, test_index in kf.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            model.fit(X_train, y_train)
            predict = model.predict(X_test)
            accuracyscore = accuracy_score(y_test, predict)
            accuracyscores.append(accuracyscore)
            precisionscores.append(precision_score(y_test, predict))
            recallscores.append(recall_score(y_test, predict))
            TP.append(confusion_matrix(y_test, predict)[1][1])
            FP.append(confusion_matrix(y_test, predict)[0][1])
            TN.append(confusion_matrix(y_test, predict)[0][0])
            FN.append(confusion_matrix(y_test, predict)[1][0])
            f1score = f1_score(y_test, predict)
            f1scores.append(f1score)
            if modelname != 'SVM':
                y_pred_prob = model.predict_proba(X_test)[:, 1]
                fpr, tpr, threshold = roc_curve(y_test, y_pred_prob)
                roc_auc = roc_auc_score(y_test, y_pred_prob)
                # plot roc curves (only for models that expose predict_proba)
                plotroccurves(modelname, y_pred_prob,
                              fpr, tpr, roc_auc, predict)

        # Convert the array to list and store average
        accuracyscores = np.asarray(accuracyscores)
        TP = np.asarray(TP)
        FP = np.asarray(FP)
        TN = np.asarray(TN)
        FN = np.asarray(FN)
        f1scores = np.asarray(f1scores)
        precisionscores = np.asarray(precisionscores)
        recallscores = np.asarray(recallscores)
        print(
            f"The average accuracy score for training dataset length: {len(train_index)} for {modelname}:")
        print("%0.6f (+/- %0.6f)" %
              (accuracyscores.mean(), accuracyscores.std() * 2))
        # accuracyscores = []
        print('TP = ', int(TP.mean()))
        print('FP = ', int(FP.mean()))
        print('TN = ', int(TN.mean()))
        print('FN = ', int(FN.mean()))
        print('F1 Score =', f1scores.mean())
        print('The precision score = ', precisionscores.mean())
        print('The recall score = ', recallscores.mean())
        dictionary = dict()
        dictionary = ({modelname: {'Accuracy score': accuracyscores.mean(), 'f1score': f1scores.mean(), 'precision': precisionscores.mean(),
                                   'recall': recallscores.mean(), 'true positive': int(TP.mean()), 'true negative': int(TN.mean()), 'false positive': int(FP.mean()), 'false negative': int(FN.mean())}})
        dictionarylist.update(dictionary)
        accuracyscores = []
        TP = []
        FP = []
        TN = []
        FN = []
        f1scores = []
        precisionscores = []
        recallscores = []
        # The code below is used to enter a query to check the sentiment. Query is a sentence.
        '''
        if modelname == 'Naive Bayes':
                model = MultinomialNB()
                model.fit(X, y)
                while(1):

                    inputtext = input('Enter a string; "exit" to exit.\n')
                    if inputtext == 'exit':
                        exit(0)
                    inputtext = preprocessdata(inputtext)
                    inputtext = stem_sentences(inputtext)
                    inputtext = [inputtext]
                    inputvector = featureExtractionTest(
                        inputtext, featureExtractionVector)
                    predict = model.predict(inputvector)
                # Converting input string to Array for the vectorizer.

                # for i in range(len(inputtext)):

                #     inputtext[i] = preprocessdata(inputtext[i])
                #     inputtext[i] = stem_sentences(inputtext[i])
                #     inputvector[i] = featureExtractionTest(
                #         inputtext[i], featureExtractionVector)
                #     predict = model.predict(inputvector[i])
                #     # inputvector.clear()

                    # print(predict)
                    if predict == 1:
                        print('The sentiment score is positive')
                    elif predict == 0:
                        print('The sentiment score is negative')
                    else:
                        print('Error: Predict is: ', predict)
        '''
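        # A minimal, cleaned-up sketch of the interactive query loop above, assuming
        # preprocessdata, stem_sentences, featureExtractionTest and featureExtractionVector
        # behave as they are used elsewhere in this script. Kept commented out so it does
        # not run inside the model loop:
        #
        # model = MultinomialNB()
        # model.fit(X, y)
        # while True:
        #     inputtext = input('Enter a string; "exit" to exit.\n')
        #     if inputtext == 'exit':
        #         break
        #     inputtext = stem_sentences(preprocessdata(inputtext))
        #     inputvector = featureExtractionTest([inputtext], featureExtractionVector)
        #     predict = model.predict(inputvector)
        #     print('The sentiment is positive' if predict[0] == 1 else 'The sentiment is negative')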
    return dictionarylist
Example #36
class_names = [0, 1]  # names of the classes
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
# create heatmap
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu", fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')

print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("Precision:", metrics.precision_score(y_test, y_pred))
print("Recall:", metrics.recall_score(y_test, y_pred))

############################################################################################################################


# In[53]:

############################################################################################################################
# Regression plot

y_pred_proba = logreg.predict_proba(X_test)[::, 1]
fpr, tpr, _ = metrics.roc_curve(y_test, y_pred_proba)
Example #37
def crossvalidation(filepath):
    raw_df = pd.read_csv(filepath)
    Options_df = make_clean_Options_df(raw_df)
    print (Options_df.head())
    y = Options_df.pop('Options').values
    X = Options_df.values
    X_train, X_test, y_train, y_test = train_test_split(X, y,stratify=y,test_size=0.33)
    # After the split
    #X_train,y_train = oversample0(X_train,y_train)


    kfold = StratifiedKFold(5,shuffle=False)

    accuracies = []
    precisions = []
    recalls = []
    sumofy = []
    sumofy2 = []
    F1s = []

    for train_index, test_index in kfold.split(X_train,y_train):
        model = LR(solver='liblinear')
        model.fit(X_train[train_index], y_train[train_index])
        #y_predict = model.predict(X_train[test_index])
        y_proba = model.predict_proba(X_train[test_index])[:,1]
        #  Above, the model uses predict_proba, which returns the 'probability' of a 1;
        #  the line below applies a custom threshold to turn it into the actual prediction
        y_predict = (y_proba > 0.45177236).astype(int)
        y_true = y_train[test_index]
        #print(y_proba)
        #print('Predict',y_predict)
        #print('True   ',y_true)
        accuracies.append(accuracy_score(y_true, y_predict))
        precisions.append(precision_score(y_true, y_predict))
        recalls.append(recall_score(y_true, y_predict))
        sumofy.append(len(y_predict)-y_predict.sum())
        sumofy2.append(len(y_true)-y_true.sum())
        F1s.append(f1_score(y_true, y_predict))


        # print ("accuracy:", np.average(accuracies))
        # print ("precision:", np.average(precisions))
        # print ("recall:", np.average(recalls))
        # print ("sumofyPridict:", np.average(sumofy))
        # print ("sumofyTrue:", np.average(sumofy2))
        # print ('F1', np.average(F1s))

    y_proba = model.predict_proba(X_train)[:,1]
    y_predict = (y_proba > 0.6443742).astype(int)
    #print(y_train, y_predict)
    plotROC(y_train, y_proba,"images/plotROC_Training.png","ROC Curve Training")

    y_proba = model.predict_proba(X_test)[:,1]
    y_predict = (y_proba > 0.6443742).astype(int)
    print("__________________test")

    #print(y_proba,y_test, y_predict)
    print("Threashold 0.6443742")
    plotROC(y_test, y_proba,'images/ROC_Test.png',"ROC Curve Test")
    print ('Test accuracy_score',accuracy_score(y_test, y_predict))
    print ('Test precision_score',precision_score(y_test, y_predict))
    print ('Test recall_score',recall_score(y_test, y_predict))
    print ('Test f1_score',f1_score(y_test, y_predict))
    print('---------------------summary')
    print(raw_df.Options.value_counts())
    print('train sum',sum(y_train),'train len',len(y_train))
    zipped = set(zip(y_proba,y_test, y_predict))
    cw = csv.writer(open("zipped.csv",'w'))
    cw.writerow(zipped)
    y_proba = model.predict_proba(X_train)[:,1]
    y_predict = (y_proba > 0.0).astype(int)  # threshold 0 -> predict 1 for everything
    print('------All ones')
    print(len(y_proba),len(y_train),len(y_predict))
    #print(y_proba,y_train, y_predict)
    # print("Threshold 0.6443742")
    plotROC(y_train, y_proba,'images/plotAll1.png','ROC Curve All 1')
    print ('All1 accuracy_score',accuracy_score(y_train, y_predict))
    print ('All1 precision_score',precision_score(y_train, y_predict))
    print ('All1 recall_score',recall_score(y_train, y_predict))
    print ('All1 f1_score',f1_score(y_train, y_predict))
Example #38
def fun(in_road):  # ok
    start = time.time()
    index = []
    # Get the total number of columns in the CSV file
    col_num = get_col.getCol(in_road)
    data_dimension = col_num - 1

    # Load the dataset
    dataset = loadtxt(in_road, delimiter=",", skiprows=1)
    print(type(dataset))

    # split data into x and y
    x = dataset[:, 0:data_dimension]  # x[:, m:n] takes columns m to n-1 of every row (left-inclusive, right-exclusive)
    y = dataset[:, data_dimension]

    random_s = [8, 20, 40, 100, 200, 1000]  # run several times with different seeds, then vote on the selected features to keep reducing them
    for rs in random_s:
        # Split the dataset into training and test sets
        x_train, x_test, y_train, y_test = train_test_split(x,
                                                            y,
                                                            test_size=0.33,
                                                            random_state=rs)

        print("-----------------XGBoost-----------------")

        # Fit the XGBoost model
        model1 = XGBClassifier(
            learning_rate=0.1,
            n_estimators=1000,  # number of trees used to build the ensemble
            max_depth=5,  # depth of each tree
            min_child_weight=1,  # minimum leaf weight
            gamma=0.,  # penalty coefficient on the number of leaves
            subsample=0.8,  # randomly sample 80% of the rows for each tree
            colsample_bytree=0.8,  # randomly sample 80% of the features for each tree
            objective='reg:logistic',  # loss function
            scale_pos_weight=1,  # compensates for class imbalance
            random_state=27  # random seed
        )
        model1.fit(x_train, y_train)

        # Rank features by importance
        importance = model1.feature_importances_
        top = pd.Series(importance).sort_values(ascending=False)

        # Print the indices of the top_num strongest features (top_num is assumed to be defined at module scope)
        print(list(top.index)[:top_num])
        index.extend(list(top.index)[:top_num])

        # Predict on the test set
        y_pred = model1.predict(x_test)
        predictions = [round(value) for value in y_pred]
        accuracy = accuracy_score(y_test, predictions)
        print("Accuracy: %.2f%%" % (accuracy * 100.0))
        precision = precision_score(y_test, predictions)
        print("precision: %.2f%%" % (precision * 100.0))

        print("-----------------LightGBM-----------------")

        params = {
            'task': 'train',
            'boosting_type': 'gbdt',  # based on the GBDT algorithm
            'objective': 'binary',
            'metric': 'auc',  # evaluation metric
            'max_bin': 255,  # larger is more accurate but slower
            'learning_rate': 0.1,  # learning rate
            'num_leaves': 64,  # larger is more accurate but may overfit
            # 'max_depth': -1,  # limiting depth helps avoid overfitting on small datasets; < 0 means no limit
            'feature_fraction': 0.8,  # helps prevent overfitting
            'bagging_freq': 5,  # helps prevent overfitting
            'bagging_fraction': 0.8,  # helps prevent overfitting
            'min_data_in_leaf': 10,  # helps prevent overfitting
            'min_sum_hessian_in_leaf': 3.0,  # helps prevent overfitting
            # 'header': True,  # whether the dataset file has a header row
            'verbose': -1  # silences the warning: No further splits with positive gain, best gain: -inf
        }

        lgb_train = lgb.Dataset(x_train, label=y_train)
        model2 = lgb.train(params, train_set=lgb_train)

        importance = model2.feature_importance()
        top = pd.Series(importance).sort_values(ascending=False)
        print(list(top.index)[:top_num])
        index.extend(list(top.index)[:top_num])

        y_pred = model2.predict(x_test)
        predictions = [round(value) for value in y_pred]
        accuracy = accuracy_score(y_test, predictions)
        print("Accuracy: %.2f%%" % (accuracy * 100.0))
        precision = precision_score(y_test, predictions)
        print("precision: %.2f%%" % (precision * 100.0))

        print("-----------------ExtraTree是随机森林的一个变种-----------------")

        model4 = ExtraTreesClassifier(n_estimators=10,
                                      max_depth=None,
                                      min_samples_split=2,
                                      random_state=0)
        model4.fit(x_train, y_train)

        importance = model4.feature_importances_
        top = pd.Series(importance).sort_values(ascending=False)
        print(list(top.index)[:top_num])
        index.extend(list(top.index)[:top_num])

        y_pred = model4.predict(x_test)
        predictions = [round(value) for value in y_pred]
        accuracy = accuracy_score(y_test, predictions)
        print("Accuracy: %.2f%%" % (accuracy * 100.0))
        precision = precision_score(y_test, predictions)
        print("precision: %.2f%%" % (precision * 100.0))

    end = time.time()
    running_time = end - start
    print('-----------time--------')
    print(running_time)

    print(index)

    # Count how often each feature index was selected and keep the most common ones
    sort = get_count_by_counter(index)
    top_index = sort.most_common(top_num)

    return top_index
Example #39
def scores(y_test, y_pred, th=0.5):           
    y_predlabel = [(0. if item < th else 1.) for item in y_pred]
    tn, fp, fn, tp = confusion_matrix(y_test, y_predlabel).flatten()
    SPE = tn*1./(tn+fp)
    MCC = matthews_corrcoef(y_test, y_predlabel)
    fpr,tpr,threshold = roc_curve(y_test, y_predlabel)
    sen, spe, pre, f1, mcc, acc, auc, tn, fp, fn, tp = np.array([recall_score(y_test, y_predlabel), SPE, precision_score(y_test, y_predlabel,average='macro'), 
                                                                 f1_score(y_test, y_predlabel), MCC, accuracy_score(y_test, y_predlabel), 
                                                                 roc_auc_score(y_test, y_pred), tn, fp, fn, tp])
    return sen, spe, pre, f1, mcc, acc,auc,tn,fp,fn,tp  
Example #40
def run():

    train_e = getParticleSet('/home/drozd/analysis/data_train_elecs.npy')
    train_p = getParticleSet('/home/drozd/analysis/data_train_prots.npy')
    train = np.concatenate((train_e, train_p))
    np.random.shuffle(train)

    X_train = train[:, 0:-1]
    Y_train = train[:, -1]
    del train_e, train_p, train

    val_e = np.concatenate(
        (getParticleSet(
            '/home/drozd/analysis/fraction1/data_validate_elecs_1.npy'),
         getParticleSet('/home/drozd/analysis/fraction1/data_test_elecs_1.npy')
         ))
    val_p = np.concatenate(
        (getParticleSet(
            '/home/drozd/analysis/fraction1/data_validate_prots_1.npy'),
         getParticleSet('/home/drozd/analysis/fraction1/data_test_prots_1.npy')
         ))

    val = np.concatenate((val_e, val_p))
    np.random.shuffle(val)

    X_val = val[:, 0:-1]
    Y_val = val[:, -1]

    val_imba = np.concatenate((val_e[0:int(val_p.shape[0] / 100)], val_p))
    np.random.shuffle(val_imba)
    X_val_imba = val_imba[:, 0:-1]
    Y_val_imba = val_imba[:, -1]

    del val_e, val_p, val, val_imba

    model = Sequential()
    model.add(
        Dense(300,
              input_shape=(X_train.shape[1], ),
              kernel_initializer='he_uniform',
              activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(150, kernel_initializer='he_uniform', activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(70, kernel_initializer='he_uniform', activation='relu'))
    model.add(Dropout(0.1))
    model.add(Dense(1, kernel_initializer='he_uniform', activation='sigmoid'))
    model.compile(loss='binary_crossentropy',
                  optimizer='adam',
                  metrics=['binary_accuracy'])

    rdlronplt = ReduceLROnPlateau(monitor='loss', patience=3, min_lr=0.001)
    earl = EarlyStopping(monitor='loss', min_delta=0.0001, patience=5)
    callbacks = [rdlronplt, earl]

    history = model.fit(X_train,
                        Y_train,
                        batch_size=150,
                        epochs=40,
                        verbose=0,
                        callbacks=callbacks,
                        validation_data=(X_val, Y_val))

    # --------------------------------

    predictions_balanced = model.predict(X_val)
    predictions_imba = model.predict(X_val_imba)
    predictions_train = model.predict(X_train)

    del X_val, X_val_imba, X_train

    sk_l_precision_b, sk_l_recall_b, sk_l_thresholds_b = precision_recall_curve(
        Y_val, predictions_balanced)
    sk_l_precision_i, sk_l_recall_i, sk_l_thresholds_i = precision_recall_curve(
        Y_val_imba, predictions_imba)
    sk_l_precision_t, sk_l_recall_t, sk_l_thresholds_t = precision_recall_curve(
        Y_train, predictions_train)

    sk_l_fpr_b, sk_l_tpr_b, sk_l_roc_thresholds_b = roc_curve(
        Y_val, predictions_balanced)
    sk_l_fpr_i, sk_l_tpr_i, sk_l_roc_thresholds_i = roc_curve(
        Y_val_imba, predictions_imba)
    sk_l_fpr_t, sk_l_tpr_t, sk_l_roc_thresholds_t = roc_curve(
        Y_train, predictions_train)

    man_l_precision_b, man_l_recall_b, man_l_thresholds_b = getPR(
        Y_val, predictions_balanced, 100)
    man_l_precision_i, man_l_recall_i, man_l_thresholds_i = getPR(
        Y_val_imba, predictions_imba, 100)

    man_l_fpr_b, man_l_tpr_b, man_l_roc_thresholds_b = getROC(
        Y_val, predictions_balanced, 100)
    man_l_fpr_i, man_l_tpr_i, man_l_roc_thresholds_i = getROC(
        Y_val_imba, predictions_imba, 100)

    print("----- AUC -----")
    print("Train:", average_precision_score(Y_train, predictions_train))
    print("Validate:", average_precision_score(Y_val, predictions_balanced))
    print("----- F1 -----")
    print("Train:", f1_score(Y_train, np.around(predictions_train)))
    print("Validate:", f1_score(Y_val, np.around(predictions_balanced)))
    print("----- Precision/Recall -----")
    print("Train:", precision_score(Y_train, np.around(predictions_train)),
          " / ", recall_score(Y_train, np.around(predictions_train)))
    print("Validate:", precision_score(Y_val, np.around(predictions_balanced)),
          " / ", recall_score(Y_val, np.around(predictions_balanced)))

    fig1 = plt.figure()
    plt.plot(sk_l_precision_b, sk_l_recall_b, label='balanced')
    plt.plot(sk_l_precision_i, sk_l_recall_i, label='imbalanced')
    #~ plt.plot(sk_l_precision_t, sk_l_recall_t,label='training set')
    #~ plt.plot(man_l_precision_b, man_l_recall_b,'o',label='balanced, hand')
    #~ plt.plot(man_l_precision_i, man_l_recall_i,'o',label='imbalanced, hand')
    plt.xlabel('Precision')
    plt.ylabel('Recall')
    plt.legend(loc='best')
    plt.savefig('PR')

    fig1b = plt.figure()
    plt.plot(sk_l_precision_b, sk_l_recall_b, label='validation set')
    plt.plot(sk_l_precision_t, sk_l_recall_t, label='training set')
    plt.xlabel('Precision')
    plt.ylabel('Recall')
    plt.legend(loc='best')
    plt.savefig('PRb')

    fig2 = plt.figure()
    plt.plot(sk_l_fpr_b, sk_l_tpr_b, label='balanced')
    plt.plot(sk_l_fpr_i, sk_l_tpr_i, label='imbalanced')
    #~ plt.plot(man_l_fpr_b, man_l_tpr_b,'o',label='balanced, hand')
    #~ plt.plot(man_l_fpr_i, man_l_tpr_i,'o',label='imbalanced, hand')
    plt.xlabel('False Positive')
    plt.ylabel('True Positive')
    plt.legend(loc='best')
    plt.savefig('ROC')

    fig2b = plt.figure()
    plt.plot(sk_l_fpr_b, sk_l_tpr_b, label='validation set')
    plt.plot(sk_l_fpr_t, sk_l_tpr_t, label='training set')
    plt.xlabel('False Positive')
    plt.ylabel('True Positive')
    plt.legend(loc='best')
    plt.savefig('ROCb')

    Nbins = 50
    binList = [x / Nbins for x in range(0, Nbins + 1)]

    elecs_t, prots_t = getClassifierScore(Y_train, predictions_train)
    fig3 = plt.figure()
    plt.hist(elecs_t,
             bins=binList,
             label='e',
             alpha=0.7,
             histtype='step',
             color='green')
    plt.hist(prots_t,
             bins=binList,
             label='p',
             alpha=0.7,
             histtype='step',
             color='red')
    plt.xlabel('Classifier score')
    plt.ylabel('Number of events')
    plt.title('Training set')
    plt.legend(loc='best')
    plt.yscale('log')
    plt.savefig('predHisto_train')

    fig3b = plt.figure()
    plt.hist(elecs_t,
             bins=binList,
             label='e',
             alpha=0.7,
             histtype='step',
             color='green',
             normed=True)
    plt.hist(prots_t,
             bins=binList,
             label='p',
             alpha=0.7,
             histtype='step',
             color='red',
             normed=True)
    plt.xlabel('Classifier score')
    plt.ylabel('Fraction of events')
    plt.title('Training set - normalised')
    plt.legend(loc='best')
    plt.yscale('log')
    plt.savefig('predHisto_train_n')
    del elecs_t, prots_t, Y_train, predictions_train

    elecs_b, prots_b = getClassifierScore(Y_val, predictions_balanced)
    fig4 = plt.figure()
    plt.hist(elecs_b,
             bins=binList,
             label='e',
             alpha=0.7,
             histtype='step',
             color='green')
    plt.hist(prots_b,
             bins=binList,
             label='p',
             alpha=0.7,
             histtype='step',
             color='red')
    plt.xlabel('Classifier score')
    plt.ylabel('Number of events')
    plt.title('Balanced validation set')
    plt.legend(loc='best')
    plt.yscale('log')
    plt.savefig('predHisto_bal')

    fig4b = plt.figure()
    plt.hist(elecs_b,
             bins=binList,
             label='e',
             alpha=0.7,
             histtype='step',
             color='green',
             normed=True)
    plt.hist(prots_b,
             bins=binList,
             label='p',
             alpha=0.7,
             histtype='step',
             color='red',
             normed=True)
    plt.xlabel('Classifier score')
    plt.ylabel('Fraction of events')
    plt.title('Balanced validation set - normalised')
    plt.legend(loc='best')
    plt.yscale('log')
    plt.savefig('predHisto_bal_n')
    del elecs_b, prots_b, Y_val, predictions_balanced

    elecs_i, prots_i = getClassifierScore(Y_val_imba, predictions_imba)
    fig5 = plt.figure()
    plt.hist(elecs_i,
             bins=binList,
             label='e',
             alpha=0.7,
             histtype='step',
             color='green')
    plt.hist(prots_i,
             bins=binList,
             label='p',
             alpha=0.7,
             histtype='step',
             color='red')
    plt.xlabel('Classifier score')
    plt.ylabel('Number of events')
    plt.legend(loc='best')
    plt.title('Imbalanced validation set')
    plt.yscale('log')
    plt.savefig('predHisto_imba')

    fig5b = plt.figure()
    plt.hist(elecs_i,
             bins=binList,
             label='e',
             alpha=0.7,
             histtype='step',
             color='green',
             normed=True)
    plt.hist(prots_i,
             bins=binList,
             label='p',
             alpha=0.7,
             histtype='step',
             color='red',
             normed=True)
    plt.xlabel('Classifier score')
    plt.ylabel('Fraction of events')
    plt.title('Imbalanced validation set - normalised')
    plt.legend(loc='best')
    plt.yscale('log')
    plt.savefig('predHisto_imba_n')
def test_fasttext():
    """Test FASTTEXT model."""
    # Print parameters used for the model
    dh.tab_printer(args, logger)

    # Load word2vec model
    word2idx, embedding_matrix = dh.load_word2vec_matrix(args.word2vec_file)

    # Load data
    logger.info("Loading data...")
    logger.info("Data processing...")
    test_data = dh.load_data_and_labels(args, args.test_file, word2idx)

    # Load fasttext model
    OPTION = dh._option(pattern=1)
    if OPTION == 'B':
        logger.info("Loading best model...")
        checkpoint_file = cm.get_best_checkpoint(BEST_CPT_DIR,
                                                 select_maximum_value=True)
    else:
        logger.info("Loading latest model...")
        checkpoint_file = tf.train.latest_checkpoint(CPT_DIR)
    logger.info(checkpoint_file)

    graph = tf.Graph()
    with graph.as_default():
        session_conf = tf.ConfigProto(
            allow_soft_placement=args.allow_soft_placement,
            log_device_placement=args.log_device_placement)
        session_conf.gpu_options.allow_growth = args.gpu_options_allow_growth
        sess = tf.Session(config=session_conf)
        with sess.as_default():
            # Load the saved meta graph and restore variables
            saver = tf.train.import_meta_graph(
                "{0}.meta".format(checkpoint_file))
            saver.restore(sess, checkpoint_file)

            # Get the placeholders from the graph by name
            input_x = graph.get_operation_by_name("input_x").outputs[0]
            input_y = graph.get_operation_by_name("input_y").outputs[0]
            dropout_keep_prob = graph.get_operation_by_name(
                "dropout_keep_prob").outputs[0]
            is_training = graph.get_operation_by_name("is_training").outputs[0]

            # Tensors we want to evaluate
            scores = graph.get_operation_by_name("output/scores").outputs[0]
            loss = graph.get_operation_by_name("loss/loss").outputs[0]

            # Split the output nodes name by '|' if you have several output nodes
            output_node_names = "output/scores"

            # Save the .pb model file
            output_graph_def = tf.graph_util.convert_variables_to_constants(
                sess, sess.graph_def, output_node_names.split("|"))
            tf.train.write_graph(output_graph_def,
                                 "graph",
                                 "graph-fasttext-{0}.pb".format(MODEL),
                                 as_text=False)

            # Generate batches for one epoch
            batches = dh.batch_iter(list(create_input_data(test_data)),
                                    args.batch_size,
                                    1,
                                    shuffle=False)

            # Collect the predictions here
            test_counter, test_loss = 0, 0.0
            test_pre_tk = [0.0] * args.topK
            test_rec_tk = [0.0] * args.topK
            test_F1_tk = [0.0] * args.topK

            # Collect the predictions here
            true_labels = []
            predicted_labels = []
            predicted_scores = []

            # Collect for calculating metrics
            true_onehot_labels = []
            predicted_onehot_scores = []
            predicted_onehot_labels_ts = []
            predicted_onehot_labels_tk = [[] for _ in range(args.topK)]

            for batch_test in batches:
                x, y_onehot, y = zip(*batch_test)
                feed_dict = {
                    input_x: x,
                    input_y: y_onehot,
                    dropout_keep_prob: 1.0,
                    is_training: False
                }

                batch_scores, cur_loss = sess.run([scores, loss], feed_dict)

                # Prepare for calculating metrics
                for i in y_onehot:
                    true_onehot_labels.append(i)
                for j in batch_scores:
                    predicted_onehot_scores.append(j)

                # Get the predicted labels by threshold
                batch_predicted_labels_ts, batch_predicted_scores_ts = \
                    dh.get_label_threshold(scores=batch_scores, threshold=args.threshold)

                # Add results to collection
                for i in y:
                    true_labels.append(i)
                for j in batch_predicted_labels_ts:
                    predicted_labels.append(j)
                for k in batch_predicted_scores_ts:
                    predicted_scores.append(k)

                # Get onehot predictions by threshold
                batch_predicted_onehot_labels_ts = \
                    dh.get_onehot_label_threshold(scores=batch_scores, threshold=args.threshold)
                for i in batch_predicted_onehot_labels_ts:
                    predicted_onehot_labels_ts.append(i)

                # Get onehot predictions by topK
                for top_num in range(args.topK):
                    batch_predicted_onehot_labels_tk = dh.get_onehot_label_topk(
                        scores=batch_scores, top_num=top_num + 1)

                    for i in batch_predicted_onehot_labels_tk:
                        predicted_onehot_labels_tk[top_num].append(i)

                test_loss = test_loss + cur_loss
                test_counter = test_counter + 1

            # Calculate Precision & Recall & F1
            test_pre_ts = precision_score(
                y_true=np.array(true_onehot_labels),
                y_pred=np.array(predicted_onehot_labels_ts),
                average='micro')
            test_rec_ts = recall_score(
                y_true=np.array(true_onehot_labels),
                y_pred=np.array(predicted_onehot_labels_ts),
                average='micro')
            test_F1_ts = f1_score(y_true=np.array(true_onehot_labels),
                                  y_pred=np.array(predicted_onehot_labels_ts),
                                  average='micro')

            for top_num in range(args.topK):
                test_pre_tk[top_num] = precision_score(
                    y_true=np.array(true_onehot_labels),
                    y_pred=np.array(predicted_onehot_labels_tk[top_num]),
                    average='micro')
                test_rec_tk[top_num] = recall_score(
                    y_true=np.array(true_onehot_labels),
                    y_pred=np.array(predicted_onehot_labels_tk[top_num]),
                    average='micro')
                test_F1_tk[top_num] = f1_score(
                    y_true=np.array(true_onehot_labels),
                    y_pred=np.array(predicted_onehot_labels_tk[top_num]),
                    average='micro')

            # Calculate the average AUC
            test_auc = roc_auc_score(y_true=np.array(true_onehot_labels),
                                     y_score=np.array(predicted_onehot_scores),
                                     average='micro')

            # Calculate the average PR
            test_prc = average_precision_score(
                y_true=np.array(true_onehot_labels),
                y_score=np.array(predicted_onehot_scores),
                average="micro")
            test_loss = float(test_loss / test_counter)

            logger.info(
                "All Test Dataset: Loss {0:g} | AUC {1:g} | AUPRC {2:g}".
                format(test_loss, test_auc, test_prc))

            # Predict by threshold
            logger.info(
                "Predict by threshold: Precision {0:g}, Recall {1:g}, F1 {2:g}"
                .format(test_pre_ts, test_rec_ts, test_F1_ts))

            # Predict by topK
            logger.info("Predict by topK:")
            for top_num in range(args.topK):
                logger.info(
                    "Top{0}: Precision {1:g}, Recall {2:g}, F1 {3:g}".format(
                        top_num + 1, test_pre_tk[top_num],
                        test_rec_tk[top_num], test_F1_tk[top_num]))

            # Save the prediction result
            if not os.path.exists(SAVE_DIR):
                os.makedirs(SAVE_DIR)
            dh.create_prediction_file(output_file=SAVE_DIR +
                                      "/predictions.json",
                                      data_id=test_data['id'],
                                      true_labels=true_labels,
                                      predict_labels=predicted_labels,
                                      predict_scores=predicted_scores)

    logger.info("All Done.")
Example #42
# Calculate precision and recall for both classes (0 and 1)
r1 = recall_score(y_vall,
                  y_pred,
                  labels=None,
                  pos_label=1,
                  average='binary',
                  sample_weight=None)
r0 = recall_score(y_vall,
                  y_pred,
                  labels=None,
                  pos_label=0,
                  average='binary',
                  sample_weight=None)
p1 = precision_score(y_vall,
                     y_pred,
                     labels=None,
                     pos_label=1,
                     average='binary',
                     sample_weight=None)
p0 = precision_score(y_vall,
                     y_pred,
                     labels=None,
                     pos_label=0,
                     average='binary',
                     sample_weight=None)
print('Recall for class 1 is {}'.format(r1))
print('Recall for class 0 is {}'.format(r0))
print('Precision for class 1 is {}'.format(p1))
print('Precision for class 0 is {}'.format(p0))
Example #43
def train_and_eval_DNN(df, X_train, X_test, y_train, y_test, y_names,
                       feature_set, metrics_manager, fold):
    """ Train and Evaulate Deep Neural Networks

    Args:
    df => pandas dataframe
    fold => n-fold cross validation
    
    Classifier names used as key in metrics_manager
        Keras-TensorFlow => keras
        Fast.ai => fastai
    Returns:
    None
    """

    # Keras-TensorFlow DNN Model
    print('Training and Evaluating Keras-TensorFlow...')
    dnn_keras = Sequential(layers=[
        Dense(128,
              kernel_regularizer=l2(0.001),
              activation='relu',
              input_shape=(len(X_train.columns), )),
        BatchNormalization(),
        Dense(64, activation='relu', kernel_regularizer=l2(0.001)),
        BatchNormalization(),
        Dense(y_train.nunique(), activation='softmax')
    ])
    dnn_keras.compile(optimizer='adam', loss='binary_crossentropy')

    dnn_keras.fit(X_train,
                  pd.get_dummies(y_train),
                  epochs=100,
                  verbose=0,
                  batch_size=512)
    #loss, acc = dnn_keras.evaluate(X_test, pd.get_dummies(y_test), verbose=0)
    y_pred = dnn_keras.predict_classes(X_test)

    acc = accuracy_score(y_test, y_pred)
    bal_acc = balanced_accuracy_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred, average='weighted')
    prec = precision_score(y_test, y_pred, average='weighted')
    auc = roc_auc_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    m = Metric('Keras-Tensorflow', fold=fold)
    m.addValue('acc', 100 * acc)
    m.addValue('bal-acc', 100 * bal_acc)
    m.addValue('rec', 100 * rec)
    m.addValue('prec', 100 * prec)
    m.addValue('auc', 100 * auc)
    m.addValue('f1', 100 * f1)
    metrics_manager.addMetric(m)
    metrics_manager.printMeasures()

    # Fast.ai DNN Model, v.2
    print('Training and Evaluating Fast.ai...')
    splits = RandomSplitter(valid_pct=0.2)(range_of(X_train))
    #print(feature_set)
    #print(df[:5])
    tp = TabularPandas(df,
                       procs=[],
                       cat_names=[],
                       cont_names=list(feature_set),
                       y_names=y_names,
                       splits=splits)

    dls = tp.dataloaders(bs=64)
    #dls.show_batch()
    #return
    dnn_fastai = tabular_learner(dls, metrics=accuracy)
    dnn_fastai.fit_one_cycle(5)

    # acquire predictions
    y_pred = []
    #print('Length of test set: {}'.format(len(y_test)))
    for j in range(len(y_test)):
        row, clas, probs = dnn_fastai.predict(X_test.iloc[j])
        #print(clas)
        pred = 0
        if clas >= tensor(0.5):
            pred = 1
        y_pred.append(pred)

    acc = accuracy_score(y_test, y_pred)
    bal_acc = balanced_accuracy_score(y_test, y_pred)
    rec = recall_score(y_test, y_pred, average='weighted')
    prec = precision_score(y_test, y_pred, average='weighted')
    auc = roc_auc_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)

    m = Metric('fastai', fold=fold)
    m.addValue('acc', 100 * acc)
    m.addValue('bal-acc', 100 * bal_acc)
    m.addValue('rec', 100 * rec)
    m.addValue('prec', 100 * prec)
    m.addValue('auc', 100 * auc)
    m.addValue('f1', 100 * f1)
    metrics_manager.addMetric(m)
def run(k, j, filename, seednum=10, threshold = 0.5, resultdir=None):
#    classes = ["P1a1" , "P1a2"  , "P2b"  , "P2c" ] 
    classes = ["P1a1" , "P1a2", "P2b", "P2c", "H1" ]
    # H1 H2  O (1) P1a1 (4)  P1a2 (6)   P2b   P2c   S1a (0)   S1c    S2    S3 
    joind = gp.read_file(filename, layer = layers[j])
    print(f'\n------\n------{layers[j]}----\n-----\n')
    df1 = pd.DataFrame(joind.drop(columns='geometry'))
    df1 = df1.replace([np.inf, -np.inf], np.nan).dropna()
    
    Pcl = df1.loc[df1['geocode_2'].isin(classes)] # filter only classes of interest
    print(Pcl['geocode_2'].value_counts())
    # regroup, geocode_2 from here on becomes binary!
    Pcl['geocode_2'] = np.where(Pcl['geocode_2'].str.contains(classes[k]),classes[k],'Others')
    print(Pcl['geocode_2'].value_counts())
    minc = min(Pcl['geocode_2'].value_counts() ) # skip if less than 20 objects 
    if minc< 20 or minc==len(Pcl):
        print("minimum class less than 20")
        return (-1, -1) # -1 -1 if not calculated
    else:    
        print(f'total {len(df1)}, P_H1_classes: {len(Pcl)}, minimum class: {minc}')
        # bootstrap and get averaged accuracy
        avepre = np.zeros(1) # store all the precisions in each CV
        averec = np.zeros(1)
        for seeds in range(seednum):
            np.random.seed(seeds)
            # use groupby to sample the same amount for each group. 
            # use 70% of data for training, get the index
            train = Pcl.groupby('geocode_2').sample(n = int(minc*0.7)).index 
            test = Pcl[~Pcl.index.isin(train)].index
            #len(train)+len(test)
             
            df_covar = Pcl
            X_train = df_covar.loc [train ].drop(columns=["geocode_2","layer","OBJECTID","path"])
            X_test  = df_covar.loc [test ].drop(columns=["geocode_2","layer","OBJECTID","path"])
            
            Y_train  =Pcl.filter(regex='geocode_2').loc[train].values
            Y_test  =Pcl.filter(regex='geocode_2').loc[test].values
            
            # relable
            label_all = [classes[k], "Others"]
            #classtype  =  [(j, "float32") for j in classes]
            
            #Pcl.geocode_2.unique()
            i = 0
            idx2class = {}
            class2idx = {}
            for tp in label_all:
                idx2class[i] = tp
                class2idx[tp] = i 
                i+= 1
            
            Y_trainnum = cl2idx(Y_train, class2idx).astype(int)
            Y_testnum = cl2idx(Y_test, class2idx).astype(int)
             
            np.unique(Y_trainnum)
            # One could use the scikit-learn or h2o API instead of the native xgb API
            # (a commented sketch follows below). Note that the number of boosting rounds
            # can only be specified in xgb.train, not in the params dict.
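            # A rough sketch of the equivalent scikit-learn-style call (standard
            # XGBClassifier arguments, kept commented out since it is not used here):
            #   model = xgb.XGBClassifier(max_depth=6, learning_rate=0.002,
            #                             n_estimators=500, objective='binary:logistic')
            #   model.fit(X_train, Y_trainnum)
            #   yhat = model.predict_proba(X_test)[:, 1]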
            dtrain = xgb.DMatrix(X_train, label=Y_trainnum)
            dtest = xgb.DMatrix(X_test, label=Y_testnum)
            params = {'max_depth': 6, 'eta': 0.002, 
                      'objective':'binary:logistic', 'num_class': 1, 'eval_metric':['merror', 'mlogloss', 'auc' ] }
            # Fit
            #print("Train and test shapes, dividing number of classes for the sample size (i.e. 2 for binary case)")
            #print(X_train.shape, Y_trainnum.shape, X_test.shape, Y_testnum.shape)
            model = xgb.train(params, dtrain, 500)  # num_boost_round = 500
 
            yhat = model.predict(dtest)
            # threshold 0.5, probability higher than 0.5 -> positive. 
            yhat_labels = yhat>threshold
            yhat_labels = yhat_labels.astype(int)
            
            #get accuracy score
            accuracy = accuracy_score(Y_testnum, yhat_labels)
            # get precision and recall
            # print("precision:  tp / (tp + fp)")
            # print(label_all)
            recall=np.round(recall_score(Y_testnum, yhat_labels, average = None),2)[0]
            # only get the recall and precision for the class of interest, therefore "[0]"
            precision = np.round(precision_score(Y_testnum, yhat_labels, average = None),2)[0]
            averec = np.append(averec, recall) #store all of them
            avepre= np.append(avepre, precision)
        recall = averec.sum()/seednum  # mean over seeds; the leading 0 placeholder adds nothing to the sum
        precision = avepre.sum()/seednum
        print(averec, recall)
        if resultdir is not None:
            Y_testnum =  Y_testnum.astype(int)
            plt.rcParams.update({'font.size': 8})
            ax = xgb.plot_importance(model, grid=False, importance_type='gain', title='Feature importance')
            ax.set_title(f'xgboost importance {layers[j]} {classes[k]}')
            fname = f"{resultdir}P_{layers[j]}_{classes[k]}_imp"
            plt.savefig(fname, dpi=1200)
        return (recall, precision)
Example #45
def crossvalidationThr(filepath):
    raw_df = pd.read_csv(filepath)
    Options_df = make_clean_Options_df(raw_df)
    print (Options_df.head())
    y = Options_df.pop('Options').values
    X = Options_df.values
    X_train, X_test, y_train, y_test = train_test_split(X, y,stratify=y,test_size=0.33)
    # After the split
    #X_train,y_train = oversample0(X_train,y_train)


    kfold = StratifiedKFold(5,shuffle=False)

    accuracies = []
    precisions = []
    recalls = []
    sumofy = []
    sumofy2 = []
    F1s = []
    thr = [0.27411032, 0.45177236, 0.43848543, 0.57502369, 0.52727048, 0.26294993,
           0.3734106, 0.55862627, 0.35298962, 0.55236403, 0.86459271, 0.47550826,
           0.36730173, 0.69804784, 0.42914183, 0.56947256, 0.59538206, 0.58619825,
           0.73952725, 0.76668073, 0.7893238, 0.65972535, 0.77526139, 0.43201526,
           0.73858921, 0.60181424, 0.43968636, 0.75375776, 0.78417738, 0.53074894,
           0.75953991, 0.62555268, 0.48809615, 0.67150554, 0.52648925]
    bestthr = 0
    bestF1  = 0
    for i in thr:
        for train_index, test_index in kfold.split(X_train,y_train):
            model = LR(solver='liblinear')
            model.fit(X_train[train_index], y_train[train_index])
            #y_predict = model.predict(X_train[test_index])
            y_proba = model.predict_proba(X_train[test_index])[:,1]
            #  Above, the model uses predict_proba, which returns the 'probability' of a 1;
            #  the line below applies the candidate threshold i to turn it into the actual prediction
            y_predict = (y_proba > i).astype(int)
            y_true = y_train[test_index]
            print(y_proba)
            print('Predict',y_predict)
            print('True   ',y_true)
            accuracies.append(accuracy_score(y_true, y_predict))
            precisions.append(precision_score(y_true, y_predict))
            recalls.append(recall_score(y_true, y_predict))
            sumofy.append(len(y_predict)-y_predict.sum())
            sumofy2.append(len(y_true)-y_true.sum())
            F1s.append(f1_score(y_true, y_predict))


            print ("accuracy:", np.average(accuracies))
            print ("precision:", np.average(precisions))
            print ("recall:", np.average(recalls))
            print ("sumofyPridict:", np.average(sumofy))
            print ("sumofyTrue:", np.average(sumofy2))
            print ('F1', np.average(F1s))
            if(np.average(F1s) > bestF1):
               bestF1 = np.average(F1s)
               bestthr = i
            accuracies = []
            precisions = []
            recalls = []
            sumofy = []
            sumofy2 = []
            F1s = []
    print('BestF1 and threshold',bestF1,bestthr)
def FULL_onehot_chem(Dropout1=0,Epochs= 20,Batch_size=64):
    # The optimizer is Adam.
    # The loss could be sparse_categorical_crossentropy;
    # the alternative is categorical_crossentropy. The difference lies in the form of the true labels:
    # sparse_categorical takes integer labels, e.g. [1, 2, 3, 4], while categorical takes one-hot encoded labels.
    
    Feature_test = np.load("../../data_all/TCRB_train_feature_array.npy")    
    Label_array = np.load("../../data_all/TCRB_train_label_array.npy")
       
    X = Feature_test  # [:,0:29,:]  extract the one-hot features
    #print(X[0])
    Y = Label_array[:,1]

    X = X.reshape(len(X),-1)
    #loo = LeaveOneOut()
    
    kf = KFold(n_splits=5,shuffle=True,random_state=0)
    kf.get_n_splits(X)
    TN = FP = FN = TP = 0
    aa = 1 
    for train_index, test_index in kf.split(X):
        np.random.shuffle(train_index)
        np.random.shuffle(test_index)
        
        X_train, X_test = X[train_index], X[test_index]
        Y_train, Y_test = Y[train_index], Y[test_index]
        
        
        
        
        
        
        X_train= X_train.reshape([len(X_train),29,20,2])
        X_test = X_test.reshape([len(X_test),29,20,2])
        X_test=tf.cast(X_test, tf.float32)

        model = tf.keras.models.Sequential([


            tf.keras.layers.Flatten(input_shape=(29,20,2)),

            tf.keras.layers.Dense(256,activation='relu'),# kernel_regularizer=regularizers.l2(0.01)),#  activation='relu',
            tf.keras.layers.Dense(512,activation='relu'),# kernel_regularizer=regularizers.l2(0.01)),#  activation='relu',

            tf.keras.layers.Dense(256,activation='relu'),# kernel_regularizer=regularizers.l2(0.01)),#  activation='relu',
            #tf.keras.layers.LeakyReLU(alpha=0.05), 



            tf.keras.layers.Dense(128,activation='relu'),
            #tf.keras.layers.LeakyReLU(alpha=0.05), 
            tf.keras.layers.Dense(64,activation='relu'),
            #tf.keras.layers.LeakyReLU(alpha=0.05), 
            tf.keras.layers.Dropout(Dropout1),  # Dropout rate: float between 0 and 1, the fraction of inputs to drop
            tf.keras.layers.Dense(1, activation='sigmoid')




        ]) 

        model.compile(optimizer="Adam",
                      loss=keras.losses.binary_crossentropy,
                      metrics=['accuracy'])
        model.fit(X_train, Y_train, epochs= Epochs , batch_size= Batch_size, verbose=0,)
        
        
        Y_pred = model.predict_classes(X_test)
        #print(Y_pred)
        confusion_matrix1 =confusion_matrix(Y_test,Y_pred)
        
        
        
        
        
        # Note: with sklearn's label ordering, confusion_matrix1[0,0] counts class 0 and
        # [1,1] counts class 1, so these variable names treat class 0 as the positive class.
        TP += confusion_matrix1[0,0]
        FN += confusion_matrix1[0,1]
        FP += confusion_matrix1[1,0]
        TN += confusion_matrix1[1,1]
        
#         accuracy = accuracy_score(Y_test,Y_pred)  # accuracy
#         precision = precision_score(Y_test,Y_pred)  # precision
#         recall = recall_score(Y_test,Y_pred)  # recall
#         f1= f1_score(Y_test,Y_pred)  # F1

#         print('Confusion matrix\n',confusion_matrix1,
#               '\nAccuracy ACC:',accuracy,
#               '\nPrecision:',precision,
#               '\nRecall:',recall,
#               '\nF1:',f1,
#              )
               
#         y_predict = model.predict(X_test)      
        
#         y_probs = model.predict_proba(X_test)  # the model's prediction scores
#         #print(y_probs)

#         fpr, tpr, thresholds = metrics.roc_curve(Y_test,y_probs)
#         roc_auc = auc(fpr, tpr)  # auc is the area under the ROC curve
#         # start plotting the ROC curve
#         plt.plot(fpr, tpr, 'b',label='AUC = %0.2f'% roc_auc)
#         plt.legend(loc='lower right')
#         plt.plot([0,1],[0,1],'r--')
#         plt.xlim([-0.1,1.1])
#         plt.ylim([-0.1,1.1])
#         plt.xlabel('False Positive Rate')  # x-axis is fpr
#         plt.ylabel('True Positive Rate')  # y-axis is tpr
#         plt.title('Receiver operating characteristic example')
#         plt.show()
        
        #model.save('./data_625/model_'+str(aa)+'.h5')
        #print(aa)
        
        
        if aa == 1:
            Y_test_all = Y_test
            Y_pred_all = Y_pred
        else:
            Y_test_all = np.append(Y_test_all, Y_test, axis=0)
            Y_pred_all = np.append(Y_pred_all, Y_pred, axis=0) 

        aa += 1
        del model
        
    print('\n\nOverall confusion matrix')
    print(TP,FN)
    print(FP,TN)
    
    #print(Y_test_all[0])
            
    accuracy = accuracy_score(Y_test_all,Y_pred_all)  # accuracy
    precision = precision_score(Y_test_all,Y_pred_all)  # precision
    recall = recall_score(Y_test_all,Y_pred_all)  # recall
    f1= f1_score(Y_test_all,Y_pred_all)  # F1

    MCC = matthews_corrcoef(Y_test_all,Y_pred_all)  # MCC

    print('Accuracy ACC:',accuracy,
          '\nPrecision:',precision,
          '\nRecall:',recall,
          '\nF1:',f1,
          '\nMCC:',MCC
         )
Example #47
def main():
    np.random.seed(1)
    random.seed(1)
    feat_data, labels, adj_lists, train, test, edge_map = load_cora()
    num_nodes = feat_data.shape[0]
    feat_dim = feat_data.shape[1]
    hidden_dim = 15
    features = nn.Embedding(num_nodes, feat_dim)
    features.weight = nn.Parameter(torch.FloatTensor(feat_data),
                                   requires_grad=False)
    #features.cuda()

    agg1 = MeanAggregator(features, cuda=True)
    enc1 = Encoder(features,
                   feat_dim,
                   hidden_dim,
                   adj_lists,
                   agg1,
                   gcn=False,
                   cuda=False)
    agg2 = MeanAggregator(lambda nodes: enc1(nodes).t(), cuda=False)
    enc2 = Encoder(lambda nodes: enc1(nodes).t(),
                   enc1.embed_dim,
                   hidden_dim,
                   adj_lists,
                   agg2,
                   base_model=enc1,
                   gcn=False,
                   cuda=False)
    enc1.num_samples = 5
    enc2.num_samples = 5

    graphsage = SupervisedGraphSage(1, enc2, edge_map)
    #graphsage.cuda()

    optimizer = torch.optim.Adam(filter(lambda p: p.requires_grad,
                                        graphsage.parameters()),
                                 lr=0.001,
                                 weight_decay=1e-5)
    times = []
    epoch = 10
    batch_size = 512
    num_batch = len(train) // batch_size
    best = 1e9
    cnt_wait = 0
    patience = 20
    best_t = 0
    for e in range(epoch):
        for i in range(num_batch):
            if i < num_batch - 1:
                batch_nodes = train[i * batch_size:i * batch_size + batch_size]
            else:
                batch_nodes = train[i * batch_size:len(train)]
            start_time = time.time()
            optimizer.zero_grad()
            loss = graphsage.loss(batch_nodes,\
                Variable(torch.FloatTensor(labels[np.array(batch_nodes)])))
            loss.backward()
            optimizer.step()
            end_time = time.time()
            times.append(end_time - start_time)
            print("The {}-th epoch ".format(e), "{}-th batch".format(i),
                  "Loss: ", loss.item())

            if loss.item() < best:
                best = loss.item()  # update the best loss so the patience counter can trigger
                cnt_wait = 0
                best_t = e
                torch.save(graphsage.state_dict(), 'best_model.pkl')
            else:
                cnt_wait += 1

            if cnt_wait == patience:
                print("early stopping!")
                break

    print('Loading {}th epoch'.format(best_t))
    graphsage.load_state_dict(torch.load('best_model.pkl'))

    if len(test) < 100000:
        test_output = torch.sigmoid(graphsage.forward(test))
        pred = (np.where(test_output.data.numpy() < 0.5, 0, 1))
        print("Test F1:",
              f1_score(labels[test], pred, labels=[1], average="micro"))
        print("Test Recall:",
              recall_score(labels[test], pred, labels=[1], average="micro"))
        print("Test Precision:",
              precision_score(labels[test], pred, labels=[1], average="micro"))
        cm = plot_confusion_matrix(
            labels[test],
            pred,
            np.array([0, 1]),
            title='Confusion matrix, without normalization')
        #recall = cm[1][1]/(cm[1][0]+cm[1][1])
        #precision = cm[1][1]/(cm[1][1]+cm[0][1])
        #f1 = 2*recall*precision/(recall+precision)
        #print("Test F1 micro:", f1)
        #print("Test Recall micro:", recall)
        #print("Test Precision micro:", precision)

    ### Inference on large graph, avoid out of memory
    else:
        chunk_size = 5120
        pred = []
        for j in range(len(test) // chunk_size):
            if j < (len(test) // chunk_size - 1):
                test_output = torch.sigmoid(
                    graphsage.forward(test[j * chunk_size:(j + 1) *
                                           chunk_size]))
            else:
                test_output = torch.sigmoid(
                    graphsage.forward(test[j * chunk_size:len(test)]))
            pred += (np.where(test_output.data.numpy() < 0.5, 0, 1)).tolist()
            print("Inference on the {}-th chunk".format(j))
        cm = plot_confusion_matrix(
            labels[test],
            np.asarray(pred),
            np.array([0, 1]),
            title='Confusion matrix, without normalization')
        print(
            "Test F1:",
            f1_score(labels[test],
                     np.asarray(pred),
                     labels=[1],
                     average="micro"))
        print(
            "Test Recall:",
            recall_score(labels[test],
                         np.asarray(pred),
                         labels=[1],
                         average="micro"))
        print(
            "Test Precision:",
            precision_score(labels[test],
                            np.asarray(pred),
                            labels=[1],
                            average="micro"))

    print("Average batch time:", np.mean(times))
Example #48
        #======================
        # SCORE CALCULATION
        #======================
        score = clf.score(X_test, Y_test)

        f1_weighted = metrics.f1_score(
            Y_test,
            Y_pred,
            labels=None,
            pos_label=1,
            average='weighted',
            sample_weight=None)  # accounts for the label imbalance
        precision_score = metrics.precision_score(Y_test,
                                                  Y_pred,
                                                  labels=None,
                                                  pos_label=1,
                                                  average='weighted',
                                                  sample_weight=None)
        recall_score = metrics.recall_score(Y_test,
                                            Y_pred,
                                            labels=None,
                                            pos_label=1,
                                            average='weighted',
                                            sample_weight=None)
        G = math.sqrt(recall_score * precision_score)
        specifity = sensitivity_specifity.specificity_score(Y_test,
                                                            Y_pred,
                                                            average='weighted')
        #===============================
        # CONFUSION MATRIX CALCULATION
        #===============================
Example #49
File: Metrics.py  Project: Quincy1994/HUE
def score(pred, labels):
    pre = precision_score(labels, pred, average='macro')
    recall = recall_score(labels, pred, average='macro')
    f1 = f1_score(labels, pred, average='macro')
    return pre, recall, f1
Example #50
features_test = scaler.transform(features_test)


def classify_NB(features_train, labels_train):

    ### your code goes here--should return a trained Naive Bayes classifier

    clf = GaussianNB()
    clf.fit(features_train, labels_train)
    return clf


clf = classify_NB(features_train, labels_train)
pred = clf.predict(features_test)
# sklearn metric functions expect (y_true, y_pred)
print accuracy_score(labels_test, pred)
print precision_score(labels_test, pred)
print recall_score(labels_test, pred)
print f1_score(labels_test, pred)


def classify_DTC(features_train, labels_train):

    ### your code goes here--should return a trained decision tree classifier

    clf = DecisionTreeClassifier()
    clf.fit(features_train, labels_train)
    return clf


clf = classify_DTC(features_train, labels_train)
pred = clf.predict(features_test)
Example #51
    #CLF = DecisionTreeClassifier(random_state=0)
    #CLF = GaussianNB()
    #CLF = LogisticRegression()
    CLF = SVC(C=25)
    # Load dataset
    corpus, y = parse_dataset(trn_dataset)  #3802 in total
    Xtrn = featurize(corpus)
    print(np.asarray(Xtrn).shape)
    class_counts = np.asarray(np.unique(y, return_counts=True)).T.tolist()
    print(class_counts)

    # Returns an array of the same size as 'y' where each entry is a prediction obtained by cross-validation
    predicted = cross_val_predict(CLF, Xtrn, y, cv=K_FOLDS)
    #predicted = libsvm.cross_validation(Z, np.asarray(y,'float64'), 5, kernel = 'rbf')

    score = metrics.f1_score(y, predicted, pos_label=1)
    acc = metrics.accuracy_score(y, predicted)
    preci = metrics.precision_score(y, predicted)
    recall = metrics.recall_score(y, predicted)

    print("F1-score:", score)
    print("Accuracy:", acc)
    print("Precision:", preci)
    print("Recall:", recall)

    testing(Xtrn, y, True)

    for p in predicted:
        PREDICTIONSFILE.write("{}\n".format(p))
    PREDICTIONSFILE.close()
Example #52
print "\nPrecision Score"
print precision_score(y_test, y_pred)
print "\nRecall Score"
print recall_score(y_test, y_pred)
print "\nF1 Score"
print f1_score(y_test, y_pred)"""

print("################### SVM Classifier ###############")

from sklearn.svm import LinearSVC

clf = LinearSVC(random_state=20, tol=1e-5)
clf = clf.fit(x_train, y_train)

print "\nAccuracy on Training Set :"
print clf.score(x_train, y_train)

print "Checking on Test Set"
print "\nAccuracy on Testing Set :"
print clf.score(x_test, y_test)

y_pred = clf.predict(x_test)

print "\nPrecision Score"
print precision_score(y_test, y_pred)
print "\nRecall Score"
print recall_score(y_test, y_pred)
print "\nF1 Score"
print f1_score(y_test, y_pred)
Example #53
        all_terms_list.extend(term_prob[i].keys())
    all_terms_list = set(all_terms_list)
#"""

#-------------------------- Classification --------------------------

classifier = CopulaClassifier(corcoeff, vocab_choice, priors)
predictions = classifier.predict_multilabelBR(test_docs,
                                              all_terms=all_terms_list)

print "The Classification is complete and it took", print_time(start_time)
#print "Avg time taken per doc: ", (print_time(start_time)/float(len(test_docs)))
start_time = time.time()

#-------------------------- Evaluation ----------------------
precision = precision_score(test_labels, predictions, average='micro')
recall = recall_score(test_labels, predictions, average='micro')
f1 = f1_score(test_labels, predictions, average='micro')

print("Micro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(
    precision, recall, f1))

precision = precision_score(test_labels, predictions, average='macro')
recall = recall_score(test_labels, predictions, average='macro')
f1 = f1_score(test_labels, predictions, average='macro')

print("Macro-average quality numbers")
print("Precision: {:.4f}, Recall: {:.4f}, F1-measure: {:.4f}".format(
    precision, recall, f1))
Example #54
#
# For classification problems with skewed class distributions like ours (say 100 text messages of which only 2 are spam),
# accuracy by itself is not a very good metric. We could classify 90 messages as not spam (including the 2 that really were
# spam, making them false negatives) and 10 as spam (all 10 false positives) and still get a reasonably good accuracy score.
# For such cases, precision and recall come in very handy. These two metrics can be combined into the F1 score, the harmonic
# mean of precision and recall. It ranges from 0 to 1, with 1 being the best possible F1 score.

# We will be using all 4 metrics to make sure our model does well. For all 4 metrics whose values can range from 0 to 1, having a score as close to 1 as possible is a good indicator of how well our model is doing.
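
# As a quick toy illustration of the point above (made-up labels, independent of the notebook's own pipeline):
# with 98 genuine messages and 2 spam, a classifier that misses both spam messages and wrongly flags 10 genuine
# ones still reaches 88% accuracy, while its precision, recall and F1 for the spam class are all 0.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
toy_true = [1] * 2 + [0] * 98                     # 2 spam, 98 not spam
toy_pred = [0] * 2 + [1] * 10 + [0] * 88          # both spam missed, 10 genuine messages flagged as spam
print('toy accuracy: ', accuracy_score(toy_true, toy_pred))    # 0.88
print('toy precision:', precision_score(toy_true, toy_pred))   # 0.0
print('toy recall:   ', recall_score(toy_true, toy_pred))      # 0.0
print('toy F1:       ', f1_score(toy_true, toy_pred))          # 0.0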

# In[ ]:
'''
Instructions:
Compute the accuracy, precision, recall and F1 scores of your model using your test data 'y_test' and the predictions
you made earlier stored in the 'predictions' variable.
'''

# In[51]:
'''
Solution
'''
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
print('Accuracy score: ', format(accuracy_score(y_test, predictions)))
print('Precision score: ', format(precision_score(y_test, predictions)))
print('Recall score: ', format(recall_score(y_test, predictions)))
print('F1 score: ', format(f1_score(y_test, predictions)))

# ### Step 7: Conclusion ###
#
# One of the major advantages that Naive Bayes has over other classification algorithms is its ability to handle an extremely large number of features. In our case, each word is treated as a feature and there are thousands of different words. Also, it performs well even in the presence of irrelevant features and is relatively unaffected by them. The other major advantage it has is its relative simplicity. Naive Bayes works well right out of the box, and tuning its parameters is rarely ever necessary, except usually in cases where the distribution of the data is known.
# It rarely ever overfits the data. Another important advantage is that its model training and prediction times are very fast for the amount of data it can handle. All in all, Naive Bayes really is a gem of an algorithm!
#
# Congratulations! You have successfully designed a model that can efficiently predict if an SMS message is spam or not!
#
# Thank you for learning with us!
Example #55
#print('five fold:')
print(mean_sensitivity)
print(mean_SP)
print(mean_ACC)
print(mean_MCC)
print(mean_AUC)

clf.fit(gram_train, y_train)

y_score = clf.predict_proba(gram_test)
y_score = get_y_score(y_score)
precision, recall, thresholds = metrics.precision_recall_curve(y_test, y_score)

y_pred = clf.predict(gram_test)
ACC = metrics.accuracy_score(y_test, y_pred)
precision = metrics.precision_score(y_test, y_pred)
sensitivity = metrics.recall_score(y_test, y_pred)
specificity = specificity_score(y_test, y_pred)
AUC = metrics.roc_auc_score(y_test, y_score)
MCC = metrics.matthews_corrcoef(y_test, y_pred)
AUPR = get_AUPR(y_test, y_score)

#print("===========================")
#print('testing:')
print(sensitivity)
print(specificity)
print(ACC)
print(MCC)
print(AUC)
#print('AUPR =', AUPR)
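
# `get_y_score`, `specificity_score` and `get_AUPR` above are project-specific helpers
# whose definitions are not included in this fragment. A minimal sketch of what such
# helpers typically compute for binary labels (an assumption, not the original implementations):
from sklearn import metrics

def get_y_score(proba):
    # Column of positive-class probabilities from a predict_proba output.
    return proba[:, 1]

def specificity_score(y_true, y_pred):
    # Specificity = TN / (TN + FP), read off the binary confusion matrix.
    tn, fp, fn, tp = metrics.confusion_matrix(y_true, y_pred).ravel()
    return tn / float(tn + fp)

def get_AUPR(y_true, y_score):
    # Area under the precision-recall curve, summarised here as average precision.
    return metrics.average_precision_score(y_true, y_score)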
Example #56
np.set_printoptions(precision=2)

plt.figure(figsize=(10, 6), dpi=220)
plot_confusion_matrix(cmatrix, title='Confusion matrix - No Normalisation')
plt.figure(figsize=(10, 6), dpi=220)
plot_confusion_matrix(cmatrix,
                      normalize=True,
                      title='Confusion Matrix - Normalised')
plt.show()

# In[9]:

print('Recall score: %0.2f' % recall_score(y_test, y_pred))
print('Accuracy score: %0.2f' % accuracy_score(y_test, y_pred))
print('Precision score: %0.2f' % precision_score(y_test, y_pred))

# In[10]:

model = RandomForestClassifier(n_estimators=64)

scores = cross_val_score(model, X_train, y_train, cv=20)
print('Accuracy: %0.2f (+/- %0.2f)' % (scores.mean(), scores.std() * 2))

# In[28]:

model = DummyClassifier(strategy="most_frequent")

model.fit(X_train, y_train)

y_pred = model.predict(X_test)
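
# The fragment above stops before the dummy baseline is scored. A minimal sketch of the
# comparison one would typically run against this most-frequent baseline, assuming the
# same X_test / y_test used for the random forest above:
from sklearn.metrics import accuracy_score, precision_score, recall_score

print('Baseline accuracy : %0.2f' % accuracy_score(y_test, y_pred))
# If the positive class is the minority, a most-frequent baseline never predicts it,
# so precision and recall drop to 0 (sklearn warns about the undefined case).
print('Baseline precision: %0.2f' % precision_score(y_test, y_pred))
print('Baseline recall   : %0.2f' % recall_score(y_test, y_pred))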
                    ensemble_scores_pos = ensemble_scores_pos + scores[:, 1][labelzz == 1].tolist()

                    print('Makes a prediction for', (len(positive_pred) + len(negative_pred)), 'domains')
                    print('Would predict', np.sum(predictions), 'domains malicious')
            else:
                total_manual += len(labelzz.index)
                ensemble_predictions = ensemble_predictions + labelzz.values.tolist()
                ensemble_labels = ensemble_labels + labelzz.values.tolist()

        print('Total work reduced', (total_amount_of_domains-total_manual)/total_amount_of_domains)
        print('Total FNR', total_fn/total_amount_positive)
        print('Total FPR', total_fp/total_amount_negative)

        print('Accuracy', accuracy_score(ensemble_labels, ensemble_predictions))
        print('F1', f1_score(ensemble_labels, ensemble_predictions))
        print('Precision', precision_score(ensemble_labels, ensemble_predictions))
        print('Recall', recall_score(ensemble_labels, ensemble_predictions))

        print('Little check', total_amount_positive+total_amount_negative == total_amount_of_domains)
        print('Little check', total_pred+total_manual == total_amount_of_domains)
        print('Little check', len(ensemble_scores_pos) + len(ensemble_scores_neg) == total_amount_of_domains)
        print('Little check', len(ensemble_scores_pos) == total_amount_positive)
        print('Little check', len(ensemble_scores_neg) == total_amount_negative)

        results_posteriori['work_reduction_metric'].append((total_amount_of_domains - total_manual) / total_amount_of_domains)
        results_posteriori['fnr_metric'].append(total_fn / total_amount_positive)
        results_posteriori['fpr_metric'].append(total_fp / total_amount_negative)
        results_posteriori['accuracy_metric'].append(accuracy_score(ensemble_labels, ensemble_predictions))
        results_posteriori['f1_metric'].append(f1_score(ensemble_labels, ensemble_predictions))
        results_posteriori['precision_metric'].append(precision_score(ensemble_labels, ensemble_predictions))
        results_posteriori['recall_metric'].append(recall_score(ensemble_labels, ensemble_predictions))
Example #58
# In[26]:

#applying on training dataset
y_train = []
pred = []
for row in train_dataset:
    prediction = predict(network, row)
    y_train.append(int(row[-1]))
    pred.append(prediction)

# In[27]:

print("Accuracy: ", accuracy_score(y_train, pred))
print("Confusion Matrix: ", confusion_matrix(y_train, pred))
print("Precision: ", precision_score(y_train, pred))
print("recall: ", recall_score(y_train, pred))

# In[28]:

#applying on testing dataset
y_test = []
pred = []
for row in test_dataset:
    prediction = predict(network, row)
    y_test.append(int(row[-1]))
    pred.append(prediction)

# In[29]:

print("Accuracy: ", accuracy_score(y_test, pred))
def eval_once(cdir, saver, top_k_op, labels_np, logits, predict):
    """Run Eval once.

    Args:
      saver: Saver.
      summary_writer: Summary writer.
      top_k_op: Top K op.
      summary_op: Summary op.
    """
    # gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=0.1)

    gpu_options = tf.GPUOptions(per_process_gpu_memory_fraction=1)
    with tf.Session(config=tf.ConfigProto(
            log_device_placement=False,  #
            gpu_options=gpu_options)) as sess:
        ckpt = tf.train.get_checkpoint_state(checkpoint_dir=cdir)
        if ckpt and ckpt.model_checkpoint_path:
            # Restores from checkpoint
            saver.restore(sess, ckpt.model_checkpoint_path)
            # Assuming model_checkpoint_path looks something like:
            #   /my-favorite-path/cifar10_train/model.ckpt-0,
            # extract global_step from it.
            # global_step = ckpt.model_checkpoint_path.split('/')[-1].split('-')[-1]
        else:
            print('No checkpoint file found')
            return

        # Start the queue runners.
        try:
            # num_iter = int(math.ceil(FLAGS.num_examples / FLAGS.batch_size))
            # true_count = 0  # Counts the number of correct predictions.
            # total_sample_count = num_iter * FLAGS.batch_size
            # step = 0
            # while step < num_iter and not coord.should_stop():
            #     predictions = sess.run([top_k_op])
            #     true_count += np.sum(predictions)
            #     step += 1
            # predictions = sess.run([top_k_op])
            # num_iter = int(math.ceil(FLAGS.num_examples / FLAGS.batch_size_test))
            # true_count = 0  # Counts the number of correct predictions.
            # total_sample_count = num_iter * FLAGS.batch_size_test
            # step = 0
            # while step < num_iter and not coord.should_stop():
            _, logits, pred = sess.run([top_k_op, logits, predict])

            # with open("testresultss_LSTM.txt", "a") as myfile:
            #     myfile.write(np.array2string(logits) + '\n')

            # print(pred)
            # logits_max = np.argmax(logits,axis=1)
            # print(logits_max)
            # print(labels_np)

            precision = precision_score(labels_np, pred)
            recall = recall_score(labels_np, pred)
            acc = accuracy_score(labels_np, pred)
            cm = confusion_matrix(labels_np, pred)
            # recall = sess.run(rec_op)
            # acc = sess.run(acc_op)#accuracy

            # true_count = np.sum(predictions)
            # false_count = FLAGS.num_examples - np.count_nonzero(predictions)
            # Compute precision @ 1.
            # precision = true_count / FLAGS.num_examples
            # recall = tf.metrics.recall(labels=labels,predictions=predictions)
            print('precision @ 1 = %.3f recall @ 1 = %.3f acc @ 1 = %.3f' %
                  (precision, recall, acc))
            with open("testresultss_LSTM.txt", "a") as myfile:
                # myfile.write(cdir + '\n')
                myfile.write(cdir + ',%.3f,%.3f,%.3f \n' %
                             (precision, recall, acc))
                myfile.write(np.array2string(cm) + '\n')

        # summary = tf.Summary()
        # summary.ParseFromString(sess.run(summary_op))
        # summary.value.add(tag='Precision @ 1', simple_value=precision)
        # summary.value.add(tag='Recall @ 1', simple_value=recall)
        # summary.value.add(tag='Accuracy @ 1', simple_value=acc)
        # summary_writer.add_summary(summary, global_step)
        except Exception as e:  # pylint: disable=broad-except
            print('error in ' + cdir + ': ' + str(e))
import sklearn.metrics
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(variables, labels)

# In-sample predictions: the metrics below are computed on the same data the model was fit on.
pred = neigh.predict(variables)
accuracy = sklearn.metrics.accuracy_score(labels, pred)
print(accuracy)

from sklearn.metrics import f1_score
knn_f1_score = f1_score(labels, pred, average='macro')
print(knn_f1_score)

from sklearn.metrics import precision_score
knn_precision_score = precision_score(labels, pred, average='macro')
print(knn_precision_score)

from sklearn.metrics import recall_score
knn_recall_score = recall_score(labels, pred, average='macro')
print(knn_recall_score)

################################################################################

variables_train, variables_test, labels_train, labels_test=train_test_split(
        variables, labels, test_size=.9, random_state=1)

neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(variables_train,labels_train)
pred = neigh.predict(variables_test)
accuracy=sklearn.metrics.accuracy_score(labels_test, pred)
print(accuracy)
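
# For parity with the in-sample metrics above, the same macro-averaged scores can be
# computed on the held-out split (note that test_size=.9 leaves only 10% of the data
# for training, so these held-out numbers are the more realistic estimate):
print(f1_score(labels_test, pred, average='macro'))
print(precision_score(labels_test, pred, average='macro'))
print(recall_score(labels_test, pred, average='macro'))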