def test_topic_distribution(doc_topic_weights_filename, annotated_data_filename, k, test_indice, train_prop, column_of_interest): (X, Y, topic_to_index) = process_dataset(doc_topic_weights_filename, annotated_data_filename, k, column_of_interest) num_train = int(X.shape[0]) # We repeat the experiments and report the average scores = [] # Divide the set into train and test sets X_train = X[:num_train] Y_train = Y[:num_train] test_start_index = test_indice[0] test_end_index = test_indice[1] X_test = X[test_start_index:test_end_index] Y_test = Y[test_start_index:test_end_index] # Build a classifier clf = LogisticRegression().fit(X_train, Y_train) # Make prediction predicted_labels = clf.predict(X_test) # Report the accuracy true_labels = Y_test score = f1_score(predicted_labels, true_labels) print "---------------- %s --------------------" % str(test_indice) print classification_report(predicted_labels, true_labels) print topic_to_index print "-----------------------------------------" return score
def test_classification_report_multiclass_with_string_label():
    """Check classification_report when the labels are strings.

    NOTE(review): the internal whitespace of the ``expected_report`` literals
    was re-flowed from a whitespace-collapsed copy of this file; confirm the
    column layout against the real classification_report output.
    """
    y_true, y_pred, _ = make_prediction(binary=False)

    # Replace the values returned by make_prediction with string names by
    # using them as indices into the name array.
    y_true = np.array(["blue", "green", "red"])[y_true]
    y_pred = np.array(["blue", "green", "red"])[y_pred]

    # Report with the labels detected from the data.
    expected_report = """\
             precision    recall  f1-score   support

       blue       0.83      0.79      0.81        24
      green       0.33      0.10      0.15        31
        red       0.42      0.90      0.57        20

avg / total       0.51      0.53      0.47        75
"""
    report = classification_report(y_true, y_pred)
    assert_equal(report, expected_report)

    # Same data, with explicit display names passed as target_names.
    expected_report = """\
             precision    recall  f1-score   support

          a       0.83      0.79      0.81        24
          b       0.33      0.10      0.15        31
          c       0.42      0.90      0.57        20

avg / total       0.51      0.53      0.47        75
"""
    report = classification_report(y_true, y_pred,
                                   target_names=["a", "b", "c"])
    assert_equal(report, expected_report)
def test_classification_report_multiclass():
    """Test performance report

    NOTE(review): the internal whitespace of the ``expected_report`` literals
    was re-flowed from a whitespace-collapsed copy of this file; confirm the
    column layout against the real classification_report output.
    """
    iris = datasets.load_iris()
    y_true, y_pred, _ = make_prediction(dataset=iris, binary=False)

    # print classification report with class names
    expected_report = """\
             precision    recall  f1-score   support

     setosa       0.83      0.79      0.81        24
 versicolor       0.33      0.10      0.15        31
  virginica       0.42      0.90      0.57        20

avg / total       0.51      0.53      0.47        75
"""
    report = classification_report(
        y_true, y_pred, labels=np.arange(len(iris.target_names)),
        target_names=iris.target_names)
    assert_equal(report, expected_report)

    # print classification report with label detection
    expected_report = """\
             precision    recall  f1-score   support

          0       0.83      0.79      0.81        24
          1       0.33      0.10      0.15        31
          2       0.42      0.90      0.57        20

avg / total       0.51      0.53      0.47        75
"""
    report = classification_report(y_true, y_pred)
    assert_equal(report, expected_report)
def simple_classification_without_cross_fold_validation(x, y, estimator, scoring): ''' Run normal SVM classification without cross-fold validation. ''' x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3) # 30% reserved for validation # feature selection since we have a small sample space fs = SelectPercentile(scoring, percentile=20) pipeline = Pipeline([('featureselector', fs), ('scaler', StandardScaler()), ('estimator', estimator)]) pipeline = OneVsRestClassifier(pipeline) clfer = pipeline.fit(x_train, y_train) y_predict_train = clfer.predict(x_train) print "%% Accuracy on training set: %2.3f" % metrics.accuracy_score(y_train, y_predict_train) y_predict_test = clfer.predict(x_test) print "\n%% Accuracy on testing set: %2.3f" % metrics.accuracy_score(y_test, y_predict_test) print "\nClassification Report:" print metrics.classification_report(y_test, y_predict_test) print "Confusion Matrix:" print metrics.confusion_matrix(y_test, y_predict_test)
def test_classification_report_multiclass_with_digits():
    """Test performance report with added digits in floating point values

    NOTE(review): the internal whitespace of the ``expected_report`` literals
    was re-flowed from a whitespace-collapsed copy of this file; confirm the
    column layout against the real classification_report output.
    """
    iris = datasets.load_iris()
    y_true, y_pred, _ = make_prediction(dataset=iris, binary=False)

    # print classification report with class names
    expected_report = """\
             precision    recall  f1-score   support

     setosa    0.82609   0.79167   0.80851        24
 versicolor    0.33333   0.09677   0.15000        31
  virginica    0.41860   0.90000   0.57143        20

avg / total    0.51375   0.53333   0.47310        75
"""
    report = classification_report(
        y_true, y_pred, labels=np.arange(len(iris.target_names)),
        target_names=iris.target_names, digits=5)
    assert_equal(report, expected_report)

    # print classification report with label detection
    # (no digits argument here, so the expected values have two decimals)
    expected_report = """\
             precision    recall  f1-score   support

          0       0.83      0.79      0.81        24
          1       0.33      0.10      0.15        31
          2       0.42      0.90      0.57        20

avg / total       0.51      0.53      0.47        75
"""
    report = classification_report(y_true, y_pred)
    assert_equal(report, expected_report)
def model_search(estimator, tuned_params, scores, X_train, y_train, X_test, y_test): cv = ShuffleSplit(len(X_train), n_iter=3, test_size=0.30, random_state=0) for score in scores: print"# Tuning hyper-parameters for %s" % score print clf = GridSearchCV(estimator, tuned_params, cv=cv, scoring='%s' % score) clf.fit(X_train, y_train) print"Best parameters set found on development set:" print print clf.best_params_ print print "Grid scores on development set:" print for params, mean_score, scores in clf.grid_scores_: print "%0.3f (+/-%0.03f) for %r" % (mean_score, scores.std() * 2, params) print print "Detailed classification report:" print print "The model is trained on the full development set." print "The scores are computed on the full evaluation set." print y_true, y_pred = y_test, clf.predict(X_test) print classification_report(y_true, y_pred) print
def main(): # Get the data and targets df = pd.read_csv('train1.csv') df = df[df.rating != 'rating'] corpus = [review for review in df.review] splitPoint = len(corpus)*2/3 trainingCorpus = corpus[:splitPoint] testCorpus = corpus[splitPoint:] target = [rating for rating in df.rating] trainingTarget = np.array(target[:splitPoint]) testTarget = np.array(target[splitPoint:]) # Train the algorithm train_X, vocabList = createVectorizer(trainingCorpus, 'None', True) NB_Bern_model = BernoulliNB().fit(train_X, trainingTarget) # Test the algorithm test_X = createVectorizer(testCorpus, vocabList, True) test_predict = NB_Bern_model.predict(test_X) print(np.mean(test_predict == testTarget)) print metrics.classification_report(testTarget, test_predict, target_names=['0', '1']) # Make Predictions predict_df = pd.read_csv('test2.csv') predictCorpus = [review for review in predict_df.review] member = [memberid for memberid in predict_df.ID] predict_X = createVectorizer(predictCorpus, vocabList, True) predictions = NB_Bern_model.predict(predict_X) predict_df.columns = ['ID', 'Predicted'] for i in range(len(member)): predict_df.loc[predict_df['ID'] == member[i], 'Predicted'] = predictions[i] predict_df.to_csv('submission1.csv', sep = ',', index=False)
def print_metrics(self, y_true, y_pred, print_averages=True): print print "{:^30}".format("Confusion matrix") categories = sorted(self.categories) labels = " ".join("{:>10}".format(c) for c in categories) print "{:>10} {} {:>10}".format("gold\pred", labels, "total") for cat, predictions in zip(categories, metrics.confusion_matrix(y_true, y_pred)): vals = " ".join("{:>10d}".format(p) for p in predictions) print "{:>10} {} {:>10}".format(cat, vals, sum(predictions)) print acc = metrics.accuracy_score(y_true, y_pred) print "Accuracy: {:.4f}".format(acc) idx = 0 d = {} for l in self.categories: d[l] = idx idx += 1 print metrics.classification_report([d[y] for y in y_true], [d[y] for y in y_pred], target_names=self.categories) if print_averages: print "Macro averaging" self._print_metrics(y_true, y_pred, average='macro') print "Micro averaging" self._print_metrics(y_true, y_pred, average='micro')
def test_classification_report():
    """Test performance report

    NOTE(review): the internal whitespace of the ``expected_report`` literals
    was re-flowed from a whitespace-collapsed copy of this file; confirm the
    column layout against the real classification_report output.
    """
    iris = datasets.load_iris()
    y_true, y_pred, _ = make_prediction(dataset=iris, binary=False)

    # print classification report with class names
    expected_report = """\
             precision    recall  f1-score   support

     setosa       0.82      0.92      0.87        25
 versicolor       0.56      0.17      0.26        30
  virginica       0.47      0.90      0.62        20

avg / total       0.62      0.61      0.56        75
"""
    report = classification_report(
        y_true, y_pred, labels=range(len(iris.target_names)),
        target_names=iris.target_names)
    assert_equal(report, expected_report)

    # print classification report with label detection
    expected_report = """\
             precision    recall  f1-score   support

          0       0.82      0.92      0.87        25
          1       0.56      0.17      0.26        30
          2       0.47      0.90      0.62        20

avg / total       0.62      0.61      0.56        75
"""
    report = classification_report(y_true, y_pred)
    assert_equal(report, expected_report)
def test_one_rf(): Xtrain_raw, ytrain_raw = load_raw_data("sentidata_train_raw.pkl") print "training data loaded" print_label_frequency(ytrain_raw) ############# create the pipeline pipeline = Pipeline([ ('vect', CountVectorizer(analyzer=lambda x:x,max_features=3000)), ('tfidf', TfidfTransformer()), ('rf', RandomForestClassifier(n_estimators=500, max_depth=200, min_samples_split=10, oob_score=True, n_jobs=-1,verbose=1,class_weight='balanced')), ]) ############# train pipeline.fit(Xtrain_raw,ytrain_raw) ############# check result rf = pipeline.steps[-1][1] rf.oob_score_ ############# training error ytrain_predict = pipeline.predict(Xtrain_raw) print classification_report(y_true=ytrain_raw,y_pred=ytrain_predict) print confusion_matrix(y_true=ytrain_raw,y_pred=ytrain_predict) ############# testing error Xtest_raw, ytest_raw = load_raw_data("sentidata_test_raw.pkl") ytest_predict = pipeline.predict(Xtest_raw) accuracy_score(y_true=ytest_raw,y_pred=ytest_predict) print classification_report(y_true=ytest_raw,y_pred=ytest_predict)
def main(train_data, test_data): print "Training" m = HiddenMarkovModelTagger.train(train_data) print "Predicting" predicted_labels = [] for i, sent in enumerate(test_data): if i % 500 == 0: print "%d / %d" %(i, len(test_data)) predicted_labels += [tag for _, tag in m.tag( [word for word, _ in sent] )] correct_labels = [tag for sent in test_data for _, tag in sent] # print predicted_labels # print correct_labels from sklearn.metrics import classification_report print classification_report(correct_labels, predicted_labels) correct_n = len([1 for p, c in zip(predicted_labels, correct_labels) if p == c]) print "Item accuracy:", float(correct_n) / len(correct_labels)
def separable_demo(): """ Generate a linearly-separable dataset D, train a linear SVM on D, then output the resulting decision boundary on a figure. """ from sklearn.datasets import make_blobs X, y = make_blobs(n_samples=200, n_features=2, centers=((0,0), (4, 4)), cluster_std=1.0) plot_data(X, y) X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25) svc = svm.SVC(class_weight='auto') param_grid = {'kernel': ['linear'], 'C': [1e0, 1e1, 1e2, 1e3, 1e4]} strat_2fold = StratifiedKFold(y_train, k=2) print " Parameters to be chosen through cross validation:" for name, vals in param_grid.iteritems(): if name != 'kernel': print " {0}: {1}".format(name, vals) clf = GridSearchCV(svc, param_grid, n_jobs=1, cv=strat_2fold) clf.fit(X_train, y_train) print "== Best Parameters:", clf.best_params_ y_pred = clf.predict(X_test) acc = len(np.where(y_pred == y_test)[0]) / float(len(y_pred)) print "== Accuracy:", acc print classification_report(y_test, y_pred) plot_svm(clf.best_estimator_, X, y, X_test, y_test, title="SVM Decision Boundary, Linear Kernel ({0} accuracy, C={1})".format(acc, clf.best_params_['C']))
def benchmark(clf_class, params, name): print "parameters:", params t0 = time() clf = clf_class(**params).fit(X_train, y_train) print "done in %fs" % (time() - t0) if hasattr(clf, 'coef_'): print "Percentage of non zeros coef: %f" % ( np.mean(clf.coef_ != 0) * 100) print "Predicting the outcomes of the testing set" t0 = time() pred = clf.predict(X_test) print "done in %fs" % (time() - t0) print "Classification report on test set for classifier:" print clf print print classification_report(y_test, pred, target_names=news_test.target_names) cm = confusion_matrix(y_test, pred) print "Confusion matrix:" print cm # Show confusion matrix pl.matshow(cm) pl.title('Confusion matrix of the %s classifier' % name) pl.colorbar()
def score(train_labels, train_features, test_labels, test_features, save_file, use_tree=False): if use_tree: train_clf = Classifier(tree.DecisionTreeClassifier()) else: train_clf = Classifier() print train_clf.clf print '' t_start = time.clock() train_clf.learn(train_features, train_labels) t_end = time.clock() if save_file: train_clf.save_to_file(open(save_file, 'w')) p_start = time.clock() predicted = train_clf.clf.predict(test_features) p_end = time.clock() test_labels_t = train_clf.labels.transform(test_labels) print classification_report(test_labels_t, predicted, target_names=train_clf.labels.classes_) print 'Training time: %fs' % (t_end - t_start) print 'Predicting time: %fs' % (p_end - p_start) print 'Mean squared error: %f' % mean_squared_error(test_labels_t, predicted) return train_clf.score(test_features, test_labels)
def PredictAndAnalyze(data = data,target = target2,clf_cv = svm.SVC(kernel='linear', probability=True),checkauc = False,ifprint = False,balancing = True): kf = KFold(len(target), n_folds=10, shuffle=True) aucs = [] y_trueall = [] y_pridictall = [] for train, val in kf: X_train, y_train = np.array(data)[train], np.array(target)[train] if balancing == True: length = min([len(y_train[y_train == 0]),len(y_train[y_train == 1]),len(y_train[y_train == 2])]) X_train = np.r_[X_train[y_train == 0][0:length],X_train[y_train == 1][0:length],X_train[y_train == 2][0:length]] y_train = np.r_[y_train[y_train == 0][0:length],y_train[y_train == 1][0:length],y_train[y_train == 2][0:length]] X_test, y_test = np.array(data)[val], np.array(target)[val] clf_cv.fit(X_train, y_train) y_pred = clf_cv.predict(X_test) y_true = y_test y_trueall = y_trueall + list(y_true) y_pridictall = y_pridictall + list(y_pred) if ifprint == True: print(classification_report(y_true, y_pred)) if checkauc == True: y_pred_cv = clf_cv.predict_proba(X_test)[:, 1] auc = roc_auc_score(y_test, y_pred_cv) aucs.append(auc) if checkauc == True: print np.mean(aucs), np.std(aucs) print(classification_report(y_trueall, y_pridictall)) return y_trueall, y_pridictall
def xgb_model(x_train, x_test, y_train, y_test): dtrain = xgb.DMatrix( x_train, label=y_train) del x_train dtest = xgb.DMatrix( x_test, label=y_test) del x_test param = {} param['eta'] = 0.1 param['max_depth'] = 10 param['silent'] = 1 param['num_class'] = 4 param['objective'] = 'multi:softmax' param['nthread'] = 2 param['n_estimators']=100 #param['eval_metric'] = 'auc' plst = param.items() watchlist = [ (dtrain,'train'), (dtest, 'test') ] #evallist = [(dtest,'eval'), (dtrain,'train')] num_round = 237 bst = xgb.train( param, dtrain, num_round) #xgb.plot_importance(bst) #rf = RandomForestClassifier(n_estimators=3000, max_depth=10,n_jobs=2) #pipe1 = Pipeline([('sel',ColumnSelector(range(col_count))),('clf',bst)]) #pipe2 = Pipeline([('sel',ColumnSelector(range(col_count[:-4]))),('clf',rf)]) y_pred = bst.predict(dtest,ntree_limit=bst.best_ntree_limit) target_names = ['Start','Mid','End','Others'] #eclf = EnsembleClassifier(clfs=[pipe1, pipe2],voting='soft',weights=[0.5,0.2]) #eclf.fit(x_train,y_train) #y_pred = eclf.predict(x_test) print classification_report(y_test, y_pred, target_names=target_names) return bst
def test_prf(fn1,fn2,sth,L): y_true=[] y_score=[] edges_1=prep.read_edges(fn1) edges_2=prep.read_edges(fn2) predict_set={} for key in sth.keys(): predict_set[key]=predict_set.get(key,0.)+sth[key] predict_set=sorted(predict_set.iteritems(),key=lambda d:d[1],reverse=True)# threshold=predict_set[L][1] for i in edges_1: if sth[i]>threshold: y_score.append(1) else: y_score.append(0) for i in edges_1: if i not in edges_2: y_true.append(0) else: y_true.append(1) print classification_report(y_true,y_score) print auc_score(y_true,y_score)
def nearest_centroid_classifier(X_train, categories, X_test, test_categories): from sklearn.neighbors import NearestCentroid clf = NearestCentroid().fit(X_train, categories) y_roccio_predicted = clf.predict(X_test) print "\n Here is the classification report for NearestCentroid classifier:" print metrics.classification_report(test_categories, y_roccio_predicted) to_latex(test_categories, y_roccio_predicted)
def test(self): lenW = len(self.vectorizer.vocabulary_) W = 3*lenW Y_true = [] Y_pred = [] for i,line in enumerate(self.test_lines): if line['type'] == 'q': r = line['answer'] id = line['id']-1 indices = [idx for idx in range(i-id, i+1)] memory_list = self.L_test[indices] m_o1 = O_t([id], memory_list, self.s_Ot) m_o2 = O_t([id, m_o1], memory_list, self.s_Ot) bestVal = None best = None for w in self.vectorizer.vocabulary_: val = self.sR([id, m_o1, m_o2], self.H[w], memory_list, self.V) if bestVal is None or val > bestVal: bestVal = val best = w Y_true.append(r) Y_pred.append(best) print metrics.classification_report(Y_true, Y_pred)
def assess_classification_performance(model, X_train, y_train, X_test, y_test, short = False):
    """Print accuracy and, unless ``short``, confusion matrices and reports.

    Parameters
    ----------
    model
        Fitted estimator exposing ``predict``.
    X_train, y_train, X_test, y_test
        Training and test samples with their true labels.
    short
        When true only the accuracy line is printed.
    """
    # Predict once per dataset; the predictions feed every metric below.
    predicted_train = model.predict(X_train)
    predicted_test = model.predict(X_test)

    accuracy_train = metrics.accuracy_score(y_train, predicted_train)
    accuracy_test = metrics.accuracy_score(y_test, predicted_test)
    print('accuracy (train/test): {} / {}\n'.format(accuracy_train, accuracy_test))

    if not short:
        # confusion matrix
        # rows: actual group
        # columns: predicted group
        print('Confusion_matrix (training data):')
        print(metrics.confusion_matrix(y_train, predicted_train))
        print('Confusion_matrix (test data):')
        print(metrics.confusion_matrix(y_test, predicted_test))

        # precision = tp / (tp + fp)
        # recall = tp / (tp + fn) (= sensitivity)
        # F1 = 2 * (precision * recall) / (precision + recall)
        print('\nPrecision - recall (training data):')
        print(metrics.classification_report(y_train, predicted_train))
        print('\nPrecision - recall (test data):')
        print(metrics.classification_report(y_test, predicted_test))
def svc_classifier(X_train, categories,X_test, test_categories): from sklearn.svm import SVC svm_classifier = SVC(C=100, gamma=0.1).fit(X_train, categories) y_svm_predicted = svm_classifier.predict(X_test) print '\n Here is the classification report for support vector machine classiffier:' print metrics.classification_report(test_categories, y_svm_predicted) to_latex(test_categories, y_svm_predicted)
def print_classification_metrics(estimated_labels, actual_labels): mapping = get_most_likely_class_map(estimated_labels, actual_labels) predicted_labels = [] for i in estimated_labels: predicted_labels.append(mapping[i]) print metrics.classification_report(estimated_labels, actual_labels)
def faces(): from os import walk,path import numpy as np import mahotas as mh from sklearn.cross_validation import train_test_split from sklearn.cross_validation import cross_val_score from sklearn.preprocessing import scale from sklearn.decomposition import PCA from sklearn.linear_model import LogisticRegression from sklearn.metrics import classification_report X = [] y = [] for dir_path,dir_names,file_names in walk('./data/att_faces'): for fn in file_names: if fn[-3:] == 'pgm': image_filename = path.join(dir_path,fn) X.append(scale(mh.imread(image_filename,as_grey=True).reshape(10304).astype('float32'))) y.append(dir_path) X = np.array(X) X_train,X_test,y_train,y_test = train_test_split(X,y) pca = PCA(n_components = 150) X_train_reduced = pca.fit_transform(X_train) X_test_reduced = pca.transform(X_test) print 'original data were',X_train.shape print 'reduced is ',X_train_reduced.shape classifier = LogisticRegression() accuracies = cross_val_score(classifier,X_train_reduced,y_train) print 'cross val: ',np.mean(accuracies),accuracies classifier.fit(X_train_reduced,y_train) predictions = classifier.predict(X_test_reduced) print classification_report(y_test,predictions)
def train_logreg(X, y, test_X, test_y, load_vec=True): """ Trains logistic regression on the feature set. """ full_y = y + test_y lb = LabelBinarizer() lb.fit(full_y) # Convert into 1-D array print len(X), len(test_X) model = LogisticRegression() big_X = X + test_X features = featurize(big_X) X, test_X = features[:4500], features[4500:] print X.shape, X model.fit(X, y) y_pred = model.predict(X) print set(y_pred) print metrics.classification_report(y, y_pred, digits = 3) y_pred = model.predict(test_X) print set(y_pred) print metrics.classification_report(test_y, y_pred, digits = 3)
def sklearn_lp(X, y, output=None, kernel='knn', gamma=None, n_neighbors=10, alpha=1, max_iter=1000, tol=0.00001): from sklearn.cross_validation import train_test_split from sklearn.metrics import classification_report from sklearn.metrics import accuracy_score X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.9, random_state=3) label_prop_model = LabelPropagation(kernel=kernel, gamma=gamma, n_neighbors=n_neighbors, alpha=alpha, max_iter=max_iter, tol=tol) label_prop_model.fit(X_train, y_train) y_predict = label_prop_model.predict(X_test) print 'y_train: ', y_train print 'y_predict: ', y_predict print '+--------------------------------------------------------+' print '| Report +' print '+--------------------------------------------------------+' print classification_report(y_test, y_predict) print 'accuracy: ' + str(accuracy_score(y_test, y_predict)) print '\n\n'
def main(): data_id = 'B' data_path = '/broad/compbio/maxwshen/data/1-MAKETRAINTEST/complete/golf/' print 'train...', datetime.datetime.now() train_set = readin(data_id, 'train', data_path) print 'valid...', datetime.datetime.now() valid_set = readin(data_id, 'valid', data_path) print 'test...', datetime.datetime.now() test_set = readin(data_id, 'test', data_path) # Input to 300 node RBM to 2 node output dbn = DBN( \ [xtrain.shape[1], 300, 2], \ learn_rates = 5, \ learn_rate_decays = 0.9, \ epochs = 31, \ verbose = 1) dbn.fit(dat_train, y_train) preds = dbn.predict(dat_test) print classification_report(y_test, preds) out_fn = 'dbn.pickle' with open(out_fn, 'w') as f: pickle.dump(dbn, out_fn) return
def all_report(auto_narrative_entities,auto_original_entities,auto_negative,manual_positive,manual_negative): y_true = [] y_pred = [] for e in manual_positive: y_true.append(1) if e in auto_narrative_entities or e in auto_original_entities: y_pred.append(1) elif manual_negative: y_pred.append(0) else: y_pred.append(-1) for e in manual_negative: y_true.append(0) if e in auto_narrative_entities or e in auto_original_entities: y_pred.append(1) elif manual_negative: y_pred.append(0) else: y_pred.append(-1) print classification_report(y_true, y_pred)
def on_epoch_end(self, epoch, logs={}): print logs corr=0 tot=0 preds = self.model.predict(self.dev_data, verbose=1) preds_text=[] for l in preds: preds_text.append(self.index2label[np.argmax(l)]) print "Micro f-score:", f1_score(self.dev_labels_text,preds_text,average=u"micro") print "Macro f-score:", f1_score(self.dev_labels_text,preds_text,average=u"macro") print "Macro recall:", recall_score(self.dev_labels_text,preds_text,average=u"macro") if self.best_mr < recall_score(self.dev_labels_text,preds_text,average=u"macro"): self.best_mr = recall_score(self.dev_labels_text,preds_text,average=u"macro") model.save_weights(self.model_name + '_full_' + str(epoch) + '_MR_' + str(self.best_mr) + '.hdf5') print 'Saved Weights!' print classification_report(self.dev_labels_text, preds_text) for i in xrange(len(self.dev_labels)): # next_index = sample(preds[i]) next_index = np.argmax(preds[i]) # print preds[i],next_index,index2label[next_index] l = self.index2label[next_index] # print "correct:", index2label[np.argmax(dev_labels[i])], "predicted:",l if self.index2label[np.argmax(self.dev_labels[i])]==l: corr+=1 tot+=1 print corr,"/",tot
def tune_parameters(X_train,X_test, y_train,y_test,param_grid): ''' Function to tune an SVM classifier and choose its parameters :param feature_matrix: training data :param labels: labels for training data :param param_grid: grid of parameters to try :return: clf.best_estimator_ the best classifier ''' #X_train, X_test, y_train, y_test = train_test_split(feature_matrix, labels, test_size=0.2, random_state=0) #X_train,X_test,y_train,y_test = split_data(feature_matrix,labels,params['split_percentage']) score = 'f1_weighted' clf = GridSearchCV(SVC(C=1), param_grid, cv=5, scoring=score, n_jobs=10) clf.fit(X_train, y_train) print "Best score during training: ", clf.best_score_ print "Best estimator", clf.best_estimator_ print "Classification report for validation set:" print classification_report(y_test,clf.predict(X_test)) return clf.best_estimator_
def test_digits() : from sklearn.cross_validation import train_test_split from sklearn.datasets import load_digits from sklearn.metrics import confusion_matrix, classification_report, accuracy_score from sklearn.preprocessing import LabelBinarizer digits = load_digits() X = digits.data y = digits.target #labels X /= X.max() #norm nn = NeuralNetwork([64,100,10],'logistic') #8x8 input, 10 output X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0) labels_train = LabelBinarizer().fit_transform(y_train) #convert no to vector labels_test = LabelBinarizer().fit_transform(y_test) nn.fit(X_train,labels_train,epochs=100) predictions = [] for i in range(X_test.shape[0]) : o = nn.predict(X_test[i]) predictions.append(np.argmax(o)) print confusion_matrix(y_test,predictions) print classification_report(y_test,predictions) print 'accuracy at %0.3f'%accuracy_score(y_test,predictions)
# Directory scanned for the training CSV files.
_TRAINING_DIR = 'batch31-38_with_target_diff_prx_2000tampered/'
# File the trained classifier is pickled to and loaded back from.
_MODEL_FILE = 'finalized_model1.pkl'
# File-name letters that mark a contaminated sample.
_CONTAMINANT_CODES = ('S', 'T', 'W')
# Wording used when the category is read from the uploaded file's name.
_FILE_TYPE_TEXT = {'S': 'Sludge', 'W': 'Water', 'T': 'Sludge+Water'}
# Wording used when the category comes from the model.
_PREDICTION_TEXT = {'S': 'Sludge', 'W': 'Water', 'T': 'Water+Sludge'}


def _category_letter(file_name, init_string='-'):
    """Return the character of ``file_name`` that encodes its category.

    The last 'W', 'S' or 'T' within the 11 characters following the first
    ``init_string`` (or first '_', or index 5 when neither occurs) is used.
    """
    anchor = file_name.find(init_string)
    if anchor == -1:
        anchor = file_name.find('_')
    if anchor == -1:
        print(anchor)
        anchor = 5
    lo, hi = anchor + 1, anchor + 12
    hit = max(file_name.rfind(code, lo, hi) for code in ('W', 'S', 'T'))
    # rfind yields -1 on a miss, so a name without S/T/W returns its last
    # character; callers treat any non-S/T/W letter as "no contaminant".
    return file_name[hit]


def _transient_bounds(series, quantile=0.7, min_offset=200):
    """Return ``(start, end)`` positions of the transient in a rolling-std series.

    ``start`` is the first position above the ``quantile`` threshold; ``end``
    is the first position more than ``min_offset`` samples after ``start``
    that is below the threshold. Either is 0 when no such position exists.
    """
    threshold = series.quantile(quantile)
    start = next((pos for pos, value in enumerate(series) if value > threshold), 0)
    end = next(
        (start + offset
         for offset, value in enumerate(series.iloc[start:])
         if offset > min_offset and value < threshold),
        0,
    )
    return start, end


def _window_means(series, start, end):
    """Return ``(pre_mean, transient_mean, post_mean, width)`` for the given bounds."""
    return (
        series.iloc[:start].mean(),
        series.iloc[start:end].mean(),
        series.iloc[end:].mean(),
        max(end - start, 0),
    )


def _feature_table(rolling, columns, announce=False):
    """Build one row of transient features per column of ``rolling``."""
    rows = []
    for col in rolling.columns:
        if announce:
            print(col)  # file name
        start, end = _transient_bounds(rolling[col])
        rows.append((col,) + _window_means(rolling[col], start, end))
    return pd.DataFrame(rows, columns=columns)


def _show_home():
    """Render the Home page: upload an image and display it with its details."""
    st.subheader("Home")
    image_file = st.file_uploader("Upload Image", type=['png', 'jpeg', 'jpg'])
    if image_file is not None:
        st.write({
            "Filename": image_file.name,
            "FileType": image_file.type,
            "FileSize": image_file.size
        })
        st.image(load_image(image_file), width=250)


def _train_model():
    """Extract features from the training CSVs, fit the classifier and pickle it."""
    allResults = sorted(glob.glob(_TRAINING_DIR + '*.csv', recursive=True),
                        key=lambda path: path.split("/")[-1])
    fileList = directorySearch(_TRAINING_DIR, '.csv')

    # Built from records instead of cell-by-cell chained assignment, which
    # does not reliably write into the frame.
    records = [
        {
            'file': fileName,
            'Target': _category_letter(fileName),
            'file_dir': allResults[i] if i < len(allResults) else None,
            'window_id': i + 1,
        }
        for i, fileName in enumerate(fileList)
    ]
    final_filelist = pd.DataFrame(
        records, columns=['file', 'Target', 'file_dir', 'window_id'])

    # Keep files whose dash-separated prefix has 2-4 parts and whose last
    # part does not contain 'A'.
    kept = []
    for name in final_filelist['file']:
        tail = os.path.split(name)[1]
        parts = re.split('-', re.split('_', tail)[0])
        print(parts)
        if len(parts) in (2, 3, 4) and 'A' not in parts[-1]:
            kept.append(tail)

    # Each CSV is read once and feeds both rolling-std tables.
    pressure_std = pd.DataFrame()
    density_std = pd.DataFrame()
    for name in kept:
        frame = pd.read_csv(_TRAINING_DIR + name)
        pressure_std[name] = frame['Pressure_tmp'].rolling(300).std()
        density_std[name] = frame['Density'].rolling(300).std()

    pressure = _feature_table(pressure_std, [
        'file', 'pre-trans_mean', 'trans_mean', 'post-trans_mean',
        'transient_width'
    ])
    summary = pressure_std.describe().transpose().reset_index()
    df10 = pd.merge(pressure, summary[['index', 'max']],
                    left_on='file', right_on='index', how='left')
    del df10['index']
    df10 = df10.set_index('file')
    df11 = pd.merge(df10, final_filelist[['file', 'Target']],
                    left_on='file', right_on='file', how='left')
    df11 = df11.set_index('file')
    df11 = df11.astype({
        'pre-trans_mean': 'float64',
        'trans_mean': 'float64',
        'post-trans_mean': 'float64',
        'transient_width': 'float64'
    })

    density = _feature_table(density_std, [
        'file', 'pre-trans_mean-density', 'trans_mean-density',
        'post-trans_mean-density', 'transient_width-density'
    ], announce=True)
    density = density.astype({
        'pre-trans_mean-density': 'float64',
        'trans_mean-density': 'float64',
        'post-trans_mean-density': 'float64',
        'transient_width-density': 'float64'
    })
    density = density[[
        'file', 'pre-trans_mean-density', 'post-trans_mean-density'
    ]].copy()
    density['pre-trans_mean-density'] = density['pre-trans_mean-density'].fillna(0)

    df11.dropna(inplace=True)
    le = preprocessing.LabelEncoder()
    df11['Target'] = le.fit_transform(df11['Target'])
    df15 = df11.merge(density, how='inner', on='file')
    del df15['file']
    df15 = df15[[
        'pre-trans_mean', 'trans_mean', 'post-trans_mean', 'transient_width',
        'max', 'pre-trans_mean-density', 'post-trans_mean-density', 'Target'
    ]]
    st.write(df15)

    feature = df15.columns.tolist()[:-1]
    x = df15.loc[:, feature].values
    y = df15.loc[:, 'Target'].values
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.3, random_state=98)

    # Oversample the training split only, then fit on the balanced data.
    os_data_X, os_data_y = SMOTE().fit_resample(x_train, y_train)
    clf_rf_bal = RandomForestClassifier(n_estimators=10, random_state=99)
    clf_rf_bal = clf_rf_bal.fit(os_data_X, os_data_y)
    # Stored on the estimator so the prediction page can turn an integer
    # class code back into the file-name letter it was encoded from.
    clf_rf_bal.contaminant_classes_ = [str(c) for c in le.classes_]

    y_pred_bal = clf_rf_bal.predict(x_test)
    print('balanced classification report')
    cls_rpt = classification_report(y_test, y_pred_bal)
    st.write(f'classification report : {cls_rpt}')
    bal_ac = accuracy_score(y_test, y_pred_bal)
    st.write(f'accuracy score : {bal_ac}')

    with open(os.path.join(os.getcwd(), _MODEL_FILE), 'wb') as handle:
        pickle.dump(clf_rf_bal, handle)


def _predict_contaminant():
    """Render the prediction page: upload a CSV, show its features and the model's verdict."""
    # Kept for backward compatibility: the original published this position
    # as a module-level global.
    global start_dens

    st.subheader("Dataset")
    data_file = st.file_uploader("Upload CSV", type=['csv'])
    if not (st.button("Process") and data_file is not None):
        return

    st.write({
        "Filename": data_file.name,
        "FileType": data_file.type,
        "FileSize": data_file.size
    })
    df = pd.read_csv(data_file)
    st.dataframe(df)

    fileName = data_file.name
    strCat = _category_letter(fileName)
    st.write(f'FileName:{fileName}')
    if strCat in _CONTAMINANT_CODES:
        st.write('Contaminant Exists')
        st.write('Type of Contaminant: ' + _FILE_TYPE_TEXT[strCat])
    else:
        st.write('No Contaminant')

    roll_std = df['Pressure_tmp'].rolling(300).std().rename('roll_std')
    maxx = roll_std.max()
    threshold = roll_std.quantile(0.7)  # threshold set here : 70 percentile
    st.write(roll_std)
    st.write(threshold)
    st.write(roll_std > threshold)

    # The bounds default to 0 when no transient is found, instead of leaving
    # the variables unbound as the original loops did.
    start, end = _transient_bounds(roll_std)
    pre_trans_mean, trans_mean, post_trans_mean, transient_width = _window_means(
        roll_std, start, end)

    roll_std_den = df['Density'].rolling(300).std()
    start_dens, end_dens = _transient_bounds(roll_std_den)
    pre_trans_mean_dens, trans_mean_dens, post_trans_mean_dens, _ = _window_means(
        roll_std_den, start_dens, end_dens)

    st.write({
        'file': fileName,
        'pre_trans_mean': pre_trans_mean,
        'trans_mean': trans_mean,
        'post_trans_mean': post_trans_mean,
        'pre_trans_mean_dens': pre_trans_mean_dens,
        'trans_mean_dens': trans_mean_dens,
        'post_trans_mean_dens': post_trans_mean_dens
    })

    # NOTE(review): pickle executes code on load; only load a model file
    # written by the training page of this app.
    with open(_MODEL_FILE, 'rb') as handle:
        loaded_model = pickle.load(handle)
    result = loaded_model.predict([[
        pre_trans_mean, trans_mean, post_trans_mean, transient_width, maxx,
        pre_trans_mean_dens, post_trans_mean_dens
    ]])

    # LabelEncoder numbers classes in sorted order, so the code -> letter
    # table is read from the model when available; older pickles fall back
    # to the sorted order S < T < W.
    classes = getattr(loaded_model, 'contaminant_classes_', _CONTAMINANT_CODES)
    code = int(result[0])
    letter = classes[code] if 0 <= code < len(classes) else None
    if letter in _PREDICTION_TEXT:
        st.write('Predicted Contaminant: ' + _PREDICTION_TEXT[letter])
    else:
        st.write('No Contaminant')


def main():
    """Streamlit entry point: draw the title and sidebar menu, then render the chosen page."""
    st.title("Multi Variate Contamination in Fuel")
    menu = ["Home", "Model Training", "Contaminant Prediction"]
    choice = st.sidebar.selectbox("Menu", menu)

    if choice == "Home":
        _show_home()
    elif choice == "Model Training":
        if st.button("Start Train"):
            _train_model()
    elif choice == "Contaminant Prediction":
        _predict_contaminant()
# define the 3072-1024-512-10 architecture using Keras
model = Sequential()
model.add(Dense(1024, input_shape=(3072,), activation="relu"))
model.add(Dense(512, activation="relu"))
model.add(Dense(10, activation="softmax"))

# train the model using SGD
print("[INFO] training network...")
num_epochs = 100
sgd = SGD(0.01)
model.compile(loss="categorical_crossentropy", optimizer=sgd,
              metrics=["accuracy"])
H = model.fit(trainX, trainY, validation_data=(testX, testY),
              epochs=num_epochs, batch_size=32)

# evaluate the network
print("[INFO] evaluating network...")
predictions = model.predict(testX, batch_size=32)
print(classification_report(testY.argmax(axis=1),
                            predictions.argmax(axis=1),
                            target_names=labelNames))

# Older Keras releases log the metric as "acc"/"val_acc"; newer ones use the
# name passed to compile(), i.e. "accuracy"/"val_accuracy".
acc_key = "acc" if "acc" in H.history else "accuracy"

# plot the training loss and accuracy
epoch_axis = np.arange(num_epochs)
plt.style.use("ggplot")
plt.figure()
plt.plot(epoch_axis, H.history["loss"], label="train_loss")
plt.plot(epoch_axis, H.history["val_loss"], label="val_loss")
plt.plot(epoch_axis, H.history[acc_key], label="train_acc")
plt.plot(epoch_axis, H.history["val_" + acc_key], label="val_acc")
plt.title("Training Loss and Accuracy")
plt.xlabel("Epoch #")
plt.ylabel("Loss/Accuracy")
plt.legend()
plt.savefig(args["output"])
plt.show()
def evaluate_model(model, X_test, y_test, category_names):
    """Print a classification report for the model's predictions on the test set.

    ``category_names`` is passed to the report as ``target_names``.
    """
    predictions = model.predict(X_test)
    report = classification_report(
        y_test,
        predictions,
        target_names=category_names,
    )
    print(report)
# NOTE(review): the next three statements use `image` and `label`, which are
# not defined in this chunk; they look like the tail of a plotting loop whose
# header lies above the visible source. Confirm before re-indenting.
plt.axis('off')
plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
plt.title('Training: %i' % label)

# To apply a classifier on this data, we need to flatten the image, to
# turn the data in a (samples, feature) matrix:
n_samples = len(digits.images)
data = digits.images.reshape((n_samples, -1))

# Create a classifier: a support vector classifier
classifier = svm.SVC(gamma=0.001)

# Slice bounds must be integers: `/` yields a float on Python 3, so the split
# point is computed once with floor division.
half = n_samples // 2

# We learn the digits on the first half of the digits
classifier.fit(data[:half], digits.target[:half])

# Now predict the value of the digit on the second half:
expected = digits.target[half:]
predicted = classifier.predict(data[half:])

print("Classification report for classifier %s:\n%s\n"
      % (classifier, metrics.classification_report(expected, predicted)))
print("Confusion matrix:\n%s" % metrics.confusion_matrix(expected, predicted))

images_and_predictions = list(zip(digits.images[half:], predicted))
for index, (image, prediction) in enumerate(images_and_predictions[:4]):
    plt.subplot(2, 4, index + 5)
    plt.axis('off')
    plt.imshow(image, cmap=plt.cm.gray_r, interpolation='nearest')
    plt.title('Prediction: %i' % prediction)

plt.show()
# Image-data machine learning
from sklearn import model_selection, svm, metrics


def load_csv(fname):
    """Read a label-first CSV and return ``{"labels": [...], "images": [...]}``.

    The first column of each row is the integer label; every remaining
    column is an integer pixel value, divided by 256.
    Rows with fewer than two columns are skipped.
    """
    labels = []
    images = []
    with open(fname, "r") as f:
        for row in (line.split(",") for line in f):
            if len(row) < 2:
                continue
            labels.append(int(row[0]))
            images.append([int(pixel) / 256 for pixel in row[1:]])
    return {"labels": labels, "images": images}


data = load_csv("./mnist/train.csv")
test = load_csv("./mnist/t10k.csv")

# Fit a support vector classifier on the training images and predict the test set.
clf = svm.SVC()
clf.fit(data["images"], data["labels"])
predict = clf.predict(test["images"])

# Check the results
ac_score = metrics.accuracy_score(test["labels"], predict)
cl_report = metrics.classification_report(test["labels"], predict)
print("정답률 =", ac_score)
print("리포트 =")
print(cl_report)
from sklearn.tree import DecisionTreeClassifier
# Both names are used below but were never imported in this script.
from sklearn.metrics import classification_report, confusion_matrix
import VeriYukle
import numpy as np

print(' KARAR AĞAÇLARI '.center(50, '-'))
x_train, x_test, y_train, y_test = VeriYukle.egitimTestVeriSeti()

# Fit a decision tree with default settings and predict the test split.
dt = DecisionTreeClassifier()
dt.fit(x_train, y_train)
y_pred = dt.predict(x_test)

cm = confusion_matrix(y_true=y_test, y_pred=y_pred)
print('Doğruluk matrisi'.capitalize().title())
print(cm)

# Per-class counts derived from the confusion matrix (rows = true labels,
# columns = predictions, per the y_true/y_pred arguments above).
TP = np.diag(cm)
FP = cm.sum(axis=0) - TP
FN = cm.sum(axis=1) - TP
TN = cm.sum() - (FP + FN + TP)

print('Doğruluk değerleri'.upper().center(50, '-'))
print(f"FP:{FP.sum()} FN:{FN.sum()} TP:{TP.sum()} TN:{TN.sum()}")

# Accuracy in percent over the counts summed across all classes.
dogruluk = (TP.sum() + TN.sum()) / (TP.sum() + TN.sum() + FN.sum() + FP.sum()) * 100
print(f'Doğruluk: % {dogruluk}')
print(classification_report(y_test, y_pred))
from sklearn import linear_model
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score

try:
    from sklearn.datasets import fetch_mldata
except ImportError:
    # fetch_mldata no longer exists in current scikit-learn; load MNIST from
    # OpenML instead. OpenML returns the labels as strings, hence the cast.
    # NOTE(review): assumes the OpenML copy keeps the 60,000 training samples
    # first — confirm before comparing scores with older runs.
    from sklearn.datasets import fetch_openml
    mnist = fetch_openml('mnist_784', version=1, as_frame=False)
    data, target = mnist.data, mnist.target.astype(float)
else:
    mnist = fetch_mldata('MNIST original')
    data, target = mnist.data, mnist.target

X_train = data[:60000] / 255.0
Y_train = target[:60000]
X_test = data[60000:] / 255.0
Y_test = target[60000:]

# Collapse every label greater than 1 into class 0.
Y_train[Y_train > 1.0] = 0.0
Y_test[Y_test > 1.0] = 0.0

clf = linear_model.LogisticRegression()
clf.fit(X_train, Y_train)
Y_pred = clf.predict(X_test)

print(classification_report(Y_test, Y_pred))
accuracy = accuracy_score(Y_test, Y_pred)
precision = precision_score(Y_test, Y_pred, average="weighted")
recall = recall_score(Y_test, Y_pred, average="weighted")
print(accuracy, precision, recall)
#yp=pd.read_csv(r'test.csv')
#yp=datos.to_numpy()
#yp=np.delete(yp, 0, axis=1)
#print(len(yp[1]))
from sklearn import model_selection
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import classification_report

# Hold out 33% of the samples for evaluation and fit on the rest.
X_train, X_test, y_train, y_test = model_selection.train_test_split(
    X, y, test_size=0.33)
clasificador.fit(X_train, y_train)
print(len(X_train), len(X_test))

# Predict the held-out samples and show the raw predictions.
prediccion = clasificador.predict(X_test)
print(prediccion)
print()
print(confusion_matrix(y_test, prediccion))

# Overall accuracy, two decimals.
exactitud = accuracy_score(y_test, prediccion)
print('\nExactitud: {:.2f}\n'.format(exactitud))

# Per-class report for the four classes.
print(' \n Informe de clasificación \n ')
nombres_clases = ['Clase 0', 'Clase 1', 'Clase 2', 'Clase 3']
informe = classification_report(y_test, prediccion, target_names=nombres_clases)
print(informe)
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
import numpy as np
import sklearn

# Generate the XOR data: four clusters of 100 uniform points, one per
# quadrant. The draw order (tl, tr, bl, br) matches the original script.
corner_offsets = [(-2.0, 2.0), (2.0, 2.0), (-2.0, -2.0), (2.0, -2.0)]
tl, tr, bl, br = [
    np.random.uniform(size=(100, 2)) + np.array(offset)
    for offset in corner_offsets
]

# Diagonal corners share a label: +1 for tl/br, -1 for tr/bl.
X = np.vstack([tl, tr, br, bl])
Y = np.repeat([1, -1, 1, -1], [len(tl), len(tr), len(br), len(bl)])

# Split dataset
(trainData, testData, trainLabels, testLabels) = train_test_split(
    X, Y, test_size=0.25, random_state=42)

# Train and report one SVM per kernel configuration.
experiments = [
    ('[RESULT] SVM w/ Linear Kernel', {'kernel': 'linear'}),
    ('[RESULT] SVM w/ Polynomial Kernel', {'kernel': 'poly', 'degree': 2, 'coef0': 1}),
]
for heading, svc_params in experiments:
    print(heading)
    model = SVC(**svc_params)
    model.fit(trainData, trainLabels)
    print(classification_report(testLabels, model.predict(testData)))
rows, columns = df.shape
df.columns = ['TID', 'Text', 'Tag', 'Label']
# Label encoding: neu 0, neg 1, pos 2

# Plain Python lists of the documents and their labels.
text = list(df.Text)
label = list(df.Label)

X_train, X_test, y_train, y_test = train_test_split(
    text, label, test_size=0.33, random_state=42)

# Bag-of-words counts -> TF-IDF weighting -> random forest.
pipeline_steps = [
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier(n_estimators=100)),
]
text_clf = Pipeline(pipeline_steps)
text_clf.fit(X_train, y_train)

predicted = text_clf.predict(X_test)
print(metrics.classification_report(y_test, predicted))
# NOTE(review): newer scikit-learn releases reject class_weight='auto'
# (superseded by 'balanced', which weights classes differently) — confirm
# which release this script targets before changing it.
clf = GridSearchCV(SVC(kernel='rbf', class_weight='auto'), param_grid)
clf = clf.fit(X_train_pca, y_train)
# Single-argument print calls behave the same on Python 2 and Python 3.
print("done in %0.3fs" % (time() - t0))
print("Best estimator found by grid search:")
print(clf.best_estimator_)


###############################################################################
# Quantitative evaluation of the model quality on the test set

print("Predicting the people names on the testing set")
t0 = time()
y_pred = clf.predict(X_test_pca)
print("done in %0.3fs" % (time() - t0))

print(classification_report(y_test, y_pred, target_names=target_names))
print(confusion_matrix(y_test, y_pred, labels=range(n_classes)))


###############################################################################
# Qualitative evaluation of the predictions using matplotlib

def plot_gallery(images, titles, h, w, n_row=3, n_col=4):
    """Helper function to plot a gallery of portraits.

    Draws the first ``n_row * n_col`` entries of ``images``, each reshaped
    to ``(h, w)``, in a grid titled with the matching entry of ``titles``.
    """
    pl.figure(figsize=(1.8 * n_col, 2.4 * n_row))
    pl.subplots_adjust(bottom=0, left=.01, right=.99, top=.90, hspace=.35)
    for i in range(n_row * n_col):
        pl.subplot(n_row, n_col, i + 1)
        pl.imshow(images[i].reshape((h, w)), cmap=pl.cm.gray)
        pl.title(titles[i], size=12)
        pl.xticks(())
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, roc_curve
from sklearn.metrics import classification_report


# In[106]:


# Multinomial Naive Bayes
from sklearn.naive_bayes import MultinomialNB

mnb = MultinomialNB()
mnb.fit(x_train, y_train)
predmnb = mnb.predict(x_test)

# Accuracy is shown as a percentage rounded to two decimals.
mnb_accuracy_pct = round(accuracy_score(y_test, predmnb) * 100, 2)
print("Confusion Matrix for Multinomial Naive Bayes:")
print(confusion_matrix(y_test, predmnb))
print("Score:", mnb_accuracy_pct)
print("Classification Report:", classification_report(y_test, predmnb))


# **The performance score of the Naive Bayes classifier is 86.06. Since this is a high score, I will treat this model as my baseline.**

# # 5.4.2 Random Forest Classifier
# There is no correlation between our feature (text) and target (review_stars), and this is the reason for choosing the Random Forest Classifier.
# For a Random Forest Classifier to make accurate class predictions, the trees of the forest and, more importantly, their predictions need to be uncorrelated (or at least have low correlations with each other).
#
# Random forests are an ensemble learning method for classification. They operate by constructing a multitude of decision trees at training time and outputting the class that is the mode of the classes (classification) or the mean prediction (regression) of the individual trees.

# In[107]:


# Random Forest
def display_classification_report(true_labels, predicted_labels, classes=[1,0]):
    """Print the classification report for the given labels.

    ``classes`` is forwarded as the report's ``labels`` argument.
    """
    print(
        metrics.classification_report(
            y_true=true_labels,
            y_pred=predicted_labels,
            labels=classes,
        )
    )
# train the network
print("[INFO] training network...")
num_epochs = 100
H = model.fit(trainX, trainY, validation_data=(testX, testY),
              batch_size=32, epochs=num_epochs, verbose=1)

# evaluate the network
print("[INFO] evaluating network...")
predictions = model.predict(testX, batch_size=32)
print(
    classification_report(testY.argmax(axis=1),
                          predictions.argmax(axis=1),
                          target_names=["cat", "dog", "panda"]))

# Older Keras releases log the metric as "acc"/"val_acc"; newer ones use
# "accuracy"/"val_accuracy". Use whichever key this run produced.
acc_key = "acc" if "acc" in H.history else "accuracy"

# plot the training loss and accuracy
epoch_axis = np.arange(0, num_epochs)
plt.style.use("ggplot")
plt.figure()
plt.plot(epoch_axis, H.history["loss"], label="train_loss")
plt.plot(epoch_axis, H.history["val_loss"], label="val_loss")
plt.plot(epoch_axis, H.history[acc_key], label="train_acc")
plt.plot(epoch_axis, H.history["val_" + acc_key], label="val_acc")
plt.title("Training Loss and Accuracy")
plt.xlabel("Epoch #")
plt.ylabel("Loss/Accuracy")
plt.legend()
#plt.show()
plt.savefig("output/shallownet_animals.png")
print("svm")
svm_model.fit(svm_trainX_scaled, svm_trainy)
print(svm_model.score(svm_testX_scaled, svm_testy))
pred = svm_model.predict(svm_testX_scaled)

# Collect every metric from the project's `indicator` helper, in the same
# call order as before.
score = indicator(pred, svm_testy)
Accuracy, Precision, Recall, F_meature = score.getMetrics()
Specific = score.getSpecific()
TPR, FPR = score.getfprtpr()
AUC, x, y = score.getAuc()
MCC = score.getMCC()

print(classification_report(svm_testy, pred))
for caption, value in (
        ("Accuracy:", Accuracy),
        ("Precison:", Precision),
        ("Recall:", Recall),
        ("F-meature:", F_meature),
        ("Specific:", Specific),
        ("MCC:", MCC),
        ("AUC:", AUC),
        ("TPR:", TPR),
        ("FPR:", FPR),
):
    print(caption, value)

# Add this run's values to the running totals kept by the surrounding code.
sumx += x
sumy += y
sumAccuracy += Accuracy
sumPrecision += Precision
sumRecall += Recall
sumF_meature += F_meature
sumAUC += AUC
sumSpecific += Specific
sumMCC += MCC
dirpath = os.getcwd()  # "C:\\Users\\iAngelMx\\Documents\\GitHub\\nlp\\deteccionDeSentimientos"
[sampleTexts, y] = prepareRawText2Classify(
    dirpath, tipoRawText="review", reviewCategory=categoria)
print("(1-> Positive 0-> Negative)")

# y <- labels of the texts
y = np.asarray(y)

# X <- feature matrix: raw token counts of every text
count_vect = CountVectorizer()
X = count_vect.fit_transform(sampleTexts)
X_counts = X

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42)

import mord as m

# mord's LogisticIT model, fitted on the training half.
clf = m.LogisticIT()
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

from sklearn import metrics

test_score = clf.score(X_test, y_test)
print("Precisión de prediccion: ", test_score)
print("Matriz de confusión: \n", metrics.confusion_matrix(y_test, y_pred))
print("Classification report: \n", metrics.classification_report(y_test, y_pred))
from sklearn.linear_model import RidgeClassifier
from sklearn.metrics import classification_report

x_train, y_train, x_valid, y_valid, x_test, y_test = prepare_data(one_hot=False)

# Candidate models, each fitted and scored in turn below.
classifiers = [
    GaussianNB(),
    # RidgeClassifier(tol=1e-2, solver="lsqr"),
    QuadraticDiscriminantAnalysis(),
    LinearDiscriminantAnalysis(),
    DecisionTreeClassifier(max_depth=5),
    KNeighborsClassifier(3, n_jobs=-1),
    RandomForestClassifier(max_depth=5, n_estimators=10, max_features=1, n_jobs=-1),
    AdaBoostClassifier(),
    GradientBoostingClassifier(),
    SVC(kernel="linear", C=0.025, probability=True),
    SVC(gamma=2, C=1, probability=True),
    SVC(kernel="rbf", C=0.025, probability=True),
    MLPClassifier(alpha=1),
    GaussianProcessClassifier(1.0 * RBF(1.0), n_jobs=-1),
]

separator = '_' * 80
for clf in classifiers:
    print(separator)
    print(clf.__class__.__name__)
    clf.fit(x_train, y_train)

    # Accuracy on the three splits, scored in train / validation / test order.
    train_acc = clf.score(x_train, y_train)
    valid_acc = clf.score(x_valid, y_valid)
    test_acc = clf.score(x_test, y_test)
    print('Train/val/test accuracy: ', train_acc, valid_acc, test_acc)

    print('Classification report of Test data')
    test_predictions = clf.predict(x_test)
    print(classification_report(y_test, test_predictions))
from sklearn.preprocessing import StandardScaler

# Standardise every column except the target.
feature_frame = dataset.drop('Purchased', axis=1)
scaler = StandardScaler()
scaled_features = scaler.fit(feature_frame).transform(feature_frame)
df_feat = pd.DataFrame(scaled_features, columns=['Age', 'EstimatedSalary', 'Male'])
df_feat.head()

# Train/test split
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    df_feat, dataset['Purchased'], test_size=0.30)

# Import the model
from sklearn.naive_bayes import GaussianNB
classifier = GaussianNB()

# Training
classifier.fit(X_train, y_train)

# Predicting
y_pred = classifier.predict(X_test)

# Evaluation
from sklearn.metrics import classification_report, confusion_matrix
cm = confusion_matrix(y_test, y_pred)
cr = classification_report(y_test, y_pred)
print("Confusion Matrix: \n", cm)
print("Classification Report: \n", cr)
print("Accuracy is micro avg in classification report")
Test loss: 0.04145523567434866 Test accuracy: 0.9863 We define most misclassified as the 3 digits that have the least precision. Precision: Out of all digits that were classified as 'C' how many were actually correct? For now these are digit 7, 0 and 4 in order of worst to best. """ predictions = model.predict(x_test) predictions = predictions.argmax(axis=1) y_test2 = y_test.argmax(axis=1) print (classification_report(y_test2,predictions,digits=5)) """ For using mean squared error we get: Test loss: 0.006500150103870692 Test accuracy: 0.9596 So it's worse than categorical cross-entropy cost. Now the worst digits are 8, 3 and 0 in order of worst to best Running it again gives: Test loss: 0.007368972765450598 Test accuracy: 0.9501
dimensions = class_names cmn = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis] fig, ax = plt.subplots(figsize=(10, 10)) colors = ["orange", "green"] sn.heatmap(cmn, annot=True, fmt='.2f', xticklabels=dimensions, cmap=colors, yticklabels=dimensions) plt.title("Confusion Matrix") plt.ylabel('Actual') plt.xlabel('Predicted') plt.show(block=False) print(classification_report(y_true, y_pred)) print("\nAccuracy Metrics") TP, FP, TN, FN = perf_measure(y_true, y_pred) precision = TP / (TP + FP) recall = TP / (TP + FN) print('Accuracy score: ' + str(accuracy_score(y_true, y_pred))) print("Precision Score: " + str(precision)) print("Recall: " + str(recall)) print("Logging...") if record:
plt.tight_layout()
plt

from sklearn.metrics import classification_report, confusion_matrix

# Run the predictions and build the confusion matrix
Y_pred = model.predict(X_test)
print("Y_pred:", Y_pred)
y_pred = np.argmax(Y_pred, axis=1)
print("Y_pred:", y_pred)

# (or)
#   y_pred = model.predict_classes(X_test)
#   print("Y_pred:",y_pred)

p = model.predict_proba(X_test)  # to predict probability

# The true class of each sample is the arg-max of its row in Y_test.
y_true = np.argmax(Y_test, axis=1)
target_names = ['class 0(Flowers)', 'class 1(Dogs)']
print(classification_report(y_true, y_pred, target_names=target_names))
print(confusion_matrix(np.argmax(Y_test, axis=1), y_pred))

# Save the model
# NOTE(review): Keras models expose `save` and `save_weights`; confirm that
# this `model` object really provides a `save_model` method.
fname = "weights-Test-CNN.hdf5"
model.save_model(fname, overwrite=True)
def _append_row(df, row):
    """Return ``df`` with the dict ``row`` appended as one new row."""
    # pd.concat is used because DataFrame.append was removed in pandas 2.0.
    return pd.concat([df, pd.DataFrame.from_dict({0: row}, orient='index')])


def calc_acc_proba(df_acc, df_proba, subid, mask_name, X, ev_labels,
                   run_labels, cv, multi_class='ovr', univariate_fsel_k=None,
                   pca_n=None, upsampling=False, undersampling=False,
                   conf_mat=False, cm_group=None, print_report=False,
                   compute_AUC=False, df_auc=None, cv_C=False,
                   repeated_ttest_fsel=None):
    """Classify 2D data & return accuracy + probabilities for each class

    The selected classifier and a DummyClassifier ('chance') are both run
    through the same cross-validation folds; per-category accuracy and mean
    predicted probabilities are appended to the supplied dataframes.

    Parameters
    ----------
    df_acc : pandas dataframe
        Must have columns=['subid', 'mask_name', 'category', 'classifier',
        'accuracy', 'count']
    df_proba : pandas dataframe
        Must have columns=['subid', 'mask_name', 'true_category',
        'guess_category', 'classifier', 'probability']
    subid : str
        Subject ID (e.g., 'ap01')
    mask_name : str
        Name of mask used when calling get_subj_data
    X : 2D numpy array
        Selected BOLD data (sample x voxel) for classification
    ev_labels : array of strings
        condition labels (length = # of samples); indexed with the fold
        index arrays, so it must support numpy fancy indexing
    run_labels : list/array of ints
        run label by which to perform cross-validation (length = # of
        samples); not referenced inside this function
    cv : iterable of (train, test) index pairs
        cross-validation generator (e.g., LeaveOneLabelOut(run_labels))
    multi_class : str
        Classifier selector: 'ovr', 'multinomial', 'balanced', 'KNeighbors',
        'BaggingClassifier', 'GradientBoosting', 'AdaBoost' or 'MLP'
    univariate_fsel_k : int
        Option to perform univariate (ANOVA) feature selection based on the
        training data; take the k best features
    pca_n : int
        Option to perform PCA on the training set to reduce the number of
        features
    upsampling : bool
        Option to over-sample using SMOTE to deal with class imbalance
    undersampling : bool
        Option to under-sample using random under-sampling (randomly pick
        samples without replacement) to deal with class imbalance
    conf_mat : bool
        Accumulate a confusion matrix over folds and append the
        row-normalized result to ``cm_group``
    cm_group : 3D numpy array or None
        Stack of normalized confusion matrices to append to; an empty stack
        is created when None and ``conf_mat`` is set
    print_report : bool
        Print a classification report for every fold
    compute_AUC : bool
        Compute a per-category ROC AUC from the decision function
    df_auc : pandas dataframe or None
        Must have columns=['subid', 'mask_name', 'category', 'classifier',
        'auc']; created when None and ``compute_AUC`` is set
    cv_C : bool
        Option to select C via CV; only works for multinomial LR
        (multi_class = 'multinomial')
    repeated_ttest_fsel : int
        Option to select k features for each combination of t-tests (None
        otherwise)

    Returns
    -------
    (df_acc, df_proba), followed by df_auc when ``compute_AUC`` is set and
    by cm_group when ``conf_mat`` is set. Returns None (after printing a
    message) when the options are inconsistent or no classifier could be
    built.
    """
    # quick double check
    if repeated_ttest_fsel and univariate_fsel_k:
        print('Cannot have both repeated_ttest_fsel and univariate_fsel_k; one needs to be set to None')
        return

    # Determine classifier
    if multi_class == 'ovr':
        lr_classifier = LogisticRegression(penalty='l2', C=1.)
    elif multi_class == 'multinomial':
        lr_classifier = LogisticRegression(penalty='l2', C=1.,
                                           multi_class='multinomial',
                                           solver='newton-cg')
    elif multi_class == 'balanced':  # useful if classes are unbalanced
        lr_classifier = LogisticRegression(penalty='l2', C=1.,
                                           class_weight='balanced')
    elif multi_class == 'KNeighbors':
        lr_classifier = KNeighborsClassifier(weights='distance')
    elif multi_class == 'BaggingClassifier':
        lr_classifier = BaggingClassifier(
            LogisticRegression(penalty='l2', C=1.),
            max_samples=0.5, max_features=0.5)
    elif multi_class == 'GradientBoosting':
        lr_classifier = GradientBoostingClassifier(n_estimators=100)
    elif multi_class == 'GradientBoosted_LR':
        # Not implemented yet; bail out like the unknown-classifier branch
        # instead of continuing with an undefined lr_classifier.
        # http://scikit-learn.org/stable/auto_examples/ensemble/plot_feature_transformation.html
        print('Add in the code for this classifier!')
        return
    elif multi_class == 'AdaBoost':
        lr_classifier = AdaBoostClassifier(n_estimators=100)
    elif multi_class == 'MLP':  # multilayer perceptron
        lr_classifier = MLPClassifier(solver='lbfgs', random_state=1,
                                      hidden_layer_sizes=(100, 100, 50))
    else:
        print('Need a classifier!')
        return

    # Materialize the folds: they are walked once per classifier type (and
    # once more by LogisticRegressionCV), which would exhaust a generator.
    folds = list(cv)

    # Calculate C in a CV manner, if requested. This happens before the
    # classifier loop so the tuned model is the one that is actually fitted.
    if cv_C and multi_class == 'multinomial':
        calc_c = LogisticRegressionCV(penalty='l2', cv=folds,
                                      multi_class='multinomial',
                                      solver='newton-cg')
        calc_c.fit(X, ev_labels)
        print('Setting C to: ' + str(calc_c.C_))
        # C_ is an array but LogisticRegression needs a scalar.
        # NOTE(review): assumes every entry of C_ is the same value for the
        # multinomial case -- confirm against the sklearn version in use.
        lr_classifier = LogisticRegression(penalty='l2',
                                           C=float(calc_c.C_[0]),
                                           multi_class='multinomial',
                                           solver='newton-cg')

    dummy_classifier = DummyClassifier()

    # initialize confusion matrix
    if conf_mat:
        cond_labels = np.unique(ev_labels)
        num_cond = len(cond_labels)
        cm_sub = np.zeros([num_cond, num_cond], dtype=int)
        if cm_group is None:
            cm_group = np.empty((0, num_cond, num_cond))

    if compute_AUC and df_auc is None:
        df_auc = pd.DataFrame(
            columns=['subid', 'mask_name', 'category', 'classifier', 'auc'])

    for class_type, classifier in zip(['logreg', 'chance'],
                                      [lr_classifier, dummy_classifier]):

        # Go through cross-validation loops
        for train, test in folds:

            # univariate feature selection? t-test version comes later...
            if univariate_fsel_k:
                fsel = SelectKBest(f_classif, k=univariate_fsel_k).fit(
                    X[train], ev_labels[train])
                X_train = fsel.transform(X[train])
            else:
                X_train = X[train]

            # Feature decomposition?
            if pca_n:
                pca = PCA(n_components=pca_n).fit(X_train)
                X_train = pca.transform(X_train)

            # over/under sampling to balance classes during training?
            if upsampling:
                # Synthetic Minority Over-sampling Technique
                sm = SMOTE(random_state=42)
                X_train, ev_labels_train = sm.fit_sample(
                    X_train, ev_labels[train])
            elif undersampling:
                rus = RandomUnderSampler(random_state=42, replacement=False)
                X_train, ev_labels_train = rus.fit_sample(
                    X_train, ev_labels[train])
            else:
                ev_labels_train = ev_labels[train]

            # If running feature selection using lowest pvals from
            # combinations of classes
            if repeated_ttest_fsel:
                pvals = []  # feature indices collected across all combos
                for i, combo in enumerate(
                        itertools.combinations(list(set(ev_labels_train)), 2)):
                    print('%s %s' % (i, combo))

                    # figure out which samples are of interest
                    mask = np.in1d(ev_labels_train, combo)

                    # keep the indices of the k smallest p-values
                    fval, pval = f_classif(X_train[mask],
                                           ev_labels_train[mask])
                    pvals.extend(list(pval.argsort()[:repeated_ttest_fsel]))

                # Now just grab relevant features from training data
                selected_voxels = list(set(pvals))
                print('Total of ' + str(len(selected_voxels)) + ' voxels.')
                X_train = X_train[:, selected_voxels]
                print(X_train.shape)

            # Fit classifier w/training data & labels
            classifier.fit(X_train, ev_labels_train)

            # Now prepare for testing! The transforms are applied in the
            # same order as for training (univariate -> PCA -> t-test
            # selection) so the feature columns line up.
            if univariate_fsel_k:
                X_test = fsel.transform(X[test])
            else:
                X_test = X[test]

            if pca_n:
                X_test = pca.transform(X_test)

            if repeated_ttest_fsel:
                X_test = X_test[:, selected_voxels]
                print(X_test.shape)

            # update confusion matrix if necessary
            if conf_mat and class_type != 'chance':
                y_pred = classifier.predict(X_test)
                # Fix the label set so every fold gives a num_cond x num_cond
                # matrix even when a class is missing from the fold.
                cm_fold = confusion_matrix(ev_labels[test], y_pred,
                                           labels=cond_labels)
                cm_sub = np.sum([cm_sub, cm_fold], axis=0)

            if print_report:
                y_pred = classifier.predict(X_test)
                print(classification_report(ev_labels[test], y_pred,
                                            labels=classifier.classes_,
                                            target_names=classifier.classes_))

            # get logits for all trials
            if compute_AUC and class_type != 'chance':
                y_score = classifier.decision_function(X_test)

            # Iterate through each class to get acc, proba, etc.
            for i, category in enumerate(classifier.classes_):

                # Get indices for the true category
                cat_ind = ev_labels[test] == category
                n_trials = sum(cat_ind)

                # if this trial exists
                if n_trials > 0:
                    # Determine accuracy (TPR)
                    acc = classifier.score(X_test[cat_ind],
                                           ev_labels[test][cat_ind])

                    if compute_AUC and class_type != 'chance':
                        if len(classifier.classes_) > 2:
                            auc = roc_auc_score(cat_ind, y_score[:, i])
                        else:
                            # NOTE(review): the binary decision function is a
                            # single column and is used unchanged for both
                            # categories -- confirm this is intended.
                            auc = roc_auc_score(cat_ind, y_score)

                        row = {
                            'subid': subid,
                            'mask_name': mask_name,
                            'category': category,
                            'classifier': class_type,
                            'auc': auc
                        }
                        df_auc = _append_row(df_auc, row)

                    # Determine probabilities & save out probabilities for
                    # each category guessed
                    probabilities = classifier.predict_proba(
                        X_test[cat_ind]).T  # class x sample
                    # mean probability for each class for these samples
                    prob_byclass = np.mean(probabilities, axis=1)

                    for guess_cat in classifier.classes_:
                        # select the relevant column
                        proba = prob_byclass[
                            classifier.classes_ == guess_cat][0]

                        row = {
                            'subid': subid,
                            'mask_name': mask_name,
                            'true_category': category,
                            'guess_category': guess_cat,
                            'classifier': class_type,
                            'probability': proba
                        }
                        df_proba = _append_row(df_proba, row)
                else:
                    print('Nothing to score!')
                    acc = np.nan

                row = {
                    'subid': subid,
                    'mask_name': mask_name,
                    'category': category,
                    'classifier': class_type,
                    'accuracy': acc,
                    'count': n_trials
                }
                df_acc = _append_row(df_acc, row)

        # save confusion matrix, once iterated through CV folds
        if conf_mat and class_type != 'chance':
            print(classifier.classes_)
            print('Confusion matrix (raw counts):')
            print(cm_sub)

            # normalize, and add to group matrix
            cm_sub = cm_sub.astype('float') / cm_sub.sum(axis=1)[:, np.newaxis]
            print('Confusion matrix (normalized):')
            print(cm_sub)

            cm_group = np.append(cm_group, [cm_sub], axis=0)

    # Return calculations
    if compute_AUC:
        if conf_mat:
            return df_acc, df_proba, df_auc, cm_group
        else:
            return df_acc, df_proba, df_auc
    else:
        if conf_mat:
            return df_acc, df_proba, cm_group
        else:
            return df_acc, df_proba
# Predict a label for every row of the TF-IDF matrix.
all_predictions = spam_detect_model.predict(messages_tfidf)
print(all_predictions)

# ## classification report

# In[61]:

from sklearn.metrics import classification_report

# In[82]:

# Compare the predictions against the 'label' column of `messages`.
# NOTE(review): if `messages_tfidf` is the data the model was fitted on, this
# is a training-set score -- confirm against the earlier cells.
print(classification_report(messages['label'],all_predictions))

# ## Train Test Split

# In[63]:

from sklearn.model_selection import train_test_split

# In[72]:

# Same import as the previous cell (re-executed notebook cell).
from sklearn.model_selection import train_test_split
# Estimator
from sklearn.ensemble import RandomForestClassifier
# Evaluation metrics
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Load the wine data and hold out a test split (fixed seed for repeatability).
raw_wine = datasets.load_wine()
X, y = raw_wine.data, raw_wine.target
X_tn, X_te, y_tn, y_te = train_test_split(X, y, random_state=42)

# Standardise with statistics learned from the training split only.
std_scale = StandardScaler()
std_scale.fit(X_tn)
X_tn_std = std_scale.transform(X_tn)
X_te_std = std_scale.transform(X_te)

# Shallow random forest, fitted on the scaled training data.
clf_rf = RandomForestClassifier(max_depth=2, random_state=42)
clf_rf.fit(X_tn_std, y_tn)
rf_pred = clf_rf.predict(X_te_std)

# Accuracy, confusion matrix and per-class report on the test split.
acc = accuracy_score(y_te, rf_pred)
print('acc : ', acc)

conf_matrix = confusion_matrix(y_te, rf_pred)
print('confusion matrix : ', conf_matrix)

class_report = classification_report(y_te, rf_pred)
print(class_report)
# Begin pipeline setup: univariate feature selection feeding a Bernoulli
# naive Bayes classifier, tuned over the number of selected features.
select = SelectKBest()
bNb = BernoulliNB()
steps = [("feature_selection", select), ("bernouli_nb", bNb)]
pipeNb = Pipeline(steps)
paraGridBnb = dict(feature_selection__k=[20, 25, 30])
gsBnb = GridSearchCV(pipeNb, param_grid=paraGridBnb, scoring="f1_micro", n_jobs=-1)
gsBnb.fit(X_trainBnb, y_trainBnb)
BnbPreds = gsBnb.predict(X_testBnb)

# scikit-learn metrics take (y_true, y_pred). With the arguments swapped,
# precision and recall trade places and the confusion matrix is transposed.
BnbReport = classification_report(y_testBnb, BnbPreds)
BnbScore = accuracy_score(y_testBnb, BnbPreds)
BnbMatrix = confusion_matrix(y_testBnb, BnbPreds)
bestModelBnb = gsBnb.best_estimator_

from sklearn.externals import joblib
# Persist the model selected above; the file name refers to the Bnb model, so
# that is the object that must be dumped.
joblib.dump(bestModelBnb, 'ZIP_BnbBestModel.pkl', compress=9)
#==============================================================================
# If I did a feature selection, I believe that the feature removal due to Bnb
# would have higher feature importances. Will need to return.
# NOTE(review): the table below was recorded before the metric arguments were
# put in (y_true, y_pred) order -- regenerate it before relying on it.
#             precision    recall  f1-score   support
#
#          1       0.35      0.33      0.34      9042
#          2       0.00      0.11      0.00        18
#          3       0.01      0.31      0.02       282
# Hyper-parameter grid for the k-nearest-neighbours search.
params = {
    'n_neighbors': list(range(1, 30)),
    'weights': ['uniform', 'distance'],
    'p': [1, 2],
}
# cv=6: the data is split into 6 folds
gcv = GridSearchCV(knn, params, scoring='accuracy', cv=6)
gcv.fit(X_train, y_train)

# Inspect the best parameter combination found by GridSearchCV
# print(gcv.best_params_)  # best parameters
# print(gcv.best_estimator_)  # best estimator
# print(gcv.best_score_)  # best score

# Predicting directly with gcv gives the same result; compute the accuracy
# y_ = gcv.predict(X_test)
# print((y_ == y_test).mean())
# print(gcv.score(X_test, y_test))
# print(accuracy_score(y_test, y_))

# Take the best model and predict with it
knn_best = gcv.best_estimator_
y_ = knn_best.predict(X_test)
# print(accuracy_score(y_test, y_))  # best score
# print(pd.crosstab(index=y_test, columns=y_, rownames=['True'], colnames=['Predict'], margins=True))
# print(y_test.value_counts())  # actual labels
# print(Series(y_).value_counts())  # predicted labels
# print(confusion_matrix(y_test, y_))
# print(np.round(6/9, 2))

# Columns of the report: precision, recall, f1-score (harmonic mean)
print(classification_report(y_test, y_, target_names=['B', 'M']))
# Predict labels for the held-out test set
predicted_values = lr.predict(X_test)

# Show every true/predicted pair and flag the ones that disagree
for real, predicted in zip(y_test, predicted_values):
    print(
        f'Value: {real}, pred: {predicted} {"is different" if real != predicted else ""}'
    )

# Mean accuracy on the test set, between 0 and 1
print(f'Accuracy score is {lr.score(X_test, y_test):.2f}/1 \n')

from sklearn.metrics import classification_report, confusion_matrix, f1_score

# Per-class precision / recall / f1
print('Classification Report')
print(classification_report(y_test, predicted_values))

# Confusion matrix (correct predictions sit on the diagonal)
print('Confusion Matrix')
print(confusion_matrix(y_test, predicted_values))

# Unweighted mean of the per-class f1 scores
print('Overall f1-score')
print(f1_score(y_test, predicted_values, average="macro"))

# #Printing the colormap
# from matplotlib.colors import ListedColormap
# from sklearn import neighbors, datasets
# # Create color maps for 3-class classification problem, as with wine
# cmap_light = ListedColormap(['#FFAAAA', '#AAFFAA', '#AAAAFF'])
# cmap_bold = ListedColormap(['#FF0000', '#00FF00', '#0000FF'])
#
# wine = load_wine()
def ModelEvaluator(model_name, data):
    """Draw a summary figure for a binary classifier's predictions.

    The figure holds a text panel with headline metrics, an annotated
    confusion matrix and a ROC curve. Nothing is returned; the figure is
    left on the current matplotlib state.

    Parameters
    ----------
    model_name : str
        Title shown above the metrics panel.
    data : object with ``label``, ``predicted`` and ``probability`` attributes
        True classes, predicted classes and scores for the positive class.
        ``label`` must support ``.mean()``.
        NOTE(review): presumably a pandas DataFrame with 0/1 labels --
        confirm against the caller.

    Raises
    ------
    KeyError
        If the classification report has no entry for the positive class
        under either the '1.0' or the '1' key.
    """
    from sklearn import metrics
    from sklearn.metrics import confusion_matrix
    import matplotlib.pyplot as plt
    import matplotlib.gridspec as gridspec
    from matplotlib.ticker import FixedLocator, FixedFormatter

    # partition processed data into vectors
    actualClass = data.label
    predictedClass = data.predicted
    probability = data.probability

    # build a confusion matrix (rows: true class, columns: predicted class)
    cm = confusion_matrix(actualClass, predictedClass, labels=[0, 1])
    TrueNegative, FalsePositive = cm[0, 0], cm[0, 1]
    FalseNegative, TruePositive = cm[1, 0], cm[1, 1]
    numberOfPositives = TruePositive + FalseNegative
    numberOfNegatives = TrueNegative + FalsePositive

    # calculate the null accuracy
    # NOTE(review): this is the share of class 0, i.e. it assumes class 0 is
    # the majority class -- confirm.
    null_accuracy = 1 - actualClass.mean()
    # define the model accuracy
    model_accuracy = metrics.accuracy_score(actualClass, predictedClass)
    # generate a metrics report
    report = metrics.classification_report(actualClass, predictedClass,
                                           output_dict=True)
    # Report keys are str(label): '1.0' for float labels, '1' for integer
    # labels. Accept both instead of hard-coding the float spelling.
    positive_report = report['1.0'] if '1.0' in report else report['1']
    # calculate the model performance over the null accuracy
    performance_over_null = model_accuracy - null_accuracy
    # calculate the specificity of the model
    specificity = TrueNegative / (TrueNegative + FalsePositive)
    # true positive rate and false positive rate to plot a ROC curve
    fpr, tpr, _ = metrics.roc_curve(actualClass, probability)
    # calculate the area under the ROC curve
    rocAuc = metrics.roc_auc_score(actualClass, probability)
    # calculate the Matthews correlation coefficient
    mcc = metrics.matthews_corrcoef(actualClass, predictedClass)

    # generate figure: a thin top row for text, a tall row for the two plots
    fig = plt.figure(figsize=(10, 5))
    spec = gridspec.GridSpec(ncols=2, nrows=2, wspace=0.5, hspace=0.8,
                             width_ratios=[1, 1], height_ratios=[1, 20],
                             figure=fig)

    text = fig.add_subplot(spec[0, 0])
    text.axis('off')
    text.set_title('%s' % (model_name), fontweight='bold', fontsize=16)
    text.text(0, 0,
              'The performance of this model over the null accuracy is %2.2f%%\nModel Sensitivity: %2.6f%% \nModel Specificity: %2.6f%% \nModel F1 Score: %2.6f \nMatthews Correlation Coeffiecient: %2.6f'
              % ((performance_over_null * 100),
                 (positive_report['recall'] * 100),
                 (specificity * 100),
                 (positive_report['f1-score']),
                 mcc),
              bbox=dict(facecolor='white'), verticalalignment="top")

    # plot confusion matrix in the lower-left cell
    confusionMatrixLabels = ['Normal Traffic', 'Intrusion']
    confusionMatrixColourMap = plt.cm.Blues
    confusionMatrix = fig.add_subplot(spec[1, 0])
    confusionMatrix.set_aspect('equal')
    confusionMatrix.imshow(cm, interpolation='nearest',
                           cmap=confusionMatrixColourMap)
    confusionMatrix.set(ylabel='True class', xlabel='Predicted class')
    confusionMatrix.set_xticks(np.arange(0, 2))
    formatter = FixedFormatter(confusionMatrixLabels)
    locator = FixedLocator([0, 1])
    confusionMatrix.yaxis.set_major_formatter(formatter)
    confusionMatrix.yaxis.set_major_locator(locator)
    confusionMatrix.xaxis.set_major_formatter(formatter)
    confusionMatrix.xaxis.set_major_locator(locator)

    # Cells holding more than half of the positive-label count get white text.
    tot = sum(data.label)

    # cell counts
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            confusionMatrix.text(
                j, i, (format(cm[i, j])), ha='center', va="baseline",
                color="white" if cm[i, j] > (0.5 * tot) else 'black',
                size='larger')

    # TN / FP / FN / TP tags, offset towards a corner of each cell
    cmLabels = ['TN', 'FP', 'FN', 'TP']
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            confusionMatrix.text(
                j + 0.3, i + 0.4, cmLabels[i * cm.shape[1] + j],
                ha='center', va="baseline",
                color="white" if cm[i, j] > (0.5 * tot) else 'black',
                size='larger')

    # row totals to the right of the last column: negatives for row 0,
    # positives for row 1
    totals_x = (cm.shape[1] - 1) + 0.8
    for i, row_total in enumerate((numberOfNegatives, numberOfPositives)):
        confusionMatrix.text(totals_x, i, ('Total:\n %d' % (row_total)),
                             ha='center', va="center", color='black',
                             size='larger')

    # plot ROC curve in the lower-right cell
    rocCurve = fig.add_subplot(spec[1, 1])
    rocCurve.set_aspect('equal')
    rocCurve.plot(fpr, tpr, color='red', lw=2,
                  label='ROC area = %0.5f' % rocAuc)
    rocCurve.set(xlabel='False Positive Rate (1-Specifcity)',
                 ylabel='True Positive Rate (Sensitivity)')
    rocCurve.legend(loc="lower right")
# NOTE(security): pickle.load can execute arbitrary code from the stream;
# only load files that come from a trusted source.
data = pickle.load(pick_in)
pick_in.close()

# Shuffle in place so the train/test split is not tied to the stored order.
random.shuffle(data)
features = []
labels = []

# Split the elements in data into features and labels
for feature, label in data:
    features.append(feature)
    labels.append(label)

# Split the data into train (70%) and test data (30%)
xtrain, xtest, ytrain, ytest = train_test_split(features, labels, test_size=0.3)

decision_trees_model = tree.DecisionTreeClassifier()
decision_trees_model.fit(xtrain, ytrain)

prediction = decision_trees_model.predict(xtest)
score = decision_trees_model.score(xtest, ytest)

print(classification_report(ytest, prediction))
print("depth: ", decision_trees_model.get_depth())
print("prediction", prediction)
print("Testing accuracy ", score)
# ytest is a plain list; convert it explicitly so the comparison is elementwise.
print("Numpy accuracy ", np.mean(np.asarray(ytest) == prediction))

# Save the fitted model to the file 'decision_trees_model.sav'; the context
# manager closes the file even if pickling fails.
with open('decision_trees_model.sav', 'wb') as pick:
    pickle.dump(decision_trees_model, pick)
# Python 2 script (uses print statements): compare a logistic regression and
# an SGD classifier on the same standardised split.
# Class distribution of the test labels.
print(Y_test.value_counts())
print
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
# Fit the scaler on the training data only, then apply it to both splits.
ss=StandardScaler()
X_train=ss.fit_transform(X_train)
X_test=ss.transform(X_test)
lr=LogisticRegression()
sgdc=SGDClassifier()
# Train each model and predict on the scaled test set.
lr.fit(X_train,Y_train)
lr_y_predict=lr.predict(X_test)
sgdc.fit(X_train,Y_train)
sgdc_y_predit=sgdc.predict(X_test)
from sklearn.metrics import classification_report
# Mean accuracy plus a per-class report for each model.
print 'Accuracy of LR Classifier: ',lr.score(X_test,Y_test)
print classification_report(Y_test,lr_y_predict,target_names=['Benign','Malignant'])
print 'Accuracy of SGD Classifier: ',sgdc.score(X_test,Y_test)
print classification_report(Y_test,sgdc_y_predit,target_names=['Benign','Malignant'])