def fit_decision_tree(train_X, train_y, test_X, test_y):
    """Fit a decision tree, print its test metrics and export it as a DOT file.

    A ``tree.DecisionTreeClassifier`` built with no arguments is fitted on the
    training split. The classification report and then the accuracy for the
    test split are printed, and the fitted tree is written to ``tre.dot`` in
    the current working directory.

    Parameters
    ----------
    train_X, test_X
        Feature matrices of the training and test splits.
    train_y
        Training labels; passed to ``fit`` through its ``.flat`` attribute.
    test_y
        True labels of the test split.

    Returns
    -------
    None
    """
    dtc = tree.DecisionTreeClassifier()
    dtc = dtc.fit(train_X, train_y.flat)
    pred_y = dtc.predict(test_X)

    # Output is the report table followed by the bare accuracy value.
    print(classification_report(test_y, pred_y))
    print(accuracy_score(test_y, pred_y))

    # Only the DOT file is created here. The context manager guarantees the
    # handle is closed even when export_graphviz raises.
    with open('tre.dot', 'w') as dot_file:
        tree.export_graphviz(dtc, out_file=dot_file)
def fit_logistic(train_X, train_y, test_X, test_y):
    """Train a logistic regression and report how it does on the test split.

    Prints the classification report, then the accuracy, passes the true and
    predicted labels to ``show_confusion_matrix`` and returns the predictions.

    Parameters
    ----------
    train_X, test_X
        Feature matrices of the training and test splits.
    train_y
        Training labels; passed to ``fit`` through its ``.flat`` attribute.
    test_y
        True labels of the test split.

    Returns
    -------
    The labels predicted for ``test_X``.
    """
    model = linear_model.LogisticRegression().fit(train_X, train_y.flat)
    predictions = model.predict(test_X)

    # Report table first, bare accuracy value second; the numbers vary with
    # the random sampling that produced the splits.
    report = classification_report(test_y, predictions)
    print(report)
    accuracy = accuracy_score(test_y, predictions)
    print(accuracy)

    show_confusion_matrix(test_y, predictions)
    return predictions
def fit_decision_tree(train_X, train_y, test_X, test_y):
    """Fit a decision tree, print its test metrics and export it as a DOT file.

    A ``tree.DecisionTreeClassifier`` built with no arguments is fitted on the
    training split. The classification report and then the accuracy for the
    test split are printed, and the fitted tree is written to ``tre.dot`` in
    the current working directory.

    Parameters
    ----------
    train_X, test_X
        Feature matrices of the training and test splits.
    train_y
        Training labels; passed to ``fit`` through its ``.flat`` attribute.
    test_y
        True labels of the test split.

    Returns
    -------
    None
    """
    dtc = tree.DecisionTreeClassifier()
    dtc = dtc.fit(train_X, train_y.flat)
    pred_y = dtc.predict(test_X)

    # Output is the report table followed by the bare accuracy value.
    print(classification_report(test_y, pred_y))
    print(accuracy_score(test_y, pred_y))

    # Only the DOT file is created here. The context manager guarantees the
    # handle is closed even when export_graphviz raises.
    with open('tre.dot', 'w') as dot_file:
        tree.export_graphviz(dtc, out_file=dot_file)
def check_folding(classifier, check_instance=True, has_staged_pp=True, has_importances=True):
    """Exercise a folding classifier on generated data and assert basic quality.

    The classifier is fitted on the generated sample, run through
    ``check_classification_model`` and then queried with a mean-vote
    aggregation function. Accuracy must exceed 0.7 and ROC AUC 0.8; the
    probabilities must be non-negative, sum to one and be reproducible.
    When ``has_staged_pp`` is true, every staged prediction must have shape
    ``(len(X), 2)`` and the last one must equal the final probabilities.
    """
    def mean_vote(x):
        # Aggregation handed to predict / predict_proba: mean along axis 0.
        return numpy.mean(x, axis=0)

    data, target, weights = generate_classification_data(distance=0.6)

    assert classifier == classifier.fit(data, target, sample_weight=weights)
    assert list(classifier.features) == list(data.columns)

    check_classification_model(
        classifier, data, target,
        check_instance=check_instance,
        has_staged_pp=has_staged_pp,
        has_importances=has_importances,
    )

    predicted_labels = classifier.predict(data, mean_vote)
    probabilities = classifier.predict_proba(data, mean_vote)
    # A second call must reproduce the first one exactly.
    assert numpy.all(probabilities == classifier.predict_proba(data, mean_vote))

    accuracy = accuracy_score(target, predicted_labels)
    print(accuracy)
    assert accuracy > 0.7

    assert numpy.allclose(probabilities.sum(axis=1), 1), 'probabilities do not sum to 1'
    assert numpy.all(probabilities >= 0.), 'negative probabilities'

    area_under_curve = roc_auc_score(target, probabilities[:, 1])
    print(area_under_curve)
    assert area_under_curve > 0.8

    if not has_staged_pp:
        return

    for staged in classifier.staged_predict_proba(data, mean_vote):
        assert staged.shape == (len(data), 2)
    # checking that last iteration coincides with previous
    assert numpy.all(staged == probabilities)
def main():
    """Grid-search a TF-IDF + logistic regression pipeline on review phrases.

    Reads ``movie-reviews/train.tsv``, splits it in half, runs an
    accuracy-scored grid search over vectoriser and classifier settings on
    the training half, then prints the best score, the winning value of every
    searched parameter, and accuracy / precision / recall / F1 on the other
    half.
    """
    search_space = {
        'vect__max_df': (0.25, 0.5, 0.75),
        'vect__stop_words': ('english', None),
        'vect__max_features': (5000, 10000, None),
        'vect__ngram_range': ((1, 1), (1, 2)),
        'vect__use_idf': (True, False),
        'vect__norm': ('l1', 'l2'),
        'clf__penalty': ('l1', 'l2'),
        'clf__C': (0.1, 1, 10),
    }
    model = Pipeline([
        ('vect', TfidfVectorizer()),
        ('clf', LogisticRegression()),
    ])

    frame = pd.read_csv('movie-reviews/train.tsv', header=0, delimiter='\t')
    phrases = frame['Phrase']
    sentiments = frame['Sentiment'].as_matrix()
    X_train, X_test, y_train, y_test = train_test_split(
        phrases, sentiments, train_size=0.5)

    search = GridSearchCV(model, search_space, n_jobs=-1, verbose=1,
                          scoring='accuracy')
    search.fit(X_train, y_train)

    print('Best score: %0.3f' % search.best_score_)
    print('Best parameters set:')
    chosen = search.best_estimator_.get_params()
    for name in sorted(search_space):
        print('\t%s: %r' % (name, chosen[name]))

    predicted = search.predict(X_test)
    print('Accuracy: %s' % (accuracy_score(y_test, predicted),))
    print('Precision: %s' % (precision_score(y_test, predicted),))
    print('Recall: %s' % (recall_score(y_test, predicted),))
    print('F1 score: %s' % (f1_score(y_test, predicted),))
def calc_metrics(true_labels, predicted_labels):
    """Return accuracy plus micro-averaged precision, recall and F1.

    Parameters
    ----------
    true_labels : list, ndarray
        Ground-truth labels.
    predicted_labels : list, ndarray
        Labels produced by the model.

    Returns
    -------
    (float, float, float, float)
        ``(accuracy, precision, recall, f1)``

    Example
    -------
    >>> y_true = [0, 1, 1, 0]
    >>> y_pred = [0, 0, 1, 1]
    >>> calc_metrics(y_true, y_pred)
    (0.5, 0.5, 0.5, 0.5)
    """
    accuracy = accuracy_score(true_labels, predicted_labels)
    # The fourth element (support) is not part of the returned tuple.
    scores = precision_recall_fscore_support(
        true_labels, predicted_labels, average='micro')
    precision, recall, f_score = scores[0], scores[1], scores[2]
    return (accuracy, precision, recall, f_score)
def test_quality(n_samples=3000):
    """Check that both uBoost variants reach a ROC AUC above 0.7.

    For each boosting algorithm ('SAMME' and 'SAMME.R') a ``uBoostBDT`` and a
    ``uBoostClassifier`` are fitted on one generated sample and evaluated on
    another; the accuracy of each is printed.
    """
    # Test data is generated before training data, in that order.
    testX, testY = generate_sample(n_samples, 10, 0.6)
    trainX, trainY = generate_sample(n_samples, 10, 0.6)

    shared_kwargs = dict(
        n_neighbors=10,
        n_estimators=10,
        uniform_variables=['column0'],
        base_estimator=DecisionTreeClassifier(min_samples_leaf=20, max_depth=5),
    )

    for algorithm in ('SAMME', 'SAMME.R'):
        uboost = uBoostClassifier(
            algorithm=algorithm, efficiency_steps=5, **shared_kwargs)
        bdt = uBoostBDT(algorithm=algorithm, **shared_kwargs)

        for model in (bdt, uboost):
            model.fit(trainX, trainY)
            probabilities = model.predict_proba(testX)
            hard_labels = model.predict(testX)
            assert roc_auc_score(testY, probabilities[:, 1]) > 0.7, \
                "quality is awful"
            print("Accuracy = %.3f" % accuracy_score(testY, hard_labels))
def main():
    """Grid-search a TF-IDF + logistic regression pipeline on a binarised target.

    Reads ``data/train.tsv`` (relative to a hard-coded working directory),
    splits it in half and reduces the ``Sentiment`` labels to the first
    indicator column produced by ``LabelBinarizer``. An accuracy-scored grid
    search runs on the training half; the best score, the winning parameter
    values and accuracy / precision / recall on the other half are printed.

    The test labels are binarised with the binarizer fitted on the training
    labels, so predictions and ground truth are in the same label space when
    the metrics are computed.
    """
    pipeline = Pipeline([
        ('vect', TfidfVectorizer(stop_words='english')),
        ('clf', LogisticRegression()),
    ])
    parameters = {
        'vect__max_df': (0.25, 0.5),
        'vect__ngram_range': ((1, 1), (1, 2)),
        'vect__use_idf': (True, False),
        'clf__C': (0.1, 1, 10),
    }

    # NOTE(review): machine-specific working directory; the relative path
    # below is resolved against it.
    os.chdir('C:\\Users\\Dan\\1) Python Notebooks\\Datasets')
    df = pd.read_csv('data/train.tsv', header=0, delimiter='\t')
    X, y = df['Phrase'], df['Sentiment'].as_matrix()
    X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.5)

    grid_search = GridSearchCV(pipeline, parameters, n_jobs=3, verbose=1,
                               scoring='accuracy')

    # Keep only the first indicator column of the binarised labels.
    lb = LabelBinarizer()
    y_train = np.array([row[0] for row in lb.fit_transform(y_train)])
    grid_search.fit(X_train, y_train)

    print('Best score: %0.3f' % grid_search.best_score_)
    print('Best parameters set:')
    best_parameters = grid_search.best_estimator_.get_params()
    for param_name in sorted(parameters.keys()):
        print('\t%s: %r' % (param_name, best_parameters[param_name]))

    predictions = grid_search.predict(X_test)

    # Apply the training-time encoding to the test labels. Re-binarising
    # y_train here would leave y_test in its raw form and make the metrics
    # compare two different label spaces.
    y_test = np.array([row[0] for row in lb.transform(y_test)])

    print('Accuracy: %s' % (accuracy_score(y_test, predictions),))
    print('Precision: %s' % (precision_score(y_test, predictions),))
    print('Recall: %s' % (recall_score(y_test, predictions),))
def main():
    """Grid-search a TF-IDF + logistic regression pipeline and report results.

    Reads ``data/train.tsv``, splits it in half, runs an accuracy-scored grid
    search on the training half and prints the best score, the winning
    parameter values, and the accuracy, confusion matrix and classification
    report for the other half.
    """
    search_space = {
        'vect__max_df': (0.25, 0.5),
        'vect__ngram_range': ((1, 1), (1, 2)),
        'vect__use_idf': (True, False),
        'clf__C': (0.1, 1, 10),
    }
    model = Pipeline([
        ('vect', TfidfVectorizer(stop_words='english')),
        ('clf', LogisticRegression()),
    ])

    frame = pd.read_csv('data/train.tsv', header=0, delimiter='\t')
    phrases = frame['Phrase']
    sentiments = frame['Sentiment'].as_matrix()
    X_train, X_test, y_train, y_test = train_test_split(
        phrases, sentiments, train_size=0.5)

    search = GridSearchCV(model, search_space, n_jobs=3, verbose=1,
                          scoring='accuracy')
    search.fit(X_train, y_train)

    print('Best score: %0.3f' % search.best_score_)
    print('Best parameters set:')
    chosen = search.best_estimator_.get_params()
    for name in sorted(search_space):
        print('\t%s: %r' % (name, chosen[name]))

    predicted = search.predict(X_test)
    print('Accuracy: %s' % (accuracy_score(y_test, predicted),))
    print('Confusion Matrix: %s' % (confusion_matrix(y_test, predicted),))
    print('Classification Report: %s'
          % (classification_report(y_test, predicted),))
def main():
    """Tune stop-word handling for a TF-IDF + logistic regression model.

    Reads ``movie-reviews/train.tsv``, splits it in half and grid-searches
    only ``vect__stop_words`` (the remaining grid dimensions are disabled).
    Prints the best score and parameters, then accuracy, classification
    report and confusion matrix on the other half, shows the matrix as an
    image, and finally prints the same metrics for a degenerate predictor
    that always answers ``2``.
    """
    # Only the stop-word setting is searched; max_df, max_features,
    # ngram_range, use_idf, norm, penalty and C are switched off.
    search_space = {
        'vect__stop_words': ('english', None),
    }
    model = Pipeline([('vect', TfidfVectorizer()),
                      ('clf', LogisticRegression())])

    frame = pd.read_csv('movie-reviews/train.tsv', header=0, delimiter='\t')
    phrases = frame['Phrase']
    sentiments = frame['Sentiment'].as_matrix()
    X_train, X_test, y_train, y_test = train_test_split(
        phrases, sentiments, train_size=0.5)

    search = GridSearchCV(model, search_space, n_jobs=-1, verbose=1,
                          scoring='accuracy')
    search.fit(X_train, y_train)

    print('Best score: %0.3f' % search.best_score_)
    print('Best parameters set:')
    chosen = search.best_estimator_.get_params()
    for name in sorted(search_space):
        print('\t%s: %r' % (name, chosen[name]))

    predicted = search.predict(X_test)
    print('Accuracy: %s' % (accuracy_score(y_test, predicted),))
    print('Classification Report: %s'
          % (classification_report(y_test, predicted),))

    from sklearn.metrics import confusion_matrix
    matrix = confusion_matrix(y_test, predicted)
    print(matrix)
    plt.matshow(matrix)
    plt.title('Confusion matrix')
    plt.colorbar()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

    # Baseline: the constant answer 2 for every sample.
    constant_guess = np.ones(len(predicted)) * 2
    print('Accuracy: %s' % (accuracy_score(y_test, constant_guess),))
    print('Degenerate Classification Report: %s'
          % (classification_report(y_test, constant_guess),))
def train(self, instances, labels, centroid):
    """Fit the base estimator on centroid-weighted samples.

    Stores ``centroid``, asks ``self.sampler_weigher`` for per-sample weights,
    records the weighted average of ``instances`` as ``self.sample_centroid``
    and fits ``self.base_estimator`` with those weights. Samples whose weight
    is exactly zero are treated as out-of-bag: if any exist, the accuracy of
    ``self.predict`` on them is stored in ``self.oob_accuracy``.

    Returns
    -------
    self
    """
    self.centroid = centroid
    weights = self.sampler_weigher.get_sample_weights(instances, centroid)
    self.sample_centroid = numpy.average(instances, axis=0, weights=weights)
    self.base_estimator.fit(instances, labels, sample_weight=weights)

    held_out_instances = instances[weights == 0]
    held_out_labels = labels[weights == 0]
    if len(held_out_instances) == 0:
        # Nothing was left out, so there is no out-of-bag score to record.
        return self

    self.oob_accuracy = accuracy_score(
        held_out_labels, self.predict(held_out_instances))
    return self
def evaluate(df):
    """Fit logistic regression two ways on ``df`` and print diagnostics.

    Columns 0-6 (``df.ix[:, 0:7]``) are the features and ``df["seed"]`` is
    the target. After a 90/10 split with ``random_state=42`` a plain
    ``LogisticRegression`` and a one-step pipeline wrapped in a
    ``GridSearchCV`` with an empty grid are fitted. For each, every test
    label is printed next to its prediction, followed by the scores.
    """
    features = df.ix[:, 0:7]
    target = df["seed"]
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.1, random_state=42)
    print(len(X_train))
    y_test = np.array(y_test)

    plain_model = LogisticRegression()
    plain_model.fit(X_train, y_train)
    print("------------ %s" % (plain_model.predict_proba(X_test),))
    print(plain_model.get_params())

    # The grid is empty, so the search fits the default pipeline only.
    grid = {}
    search = GridSearchCV(Pipeline([('clf', LogisticRegression())]), grid,
                          n_jobs=1, verbose=1)
    search.fit(X_train, y_train)
    print("Best score: %s" % (search.best_score_,))
    print("Best parameters set:")
    chosen = search.best_estimator_.get_params()
    for name in sorted(grid):
        print((name, chosen[name]))

    searched_prediction = search.predict(X_test)
    for position, guess in enumerate(searched_prediction):
        print("original: %s predicted %s" % (y_test[position], guess))
    print(search.score(X_test, y_test))
    print(accuracy_score(y_test, searched_prediction))
    print("classification_report %s"
          % (classification_report(y_test, searched_prediction),))

    plain_prediction = plain_model.predict(X_test)
    for position, guess in enumerate(plain_prediction):
        print("original: %s predicted %s" % (y_test[position], guess))
    print(accuracy_score(y_test, plain_prediction))
    print(plain_model.score(X_test, y_test))
def test_factory():
    """End-to-end check of ``ClassifiersFactory`` on generated data.

    Covers, in order: registering classifiers, parallel fitting, per-model
    accuracy (> 0.7) and ROC AUC (> 0.8), staged probabilities, a pickle
    round-trip, and building reports that are then checked with several masks.
    """
    factory = ClassifiersFactory()
    # TMVA is optional: it is registered only when its module can be imported.
    try:
        from rep.estimators.tmva import TMVAClassifier
        factory.add_classifier('tmva', TMVAClassifier())
    except ImportError:
        pass
    factory.add_classifier('rf', RandomForestClassifier(n_estimators=10))
    factory.add_classifier('ada', AdaBoostClassifier(n_estimators=20))
    X, y, sample_weight = generate_classification_data()
    # fit() is expected to return the factory itself.
    assert factory == factory.fit(X, y, sample_weight=sample_weight, features=list(X.columns), parallel_profile='threads-4')
    for cl in factory.values():
        assert list(cl.features) == list(X.columns)
    proba = factory.predict_proba(X, parallel_profile='threads-4')
    labels = factory.predict(X, parallel_profile='threads-4')
    # Hard-label quality, one entry per registered classifier.
    for key, val in labels.items():
        score = accuracy_score(y, val)
        print(key, score)
        assert score > 0.7, key
    # Probability sanity checks and ranking quality.
    for key, val in proba.items():
        assert numpy.allclose(val.sum(axis=1), 1), 'probabilities do not sum to 1'
        assert numpy.all(val >= 0.), 'negative probabilities'
        auc_score = roc_auc_score(y, val[:, 1])
        print(auc_score)
        assert auc_score > 0.8
    for key, iterator in factory.staged_predict_proba(X).items():
        assert key != 'tmva', 'tmva does not support staged pp'
        for p in iterator:
            assert p.shape == (len(X), 2)
        # checking that last iteration coincides with previous
        assert numpy.all(p == proba[key])
    # testing picklability
    dump_string = cPickle.dumps(factory)
    clf_loaded = cPickle.loads(dump_string)
    assert type(factory) == type(clf_loaded)
    probs1 = factory.predict_proba(X)
    probs2 = clf_loaded.predict_proba(X)
    for key, val in probs1.items():
        assert numpy.all(val == probs2[key]), 'something strange was loaded'
    # Report built directly from a single classifier of the factory.
    report = ClassificationReport({'rf': factory['rf']}, LabeledDataStorage(X, y, sample_weight))
    report.feature_importance_shuffling(roc_auc_score_mod).plot(new_plot=True, figsize=(18, 3))
    # Both report entry points are called; only the last result is kept for
    # the mask checks below.
    report = factory.test_on_lds(LabeledDataStorage(X, y, sample_weight))
    report = factory.test_on(X, y, sample_weight=sample_weight)
    val = numpy.mean(X['column0'])
    # Mask given as an expression string, as a callable, and as None.
    check_report_with_mask(report, "column0 > %f" % (val / 2.), X)
    check_report_with_mask(report, lambda x: numpy.array(x['column0']) < val * 2., X)
    check_report_with_mask(report, None, X)
def run_model(self, train_path, test_path):
    """Train on one file, evaluate on another and print the report.

    Data is loaded with ``self.load_data``, the model is trained with
    ``self.train_model`` and predictions come from ``self.predict_res``.
    The printed text is the classification report for labels ``1`` and ``0``
    (named 'interested' and 'nointerested') followed by an accuracy line.
    """
    train_features, train_labels = self.load_data(train_path)
    self.train_model(train_features, train_labels)

    test_features, expected = self.load_data(test_path)
    predicted = self.predict_res(test_features)

    accuracy = accuracy_score(expected, predicted)
    report = classification_report(
        expected, predicted,
        labels=[1, 0],
        target_names=['interested', 'nointerested'],
    )
    print(report + '\naccuracy\t' + str(accuracy))
def main():
    """Tune stop-word handling for a TF-IDF + logistic regression model.

    Reads ``movie-reviews/train.tsv``, splits it in half and grid-searches
    only ``vect__stop_words`` (the remaining grid dimensions are disabled).
    Prints the best score and parameters, then accuracy, classification
    report and confusion matrix on the other half, shows the matrix as an
    image, and finally prints the same metrics for a degenerate predictor
    that always answers ``2``.
    """
    # Only the stop-word setting is searched; max_df, max_features,
    # ngram_range, use_idf, norm, penalty and C are switched off.
    search_space = {
        'vect__stop_words': ('english', None),
    }
    model = Pipeline([
        ('vect', TfidfVectorizer()),
        ('clf', LogisticRegression()),
    ])

    frame = pd.read_csv('movie-reviews/train.tsv', header=0, delimiter='\t')
    phrases = frame['Phrase']
    sentiments = frame['Sentiment'].as_matrix()
    X_train, X_test, y_train, y_test = train_test_split(
        phrases, sentiments, train_size=0.5)

    search = GridSearchCV(model, search_space, n_jobs=-1, verbose=1,
                          scoring='accuracy')
    search.fit(X_train, y_train)

    print('Best score: %0.3f' % search.best_score_)
    print('Best parameters set:')
    chosen = search.best_estimator_.get_params()
    for name in sorted(search_space):
        print('\t%s: %r' % (name, chosen[name]))

    predicted = search.predict(X_test)
    print('Accuracy: %s' % (accuracy_score(y_test, predicted),))
    print('Classification Report: %s'
          % (classification_report(y_test, predicted),))

    from sklearn.metrics import confusion_matrix
    matrix = confusion_matrix(y_test, predicted)
    print(matrix)
    plt.matshow(matrix)
    plt.title('Confusion matrix')
    plt.colorbar()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()

    # Baseline: the constant answer 2 for every sample.
    constant_guess = np.ones(len(predicted)) * 2
    print('Accuracy: %s' % (accuracy_score(y_test, constant_guess),))
    print('Degenerate Classification Report: %s'
          % (classification_report(y_test, constant_guess),))
def Run(self, trainFileDir, testFileDir):
    """Train on ``trainFileDir``, evaluate on ``testFileDir``, print the report.

    Data is loaded with ``self.loadData``, the model is trained with
    ``self.trainModel`` and predictions come from ``self.predict``. The
    printed text is the classification report for labels ``1`` and ``0``
    (named 'interested' and 'notInterested') followed by an accuracy line.
    """
    trainFeatures, trainTargets = self.loadData(trainFileDir)
    self.trainModel(trainFeatures, trainTargets)

    testFeatures, expected = self.loadData(testFileDir)
    predicted = self.predict(testFeatures)

    accuracy = accuracy_score(expected, predicted)
    summary = classification_report(
        expected, predicted,
        labels=[1, 0],
        target_names=['interested', 'notInterested'],
    )
    print(summary + '\naccuracy\t' + str(accuracy))
def evaluate(df):
    """Fit logistic regression two ways on ``df`` and print diagnostics.

    Columns 0-6 (``df.ix[:, 0:7]``) are the features and ``df["seed"]`` is
    the target. After a 90/10 split with ``random_state=42`` a plain
    ``LogisticRegression`` and a one-step pipeline wrapped in a
    ``GridSearchCV`` with an empty grid are fitted. For each, every test
    label is printed next to its prediction, followed by the scores.
    """
    features = df.ix[:, 0:7]
    target = df["seed"]
    X_train, X_test, y_train, y_test = train_test_split(
        features, target, test_size=0.1, random_state=42)
    print(len(X_train))
    y_test = np.array(y_test)

    plain_model = LogisticRegression()
    plain_model.fit(X_train, y_train)
    print("------------ %s" % (plain_model.predict_proba(X_test),))
    print(plain_model.get_params())

    # The grid is empty, so the search fits the default pipeline only.
    grid = {}
    search = GridSearchCV(Pipeline([('clf', LogisticRegression())]), grid,
                          n_jobs=1, verbose=1)
    search.fit(X_train, y_train)
    print("Best score: %s" % (search.best_score_,))
    print("Best parameters set:")
    chosen = search.best_estimator_.get_params()
    for name in sorted(grid):
        print((name, chosen[name]))

    searched_prediction = search.predict(X_test)
    for position, guess in enumerate(searched_prediction):
        print("original: %s predicted %s" % (y_test[position], guess))
    print(search.score(X_test, y_test))
    print(accuracy_score(y_test, searched_prediction))
    print("classification_report %s"
          % (classification_report(y_test, searched_prediction),))

    plain_prediction = plain_model.predict(X_test)
    for position, guess in enumerate(plain_prediction):
        print("original: %s predicted %s" % (y_test[position], guess))
    print(accuracy_score(y_test, plain_prediction))
    print(plain_model.score(X_test, y_test))
def analyze_run(self, prediction_matrix, labels):
    """Record accuracy, precision, recall and F1 for every prediction column.

    Column ``j`` of ``prediction_matrix`` is scored against ``labels`` and the
    results are written to row ``self.current_run``, column ``j`` of the four
    ``*_sample`` tables. The run counter is advanced once all columns
    (``self.accuracy_sample.shape[1]`` of them) have been processed.
    """
    column_count = self.accuracy_sample.shape[1]
    for column in range(column_count):
        column_predictions = prediction_matrix[:, column]

        accuracy = accuracy_score(labels, column_predictions)
        self.accuracy_sample[self.current_run][column] = accuracy

        scores = precision_recall_f1_score(labels, column_predictions)
        precision, recall, f1 = scores
        self.precision_sample[self.current_run][column] = precision
        self.recall_sample[self.current_run][column] = recall
        self.f1_sample[self.current_run][column] = f1

    self.current_run += 1
def Run(self, trainFileDir, testFileDir):
    """Train on ``trainFileDir``, evaluate on ``testFileDir``, print the report.

    Data is loaded with ``self.loadData``, the model is trained with
    ``self.trainModel`` and predictions come from ``self.predict``. The
    printed text is the classification report for labels ``1`` and ``0``
    (named 'interested' and 'notInterested') followed by an accuracy line.
    """
    trainFeatures, trainTargets = self.loadData(trainFileDir)
    self.trainModel(trainFeatures, trainTargets)

    testFeatures, expected = self.loadData(testFileDir)
    predicted = self.predict(testFeatures)

    accuracy = accuracy_score(expected, predicted)
    summary = classification_report(
        expected, predicted,
        labels=[1, 0],
        target_names=['interested', 'notInterested'],
    )
    print(summary + '\naccuracy\t' + str(accuracy))
def fit_logistic(train_X, train_y, test_X, test_y):
    """Train a logistic regression and report how it does on the test split.

    Prints the classification report, then the accuracy, passes the true and
    predicted labels to ``show_confusion_matrix`` and returns the predictions.

    Parameters
    ----------
    train_X, test_X
        Feature matrices of the training and test splits.
    train_y
        Training labels; passed to ``fit`` through its ``.flat`` attribute.
    test_y
        True labels of the test split.

    Returns
    -------
    The labels predicted for ``test_X``.
    """
    model = linear_model.LogisticRegression().fit(train_X, train_y.flat)
    predictions = model.predict(test_X)

    # Report table first, bare accuracy value second; the numbers vary with
    # the random sampling that produced the splits.
    report = classification_report(test_y, predictions)
    print(report)
    accuracy = accuracy_score(test_y, predictions)
    print(accuracy)

    show_confusion_matrix(test_y, predictions)
    return predictions
def train(self, instances, labels, centroid):
    """Fit the base estimator on centroid-weighted samples.

    Stores ``centroid``, asks ``self.sampler_weigher`` for per-sample weights,
    records the weighted average of ``instances`` as ``self.sample_centroid``
    and fits ``self.base_estimator`` with those weights. Samples whose weight
    is exactly zero are treated as out-of-bag: if any exist, the accuracy of
    ``self.predict`` on them is stored in ``self.oob_accuracy``.

    Returns
    -------
    self
    """
    self.centroid = centroid
    weights = self.sampler_weigher.get_sample_weights(instances, centroid)
    self.sample_centroid = numpy.average(instances, axis=0, weights=weights)
    self.base_estimator.fit(instances, labels, sample_weight=weights)

    held_out_instances = instances[weights == 0]
    held_out_labels = labels[weights == 0]
    if len(held_out_instances) == 0:
        # Nothing was left out, so there is no out-of-bag score to record.
        return self

    self.oob_accuracy = accuracy_score(
        held_out_labels, self.predict(held_out_instances))
    return self
def Classify(txtList, txtLabels, fileName, labelList, train_size=300):
    """Train a one-vs-rest linear SVM on text and write its metrics to a file.

    The first ``train_size`` items of ``txtList`` / ``txtLabels`` form the
    training split and all remaining items form the test split. The pipeline
    is ``CountVectorizer`` -> ``TfidfTransformer`` ->
    ``OneVsRestClassifier(LinearSVC())``.

    Parameters
    ----------
    txtList
        Sequence of documents.
    txtLabels
        Sequence of labels aligned with ``txtList``.
    fileName
        Path of the report file; it is overwritten.
    labelList
        Class names used as ``target_names`` in the classification report.
    train_size : int, optional
        Number of leading items used for training (default 300).

    Returns
    -------
    None
    """
    x_train = np.array(txtList[:train_size])
    y_train = np.array(txtLabels[:train_size])
    # The test split starts exactly where the training split ends, so no
    # sample is silently left out of both splits.
    x_test = np.array(txtList[train_size:])
    y_test = np.array(txtLabels[train_size:])

    classifier = Pipeline([
        ('vectorizer', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', OneVsRestClassifier(LinearSVC())),
    ])
    classifier.fit(x_train, y_train)
    predicted = classifier.predict(x_test)

    # Compute every metric before touching the file so a failing metric does
    # not leave a half-written report behind.
    sections = [
        metrics.classification_report(y_test, predicted,
                                      target_names=labelList),
        '\nNumber of Labels:' + str(len(labelList)),
        '\nhamming loss : ' + str(metrics.hamming_loss(y_test, predicted)),
        '\nf-beta(beta=0.5 - biased towards Precision) : '
        + str(metrics.fbeta_score(y_test, predicted, beta=0.5)),
        '\nzero-loss:' + str(zero_one_loss(y_test, predicted)),
        '\nAccuracy score:' + str(metrics.accuracy_score(y_test, predicted)),
    ]
    with open(fileName, 'w') as f:
        f.write(''.join(sections))
def main(argv):
    """Print evaluation metrics for the prediction columns of a TSV file.

    Options (both required)::

        -d FILE   tab-separated data file; its first row is skipped
        -c N      index of the column that holds the true labels

    Columns 2 to 8 of the file are each treated as the predictions of one
    algorithm. For every such column the classification report, accuracy,
    confusion matrix, specificity (``tn / (tn + fp)``, read from the first
    row of the confusion matrix) and Matthews correlation coefficient are
    printed.

    Exits with status 2 when the options cannot be parsed or when a required
    option is missing.
    """
    try:
        opts, args = getopt.getopt(argv, "d:c:")
    except getopt.GetoptError:
        sys.exit(2)

    data_file = None
    label_col = None
    for opt, arg in opts:
        if opt == '-d':
            data_file = arg
        elif opt == '-c':
            label_col = int(arg)
    if data_file is None or label_col is None:
        # Without this check a missing option surfaces later as a NameError.
        sys.exit(2)

    y_true = np.genfromtxt(data_file, usecols=label_col, delimiter="\t",
                           skip_header=1)
    for lab in range(2, 9):
        print("lab %s" % (lab,))
        y_pred = np.genfromtxt(data_file, usecols=lab, delimiter="\t",
                               skip_header=1)
        print("The classification report for Algorithm %s is \n" % (lab,))
        # Make classification report
        print(metrics.classification_report(y_true, y_pred))
        print("Accuracy: %.6f" % metrics.accuracy_score(y_true, y_pred))

        # Compute specificity from the confusion matrix
        cm = confusion_matrix(y_true, y_pred)
        print("Confusion matrix as \n%s" % (cm,))
        tn = int(cm[0, 0])
        fp = int(cm[0, 1])
        print("tn %s" % (tn,))
        print("fp %s" % (fp,))
        # Force true division: with two ints Python 2 would floor the ratio
        # to 0 or 1. An empty first row has no defined specificity.
        negatives = tn + fp
        s = float(tn) / negatives if negatives else float('nan')
        print("Speicificity is %s \n" % (s,))
        print("Metthiew correlation co-efficient: %.6f" % matthews_corrcoef(
            y_true, y_pred))
def _format_score(value):
    """Render a score with ``%f`` when it is a scalar, else fall back to ``str``."""
    try:
        return '%f' % value
    except TypeError:
        # Per-class results are arrays, which '%f' cannot format.
        return str(value)


def main(argv=None):
    """Log accuracy and, on request, precision / recall / F-measure.

    True and predicted labels are read line by line from the files named by
    ``args.true_labels`` and ``args.pred_labels``. Accuracy is always logged.
    Precision, recall and F-measure are computed only when at least one of
    the corresponding flags is set, using ``args.beta``, ``args.pos_label``
    and ``args.avg`` (a falsy ``args.avg`` means no averaging, i.e. one value
    per class).

    Parameters
    ----------
    argv : list of str, optional
        Command-line arguments; defaults to ``sys.argv[1:]``.
    """
    if argv is None:
        argv = sys.argv[1:]
    args = parser.parse_args(argv)
    log.info('start parameters: ' + str(args))

    log.info('loading data')
    true = list(file_line_generator(args.true_labels))
    pred = list(file_line_generator(args.pred_labels))

    acc = accuracy_score(true, pred)
    log.info('accuracy: %f' % acc)

    if args.precision or args.recall or args.f_measure:
        # beta is passed by keyword so the call does not depend on its
        # position in the signature.
        p, r, f, _ = precision_recall_fscore_support(
            true, pred,
            beta=args.beta,
            pos_label=args.pos_label,
            average=args.avg if args.avg else None)
        if args.precision:
            log.info('precision: ' + _format_score(p))
        if args.recall:
            log.info('recall: ' + _format_score(r))
        if args.f_measure:
            log.info('f-measure: ' + _format_score(f))

    log.info('finished')
if __name__ == '__main__':
    # Evaluate the pickled model 'Training_model.pkl' on every file found
    # under the directory given as the single command-line argument.
    if len(sys.argv) != 2:
        # Exactly one argument is accepted, so the usage text names one.
        print ("Illegal use of Arguments: Best_configuration.py <Testing_Samples_Location>")
        sys.exit(1)
    test = sys.argv[1]
    header_list = []
    labels = []
    header_test = []
    test_labels = []
    i = 0
    for root, dirs, files in os.walk(test):
        for name in files:
            # The context manager closes each file as soon as it is read.
            with open(os.path.join(root, name), "r") as fo:
                content = fo.read().replace('\n', ' ')
            # Drop everything up to and including the "Lines: <n> " field.
            body = re.sub(r'^(.*) Lines: (\d)+ ', "", content)
            header_test.append(unicode(body, errors='ignore'))
            test_labels.append(i)
        # NOTE(review): the label is taken to advance once per directory
        # visited by os.walk (the top directory included) - confirm this
        # matches the label numbering used when the model was trained.
        i = i + 1

    text_clf01 = joblib.load('Training_model.pkl')
    predicted01 = text_clf01.predict(header_test)
    print("Removed Stop Words + L2 penalization")
    print ("F1:", metrics.f1_score(test_labels, predicted01, average='macro'))
    print ("accuracy:", metrics.accuracy_score(test_labels, predicted01))
    print ("precision:", metrics.precision_score(test_labels, predicted01, average='macro'))
    print ("recall:", metrics.recall_score(test_labels, predicted01, average='macro'))
plt.show() #There's a pitch-perfect illustration of overfitting. Look at the gulf between the training and cv scores. As we train #on more and more examples, the training score does decrease and cv scores increases but we'll need exponentially more #examples to reduce the gulf between the two. Let's confirm understanding by looking at the test scores. # In[23]: #Let's see how our trained model performs on the test set. We are not going to train on this set merely looking at how well #our model can generalize. #Calling Fit on the estimator object so we can predict. We're NOT retraining the classifier here. estimator.fit(X_train, y_train) y_pred = estimator.predict(X_test) print metrics.classification_report(y_test, y_pred) print "Decision Trees: Final Generalization Accuracy: %.6f" % metrics.accuracy_score( y_test, y_pred) #That's not too bad but we can get a much better result if we addressed the overfitting problem. Let's now try the random #forests classifier to see how it does. # In[25]: #WARNING - THIS MIGHT TAKE A WHILE TO RUN. TRY ADJUSTING parameters such as n_jobs (jobs to run in parallel, before #increasing this make sure your system can handle it), n_iter for ShuffleSplit (in the function definition) and reducing #number of values being tried for max_depth/n_estimators. #SELECT INTERRUPT IN THE MENU AND PRESS INTERRUPT KERNEL IF YOU NEEDD TO STOP EXECUTION max_depth = np.linspace(5, 10, 5) n_estimators = [10, 100, 1000]
import pylab as pl

features_train, labels_train, features_test, labels_test = makeTerrainData()

# ------------------------------ DECISION TREE ------------------------------
from classifyDT import classify
from sklearn.metrics.metrics import accuracy_score

clf = classify(features_train, labels_train, 50.0)
clf.fit(features_train, labels_train)
pred = clf.predict_proba(features_test)

# Turn the second probability column into hard labels by rounding.
roundedNumber = [round(pred[i, 1]) for i in range(0, len(pred))]

# Accuracy is measured on the test set.
acc = accuracy_score(labels_test, roundedNumber)
print(acc)


def submitAccuracies():
    """Return the test accuracy rounded to three decimals under key 'acc'."""
    return {"acc": round(acc, 3)}
# Sweep the anomaly threshold: for ten multipliers between 0.5 and 1.5 the
# cut-off is the (anomaly_prob * multiplier)-th percentile of y_prob. Scores
# above the cut-off are labelled 1, everything else 0, and the resulting F1
# (positive label 0) and accuracy are stored in row `pos` of result_table.
for multiplier in np.linspace(0.5, 1.5, 10):
    threshold = np.percentile(y_prob, anomaly_prob * multiplier)
    y_label = [1 if elem > threshold else 0 for elem in y_prob]

    result = classification_report(y, y_label, labels=[0, 1],
                                   target_names=['anomaly', 'normal'])
    f1 = f1_score(y, y_label, pos_label=0)
    accuracy = accuracy_score(y, y_label)

    # Model name and encoder output are parsed out of '.txt' file names;
    # any other name gets '-' placeholders.
    if not data_name.endswith('.txt'):
        encode_output = '-'
        dbn_model = '-'
    else:
        encode_output = data_name.split('_')[4]
        dbn_model = data_name[data_name.rindex('_') + 1:data_name.index('.')]

    result_table.loc[pos, :] = [data_name, dbn_model, encode_output, n_comp,
                                cov_type, anomaly_prob, multiplier, f1,
                                accuracy]
def whole_dataset_train_test(X, y):
    """Fit a random forest on all of ``X`` / ``y`` and score it on the same data.

    Prints a heading, the accuracy and the classification report. Because
    the model is evaluated on the data it was fitted on, the figures describe
    the fit, not generalization.
    """
    forest = RandomForestClassifier()
    forest = forest.fit(X, y)
    fitted_predictions = forest.predict(X)

    print("When fitted on the whole dataset with selected features, then the classification report is found to be:\n")
    print("Random Forests: Accuracy: %.6f"
          % metrics.accuracy_score(y, fitted_predictions))
    print(metrics.classification_report(y, fitted_predictions))
# Vectorise the text: the vocabulary is learned on the training documents
# only and then applied to the test documents.
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import classification_report

# One binary classifier per label, for the first three entries of
# good_categories; truths and predictions are collected for a final
# Hamming loss over all of them.
y_true_all = []
predictions_all = []
for label in good_categories[:3]:
    print('label %s' % (label,))
    # 1 when the instance carries this label, 0 otherwise.
    y_train = [1 if label in instance else 0 for instance in y_train_all]
    y_test = [1 if label in instance else 0 for instance in y_test_all]
    y_true_all.append(y_test)

    classifier = LogisticRegression()
    # Only the fitted model is needed; fit() trains it without computing a
    # transformed copy of X_train that would be discarded anyway.
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    predictions_all.append(predictions)

    print(classification_report(y_test, predictions))
    print(confusion_matrix(y_test, predictions))
    print('precision %s' % (precision_score(y_test, predictions),))
    print('recall %s' % (recall_score(y_test, predictions),))
    print('accuracy %s' % (accuracy_score(y_test, predictions),))
    print('\n')

y_true_all = np.array(y_true_all)
predictions_all = np.array(predictions_all)
print(hamming_loss(y_true_all, predictions_all))
# Build document-term matrices: the vocabulary is learned from the training
# text only and then applied to the test text.
vect = CountVectorizer()
train_dtm = vect.fit_transform(X_train)
test_dtm = vect.transform(X_test)
#Task 5
# Fit multinomial naive Bayes and report accuracy on the test matrix.
from sklearn.naive_bayes import MultinomialNB
nb = MultinomialNB()
nb.fit(train_dtm, y_train)
y_pred = nb.predict(test_dtm)
from sklearn.metrics import metrics
print metrics.accuracy_score(y_test, y_pred) #92% Accuracy
#Task 6
# Map five to 1 and 1 to 0
# The order matters: 1 -> 0 runs first, so the 1s written by the second
# assignment are not overwritten. y_test is modified in place.
y_test[y_test ==1] = 0
y_test[y_test == 5 ] = 1
# Second probability column - presumably the 5 class; confirm against
# nb.classes_.
y_pred_prob = nb.predict_proba(test_dtm)[:,1]
print metrics.roc_auc_score(y_test, y_pred_prob)
#Task 7
# Plot the ROC curve with both axes limited to [0, 1].
import matplotlib.pyplot as plt
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_pred_prob)
plt.plot(fpr, tpr)
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
# NOTE(review): this excerpt reads like the body of a per-fold loop (`fold`,
# `train_indices` and `test_indices` are defined outside it); the nesting
# below is reconstructed from the statements themselves - confirm against
# the full file.
X_train, y_train = X[train_indices], y[train_indices]
X_test, y_test = X[test_indices], y[test_indices]
n_repeat = 100
n_experts = 1
# One column of predictions per expert.
y_preds = numpy.zeros((len(y_test), n_experts), dtype=int)
for e in range(n_experts):
    # Each expert works on a sample drawn with replacement that is 15 times
    # the size of the training set.
    indices = numpy.random.choice(len(X_train), size=int(15.0*len(X_train)), replace=True)
    X_train_exp, y_train_exp = X_train[indices], y_train[indices]
    for k, x in enumerate(X_test):
        # Append n_repeat copies of the test point to the expert's data.
        X_matrix = numpy.tile([x], (n_repeat, 1))
        X_total = numpy.vstack((X_train_exp, X_matrix))
        y_total = numpy.hstack((y_train_exp, numpy.zeros(n_repeat)))
        value = 0
        # Try every class as the label of the copies and keep the class for
        # which `measure` is largest.
        for p in classes:
            y_total[-n_repeat:] = p
            curr_value = measure(X_total, y_total)
            if curr_value == 0 or curr_value == 1:
                print "OPS!"
            if curr_value > value:
                y_preds[k, e] = p
                value = curr_value
# Majority vote over the experts for each test point.
y_pred = numpy.asarray([numpy.bincount(row).argmax() for row in y_preds])
curr_accuracies[fold] = accuracy_score(y_test, y_pred)
# Trailing comma: the baseline score below is printed on the same line.
print curr_accuracies[fold],
numpy.random.seed(fold)
# Baseline for comparison; despite the name `curr_knn`, the model fitted
# here is a DecisionTreeClassifier.
curr_knn[fold] = DecisionTreeClassifier().fit(X_train, y_train).score(X_test, y_test)
print curr_knn[fold]
print curr_accuracies.mean()
print curr_knn.mean()
def print_classification_report(y_test_report, y_predicted_report, target_names):
    """Print the overall accuracy and the per-class classification report.

    Parameters
    ----------
    y_test_report
        True labels.
    y_predicted_report
        Predicted labels.
    target_names
        Display names for the classes, e.g. ``['class 0', 'class 1']``.

    Returns
    -------
    None
    """
    print("overall accuracy score of the classifier is")
    overall = accuracy_score(y_test_report, y_predicted_report)
    print(overall)

    # The report is computed on array copies of both label sequences.
    truth = np.array(y_test_report)
    guesses = np.array(y_predicted_report)
    print(classification_report(truth, guesses, target_names=target_names))
test_size=0.33, random_state=42)
# NOTE(review): the line above is the tail of a call that starts before this
# excerpt; it is left untouched.
from sklearn.ensemble import RandomForestClassifier
# it stalled with 1000000 (estimators, presumably)
# try with more parameters
classifier = RandomForestClassifier(n_estimators=100)
classifier.fit(X_train, y_train)
prediction = classifier.predict(X_test)
#print X_train.shape
from sklearn.metrics.metrics import precision_score, recall_score, confusion_matrix, classification_report, accuracy_score
print '\nAccuracy:', accuracy_score(y_test, prediction)
# NOTE(review): this score is computed on the training split, unlike the
# other metrics printed here.
print '\nscore:', classifier.score(X_train, y_train)
print '\nrecall:', recall_score(y_test, prediction)
print '\nprecision:', precision_score(y_test, prediction)
print '\n clasification report:\n', classification_report(y_test, prediction)
print '\n confussion matrix:\n', confusion_matrix(y_test, prediction)
#plots:
import matplotlib.pyplot as plt
# NOTE(review): confusion_matrix_plot is not passed to any plotting call in
# the visible code, and plt.xlabel() below is called without a label text -
# confirm whether a matshow/imshow call is missing.
confusion_matrix_plot = confusion_matrix(y_test, prediction)
plt.title('matriz de confusion')
plt.colorbar()
plt.xlabel()
plt.xlabel('categoria de verdad')
plt.ylabel('categoria predecida')
# Flatten every article's comments into one list of ASCII-decoded bodies
# (undecodable bytes are ignored).
processed_comment_list = [
    comm.body.decode('ascii', 'ignore')
    for art in commentList.items()
    for comm in art[1]
]
features = vectorizer.transform(processed_comment_list)

# Ground truth is the stored training vector followed by the test vector.
value_vector_prefix = feature_set_path + r'valueVector' + tag
y_train = load_numpy_matrix(value_vector_prefix + '_train.npy')
y_test = load_numpy_matrix(value_vector_prefix + '_test.npy')

for shape in (features.shape, y_train.shape, y_test.shape):
    print(shape)

valueVector = np.concatenate([y_train, y_test])
print('')
print(valueVector.shape)

# Score the classifier's predictions against the concatenated labels.
predicted = [float(v) for v in clf.predict(features)]
print("Accuracy: %0.3f " % (accuracy_score(valueVector, predicted)))
print(classification_report(valueVector, predicted, target_names=['0', '1']))
print(draw_confusion_matrix(valueVector, predicted, ['ham', 'spam']))
# NOTE(review): `total` and the corpus / label variables are defined outside
# this excerpt; it may be the body of a cross-validation loop, with the final
# print belonging after that loop - confirm against the full file.
# The vectorizer is fitted on the training corpus only.
X_train = vectorizer.fit_transform(corpus_train)
X_test = vectorizer.transform(corpus_test)
clf = RandomForestClassifier(n_estimators=10)
# Alternative classifiers that were tried:
#clf = KNeighborsClassifier(n_neighbors=10)
#clf = LinearSVC()
clf.fit(X_train, y_train)
print len(y_train)
print len(y_test)
pred = clf.predict(X_test)
# Constant-'0' baseline, disabled:
#pred = ['0']* len(y_test)
score = metrics.accuracy_score(y_test, pred)
print("accuracy: %0.3f" % score)
# Collect the score so the mean can be reported below.
total.append(score)
# n is only used by the disabled feature-listing code that follows.
n = 20
# feature_names = vectorizer.get_feature_names()
# coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
# top = zip(coefs_with_fns[:n], coefs_with_fns[:-(n + 1):-1])
# for (coef_1, fn_1), (coef_2, fn_2) in top:
# print "\t%.4f\t%-15s\t\t%.4f\t%-15s" % (coef_1, fn_1, coef_2, fn_2)
print np.mean(total)
# NOTE(review): the statements down to "# end loop" appear to be the tail of
# a read loop whose header lies before this excerpt; they are shown without
# the loop indentation.
# Each input line holds the tweet text and its sentiment separated by '|@~'.
lines = tLine.rstrip().split('|@~')
tweet = lines[0]
sentiment = lines[1]
processedTweet = processTweet(tweet)
featureVector = getFeatureVector(processedTweet)
testTweets.append((featureVector, sentiment))
tLine = tp.readline()
# end loop
# Train the SVM Classifier
result_train = getSVMFeatureVectorAndLabels(tweets, featureList)
result_test = getSVMFeatureVectorAndLabels(testTweets, featureList)
# Split the data into a training set and a test set
data_train = result_train['feature_vector']
target_train = result_train['labels']
data_test = result_test['feature_vector']
target_test = result_test['labels']
# Run SVM Classifier
SVMClassifier = svm.SVC(kernel='linear')
# Fit on the training data and predict the test data in one chained call.
target_pred = SVMClassifier.fit(data_train, target_train).predict(data_test)
targetNames = ['cessation', 'no cessation']
print "Classification by SVM Classifier"
print classification_report(target_test, target_pred, target_names=targetNames)
print confusion_matrix(target_test, target_pred)
print accuracy_score(target_test, target_pred)
#
features_train, labels_train, features_test, labels_test = makeTerrainData()


def submitAccuracies():
    """Return both test accuracies, each rounded to three decimals."""
    return {"acc_min_samples_split_2": round(acc_min_samples_split_2, 3),
            "acc_min_samples_split_50": round(acc_min_samples_split_50, 3)}


# ------------------------------ DECISION TREE ------------------------------
# Two decision tree classifiers are built, one with min_samples_split=2 and
# one with min_samples_split=50 (the third argument of classify() is
# presumably that setting - confirm in classifyDT). Their accuracies on the
# test data are stored in acc_min_samples_split_2 and
# acc_min_samples_split_50 respectively.
from classifyDT import classify
from sklearn.metrics.metrics import accuracy_score

clf = classify(features_train, labels_train, 50.0)
pred = clf.predict_proba(features_test)
# Turn the second probability column into hard labels by rounding.
roundedNumber = [round(p) for p in pred[:, 1]]
acc_min_samples_split_50 = accuracy_score(labels_test, roundedNumber)

clf = classify(features_train, labels_train, 2.0)
pred = clf.predict_proba(features_test)
# Same rounding as above: accuracy is defined on labels, and the raw
# probability column is only usable directly when every value happens to be
# exactly 0 or 1.
roundedNumber = [round(p) for p in pred[:, 1]]
acc_min_samples_split_2 = accuracy_score(labels_test, roundedNumber)

print(submitAccuracies())
def print_classification_report(y_test_report, y_predicted_report, target_names):
    """Print the overall accuracy followed by the per-class classification report.

    Args:
        y_test_report: true labels.
        y_predicted_report: labels predicted by the classifier.
        target_names: display names for the classes, passed through to
            ``classification_report`` (e.g. ``['class 0', 'class 1']``).

    Returns:
        None; the results are only written to stdout.
    """
    # Every print uses the single-argument call form so the function behaves
    # the same on Python 2 and Python 3.
    print("overall accuracy score of the classifier is")
    print(accuracy_score(y_test_report, y_predicted_report))
    print(classification_report(np.array(y_test_report),
                                np.array(y_predicted_report),
                                target_names=target_names))
    return None
def submitAccuracies():
    """Return both test accuracies, rounded to 3 decimals, keyed by setting.

    Reads the module-level ``acc_min_samples_split_2`` and
    ``acc_min_samples_split_50``, so it must only be called after both have
    been assigned below.
    """
    return {
        "acc_min_samples_split_2": round(acc_min_samples_split_2, 3),
        "acc_min_samples_split_50": round(acc_min_samples_split_50, 3),
    }


########################## DECISION TREE #################################

### Train two decision tree classifiers, one with min_samples_split=2 and one
### with min_samples_split=50, and store their accuracy on the testing data in
### acc_min_samples_split_2 and acc_min_samples_split_50 respectively.
from classifyDT import classify
from sklearn.metrics import accuracy_score

# The third argument is presumably forwarded as min_samples_split -- confirm in
# classifyDT. It is passed as an int because scikit-learn reads a float there
# as a fraction of the number of samples.
clf = classify(features_train, labels_train, 50)
# Score hard class labels from predict(); accuracy_score compares labels, so
# probabilities from predict_proba() only work when they happen to be 0 or 1.
pred = clf.predict(features_test)
acc_min_samples_split_50 = accuracy_score(labels_test, pred)

clf = classify(features_train, labels_train, 2)
pred = clf.predict(features_test)
acc_min_samples_split_2 = accuracy_score(labels_test, pred)

print(submitAccuracies())
# Snippet (line breaks lost): appends the last item vector and converts train_vectors to
# an array. When TRY_WITH_PREBUILD is set, a linear SVC (C=50.0) is fit on the first 20000
# vectors, its accuracy on vectors 20000-24999 is printed, and that classifier is deleted.
# A linear SVC (C=50.0) is then fit on all of train_vectors / train_targets.
# NOTE(review): with indentation lost it is unclear whether the final SVC fit sits inside
# the `else:` branch or after the whole if/else -- confirm against the original file.
train_vectors.append(item[1]) train_vectors = array(train_vectors) if TRY_WITH_PREBUILD: print('Now building a classifier for our initial test, how does it do on pre-computed vectors.') # The paper uses a neural network, whatever that is... clf = SVC(C=50.0, kernel='linear') # For our first test we use a subset of train data clf.fit(train_vectors[:20000], train_targets[:20000]) print('Without loading in new stuff, lets get an idea of what we can do.') predicted = clf.predict(train_vectors[20000:25000]) acc = metrics.accuracy_score(train_targets[20000:25000], predicted) print('Accuracy: ', str(acc * 100.0) + '%') del clf print('Now we got some new reviews coming in.\n' '\tBut before we read them lets rebuild the classifier with all available data.') else: print('Now building a classifier') clf = SVC(C=50.0, kernel='linear') clf.fit(train_vectors, train_targets) print('Extending vocab and building vectors for new labels') # Freeze the words,should only matter for dm (high inflection)? model_dm.train_words = False
# A pitch-perfect illustration of overfitting: look at the gulf between the training and
# CV scores. Training on more examples does lower the training score and raise the CV
# score, but exponentially more examples would be needed to close the gap. Let's confirm
# this understanding by looking at the test scores.

# In[23]:

# See how the model performs on the test set. The test set is never trained on; it is
# only used to judge how well the model generalizes.

# fit() trains the estimator on the training split so that it can predict; nothing is
# fitted on X_test / y_test.
estimator.fit(X_train, y_train)
y_pred = estimator.predict(X_test)
print(metrics.classification_report(y_test, y_pred))
print("Decision Trees: Final Generalization Accuracy: %.6f"
      % metrics.accuracy_score(y_test, y_pred))

# That's not too bad, but a much better result is possible once the overfitting problem
# is addressed. Next, try the random forests classifier to see how it does.

# In[25]:

# WARNING - THIS MIGHT TAKE A WHILE TO RUN. Try adjusting parameters such as n_jobs (jobs
# to run in parallel; make sure your system can handle it before increasing this),
# n_iter for ShuffleSplit (in the function definition), and reducing the number of values
# being tried for max_depth / n_estimators.
# SELECT INTERRUPT IN THE MENU AND PRESS INTERRUPT KERNEL IF YOU NEED TO STOP EXECUTION.
# NOTE(review): np.linspace(5, 10, 5) yields non-integer depths (6.25, 7.5, 8.75) --
# confirm the estimator these values are fed to accepts float max_depth.
max_depth = np.linspace(5, 10, 5)
n_estimators = [10, 100, 1000]
# Snippet (line breaks lost): converts train_vectors to an array. When TRY_WITH_PREBUILD
# is set, a linear SVC (C=50.0) is fit on the first 20000 vectors, its accuracy on vectors
# 20000-24999 is printed, and that classifier is deleted. A linear SVC (C=50.0) is then
# fit on all of train_vectors / train_targets.
# NOTE(review): with indentation lost it is unclear whether the final SVC fit sits inside
# the `else:` branch or after the whole if/else -- confirm against the original file.
train_vectors = array(train_vectors) if TRY_WITH_PREBUILD: print( 'Now building a classifier for our initial test, how does it do on pre-computed vectors.' ) # The paper uses a neural network, whatever that is... clf = SVC(C=50.0, kernel='linear') # For our first test we use a subset of train data clf.fit(train_vectors[:20000], train_targets[:20000]) print('Without loading in new stuff, lets get an idea of what we can do.') predicted = clf.predict(train_vectors[20000:25000]) acc = metrics.accuracy_score(train_targets[20000:25000], predicted) print('Accuracy: ', str(acc * 100.0) + '%') del clf print( 'Now we got some new reviews coming in.\n' '\tBut before we read them lets rebuild the classifier with all available data.' ) else: print('Now building a classifier') clf = SVC(C=50.0, kernel='linear') clf.fit(train_vectors, train_targets) print('Extending vocab and building vectors for new labels') # Freeze the words,should only matter for dm (high inflection)?
# Per-fold step (line breaks lost): for the 'age' / 'gender' modes a RandomForestClassifier
# with opts.estimators trees and class_weight=weight is fit on the fold's training split,
# the fold accuracy is reported through verbose(), and the true labels and predictions are
# appended to y_ / prediction_. For any other mode a RandomForestRegressor is fit instead.
if opts.mode in ['age', 'gender']: # Preparing the learning machine verbose(" Training fold (%i)" % (i + 1)) from sklearn.ensemble import RandomForestClassifier classifier = RandomForestClassifier(n_estimators=opts.estimators, class_weight=weight) # Learning classifier.fit(X_train, y_train) # Predicting verbose(" Predicting fold (%i)" % (i + 1)) prediction = classifier.predict(X_test) verbose(' Accuracy fold (%i):' % (i + 1), accuracy_score(y_test, prediction)) y_.extend(y_test) prediction_.extend(prediction) else: # Preparing the learning machine verbose(" Regressing fold (%i)" % (i + 1)) from sklearn.ensemble import RandomForestRegressor regressor = RandomForestRegressor(n_estimators=opts.estimators) # Learning regressor.fit(X_train, y_train) # Predicting verbose(" Predicting fold (%i)" % (i + 1)) prediction = regressor.predict(X_test)
# Fit the vectorizer on the training documents only; the test documents are
# transformed with what was learned from the training set.
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

from sklearn.linear_model import SGDClassifier, LogisticRegression
from sklearn.metrics import classification_report

# Train one binary classifier per label (first three of good_categories) and
# collect, for each label, the true and predicted membership vectors.
y_true_all = []
predictions_all = []
for label in good_categories[:3]:
    print('label %s' % (label,))
    # An instance is a positive example when its label collection contains `label`.
    y_train = [1 if label in instance else 0 for instance in y_train_all]
    y_test = [1 if label in instance else 0 for instance in y_test_all]
    y_true_all.append(y_test)

    classifier = LogisticRegression()
    # fit(), not fit_transform(): only the trained model is needed here, and
    # fit_transform() on LogisticRegression is gone from current scikit-learn.
    classifier.fit(X_train, y_train)
    predictions = classifier.predict(X_test)
    predictions_all.append(predictions)

    print(classification_report(y_test, predictions))
    print(confusion_matrix(y_test, predictions))
    print('precision %s' % (precision_score(y_test, predictions),))
    print('recall %s' % (recall_score(y_test, predictions),))
    print('accuracy %s' % (accuracy_score(y_test, predictions),))
    print('\n')

# Rows are labels, columns are test instances; the Hamming loss is the fraction
# of (label, instance) cells that were predicted wrongly.
y_true_all = np.array(y_true_all)
predictions_all = np.array(predictions_all)
print(hamming_loss(y_true_all, predictions_all))
# The first two thirds (relative to numironicos) of the tweets in `reader` go
# to training, the rest to testing; all of them carry the "noironia" label.
# `//` keeps the index an integer on both Python 2 and Python 3.
split_at = 2 * (numironicos // 3)
for tweet in reader[:split_at]:
    tweets_train.append(tweet["text"])
    labels_train.append("noironia")
for tweet in reader[split_at:]:
    tweets_test.append(tweet["text"])
    labels_test.append("noironia")

# One stop word per line; the context manager closes the file even if reading fails.
with open("spanish.txt") as stop_file:
    stop_words = [line.strip() for line in stop_file]

y_train = np.array(labels_train, dtype=object)
y_test = np.array(labels_test, dtype=object)

# TF-IDF features are fit on the training tweets only and reused for the test tweets.
vectorizer = TfidfVectorizer(input='content', max_df=0.5, stop_words=stop_words)
X_train = vectorizer.fit_transform(np.array(tweets_train, dtype=object))
X_test = vectorizer.transform(np.array(tweets_test, dtype=object))

classifier = RandomForestClassifier(n_estimators=10)
classifier.fit(X_train.toarray(), y_train)
prediction = classifier.predict(X_test.toarray())

# NOTE(review): the labels are strings, while precision/recall/f1 are called
# with their default positive label -- confirm the installed scikit-learn
# accepts that, or pass pos_label explicitly.
print('\nAccuracy : %s' % (accuracy_score(y_test, prediction),))
print('\nPrecision : %s' % (precision_score(y_test, prediction),))
print('\nRecall : %s' % (recall_score(y_test, prediction),))
print('\nF-score : %s' % (f1_score(y_test, prediction),))
print('\nClasification report:\n%s' % (classification_report(y_test, prediction),))
print('\nConfussion matrix :\n%s' % (confusion_matrix(y_test, prediction),))
def main(argv):
    """Evaluate a saved classifier on a test file; write predictions and a ROC plot.

    Command-line options (all are required):
        -d  training data file (tab-separated, first row is a header)
        -o  output folder for the prediction table and the ROC figure
        -c  index of the label column
        -C  comma-separated indices of the feature columns
        -t  test file (whole genome prediction file), read with the same columns
        -m  model file, loaded with joblib

    Args:
        argv: option list, without the program name.

    Exits with status 2 when an option is unknown or a required one is missing.
    """
    usage = ("usage: -d <data_file> -o <out_folder> -c <label_col> "
             "-C <col,col,...> -t <test_file> -m <model_file>\n")

    # get options passed at command line
    try:
        opts, _ = getopt.getopt(argv, "d:o:c:C:t:m:")
    except getopt.GetoptError as err:
        sys.stderr.write("%s\n%s" % (err, usage))
        sys.exit(2)

    data_file = out_folder = label_col = None
    data_cols = test_file = model_file = None
    for opt, arg in opts:
        if opt == '-d':
            data_file = arg
        elif opt == '-o':
            out_folder = arg
        elif opt == '-c':
            label_col = int(arg)
        elif opt == '-C':
            data_cols = arg
        elif opt == '-t':
            test_file = arg  # Whole genome prediction file
        elif opt == '-m':
            model_file = arg

    # Fail early with a readable message instead of an UnboundLocalError later.
    required = [('-d', data_file), ('-o', out_folder), ('-c', label_col),
                ('-C', data_cols), ('-t', test_file), ('-m', model_file)]
    missing = [flag for flag, value in required if value is None]
    if missing:
        sys.stderr.write("missing required option(s): %s\n%s"
                         % (", ".join(missing), usage))
        sys.exit(2)

    model_filename = os.path.abspath(model_file)
    data_file = os.path.abspath(data_file)
    test_file = os.path.abspath(test_file)
    print("%s \n" % (model_file,))

    data_cols = [int(x) for x in data_cols.split(",")]

    # The training features are loaded only because the scaling step needs
    # them; the training labels are not used anywhere in this evaluation.
    x_data = np.loadtxt(data_file, usecols=data_cols, delimiter="\t", skiprows=1)
    test_x_data = np.loadtxt(test_file, usecols=data_cols, delimiter="\t",
                             skiprows=1)
    test_y_data = np.genfromtxt(test_file, usecols=label_col, delimiter="\t",
                                skip_header=1)

    # Load the model file.
    # NOTE: joblib files are pickles; only load model files from a trusted source.
    estimator = joblib.load(model_filename)

    # perform same scaling on training and testing data
    x_data, test_x_data = scaling_training_testing_data(x_data, test_x_data)

    # Shuffle the test rows with a fixed seed so that runs are reproducible.
    np.random.seed(0)
    indices = np.random.permutation(len(test_x_data))
    test_x_data = test_x_data[indices]
    test_y_data = test_y_data[indices]

    # Column 0 of the test file is read separately as the enhancer names and
    # put in the same shuffled order as the features and labels.
    cols = 0
    with open(test_file, "r") as temp:
        a = '\n'.join(line.strip("\n") for line in temp)
    b = np.genfromtxt(StringIO(a), usecols=cols, delimiter="\t", dtype=None,
                      skip_header=1)
    enhancer_names_test = b[indices]

    y_FAN_pred = estimator.predict(test_x_data)
    y_score_test = estimator.predict_proba(test_x_data)

    # Computed once; both are printed several times below.
    report = metrics.classification_report(test_y_data, y_FAN_pred)
    accuracy = metrics.accuracy_score(test_y_data, y_FAN_pred)

    print(report)

    combined_test = zip(enhancer_names_test, test_y_data, y_FAN_pred,
                        y_score_test[:, 0], y_score_test[:, 1])
    # The context manager closes the table even if writing a row fails.
    with open(out_folder + "/GM12878_FANTOM_RF_FeatureSelected_ROC.txt", 'w') as f:
        f.write("Enhancer_name\tY_true_labels\tY_predicted_labels\tProb_Class0\tProb_class1\n")
        for row in combined_test:
            f.write('\t'.join(str(x) for x in row) + '\n')

    print("Random Forests: On FANTOM, Final Generalization Accuracy: %.6f" % accuracy)
    print(report)
    print("Number of mislabeled points : %d" % (test_y_data != y_FAN_pred).sum())
    print(report)
    print("Random Forests: Final Generalization Accuracy: %.6f" % accuracy)

    # feature_importances_ is a key parameter that RF returns: it tells which
    # features in the dataset seemed to matter the most.
    print(estimator.feature_importances_)

    # Plot ROC
    roc_plt = plot_roc(estimator, test_x_data, test_y_data, y_FAN_pred)
    pl.savefig(out_folder + "/GM12878_FANTOM_RF_FeatureSelected_ROC.svg",
               transparent=True, bbox_inches='tight', pad_inches=0.2)
    roc_plt.show()
from sklearn.ensemble import RandomForestClassifier classifier=RandomForestClassifier(n_estimators=10000, criterion='entropy') #classifier = SVC(C=10, kernel='linear', #gamma=10, coef0=0.0, shrinking=True, #probability=False, tol=0.001, cache_size=20000, #class_weight='auto', verbose=False, max_iter=-1, #random_state=None) # Aprendiendo classifier.fit(X_train, y_train) # Prediciendo verbose(" Predicting fold (%i)"%(i+1)) prediction = classifier.predict(X_test) verbose(' Accuracy fold (%i):'%(i+1), accuracy_score(y_test, prediction)) y_.extend(y_test) prediction_.extend(prediction) else: # Preparando la máquina de aprendizaje verbose(" Regressing fold (%i)"%(i+1)) from sklearn.ensemble import RandomForestRegressor from sklearn.svm import SVR #regressor=RandomForestRegressor(n_estimators=opts.estimators) regressor = SVR(kernel='linear', degree=3, gamma=1.0, coef0=1.0, tol=0.001, C=10, epsilon=0.1, shrinking=True, probability=False , cache_size=200, verbose=False, max_iter=-1, random_state=None) # Aprendiendo