def auc_score(self, ground_truth, predictions, **kwargs):
    """
    Calculate the AUC score for this particular trial.
    This will also calculate the F scores and ROC curves.

    Args:
        ground_truth: vector of class labels
        predictions: vector of predicted class probabilities

    Returns:
        AUC score for this trial
    """
    # binarize the positive-class probabilities at 0.5
    # (scipy.stats.threshold was removed in SciPy 1.0; a plain numpy
    # comparison is equivalent to the old two-step thresholding)
    thresholded = (predictions[:, 1] > 0.5).astype(int)
    fhalf_score = metrics.fbeta_score(ground_truth.astype(int), thresholded, beta=0.5)
    f2_score = metrics.fbeta_score(ground_truth.astype(int), thresholded, beta=2)
    f1_score = metrics.fbeta_score(ground_truth.astype(int), thresholded, beta=1)
    # calculate ROC curve and AUC
    fpr, tpr, _ = metrics.roc_curve(ground_truth, predictions[:, 1])
    area = metrics.auc(fpr, tpr)
    self.fhalf_scores_.append(fhalf_score)
    self.f2scores_.append(f2_score)
    self.f1scores_.append(f1_score)
    self.rates_.append((fpr, tpr))
    self.aucs_.append(area)
    return area
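# Aside, a minimal sanity check (assuming only numpy): the vectorized
# comparison used above reproduces the old two-step scipy.stats.threshold
# logic, mapping probabilities at or below 0.5 to 0 and above 0.5 to 1.
import numpy as np

probs = np.array([0.10, 0.50, 0.51, 0.99])
assert ((probs > 0.5).astype(int) == np.array([0, 0, 1, 1])).all()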
def evaluate_model(y, y_pred, y_pred_prob, label, statistics, uniprot=None, verbose=0):
    y_pred_prob_1 = [x[1] for x in y_pred_prob]
    if uniprot:
        for u, p1, p2 in zip(uniprot, y, y_pred_prob_1):
            print("\t\t\tResult for {}, {} \n\t\t\t\tTrue: \t{} ||| Pred: \t{}".format(label, u, p1, p2))
    label_stats = compute_label_statistics(y, y_pred, labels=[0, 1])
    statistics.update_statistics(label, 'Accuracy', accuracy_score(y, y_pred))
    statistics.update_statistics(label, 'F (beta=0.5)', fbeta_score(y, y_pred, beta=0.5, labels=[0, 1], average='binary'))
    statistics.update_statistics(label, 'F (beta=1)', fbeta_score(y, y_pred, beta=1.0, labels=[0, 1], average='binary'))
    statistics.update_statistics(label, 'Specificity', label_stats[1]['specificity'])
    statistics.update_statistics(label, 'Recall', label_stats[1]['sensitivity'])
    statistics.update_statistics(label, 'Precision', label_stats[1]['precision'])
    statistics.update_statistics(label, 'FDR', label_stats[1]['fdr'])
    try:
        statistics.update_statistics(label, 'ROC-AUC', roc_auc_score(y, y_pred, average="weighted"))
    except (ValueError, AssertionError):
        # keep the key consistent with the success path
        statistics.update_statistics(label, 'ROC-AUC', 0.0)
    try:
        pr_auc = average_precision_score(y, y_pred, average="weighted")
        if np.isnan(pr_auc):
            pr_auc = 0.0
        statistics.update_statistics(label, 'PR-AUC', pr_auc)
    except (ValueError, AssertionError):
        statistics.update_statistics(label, 'PR-AUC', 0.0)
    if verbose:
        statistics.print_statistics(label)
    return statistics
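# Hedged aside (scikit-learn assumed): ROC-AUC and PR-AUC are ranking metrics,
# so they are usually computed from the positive-class probability rather than
# from hard label predictions. A minimal sketch of that variant, reusing the
# y_pred_prob_1 vector built in the function above:
from sklearn.metrics import average_precision_score, roc_auc_score

def probability_aucs(y, y_pred_prob_1):
    return (roc_auc_score(y, y_pred_prob_1),
            average_precision_score(y, y_pred_prob_1))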
def test_sample_order_invariance(): y_true, y_pred, _ = make_prediction(binary=True) y_true_shuffle, y_pred_shuffle = shuffle(y_true, y_pred, random_state=0) for metric in [accuracy_score, hamming_loss, zero_one_loss, lambda y1, y2: zero_one_loss(y1, y2, normalize=False), precision_score, recall_score, f1_score, lambda y1, y2: fbeta_score(y1, y2, beta=2), lambda y1, y2: fbeta_score(y1, y2, beta=0.5), matthews_corrcoef, mean_absolute_error, mean_squared_error, explained_variance_score, r2_score]: assert_almost_equal(metric(y_true, y_pred), metric(y_true_shuffle, y_pred_shuffle), err_msg="%s is not sample order invariant" % metric)
def test_precision_recall_f1_score_with_an_empty_prediction(): y_true = np.array([[0, 1, 0, 0], [1, 0, 0, 0], [0, 1, 1, 0]]) y_pred = np.array([[0, 0, 0, 0], [0, 0, 0, 1], [0, 1, 1, 0]]) # true_pos = [ 0. 1. 1. 0.] # false_pos = [ 0. 0. 0. 1.] # false_neg = [ 1. 1. 0. 0.] p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None) assert_array_almost_equal(p, [0.0, 1.0, 1.0, 0.0], 2) assert_array_almost_equal(r, [0.0, 0.5, 1.0, 0.0], 2) assert_array_almost_equal(f, [0.0, 1 / 1.5, 1, 0.0], 2) assert_array_almost_equal(s, [1, 2, 1, 0], 2) f2 = fbeta_score(y_true, y_pred, beta=2, average=None) support = s assert_array_almost_equal(f2, [0, 0.55, 1, 0], 2) p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="macro") assert_almost_equal(p, 0.5) assert_almost_equal(r, 1.5 / 4) assert_almost_equal(f, 2.5 / (4 * 1.5)) assert_equal(s, None) assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, average="macro"), np.mean(f2)) p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="micro") assert_almost_equal(p, 2 / 3) assert_almost_equal(r, 0.5) assert_almost_equal(f, 2 / 3 / (2 / 3 + 0.5)) assert_equal(s, None) assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, average="micro"), (1 + 4) * p * r / (4 * p + r)) p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="weighted") assert_almost_equal(p, 3 / 4) assert_almost_equal(r, 0.5) assert_almost_equal(f, (2 / 1.5 + 1) / 4) assert_equal(s, None) assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, average="weighted"), np.average(f2, weights=support)) p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average="samples") # |h(x_i) inter y_i | = [0, 0, 2] # |y_i| = [1, 1, 2] # |h(x_i)| = [0, 1, 2] assert_almost_equal(p, 1 / 3) assert_almost_equal(r, 1 / 3) assert_almost_equal(f, 1 / 3) assert_equal(s, None) assert_almost_equal(fbeta_score(y_true, y_pred, beta=2, average="samples"), 0.333, 2)
def build_metric(self, logs):
    # the beta=0.5 variant is keyed as 'val_f05' so it does not shadow 'val_f1'
    return {
        'val_loss': lambda y, y_hat: logs['val_loss'],
        'val_acc': lambda y, y_hat: logs['val_acc'],
        'val_f1': f1_score,
        'val_f05': lambda y, y_hat: fbeta_score(y, y_hat, beta=0.5),
        'val_f2': lambda y, y_hat: fbeta_score(y, y_hat, beta=2),
    }[self.metric_name]
def test_fbeta_score(self): result = self.df.metrics.fbeta_score(beta=0.5, average='weighted') expected = metrics.fbeta_score(self.target, self.pred, beta=0.5, average='weighted') self.assertEqual(result, expected) result = self.df.metrics.fbeta_score(beta=0.5, average='macro') expected = metrics.fbeta_score(self.target, self.pred, beta=0.5, average='macro') self.assertEqual(result, expected) result = self.df.metrics.fbeta_score(beta=0.5, average=None) expected = metrics.fbeta_score(self.target, self.pred, beta=0.5, average=None) self.assertTrue(isinstance(result, pdml.ModelSeries)) self.assert_numpy_array_almost_equal(result.values, expected)
def train_predict(learner, sample_size, X_train, y_train, X_test, y_test):
    '''
    inputs:
       - learner: the learning algorithm to be trained and predicted on
       - sample_size: the size of samples (number) to be drawn from training set
       - X_train: features training set
       - y_train: income training set
       - X_test: features testing set
       - y_test: income testing set
    '''
    results = {}

    # Fit the learner to the training data using slicing with 'sample_size'
    start = time()  # Get start time
    learner.fit(X_train[:sample_size], y_train[:sample_size])
    end = time()  # Get end time

    # Calculate the training time
    results['train_time'] = end - start

    # Get the predictions on the test set,
    # then get predictions on the first 300 training samples
    start = time()  # Get start time
    predictions_test = learner.predict(X_test)
    predictions_train = learner.predict(X_train[:300])
    end = time()  # Get end time

    # Calculate the total prediction time
    results['pred_time'] = end - start

    # Compute accuracy on the first 300 training samples
    results['acc_train'] = accuracy_score(y_train[:300], predictions_train[:300])

    # Compute accuracy on test set
    results['acc_test'] = accuracy_score(y_test, predictions_test)

    # Compute F-score on the first 300 training samples
    results['f_train'] = fbeta_score(y_train[:300], predictions_train[:300], beta=0.5)

    # Compute F-score on the test set
    results['f_test'] = fbeta_score(y_test, predictions_test, beta=0.5)

    # Success
    print("{} trained on {} samples.".format(learner.__class__.__name__, sample_size))

    # Return the results
    return results
def testGetMetrics(self):
    negative_class = 0
    positive_class = 1
    # numpy's random_integers is deprecated; randint uses an exclusive
    # upper bound, hence positive_class + 1 (assumes `random` is numpy.random)
    actual = random.randint(negative_class, positive_class + 1, 100)
    judgments = [positive_class] * len(actual)
    beta = 2.0
    expected_metrics = []
    for i in range(len(actual)):
        expected_metrics.append(fbeta_score(actual, judgments, beta=beta))
        judgments[i] = negative_class
        expected_metrics.append(fbeta_score(actual, judgments, beta=beta))
    actual_metrics = getMetrics(actual, positive_class, beta)
    self.assertEqual(expected_metrics, actual_metrics)
def evaluate_model(preds, testy):
    accuracy = metrics.accuracy_score(testy, preds)
    precision = metrics.precision_score(testy, preds)
    recall = metrics.recall_score(testy, preds)
    F1 = metrics.f1_score(testy, preds)
    Fbeta = metrics.fbeta_score(testy, preds, beta=2)  # weighting recall more strongly than precision
    print("Model summary: accuracy -", accuracy, "precision -", precision,
          "recall -", recall, "Fbeta -", Fbeta, "F1 -", F1)
def _created_model(self, X, Y, indices, i, model):
    # to assign an F-score weight to each classifier,
    # sample another subset of the data and use the model
    # we just trained to generate predictions
    beta = self.weighting
    n = X.shape[0]
    bagsize = len(indices)
    if beta or self.verbose:
        # np.random.random_integers is deprecated; randint's upper bound is exclusive
        error_sample_indices = np.random.randint(0, n, bagsize)
        error_subset = X[error_sample_indices, :]
        if self.feature_subsets:
            error_subset = error_subset[:, self.feature_subsets[i]]
        error_labels = Y[error_sample_indices]
        y_pred = model.predict(error_subset)
    if self.weighting:
        f_score = fbeta_score(error_labels, y_pred, beta=beta)
        self.weights[i] = f_score
    if self.verbose:
        print("Actual non-zero:", np.sum(error_labels != 0))
        num_pred_nz = np.sum(y_pred != 0)
        print("Predicted non-zero:", num_pred_nz)
        pred_correct = (y_pred == error_labels)
        pred_nz = (y_pred != 0)
        num_true_nz = np.sum(pred_correct & pred_nz)
        print("True non-zero:", num_true_nz)
        print("False non-zero:", num_pred_nz - num_true_nz)
        print("---")
def plot_precision_recall(performance_df, model, ax=None, beta=0.1):
    ax = ax or plt.gca()

    if isinstance(model, CalibratedClassifierCV):
        model = model.base_estimator

    thresholds = np.linspace(0, 1, model.n_estimators + 2)
    precision = []
    recall = []
    f_beta = []

    ax.axvline(0, color='lightgray')
    ax.axvline(1, color='lightgray')
    ax.axhline(0, color='lightgray')
    ax.axhline(1, color='lightgray')

    for threshold in thresholds:
        prediction = (performance_df.probabilities.values >= threshold).astype('int')
        label = performance_df.label.values

        precision.append(metrics.precision_score(label, prediction))
        recall.append(metrics.recall_score(label, prediction))
        f_beta.append(metrics.fbeta_score(label, prediction, beta=beta))

    ax.plot(thresholds, precision, label='precision')
    ax.plot(thresholds, recall, label='recall')
    ax.plot(thresholds, f_beta, label='$f_{{{:.2f}}}$'.format(beta))

    ax.legend()
    ax.set_xlabel('prediction threshold')
    ax.figure.tight_layout()
def main(data_module): """Load data, train model and evaluate it.""" data = data_module.load_data() model = create_model(data_module.n_classes, (data['x_train'].shape[1], )) print(model.summary()) optimizer = get_optimizer({'optimizer': {'initial_lr': 0.001}}) model.compile(loss='binary_crossentropy', optimizer=optimizer, metrics=[precision, recall, f1, accuracy]) t0 = time.time() model.fit(data['x_train'], data['y_train'], batch_size=32, epochs=30, validation_data=(data['x_test'], data['y_test']), shuffle=True, # callbacks=callbacks ) t1 = time.time() # res = get_tptnfpfn(model, data) preds = model.predict(data['x_test']) preds[preds >= 0.5] = 1 preds[preds < 0.5] = 0 t2 = time.time() print(("{clf_name:<30}: {acc:0.2f}% {f1:0.2f}% in {train_time:0.2f}s " "train / {test_time:0.2f}s test") .format(clf_name="MLP", acc=(accuracy_score(y_true=data['y_test'], y_pred=preds) * 100), f1=(fbeta_score(y_true=data['y_test'], y_pred=preds, beta=1, average="weighted") * 100), train_time=(t1 - t0), test_time=(t2 - t1)))
def test_precision_recall_f1_score_binary(): """Test Precision Recall and F1 Score for binary classification task""" y_true, y_pred, _ = make_prediction(binary=True) # detailed measures for each class p, r, f, s = precision_recall_fscore_support(y_true, y_pred, average=None) assert_array_almost_equal(p, [0.73, 0.85], 2) assert_array_almost_equal(r, [0.88, 0.68], 2) assert_array_almost_equal(f, [0.80, 0.76], 2) assert_array_equal(s, [25, 25]) # individual scoring function that can be used for grid search: in the # binary class case the score is the value of the measure for the positive # class (e.g. label == 1) ps = precision_score(y_true, y_pred) assert_array_almost_equal(ps, 0.85, 2) rs = recall_score(y_true, y_pred) assert_array_almost_equal(rs, 0.68, 2) fs = f1_score(y_true, y_pred) assert_array_almost_equal(fs, 0.76, 2) assert_almost_equal(fbeta_score(y_true, y_pred, beta=2), (1 + 2 ** 2) * ps * rs / (2 ** 2 * ps + rs), 2)
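# A small numeric check (numpy and scikit-learn assumed) of the identity the
# test above relies on: F_beta = (1 + beta^2) * P * R / (beta^2 * P + R).
import numpy as np
from sklearn.metrics import fbeta_score, precision_score, recall_score

yt = np.array([0, 0, 1, 1, 1, 0, 1, 0])
yp = np.array([0, 1, 1, 1, 0, 0, 1, 1])
p, r, beta = precision_score(yt, yp), recall_score(yt, yp), 2.0
assert np.isclose(fbeta_score(yt, yp, beta=beta),
                  (1 + beta ** 2) * p * r / (beta ** 2 * p + r))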
def optimize_lda(corpus, dictionary, train, test, topics=5, max_topics=1200):
    # train and test set must be lists of review-label pairs
    '''runs lda with an increasing number of topics to optimize the topic count;
    runs a decision-tree classifier on each new lda model and records its
    F2 score in accuracy_list'''
    accuracy_list = []
    while topics <= max_topics:
        start_time = time.perf_counter()  # time.clock() was removed in Python 3.8
        lda = ldamodel.LdaModel(corpus, id2word=dictionary, num_topics=topics)
        print('lda!')
        x_train, y_train = topic_vector(train, lda, dictionary)
        classifier = tree.DecisionTreeClassifier()
        classifier.fit(x_train, y_train)
        print('classified!')
        x_test, y_test = topic_vector(test, lda, dictionary)
        y_pred = list(classifier.predict(x_test))
        print('predicted!')
        accuracy = metrics.fbeta_score(y_test, y_pred, beta=2)  # F2 score
        accuracy_list.append([accuracy, topics])
        print('accuracy! ', accuracy)
        #confusion = metrics.confusion_matrix(y_test, y_pred)
        #print('accuracy ', metrics.accuracy_score(y_test, y_pred))
        #print('precision ', metrics.precision_score(y_test, y_pred))
        #print('recall ', metrics.recall_score(y_test, y_pred))
        #topics += 50
        if topics < 100:
            topics += 25
        else:
            topics += 100
        end_time = time.perf_counter()
        print('lda_%s time: %s' % (topics, (end_time - start_time)))
        #print(y_test)
        #print(y_pred)
    #save_thing(accuracy_list, 'accuracy')
    return accuracy_list
def kfold_cv(cls, xs, ys, k):
    from sklearn import metrics
    pk = len(xs) // k  # integer fold size, so the slice bounds stay ints
    prec = []
    rec = []
    for i in range(k):
        ki = pk * i
        kj = pk * (i + 1)
        xs_train = np.concatenate((xs[:ki, :], xs[kj:, :]))
        ys_train = np.concatenate((ys[:ki], ys[kj:]))
        xs_test = xs[ki:kj]
        ys_test = ys[ki:kj]
        if (ys_test == 1).sum() == 0:
            continue
        cls.fit(xs_train, ys_train)
        # score = cls.score(xs_test, ys_test)
        # # print('{}: {}'.format(i, score))
        # avg_score += score
        ys_pred = cls.predict(xs_test)
        # # print(metrics.precision_score(ys_test, ys_pred, pos_label=0),
        # #       metrics.recall_score(ys_test, ys_pred, pos_label=0))
        # fpr, tpr, thresholds = metrics.roc_curve(ys_test, ys_pred)
        # print(metrics.auc(fpr, tpr))
        # print(metrics.roc_auc_score(ys_test, ys_pred))
        # print(metrics.precision_recall_curve(ys_test, ys_pred, pos_label=1))
        print(metrics.fbeta_score(ys_test, ys_pred, beta=0.5))
        # (correct,) = (ys_test == ys_pred).sum(),
        # total = len(xs_test)
        # avg_score += float(correct) / total
        prec.append(metrics.f1_score(ys_test, ys_pred))
        rec.append(metrics.recall_score(ys_test, ys_pred))
    return (np.array(prec), np.array(rec))
def run(y_true, y_pred): perf = {} perf['accuracy'] = accuracy_score(y_true, y_pred) perf['precision'] = precision_score(y_true, y_pred, average='micro') perf['recall'] = recall_score(y_true, y_pred, average='micro') perf['fbeta_score'] = fbeta_score(y_true, y_pred, average='macro', beta=1.0) perf['hamming_loss'] = hamming_loss(y_true, y_pred) perf['cm'] = confusion_matrix(y_true, y_pred) return perf
def test_symmetry():
    """Test the symmetry of score and loss functions"""
    y_true, y_pred, _ = make_prediction(binary=True)

    # Symmetric metrics
    for metric in [accuracy_score,
                   lambda y1, y2: accuracy_score(y1, y2, normalize=False),
                   zero_one_loss,
                   lambda y1, y2: zero_one_loss(y1, y2, normalize=False),
                   hamming_loss,
                   f1_score,
                   matthews_corrcoef,
                   mean_squared_error,
                   mean_absolute_error]:
        assert_equal(metric(y_true, y_pred),
                     metric(y_pred, y_true),
                     msg="%s is not symmetric" % metric)

    # Not symmetric metrics
    for metric in [precision_score,
                   recall_score,
                   lambda y1, y2: fbeta_score(y1, y2, beta=0.5),
                   lambda y1, y2: fbeta_score(y1, y2, beta=2),
                   explained_variance_score,
                   r2_score]:
        assert_true(metric(y_true, y_pred) != metric(y_pred, y_true),
                    msg="%s seems to be symmetric" % metric)

    # Deprecated metrics
    with warnings.catch_warnings(record=True):  # catch deprecation warnings
        assert_equal(zero_one(y_true, y_pred),
                     zero_one(y_pred, y_true))
        assert_equal(zero_one(y_true, y_pred, normalize=False),
                     zero_one(y_pred, y_true, normalize=False))
        assert_equal(zero_one_score(y_true, y_pred),
                     zero_one_score(y_pred, y_true))
def evaluate_crf_model(x, y, estimator, labels, uniprot=None, verbose=0):
    y_pred = np.asarray(estimator.predict(x))
    statistics = Statistics()
    statistics.update_statistics('all_labels', 'accuracy', estimator.score(x, y))
    bin_labels = [0, 1]
    for i, l in enumerate(labels):
        y_true_binary_l = y[:, i].astype(int)
        y_pred_binary_l = y_pred[:, i].astype(int)
        label_stats = compute_label_statistics(y_true_binary_l, y_pred_binary_l, labels=bin_labels)
        statistics.update_statistics(l, 'Accuracy', accuracy_score(y_true_binary_l, y_pred_binary_l))
        statistics.update_statistics(l, 'Specificity', label_stats[1]['specificity'])
        statistics.update_statistics(l, 'Recall', label_stats[1]['sensitivity'])
        statistics.update_statistics(l, 'Precision', label_stats[1]['precision'])
        statistics.update_statistics(l, 'FDR', label_stats[1]['fdr'])
        statistics.update_statistics(l, 'F-Score (beta=0.5)', fbeta_score(
            y_true_binary_l, y_pred_binary_l, beta=0.5, labels=bin_labels, average='binary'
        ))
        statistics.update_statistics(l, 'F-Score (beta=1)', fbeta_score(
            y_true_binary_l, y_pred_binary_l, beta=1.0, labels=bin_labels, average='binary'
        ))
        # 'binary' is not a valid `average` for the AUC metrics, so the
        # default ('macro') is used; otherwise the try blocks always fail
        try:
            roc_auc = roc_auc_score(y_true_binary_l, y_pred_binary_l)
            statistics.update_statistics(l, 'ROC-AUC', roc_auc)
        except (ValueError, AssertionError):
            statistics.update_statistics(l, 'ROC-AUC', np.nan)
        try:
            pr_auc = average_precision_score(y_true_binary_l, y_pred_binary_l)
            statistics.update_statistics(l, 'PR-AUC', pr_auc)
        except (ValueError, AssertionError):
            statistics.update_statistics(l, 'PR-AUC', np.nan)
    if verbose:
        for l in labels:
            statistics.print_statistics(l)
    if uniprot and verbose:
        for u, p1, p2 in zip(uniprot, y, y_pred):
            print("\t\t\tResult for {} \n\t\t\t\tTrue: \t{} ||| Pred: \t{}".format(u, p1, p2))
    return statistics
def cv(self, data, X, labels, n_folds=5, random_state=42, verbose=True, poslabel='guess'):
    # modern scikit-learn API: folds come from cv.split rather than from the
    # constructor; shuffle=True is required for random_state to take effect
    cv = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=random_state)
    truths = np.array([None] * len(labels))
    preds = np.array([None] * len(labels))
    for train, test in cv.split(X, labels):
        self.clf.fit(X[train], labels[train])
        preds[test] = self.clf.predict(X[test])
        truths[test] = labels[test]
    binary_truths = self.to_binary(truths, poslabel)
    binary_preds = self.to_binary(preds, poslabel)
    results = \
        {'accuracy': accuracy_score(truths, preds),
         'f1_pos': f1_score(binary_truths, binary_preds),
         'fbeta.01': fbeta_score(binary_truths, binary_preds, beta=.01),
         'fbeta.1': fbeta_score(binary_truths, binary_preds, beta=.1),
         'fbeta.3': fbeta_score(binary_truths, binary_preds, beta=.3),
         'fbeta.5': fbeta_score(binary_truths, binary_preds, beta=.5),
         'fbeta.7': fbeta_score(binary_truths, binary_preds, beta=.7),
         'fbeta2': fbeta_score(binary_truths, binary_preds, beta=2),
         'fbeta3': fbeta_score(binary_truths, binary_preds, beta=3),
         'fbeta5': fbeta_score(binary_truths, binary_preds, beta=5),
         'fbeta7': fbeta_score(binary_truths, binary_preds, beta=7),
         'fbeta10': fbeta_score(binary_truths, binary_preds, beta=10),
         'macro_f1': f1_score(truths, preds, average='macro'),
         'micro_f1': f1_score(truths, preds, average='micro'),
         'recall': recall_score(binary_truths, binary_preds),
         'precision': precision_score(binary_truths, binary_preds),
         'roc_auc': roc_auc_score(binary_truths, binary_preds)
         }
    if verbose:
        print(self.confusion(truths, preds, self.clf.classes_))
        print(classification_report(truths, preds))
        self.fit(X, labels)
        self.top_terms()
        print('\n')
        if data is not None:
            self.top_error_terms(truths, preds, X, data)
    return results
def find_threshold(fn, Xcv, ycv):
    """ pick a decision threshold by testing on the cv set (Xcv, ycv) """
    dists = np.fromiter((fn(x) for x in Xcv), float)
    best = (0, 0)  # (threshold, f1_score)
    print(dists.min(), dists.max())
    ## for t in dists:  # for each prob score
    for t in np.linspace(dists.min(), dists.max(), 100):
        preds = (dists < t).astype(int)
        f = fbeta_score(ycv, preds, beta=1.)
        if f > best[1]:
            best = (t, f)
    return best
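# An alternative sketch (scikit-learn assumed) that avoids the per-threshold
# loop: precision_recall_curve enumerates every distinct score threshold, and
# the F-beta optimum can be read off directly. Note that find_threshold above
# treats "distance below t" as positive, so distance-like scores should be
# negated before being passed in here.
import numpy as np
from sklearn.metrics import precision_recall_curve

def best_fbeta_threshold(y_true, scores, beta=1.0):
    precision, recall, thresholds = precision_recall_curve(y_true, scores)
    p, r = precision[:-1], recall[:-1]  # align with the thresholds array
    with np.errstate(divide='ignore', invalid='ignore'):
        f = (1 + beta ** 2) * p * r / (beta ** 2 * p + r)
    f = np.nan_to_num(f)
    best = int(np.argmax(f))
    return thresholds[best], f[best]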
def getResult(self, predict, data_set): y_true, y_predict = control.calculate_entire_ds(predict, data_set) result = metrics.classification_report(y_true, y_predict) result += "\nAccuracy classification: %f\n" % metrics.accuracy_score(y_true, y_predict) result += "F1 score: %f\n" % metrics.f1_score(y_true, y_predict) result += "Fbeta score: %f\n" % metrics.fbeta_score(y_true, y_predict, beta=0.5) result += "Hamming loss: %f\n" % metrics.hamming_loss(y_true, y_predict) result += "Hinge loss: %f\n" % metrics.hinge_loss(y_true, y_predict) result += "Jaccard similarity: %f\n" % metrics.jaccard_similarity_score(y_true, y_predict) result += "Precision: %f\n" % metrics.precision_score(y_true, y_predict) result += "Recall: %f\n" % metrics.recall_score(y_true, y_predict) if self.is_binary(): result += "Average precision: %f\n" % metrics.average_precision_score(y_true, y_predict) result += "Matthews correlation coefficient: %f\n" % metrics.matthews_corrcoef(y_true, y_predict) result += "Area Under the Curve: %f" % metrics.roc_auc_score(y_true, y_predict) return result
def test_classification_scores(): """Test classification scorers.""" X, y = make_blobs(random_state=0, centers=2) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) clf = LinearSVC(random_state=0) clf.fit(X_train, y_train) for prefix, metric in [('f1', f1_score), ('precision', precision_score), ('recall', recall_score)]: score1 = get_scorer('%s_weighted' % prefix)(clf, X_test, y_test) score2 = metric(y_test, clf.predict(X_test), pos_label=None, average='weighted') assert_almost_equal(score1, score2) score1 = get_scorer('%s_macro' % prefix)(clf, X_test, y_test) score2 = metric(y_test, clf.predict(X_test), pos_label=None, average='macro') assert_almost_equal(score1, score2) score1 = get_scorer('%s_micro' % prefix)(clf, X_test, y_test) score2 = metric(y_test, clf.predict(X_test), pos_label=None, average='micro') assert_almost_equal(score1, score2) score1 = get_scorer('%s' % prefix)(clf, X_test, y_test) score2 = metric(y_test, clf.predict(X_test), pos_label=1) assert_almost_equal(score1, score2) # test fbeta score that takes an argument scorer = make_scorer(fbeta_score, beta=2) score1 = scorer(clf, X_test, y_test) score2 = fbeta_score(y_test, clf.predict(X_test), beta=2, average='weighted') assert_almost_equal(score1, score2) # test that custom scorer can be pickled unpickled_scorer = pickle.loads(pickle.dumps(scorer)) score3 = unpickled_scorer(clf, X_test, y_test) assert_almost_equal(score1, score3) # smoke test the repr: repr(fbeta_score)
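# Usage sketch (scikit-learn only): the same make_scorer(fbeta_score, beta=2)
# object exercised in the test above plugs straight into model selection.
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import fbeta_score, make_scorer
from sklearn.model_selection import GridSearchCV

X, y = make_classification(random_state=0)
search = GridSearchCV(LogisticRegression(max_iter=1000),
                      param_grid={'C': [0.1, 1.0, 10.0]},
                      scoring=make_scorer(fbeta_score, beta=2))
search.fit(X, y)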
def on_epoch_end(self, epoch, logs={}): if 'iteration' in logs.keys() and logs['iteration'] % self.iteration_freq != 0: # If we've broken a large training set into smaller chunks, we don't # need to run the classification report after every chunk. return y_hat = self.model.predict_classes(self.x, verbose=0) fbeta = fbeta_score(self.y, y_hat, beta=0.5, average='weighted') report = classification_report( self.y, y_hat, labels=self.labels, target_names=self.target_names) if 'iteration' in logs.keys(): self.logger("epoch {epoch} iteration {iteration} - val_fbeta(beta=0.5): {fbeta}".format( epoch=epoch, iteration=logs['iteration'], fbeta=fbeta)) else: self.logger("epoch {epoch} - val_fbeta(beta=0.5): {fbeta}".format( epoch=epoch, fbeta=fbeta)) self.logger(report)
def automate_train_predict(learner, sample_size, X_train, y_train, X_test, y_test):
    '''
    inputs:
       - learner: the learning algorithm to be trained and predicted on
       - sample_size: the size of samples (number) to be drawn from training set
       - X_train: features training set
       - y_train: income training set
       - X_test: features testing set
       - y_test: income testing set
    '''
    results = {}

    # TODO: Fit the learner to the training data using slicing with 'sample_size' using .fit(training_features[:], training_labels[:])
    start = time()  # Get start time
    learner.fit(X_train[:sample_size], y_train[:sample_size])
    end = time()  # Get end time

    # TODO: Calculate the training time
    results['train_time'] = end - start

    # TODO: Get the predictions on the test set (X_test) using .predict()
    start = time()  # Get start time
    predictions_test = learner.predict(X_test)
    end = time()  # Get end time

    # TODO: Calculate the total prediction time
    results['pred_time'] = end - start

    # TODO: Compute accuracy on the test set using accuracy_score()
    results['acc_test'] = accuracy_score(y_test, predictions_test)

    # TODO: Compute F-score on the test set which is y_test
    results['f_test'] = fbeta_score(y_test, predictions_test, beta=0.5)

    # Success
    print("{} trained on {} samples. train {:.3f}sec predict {:.3f}sec f_test {:.3f}".format(
        learner.__class__.__name__, sample_size,
        results['train_time'], results['pred_time'], results['f_test']))

    # Return the results
    return results
def test_classification_scores(): X, y = make_blobs(random_state=0) X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0) clf = LinearSVC(random_state=0) clf.fit(X_train, y_train) score1 = SCORERS['f1'](clf, X_test, y_test) score2 = f1_score(y_test, clf.predict(X_test)) assert_almost_equal(score1, score2) # test fbeta score that takes an argument scorer = Scorer(fbeta_score, beta=2) score1 = scorer(clf, X_test, y_test) score2 = fbeta_score(y_test, clf.predict(X_test), beta=2) assert_almost_equal(score1, score2) # test that custom scorer can be pickled unpickled_scorer = pickle.loads(pickle.dumps(scorer)) score3 = unpickled_scorer(clf, X_test, y_test) assert_almost_equal(score1, score3) # smoke test the repr: repr(fbeta_score)
def cross_validation_scores(X, y, clf):
    y_pred = cross_validation.cross_val_predict(clf, X, y, cv=5)  # better than cross_val_score: less restricting
    y_true = y
    basic_metrics = metrics_from_confusion_matrix(y_true, y_pred)
    # TODO find a way to store this: XML, mongodb, logs...
    print("TP: %0.2f" % basic_metrics['TP'])
    print("FP: %0.2f" % basic_metrics['FP'])
    print("FN: %0.2f" % basic_metrics['FN'])
    print("TN: %0.2f" % basic_metrics['TN'])
    print("NPP: %0.2f" % basic_metrics['NPP'])
    print("Specificity: %0.2f" % basic_metrics['specificity'])
    print("Precision/PPP (basic): %0.2f" % basic_metrics['PPP'])
    print("Precision/PPP (predefined): %0.2f" % precision_score(y_true, y_pred))
    print("Sensitivity/recall (basic): %0.2f" % basic_metrics['sensitivity'])
    print("Sensitivity/recall (predefined): %0.2f" % recall_score(y_true, y_pred))
    print("Accuracy (basic): %0.2f" % basic_metrics['accuracy'])
    print("Accuracy (predefined): %0.2f" % accuracy_score(y_true, y_pred))
    print("F1 score (basic): %0.2f" % basic_metrics['F1'])
    print("F1 score (predefined): %0.2f" % f1_score(y_true, y_pred))
    print("F2 score (basic): %0.2f" % basic_metrics['F2'])
    print("F2 score (predefined): %0.2f" % fbeta_score(y_true, y_pred, beta=2))
def f025(y_true, y_pred): return fbeta_score(y_true, y_pred, average='binary', beta=0.25)
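# A minimal sketch (standard-library functools plus scikit-learn) for building
# a family of helpers like f025 without repeating the wrapper for each beta:
from functools import partial
from sklearn.metrics import fbeta_score

f025 = partial(fbeta_score, average='binary', beta=0.25)
f05 = partial(fbeta_score, average='binary', beta=0.5)
f2 = partial(fbeta_score, average='binary', beta=2.0)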
print(predTest.argmax(axis=1)[misclassified_knn[2]])
plt.imshow(np.reshape(x_test[misclassified_knn[2]], (28, 28)))
plt.show()
neighbors3 = historyNeigh.kneighbors(x_test[misclassified_knn[2]].reshape(1, -1))
flattened3 = [val for sublist in neighbors3[1] for val in sublist]
for n in flattened3:
    print(y_train.argmax(axis=1)[n])
    plt.imshow(np.reshape(x_train[n], (28, 28)))
    plt.show()

# Confusion matrix and other metrics
cnf_matrix_KN = confusion_matrix(y_test.argmax(axis=1), predTest.argmax(axis=1))
plot_confusion_matrix(cnf_matrix_KN,
                      classes=['0', '1', '2', '3', '4', '5', '6', '7', '8', '9'],
                      title='Confusion matrix, without normalization')
print("Accuracy on test set:")
print(accuracy_score(y_test, predTest))
print("Average precision score:")
average_precision_knn = average_precision_score(y_test, predTest)
print(average_precision_knn)
recall_knn = recall_score(y_test, predTest, average='micro')
print("Recall score:")
print(recall_knn)
fscore_knn = fbeta_score(y_test, predTest, beta=1, average='micro')
print("F-score:")
print(fscore_knn)
def train_predict(learner, sample_size, X_train, y_train, X_test, y_test): ''' inputs: - learner: the learning algorithm to be trained and predicted on - sample_size: the size of samples (number) to be drawn from training set - X_train: features training set - y_train: income training set - X_test: features testing set - y_test: income testing set ''' results = {} # TODO: Fit the learner to the training data using slicing with 'sample_size' using .fit(training_features[:], training_labels[:]) start = time() # Get start time learner = learner.fit(X_train[:sample_size], y_train[:sample_size]) end = time() # Get end time # TODO: Calculate the training time results['train_time'] = (end - start) print('\nthe training time is: ', results['train_time']) # TODO: Get the predictions on the test set(X_test), # then get predictions on the first 300 training samples(X_train) using .predict() start = time() # Get start time predictions_test = learner.predict(X_test) predictions_train = learner.predict(X_train[:300]) end = time() # Get end time print('\nstart time is ', start) print('\nend time is ', end) print('\npredictions test shape is: ', predictions_test.shape) print('\npredictions train shape is: ', predictions_train.shape) # TODO: Calculate the total prediction time results['pred_time'] = (end - start) print('\nthe total prediction time is :', results['pred_time']) # TODO: Compute accuracy on the first 300 training samples which is y_train[:300] results['acc_train'] = accuracy_score(y_train[:300], predictions_train) print('\nthe accuracy on the first 300 is :', results['acc_train']) # TODO: Compute accuracy on test set using accuracy_score() results['acc_test'] = accuracy_score(y_test, predictions_test) print('\ntest accuracy score is: ', results['acc_test']) # TODO: Compute F-score on the the first 300 training samples using fbeta_score() results['f_train'] = fbeta_score(y_train[:300], predictions_train, beta=0.75, average='micro') print('\nfirst 300 fbeta score is: ', results['f_train']) # TODO: Compute F-score on the test set which is y_test results['f_test'] = fbeta_score(y_test, predictions_test, beta=0.75, average='micro') print('\nfbeta test score is: ', results['f_test']) # Success print("\n{} trained on {} samples\n.".format(learner.__class__.__name__, sample_size)) # Return the results return results
} # print("LightGBM params:", lgbm_params) def f2_score(y_pred: np.array, data: Any) -> Any: y_true = data.get_label() y_pred = y_pred > gold_threshold if np.sum(y_pred) == 0: return 'f2', 0, True return 'f2', fbeta_score(y_true, y_pred, beta=2), True lgb_clf = lgb.train(lgbm_params, lgtrain, num_boost_round=2000, valid_sets=[lgtrain, lgvalid], valid_names=['train', 'valid'], early_stopping_rounds=100, verbose_eval=50, feval=f2_score) val_pred = lgb_clf.predict(x_val) f2 = fbeta_score(y_val, val_pred > gold_threshold, beta=2) dprint(f2) filename = f'{model_dir}/lightgbm_f{level2_fold}_c{class_:04}_{f2:.04}.pkl' with open(filename, 'wb') as model_file: pickle.dump(lgb_clf, model_file)
def main(cv, scaler, PENALTY, SOLVER, C, N_JOBS, RANDOM_STATE):
    trainset = pd.read_csv("trainset_180314.csv").iloc[:, 1:]
    print(len(trainset))
    print(trainset.columns[5:].tolist())

    # continuous and categorical
    mains = ["user_coupon", "user_id", "coupon_id", "start_time", "is_used"]
    categorical = [
        'sex_1', 'sex_2', 'age_60', 'age_70', 'age_80', 'age_90', 'age_0',
        'city1', 'city2', 'city3', 'city4', 'city5', 'AppVerLast_2.1',
        'AppVerLast_2.2', 'AppVerLast_2.3', 'AppVerLast_2.4', 'AppVerLast_2.5',
        'AppVerLast_2.7', 'AppVerLast_2.8', 'covers_mon', 'covers_tue',
        'covers_wed', 'covers_thu', 'covers_fri', 'covers_sat', 'covers_sun',
        'type1', 'type6', 'Complaints', 'Eventsoperation',
        'NewUserCouponPackageByBD', 'PreUserCouponCode', 'RecallUserDaily',
        'home201603222253', 'home_dongbeiguan', 'home_jiangzhecai',
        'home_muqinjie', 'home_xiangcaiguan', 'preuser', 'shareuser',
        '商家拒单返券', '家厨发券', '活动赠券', '码兑券', '自运营赠券',
        '蒲公英受邀', 'CoupUseLast'
    ]
    continuous = [
        'kitchen_entropy', 'distance_median', 'distance_std',
        'user_longitude_median', 'user_longitude_std', 'user_latitude_median',
        'user_latitude_std', 'coupon_effective_days', 'money', 'max_money',
        'WeeklyCouponUsedCount', "BiWeeklyCouponUsedCount", 'WeeklyOrderCount',
        'BiWeeklyOrderCount', 'coupon_usage_rate', 'order_coupon_usage_rate',
        'coupon_type1_usage_rate', 'coupon_type6_usage_rate',
        'coupon_used_weekend_perc', 'order_weekend_perc', 'worth_money_median',
        'worth_money_std', 'InterCoup', 'InterOrder', 'Recency'
    ]

    # scaling
    X_train_continuous = scaler.fit_transform(trainset[continuous])
    trainset_scaled = pd.concat([
        trainset.loc[:, mains + categorical],
        pd.DataFrame(X_train_continuous, columns=continuous)
    ], axis=1)

    # split train & dev
    split_date1 = "2016-04-15"
    split_date2 = "2016-04-22"
    split_date3 = "2016-04-29"
    split_date4 = "2016-05-06"
    trainset1 = trainset_scaled[trainset_scaled["start_time"] <= split_date1]
    devset1 = trainset_scaled[(trainset_scaled["start_time"] > split_date1) &
                              (trainset_scaled["start_time"] <= split_date2)]
    trainset2 = trainset_scaled[trainset_scaled["start_time"] <= split_date2]
    devset2 = trainset_scaled[(trainset_scaled["start_time"] > split_date2) &
                              (trainset_scaled["start_time"] <= split_date3)]
    trainset3 = trainset_scaled[trainset_scaled["start_time"] <= split_date3]
    devset3 = trainset_scaled[(trainset_scaled["start_time"] > split_date3) &
                              (trainset_scaled["start_time"] <= split_date4)]
    trainset4 = trainset_scaled[trainset_scaled["start_time"] <= split_date4]
    devset4 = trainset_scaled[trainset_scaled["start_time"] > split_date4]

    # shuffle trainset
    trainset1 = trainset1.iloc[shuffle(trainset1.index).tolist(), ]
    trainset2 = trainset2.iloc[shuffle(trainset2.index).tolist(), ]
    trainset3 = trainset3.iloc[shuffle(trainset3.index).tolist(), ]
    trainset4 = trainset4.iloc[shuffle(trainset4.index).tolist(), ]
    trainsets = [trainset1, trainset2, trainset3, trainset4]
    devsets = [devset1, devset2, devset3, devset4]
    X_trains, y_trains, X_devs, y_devs = [], [], [], []
    for i in trainsets:
        X_trains.append(i[i.columns[5:]])
        y_trains.append(i["is_used"])
    for i in devsets:
        X_devs.append(i[i.columns[5:]])
        y_devs.append(i["is_used"])

    ## 1. Logistic Regression
    res_lr = defaultdict(
        lambda: defaultdict(lambda: defaultdict(list)))
    res_lr["PENALTY"] = PENALTY
    res_lr["SCALER"] = SCALER
    res_lr["BALANCE"] = BALANCE
    evaluations = ["F05", "Precision", "Recall", "Mean_Pre", "AUC", "Accuracy"]
    for c in C:
        for ev in evaluations:
            res_lr[ev][str(c)] = []

    # train
    start_time = time.time()
    for c in C:
        start_time2 = time.time()
        for n in cv:
            lr = LogisticRegression(C=c,
                                    penalty=PENALTY,
                                    solver=SOLVER,
                                    class_weight={1: BALANCE},
                                    max_iter=MAX_ITER,
                                    random_state=RANDOM_STATE,
                                    n_jobs=N_JOBS)
            lr.fit(X_trains[n], y_trains[n])
            y_pred = lr.predict(X_devs[n])
            y_dev = y_devs[n]
            print("P: {}, CV: {}, C: {}".format(PENALTY, n, c))
            print(confusion_matrix(y_dev, y_pred, labels=[1, 0]))
            f05 = fbeta_score(y_dev, y_pred, beta=0.5, labels=[1, 0])
            precision = precision_score(y_dev, y_pred, labels=[1, 0])
            recall = recall_score(y_dev, y_pred, labels=[1, 0])
            mp = average_precision_score(y_dev, y_pred)
            auc = roc_auc_score(y_dev, y_pred)
            acc = accuracy_score(y_dev, y_pred)
            evaluations_res = [f05, precision, recall, mp, auc, acc]
            for i in range(len(evaluations)):
                print("{}: {}".format(evaluations[i], evaluations_res[i]))
                res_lr[evaluations[i]][str(c)].append(evaluations_res[i])
            print("\n")
        print("Finished c {} in {} sec\n".format(c, time.time() - start_time2))
    print("{} sec\n".format(time.time() - start_time))

    # average cv results
    for ev in evaluations:
        res_lr[ev] = {c: np.mean(res_lr[ev][c]) for c in res_lr[ev]}

    # save param output
    with open('res_lr_{}_{}_1v{}.json'.format(PENALTY, SCALER, BALANCE), 'w') as f:
        json.dump(res_lr, f)
all_predicted_lines = []
all_target_lines = []
for doc in doc_test:
    predicted_lines = random_search.predict(doc.data)
    all_predicted_lines += list(predicted_lines)
    all_target_lines += list(doc.targets)
    predicted_doc = utils.classify_doc(predicted_lines)
    documents_predicted.append(predicted_doc)
    documents_target.append(doc.category)

print("Line by Line")
print("Confusion Matrix: \n{}".format(
    confusion_matrix(all_target_lines, all_predicted_lines)))
line_fbeta = fbeta_score(all_target_lines, all_predicted_lines,
                         average=None, beta=2)
print("F2 score: {}".format(line_fbeta))
doc_fbeta = fbeta_score(documents_target, documents_predicted,
                        average=None, beta=2)
print("Document F2 score: {}".format(doc_fbeta))
print("Document Confusion Matrix: \n{}".format(
    confusion_matrix(documents_target, documents_predicted)))
def mf(x):
    p2 = np.zeros_like(p)
    for i in range(17):
        p2[:, i] = (p[:, i] > x[i]).astype(int)  # np.int was removed in numpy 1.24
    score = fbeta_score(y, p2, beta=2, average='samples')
    return score
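# Hedged sketch of how an objective like mf is typically driven (assumptions:
# p holds per-class probabilities of shape (n_samples, 17) and y the matching
# binary targets, both defined as in the surrounding code): a coordinate-wise
# grid search that tunes one class threshold at a time.
import numpy as np
from sklearn.metrics import fbeta_score

def optimise_thresholds(y, p, n_classes=17, resolution=101):
    x = np.full(n_classes, 0.2)  # a common starting threshold
    for i in range(n_classes):
        best_t, best_score = x[i], -1.0
        for t in np.linspace(0.0, 1.0, resolution):
            x[i] = t
            p2 = (p > x).astype(int)  # broadcasts one threshold per class
            score = fbeta_score(y, p2, beta=2, average='samples')
            if score > best_score:
                best_t, best_score = t, score
        x[i] = best_t
    return x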
pred_stances = cross_val_predict(vote_pipeline, train_data.Abstract,
                                 train_data.Stance, cv=cv)

print(second_clf.named_steps)
print(first_clf.named_steps)

print(80 * '=')
print("TRAIN")
print(80 * '=')
print(classification_report(train_data.Stance, pred_stances, digits=4))

macro_f = fbeta_score(train_data.Stance, pred_stances, beta=1.0,
                      labels=['AGAINST', 'FAVOR', 'NONE'], average='macro')
print('macro-average of F-score(FAVOR) and F-score(AGAINST): {:.4f}\n'.format(
    macro_f))

print(80 * '=')
print("VALIDATE")
print(80 * '=')
print('WORD2VEC VECTORS:', word2vec_ids[0])
print(80 * '=')

vote_pipeline.fit(train_data.Abstract, train_data.Stance)
pred_stances = vote_pipeline.predict(validate_data.Abstract)
def fbeta(true_label, prediction): return fbeta_score(true_label, prediction, beta=2, average='samples')
# transform skewed data skewed = ['capital-gain', 'capital-loss'] features_log_transformed = pd.DataFrame(data = features_raw) features_log_transformed[skewed] = features_raw[skewed].apply(lambda x: np.log(x + 1)) # Normalize numerical features scaler = MinMaxScaler() # default=(0, 1) numerical = ['age', 'education-num', 'capital-gain', 'capital-loss', 'hours-per-week'] features_log_minmax_transform = pd.DataFrame(data = features_log_transformed) features_log_minmax_transform[numerical] = scaler.fit_transform(features_log_transformed[numerical]) # One-hot encode categorical features features_final = pd.get_dummies(features_log_minmax_transform) income = income_raw.map({'>50K': 1, '<=50K': 0}) # Shuffle and Split data X_train, X_test, y_train, y_test = train_test_split(features_final, income, test_size = 0.2, random_state = 0) # Evaluate Model Performance with fbeta = 0.5 fbeta = 0.5 best_clf = Models.evaluate_models(X_train,y_train,X_test,y_test,fbeta) print("\n",best_clf.__class__.__name__) best_clf = Models.optimize_best_model(best_clf,X_train,y_train,X_test,y_test) model_predictions = best_clf.predict(X_test) print("Final accuracy score on the testing data: {:.4f}".format(accuracy_score(y_test, model_predictions))) print("Final F-score on the testing data: {:.4f}".format(fbeta_score(y_test, model_predictions, beta = 0.5)))
def test_precision_recall_f1_score_multilabel_2():
    """ Test precision_recall_f1_score on a crafted multilabel example 2 """
    # Second crafted example
    y_true_ll = [(1,), (2,), (2, 3)]
    y_pred_ll = [(4,), (4,), (2, 1)]
    lb = LabelBinarizer()
    lb.fit([range(1, 5)])
    y_true_bi = lb.transform(y_true_ll)
    y_pred_bi = lb.transform(y_pred_ll)

    for y_true, y_pred in [(y_true_ll, y_pred_ll), (y_true_bi, y_pred_bi)]:
        # tp = [ 0.  1.  0.  0.]
        # fp = [ 1.  0.  0.  2.]
        # fn = [ 1.  1.  1.  0.]
        p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
                                                     average=None)
        assert_array_almost_equal(p, [0.0, 1.0, 0.0, 0.0], 2)
        assert_array_almost_equal(r, [0.0, 0.5, 0.0, 0.0], 2)
        assert_array_almost_equal(f, [0.0, 0.66, 0.0, 0.0], 2)
        assert_array_almost_equal(s, [1, 2, 1, 0], 2)

        f2 = fbeta_score(y_true, y_pred, beta=2, average=None)
        support = s
        assert_array_almost_equal(f2, [0, 0.55, 0, 0], 2)

        p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
                                                     average="micro")
        assert_almost_equal(p, 0.25)
        assert_almost_equal(r, 0.25)
        assert_almost_equal(f, 2 * 0.25 * 0.25 / 0.5)
        assert_equal(s, None)
        assert_almost_equal(fbeta_score(y_true, y_pred, beta=2,
                                        average="micro"),
                            (1 + 4) * p * r / (4 * p + r))

        p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
                                                     average="macro")
        assert_almost_equal(p, 0.25)
        assert_almost_equal(r, 0.125)
        assert_almost_equal(f, 2 / 12)
        assert_equal(s, None)
        assert_almost_equal(fbeta_score(y_true, y_pred, beta=2,
                                        average="macro"),
                            np.mean(f2))

        p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
                                                     average="weighted")
        assert_almost_equal(p, 2 / 4)
        assert_almost_equal(r, 1 / 4)
        assert_almost_equal(f, 2 / 3 * 2 / 4)
        assert_equal(s, None)
        assert_almost_equal(fbeta_score(y_true, y_pred, beta=2,
                                        average="weighted"),
                            np.average(f2, weights=support))

        p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
                                                     average="samples")
        # Check weighted
        # |h(x_i) inter y_i | = [0, 0, 1]
        # |y_i| = [1, 1, 2]
        # |h(x_i)| = [1, 1, 2]
        assert_almost_equal(p, 1 / 6)
        assert_almost_equal(r, 1 / 6)
        assert_almost_equal(f, 2 / 4 * 1 / 3)
        assert_equal(s, None)
        assert_almost_equal(fbeta_score(y_true, y_pred, beta=2,
                                        average="samples"),
                            0.1666, 2)
# In[171]: X = df.iloc[:, :df.shape[1]] Y = st.iloc[:, 0] # In[163]: #Make pipeline of clf clf_pipeline = make_pipeline(StandardScaler(), tree.DecisionTreeClassifier()) y_pred = cross_val_predict(clf_pipeline, X, Y, cv=10) print('Accuracy score:', metrics.accuracy_score(Y, y_pred)) print('RMSE:', round(sqrt(mean_squared_error(Y, y_pred)), 2)) print('R_squared:', round(r2_score(Y, y_pred), 2)) print('Recall score:', metrics.recall_score(Y, y_pred)) print('Fbeta score:', fbeta_score(Y, y_pred, beta=1.5)) print('F1-score:', f1_score(Y, y_pred)) # # Filter Features by Variance # In[164]: var = df.var() idx = [] for i in range(len(var)): if var[i] < 0.75: print('{:50} {}'.format(var.index[i], var[i])) idx.append(var.index[i]) # In[165]:
def avg_fscore(y_true, y_pred): return fbeta_score(y_true, y_pred, average="macro", beta=0.5)
grid_fit = grid_obj.fit(X_train, y_train)

# Get the best estimator
best_clf = grid_fit.best_estimator_

# Make predictions using the unoptimized and optimized models
predictions = (clf.fit(X_train, y_train)).predict(X_test)
best_predictions = best_clf.predict(X_test)

# Report the before-and-after scores
print("Unoptimized model\n------")
print("Accuracy score on testing data: {:.4f}".format(
    accuracy_score(y_test, predictions)))
print("F-score on testing data: {:.4f}".format(
    fbeta_score(y_test, predictions, beta=0.5)))
print("\nOptimized Model\n------")
print("Final accuracy score on the testing data: {:.4f}".format(
    accuracy_score(y_test, best_predictions)))
print("Final F-score on the testing data: {:.4f}".format(
    fbeta_score(y_test, best_predictions, beta=0.5)))

pickle.dump(best_clf, open(filename, 'wb'))

importances = best_clf.feature_importances_
vs.feature_plot(importances, X_train, y_train)

X_train_reduced = X_train[X_train.columns.values[(
    np.argsort(importances)[::-1])[:5]]]
X_test_reduced = X_test[X_test.columns.values[(
def f2_measure(y_true, y_pred): return fbeta_score(y_true, y_pred, beta=2)
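# Usage sketch (scikit-learn assumed): f2_measure can be turned into a scorer
# for cross-validation or grid search via make_scorer.
from sklearn.metrics import make_scorer

f2_scorer = make_scorer(f2_measure)
# e.g. cross_val_score(estimator, X, y, scoring=f2_scorer)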
def evaluate( self, sentences: Union[List[DataPoint], Dataset], out_path: Union[str, Path] = None, embedding_storage_mode: str = "none", mini_batch_size: int = 32, num_workers: int = 8, ) -> (Result, float): # read Dataset into data loader (if list of sentences passed, make Dataset first) if not isinstance(sentences, Dataset): sentences = SentenceDataset(sentences) data_loader = DataLoader(sentences, batch_size=mini_batch_size, num_workers=num_workers) # use scikit-learn to evaluate y_true = [] y_pred = [] with torch.no_grad(): eval_loss = 0 lines: List[str] = [] batch_count: int = 0 for batch in data_loader: batch_count += 1 # remove previously predicted labels [sentence.remove_labels('predicted') for sentence in batch] # get the gold labels true_values_for_batch = [ sentence.get_labels(self.label_type) for sentence in batch ] # predict for batch loss = self.predict( batch, embedding_storage_mode=embedding_storage_mode, mini_batch_size=mini_batch_size, label_name='predicted', return_loss=True) eval_loss += loss sentences_for_batch = [ sent.to_plain_string() for sent in batch ] # get the predicted labels predictions = [ sentence.get_labels('predicted') for sentence in batch ] for sentence, prediction, true_value in zip( sentences_for_batch, predictions, true_values_for_batch, ): eval_line = "{}\t{}\t{}\n".format(sentence, true_value, prediction) lines.append(eval_line) for predictions_for_sentence, true_values_for_sentence in zip( predictions, true_values_for_batch): true_values_for_sentence = [ label.value for label in true_values_for_sentence ] predictions_for_sentence = [ label.value for label in predictions_for_sentence ] y_true_instance = np.zeros(len(self.label_dictionary), dtype=int) for i in range(len(self.label_dictionary)): if self.label_dictionary.get_item_for_index( i) in true_values_for_sentence: y_true_instance[i] = 1 y_true.append(y_true_instance.tolist()) y_pred_instance = np.zeros(len(self.label_dictionary), dtype=int) for i in range(len(self.label_dictionary)): if self.label_dictionary.get_item_for_index( i) in predictions_for_sentence: y_pred_instance[i] = 1 y_pred.append(y_pred_instance.tolist()) store_embeddings(batch, embedding_storage_mode) # remove predicted labels for sentence in sentences: sentence.annotation_layers['predicted'] = [] if out_path is not None: with open(out_path, "w", encoding="utf-8") as outfile: outfile.write("".join(lines)) # make "classification report" target_names = [] for i in range(len(self.label_dictionary)): target_names.append( self.label_dictionary.get_item_for_index(i)) classification_report = metrics.classification_report( y_true, y_pred, digits=4, target_names=target_names, zero_division=0) # get scores micro_f_score = round( metrics.fbeta_score(y_true, y_pred, beta=self.beta, average='micro', zero_division=0), 4) accuracy_score = round(metrics.accuracy_score(y_true, y_pred), 4) macro_f_score = round( metrics.fbeta_score(y_true, y_pred, beta=self.beta, average='macro', zero_division=0), 4) precision_score = round( metrics.precision_score(y_true, y_pred, average='macro', zero_division=0), 4) recall_score = round( metrics.recall_score(y_true, y_pred, average='macro', zero_division=0), 4) detailed_result = ("\nResults:" f"\n- F-score (micro) {micro_f_score}" f"\n- F-score (macro) {macro_f_score}" f"\n- Accuracy {accuracy_score}" '\n\nBy class:\n' + classification_report) # line for log file if not self.multi_label: log_header = "ACCURACY" log_line = f"\t{accuracy_score}" else: log_header = "PRECISION\tRECALL\tF1\tACCURACY" log_line = 
f"{precision_score}\t" \ f"{recall_score}\t" \ f"{macro_f_score}\t" \ f"{accuracy_score}" result = Result( main_score=micro_f_score, log_line=log_line, log_header=log_header, detailed_results=detailed_result, ) eval_loss /= batch_count return result, eval_loss
def _get_fbeta_score(self, classifier, X_valid, y_valid): p_valid = classifier.predict(X_valid) return fbeta_score(y_valid, np.array(p_valid) > 0.2, beta=2, average='samples')
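# A small hedged sketch (numpy and scikit-learn assumed; p_valid is a 2-D
# array of multilabel probabilities as above) for sanity-checking the fixed
# 0.2 cut-off: evaluate the sample-averaged F2 over a grid of global
# thresholds and keep the best one.
import numpy as np
from sklearn.metrics import fbeta_score

def best_global_threshold(y_valid, p_valid, beta=2):
    grid = np.linspace(0.05, 0.5, 10)
    scores = [fbeta_score(y_valid, (np.asarray(p_valid) > t).astype(int),
                          beta=beta, average='samples')
              for t in grid]
    return grid[int(np.argmax(scores))]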
def classification(source, model, target_att, test_source="", fs_task=False):
    # source -- Path to the file that is used to train.
    # model -- Object loaded from file with trained model.
    # target_att -- Name of attribute in source that is considered as target.
    # test_source -- Path to the file that is used to test.
    # fs_task -- String with name of used feature selection algorithm.
    results = dict.fromkeys(["predictions", "score", "model", "features",
                             "removed_features", "selected_features",
                             "feature_importances", "metrics"])
    results["predictions"] = []
    # Basic metrics used for classification and feature selection evaluation.
    metrics = dict.fromkeys(["accuracy", "recall", "precision", "f_measure", "f_beta"])
    metrics["accuracy"] = []
    metrics["recall"] = []
    metrics["precision"] = []
    metrics["f_measure"] = []
    results["removed_features"] = []
    results["selected_features"] = []
    results["feature_importances"] = []
    # http://nlp.stanford.edu/IR-book/html/htmledition/evaluation-of-unranked-retrieval-sets-1.html
    metrics["f_beta"] = []
    cfr = model
    print(model)
    # Object for reading train data and test data
    csv = csvhandling.CsvHandling()
    # Numpy array with values from source path without feature names and target values.
    train = csv.read_csv(source)
    # List of feature names
    features = csv.get_features(source)
    # Numpy array with target values
    target = csv.get_target(source, target_att)
    if test_source:  # an empty path means "use cross-validation" below
        # Numpy array with values from test_source path without feature names and target values.
        test = csv.read_csv(test_source)
        # Numpy array with test target values
        test_target = csv.get_target(test_source, target_att)
        if fs_task:
            # Pipeline with fitted model and feature selection filter or only fitted model.
            cfr = featureselection.get_fs_model(cfr, fs_task, train, target)
            original_features = features[:]
            if fs_task == "RFE":
                selected_features = []
            else:
                selected_features = featureselection.get_selected_features(cfr.named_steps["feature_selection"], original_features)
            removed_features = [i for i in features if i not in selected_features]
            results["removed_features"].append(removed_features)
            results["selected_features"].append(selected_features)
        else:
            cfr.fit(train, target)
        prediction = cfr.predict(test)
        results["predictions"].append(prediction)
        metrics["accuracy"].append(mx.accuracy_score(test_target, prediction))
        metrics["precision"].append(mx.precision_score(test_target, prediction, average="macro"))
        metrics["recall"].append(mx.recall_score(test_target, prediction, average="macro"))
        metrics["f_measure"].append(mx.f1_score(test_target, prediction, average="macro"))
    else:
        # If there are no test data then cross-validation is used for model evaluation.
        cv = cross_validation.KFold(len(train), n_folds=5, shuffle=False, random_state=None)
        if fs_task == "RFE":
            # Pipeline with fitted model and feature selection filter or only fitted model.
            cfr = featureselection.get_fs_model(cfr, fs_task + "CV", train, target, cv)
            original_features = features[:]
            selected_features = featureselection.get_selected_features(cfr, original_features)
            removed_features = [i for i in features if i not in selected_features]
            results["removed_features"].append(removed_features)
            results["selected_features"].append(selected_features)
            for traincv, testcv in cv:
                test = train[testcv]
                test_target = target[testcv]
                prediction = cfr.predict(test)
                results["predictions"].append(prediction)
                metrics["accuracy"].append(mx.accuracy_score(test_target, prediction))
                metrics["precision"].append(mx.precision_score(test_target, prediction))
                metrics["recall"].append(mx.recall_score(test_target, prediction))
                metrics["f_measure"].append(mx.f1_score(test_target, prediction))
                metrics["f_beta"].append(mx.fbeta_score(test_target, prediction, beta=0.5))
        else:
            for traincv, testcv in cv:
                # Repaired bug from http://stackoverflow.com/questions/19265097/why-does-cross-validation-for-randomforestregressor-fail-in-scikit-learn
                if fs_task:
                    cfr = featureselection.get_fs_model(cfr, fs_task, train[traincv], target[traincv])
                    original_features = features[:]
                    if fs_task == "fromModel":
                        selected_features = featureselection.get_selected_features(cfr, original_features)
                    else:
                        selected_features = featureselection.get_selected_features(cfr.named_steps["feature_selection"], original_features)
                    removed_features = [i for i in features if i not in selected_features]
                    results["removed_features"].append(removed_features)
                    results["selected_features"].append(selected_features)
                else:
                    cfr.fit(train[traincv], target[traincv])
                test = train[testcv]
                test_target = target[testcv]
                prediction = cfr.predict(test)
                results["predictions"].append(prediction)
                metrics["accuracy"].append(mx.accuracy_score(test_target, prediction))
                metrics["precision"].append(mx.precision_score(test_target, prediction))
                metrics["recall"].append(mx.recall_score(test_target, prediction))
                metrics["f_measure"].append(mx.f1_score(test_target, prediction))
                metrics["f_beta"].append(mx.fbeta_score(test_target, prediction, beta=0.5))
    results["score"] = cfr.score(test, test_target)
    results["model"] = cfr
    results["metrics"] = metrics
    return results
def sample_run(df, anoms_ref, window_size=500, com=12):
    """
    This function expects a dataframe df as mandatory argument. The first
    column of the df should contain timestamps, the second machine IDs.

    Keyword arguments:
    df: a pandas data frame with two columns: 1. timestamp, 2. value
    anoms_ref: reference anomaly detection results
    window_size: the size of the window of data points that are used for anomaly detection
    com: decay in terms of center of mass (this approximates averaging over about twice as many hours)
    """
    n_epochs = 10
    p_anoms = .5

    def detect_ts_online(df_smooth, window_size, stop):
        is_anomaly = False
        run_time = 9999
        start_index = max(0, stop - window_size)
        df_win = df_smooth.iloc[start_index:stop, :]
        start_time = time.time()
        results = detect_ts(df_win,
                            alpha=0.05,
                            max_anoms=0.02,
                            only_last=None,
                            longterm=False,
                            e_value=False,
                            direction='both')
        run_time = time.time() - start_time
        if results['anoms'].shape[0] > 0:
            timestamp = df_win['timestamp'].tail(1).values[0]
            if timestamp == results['anoms'].tail(1)['timestamp'].values[0]:
                is_anomaly = True
        return is_anomaly, run_time

    def running_avg(ts, com=6):
        rm_o = np.zeros_like(ts)
        rm_o[0] = ts[0]
        for r in range(1, len(ts)):
            curr_com = float(min(com, r))
            rm_o[r] = rm_o[r - 1] + (ts[r] - rm_o[r - 1]) / (curr_com + 1)
        return rm_o

    # create arrays that will hold the results of batch AD (y_true) and online AD (y_pred)
    y_true = []
    y_pred = []
    run_times = []
    # check which unique machines, sensors, and timestamps we have in the dataset
    machineIDs = df['machineID'].unique()
    sensors = df.columns[2:]
    timestamps = df['datetime'].unique()[window_size:]
    # sample n_epochs random machines and sensors
    random_machines = np.random.choice(machineIDs, n_epochs)
    random_sensors = np.random.choice(sensors, n_epochs)
    # we initialize an array that will later hold a sample of timestamps
    random_timestamps = np.random.choice(timestamps, n_epochs)
    for i in range(0, n_epochs):
        # take a slice of the dataframe that only contains the measures of one random machine
        df_s = df[df['machineID'] == random_machines[i]]
        # smooth the values of one random sensor, using our running_avg function
        smooth_values = running_avg(df_s[random_sensors[i]].values, com)
        # create a data frame with two columns: timestamp, and smoothed values
        df_smooth = pd.DataFrame(data={
            'timestamp': df_s['datetime'].values,
            'value': smooth_values
        })
        # load the results of batch AD for this machine and sensor
        anoms_s = anoms_ref[((anoms_ref['machineID'] == random_machines[i]) &
                             (anoms_ref['errorID'] == random_sensors[i]))]
        # find the location of the i'th random timestamp in the data frame
        if np.random.random() < p_anoms:
            anoms_timestamps = anoms_s['datetime'].values
            np.random.shuffle(anoms_timestamps)
            counter = 0
            while anoms_timestamps[0] < timestamps[0]:
                if counter > 100:
                    return 0.0, 9999.0
                np.random.shuffle(anoms_timestamps)
                counter += 1
            random_timestamps[i] = anoms_timestamps[0]
        # select the test case
        test_case = df_smooth[df_smooth['timestamp'] == random_timestamps[i]]
        test_case_index = test_case.index.values[0]
        # check whether the batch AD found an anomaly at that timestamp and copy into y_true
        y_true_i = random_timestamps[i] in anoms_s['datetime'].values
        # perform online AD, and write result to y_pred
        y_pred_i, run_times_i = detect_ts_online(df_smooth, window_size, test_case_index)
        y_true.append(y_true_i)
        y_pred.append(y_pred_i)
        run_times.append(run_times_i)
    return fbeta_score(y_true, y_pred, beta=2), np.mean(run_times)
# print(means_and_stds) end_ind = int( np.argwhere( np.isnan(stats_all['preds']['test'][min_ind, 0, train_setSize, :]))[0]) - 1 predictions = stats_all['preds']['test'][min_ind, cv_fold, train_setSize, 0:end_ind] targets = stats_all['targets']['test'][min_ind, cv_fold, train_setSize, 0:end_ind] probs = stats_all['probs']['test'][min_ind, cv_fold, train_setSize, 0:end_ind] fpr, tpr, thresholds = metrics.roc_curve(targets, probs, pos_label=1) auc_score = metrics.auc(fpr, tpr) auc_scores[cv_fold, train_setSize] = auc_score f2beta_score = metrics.fbeta_score(targets, predictions, beta=2) if (train_setSize == (num_trainSetSizes - 1)): for epoch_num in range(0, num_epochs, 2): auc_scores_curve_train[cv_fold, epoch_num] = stats_all['auc']['train'][ epoch_num, cv_fold, train_setSize] auc_scores_curve_val[cv_fold, epoch_num] = stats_all['auc']['val'][ epoch_num, cv_fold, train_setSize] auc_scores_curve_test[cv_fold, epoch_num] = stats_all['auc']['test'][ epoch_num, cv_fold, train_setSize] losses_curve_train[cv_fold, epoch_num] = stats_all['losses']['train'][
def fbeta(_, predictions_binary, labels, parameters): return metrics.fbeta_score(labels, predictions_binary, **parameters)
def _test(num_classes, threshold, multilabel, average): fbeta = FBeta( beta=2.0, num_classes=num_classes, threshold=threshold, multilabel=multilabel, average=average, ) f1 = F1Score( num_classes=num_classes, threshold=threshold, multilabel=multilabel, average=average, ) outputs = torch.randn(100, 4) targets = torch.randint(0, 4, size=(100, )) bs = _BaseInputHandler( num_classes=num_classes, average=average, threshold=0.5, multilabel=multilabel, ) np_outputs, np_targets = bs._compute(outputs=outputs, targets=targets) fbeta.accumulate(outputs=outputs, targets=targets) f1.accumulate(outputs=outputs, targets=targets) fbeta_val = fbeta.value f1_val = f1.value assert fbeta.case_type == "multiclass" assert f1.case_type == "multiclass" with warnings.catch_warnings(): warnings.simplefilter("ignore", category=UndefinedMetricWarning) fbeta_skm = fbeta_score(np_targets.numpy(), np_outputs.numpy(), average=average, beta=2.0) f1_skm = f1_score(np_targets.numpy(), np_outputs.numpy(), average=average) assert fbeta_skm == pytest.approx(fbeta_val.item()) assert f1_skm == pytest.approx(f1_val.item()) bs = 16 iters = targets.shape[0] // bs + 1 fbeta.reset() f1.reset() for i in range(iters): idx = i * bs fbeta.accumulate(outputs=outputs[idx:idx + bs], targets=targets[idx:idx + bs]) f1.accumulate( outputs=outputs[idx:idx + bs], targets=targets[idx:idx + bs], ) f1_m = f1.value fbeta_m = fbeta.value assert f1_skm == pytest.approx(f1_m.item()) assert fbeta_skm == pytest.approx(fbeta_m.item())
import numpy as np
from sklearn.metrics import (precision_score, recall_score, f1_score,
                             fbeta_score, precision_recall_fscore_support)

# The precision is the ratio 'tp / (tp + fp)': the ability of the classifier
# not to label a negative sample as positive.
precision = precision_score(y_true, y_hat)
print('Precision:\t', precision)

# The recall is the ratio 'tp / (tp + fn)' where 'tp' is the number of
# true positives and 'fn' the number of false negatives. The recall is
# intuitively the ability of the classifier to find all the positive samples.
# The best value is 1 and the worst value is 0.
recall = recall_score(y_true, y_hat)
print('Recall: \t', recall)

# F1 score, also known as balanced F-score or F-measure.
# The F1 score can be interpreted as a weighted average of the precision and
# recall, where an F1 score reaches its best value at 1 and worst score at 0.
# The relative contributions of precision and recall to the F1 score are
# equal. The formula for the F1 score is:
#   F1 = 2 * (precision * recall) / (precision + recall)
print('f1 score: \t', f1_score(y_true, y_hat))
#print(2 * (precision * recall) / (precision + recall))

# The F-beta score is the weighted harmonic mean of precision and recall,
# reaching its optimal value at 1 and its worst value at 0.
# The 'beta' parameter determines the weight of recall in the combined
# score: 'beta < 1' lends more weight to precision, while 'beta > 1'
# favors recall ('beta -> 0' considers only precision, 'beta -> inf' only recall).
print('F-beta:')
for beta in np.logspace(-3, 3, num=7, base=10):
    fbeta = fbeta_score(y_true, y_hat, beta=beta)
    print('\tbeta=%9.3f\tF-beta=%.5f' % (beta, fbeta))
    #print((1+beta**2)*precision*recall / (beta**2 * precision + recall))

print(precision_recall_fscore_support(y_true, y_hat, beta=1))
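# A short sanity check (added illustration): the closed-form F-beta
#   F_beta = (1 + beta^2) * p * r / (beta^2 * p + r)
# matches sklearn's fbeta_score for any beta. The toy labels are made up.
import numpy as np
from sklearn.metrics import fbeta_score, precision_score, recall_score

yt = [0, 1, 1, 0, 1, 1]
yh = [0, 1, 0, 0, 1, 0]
p_demo = precision_score(yt, yh)  # 1.0
r_demo = recall_score(yt, yh)     # 0.5
for b in (0.5, 1.0, 2.0):
    manual = (1 + b ** 2) * p_demo * r_demo / (b ** 2 * p_demo + r_demo)
    assert np.isclose(manual, fbeta_score(yt, yh, beta=b))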
def _test(num_classes, threshold, multilabel, average):
    fbeta = FBeta(beta=2.0, num_classes=num_classes, threshold=threshold,
                  multilabel=multilabel, average=average)
    f1 = F1Score(num_classes=num_classes, threshold=threshold,
                 multilabel=multilabel, average=average)

    outputs = torch.randn(100, 1)
    targets = torch.randint(0, 2, size=(100,))

    handler = _BaseInputHandler(num_classes=num_classes, average=average,
                                threshold=0.5, multilabel=multilabel)
    np_outputs, np_targets = handler._compute(outputs=outputs, targets=targets)

    fbeta.accumulate(outputs=outputs, targets=targets)
    f1.accumulate(outputs=outputs, targets=targets)
    fbeta_val = fbeta.value
    f1_val = f1.value

    assert fbeta.case_type == "binary"
    assert f1.case_type == "binary"

    fbeta_skm = fbeta_score(np_targets.numpy(), np_outputs.numpy(),
                            average="binary", beta=2.0)
    f1_skm = f1_score(np_targets.numpy(), np_outputs.numpy(), average="binary")

    assert fbeta_skm == pytest.approx(fbeta_val.item())
    assert f1_skm == pytest.approx(f1_val.item())

    # accumulating in mini-batches must match the single-pass values
    batch_size = 16
    iters = targets.shape[0] // batch_size + 1
    fbeta.reset()
    f1.reset()
    for i in range(iters):
        idx = i * batch_size
        fbeta.accumulate(outputs=outputs[idx:idx + batch_size],
                         targets=targets[idx:idx + batch_size])
        f1.accumulate(outputs=outputs[idx:idx + batch_size],
                      targets=targets[idx:idx + batch_size])
    f1_m = f1.value
    fbeta_m = fbeta.value

    assert f1_skm == pytest.approx(f1_m.item())
    assert fbeta_skm == pytest.approx(fbeta_m.item())
def ki_test(self, ckpth, list_path, img_root, test_id, test_batch,
            ntype='', real_sn=False, test_each=False):
    """
    Kinship identification test.
    :return: confusion matrix, F scores (real_sn only), and accuracy
    """
    self.Net.load(ckpth)
    self.infer = partial(self.Net.inference, net_type=ntype)
    test_set = self.dloader(list_path, img_root, test_id,
                            transform=test_transform, test=True,
                            test_each=test_each, real_sn=real_sn)
    test_loader = DataLoader(test_set, batch_size=test_batch)
    total_pred = []
    total_label = []
    self.Net.net.eval()
    with torch.no_grad():
        for data in test_loader:
            images, labels, _, _ = data
            images, labels = images.to(self.device), labels.to(self.device)
            if ntype == 'cascade':
                predicted = self.infer(images)
            else:
                outputs = self.infer(images)
                _, predicted = torch.max(outputs.data, 1)
            predicted = predicted.cpu().data.numpy()
            labels = labels.cpu().data.numpy()
            total_pred = np.concatenate((total_pred, predicted), axis=0)
            total_label = np.concatenate((total_label, labels), axis=0)
    confu_m = confusion_matrix(total_label, total_pred,
                               labels=[1, 2, 3, 4], normalize='true')
    acc = sum(total_label == total_pred) / len(total_label)
    if real_sn:
        # per-class F10 scores: restricting labels=[k] with average='macro'
        # reduces the macro average to the single class k
        f10_fd = fbeta_score(total_label, total_pred, labels=[1], beta=10, average='macro')
        f10_fs = fbeta_score(total_label, total_pred, labels=[2], beta=10, average='macro')
        f10_md = fbeta_score(total_label, total_pred, labels=[3], beta=10, average='macro')
        f10_ms = fbeta_score(total_label, total_pred, labels=[4], beta=10, average='macro')
        # macro-averaged F10 over all classes (previously misnamed micro_f1)
        macro_f10 = fbeta_score(total_label, total_pred, beta=10, average='macro')
        return confu_m, f10_fd, f10_fs, f10_md, f10_ms, macro_f10, acc
    elif test_each:
        f1 = f1_score(total_label, total_pred)
        return confu_m, f1, acc
    else:
        macro_f1 = f1_score(total_label, total_pred, average='macro')
        return confu_m, macro_f1, acc
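# Sketch (constructed example, not from the original code) of the per-class
# trick used above: passing labels=[k] with average='macro' reduces the macro
# average to the single class k, i.e. the per-class F-beta.
from sklearn.metrics import fbeta_score

yt = [1, 2, 1, 3, 4, 2]
yp = [1, 2, 2, 3, 4, 1]
print(fbeta_score(yt, yp, labels=[1], beta=10, average='macro'))  # F10 for class 1 only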
# calculate the F0.5 measure
from sklearn.metrics import fbeta_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score

# 50% precision, perfect recall: every sample is predicted positive
y_true = [0, 0, 0, 0, 0, 1, 1, 1, 1, 1]
y_pred = [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]
p = precision_score(y_true, y_pred)
r = recall_score(y_true, y_pred)
f = fbeta_score(y_true, y_pred, beta=0.5)
print('Result: p=%.3f, r=%.3f, f=%.3f' % (p, r, f))
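# Follow-up on the same data (added illustration): with p=0.5 and r=1.0,
# beta < 1 pulls the score toward precision and beta > 1 toward recall.
f05 = fbeta_score(y_true, y_pred, beta=0.5)  # ~0.556, near precision
f2 = fbeta_score(y_true, y_pred, beta=2.0)   # ~0.833, near recall
print('f0.5=%.3f, f2=%.3f' % (f05, f2))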
def train_nn(i):
    trainindex = train_df[train_df['CVindices'] != i].index.tolist()
    valindex = train_df[train_df['CVindices'] == i].index.tolist()
    X_val_df = train_df.iloc[valindex, :]
    X_build, X_valid = train_data_224_3[trainindex, :], train_data_224_3[valindex, :]
    y_build, y_valid = train_target_224_3[trainindex, :], train_target_224_3[valindex, :]
    print('Split train: ', len(X_build), len(y_build))
    print('Split valid: ', len(X_valid), len(y_valid))

    model = resnet152_model(ROWS, COLUMNS, CHANNELS, num_classes=17)
    # callbacks = [
    #     EarlyStopping(monitor='val_loss', patience=3, verbose=VERBOSEFLAG),
    # ]
    callbacks = [
        ModelCheckpoint(MODEL_WEIGHTS_FILE, monitor='val_loss',
                        save_best_only=True, verbose=1)
    ]
    # sgd = SGD(lr=1e-3, decay=1e-6, momentum=0.9, nesterov=True)
    adam = Adam(lr=0.01, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=1e-6)
    model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
    # model.compile(loss='categorical_crossentropy', optimizer="adadelta",
    #               metrics=["accuracy"])
    model.fit_generator(train_datagen.flow(X_build, y_build,
                                           batch_size=batch_size, shuffle=True),
                        samples_per_epoch=len(X_build),
                        nb_epoch=nb_epoch,
                        callbacks=callbacks,
                        validation_data=valid_datagen.flow(X_valid, y_valid,
                                                           batch_size=batch_size),
                        nb_val_samples=X_valid.shape[0],
                        verbose=VERBOSEFLAG)
    model.load_weights(MODEL_WEIGHTS_FILE)

    pred_cv = model.predict_generator(valid_datagen.flow(X_valid,
                                                         batch_size=batch_size,
                                                         shuffle=False),
                                      val_samples=X_valid.shape[0])
    # multilabel F2 at a fixed 0.2 threshold, averaged over samples
    print('F2 Score : ',
          fbeta_score(y_valid, np.array(pred_cv) > 0.2, beta=2, average='samples'))

    label_columns = [
        "slash_burn", "clear", "blooming", "primary", "cloudy",
        "conventional_mine", "water", "haze", "cultivation", "partly_cloudy",
        "artisinal_mine", "habitation", "bare_ground", "blow_down",
        "agriculture", "road", "selective_logging"
    ]
    pred_cv = pd.DataFrame(pred_cv)
    pred_cv.columns = label_columns
    pred_cv["image_name"] = X_val_df.image_name.values
    pred_cv = pred_cv[["image_name"] + label_columns]
    sub_valfile = inDir + '/submissions/Prav.resnet152_01.fold' + str(i) + '.csv'
    pred_cv.to_csv(sub_valfile, index=False)

    pred_test = model.predict_generator(test_datagen.flow(test_data_224_3,
                                                          batch_size=batch_size,
                                                          shuffle=False),
                                        val_samples=test_data_224_3.shape[0])
    pred_test = pd.DataFrame(pred_test)
    pred_test.columns = label_columns
    pred_test["image_name"] = test_all.image_name.values
    pred_test = pred_test[["image_name"] + label_columns]
    sub_file = inDir + '/submissions/Prav.resnet152_01.fold' + str(i) + '-test' + '.csv'
    pred_test.to_csv(sub_file, index=False)
def evaluate(
        self,
        sentences: Union[List[Sentence], Dataset],
        out_path: Union[str, Path] = None,
        embedding_storage_mode: str = "none",
        mini_batch_size: int = 32,
        num_workers: int = 8,
) -> (Result, float):

    # read Dataset into data loader (if list of sentences passed, make Dataset first)
    if not isinstance(sentences, Dataset):
        sentences = SentenceDataset(sentences)
    data_loader = DataLoader(sentences, batch_size=mini_batch_size,
                             num_workers=num_workers)

    # if span F1 needs to be used, use separate eval method
    if self._requires_span_F1_evaluation():
        return self._evaluate_with_span_F1(data_loader, embedding_storage_mode,
                                           mini_batch_size, out_path)

    # else, use scikit-learn to evaluate
    y_true = []
    y_pred = []
    labels = Dictionary(add_unk=False)

    eval_loss = 0
    batch_no: int = 0

    lines: List[str] = []

    for batch in data_loader:

        # predict for batch
        loss = self.predict(batch,
                            embedding_storage_mode=embedding_storage_mode,
                            mini_batch_size=mini_batch_size,
                            label_name='predicted',
                            return_loss=True)
        eval_loss += loss
        batch_no += 1

        for sentence in batch:
            for token in sentence:
                # add gold tag
                gold_tag = token.get_tag(self.tag_type).value
                y_true.append(labels.add_item(gold_tag))

                # add predicted tag
                predicted_tag = token.get_tag('predicted').value
                y_pred.append(labels.add_item(predicted_tag))

                # for file output
                lines.append(f'{token.text} {gold_tag} {predicted_tag}\n')

            lines.append('\n')

    if out_path:
        with open(Path(out_path), "w", encoding="utf-8") as outfile:
            outfile.write("".join(lines))

    eval_loss /= batch_no

    # use sklearn
    from sklearn import metrics

    # make "classification report"
    target_names = []
    for i in range(len(labels)):
        target_names.append(labels.get_item_for_index(i))
    classification_report = metrics.classification_report(
        y_true, y_pred, digits=4, target_names=target_names, zero_division=1)

    # get micro- and macro-averaged F-beta scores
    micro_f_score = round(
        metrics.fbeta_score(y_true, y_pred, beta=self.beta, average='micro'), 4)
    macro_f_score = round(
        metrics.fbeta_score(y_true, y_pred, beta=self.beta, average='macro'), 4)
    accuracy_score = round(metrics.accuracy_score(y_true, y_pred), 4)

    detailed_result = (
        "\nResults:"
        f"\n- F-score (micro) {micro_f_score}"
        f"\n- F-score (macro) {macro_f_score}"
        f"\n- Accuracy {accuracy_score}"
        '\n\nBy class:\n' + classification_report
    )

    # line for log file
    log_header = "ACCURACY"
    log_line = f"\t{accuracy_score}"

    result = Result(
        main_score=micro_f_score,
        log_line=log_line,
        log_header=log_header,
        detailed_results=detailed_result,
    )
    return result, eval_loss
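# A small constructed example of why the micro/macro distinction in the method
# above matters: on imbalanced label sets the two averages differ.
from sklearn.metrics import fbeta_score

yt = [0, 0, 0, 0, 1]
yp = [0, 0, 0, 1, 1]
print(fbeta_score(yt, yp, beta=1.0, average='micro'))  # 0.8 (equals accuracy here)
print(fbeta_score(yt, yp, beta=1.0, average='macro'))  # ~0.76, rare class drags it down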
total = np.add(total, inception_full_pred)
avg = total / 12  # average over the 12 accumulated model predictions
p_valid = np.vstack((p_valid, avg))

targets = np.zeros(17)
for t in tags.split(' '):
    targets[label_map[t]] = 1
Y_valid.append(targets)

Y_valid = np.array(Y_valid, np.uint8)
p_valid = np.delete(p_valid, 0, axis=0)  # drop the placeholder first row
print(Y_valid)
print(p_valid)
print(fbeta_score(Y_valid, np.array(p_valid) > 0.2, beta=2, average='samples'))
constant_threshold = find_f2score_threshold(p_valid, Y_valid, verbose=True)

# Find a different threshold for each label
if find_thresholds:
    out = np.array(p_valid)
    threshold = np.arange(0.1, 0.9, 0.1)
    acc = []
    accuracies = []
    best_threshold = np.zeros(out.shape[1])
    for i in range(out.shape[1]):
        y_prob = np.array(out[:, i])
        for j in threshold:
            y_pred = [1 if prob >= j else 0 for prob in y_prob]
                                                    random_state=30)

# In[49]:

from sklearn.metrics import make_scorer, fbeta_score, accuracy_score
from sklearn import metrics

ada_income = ada_Boosts.predict(X_test)
print(metrics.confusion_matrix(Y_test, ada_income))
print("Accuracy Score =", metrics.accuracy_score(Y_test, ada_income))
# print("Accuracy score on testing data: {:.4f}".format(
#     accuracy_score(Y_test, ada_income)))
print("F-score on testing data: {:.4f}".format(
    fbeta_score(Y_test, ada_income, beta=0.5)))

# In[97]:

from sklearn import metrics

adult_tree = tree.DecisionTreeClassifier(criterion='entropy', max_depth=4,
                                         max_leaf_nodes=5)
adult_tree.fit(X_train, Y_train)
predictions = adult_tree.predict(X_test)
print(metrics.confusion_matrix(Y_test, predictions))
print("Accuracy Score =", metrics.accuracy_score(Y_test, predictions))
# adult_tree = tree.DecisionTreeClassifier(criterion='entropy', max_depth=4,
#                                          max_leaf_nodes=5)
def test_precision_recall_f1_score_multilabel_1():
    """ Test precision_recall_f1_score on a crafted multilabel example
    """
    # First crafted example
    y_true_ll = [(0,), (1,), (2, 3)]
    y_pred_ll = [(1,), (1,), (2, 0)]
    lb = LabelBinarizer()
    lb.fit([range(4)])
    y_true_bi = lb.transform(y_true_ll)
    y_pred_bi = lb.transform(y_pred_ll)

    for y_true, y_pred in [(y_true_ll, y_pred_ll), (y_true_bi, y_pred_bi)]:
        p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
                                                     average=None)
        # tp = [0, 1, 1, 0]
        # fn = [1, 0, 0, 1]
        # fp = [1, 1, 0, 0]
        # Check per class
        assert_array_almost_equal(p, [0.0, 0.5, 1.0, 0.0], 2)
        assert_array_almost_equal(r, [0.0, 1.0, 1.0, 0.0], 2)
        assert_array_almost_equal(f, [0.0, 1 / 1.5, 1, 0.0], 2)
        assert_array_almost_equal(s, [1, 1, 1, 1], 2)

        f2 = fbeta_score(y_true, y_pred, beta=2, average=None)
        support = s
        assert_array_almost_equal(f2, [0, 0.83, 1, 0], 2)

        # Check macro
        p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
                                                     average="macro")
        assert_almost_equal(p, 1.5 / 4)
        assert_almost_equal(r, 0.5)
        assert_almost_equal(f, 2.5 / 1.5 * 0.25)
        assert_equal(s, None)
        assert_almost_equal(fbeta_score(y_true, y_pred, beta=2,
                                        average="macro"),
                            np.mean(f2))

        # Check micro
        p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
                                                     average="micro")
        assert_almost_equal(p, 0.5)
        assert_almost_equal(r, 0.5)
        assert_almost_equal(f, 0.5)
        assert_equal(s, None)
        assert_almost_equal(fbeta_score(y_true, y_pred, beta=2,
                                        average="micro"),
                            (1 + 4) * p * r / (4 * p + r))

        # Check weighted
        p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
                                                     average="weighted")
        assert_almost_equal(p, 1.5 / 4)
        assert_almost_equal(r, 0.5)
        assert_almost_equal(f, 2.5 / 1.5 * 0.25)
        assert_equal(s, None)
        assert_almost_equal(fbeta_score(y_true, y_pred, beta=2,
                                        average="weighted"),
                            np.average(f2, weights=support))

        # Check samples
        # |h(x_i) inter y_i | = [0, 1, 1]
        # |y_i| = [1, 1, 2]
        # |h(x_i)| = [1, 1, 2]
        p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
                                                     average="samples")
        assert_almost_equal(p, 0.5)
        assert_almost_equal(r, 0.5)
        assert_almost_equal(f, 0.5)
        assert_equal(s, None)
        assert_almost_equal(fbeta_score(y_true, y_pred, beta=2,
                                        average="samples"),
                            0.5)
def f2_score(y_true, y_pred):
    y_true, y_pred = np.array(y_true), np.array(y_pred)
    return fbeta_score(y_true, y_pred, beta=2, average='samples')
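# Usage sketch for f2_score on toy indicator matrices: with average='samples',
# F2 is computed per row over the label indicators and then averaged over rows.
import numpy as np

yt = np.array([[1, 0, 1], [0, 1, 0]])
yp = np.array([[1, 0, 0], [0, 1, 0]])
print(f2_score(yt, yp))  # mean of per-sample F2 values (~0.778)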
def test_precision_recall_f1_score_with_an_empty_prediction():
    y_true_ll = [(1,), (0,), (2, 1,)]
    y_pred_ll = [tuple(), (3,), (2, 1)]
    lb = LabelBinarizer()
    lb.fit([range(4)])
    y_true_bi = lb.transform(y_true_ll)
    y_pred_bi = lb.transform(y_pred_ll)

    for y_true, y_pred in [(y_true_ll, y_pred_ll), (y_true_bi, y_pred_bi)]:
        # true_pos = [ 0.  1.  1.  0.]
        # false_pos = [ 0.  0.  0.  1.]
        # false_neg = [ 1.  1.  0.  0.]
        p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
                                                     average=None)
        assert_array_almost_equal(p, [0.0, 1.0, 1.0, 0.0], 2)
        assert_array_almost_equal(r, [0.0, 0.5, 1.0, 0.0], 2)
        assert_array_almost_equal(f, [0.0, 1 / 1.5, 1, 0.0], 2)
        assert_array_almost_equal(s, [1, 2, 1, 0], 2)

        f2 = fbeta_score(y_true, y_pred, beta=2, average=None)
        support = s
        assert_array_almost_equal(f2, [0, 0.55, 1, 0], 2)

        p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
                                                     average="macro")
        assert_almost_equal(p, 0.5)
        assert_almost_equal(r, 1.5 / 4)
        assert_almost_equal(f, 2.5 / (4 * 1.5))
        assert_equal(s, None)
        assert_almost_equal(fbeta_score(y_true, y_pred, beta=2,
                                        average="macro"),
                            np.mean(f2))

        p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
                                                     average="micro")
        assert_almost_equal(p, 2 / 3)
        assert_almost_equal(r, 0.5)
        assert_almost_equal(f, 2 / 3 / (2 / 3 + 0.5))
        assert_equal(s, None)
        assert_almost_equal(fbeta_score(y_true, y_pred, beta=2,
                                        average="micro"),
                            (1 + 4) * p * r / (4 * p + r))

        p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
                                                     average="weighted")
        assert_almost_equal(p, 3 / 4)
        assert_almost_equal(r, 0.5)
        assert_almost_equal(f, (2 / 1.5 + 1) / 4)
        assert_equal(s, None)
        assert_almost_equal(fbeta_score(y_true, y_pred, beta=2,
                                        average="weighted"),
                            np.average(f2, weights=support))

        p, r, f, s = precision_recall_fscore_support(y_true, y_pred,
                                                     average="samples")
        # |h(x_i) inter y_i | = [0, 0, 2]
        # |y_i| = [1, 1, 2]
        # |h(x_i)| = [0, 1, 2]
        assert_almost_equal(p, 1 / 3)
        assert_almost_equal(r, 1 / 3)
        assert_almost_equal(f, 1 / 3)
        assert_equal(s, None)
        assert_almost_equal(fbeta_score(y_true, y_pred, beta=2,
                                        average="samples"),
                            0.333, 2)
def fbeta(model, X_valid, y_valid):
    p_valid = model.predict(X_valid)
    return fbeta_score(y_valid, np.array(p_valid) > 0.2, beta=2,
                       average='samples')
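# Hypothetical usage of the helper above with a stand-in model object: any
# estimator whose predict() returns per-label scores in [0, 1] fits, and 0.2
# is the fixed binarization threshold.
import numpy as np

class _DummyModel:
    def predict(self, X):
        # fake per-label probabilities for two samples and two labels
        return np.array([[0.9, 0.1], [0.3, 0.7]])

y_demo = np.array([[1, 0], [0, 1]])
print(fbeta(_DummyModel(), None, y_demo))  # per-sample F2 at threshold 0.2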