def test(self, model, data, filePath=None, writeOutput=False):
    # Start from a tiny epsilon so the F1 computation below never divides by zero.
    Pr_micro = 0.000001
    Re_micro = 0.000001
    Pr_macro = 0.000001
    Re_macro = 0.000001
    if writeOutput:
        f = open(filePath, 'w')
    for x, char_seq, y, seq_length, sentences in data:
        y_pred = model(x, char_seq, seq_length)
        y_pred = torch.argmax(y_pred, 2)
        if writeOutput:
            # Write each token with its predicted label, one sentence per block.
            for index in range(y.shape[0]):
                sentence = sentences[index].split('\n')
                pred_labels = [self.labels_inverse[i.item()] for i in y_pred[index]]
                for i in range(len(sentence)):
                    line = sentence[i]
                    f.write(line + ' ' + pred_labels[i] + '\n')
                f.write('\n')
        y_flat = y.view(y.shape[0] * y.shape[1])
        y_pred_flat = y_pred.view(y.shape[0] * y.shape[1])
        # Restrict to entity labels (> 1): gold positions for recall,
        # predicted positions for precision.
        index = np.where(y_flat > 1)
        index_pred = np.where(y_pred_flat > 1)
        pr = scorer(y_flat[index_pred], y_pred_flat[index_pred], average='macro')[0]
        re = scorer(y_flat[index], y_pred_flat[index], average='macro')[1]
        Pr_macro += pr
        Re_macro += re
        re = scorer(y_flat[index], y_pred_flat[index], average='micro')[1]
        pr = scorer(y_flat[index_pred], y_pred_flat[index_pred], average='micro')[0]
        Pr_micro += pr
        Re_micro += re
    print("Micro Pr, Re, F1")
    Pr_micro /= len(data)
    Re_micro /= len(data)
    F1_micro = (2 * Pr_micro * Re_micro) / (Pr_micro + Re_micro)
    print(Pr_micro, Re_micro, F1_micro)
    print("Macro Pr, Re, F1")
    Pr_macro /= len(data)
    Re_macro /= len(data)
    F1_macro = (2 * Pr_macro * Re_macro) / (Pr_macro + Re_macro)
    print(Pr_macro, Re_macro, F1_macro)
    if writeOutput:
        f.close()
    print(len(data))
    return F1_micro, F1_macro
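
# The method above appears to assume `scorer` is bound to sklearn's
# precision_recall_fscore_support (precision at index 0, recall at index 1);
# that binding is an inference, not stated in the snippet. A minimal sketch of
# the micro/macro distinction it averages over, with made-up labels:
from sklearn.metrics import precision_recall_fscore_support as scorer

y_true = [2, 2, 3, 3, 3, 4]
y_hat = [2, 3, 3, 3, 4, 4]
pr, re, f1, _ = scorer(y_true, y_hat, average='micro')  # pools all classes together
print(pr, re, f1)
pr, re, f1, _ = scorer(y_true, y_hat, average='macro')  # unweighted mean over classes
print(pr, re, f1)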
def score_B(labels, idx, val_set):
    """
    Strategy B: Loop over the predictions.
    Pros: Focuses on the correctness of the clusters themselves.
          Rewards homogeneous clusters.
    Cons: Ignores images that should have been clustered together but were not.
          Rewards smaller clusters with few errors in them.
    """
    y_true, y_pred = [], []
    for label in set(labels):
        if label == -1:  # skip noise points
            continue
        label_idx = [idx[i] for i in range(labels.shape[0]) if labels[i] == label]
        # Check every unordered pair within the cluster against the annotations.
        for i in range(len(label_idx) - 1):
            for j in range(i + 1, len(label_idx)):  # i + 1: never pair a point with itself
                pair = list(filter(
                    lambda r: (r[0] == int(label_idx[i]) and r[1] == int(label_idx[j]))
                              or (r[0] == int(label_idx[j]) and r[1] == int(label_idx[i])),
                    val_set))
                if len(pair) == 0:
                    continue
                together = pair.pop()[2]
                y_true.append(1)
                y_pred.append(together)
    sB = scorer(y_true, y_pred)
    return sB
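
# The inner filter() rescans val_set for every candidate pair, which is
# O(n^2 * |val_set|) per cluster. A sketch of the same lookup done once up
# front via a dict keyed on the unordered id pair (build_pair_lookup is a
# hypothetical helper, not part of the original code):
def build_pair_lookup(val_set):
    """Index (id1, id2, together) annotations by unordered id pair."""
    return {(min(a, b), max(a, b)): together for a, b, together in val_set}

# Inside the double loop, the filter() call would then reduce to:
#   together = pair_lookup.get((min(a, b), max(a, b)))  # None if unannotated
# with a, b = int(label_idx[i]), int(label_idx[j]).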
def score_A(labels, idx, val_set):
    """
    Strategy A: Loop over the annotations.
    Pros: Corresponds to a traditional classification performance score.
          Rewards clustering that exactly matches human annotations.
    Cons: Penalizes even if clusters themselves are homogeneous.
          Highly dependent on the size and quality of annotations.
    """
    label_lookup = {int(idx[i]): label for i, label in enumerate(labels)}
    y_true, y_pred = [], []
    for id1, id2, together in val_set:
        # Check if id1 and id2 have been predicted.
        label_of_id1 = label_lookup.get(id1, None)
        label_of_id2 = label_lookup.get(id2, None)
        if label_of_id1 is None or label_of_id2 is None:
            continue
        # Same cluster counts as "together", except the noise cluster -1.
        y_true.append(together)
        if label_of_id1 == label_of_id2 and label_of_id1 != -1:
            y_pred.append(1)
        else:
            y_pred.append(0)
    sA = scorer(y_true, y_pred)
    return sA
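
# A minimal, hypothetical run of the two strategies on synthetic data.
# Assumes `scorer` is already bound to a (y_true, y_pred) metric such as
# precision_recall_fscore_support; labels come from a clusterer where -1
# marks noise, idx aligns rows with image ids, and val_set holds human
# (id1, id2, together) annotations.
import numpy as np

labels = np.array([0, 0, 1, -1])      # predicted cluster per image
idx = np.array([10, 11, 12, 13])      # image id for each row of labels
val_set = [(10, 11, 1), (10, 12, 0), (12, 13, 1)]

print(score_A(labels, idx, val_set))  # annotation-centric score
print(score_B(labels, idx, val_set))  # cluster-centric score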
def evaluate_model(self, data):
    le = LabelEncoder()
    # Drop non-clustered points from the dataframe.
    data['cluster'] = data['cluster'].where(data['cluster'].str.len() > 0, np.nan)
    data.dropna(subset=['cluster'], inplace=True)
    # Separate cluster labels from data points.
    # (DataFrame.as_matrix was removed in pandas 1.0; .to_numpy() is the replacement.)
    points = data[data.columns[1:-1]].to_numpy()
    labels = data[data.columns[-1]].to_numpy()
    labels = le.fit_transform(labels)  # fit_transform already returns the encoding
    # Returns a numpy float64 score value.
    return scorer(points, labels)
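
# Given the scorer(points, labels) call signature and the float64 return
# value, `scorer` here is plausibly an unsupervised clustering metric such as
# sklearn.metrics.silhouette_score; that is an assumption, not confirmed by
# the snippet. A standalone sketch under that assumption:
import numpy as np
from sklearn.metrics import silhouette_score

points = np.array([[0.0, 0.1], [0.2, 0.0], [5.0, 5.1], [5.2, 4.9]])
labels = np.array([0, 0, 1, 1])
print(silhouette_score(points, labels))  # near 1.0 for well-separated clusters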
import numpy
import sklearn.base
import sklearn.cross_validation  # pre-0.18 sklearn; sklearn.model_selection in newer releases
import sklearn.metrics.scorer
import sklearn.utils.validation


def cross_val_score(estimator, X, y=None, scoring=None, cv=None):
    """Run cross-validation like normal but return (scores, predictions)"""
    ### TODO: Test this code. It's 100% untested!
    X, y = sklearn.utils.validation.indexable(X, y)
    cv = sklearn.cross_validation.check_cv(
        cv, X, y, classifier=sklearn.base.is_classifier(estimator))
    scorer = sklearn.metrics.scorer.check_scoring(estimator, scoring=scoring)
    y_pred = numpy.zeros_like(y)
    scores = []
    for train, test in cv:
        current_est = sklearn.base.clone(estimator).fit(X[train], y[train])
        predictions = current_est.predict(X[test])
        # sklearn scorer objects are called as scorer(estimator, X, y),
        # not scorer(y_true, y_pred).
        scores.append(scorer(current_est, X[test], y[test]))
        y_pred[test] = predictions
    return numpy.asarray(scores), y_pred
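
# A hypothetical invocation of the helper above (its author flags it as
# untested, and it targets the pre-0.18 sklearn API it imports):
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

iris = load_iris()
scores, y_pred = cross_val_score(LogisticRegression(), iris.data, iris.target,
                                 scoring='accuracy', cv=5)
print(scores.mean(), (y_pred == iris.target).mean())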
import argparse
import json

from sklearn.metrics import classification_report

parser = argparse.ArgumentParser()
parser.add_argument("test_data", help="formatted test data filename")
parser.add_argument("predictions", help="predicted classification data filename")
args = parser.parse_args()

with open(args.test_data, "r") as f:
    true_data = json.load(f)
with open(args.predictions, "r") as f:
    predicted_data = json.load(f)

true_labels = [obs[1]["NEtag"] for obs in true_data]

# Collect the predictions of each classifier (presumably maximum entropy,
# naive Bayes, and decision tree).
predicted_me = []
predicted_nb = []
predicted_dt = []
for obs in predicted_data:
    predicted_me.append(obs[1]["me_pred"])
    predicted_nb.append(obs[1]["nb_pred"])
    predicted_dt.append(obs[1]["dt_pred"])

# `scorer` is assumed to take (y_true, y_pred, average=...), e.g. sklearn's
# precision_recall_fscore_support.
for labels in [predicted_me, predicted_nb, predicted_dt]:
    print(scorer(true_labels, labels, average="micro"))
    print(classification_report(true_labels, labels))
+---------+-----------+----------------+\n\
| MAPE    | {:.4f}    | {:.4f}         |\n\
| R²      | {:.4f}    | {:.4f}         |\n\
+---------+-----------+----------------+\n\
'.format(mape(y_true=train_Y, y_pred=predicted_train_Y),
         mape(y_true=validation_Y, y_pred=predicted_validation_Y),
         r2_score(y_true=train_Y, y_pred=predicted_train_Y),
         r2_score(y_true=validation_Y, y_pred=predicted_validation_Y)))

logging.info('Generating learning curves')
# `scorer(mape)` presumably wraps the custom MAPE metric into an sklearn
# scoring callable (e.g. via make_scorer).
train_sizes, train_scores, valid_scores = learning_curve(
    TransformedLinearRegression(1500),
    train.iloc[:, :-1],
    train.iloc[:, -1],
    scoring=scorer(mape),
    cv=5)

# Plot configuration.
fig = plt.gcf()
fig.canvas.set_window_title(
    'Learning curves of Linear Regression model with Quantile Transformation')
plt.plot(train_sizes, train_scores.mean(axis=1), color='r', label='Training Score')
plt.plot(train_sizes, valid_scores.mean(axis=1), color='g', label='Validation Score')