def make_evaluation_report(predY, testY):
    """Compute a small multi-label evaluation report.

    Args:
        predY: Predicted label matrix. Coverage is attempted via
            ``predY.toarray()`` and then ``predY.todense()``.
        testY: Ground-truth label matrix. Coverage is first attempted via
            ``testY.toarray()`` and then with ``testY`` as given.

    Returns:
        list: ``[hamming_loss, one_error, coverage, f1score, precision]``.
        ``one_error`` is a fixed placeholder constant, and ``coverage`` is
        ``0.0`` when neither conversion strategy succeeds.
    """
    # Averaging mode for F1 / precision:
    # one of None, 'micro', 'macro', 'weighted', 'samples'.
    average_metric = 'micro'

    # 1) Hamming loss
    hl = hamming_loss(testY, predY)

    # 2) One-error is not computed here; a tiny constant is reported instead.
    one_error = 0.00000000000000001

    # 3) Coverage. Best effort: fall back step by step, but only on ordinary
    # errors so that KeyboardInterrupt / SystemExit still propagate.
    try:
        coverage = coverage_error(testY.toarray(), predY.toarray())
    except Exception:
        try:
            coverage = coverage_error(testY, predY.todense())
        except Exception:
            coverage = 0.0

    # 4) F1-score and precision
    f1score = metrics.f1_score(testY, predY, average=average_metric)
    precision = metrics.precision_score(testY, predY, average=average_metric)

    res = [hl, one_error, coverage, f1score, precision]
    return res
def coverage_error_scores(arr, arr1, arr2, arr3):
    """Print the coverage error of three classifier outputs against ``arr``.

    Args:
        arr: Ground-truth labels passed as the first argument of
            ``coverage_error``.
        arr1: Output reported under "Binary Relevance".
        arr2: Output reported under "Classifier Chain".
        arr3: Output reported under "LabelPowerSet".
    """
    captioned_outputs = (
        ("Using Binary Relevance: ", arr1),
        ("Using Classifier Chain: ", arr2),
        ("Using LabelPowerSet: ", arr3),
    )
    # All three scores are computed before anything is printed.
    rows = [
        (caption, coverage_error(arr, output))
        for caption, output in captioned_outputs
    ]

    print("Coverage Error Scores for the three classifiers are")
    for caption, value in rows:
        print(caption + str(value))
    print("\n")
def on_epoch_end(self, epoch, logs=None):
    """Print ROC-AUC and label-ranking metrics on ``self.x_test``.

    Args:
        epoch: Epoch index supplied by the training loop (unused).
        logs: Metric logs supplied by the training loop (unused). Defaults to
            ``None`` rather than a shared mutable dict.
    """
    result = self.model.predict(self.x_test)
    line_end = 100 * ' ' + '\n'

    micro_auc = metrics.roc_auc_score(self.y_test.ravel(), result.ravel())
    print('\r Micro val_roc_auc: %s' % (str(round(micro_auc, 4))), end=line_end)

    # One ROC curve per label column; the number of columns is read from the
    # label matrix instead of being fixed at 14.
    n_classes = self.y_test.shape[1]
    class_auc = {}
    for i in range(n_classes):
        fpr, tpr, _ = roc_curve(self.y_test[:, i], result[:, i])
        class_auc[i] = auc(fpr, tpr)
        print("Class " + str(i) + "auc = " + str(class_auc[i]))
    macro = sum(class_auc.values()) / n_classes
    print('\r Macro val_roc_auc: %s' % (str(round(macro, 4))), end=line_end)

    value = coverage_error(self.y_test, result)
    print('\r coverage_error: %s' % (str(round(value, 4))), end=line_end)

    value = label_ranking_loss(self.y_test, result)
    print('\r label_ranking_loss: %s' % (str(round(value, 4))), end=line_end)

    value = label_ranking_average_precision_score(self.y_test, result)
    print('\r label_ranking_average_precision_score: %s' % (str(round(value, 4))),
          end=line_end)
    return
def use_sklearn_ml_knn():
    """Train ``MLkNN2`` on the ``my_dataset`` arrays and print its losses.

    Loads train/test features and labels from ``my_dataset/*.npy``, maps every
    non-zero label entry to 1, fits the classifier with ``k=10`` and prints
    Hamming, 0-1, coverage, ranking and average-precision figures.

    :return: None
    """

    def _to_indicator(rows):
        # Zero stays 0; anything else becomes 1.
        return [[0 if label == 0 else 1 for label in row] for row in rows]

    base_path = os.getcwd()

    train_x = np.load(os.path.join(base_path, 'my_dataset/train_x.npy'), allow_pickle=True)
    train_y = np.load(os.path.join(base_path, 'my_dataset/train_y.npy'), allow_pickle=True)
    new_train_y = _to_indicator(train_y)

    test_x = np.load('my_dataset/test_x.npy', allow_pickle=True)
    test_y = np.load('my_dataset/test_y.npy', allow_pickle=True)
    new_test_y = np.array(_to_indicator(test_y))

    classifier = MLkNN2(train_x, np.array(new_train_y), k=10)
    classifier.fit()
    predictions = convert_prediction(classifier.predict(test_x))

    h_loss = hamming_loss(new_test_y, predictions)
    z = zero_one_loss(new_test_y, predictions)
    c = coverage_error(new_test_y, predictions)
    r = label_ranking_loss(new_test_y, predictions)
    a = average_precision_score(new_test_y, predictions)

    for caption, value in (
        ('hamming_loss = ', h_loss),
        ('0-1_loss = ', z),
        ('cover_loss = ', c),
        ('rank_loss = ', r),
        ('average_loss = ', a),
    ):
        print(caption, value)
def binary(X_train, X_test, y_train, y_test):
    """Fit a Binary Relevance model with an ``SVC`` base and report metrics.

    Returns:
        tuple: ``(hamming, subset_accuracy, recall, precision, f1, coverage,
        aps, rankingloss)``; each value is also printed.
    """
    print("Binary Relevance")
    model = BinaryRelevance(classifier=SVC(), require_dense=[True, True]).fit(X_train, y_train)
    y_pred = model.predict(X_test)

    micro = {'average': 'micro'}
    results = [
        ("Hamming: ", hamming_loss(y_test, y_pred)),
        ("Subset Accuracy: ", accuracy_score(y_test, y_pred)),
        ("Recall: ", recall_score(y_test, y_pred, **micro)),
        ("Precision: ", precision_score(y_test, y_pred, **micro)),
        ("F1: ", f1_score(y_test, y_pred, **micro)),
    ]

    # The ranking metrics are fed the dense form of the prediction.
    dense_pred = y_pred.toarray()
    results += [
        ("Coverage error: ", coverage_error(y_test, dense_pred)),
        ("Average Precision Score: ",
         label_ranking_average_precision_score(y_test, dense_pred)),
        ("Ranking Loss: ", label_ranking_loss(y_test, dense_pred)),
    ]

    for caption, value in results:
        print(caption + str(value))
    print("\n")
    return tuple(value for _, value in results)
def powerset(X_train, X_test, y_train, y_test, classifier):
    """Fit the model built by ``chooseClassifier`` and report metrics.

    Returns:
        tuple: ``(hamming, subset_accuracy, recall, precision, f1, coverage,
        aps, rankingloss)``; each value is also printed.
    """
    print("Label Powerset")
    model = chooseClassifier(classifier, X_train, y_train)
    y_pred = model.predict(X_test)

    scored = {}
    scored["Hamming"] = hamming_loss(y_test, y_pred)
    scored["Subset Accuracy"] = accuracy_score(y_test, y_pred)
    scored["Recall"] = recall_score(y_test, y_pred, average='micro')
    scored["Precision"] = precision_score(y_test, y_pred, average='micro')
    scored["F1"] = f1_score(y_test, y_pred, average='micro')

    # Ranking metrics receive the dense form of the prediction.
    as_dense = y_pred.toarray()
    scored["Coverage error"] = coverage_error(y_test, as_dense)
    scored["Average Precision Score"] = label_ranking_average_precision_score(y_test, as_dense)
    scored["Ranking Loss"] = label_ranking_loss(y_test, as_dense)

    for title, number in scored.items():
        print(title + ": " + str(number))
    print("\n")
    return tuple(scored.values())
def evaluation(y_pred, y_prob, y_true):
    """Collect multi-label metrics into a dictionary.

    ``y_prob`` feeds the ranking-based metrics, ``y_pred`` the label-based
    ones. ``y_true`` and ``y_pred`` are indexed row by row with ``.iloc``.

    Returns:
        dict: keys ``coverage_error``, ``ranking_loss``, ``hamming_loss``,
        ``f1_macro``, ``f1_micro``, ``Jaccard_Index`` and ``zero_one_error``.
    """
    coverage = coverage_error(y_true, y_prob)
    hamming = hamming_loss(y_true, y_pred)
    ranking_loss = label_ranking_loss(y_true, y_prob)
    f1_macro = metrics.f1_score(y_true, y_pred, average='macro')
    f1_micro = metrics.f1_score(y_true, y_pred, average='micro')

    # Mean of the per-row Jaccard similarity.
    n_rows = y_true.shape[0]
    jaccard_total = sum(
        jaccard_similarity_score(y_true.iloc[row, :], y_pred.iloc[row, :])
        for row in range(n_rows)
    )
    jaccard_index = jaccard_total / y_true.shape[0]

    zero_one = zero_one_loss(y_true, y_pred)  # 0-1 error

    return {
        "coverage_error": coverage,
        "ranking_loss": ranking_loss,
        "hamming_loss": hamming,
        "f1_macro": f1_macro,
        "f1_micro": f1_micro,
        "Jaccard_Index": jaccard_index,
        "zero_one_error": zero_one,
    }
def metric(pred_prob, label, inclusion_index_set, threshold=0.5):
    """Score the selected classes with AUC, F1, ranking and top-k metrics.

    ``pred_prob`` and ``label`` are indexed by class first
    (``[n_classes, n_samples]``); only the classes listed in
    ``inclusion_index_set`` are kept and the result is transposed before
    scoring.

    Returns:
        tuple: ``(macro_auc, micro_auc, micro_f1, macro_f1,
        micro_avg_precision, macro_avg_precision, coverage, ranking_loss,
        hamming, top_1, top_3, top_5, top_10, top_20, top_30, top_40, top_50)``.
    """
    picked = [(pred_prob[idx], label[idx]) for idx in inclusion_index_set]
    picked_prob = [p for p, _ in picked]
    picked_label = [t for _, t in picked]

    prob = np.array(picked_prob).transpose()
    pred = prob > threshold
    true = np.array(picked_label).transpose()

    micro_auc = roc_auc_score(true, prob, average='micro')
    macro_auc = roc_auc_score(true, prob, average='macro')
    micro_f1 = f1_score(true, pred, average='micro')
    macro_f1 = f1_score(true, pred, average='macro')
    micro_avg_precision = average_precision_score(true, prob, average='micro')
    macro_avg_precision = average_precision_score(true, prob, average='macro')
    coverage = coverage_error(true, prob)
    ranking_loss = label_ranking_loss(true, prob)
    hamming = hamming_loss(true, pred)

    # Pair each probability with its label along a trailing axis, then put
    # the second axis of ``prob`` first.
    stacked = np.concatenate(
        [prob[:, :, np.newaxis], true[:, :, np.newaxis]], axis=2
    )
    fuse = stacked.transpose([1, 0, 2])
    top_counts = tuple(top_k_num(fuse, k) for k in (1, 3, 5, 10, 20, 30, 40, 50))

    return (
        macro_auc, micro_auc, micro_f1, macro_f1,
        micro_avg_precision, macro_avg_precision,
        coverage, ranking_loss, hamming,
    ) + top_counts
def update_from_numpy(self, preds, labels):
    """Accumulate confusion counts and ranking metrics for one batch.

    Args:
        preds: Sequence of per-sample prediction rows.
        labels: Sequence of per-sample label rows, aligned with ``preds``.

    Side effects:
        Updates the global counters, the per-class entries in
        ``self.confusion``, ``self.n`` and the running sums
        ``ranking_loss``, ``coverage``, ``average_precision``, ``one_error``.
    """
    # zip(*rows) walks the data column by column, i.e. one class at a time.
    for pred, label, cls in zip(zip(*preds), zip(*labels), self.confusion):
        true_pos = np.sum([p and l for p, l in zip(pred, label)])
        true_neg = np.sum([not p and not l for p, l in zip(pred, label)])
        false_pos = np.sum([p and not l for p, l in zip(pred, label)])
        false_neg = np.sum([not p and l for p, l in zip(pred, label)])

        self.num_true_positives += true_pos
        # Fixed: true negatives used to be added to ``num_true_positives``.
        # NOTE(review): assumes the class initialises ``num_true_negatives``
        # alongside the other three counters - confirm.
        self.num_true_negatives += true_neg
        self.num_false_positives += false_pos
        self.num_false_negatives += false_neg

        cls["true_pos"] += true_pos
        cls["true_neg"] += true_neg
        cls["false_pos"] += false_pos
        cls["false_neg"] += false_neg
        cls["support"] += true_pos + false_neg

    # The sklearn metrics are batch means; weight them by batch size so they
    # can be summed across batches.
    n = len(preds)
    self.n += n
    self.ranking_loss += label_ranking_loss(labels, preds) * n
    self.coverage += coverage_error(labels, preds) * n
    self.average_precision += label_ranking_average_precision_score(
        labels, preds) * n

    # One-error: count samples whose highest-scored label is not a true label.
    for pred, label in zip(preds, labels):
        top_ranked = np.argsort(pred)[-1]
        true_indices = np.argwhere(label)
        if top_ranked not in true_indices:
            self.one_error += 1
def on_epoch_end(self, epoch, logs=None):
    """Print ROC-AUC and label-ranking metrics for the validation generator.

    Args:
        epoch: Epoch index supplied by the training loop (unused).
        logs: Metric logs supplied by the training loop (unused). Defaults to
            ``None`` rather than a shared mutable dict.
    """
    result = self.model.predict_generator(
        self.val_gen, steps=self.val_gen.n / BATCH, verbose=1)
    line_end = 100 * ' ' + '\n'

    # First target row next to its prediction, as a quick visual check.
    print(self.y[0])
    print(result[0])

    micro_auc = metrics.roc_auc_score(self.y.ravel(), result.ravel())
    print('\r Micro val_roc_auc: %s' % (str(round(micro_auc, 4))), end=line_end)

    # One ROC curve per label column; the number of columns is read from the
    # label matrix instead of being fixed at 14.
    class_auc = {}
    for i in range(self.y.shape[1]):
        fpr, tpr, _ = roc_curve(self.y[:, i], result[:, i])
        class_auc[i] = auc(fpr, tpr)
        print("Class " + str(i) + "auc = " + str(class_auc[i]))

    value = coverage_error(self.y, result)
    print('\r coverage_error: %s' % (str(round(value, 4))), end=line_end)

    value = label_ranking_loss(self.y, result)
    print('\r label_ranking_loss: %s' % (str(round(value, 4))), end=line_end)

    value = label_ranking_average_precision_score(self.y, result)
    print('\r label_ranking_average_precision_score: %s' % (str(round(value, 4))),
          end=line_end)
    return
def evaluate(predictions, labels, threshold=0.5):
    '''
    Compute ranking and thresholded multi-label metrics.

    True Positive  :  Label : 1, Prediction : 1
    False Positive :  Label : 0, Prediction : 1
    False Negative :  Label : 1, Prediction : 0
    True Negative  :  Label : 0, Prediction : 0
    Precision      :  TP/(TP + FP)
    Recall         :  TP/(TP + FN)
    F Score        :  2.P.R/(P + R)
    Ranking Loss   :  The average number of label pairs that are incorrectly
                      ordered given predictions
    Hamming Loss   :  The fraction of labels that are incorrectly predicted.
                      (Hamming Distance between predictions and labels)

    Args:
        predictions: Score array with the same shape as ``labels``.
        labels: Ground-truth indicator array.
        threshold: Scores ``>= threshold`` count as a positive prediction.

    Returns:
        dict with keys ``coverage``, ``average_precision``, ``ranking_loss``,
        ``bae``, ``patk``, ``hamming_loss`` and the micro/macro
        precision/recall/F1 values returned by ``bipartition_scores``.
    '''
    assert predictions.shape == labels.shape, "Shapes: %s, %s" % (
        predictions.shape, labels.shape,
    )
    metrics = dict()

    # Ranking metrics use the raw scores.
    metrics['coverage'] = coverage_error(labels, predictions)
    metrics['average_precision'] = label_ranking_average_precision_score(
        labels, predictions)
    metrics['ranking_loss'] = label_ranking_loss(labels, predictions)

    # Threshold into a new array. The previous in-place two-step rewrite
    # changed the caller's array and turned every entry into 0 whenever
    # ``threshold`` was greater than 1.
    binary_predictions = (predictions >= threshold).astype(predictions.dtype)

    metrics['bae'] = 0
    metrics['patk'] = patk(binary_predictions, labels)
    metrics['hamming_loss'] = hamming_loss(y_pred=binary_predictions, y_true=labels)
    metrics['micro_precision'], metrics['micro_recall'], metrics['micro_f1'], \
        metrics['macro_precision'], metrics['macro_recall'], metrics['macro_f1'] = \
        bipartition_scores(labels, binary_predictions)
    return metrics
def cross_validation(self, features):
    '''
    Standalone validation of an untrained classifier.

    Splits the features into a training part and a held-out part, trains on
    the former and scores the predictions on the latter.

    Warning: overwrites existing trained model

    Returns a tuple ``(real_cat, predicted_cat, accuracy, precision_score,
    error, loss, label_precision)``.
    '''
    values, classes, categories = self._features_to_values(features)
    values = np.nan_to_num(values)
    n_classes = len(categories)

    split_parts = train_test_split(
        values, classes,
        test_size=self.validation_split,
        random_state=self.seed,
    )
    training_values, test_values, training_classes, test_classes = split_parts

    self._train(np.array(training_values), training_classes, n_classes)
    predictions = self._predict(np.array(test_values))

    predicted_classes = np_utils.probas_to_classes(predictions)
    binary_labels = np_utils.to_categorical(test_classes)

    # compute the metrics
    accuracy = accuracy_score(test_classes, predicted_classes)
    avg_precision = average_precision_score(binary_labels, predictions)
    error = coverage_error(binary_labels, predictions)
    loss = label_ranking_loss(binary_labels, predictions)
    label_precision = label_ranking_average_precision_score(
        binary_labels, predictions)

    return (
        categories[test_classes],
        categories[predicted_classes],
        accuracy,
        avg_precision,
        error,
        loss,
        label_precision,
    )
def evaluate_model(model, X_test, y_test, category_names):
    """
    Evaluate the performance of the model on test data using coverage error
    and classification report

    Args:
        model: (sklearn.model_selectionGridSearchCV) estimator created from build_model()
        X_test: (pandas.DataFrame) containing the features for the test data
        y_test: (pandas.DataFrame) containing the multilabel targets for the test data
        category_names: (list) containing the target names from load_data()

    Returns:
        None

    Raises:
        Exception: "Could not evaluate model." when any step fails; the
            original error is attached as ``__cause__``.
    """
    try:
        # Predict once; the result is converted with ``.todense()`` for the
        # column slicing and coverage computation below.
        y_pred = model.predict(X_test).todense()
        y_test_avg_labels = round(np.mean(y_test.sum(axis=1)), 2)

        print("Printing classification report...\n")
        for i, col in enumerate(category_names):
            print(col)
            print(classification_report(y_test[col], y_pred[:, i]))
            print('-' * 60)

        print("\n Printing coverage error...\n")
        print(round(coverage_error(y_test, y_pred), 2))
        print(
            f"\n Average number of true labels per sample in test sample: {y_test_avg_labels}"
        )
    except Exception as err:
        # Keep the generic message callers may match on, but chain the real
        # cause and let KeyboardInterrupt / SystemExit pass through.
        raise Exception("Could not evaluate model.") from err
def compute_evaluation(true_matrix, predict_matrix):
    """Return ``[hamming_loss, zero_one_loss, coverage_error]`` for the pair."""
    return [
        hamming_loss(true_matrix, predict_matrix),
        zero_one_loss(true_matrix, predict_matrix),
        coverage_error(true_matrix, predict_matrix),
    ]
def evaluate_ouput(y_test, output):
    """Return coverage, average precision, ranking loss and one-error.

    Returns:
        dict: keys ``coverage``, ``average_precision``, ``ranking_loss`` and
        ``one_error``.
    """
    return {
        'coverage': coverage_error(y_test, output),
        'average_precision': label_ranking_average_precision_score(y_test, output),
        'ranking_loss': label_ranking_loss(y_test, output),
        # OneError takes the scores first and the ground truth second.
        'one_error': OneError(output, y_test),
    }
def compute_evaluation(true_matrix, predict_matrix):
    """Return ``[hamming, zero_one, coverage - 1, ranking_loss, avg_precision]``.

    The coverage entry is the ``coverage_error`` value minus one.
    """
    hamming = hamming_loss(true_matrix, predict_matrix)
    zero_one = zero_one_loss(true_matrix, predict_matrix)
    shifted_coverage = coverage_error(true_matrix, predict_matrix) - 1
    ranking = label_ranking_loss(true_matrix, predict_matrix)
    avg_precision = average_precision_score(true_matrix, predict_matrix)
    return [hamming, zero_one, shifted_coverage, ranking, avg_precision]
def coverage(vote, target, no_labels):
    """Coverage error of a single sample given sparse votes and targets.

    Args:
        vote: Mapping from label index to score; missing labels score 0.
        target: Iterable of label indices that are set to 1.0.
        no_labels: Length of the score and target vectors.

    Returns:
        The ``coverage_error`` of the one-row target/score pair.
    """
    score_row = zeros(no_labels)
    for label_idx, weight in vote.items():
        score_row[label_idx] = weight

    truth_row = zeros(no_labels)
    for label_idx in target:
        truth_row[label_idx] = 1.0

    # coverage_error is called with one-row 2-D arrays.
    single_sample = (1, no_labels)
    return coverage_error(
        truth_row.reshape(single_sample),
        score_row.reshape(single_sample),
    )
def get_score(y_true, y_pred, labels=None):
    """Collect ranking and classification scores into one dictionary.

    The four ranking scores (``lrap``, ``lrloss``, ``ndcg_score``,
    ``coverage_error``) are computed unconditionally and may raise. Every
    other score is best effort: if its computation fails the entry is stored
    as ``None``.

    Args:
        y_true: Ground-truth labels.
        y_pred: Predictions, passed unchanged to every metric.
        labels: Label names used as keys of the per-label ``precision`` /
            ``recall`` / ``f1`` dictionaries. When left as ``None`` the
            ``zip`` fails and those three entries are stored as ``None``.

    Returns:
        dict: score name -> value (or ``None``).
    """
    scores = {}
    scores["lrap"] = label_ranking_average_precision_score(y_true, y_pred)
    scores["lrloss"] = label_ranking_loss(y_true, y_pred)
    scores["ndcg_score"] = ndcg_score(y_true, y_pred)
    scores["coverage_error"] = coverage_error(y_true, y_pred)

    # ``except Exception`` (not a bare ``except``) keeps the best-effort
    # behaviour while letting KeyboardInterrupt / SystemExit propagate.
    try:
        scores["hamming_loss"] = hamming_loss(y_true, y_pred)
    except Exception:
        scores["hamming_loss"] = None

    try:
        scores["subset_accuracy"] = accuracy_score(y_true, y_pred)
    except Exception:
        scores["subset_accuracy"] = None

    # Per-label (unaveraged) scores, keyed by label name.
    try:
        p, r, f, _ = precision_recall_fscore_support(y_true, y_pred)
        scores["precision"], scores["recall"], scores["f1"] = (
            dict(zip(labels, list(sc))) for sc in (p, r, f)
        )
    except Exception:
        scores["precision"], scores["recall"], scores["f1"] = None, None, None

    try:
        scores["roc_auc"] = roc_auc_score(y_true, y_pred)
    except Exception:
        scores["roc_auc"] = None

    # Averaged variants, stored under suffixed keys such as "f1_micro".
    for avg in ("micro", "macro", "weighted", "samples"):
        avg_suffix = f"_{avg}"
        try:
            (
                scores[f"precision{avg_suffix}"],
                scores[f"recall{avg_suffix}"],
                scores[f"f1{avg_suffix}"],
                _,
            ) = precision_recall_fscore_support(y_true, y_pred, average=avg)
        except Exception:
            (
                scores[f"precision{avg_suffix}"],
                scores[f"recall{avg_suffix}"],
                scores[f"f1{avg_suffix}"],
            ) = (None, None, None)

        try:
            scores[f"roc_auc{avg_suffix}"] = roc_auc_score(
                y_true, y_pred, average=avg
            )
        except Exception:
            scores[f"roc_auc{avg_suffix}"] = None

    return scores
def get_classification_report_2(self, train_y, predicted_score, verbose=1):
    """Compute ranking metrics and log loss for ``predicted_score``.

    Args:
        train_y: Ground-truth labels.
        predicted_score: Scores passed to each metric.
        verbose: When truthy, each metric is printed with its name.

    Returns:
        list: ``[cov_err, label_rank_avg_prec, rank_loss, log_loss]``.
    """
    named_scorers = (
        ('CoverageError', metrics.coverage_error),
        ('LabelRankingAvgPrec', metrics.label_ranking_average_precision_score),
        ('LabelRankingLoss', metrics.label_ranking_loss),
        ('log_loss', metrics.log_loss),
    )
    results = [
        (title, scorer(train_y, predicted_score))
        for title, scorer in named_scorers
    ]

    if verbose:
        for title, value in results:
            print(title, value)

    return [value for _, value in results]
def print_predict(ground_truth, prediction, hyper_params):
    """Print a battery of multi-label metrics for ``prediction``.

    Score-based metrics are computed on the raw ``prediction``; the remaining
    ones on ``np.round(prediction)``. All values except ``precision_at_ks``
    are rounded to four decimals. ``hyper_params`` is accepted but unused.
    """
    rounded = 4

    def _r(value):
        return round(value, rounded)

    # --- metrics on raw scores ---
    auc_macro = _r(roc_auc_score(ground_truth, prediction, average='macro'))
    auc_micro = _r(roc_auc_score(ground_truth, prediction, average='micro'))
    # Coverage error is divided by the number of label columns.
    coverage = _r(coverage_error(ground_truth, prediction) / ground_truth.shape[1])
    rank_loss = _r(label_ranking_loss(ground_truth, prediction))
    one_err = _r(one_error(ground_truth, prediction))
    p_at_ks = precision_at_ks(ground_truth, prediction)
    logloss = _r(log_loss(ground_truth, prediction))
    avg_precision = _r(average_precision_score(ground_truth, prediction))

    # --- metrics on rounded predictions ---
    prediction = np.round(prediction)
    f1_micro = _r(f1_score(ground_truth, prediction, average='micro'))
    hamming = _r(hamming_loss(ground_truth, prediction))
    accuracy = _r(accuracy_score(ground_truth, prediction))
    recall_macro = _r(recall_score(ground_truth, prediction, average='macro'))
    recall_micro = _r(recall_score(ground_truth, prediction, average='micro'))
    precision_macro = _r(precision_score(ground_truth, prediction, average='macro'))
    precision_micro = _r(precision_score(ground_truth, prediction, average='micro'))
    jaccard_macro = _r(jaccard_score(ground_truth, prediction, average='macro'))
    jaccard_micro = _r(jaccard_score(ground_truth, prediction, average='micro'))

    report_lines = (
        ('Recall_score_macro: ', recall_macro),
        ('Recall_score_micro: ', recall_micro),
        ('Precision_score_macro: ', precision_macro),
        ('Precision_score_micro: ', precision_micro),
        ('Jaccard_score_macro: ', jaccard_macro),
        ('Jaccard_score_micro: ', jaccard_micro),
        ("Accuracy = ", accuracy),
        ('precision_at_ks: ', p_at_ks),
        ('Hamming_loss: ', hamming),
        ('Log_loss: ', logloss),
        ('Average_precision_score: ', avg_precision),
        ('F1_Micro ', f1_micro),
        ('One_error: ', one_err),
        ('Ranking loss: ', rank_loss),
        ('coverage: ', coverage),
        ('AUC-micro: ', auc_micro),
        ('AUC-macro: ', auc_macro),
    )
    for caption, value in report_lines:
        print(caption, value)
    print('\n')
def treino_binarizacao(X, Y):
    """Train an MLP on a split of ``(X, Y)`` and save ranking metrics to CSV.

    Fits an ``MLPClassifier`` on 33% of the data, predicts the rest, prints
    coverage error, label-ranking average precision and ranking loss, and
    writes those values together with the confusion matrices returned by
    ``matriz_confusao`` to a CSV file.

    Args:
        X: Feature matrix.
        Y: Target matrix.
    """
    mlb = MultiLabelBinarizer()
    # NOTE(review): the binarised targets are computed but the split below
    # trains on the raw ``Y`` - confirm which one is intended.
    Ybin = mlb.fit_transform(Y)

    mlp = neuralnetwork.MLPClassifier(hidden_layer_sizes=(10, 4),
                                      activation='tanh',
                                      solver='lbfgs',
                                      learning_rate='invscaling',
                                      random_state=2818,
                                      max_iter=400,
                                      early_stopping=True)

    x_train, x_test, y_train, y_test = model.train_test_split(X, Y, train_size=0.33)
    mlp.fit(x_train, y_train)
    y_pred = mlp.predict(x_test)

    # Each metric is computed once and reused for both the console output
    # and the CSV.
    erro_cobertura = metrics.coverage_error(y_test, y_pred)
    print("Erro de cobertura:" + str(erro_cobertura))
    precisao_media = metrics.label_ranking_average_precision_score(y_test, y_pred)
    print("Precisão média de labels:" + str(precisao_media))
    perda_ranks = metrics.label_ranking_loss(y_test, y_pred)
    print("Perda de ranks:" + str(perda_ranks))

    matriz = matriz_confusao(y_test, y_pred)

    results = {
        "Erro de cobertura": erro_cobertura,
        "Precisão média de labels": precisao_media,
        "Perda de ranks": perda_ranks,
        "Matrizes": matriz
    }
    res_df = pd.DataFrame(results)
    # Raw string: same path as before, without relying on invalid escape
    # sequences such as "\L" or "\d".
    res_df.to_csv(
        r"C:\Users\Livnick\Documents\dadosFocos\ResultadosMAcomMatriz2.csv")
def Coverage(labels, probs, mode=1):
    '''
    Measures how many steps are needed, on average, to cover all relevant
    labels of a sample.

    @labels: true labels of samples
    @probs: label probabilities of samples
    @mode: truthy -> use ``coverage_error``; falsy -> average ``_coverage``
           over the (probs, labels) pairs
    '''
    if not mode:
        return np.mean(list(map(_coverage, probs, labels)))
    return coverage_error(labels, probs)
def coverage_error(self):
    """
    Compute the coverage error and return it as a printable message.

    Coverage error is the average number of top-scored labels that must be
    included in the prediction so that no true label is missed. Its best
    possible value is therefore the average number of true labels, which is
    shown as the lower bound in the message.
    """
    # NOTE(review): this assignment stores a number under the same name as
    # this method on the instance, so a second call on the same object would
    # presumably fail - confirm whether callers read the attribute.
    self.coverage_error = metrics.coverage_error(
        self.ground_truth, self.predictions_raw)

    avg_true_labels = np.count_nonzero(self.ground_truth) / self.ntrials
    lower_bound_prefix = 'Coverage Error [' + str(avg_true_labels) + ', ~): '
    return lower_bound_prefix + str(self.coverage_error)
def get_avg_results(hat_y, y):
    """Return micro-averaged and ranking metrics for scores ``hat_y``.

    Returns:
        dict: keys ``avg_precision_micro``, ``roc_auc_score_micro``,
        ``coverage_error``, ``label_ranking_average_precision_score`` and
        ``label_ranking_loss``. Macro-averaged variants are not computed.
    """
    return {
        'avg_precision_micro': average_precision_score(y, hat_y, average='micro'),
        'roc_auc_score_micro': roc_auc_score(y, hat_y, average='micro'),
        'coverage_error': coverage_error(y, hat_y),
        'label_ranking_average_precision_score':
            label_ranking_average_precision_score(y, hat_y),
        'label_ranking_loss': label_ranking_loss(y, hat_y),
    }
def coverage_error(self):
    """
    Return a message with the coverage error of the raw predictions.

    The coverage error is the average number of top-scored labels needed so
    that every true label is included. The best achievable value equals the
    average number of true labels, printed as the interval's lower bound.
    """
    # NOTE(review): storing the score under the method's own name replaces
    # the method on this instance; a repeated call would presumably raise -
    # confirm against callers before renaming.
    score = metrics.coverage_error(self.ground_truth, self.predictions_raw)
    self.coverage_error = score

    true_label_count = np.count_nonzero(self.ground_truth)
    avg_true_labels = true_label_count / self.ntrials
    ce_message = 'Coverage Error [' + str(avg_true_labels) + ', ~): '
    return ce_message + str(self.coverage_error)
def coverage_err(y_true, y_pred):
    """
    Coverage error restricted to samples that have at least one label.

    For every sample: how far down the ranked list of predicted classes must
    we go to reach all actual labels? The mean over samples is the coverage
    error. Rows whose label sum is zero are dropped before scoring.

    :param y_true: array of shape (n_samples, n_labels)
    :param y_pred: array of shape (n_samples, n_labels)
    :return: coverage_error, float
    """
    truth, scores = np.asarray(y_true), np.asarray(y_pred)
    # Keep only rows whose labels do not sum to zero.
    has_labels = truth.sum(axis=1) != 0
    return coverage_error(truth[has_labels], scores[has_labels])
def _multi_class(y_true=0, y_pred=0):
    """Return the three label-ranking metrics as a one-row DataFrame.

    Columns: ``label_ranking_average_precision_score``,
    ``label_ranking_loss`` and ``coverage_error``.
    """
    lrap = label_ranking_average_precision_score(y_true, y_pred)
    rank_loss = label_ranking_loss(y_true, y_pred)
    cov = coverage_error(y_true, y_pred)

    row = np.zeros((1, 3))
    row[0] = (lrap, rank_loss, cov)

    column_names = [
        'label_ranking_average_precision_score',
        'label_ranking_loss',
        'coverage_error',
    ]
    return pd.DataFrame(data=row, columns=column_names)
def _generate_classification_reports(y_true, y_pred, target_names=None):
    """Build a text report: prediction report, per-label report, summary.

    The summary lines give coverage error, LRAP and total accuracy, each
    formatted to three decimals.
    """
    # Calculate additional stats
    total_accuracy = accuracy_score(y_true, y_pred)
    cov_error = coverage_error(y_true, y_pred)
    lrap = label_ranking_average_precision_score(y_true, y_pred)

    sections = [
        metrics.multilabel_prediction_report(y_true, y_pred),
        metrics.multilabel_classification_report(
            y_true, y_pred, target_names=target_names),
        '\n'.join([
            'coverage error: %.3f' % cov_error,
            'LRAP: %.3f' % lrap,
            'total accuracy: %.3f' % total_accuracy,
        ]),
    ]
    return '\n\n'.join(sections)
def eval_metrics(model):
    """Print zero-one loss, coverage error, LRAP and R2 for the test split.

    Gold labels are ``model["data"]["test"]["y"]`` run through
    ``model["target_codec"].transform``; predictions are read from
    ``model["test_predicted"]``.
    """
    y_gold = model["target_codec"].transform(model["data"]["test"]["y"])
    y_pred = model["test_predicted"]

    # Each metric is computed and printed before the next one is evaluated.
    reports = (
        ('Zero_One_loss: %.4f', zero_one_loss),
        ('coverage_error: %.4f', coverage_error),
        ('LRAP: %.4f', label_ranking_average_precision_score),
        ('r2_score: %.4f', r2_score),
    )
    for template, scorer in reports:
        print(template % scorer(y_gold, y_pred))
def multilabel_metrics(pred_list, verbose, extra_vars, split):
    '''
    Multilabel ranking metrics (coverage error, LRAP, ranking loss).

    see multilabel ranking metrics in sklearn library for more info:
        http://scikit-learn.org/stable/modules/model_evaluation.html#multilabel-ranking-metrics

    # Arguments
        pred_list, per-sample collections of predicted words
        verbose - if greater than 0 the metric measures are logged
        extra_vars - extra variables, indexed by split:
            extra_vars[split]['word2idx'] - dictionary mapping from words to indices
            extra_vars[split]['references'] - list of GT labels
        split - split on which we are evaluating

    # Returns
        dict with keys 'coverage error', 'average precision', 'ranking loss'
    '''
    word2idx = extra_vars[split]['word2idx']
    n_classes = len(word2idx)
    n_samples = len(pred_list)

    # Prediction matrix: 1 at (sample, index of each predicted word)
    y_pred = np.zeros((n_samples, n_classes))
    for row, predicted_words in enumerate(pred_list):
        for word in predicted_words:
            y_pred[row, word2idx[word]] = 1

    y_gt = np.array(extra_vars[split]['references'])

    coverr = sklearn_metrics.coverage_error(y_gt, y_pred)
    avgprec = sklearn_metrics.label_ranking_average_precision_score(y_gt, y_pred)
    rankloss = sklearn_metrics.label_ranking_loss(y_gt, y_pred)

    if verbose > 0:
        logging.info('Coverage Error (best: avg labels per sample = %f): %f' %
                     (np.sum(y_gt) / float(n_samples), coverr))
        logging.info('Label Ranking Average Precision (best: 1.0): %f' % avgprec)
        logging.info('Label Ranking Loss (best: 0.0): %f' % rankloss)

    return {
        'coverage error': coverr,
        'average precision': avgprec,
        'ranking loss': rankloss,
    }
def _generate_classification_reports(y_true, y_pred, target_names=None):
    """Assemble the multilabel text report followed by three summary lines.

    The report consists of the prediction report, a blank line, the
    per-label classification report, a blank line, and then coverage error,
    LRAP and total accuracy (three decimals each).
    """
    # Calculate additional stats
    total_accuracy = accuracy_score(y_true, y_pred)
    cov_error = coverage_error(y_true, y_pred)
    lrap = label_ranking_average_precision_score(y_true, y_pred)

    pieces = []
    pieces.append(metrics.multilabel_prediction_report(y_true, y_pred))
    pieces.append('\n\n')
    pieces.append(metrics.multilabel_classification_report(
        y_true, y_pred, target_names=target_names))
    pieces.append('\n\n')
    pieces.append('coverage error: %.3f' % cov_error)
    pieces.append('\n')
    pieces.append('LRAP: %.3f' % lrap)
    pieces.append('\n')
    pieces.append('total accuracy: %.3f' % total_accuracy)
    return ''.join(pieces)
def calcu_one_metric(scores, labels, metric, threshold=None):
    """Compute a single named multi-label metric.

    Args:
        scores: Score array indexed ``[sample, label]``.
        labels: Ground-truth indicator array with the same layout.
        metric: One of ``'mean_average_precision'``, ``'macro_auc'``,
            ``'micro_auc'``, ``'macro_f1'``, ``'micro_f1'``,
            ``'ranking_mean_average_precision'``, ``'coverage'``,
            ``'ranking_loss'`` or ``'one_error'``.
        threshold: Forwarded to ``pred_from_score`` for the F1 metrics.

    Returns:
        The metric value.

    Raises:
        ValueError: If ``metric`` is not one of the supported names.
    """
    # These metrics pass scores/labels through ``_filter_all_negative`` first.
    filtered_metrics = (
        'mean_average_precision', 'macro_auc', 'micro_auc',
        'macro_f1', 'micro_f1',
    )
    if metric in filtered_metrics:
        scores, labels = _filter_all_negative(scores, labels)

    if metric == 'mean_average_precision':
        return metrics.average_precision_score(labels, scores)
    if metric == 'macro_auc':
        return metrics.roc_auc_score(labels, scores, average='macro')
    if metric == 'micro_auc':
        return metrics.roc_auc_score(labels, scores, average='micro')
    if metric == 'macro_f1':
        pred = pred_from_score(scores, threshold)
        return metrics.f1_score(labels, pred, average='macro')
    if metric == 'micro_f1':
        pred = pred_from_score(scores, threshold)
        return metrics.f1_score(labels, pred, average='micro')
    if metric == 'ranking_mean_average_precision':
        return metrics.label_ranking_average_precision_score(labels, scores)
    if metric == 'coverage':
        # see http://scikit-learn.org/stable/modules/model_evaluation.html#coverage-error
        # The sklearn value is reported minus one.
        return metrics.coverage_error(labels, scores) - 1
    if metric == 'ranking_loss':
        return metrics.label_ranking_loss(labels, scores)
    if metric == 'one_error':
        # Fraction of samples whose top-scored label is not a true label.
        top_score = np.argmax(scores, axis=1)
        top_label = labels[range(len(top_score)), top_score]
        return 1 - np.sum(top_label) / len(top_label)

    # Raising a plain string is itself a TypeError in Python 3; raise a real
    # exception that carries the offending name.
    raise ValueError(f"unsupported metric: {metric}")
def get_metrics(y_true, y_score, y_binary_score):
    """
    Create the metrics dictionary containing all relevant metrics.

    ``y_score`` feeds the coverage error and top-N percentages;
    ``y_binary_score`` feeds precision, recall and F1. Per-label score
    arrays are only added when there are fewer than 100 label columns.
    """
    n_labels = y_true.shape[1]

    def _per_label(scorer):
        # One score per label column, returned as a plain list.
        column_scores = np.zeros(n_labels)
        for col in range(0, n_labels):
            column_scores[col] = scorer(y_true[:, col], y_binary_score[:, col])
        return column_scores.tolist()

    metrics = {}
    metrics['total_positive'] = np.sum(np.sum(y_binary_score))
    metrics['coverage_error'] = coverage_error(y_true, y_score)
    metrics['average_num_of_labels'] = round(
        float(np.sum(np.sum(y_true, axis=1))) / y_true.shape[0], 2)

    scorers = (
        ('precision', sklearn.metrics.precision_score),
        ('recall', sklearn.metrics.recall_score),
        ('f1', sklearn.metrics.f1_score),
    )
    for name, scorer in scorers:
        for averaging in ('micro', 'macro'):
            metrics[name + '_' + averaging] = scorer(
                y_true, y_binary_score, average=averaging)

    # only calculate those for cases with a small number of labels (sections only)
    if n_labels < 100:
        for name, scorer in scorers:
            metrics[name + '_scores_array'] = _per_label(scorer)

    for top_n in (1, 3, 5):
        metrics['top_' + str(top_n)] = get_top_N_percentage(
            y_score, y_true, max_N=top_n)
    return metrics
def evaluate(predictions, labels, threshold=0.4, multi_label=True):
    '''
    Compute classification metrics for multi-label or single-label output.

    True Positive  :  Label : 1, Prediction : 1
    False Positive :  Label : 0, Prediction : 1
    False Negative :  Label : 1, Prediction : 0
    True Negative  :  Label : 0, Prediction : 0
    Precision      :  TP/(TP + FP)
    Recall         :  TP/(TP + FN)
    F Score        :  2.P.R/(P + R)
    Ranking Loss   :  The average number of label pairs that are incorrectly
                      ordered given predictions
    Hamming Loss   :  The fraction of labels that are incorrectly predicted.
                      (Hamming Distance between predictions and labels)

    Args:
        predictions: Score array with the same shape as ``labels``.
        labels: Ground-truth indicator array.
        threshold: Scores ``>= threshold`` count as positive (multi-label
            branch only).
        multi_label: When falsy, both arrays are reduced with ``argmax`` and
            scored as single-label classification; the multi-label-only
            entries are then filled with 0.

    Returns:
        dict of metric name -> value.
    '''
    assert predictions.shape == labels.shape, "Shapes: %s, %s" % (
        predictions.shape, labels.shape,)
    metrics = dict()

    if not multi_label:
        metrics['bae'] = BAE(labels, predictions)
        labels, predictions = np.argmax(labels, axis=1), np.argmax(predictions, axis=1)

        metrics['accuracy'] = accuracy_score(labels, predictions)
        metrics['micro_precision'], metrics['micro_recall'], metrics['micro_f1'], _ = \
            precision_recall_fscore_support(labels, predictions, average='micro')
        # NOTE(review): this branch writes 'pak' while the multi-label branch
        # writes 'patk'; the key is kept as is because callers may read it.
        for key in ('macro_precision', 'macro_recall', 'macro_f1', 'coverage',
                    'average_precision', 'ranking_loss', 'pak', 'hamming_loss'):
            metrics[key] = 0
    else:
        # Ranking metrics use the raw scores.
        metrics['coverage'] = coverage_error(labels, predictions)
        metrics['average_precision'] = label_ranking_average_precision_score(
            labels, predictions)
        metrics['ranking_loss'] = label_ranking_loss(labels, predictions)

        # Threshold into a new array. The previous in-place two-step rewrite
        # changed the caller's array and turned every entry into 0 whenever
        # ``threshold`` was greater than 1.
        binary_predictions = (predictions >= threshold).astype(predictions.dtype)

        metrics['bae'] = 0
        metrics['patk'] = patk(binary_predictions, labels)
        metrics['micro_precision'], metrics['micro_recall'], metrics['micro_f1'], \
            metrics['macro_precision'], metrics['macro_recall'], metrics['macro_f1'] = \
            bipartition_scores(labels, binary_predictions)

    return metrics
def multilabel_metrics(pred_list, verbose, extra_vars, split):
    """
    Multilabel ranking and classification metrics, computed with scikit-learn.
    See the multilabel ranking metrics in the sklearn library for more info:
        http://scikit-learn.org/stable/modules/model_evaluation.html#multilabel-ranking-metrics

    :param pred_list: sized iterable of samples, each one an iterable of predicted words
    :param verbose: if greater than 0 the metric measures are logged
    :param extra_vars: extra variables, indexed by split
                        extra_vars[split]['word2idx'] - dictionary mapping from words to indices
                        extra_vars[split]['references'] - per-sample GT label indicators
                        extra_vars[split]['raw2basic'] - (optional) dictionary mapping raw words
                                                         to indices of more general labels
    :param split: split on which we are evaluating
    :return: Dictionary with the keys 'coverage_error', 'average_precision', 'ranking_loss',
             'precision', 'recall' and 'f1'
    """
    from sklearn import metrics as sklearn_metrics

    split_vars = extra_vars[split]
    word2idx = split_vars['word2idx']

    # When a raw -> basic label dictionary is supplied, evaluation is carried
    # out over the (more general) basic labels instead of the raw vocabulary.
    raw2basic = split_vars.get('raw2basic', None)
    use_raw_labels = raw2basic is None
    if use_raw_labels:
        n_classes = len(word2idx)
    else:
        logging.info('Applying general evaluation with raw2basic dictionary.')
        n_classes = len(set(raw2basic.values()))
    n_samples = len(pred_list)

    # Binary prediction matrix: one row per sample, one column per class
    y_pred = np.zeros((n_samples, n_classes))
    if use_raw_labels:
        for row, sample in enumerate(pred_list):
            for word in sample:
                y_pred[row, word2idx[word]] = 1
    else:
        for row, sample in enumerate(pred_list):
            for word in sample:
                y_pred[row, raw2basic[word.strip()]] = 1

    # Ground-truth matrix
    gt_list = split_vars['references']
    if use_raw_labels:
        y_gt = np.array(gt_list)
    else:
        # Invert word2idx so active raw indices can be mapped to basic labels
        idx2word = {idx: word for word, idx in iteritems(word2idx)}
        y_gt = np.zeros((n_samples, n_classes))
        for row, flags in enumerate(gt_list):
            for raw_idx, is_active in enumerate(flags):
                if not is_active:
                    continue
                y_gt[row, raw2basic[idx2word[raw_idx].strip()]] = 1

    # Ranking metrics
    coverr = sklearn_metrics.coverage_error(y_gt, y_pred)
    avgprec = sklearn_metrics.label_ranking_average_precision_score(y_gt, y_pred)
    rankloss = sklearn_metrics.label_ranking_loss(y_gt, y_pred)
    # Micro-averaged precision, recall and F1 (the support is discarded)
    prf = sklearn_metrics.precision_recall_fscore_support(y_gt, y_pred, average='micro')
    precision, recall, f1 = prf[0], prf[1], prf[2]

    if verbose > 0:
        avg_labels = float(np.sum(y_gt)) / float(n_samples)
        logging.info(
            '"coverage_error" (best: avg labels per sample = %f): %f' % (avg_labels, coverr))
        logging.info('Label Ranking "average_precision" (best: 1.0): %f' % avgprec)
        logging.info('Label "ranking_loss" (best: 0.0): %f' % rankloss)
        logging.info('precision: %f' % precision)
        logging.info('recall: %f' % recall)
        logging.info('f1: %f' % f1)

    return {
        'coverage_error': coverr,
        'average_precision': avgprec,
        'ranking_loss': rankloss,
        'precision': precision,
        'recall': recall,
        'f1': f1,
    }
im.close() xTrain /= 255 #xTrain = xTrain.reshape(xTrain.shape[0], 1, 40, 30).astype('float32') #print(xTrain.shape) #xTrain /= xTrain.std(axis = None) #xTrain -= xTrain.mean() y = np.array([int(x[-1:]) for x in trainingLabels['classname']]).astype('int32') #y = to_categorical(y, 10) print(y.shape) x_fit, x_eval, y_fit, y_eval = cross_validation.train_test_split(xTrain, y, test_size=0.2) clf = xgb.XGBClassifier(objective='multi:softmax', n_estimators=200, learning_rate=0.05, max_depth=20, nthread=4, subsample=0.7, colsample_bytree=0.85, seed=2471) clf.fit(x_fit, y_fit, early_stopping_rounds=20, eval_metric='mlogloss', eval_set=[(x_eval, y_eval)]) clf.fit(xTrain, y) predictY = clf.predict_proba(xTrain) from sklearn import metrics y = to_categorical(y, 10) print(metrics.coverage_error(y, predictY)) with open(pickleFile,'wb') as f: sys.setrecursionlimit(20000) pickle.dump(clf, f)
def test_coverage_error():
    """Check ``coverage_error`` on single-sample toy inputs and on small batches."""
    # Toy case: each entry is one score vector together with the
    # (label vector, expected coverage) pairs evaluated against it.
    toy_cases = [
        ([0.25, 0.75], [
            ([0, 1], 1),
        ]),
        ([0.75, 0.25], [
            ([0, 1], 2),
            ([1, 1], 2),
            ([0, 0], 0),
        ]),
        ([0.25, 0.5, 0.75], [
            ([0, 0, 0], 0),
            ([0, 0, 1], 1),
            ([0, 1, 0], 2),
            ([0, 1, 1], 2),
            ([1, 0, 0], 3),
            ([1, 0, 1], 3),
            ([1, 1, 0], 3),
            ([1, 1, 1], 3),
        ]),
        ([0.75, 0.5, 0.25], [
            ([0, 0, 0], 0),
            ([0, 0, 1], 3),
            ([0, 1, 0], 2),
            ([0, 1, 1], 3),
            ([1, 0, 0], 1),
            ([1, 0, 1], 3),
            ([1, 1, 0], 2),
            ([1, 1, 1], 3),
        ]),
        ([0.5, 0.75, 0.25], [
            ([0, 0, 0], 0),
            ([0, 0, 1], 3),
            ([0, 1, 0], 1),
            ([0, 1, 1], 3),
            ([1, 0, 0], 2),
            ([1, 0, 1], 3),
            ([1, 1, 0], 2),
            ([1, 1, 1], 3),
        ]),
    ]
    for y_score, labelled in toy_cases:
        for y_true, expected in labelled:
            assert_almost_equal(coverage_error([y_true], [y_score]), expected)

    # Non trivial case: several samples, the result is the mean over samples
    two_sample_truth = [[0, 1, 0], [1, 1, 0]]
    assert_almost_equal(
        coverage_error(two_sample_truth, [[0.1, 10., -3], [0, 1, 3]]),
        (1 + 3) / 2.)

    three_sample_truth = [[0, 1, 0], [1, 1, 0], [0, 1, 1]]
    assert_almost_equal(
        coverage_error(three_sample_truth,
                       [[0.1, 10, -3], [0, 1, 3], [0, 2, 0]]),
        (1 + 3 + 3) / 3.)
    assert_almost_equal(
        coverage_error(three_sample_truth,
                       [[0.1, 10, -3], [3, 1, 3], [0, 2, 0]]),
        (1 + 3 + 3) / 3.)
def test_coverage_tie_handling():
    """Check ``coverage_error`` when several labels share the same score."""
    # Each entry is one score vector containing ties, together with the
    # (label vector, expected coverage) pairs evaluated against it.
    tied_cases = [
        ([0.5, 0.5], [
            ([0, 0], 0),
            ([1, 0], 2),
            ([0, 1], 2),
            ([1, 1], 2),
        ]),
        ([0.25, 0.5, 0.5], [
            ([0, 0, 0], 0),
            ([0, 0, 1], 2),
            ([0, 1, 0], 2),
            ([0, 1, 1], 2),
            ([1, 0, 0], 3),
            ([1, 0, 1], 3),
            ([1, 1, 0], 3),
            ([1, 1, 1], 3),
        ]),
    ]
    for y_score, labelled in tied_cases:
        for y_true, expected in labelled:
            assert_almost_equal(coverage_error([y_true], [y_score]), expected)
def coverage_error(self):
    """Compute the coverage error and return it as a one-line report.

    Reads ``self.ground_truth``, ``self.predictions_raw`` and ``self.ntrials``.
    The score is stored on the instance as ``self.coverage_error``; the
    returned string also shows the number of non-zero ground-truth entries
    divided by ``self.ntrials`` as the lower end of the displayed range.
    """
    score = metrics.coverage_error(self.ground_truth, self.predictions_raw)
    # NOTE(review): this assignment replaces the bound method of the same
    # name on this instance with the numeric score, so calling
    # ``coverage_error()`` a second time on the same object will fail.
    # Kept as-is because callers may read the attribute afterwards.
    self.coverage_error = score
    labels_per_trial = np.count_nonzero(self.ground_truth) / self.ntrials
    pieces = ('Coverage Error [', str(labels_per_trial), ', ~): ', str(self.coverage_error))
    return ''.join(pieces)