def evaluate_on_batch(self, sess, inputs_batch, labels_batch, log=True):
    """Return the error rate after evaluating on the provided batch of data.

    Note: despite what the original docstring claimed, this returns the
    classification error rate (1 - accuracy), not the training loss.

    Args:
        sess: tf.Session()
        inputs_batch: np.ndarray of shape (n_samples, n_features)
        labels_batch: np.ndarray of shape (n_samples,)
        log: if True, build and print a confusion matrix over the batch.
    Returns:
        error: fraction of misclassified samples in the batch (a scalar)
    """
    feed = self.create_feed_dict(inputs_batch, labels_batch=labels_batch)
    # Run the existing prediction tensor and take the argmax in numpy.
    # The original called tf.argmax(self.pred, ...) here, which adds a new
    # op to the graph on every evaluation call -- a slow graph-growth leak
    # in TF1 graph mode. np.argmax on the fetched values is equivalent.
    pred_scores = sess.run(self.pred, feed_dict=feed)
    output = np.argmax(pred_scores, axis=1)
    num_correct = 0
    if log:
        # Labels are sorted so the confusion matrix axes are deterministic.
        confusion_matrix = ConfusionMatrix(np.sort(np.unique(labels_batch)))
    for i in range(inputs_batch.shape[0]):
        y = labels_batch[i]
        y_hat = output[i]
        if log:
            confusion_matrix.update(y, y_hat)
        if y == y_hat:
            num_correct += 1
    if log:
        print(confusion_matrix.as_table())
    return 1 - (1.0 * num_correct / inputs_batch.shape[0])
def evaluate(self, sess, examples, examples_raw):
    """Evaluates model performance on @examples.

    This function uses the model to predict labels for @examples and
    constructs a confusion matrix.

    Args:
        sess: the current TensorFlow session.
        examples: A list of vectorized input/output pairs.
        examples_raw: A list of the original input/output sequence pairs.
    Returns:
        The token-level ConfusionMatrix and the chunk-level
        (precision, recall, F1) tuple for predicting tokens as
        named entities.
    """
    token_cm = ConfusionMatrix(labels=LBLS)
    correct_preds, total_correct, total_preds = 0., 0., 0.
    # self.output is assumed to yield (sentence, gold_labels,
    # predicted_labels) triples -- TODO(review): confirm ordering against
    # self.output's definition.
    for _, labels, labels_ in self.output(sess, examples_raw, examples):
        # Token-level counts.
        for l, l_ in zip(labels, labels_):
            token_cm.update(l, l_)
        # Chunk-level (entity) scoring: compare sets of chunks extracted
        # from gold vs. predicted label sequences.
        gold = set(get_chunks(labels))
        pred = set(get_chunks(labels_))
        correct_preds += len(gold.intersection(pred))
        total_preds += len(pred)
        total_correct += len(gold)
    # Guarding on correct_preds avoids division by zero: correct_preds > 0
    # implies both total_preds > 0 and total_correct > 0.
    p = correct_preds / total_preds if correct_preds > 0 else 0
    r = correct_preds / total_correct if correct_preds > 0 else 0
    f1 = 2 * p * r / (p + r) if correct_preds > 0 else 0
    return token_cm, (p, r, f1)
def evaluate(self, examples):
    """Evaluate model performance on @examples.

    Runs the predictor over @examples, accumulating a token-level
    confusion matrix and chunk-level precision/recall/F1.

    Returns:
        A (ConfusionMatrix, (precision, recall, f1)) tuple.
    """
    cm = ConfusionMatrix(labels=LBLS)
    n_correct = 0.
    n_gold = 0.
    n_guessed = 0.
    # Each prediction is a (raw_input, gold_labels, guessed_labels) triple.
    for _, gold_labels, guessed_labels in self._predictor.predict(examples):
        for gold_tok, guessed_tok in zip(gold_labels, guessed_labels):
            cm.update(gold_tok, guessed_tok)
        gold_chunks = set(get_chunks(gold_labels))
        guessed_chunks = set(get_chunks(guessed_labels))
        n_correct += len(gold_chunks & guessed_chunks)
        n_guessed += len(guessed_chunks)
        n_gold += len(gold_chunks)
    if n_correct > 0:
        p = n_correct / n_guessed
        r = n_correct / n_gold
        f1 = 2 * p * r / (p + r)
    else:
        p, r, f1 = 0, 0, 0
    return cm, (p, r, f1)
def evaluate_prediction(self, session, batch_size, dataset):
    """Evaluate the model on @dataset, printing per-run statistics.

    Args:
        session: the current TensorFlow session.
        batch_size: minibatch size passed to minibatches() and predict().
        dataset: project-specific dataset structure; batches unpack into
            five fields, the last being the gold labels. Accuracy divides
            by len(dataset[0]) -- assumes dataset[0] holds one entry per
            example. TODO(review): confirm against the dataset layout.
    Returns:
        A (accuracy, average_loss, confusion_matrix) tuple.
    """
    print("\nEVALUATING")
    cm = ConfusionMatrix(labels=self.LBLS)
    total_loss = 0
    total_correct = 0
    num_batches = 0
    for batch in minibatches(dataset, batch_size, bucket=self.bucket):
        probs, loss = self.predict(session, batch_size, batch)
        # Only the gold labels are needed from the batch tuple.
        _, _, _, _, goldlabels = batch
        # NOTE: xrange -- this code targets Python 2.
        for i in xrange(len(probs)):
            # Example-level accuracy via label_to_name; presumably it maps
            # a probability/one-hot vector to a label name -- verify.
            total_correct += 1 if label_to_name(probs[i]) == label_to_name(goldlabels[i]) else 0
            # Confusion matrix is updated with argmax indices.
            gold_idx = np.argmax(goldlabels[i])
            predicted_idx = np.argmax(probs[i])
            cm.update(gold_idx, predicted_idx)
        total_loss += loss
        num_batches += 1
    accuracy = total_correct / float(len(dataset[0]))
    print("Accuracy: " + str(accuracy))
    # Average of per-batch losses (not weighted by batch size).
    average_loss = total_loss / float(num_batches)
    print("Average Loss: " + str(average_loss))
    print("Token-level confusion matrix:\n" + cm.as_table())
    print("Token-level scores:\n" + cm.summary())
    return (accuracy, average_loss, cm)
def evaluate(model, X, Y):
    """Evaluate *model* on inputs X against one-hot gold labels Y.

    Prints a confusion matrix table and returns its summary.
    """
    cm = ConfusionMatrix(labels=LBLS)
    predictions = model.predict(X)
    for gold_row, pred_row in zip(Y, predictions):
        cm.update(np.argmax(gold_row), np.argmax(pred_row))
    cm.print_table()
    return cm.summary()
def evaluate(self, sess, examples):
    """Evaluates model performance on @examples.

    This function uses the model to predict labels for @examples and
    constructs a confusion matrix, along with precision/recall/F1 over the
    "related" classes and the FNC-style weighted score.

    Args:
        sess: the current TensorFlow session.
        examples: A list of vectorized input/output pairs.
    Returns:
        (token_cm, (p, r, f1), (unrelated_ratio, score_ratio)) where
        unrelated_ratio is the score an all-UNRELATED baseline would get
        and score_ratio is this model's score, both normalized by the
        maximum achievable score.
    """
    # TODO(akshayka): Implement a report that tells us the inputs
    # on which we guessed incorrectly
    token_cm = ConfusionMatrix(labels=LBLS, default_label=UNRELATED)
    correct_guessed_related, total_gold_related, total_guessed_related = (
        0., 0., 0.)
    _, labels, labels_hat = self.output(sess, examples)
    score = 0
    num_unrelated = len([l for l in labels if l == UNRELATED])
    num_related = len(labels) - num_unrelated
    # An all-UNRELATED baseline earns 0.25 per unrelated example; the
    # maximum achievable score adds 1.0 per related example.
    unrelated_score = 0.25 * num_unrelated
    max_score = unrelated_score + 1.0 * num_related
    for l, l_hat in zip(labels, labels_hat):
        token_cm.update(l, l_hat)
        # Weighted scoring: 0.25 for any correct label, +0.50 more when a
        # related example is labeled correctly, and +0.25 whenever both
        # gold and guess fall in the related classes.
        if l == l_hat:
            score += 0.25
            if l != UNRELATED:
                score += 0.5
        if l in RELATED and l_hat in RELATED:
            score += 0.25
        if l == l_hat and l in RELATED:
            correct_guessed_related += 1
        if l in RELATED:
            total_gold_related += 1
        if l_hat in RELATED:
            total_guessed_related += 1
    p = correct_guessed_related / total_guessed_related if \
        total_guessed_related > 0 else 0
    r = correct_guessed_related / total_gold_related if \
        total_gold_related > 0 else 0
    if total_guessed_related == 0:
        # logging.warn is a deprecated alias for logging.warning.
        logging.warning("total_guessed_related == 0!")
    if total_gold_related == 0:
        logging.warning("total_gold_related == 0!")
    f1 = 2 * p * r / (p + r) if p + r > 0 else 0
    # Guard against an empty example list (max_score == 0 would otherwise
    # raise ZeroDivisionError).
    unrelated_ratio = unrelated_score / max_score if max_score > 0 else 0
    score_ratio = score / max_score if max_score > 0 else 0
    return token_cm, (p, r, f1), (unrelated_ratio, score_ratio)
def generate_cm(real, pred, n_class):
    """Build a confusion matrix from parallel gold/predicted label lists.

    Args:
        real: iterable of gold labels (values in [0, n_class)).
        pred: iterable of predicted labels, aligned with real.
        n_class: number of distinct classes.
    Returns:
        A ConfusionMatrix whose labels are str(0) .. str(n_class - 1).
    """
    # Use a local name instead of rebinding LBLS: the original shadowed the
    # module-level LBLS constant that sibling evaluate functions rely on,
    # which is confusing to readers even though the binding was local.
    # (xrange is kept deliberately -- this file targets Python 2.)
    labels = [str(x) for x in xrange(n_class)]
    token_cm = ConfusionMatrix(labels=labels)
    for l, l_ in zip(real, pred):
        token_cm.update(l, l_)
    return token_cm
def evaluate(self, sess, examples, examples_raw):
    """Evaluate model performance on @examples.

    Accumulates a token-level confusion matrix and chunk-level
    precision/recall/F1 from the model's predicted label sequences.

    Returns:
        A (ConfusionMatrix, (precision, recall, f1)) tuple.
    """
    cm = ConfusionMatrix(labels=LBLS)
    hits = 0.0
    gold_total = 0.0
    guess_total = 0.0
    for _, gold_seq, guess_seq in self.output(sess, examples_raw, examples):
        # Token-level tallies.
        for gold_label, guess_label in zip(gold_seq, guess_seq):
            cm.update(gold_label, guess_label)
        # Chunk-level tallies over extracted chunk sets.
        gold_chunks = set(get_chunks(gold_seq))
        guess_chunks = set(get_chunks(guess_seq))
        hits += len(gold_chunks & guess_chunks)
        guess_total += len(guess_chunks)
        gold_total += len(gold_chunks)
    p = hits / guess_total if hits > 0 else 0
    r = hits / gold_total if hits > 0 else 0
    f1 = (2 * p * r / (p + r)) if hits > 0 else 0
    return cm, (p, r, f1)