def compute_binary_eval_metric(gold_list, predicted_list, matching_fn):
    """Score predicted spans against gold spans in a yes/no confusion matrix.

    Every gold span is paired with the first predicted span for which
    ``matching_fn(gold_span, predicted_span)`` is truthy and which has not
    already been paired; a predicted span is paired at most once.

    Calls made on the matrix:
      * ``add('yes', 'yes')`` for a gold span that found a partner,
      * ``add('no', 'yes')`` for a gold span without a partner,
      * ``add('yes', 'no')`` for a predicted span that was never paired.

    Returns the populated ConfusionMatrix.
    """
    labels = Alphabet()
    labels.add('yes')
    labels.add('no')
    matrix = ConfusionMatrix(labels)

    used = [False for _ in predicted_list]
    for gold_span in gold_list:
        for position, candidate in enumerate(predicted_list):
            # matching_fn is evaluated before the "already used" check.
            if matching_fn(gold_span, candidate) and not used[position]:
                matrix.add('yes', 'yes')
                used[position] = True
                break
        else:
            # The scan finished without a break: no free match exists.
            matrix.add('no', 'yes')

    # Predicted spans that do not match any gold span.
    for _ in range(used.count(False)):
        matrix.add('yes', 'no')
    return matrix
def evaluate_sense(gold_list, predicted_list):
    """Build the confusion matrix of the sense classifier.

    Gold and predicted relations are paired by ``_link_gold_predicted``
    using ``spans_exact_matching``.  The alphabet contains the first sense
    of every gold relation plus the label 'no'.  'no' is used for gold
    relations without a linked prediction, for predicted senses that are
    not in the alphabet, and as the gold label of predictions that are not
    linked to any gold relation.

    Returns the populated ConfusionMatrix.
    """
    missing = 'no'

    alphabet = Alphabet()
    for gold_relation in gold_list:
        alphabet.add(gold_relation['Sense'][0])
    alphabet.add(missing)
    matrix = ConfusionMatrix(alphabet)

    linked_predictions, linked_golds = _link_gold_predicted(
        gold_list, predicted_list, spans_exact_matching)

    # Pass 1: every gold relation contributes exactly one count.
    for index, gold_relation in enumerate(gold_list):
        if index not in linked_predictions:
            matrix.add(missing, gold_relation['Sense'][0])
            continue
        guess = linked_predictions[index]['Sense'][0]
        if guess in gold_relation['Sense']:
            # Any of the gold senses counts as a correct prediction.
            matrix.add(guess, guess)
            continue
        if not matrix.alphabet.has_label(guess):
            guess = missing
        matrix.add(guess, gold_relation['Sense'][0])

    # Pass 2: predictions that were not linked to any gold relation.
    for index, predicted_relation in enumerate(predicted_list):
        if index in linked_golds:
            continue
        guess = predicted_relation['Sense'][0]
        if not matrix.alphabet.has_label(guess):
            guess = missing
        matrix.add(guess, missing)
    return matrix
def Evaluation_all(gold_label, predict_label):
    """Return accuracy and averaged precision/recall/F1 of the predictions.

    The label alphabet is made of DICT_INDEX_TO_LABEL[0] through
    DICT_INDEX_TO_LABEL[19].  ``predict_label`` and ``gold_label`` are
    handed to ``ConfusionMatrix.add_list`` in that order.

    Returns a 4-tuple: the value of ``get_accuracy()`` followed by the
    three values of ``get_average_prf()``.
    """
    labels = Alphabet()
    for index in range(20):
        labels.add(DICT_INDEX_TO_LABEL[index])

    matrix = ConfusionMatrix(labels)
    matrix.add_list(predict_label, gold_label)

    averaged_prf = matrix.get_average_prf()
    macro_p, macro_r, macro_f1 = averaged_prf
    return (matrix.get_accuracy(), macro_p, macro_r, macro_f1)
def Evaluation_lst(gold_label, predict_label, print_all=False):
    """Return the accuracy of ``predict_label`` against ``gold_label``.

    The label alphabet is made of DICT_INDEX_TO_LABEL[0] through
    DICT_INDEX_TO_LABEL[19].  When ``print_all`` is truthy the matrix's
    ``print_out()`` is called before the accuracy is computed.
    """
    labels = Alphabet()
    for label in (DICT_INDEX_TO_LABEL[index] for index in range(20)):
        labels.add(label)

    matrix = ConfusionMatrix(labels)
    matrix.add_list(predict_label, gold_label)
    if print_all:
        matrix.print_out()
    return matrix.get_accuracy()
def Evalation_list(gold_label, predict_label, print_all=False):
    """Return the accuracy of a two-class prediction.

    The alphabet holds the labels "0" and "1"; every prediction and gold
    value is converted with ``str`` before being added to the matrix.
    When ``print_all`` is truthy the matrix's ``print_out()`` is called
    before the accuracy is computed.
    """
    labels = Alphabet()
    for class_index in range(2):
        labels.add(str(class_index))
    matrix = ConfusionMatrix(labels)

    predicted_as_text = [str(value) for value in predict_label]
    gold_as_text = [str(value) for value in gold_label]
    matrix.add_list(predicted_as_text, gold_as_text)

    if print_all:
        matrix.print_out()
    return matrix.get_accuracy()
def evaluate_sense(relation_pairs, valid_senses):
    """Build the sense confusion matrix from aligned relation pairs.

    The alphabet is seeded with every sense in ``valid_senses`` plus
    ``ConfusionMatrix.NEGATIVE_CLASS`` and then frozen by setting
    ``growing = False``.

    ``relation_pairs`` yields ``(gold_relation, predicted_relation)``
    tuples in which at most one side is None:
      * gold is None: the predicted sense is added against NEGATIVE_CLASS;
      * predicted is None: NEGATIVE_CLASS is added against the gold sense;
      * both present: the predicted sense is added against the gold sense.
    Pairs whose gold sense is not in ``valid_senses`` are not counted.
    Only the first entry of each relation's 'Sense' list is used.

    Returns the populated ConfusionMatrix.
    """
    negative = ConfusionMatrix.NEGATIVE_CLASS

    alphabet = Alphabet()
    for valid_sense in valid_senses:
        alphabet.add(valid_sense)
    alphabet.add(negative)
    alphabet.growing = False
    matrix = ConfusionMatrix(alphabet)

    for gold_relation, predicted_relation in relation_pairs:
        assert gold_relation is not None or predicted_relation is not None
        if gold_relation is None:
            matrix.add(predicted_relation['Sense'][0], negative)
            continue

        if predicted_relation is None:
            predicted = negative
        else:
            predicted = predicted_relation['Sense'][0]
        gold = gold_relation['Sense'][0]
        if gold in valid_senses:
            matrix.add(predicted, gold)
    return matrix
def Evaluation(gold_file_path, predict_file_path):
    """Score a prediction file against a gold file of integer class ids.

    The text before the first tab of every line is parsed as an integer
    class id.  Ids are mapped to label names through
    ``config.id2category`` and scored with a ConfusionMatrix whose
    alphabet is DICT_INDEX_TO_LABEL[0] through DICT_INDEX_TO_LABEL[19].
    ``print_summary()`` is called on the matrix.

    Returns a 4-tuple: the value of ``get_accuracy()`` followed by the
    three values of ``get_average_prf()``.
    """
    with open(gold_file_path) as gold_file, \
            open(predict_file_path) as predict_file:
        gold_list = [int(row.strip().partition('\t')[0])
                     for row in gold_file]
        predicted_list = [int(row.strip().partition("\t")[0])
                          for row in predict_file]

    predict_labels = [config.id2category[class_id]
                      for class_id in predicted_list]
    gold_labels = [config.id2category[class_id] for class_id in gold_list]

    labels = Alphabet()
    for index in range(20):
        labels.add(DICT_INDEX_TO_LABEL[index])
    matrix = ConfusionMatrix(labels)
    matrix.add_list(predict_labels, gold_labels)

    # NOTE(review): the result of this call is discarded, as in the
    # original; it is kept in case confusion_matrix has side effects.
    confusion_matrix(gold_list, predicted_list)

    matrix.print_summary()
    averaged_prf = matrix.get_average_prf()
    macro_p, macro_r, macro_f1 = averaged_prf
    return (matrix.get_accuracy(), macro_p, macro_r, macro_f1)
def Evaluation(gold_file_path, predict_file_path):
    """Score a prediction file against a gold file of label strings.

    The gold label is the text before the first tab of each stripped
    line; the predicted label is the text before the first "\\t#\\t"
    separator of each stripped line.  The alphabet is
    DICT_INDEX_TO_LABEL[0] through DICT_INDEX_TO_LABEL[17].
    ``print_out()`` is called on the matrix.

    Returns a 4-tuple: the value of ``get_accuracy()`` followed by the
    three values of ``get_average_prf()``.
    """
    with open(gold_file_path) as gold_file, \
            open(predict_file_path) as predict_file:
        gold_list = [row.strip().partition('\t')[0] for row in gold_file]
        predicted_list = [row.strip().partition("\t#\t")[0]
                          for row in predict_file]

    labels = Alphabet()
    for index in range(18):
        labels.add(DICT_INDEX_TO_LABEL[index])
    matrix = ConfusionMatrix(labels)
    matrix.add_list(predicted_list, gold_list)
    matrix.print_out()

    averaged_prf = matrix.get_average_prf()
    macro_p, macro_r, macro_f1 = averaged_prf
    return (matrix.get_accuracy(), macro_p, macro_r, macro_f1)
def evaluate_sense(gold_list, predicted_list):
    """Evaluate sense classifier

    The label ConfusionMatrix.NEGATIVE_CLASS is for the relations
    that are missed by the system
    because the arguments don't match any of the gold relations.

    Gold and predicted relations are paired by ``_link_gold_predicted``
    using ``spans_exact_matching``.  Only gold relations whose first
    sense is in ``validator.identify_valid_senses(gold_list)`` are scored,
    and only those senses (plus NEGATIVE_CLASS) enter the alphabet.
    Predicted senses that are not in the alphabet are counted as
    NEGATIVE_CLASS.

    Returns the populated ConfusionMatrix.
    """
    # Trace output kept from the original.  The parenthesised
    # single-argument form prints the same text under Python 2 and is
    # valid Python 3.
    print("In function: evaluate_sense")

    sense_alphabet = Alphabet()
    valid_senses = validator.identify_valid_senses(gold_list)
    for relation in gold_list:
        sense = relation['Sense'][0]
        if sense in valid_senses:
            sense_alphabet.add(sense)
    sense_alphabet.add(ConfusionMatrix.NEGATIVE_CLASS)

    sense_cm = ConfusionMatrix(sense_alphabet)
    gold_to_predicted_map, predicted_to_gold_map = _link_gold_predicted(
        gold_list, predicted_list, spans_exact_matching)

    for i, gold_relation in enumerate(gold_list):
        gold_sense = gold_relation['Sense'][0]
        if gold_sense not in valid_senses:
            continue
        if i not in gold_to_predicted_map:
            # No prediction was linked to this gold relation.
            sense_cm.add(ConfusionMatrix.NEGATIVE_CLASS, gold_sense)
            continue
        predicted_sense = gold_to_predicted_map[i]['Sense'][0]
        if predicted_sense in gold_relation['Sense']:
            # Matching any of the gold senses counts as correct.
            sense_cm.add(predicted_sense, predicted_sense)
        else:
            if not sense_cm.alphabet.has_label(predicted_sense):
                predicted_sense = ConfusionMatrix.NEGATIVE_CLASS
            sense_cm.add(predicted_sense, gold_sense)

    # Predictions that were not linked to any gold relation.
    for i, predicted_relation in enumerate(predicted_list):
        if i in predicted_to_gold_map:
            continue
        predicted_sense = predicted_relation['Sense'][0]
        if not sense_cm.alphabet.has_label(predicted_sense):
            predicted_sense = ConfusionMatrix.NEGATIVE_CLASS
        sense_cm.add(predicted_sense, ConfusionMatrix.NEGATIVE_CLASS)
    return sense_cm
def compute_binary_eval_metric(gold_list, predicted_list, matching_fn):
    """Compute the yes/no confusion matrix of predicted vs. gold spans.

    A gold span claims the first predicted span for which
    ``matching_fn(gold_span, predicted_span)`` is truthy and which no
    earlier gold span has claimed.

    Calls made on the matrix:
      * ``add('yes', 'yes')`` when a gold span claims a prediction,
      * ``add('no', 'yes')`` when a gold span finds nothing to claim,
      * ``add('yes', 'no')`` for each prediction left unclaimed.

    Returns the populated ConfusionMatrix.
    """
    alphabet = Alphabet()
    alphabet.add('yes')
    alphabet.add('no')
    matrix = ConfusionMatrix(alphabet)

    taken = [False for _ in predicted_list]
    for gold_span in gold_list:
        # Lazily scan the predictions; next() stops at the first hit, so
        # matching_fn is called exactly as often as in a loop with break.
        hit = next(
            (idx for idx, span in enumerate(predicted_list)
             if matching_fn(gold_span, span) and not taken[idx]),
            None)
        if hit is None:
            matrix.add('no', 'yes')
        else:
            matrix.add('yes', 'yes')
            taken[hit] = True

    # Predicted spans that do not match any gold span.
    for is_taken in taken:
        if is_taken:
            continue
        matrix.add('yes', 'no')
    return matrix
def evaluate_sense(relation_pairs, valid_senses):
    """Build the sense confusion matrix from aligned relation pairs.

    The alphabet holds the first sense of every gold relation in
    ``relation_pairs`` that is also in ``valid_senses``, plus
    ``ConfusionMatrix.NEGATIVE_CLASS``; it is then frozen by setting
    ``growing = False``.  ``relation_pairs`` is iterated twice.

    For each ``(gold_relation, predicted_relation)`` pair (at most one
    side may be None):
      * gold is None: the predicted sense is added against NEGATIVE_CLASS;
      * predicted is None: NEGATIVE_CLASS is added against the gold sense;
      * both present: the predicted sense is added against the gold sense.
    Pairs whose gold sense is not in ``valid_senses`` are not counted.

    Returns the populated ConfusionMatrix.
    """
    negative = ConfusionMatrix.NEGATIVE_CLASS

    alphabet = Alphabet()
    for gold_relation, _ in relation_pairs:
        if gold_relation is None:
            continue
        first_sense = gold_relation["Sense"][0]
        if first_sense in valid_senses:
            alphabet.add(first_sense)
    alphabet.add(negative)
    alphabet.growing = False
    matrix = ConfusionMatrix(alphabet)

    for gold_relation, predicted_relation in relation_pairs:
        assert gold_relation is not None or predicted_relation is not None
        if gold_relation is None:
            matrix.add(predicted_relation["Sense"][0], negative)
            continue

        if predicted_relation is None:
            predicted = negative
        else:
            predicted = predicted_relation["Sense"][0]
        gold = gold_relation["Sense"][0]
        if gold in valid_senses:
            matrix.add(predicted, gold)
    return matrix
def compute_span_exact_match_metric(gold_list, predicted_list, verbose=False):
    """Compute the yes/no confusion matrix for exactly matching spans.

    Predicted spans are indexed by value, so a gold span is matched by
    dictionary lookup against an equal predicted span that has not been
    used yet.  Each predicted span is used at most once.

    Calls made on the matrix:
      * ``add('yes', 'yes')`` for a gold span with an unused equal
        prediction,
      * ``add('no', 'yes')`` for a gold span without one,
      * ``add('yes', 'no')`` for each prediction left unused.

    When ``verbose`` is truthy, unmatched gold spans are printed with a
    '<<<' prefix and unused predicted spans with a '>>>' prefix.

    Returns the populated ConfusionMatrix.
    """
    labels = Alphabet()
    labels.add('yes')
    labels.add('no')
    matrix = ConfusionMatrix(labels)

    consumed = [False for _ in predicted_list]
    positions_by_span = defaultdict(list)
    for position, span in enumerate(predicted_list):
        positions_by_span[span].append(position)

    for gold in gold_list:
        # .get() does not insert missing keys into the defaultdict.
        free_position = next(
            (position for position in positions_by_span.get(gold, ())
             if not consumed[position]),
            None)
        if free_position is not None:
            matrix.add('yes', 'yes')
            consumed[free_position] = True
            continue
        if verbose:
            print('Span:')
            print('<<<\t{:s}'.format(gold).encode(ENCODING))
            print()
        matrix.add('no', 'yes')

    # Predicted spans that do not match any gold span.
    for was_consumed, pred in zip(consumed, predicted_list):
        if was_consumed:
            continue
        if verbose:
            print('Span:')
            print('>>>\t{:s}'.format(pred).encode(ENCODING))
            print()
        matrix.add('yes', 'no')
    return matrix
def Evalution(gold_file_path, pred_file_path):
    """Score predicted author/paper labels against the gold labels.

    Both files are read with ``util.read_dict_from_csv``.  Every row is
    expected to provide the columns "AuthorId", "ConfirmedPaperIds" and
    "DeletedPaperIds"; the two id columns are space-separated lists.
    Confirmed papers are labelled "1" (positive samples) and deleted
    papers "0" (negative samples).

    Args:
        gold_file_path: path of the gold CSV file.
        pred_file_path: path of the predicted CSV file.

    Returns:
        A ConfusionMatrix over the labels "0" and "1" with one count per
        (AuthorId, paperId) pair of the gold file.

    Raises:
        KeyError: if a gold (AuthorId, paperId) pair is absent from the
            predictions.
    """
    gold_labels = _read_author_paper_labels(gold_file_path)
    pred_labels = _read_author_paper_labels(pred_file_path)

    # evaluation
    alphabet = Alphabet()
    alphabet.add("0")
    alphabet.add("1")
    cm = ConfusionMatrix(alphabet)
    for pair, gold in gold_labels.items():
        cm.add(pred_labels[pair], gold)
    return cm


def _read_author_paper_labels(csv_path):
    """Map each (AuthorId, paperId) pair of a CSV file to "1" or "0"."""
    labels = {}
    for item in util.read_dict_from_csv(csv_path):
        author_id = item["AuthorId"]
        # Confirmed ids (positive) are processed before deleted ids
        # (negative), so a paper listed in both columns ends up as "0",
        # exactly as in the original assignment order.
        for column, label in (("ConfirmedPaperIds", "1"),
                              ("DeletedPaperIds", "0")):
            for paper_id in item[column].split(" "):
                # "".split(" ") yields [""]: an empty column must not
                # create a bogus (AuthorId, "") sample.
                if paper_id:
                    labels[(author_id, paper_id)] = label
    return labels
def evaluate_sense(gold_list, predicted_list):
    """Evaluate sense classifier

    The label ConfusionMatrix.NEGATIVE_CLASS is for the relations
    that are missed by the system
    because the arguments don't match any of the gold relations.

    Gold and predicted relations are paired by ``_link_gold_predicted``
    using ``spans_exact_matching``.  Only gold relations whose first
    sense is in ``validator.identify_valid_senses(gold_list)`` are scored,
    and only those senses (plus NEGATIVE_CLASS) enter the alphabet.
    Predicted senses that are not in the alphabet are counted as
    NEGATIVE_CLASS.

    Returns the populated ConfusionMatrix.
    """
    # Trace output kept from the original.  The parenthesised
    # single-argument form prints the same text under Python 2 and is
    # valid Python 3.
    print("In function: evaluate_sense")

    negative = ConfusionMatrix.NEGATIVE_CLASS

    sense_alphabet = Alphabet()
    valid_senses = validator.identify_valid_senses(gold_list)
    for relation in gold_list:
        sense = relation['Sense'][0]
        if sense in valid_senses:
            sense_alphabet.add(sense)
    sense_alphabet.add(negative)

    sense_cm = ConfusionMatrix(sense_alphabet)
    gold_to_predicted_map, predicted_to_gold_map = _link_gold_predicted(
        gold_list, predicted_list, spans_exact_matching)

    for i, gold_relation in enumerate(gold_list):
        gold_sense = gold_relation['Sense'][0]
        if gold_sense not in valid_senses:
            continue
        if i in gold_to_predicted_map:
            predicted_sense = gold_to_predicted_map[i]['Sense'][0]
            if predicted_sense in gold_relation['Sense']:
                # Matching any of the gold senses counts as correct.
                sense_cm.add(predicted_sense, predicted_sense)
                continue
            if not sense_cm.alphabet.has_label(predicted_sense):
                predicted_sense = negative
        else:
            # No prediction was linked to this gold relation.
            predicted_sense = negative
        sense_cm.add(predicted_sense, gold_sense)

    # Predictions that were not linked to any gold relation.
    for i, predicted_relation in enumerate(predicted_list):
        if i in predicted_to_gold_map:
            continue
        predicted_sense = predicted_relation['Sense'][0]
        if not sense_cm.alphabet.has_label(predicted_sense):
            predicted_sense = negative
        sense_cm.add(predicted_sense, negative)
    return sense_cm
def evaluate(gold_file, pred_file):
    """Print per-predicate and averaged P/R/F1 of a prediction file.

    Gold lines hold four tab-separated fields of which the first three
    are read as ``P, url, label``.  Prediction lines hold five
    tab-separated fields ``url, s, p, o, confidence`` of which only
    ``url`` and ``p`` are used.  Both files are read as UTF-8.

    For every predicate ``P`` of the gold file (in sorted order) each gold
    ``(url, label)`` is compared with the prediction "1" when
    ``(url, P)`` occurs in the prediction file and "0" otherwise.  The
    confusion matrix, the recall errors (gold "1", predicted "0") and the
    precision errors (gold "0", predicted "1") are printed per predicate,
    followed by the average over predicates of the P/R/F1 of label "1".

    Returns None; all results are printed.
    """
    dict_P_to_url_label = {}
    predict_set = set()
    with codecs.open(gold_file, encoding="utf-8") as fin_gold, \
            codecs.open(pred_file, encoding="utf-8") as fin_pred:
        for line in fin_gold:
            P, url, label, _ = line.strip().split("\t")
            dict_P_to_url_label.setdefault(P, set()).add((url.strip(), label))
        for line in fin_pred:
            url, _s, predicate, _o, _confidence = line.strip().split("\t")
            predict_set.add((url.strip(), predicate))

    alphabet = Alphabet()
    alphabet.add("0")
    alphabet.add("1")

    # Evaluation.  Every print below passes one parenthesised argument,
    # which prints the same text under Python 2 and is valid Python 3.
    marco_p, marco_r, marco_f = 0, 0, 0
    N = 0
    for P in sorted(dict_P_to_url_label.keys()):
        confusionMatrix = ConfusionMatrix(alphabet)
        recall_error_cases = []
        precision_error_cases = []
        for url, label in dict_P_to_url_label[P]:
            pred = "1" if (url, P) in predict_set else "0"
            if label == "1" and pred == "0":
                recall_error_cases.append("%s\t%s->%s" % (url, label, pred))
            if label == "0" and pred == "1":
                precision_error_cases.append("%s\t%s->%s" % (url, label, pred))
            confusionMatrix.add(pred, label)

        print("==" * 40)
        print(P)
        # As in the original, the return value of print_out() is printed.
        print(confusionMatrix.print_out())
        precision, recall, f1 = confusionMatrix.get_prf("1")
        marco_p += precision
        marco_r += recall
        marco_f += f1
        N += 1
        print("\n==>recall error cases:")
        print("\n".join(recall_error_cases))
        print("\n==>precision error cases:")
        print("\n".join(precision_error_cases))

    print("**" * 40)
    if N:
        averages = (marco_p / N, marco_r / N, marco_f / N)
    else:
        # Empty gold file: report zeros instead of dividing by zero.
        averages = (0.0, 0.0, 0.0)
    print("marco, P: %f; R: %f; F1: %f" % averages)
def evaluate(gold_file, pred_file):
    """Print per-predicate and averaged P/R/F1 of a prediction file.

    Gold lines hold four tab-separated fields of which the first three
    are read as ``P, url, label``.  Prediction lines hold five
    tab-separated fields ``url, s, p, o, confidence`` of which only
    ``url`` and ``p`` are used.  Both files are read as UTF-8.

    For every predicate ``P`` of the gold file (in sorted order) each gold
    ``(url, label)`` is compared with the prediction "1" when
    ``(url, P)`` occurs in the prediction file and "0" otherwise.  The
    confusion matrix, the recall errors (gold "1", predicted "0") and the
    precision errors (gold "0", predicted "1") are printed per predicate,
    followed by the average over predicates of the P/R/F1 of label "1".

    Returns None; all results are printed.
    """
    dict_P_to_url_label = {}
    predict_set = set()
    with codecs.open(gold_file, encoding="utf-8") as fin_gold, \
            codecs.open(pred_file, encoding="utf-8") as fin_pred:
        for line in fin_gold:
            P, url, label, _ = line.strip().split("\t")
            dict_P_to_url_label.setdefault(P, set()).add((url.strip(), label))
        for line in fin_pred:
            url, _s, predicate, _o, _confidence = line.strip().split("\t")
            predict_set.add((url.strip(), predicate))

    alphabet = Alphabet()
    alphabet.add("0")
    alphabet.add("1")

    # Evaluation.  Every print below passes one parenthesised argument,
    # which prints the same text under Python 2 and is valid Python 3.
    marco_p, marco_r, marco_f = 0, 0, 0
    N = 0
    for P in sorted(dict_P_to_url_label.keys()):
        confusionMatrix = ConfusionMatrix(alphabet)
        recall_error_cases = []
        precision_error_cases = []
        for url, label in dict_P_to_url_label[P]:
            pred = "1" if (url, P) in predict_set else "0"
            if label == "1" and pred == "0":
                recall_error_cases.append("%s\t%s->%s" % (url, label, pred))
            if label == "0" and pred == "1":
                precision_error_cases.append("%s\t%s->%s" % (url, label, pred))
            confusionMatrix.add(pred, label)

        print("==" * 40)
        print(P)
        # As in the original, the return value of print_out() is printed.
        print(confusionMatrix.print_out())
        precision, recall, f1 = confusionMatrix.get_prf("1")
        marco_p += precision
        marco_r += recall
        marco_f += f1
        N += 1
        print("\n==>recall error cases:")
        print("\n".join(recall_error_cases))
        print("\n==>precision error cases:")
        print("\n".join(precision_error_cases))

    print("**" * 40)
    if N:
        averages = (marco_p / N, marco_r / N, marco_f / N)
    else:
        # Empty gold file: report zeros instead of dividing by zero.
        averages = (0.0, 0.0, 0.0)
    print("marco, P: %f; R: %f; F1: %f" % averages)
def evaluate_sense(gold_list, predicted_list, verbose=False):
    """Evaluate sense classifier

    The label ConfusionMatrix.NEGATIVE_CLASS is for the relations
    that are missed by the system
    because the arguments don't match any of the gold relations.

    Gold and predicted relations are paired by ``_link_gold_predicted``
    using ``spans_exact_matching``.  Only gold relations whose first
    sense is in ``validator.identify_valid_senses(gold_list)`` are scored,
    and only those senses (plus NEGATIVE_CLASS) enter the alphabet.
    Predicted senses that are not in the alphabet are counted as
    NEGATIVE_CLASS.

    Args:
        gold_list: gold relations.
        predicted_list: predicted relations.
        verbose: when truthy, print every counted mismatch (gold sense,
            predicted sense and the raw text of both arguments).

    Returns:
        The populated ConfusionMatrix.
    """
    negative = ConfusionMatrix.NEGATIVE_CLASS

    sense_alphabet = Alphabet()
    valid_senses = validator.identify_valid_senses(gold_list)
    for relation in gold_list:
        isense = relation['Sense'][0]
        if isense in valid_senses:
            sense_alphabet.add(isense)
    sense_alphabet.add(negative)

    sense_cm = ConfusionMatrix(sense_alphabet)
    gold_to_predicted_map, predicted_to_gold_map = _link_gold_predicted(
        gold_list, predicted_list, spans_exact_matching)

    for i, gold_relation in enumerate(gold_list):
        gold_sense = gold_relation['Sense'][0]
        if gold_sense not in valid_senses:
            continue
        if i in gold_to_predicted_map:
            predicted_sense = gold_to_predicted_map[i]['Sense'][0]
            if predicted_sense in gold_relation['Sense']:
                # Matching any of the gold senses counts as correct.
                sense_cm.add(predicted_sense, predicted_sense)
                continue
            if not sense_cm.alphabet.has_label(predicted_sense):
                predicted_sense = negative
        else:
            # No prediction was linked to this gold relation.
            predicted_sense = negative
        if verbose:
            _print_sense_mismatch(gold_sense, predicted_sense,
                                  gold_relation['Arg1']['RawText'],
                                  gold_relation['Arg2']['RawText'])
        sense_cm.add(predicted_sense, gold_sense)

    # Predictions that were not linked to any gold relation.
    for i, predicted_relation in enumerate(predicted_list):
        if i in predicted_to_gold_map:
            continue
        predicted_sense = predicted_relation['Sense'][0]
        if not sense_cm.alphabet.has_label(predicted_sense):
            predicted_sense = negative
        if verbose:
            # Report the unmatched prediction itself.  The original printed
            # the variables left over from the gold loop, i.e. the last
            # gold relation, and raised NameError when that loop had
            # never assigned them.
            # NOTE(review): predicted relations may lack 'RawText', hence
            # the tolerant lookups -- confirm against the input format.
            _print_sense_mismatch(
                negative, predicted_sense,
                predicted_relation.get('Arg1', {}).get('RawText', ''),
                predicted_relation.get('Arg2', {}).get('RawText', ''))
        sense_cm.add(predicted_sense, negative)
    return sense_cm


def _print_sense_mismatch(gold_sense, predicted_sense, arg1_text, arg2_text):
    """Print one mismatch: gold after '<<<', prediction after '>>>'."""
    print('Sense:')
    print('<<<\t{:s}'.format(gold_sense).encode(ENCODING))
    print('>>>\t{:s}'.format(predicted_sense).encode(ENCODING))
    print('Arg1:\t{:s}'.format(arg1_text).encode(ENCODING))
    print('Arg2:\t{:s}'.format(arg2_text).encode(ENCODING))
    print()
def test_step_for_cqa(s1_all, s2_all, y_all, tag):
    """
    Evaluates model on a dev/test set

    The data is fed through ``sess.run`` in batches of exactly
    ``FLAGS.batch_size`` examples.  The last, incomplete batch is padded
    with examples taken cyclically from the start of the data, and only
    the outputs of the real examples are kept.

    The predictions are written to
    ``train_data_dir + "/result.<timestamp>.txt"``, one line per example:
    the predicted class and the softmax score of the last class.  That
    file is then ranked by ``get_rank_score_by_file``.

    Args:
        s1_all, s2_all, y_all: the two model inputs and the gold labels,
            indexed alike along their first axis.
        tag: "dev" or "test"; selects the id file handed to the ranker
            (any other value leaves the id file path empty).

    Returns:
        (map_score, mrr_score, accuracy), the first two as returned by
        ``get_rank_score_by_file`` and the last from the confusion matrix
        of predictions against golds.
    """
    golds = []
    preds = []
    softmax_scores = []
    n = len(s1_all)
    batch_size = FLAGS.batch_size
    fetches = [
        global_step, model.loss, model.accuracy, model.softmax_scores,
        model.predictions, model.golds
    ]

    start_index = 0
    while start_index < n:
        stop_index = start_index + batch_size
        # Number of real (non-padding) examples in this batch.
        valid_num = min(batch_size, n - start_index)
        if stop_index <= n:
            s1_batch = s1_all[start_index:stop_index]
            s2_batch = s2_all[start_index:stop_index]
            y_batch = y_all[start_index:stop_index]
        else:
            # Pad the tail: indices wrap around modulo n, so the batch
            # always holds batch_size rows, even when the whole data set
            # is smaller than the padding that is needed.
            wrapped = np.arange(start_index, stop_index) % n
            s1_batch = np.take(s1_all, wrapped, axis=0)
            s2_batch = np.take(s2_all, wrapped, axis=0)
            y_batch = np.take(y_all, wrapped, axis=0)

        feed_dict = {
            model.input_s1: s1_batch,
            model.input_s2: s2_batch,
            model.input_y: y_batch,
            model.dropout_keep_prob: 1.0
        }
        results = sess.run(fetches, feed_dict)
        curr_softmax_scores, curr_predictions, curr_golds = results[3:]
        # Drop the outputs that belong to padding rows.
        golds += list(curr_golds[:valid_num])
        preds += list(curr_predictions[:valid_num])
        softmax_scores += list(curr_softmax_scores[:valid_num])
        start_index = stop_index

    alphabet = Alphabet()
    for i in range(num_classes):
        alphabet.add(str(i))
    confusionMatrix = ConfusionMatrix(alphabet)
    # preds itself stays numeric: it is written with "%d" below.
    confusionMatrix.add_list(list(map(str, preds)), list(map(str, golds)))

    id_file = ""
    if tag == "dev":
        id_file = train_data_dir + "/dev/id"
    if tag == "test":
        id_file = train_data_dir + "/test/id"

    # The last path component of the data directory selects the subtask.
    task_name = train_data_dir.split("/")[-1]
    subtask = ""
    if task_name == "QA":
        subtask = "A"
    if task_name == "QQ":
        subtask = "B"

    pred_file = train_data_dir + "/result.%s.txt" % (timestamp)
    with open(pred_file, "w") as fw:
        for i, s in enumerate(softmax_scores):
            fw.write("%d\t%.4f\n" % (preds[i], s[num_classes - 1]))
    print(pred_file, id_file, tag, subtask)
    map_score, mrr_score = get_rank_score_by_file(
        pred_file, id_file, tag, subtask)
    return map_score, mrr_score, confusionMatrix.get_accuracy()
def test_step(s1_all, s2_all, y_all):
    """
    Evaluates model on a dev set

    The data is fed through ``sess.run`` in batches of exactly
    ``FLAGS.main_batch_size`` examples with dropout keep probability 1.0
    and ``is_train`` False.  The last, incomplete batch is padded with
    examples taken cyclically from the start of the data, and only the
    outputs of the real examples are kept.

    Args:
        s1_all, s2_all, y_all: the two model inputs and the gold labels,
            indexed alike along their first axis.

    Returns:
        A ConfusionMatrix over the labels "0" .. str(main_num_classes - 1)
        filled with the stringified predictions and golds.
    """
    golds = []
    preds = []
    n = len(s1_all)
    batch_size = FLAGS.main_batch_size
    fetches = [global_step, model.main_predictions, model.main_golds]

    start_index = 0
    while start_index < n:
        stop_index = start_index + batch_size
        # Number of real (non-padding) examples in this batch.
        valid_num = min(batch_size, n - start_index)
        if stop_index <= n:
            s1_batch = s1_all[start_index:stop_index]
            s2_batch = s2_all[start_index:stop_index]
            y_batch = y_all[start_index:stop_index]
        else:
            # Pad the tail: indices wrap around modulo n, so the batch
            # always holds batch_size rows, even when the whole data set
            # is smaller than the padding that is needed.
            wrapped = np.arange(start_index, stop_index) % n
            s1_batch = np.take(s1_all, wrapped, axis=0)
            s2_batch = np.take(s2_all, wrapped, axis=0)
            y_batch = np.take(y_all, wrapped, axis=0)

        feed_dict = {
            model.main_input_s1: s1_batch,
            model.main_input_s2: s2_batch,
            model.main_input_y: y_batch,
            model.dropout_keep_prob: 1.0,
            model.is_train: False
        }
        _, curr_predictions, curr_golds = sess.run(fetches, feed_dict)
        # Drop the outputs that belong to padding rows.
        golds += list(curr_golds[:valid_num])
        preds += list(curr_predictions[:valid_num])
        start_index = stop_index

    alphabet = Alphabet()
    for i in range(main_num_classes):
        alphabet.add(str(i))
    confusionMatrix = ConfusionMatrix(alphabet)
    preds = list(map(str, preds))
    golds = list(map(str, golds))
    confusionMatrix.add_list(preds, golds)
    return confusionMatrix