def evaluate_task_test(self, task, checkpoint, split="test",
                       return_results=True):
  """Evaluate the current model."""
  utils.log("Testing", task.name)
  eval_input_fn, _ = self._preprocessor.prepare_predict([task], split)
  results = self._estimator.predict(input_fn=eval_input_fn,
                                    yield_single_examples=True,
                                    checkpoint_path=checkpoint)
  scorer = task.get_scorer()
  for r in results:
    if r["task_id"] != len(self._tasks):  # ignore padding examples
      r = utils.nest_dict(r, self._config.task_names)
      scorer.update(r[task.name])
  scores = dict(scorer.get_results())
  if return_results:
    utils.log("test_results " + task.name + ": " +
              " - ".join("{}: {}".format(k, v) for k, v in scores.items()))
    return scores
  else:
    return scorer
def write_classification_outputs(self, tasks, trial, split):
  """Write classification predictions to disk."""
  utils.log("Writing out predictions for", tasks, split)
  predict_input_fn, _ = self._preprocessor.prepare_predict(tasks, split)
  results = self._estimator.predict(input_fn=predict_input_fn,
                                    yield_single_examples=True)
  # task name -> eid -> model logits and predicted class
  logits = collections.defaultdict(dict)
  for r in results:
    if r["task_id"] != len(self._tasks):  # ignore padding examples
      r = utils.nest_dict(r, self._config.task_names)
      task_name = self._config.task_names[r["task_id"]]
      logits[task_name][r[task_name]["eid"]] = {
          "logits": r[task_name]["logits"],
          "prediction": r[task_name]["predictions"]
      }
  for task_name in logits:
    utils.log("Pickling predictions for {:} {:} examples ({:})".format(
        len(logits[task_name]), task_name, split))
    if trial <= self._config.n_writes_test:
      utils.write_pickle(
          logits[task_name],
          self._config.test_predictions(task_name, split, trial))
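# A minimal sketch of reading the pickled predictions written above back into
# memory. The path argument is a placeholder for
# self._config.test_predictions(task_name, split, trial); the file is assumed
# to be a standard pickle produced by utils.write_pickle.
import pickle

def load_classification_outputs(path):
  """Return the eid -> {"logits", "prediction"} dict written by the method above."""
  with open(path, "rb") as fin:
    return pickle.load(fin)

# Example: outputs = load_classification_outputs("test_predictions.pkl")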
def write_classification_outputs(
    self, tasks, trial, split,
    config: configure_finetuning.FinetuningConfig):
  """Write dev-set error analysis (inputs, logits, predictions, labels) to disk."""
  utils.log("Writing out predictions for", tasks, split)
  predict_input_fn, _ = self._preprocessor.prepare_predict(tasks, split)
  results = self._estimator.predict(input_fn=predict_input_fn,
                                    yield_single_examples=True)
  # task name -> eid -> (eid, input ids, input mask, segment ids, logits,
  # prediction, label)
  logits = collections.defaultdict(dict)
  for r in results:
    if r["task_id"] != len(self._tasks):  # ignore padding examples
      r = utils.nest_dict(r, self._config.task_names)
      task_name = self._config.task_names[r["task_id"]]
      logits[task_name][r[task_name]["eid"]] = (
          r[task_name]["eid"],
          r[task_name]["input_ids"],
          r[task_name]["input_mask"],
          r[task_name]["token_type_ids"],
          r[task_name]["logits"] if "logits" in r[task_name] else None,
          r[task_name]["predictions"],
          r[task_name]["label_ids"] if "label_ids" in r[task_name]
          else r[task_name]["targets"],
      )
  tokenizer = tokenization.FullTokenizer(
      vocab_file=config.vocab_file, do_lower_case=config.do_lower_case)
  for task_name in logits:
    utils.log("Saving dev error analysis for {:} {:} examples ({:})".format(
        len(logits[task_name]), task_name, split))
    if trial <= self._config.n_writes_test:
      utils.log("Writing to",
                self._config.dev_analysis(task_name, split, trial))
      with open(self._config.dev_analysis(task_name, split, trial), "w",
                encoding="utf-8") as fout:
        fout.write("ID\tINPUT\tLOGITS\tPREDICTION\tLABEL\n")
        for eid in logits[task_name]:
          (_, input_id, input_mask, token_type_id, logit, prediction,
           label_id) = logits[task_name][eid]
          input_tokens = tokenizer.convert_ids_to_tokens(input_id)
          input_tokens = " ".join(t for t in input_tokens if t != "[PAD]")
          fout.write("\t".join([str(eid), input_tokens, str(logit),
                                str(prediction), str(label_id)]) + "\n")
def evaluate_task(self, task, split="dev", return_results=True):
  """Evaluate the current model and write out its predictions."""
  utils.log("Evaluating", task.name)
  eval_input_fn, _ = self._preprocessor.prepare_predict([task], split)
  results = self._estimator.predict(input_fn=eval_input_fn,
                                    yield_single_examples=True)
  eval_examples = task.get_examples(split)
  scorer = task.get_scorer()
  for r in results:
    if r["task_id"] != len(self._tasks):  # ignore padding examples
      r = utils.nest_dict(r, self._config.task_names)
      scorer.update(r[task.name])
  scorer.write_predictions()
def evaluate_task(self, task, split="dev", return_results=True):
  """Evaluate the current model on every saved checkpoint and keep the best."""
  utils.log("Evaluating", task.name)
  eval_input_fn, _ = self._preprocessor.prepare_predict([task], split)
  # Checkpoint files are named "model.ckpt-<step>.index"; sort them by step.
  checkpoints = sorted(
      [f for f in tf.gfile.ListDirectory(self._config.model_dir)
       if f[-6:] == ".index"],
      key=lambda x: int(x[11:-6]))
  checkpoints = [os.path.join(self._config.model_dir, checkpoint[:-6])
                 for checkpoint in checkpoints]
  best_scores = None
  best_scorer = None
  key = self._config.eval_key
  for checkpoint in checkpoints:
    if int(checkpoint.split("-")[-1]) == 0:
      continue  # skip the untrained step-0 checkpoint
    results = self._estimator.predict(input_fn=eval_input_fn,
                                      yield_single_examples=True,
                                      checkpoint_path=checkpoint)
    scorer = task.get_scorer()
    for r in results:
      if r["task_id"] != len(self._tasks):  # ignore padding examples
        r = utils.nest_dict(r, self._config.task_names)
        scorer.update(r[task.name])
    scores = dict(scorer.get_results())
    scores["checkpoint_path"] = checkpoint
    if return_results:
      utils.log(task.name + ": " +
                " - ".join("{}: {}".format(k, v) for k, v in scores.items()))
      utils.log()
    # Keep the checkpoint with the best eval_key metric (or the last one
    # evaluated if no eval_key is configured).
    if key is None or best_scores is None or scores[key] > best_scores[key]:
      best_scores = scores
      best_scorer = scorer
  if return_results:
    utils.log("eval_results " + task.name + ": " +
              " - ".join("{}: {}".format(k, v)
                         for k, v in best_scores.items()))
    return best_scores
  else:
    return best_scorer
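# The checkpoint sorting above assumes TensorFlow's default naming,
# "model.ckpt-<global_step>.index": slicing off the 11-character
# "model.ckpt-" prefix and the ".index" suffix leaves the step number.
# A quick illustration with hypothetical filenames:
names = ["model.ckpt-1000.index", "model.ckpt-200.index", "model.ckpt-30.index"]
print(sorted(names, key=lambda x: int(x[11:-6])))
# -> ['model.ckpt-30.index', 'model.ckpt-200.index', 'model.ckpt-1000.index']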
def write_tagging_outputs(self, tasks, trial, split):
  """Write tagging predictions and labels to disk."""
  utils.log("Writing out predictions for", tasks, split)
  predict_input_fn, _ = self._preprocessor.prepare_predict(tasks, split)
  results = self._estimator.predict(input_fn=predict_input_fn,
                                    yield_single_examples=True)
  # task name -> eid -> per-token labels / predictions / sequence length
  labels = collections.defaultdict(dict)
  predictions = collections.defaultdict(dict)
  length = collections.defaultdict(dict)
  for r in results:
    if r["task_id"] != len(self._tasks):  # ignore padding examples
      r = utils.nest_dict(r, self._config.task_names)
      task_name = self._config.task_names[r["task_id"]]
      predictions[task_name][r[task_name]["eid"]] = r[task_name]["predictions"]
      labels[task_name][r[task_name]["eid"]] = r[task_name]["labels"]
      length[task_name][r[task_name]["eid"]] = np.sum(
          r[task_name]["labels_mask"])
  for task_name in predictions:
    utils.log("Writing predictions for {:} {:} examples ({:})".format(
        len(predictions[task_name]), task_name, split))
    if trial <= self._config.n_writes_test:
      preds_file = self._config.test_predictions(
          task_name, split, trial) + "_pred.txt"
      label_file = self._config.test_predictions(
          task_name, split, trial) + "_label.txt"
      task_preds = predictions[task_name]
      task_labels = labels[task_name]
      task_length = length[task_name]
      num_ex = len(task_preds)
      if "/" in preds_file:
        tf.io.gfile.makedirs(preds_file.rsplit("/", 1)[0])
      with tf.io.gfile.GFile(preds_file, "w") as fpred, \
          tf.io.gfile.GFile(label_file, "w") as flabel:
        for i in range(num_ex):
          n = int(task_length[i])
          # map() returns an iterator in Python 3, so materialize the strings
          # before joining.
          pred_strs = [str(x) for x in task_preds[i][:n]]
          label_strs = [str(x) for x in task_labels[i][:n]]
          fpred.write("{}\n".format(" ".join(pred_strs)))
          flabel.write("{}\n".format(" ".join(label_strs)))
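# A small sketch of reading the tagging outputs back: each line of the
# "_pred.txt" / "_label.txt" files written above holds one example's
# space-separated tags, truncated to the example's true length. Paths are
# placeholders for self._config.test_predictions(...) plus the suffix.
def read_tagging_file(path):
  """Return a list of per-example tag lists from one output file."""
  with open(path, encoding="utf-8") as fin:
    return [line.split() for line in fin]

# Example: preds = read_tagging_file("tagging_pred.txt")
#          golds = read_tagging_file("tagging_label.txt")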
def evaluate_task(self, task, split="dev", return_results=True):
  """Evaluate the current model."""
  utils.log("Evaluating", task.name, split)
  eval_input_fn, _ = self._preprocessor.prepare_predict([task], split)
  results = self._estimator.predict(input_fn=eval_input_fn,
                                    yield_single_examples=True)
  if task.name in ("cmrc2018", "drcd"):
    scorer = task.get_scorer(split)
  else:
    scorer = task.get_scorer()
  for r in results:
    if r["task_id"] != len(self._tasks):  # ignore padding examples
      r = utils.nest_dict(r, self._config.task_names)
      scorer.update(r[task.name])
  if return_results:
    utils.log(task.name + ": " + scorer.results_str())
    utils.log()
    return dict(scorer.get_results())
  else:
    return scorer
def evaluate_task(self, task, split="dev", return_results=True):
  """Evaluate the current model."""
  utils.log("Evaluating", task.name, split)
  eval_input_fn, _ = self._preprocessor.prepare_predict([task], split)
  results = self._estimator.predict(input_fn=eval_input_fn,
                                    yield_single_examples=True)
  # These scorers take the split so they can locate the matching eval examples.
  if task.name in [
      "squad", "squadv1", "newsqa", "naturalqs", "triviaqa", "searchqa",
      "cmrc2018", "drcd", "ccks42ec", "ccks42ee", "ccks42single",
      "ccks42multi", "ner", "ccks42num", "ccks42reg"
  ]:
    scorer = task.get_scorer(split)
  else:
    scorer = task.get_scorer()
  for r in results:
    if r["task_id"] != len(self._tasks):  # ignore padding examples
      r = utils.nest_dict(r, self._config.task_names)
      scorer.update(r[task.name])
  if return_results:
    utils.log(task.name + ": " + scorer.results_str())
    utils.log()
    return dict(scorer.get_results())
  else:
    return scorer
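# Hedged usage sketch (illustrative names, not necessarily this repo's API):
# running the evaluate_task method above over every finetuning task and
# collecting the per-task score dictionaries.
def evaluate_all_tasks(model_runner, tasks, split="dev"):
  """Return {task_name: scores} using model_runner.evaluate_task."""
  return {task.name: model_runner.evaluate_task(task, split=split)
          for task in tasks}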