def test_perl_eval_script_can_run_on_printed_conll_files(self):
    """
    Smoke test: the perl scorer must exit cleanly on files produced by
    ``write_to_conll_eval_file``.
    """
    bio_tags = ["B-ARG-1", "I-ARG-1", "O", "B-V", "B-ARGM-ADJ", "O"]
    sentence = ["Mark", "and", "Matt", "were", "running", "fast", "."]

    gold_file_path = os.path.join(self.TEST_DIR, "gold_conll_eval.txt")
    prediction_file_path = os.path.join(self.TEST_DIR, "prediction_conll_eval.txt")

    with open(gold_file_path, "a+") as gold_file, open(prediction_file_path, "a+") as prediction_file:
        # Gold and predicted tags are deliberately identical, so any problem
        # is obvious from the perl script output. The sentence is emitted
        # twice so that the script has to cope with more than one sentence.
        for _ in range(2):
            write_to_conll_eval_file(gold_file, prediction_file, 4, sentence, bio_tags, bio_tags)

    # check_call raises CalledProcessError on a non-zero exit status, so
    # reaching the assert already implies success; it is kept for clarity.
    exit_code = subprocess.check_call(
        ["perl", str(self.TOOLS_ROOT / "srl-eval.pl"), prediction_file_path, gold_file_path]
    )
    assert exit_code == 0
def main(serialization_directory, device):
    """
    Run a serialized model over its validation data and write the gold and
    predicted SRL tags to ``gold.txt`` / ``predictions.txt`` inside
    ``serialization_directory``.

    serialization_directory : str, required.
        The directory containing the serialized weights and ``model_params.json``.
    device: int, required.
        The device to run the evaluation on; passed to ``Model.load`` as
        ``cuda_device`` and to ``arrays_to_variables``.
    """
    config = Params.from_file(os.path.join(serialization_directory, "model_params.json"))

    dataset_reader = DatasetReader.from_params(config['dataset_reader'])
    evaluation_data_path = config['validation_data_path']

    model = Model.load(config, serialization_dir=serialization_directory, cuda_device=device)

    prediction_file_path = os.path.join(serialization_directory, "predictions.txt")
    gold_file_path = os.path.join(serialization_directory, "gold.txt")

    # The context manager guarantees both handles are closed even if reading
    # the data, running the model or writing a sentence raises.
    with open(prediction_file_path, "w+") as prediction_file, open(gold_file_path, "w+") as gold_file:
        # Load the evaluation data and index it.
        print("Reading evaluation data from {}".format(evaluation_data_path))
        dataset = dataset_reader.read(evaluation_data_path)
        dataset.index_instances(model._vocab)
        iterator = BasicIterator(batch_size=32)

        model_predictions = []
        for batch in tqdm.tqdm(iterator(dataset, num_epochs=1, shuffle=False)):
            tensor_batch = arrays_to_variables(batch, device, for_training=False)
            result = model.forward(**tensor_batch)
            predictions = model.decode(result)
            model_predictions.extend(predictions["tags"])

        for instance, prediction in zip(dataset.instances, model_predictions):
            fields = instance.fields
            # Predictions are label indices here; map them back to tag strings.
            predicted_tags = [model._vocab.get_token_from_index(x, namespace="labels")
                              for x in prediction]

            try:
                # Most sentences have a verbal predicate, but not all.
                verb_index = fields["verb_indicator"].labels.index(1)
            except ValueError:
                verb_index = None

            gold_tags = fields["tags"].labels
            sentence = fields["tokens"].tokens

            # The helper takes the predicted tags before the gold tags (the
            # same order as the two file arguments). Passing them the other
            # way round writes gold tags into the prediction file and
            # predictions into the gold file.
            write_to_conll_eval_file(prediction_file, gold_file,
                                     verb_index, sentence, predicted_tags, gold_tags)
def test_span_f1_matches_perl_script_for_continued_arguments(self):
    """
    A continued argument (``C-ARG1``) must be merged into the span it
    continues, and the resulting ARG1 true-positive count must agree with
    the number reported by the official perl scorer.
    """
    bio_tags = ["B-ARG1", "O", "B-C-ARG1", "B-V", "B-ARGM-ADJ", "O"]
    sentence = ["Mark", "and", "Matt", "were", "running", "fast", "."]

    gold_indices = [self.vocab.get_token_index(tag, "tags") for tag in bio_tags]
    gold_tensor = torch.Tensor([gold_indices])
    prediction_tensor = torch.rand([1, 6, self.vocab.get_vocab_size("tags")])
    # NOTE(review): the mask is 9 wide while the tensors above have 6 time
    # steps - left exactly as in the original test; confirm this is intended.
    mask = torch.LongTensor([[1] * 9])

    # Make prediction so that it is exactly correct.
    for position, gold_index in enumerate(gold_indices):
        prediction_tensor[0, position, gold_index] = 1

    metric = SpanBasedF1Measure(self.vocab, "tags")
    metric(prediction_tensor, gold_tensor, mask)
    metric_dict = metric.get_metric()

    # We merged the continued ARG1 label into a single span, so there should
    # be exactly 1 true positive for ARG1 and nothing present for C-ARG1
    assert metric._true_positives["ARG1"] == 1
    # The labels containing continuation references get merged into
    # the labels that they continue, so they should never appear in
    # the precision/recall counts.
    assert "C-ARG1" not in metric._true_positives.keys()
    assert metric._true_positives["V"] == 1
    assert metric._true_positives["ARGM-ADJ"] == 1

    # Every per-label and overall score must be perfect.
    for label in ("ARG1", "V", "ARGM-ADJ", "overall"):
        for score in ("recall", "precision", "f1-measure"):
            numpy.testing.assert_almost_equal(metric_dict[score + "-" + label], 1.0)

    # Check that the number of true positive ARG1 labels is the same as the perl script's output:
    gold_file_path = os.path.join(self.TEST_DIR, "gold_conll_eval.txt")
    prediction_file_path = os.path.join(self.TEST_DIR, "prediction_conll_eval.txt")
    with open(gold_file_path, "a+") as gold_file, open(prediction_file_path, "a+") as prediction_file:
        # Use the same bio tags as prediction vs gold to make it obvious by looking
        # at the perl script output if something is wrong.
        write_to_conll_eval_file(gold_file, prediction_file, 4, sentence, bio_tags, bio_tags)

    # Run the official perl script and collect stdout.
    stdout = subprocess.check_output(
        ["perl", str(self.TOOLS_ROOT / "srl-eval.pl"), prediction_file_path, gold_file_path],
        universal_newlines=True,
    )
    stdout_lines = stdout.split("\n")

    # Parse the stdout of the perl script to find the ARG1 row (this happens to be line 8).
    # Splitting on single spaces leaves empty strings between columns; drop them.
    arg1_columns = list(filter(None, stdout_lines[8].split(" ")))
    num_correct_arg1_instances_from_perl_evaluation = int(arg1_columns[1])
    assert num_correct_arg1_instances_from_perl_evaluation == metric._true_positives["ARG1"]
def write_predictions_viterbi_decoded(serialization_dir,
                                      split,
                                      epoch,
                                      predicted_tags,
                                      vocab: Vocabulary,
                                      tokens: Dict[str, torch.LongTensor],
                                      verb_indicator: torch.LongTensor,
                                      tags: torch.LongTensor = None,
                                      pos_tags: torch.LongTensor = None,
                                      spans: torch.LongTensor = None,
                                      span_labels: torch.LongTensor = None,
                                      metadata: Any = None):
    """
    Append one batch of gold and predicted SRL tags, in CoNLL format, to
    ``<serialization_dir>/predictions/{gold,predictions}-<split>-<epoch>.txt``.

    serialization_dir : str, required.
        Directory under which the ``predictions`` folder is created if missing.
    split : str, required.
        Name of the data split; becomes part of the file names.
    epoch : required.
        Epoch identifier; stringified into the file names.
    predicted_tags : required.
        Batch of predicted label indices (``.data.cpu()`` is called on it).
    vocab : Vocabulary, required.
        Used to map token indices (namespace ``"tokens"``) and label indices
        (namespace ``"labels"``) back to strings.
    tokens : Dict[str, torch.LongTensor], required.
        Text field tensors; ``tokens["tokens"]`` holds the token indices and
        the dict is also used to derive the padding mask.
    verb_indicator : torch.LongTensor, required.
        Per-token indicator; the first position equal to 1 is taken as the
        verb index.
    tags : torch.LongTensor
        Gold label indices. Despite the ``None`` default this is dereferenced
        unconditionally, so it must be provided.
    pos_tags, spans, span_labels, metadata :
        Accepted but not used by this function.
    """
    predictions_dir = os.path.join(serialization_dir, "predictions")
    prediction_file_path = os.path.join(predictions_dir,
                                        "predictions-" + split + "-" + str(epoch) + ".txt")
    gold_file_path = os.path.join(predictions_dir,
                                  "gold-" + split + "-" + str(epoch) + ".txt")

    # exist_ok=True covers both "already there" and the race where another
    # process creates the directory between a check and the mkdir.
    os.makedirs(os.path.dirname(prediction_file_path), exist_ok=True)

    sentences = tokens["tokens"]
    mask = get_text_field_mask(tokens)
    sentence_lengths = get_lengths_from_binary_sequence_mask(mask).data.tolist()

    # Files are opened in append mode because this is called once per batch;
    # the context manager closes them even if a sentence fails to write.
    with open(prediction_file_path, "a+") as prediction_file, open(gold_file_path, "a+") as gold_file:
        batch_rows = zip(sentences.data.cpu(),
                         tags.data.cpu(),
                         verb_indicator.data.cpu(),
                         sentence_lengths,
                         predicted_tags.data.cpu())
        for sentence, _gold_tags, _verb_indicator, _length, _predicted_tags in batch_rows:
            # Everything is truncated to the unpadded sentence length.
            # A separate name is used so the `tokens` argument is not shadowed.
            words = [str(vocab.get_token_from_index(x, namespace="tokens"))
                     for x in sentence[:_length]]
            gold_labels = [vocab.get_token_from_index(x, namespace="labels")
                           for x in _gold_tags[:_length]]
            prediction = [vocab.get_token_from_index(x, namespace="labels")
                          for x in _predicted_tags[:_length]]
            verb_flags = list(_verb_indicator[:_length])

            try:
                verb_index = verb_flags.index(1)
            except ValueError:
                # No position is flagged as the verb in this sentence.
                verb_index = None

            # Defined in semantic_role_labeler model implementation
            write_to_conll_eval_file(prediction_file=prediction_file,
                                     gold_file=gold_file,
                                     verb_index=verb_index,
                                     sentence=words,
                                     prediction=prediction,
                                     gold_labels=gold_labels)
def test_perl_eval_script_can_run_on_printed_conll_files(self):
    """
    Write the same tagged sentence twice to a gold and a prediction file and
    verify that ``srl-eval.pl`` runs on them with exit status 0.
    """
    bio_tags = ["B-ARG-1", "I-ARG-1", "O", "B-V", "B-ARGM-ADJ", "O"]
    sentence = ["Mark", "and", "Matt", "were", "running", "fast", "."]

    gold_file_path = os.path.join(self.TEST_DIR, "gold_conll_eval.txt")
    prediction_file_path = os.path.join(self.TEST_DIR, "prediction_conll_eval.txt")

    with open(gold_file_path, "a+") as gold_file, open(prediction_file_path, "a+") as prediction_file:
        # Use the same bio tags as prediction vs gold to make it obvious by looking
        # at the perl script output if something is wrong. Write them twice to
        # ensure that the perl script deals with multiple sentences.
        repetitions = 2
        for _ in range(repetitions):
            write_to_conll_eval_file(gold_file, prediction_file, 4, sentence, bio_tags, bio_tags)

    scorer = str(self.TOOLS_ROOT / "srl-eval.pl")
    perl_script_command = ["perl", scorer, prediction_file_path, gold_file_path]
    # A non-zero status makes check_call raise, which fails the test as well.
    exit_code = subprocess.check_call(perl_script_command)
    assert exit_code == 0
def main(serialization_directory, device):
    """
    Run a serialized model over its validation data and write the gold and
    predicted SRL tags to ``gold.txt`` / ``predictions.txt`` inside
    ``serialization_directory``.

    serialization_directory : str, required.
        The directory containing the serialized weights and ``config.json``.
    device: int, required.
        The device to run the evaluation on; passed to ``Model.load`` and to
        the iterator as ``cuda_device``.
    """
    config = Params.from_file(os.path.join(serialization_directory, "config.json"))

    dataset_reader = DatasetReader.from_params(config['dataset_reader'])
    evaluation_data_path = config['validation_data_path']

    model = Model.load(config, serialization_dir=serialization_directory, cuda_device=device)

    prediction_file_path = os.path.join(serialization_directory, "predictions.txt")
    gold_file_path = os.path.join(serialization_directory, "gold.txt")

    # The context manager guarantees both handles are closed even if reading
    # the data, running the model or writing a sentence raises.
    with open(prediction_file_path, "w+") as prediction_file, open(gold_file_path, "w+") as gold_file:
        # Load the evaluation data and index it.
        print("Reading evaluation data from {}".format(evaluation_data_path))
        instances = dataset_reader.read(evaluation_data_path)
        iterator = BasicIterator(batch_size=32)
        iterator.index_with(model.vocab)

        model_predictions = []
        batches = iterator(instances, num_epochs=1, shuffle=False, cuda_device=device, for_training=False)
        for batch in Tqdm.tqdm(batches):
            result = model(**batch)
            predictions = model.decode(result)
            model_predictions.extend(predictions["tags"])

        # shuffle=False above keeps predictions aligned with `instances`.
        for instance, prediction in zip(instances, model_predictions):
            fields = instance.fields
            try:
                # Most sentences have a verbal predicate, but not all.
                verb_index = fields["verb_indicator"].labels.index(1)
            except ValueError:
                verb_index = None

            gold_tags = fields["tags"].labels
            sentence = fields["tokens"].tokens

            write_to_conll_eval_file(prediction_file, gold_file,
                                     verb_index, sentence, prediction, gold_tags)
def write_predictions(serialization_dir, instances, model_predictions, split, epoch=None):
    """
    Append gold and predicted SRL tags, in CoNLL format, to files under
    ``<serialization_dir>/predictions``.

    serialization_dir : str, required.
        Directory under which the ``predictions`` folder is created if missing.
    instances : required.
        Iterable of instances whose ``fields`` provide ``"verb_indicator"``,
        ``"tags"`` and ``"tokens"``.
    model_predictions : required.
        Iterable of per-instance predicted tags, aligned with ``instances``.
    split : str, required.
        Name of the data split; becomes part of the file names.
    epoch : optional (default = None)
        When given, ``-<epoch>`` is added to the file names. Epoch ``0`` is a
        valid epoch and is included in the name; only ``None`` omits it.
    """
    # `is None` rather than truthiness, so that epoch 0 gets its own files
    # instead of being appended to the epoch-less ones.
    if epoch is not None:
        name_suffix = split + "-" + str(epoch) + ".txt"
    else:
        name_suffix = split + ".txt"
    prediction_file_path = os.path.join(serialization_dir, "predictions", "predictions-" + name_suffix)
    gold_file_path = os.path.join(serialization_dir, "predictions", "gold-" + name_suffix)

    # exist_ok=True covers both "already there" and the race where another
    # process creates the directory between a check and the mkdir.
    os.makedirs(os.path.dirname(prediction_file_path), exist_ok=True)

    logger.info("Writing gold srl tags (in conll file format) to %s", gold_file_path)
    logger.info("Writing predicted srl tags (in conll file format) to %s", prediction_file_path)

    # Append mode: repeated calls for the same split/epoch accumulate output.
    # The context manager closes both files even if a write raises.
    with open(prediction_file_path, "a+") as prediction_file, open(gold_file_path, "a+") as gold_file:
        for instance, prediction in zip(instances, model_predictions):
            fields = instance.fields
            try:
                # Most sentences have a verbal predicate, but not all.
                verb_index = fields["verb_indicator"].labels.index(1)
            except ValueError:
                verb_index = None

            gold_labels = fields["tags"].labels
            sentence = fields["tokens"].tokens

            # Defined in semantic_role_labeler model implementation
            write_to_conll_eval_file(prediction_file, gold_file,
                                     verb_index, sentence, prediction, gold_labels)
def main(serialization_directory: str,
         device: int,
         data: str,
         prefix: str,
         domain: str = None):
    """
    Run an archived model over evaluation data and write the gold and
    predicted SRL tags to ``<prefix>_gold.txt`` / ``<prefix>_predictions.txt``
    inside ``serialization_directory``.

    serialization_directory : str, required.
        The directory containing the serialized weights (``model.tar.gz``)
        and ``config.json``.
    device: int, required.
        The device to run the evaluation on.
    data: str, required.
        The data to evaluate on. If falsy, the validation data path from the
        original experiment's config is used instead.
    prefix: str, required.
        The prefix to prepend to the generated gold and prediction files, to
        distinguish different models/data.
    domain: str, optional (default = None)
        If passed, filters the ontonotes evaluation/test dataset to only contain the
        specified domain. This overwrites the domain in the config file from the model,
        to allow evaluation on domains other than the one the model was trained on.
        The domain is also prepended to ``prefix``.
    """
    config = Params.from_file(os.path.join(serialization_directory, "config.json"))

    if domain is not None:
        # Hack to allow evaluation on different domains than the
        # model was trained on.
        config["dataset_reader"]["domain_identifier"] = domain
        prefix = f"{domain}_{prefix}"
    else:
        config["dataset_reader"].pop("domain_identifier", None)

    dataset_reader = DatasetReader.from_params(config['dataset_reader'])
    evaluation_data_path = data if data else config['validation_data_path']

    archive = load_archive(os.path.join(serialization_directory, "model.tar.gz"), cuda_device=device)
    model = archive.model
    model.eval()

    prediction_file_path = os.path.join(serialization_directory, prefix + "_predictions.txt")
    gold_file_path = os.path.join(serialization_directory, prefix + "_gold.txt")

    # The context manager guarantees both handles are closed even if reading
    # the data, running the model or writing a sentence raises.
    with open(prediction_file_path, "w+") as prediction_file, open(gold_file_path, "w+") as gold_file:
        # Load the evaluation data and index it.
        print("reading evaluation data from {}".format(evaluation_data_path))
        instances = dataset_reader.read(evaluation_data_path)

        with torch.autograd.no_grad():
            iterator = BasicIterator(batch_size=32)
            iterator.index_with(model.vocab)

            model_predictions = []
            batches = iterator(instances, num_epochs=1, shuffle=False, cuda_device=device)
            for batch in Tqdm.tqdm(batches):
                result = model(**batch)
                predictions = model.decode(result)
                model_predictions.extend(predictions["tags"])

        # shuffle=False above keeps predictions aligned with `instances`.
        for instance, prediction in zip(instances, model_predictions):
            fields = instance.fields
            try:
                # Most sentences have a verbal predicate, but not all.
                verb_index = fields["verb_indicator"].labels.index(1)
            except ValueError:
                verb_index = None

            gold_tags = fields["tags"].labels
            # The helper is given plain strings, so unwrap the token objects.
            sentence = [x.text for x in fields["tokens"].tokens]

            write_to_conll_eval_file(prediction_file, gold_file,
                                     verb_index, sentence, prediction, gold_tags)
def main(serialization_directory: str,
         device: int,
         data: str,
         prefix: str,
         domain: str = None):
    """
    Run an archived model over evaluation data and write the gold and
    predicted SRL tags to ``<prefix>_gold.txt`` / ``<prefix>_predictions.txt``
    inside ``serialization_directory``.

    serialization_directory : str, required.
        The directory containing the serialized weights (``model.tar.gz``)
        and ``config.json``.
    device: int, required.
        The device to run the evaluation on.
    data: str, required.
        The data to evaluate on. If falsy, the validation data path from the
        original experiment's config is used instead.
    prefix: str, required.
        The prefix to prepend to the generated gold and prediction files, to
        distinguish different models/data.
    domain: str, optional (default = None)
        If passed, filters the ontonotes evaluation/test dataset to only contain the
        specified domain. This overwrites the domain in the config file from the model,
        to allow evaluation on domains other than the one the model was trained on.
        The domain is also prepended to ``prefix``.
    """
    config = Params.from_file(os.path.join(serialization_directory, "config.json"))

    if domain is not None:
        # Hack to allow evaluation on different domains than the
        # model was trained on.
        config["dataset_reader"]["domain_identifier"] = domain
        prefix = f"{domain}_{prefix}"
    else:
        config["dataset_reader"].pop("domain_identifier", None)

    dataset_reader = DatasetReader.from_params(config["dataset_reader"])
    evaluation_data_path = data if data else config["validation_data_path"]

    archive = load_archive(os.path.join(serialization_directory, "model.tar.gz"), cuda_device=device)
    model = archive.model
    model.eval()

    prediction_file_path = os.path.join(serialization_directory, prefix + "_predictions.txt")
    gold_file_path = os.path.join(serialization_directory, prefix + "_gold.txt")

    # The context manager guarantees both handles are closed even if reading
    # the data, running the model or writing a sentence raises.
    with open(prediction_file_path, "w+") as prediction_file, open(gold_file_path, "w+") as gold_file:
        # Load the evaluation data and index it.
        print("reading evaluation data from {}".format(evaluation_data_path))
        instances = dataset_reader.read(evaluation_data_path)

        with torch.autograd.no_grad():
            iterator = BasicIterator(batch_size=32)
            iterator.index_with(model.vocab)

            model_predictions = []
            batches = iterator(instances, num_epochs=1, shuffle=False)
            for batch in Tqdm.tqdm(batches):
                batch = move_to_device(batch, device)
                result = model(**batch)
                predictions = model.decode(result)
                model_predictions.extend(predictions["tags"])

        # shuffle=False above keeps predictions aligned with `instances`.
        for instance, prediction in zip(instances, model_predictions):
            # Verb position, gold tags and words are all read from the
            # instance's metadata field in this version.
            metadata = instance.fields["metadata"]
            verb_index = metadata["verb_index"]
            gold_tags = metadata["gold_tags"]
            sentence = metadata["words"]

            write_to_conll_eval_file(prediction_file, gold_file,
                                     verb_index, sentence, prediction, gold_tags)
def srl_conll_evaluate(pred_file: str, gold_file: str, silent: bool = False, min_length=0):
    """
    Score SRL predictions against gold data with the CoNLL ``srl-eval.pl`` script.

    Both inputs are loaded with ``load_srl_data``, re-written in CoNLL format
    next to the originals (``<file>.conll``, or ``<file>.conll_<min_length>``
    when ``min_length > 0``), and the perl script is run on those two files.

    pred_file : str, required.
        Path to the file holding the model predictions.
    gold_file : str, required.
        Path to the file holding the gold annotations. Must contain the same
        number of records as ``pred_file``.
    silent : bool, optional (default = False)
        If False, the raw output of the perl script is printed.
    min_length : int, optional (default = 0)
        If positive, sentences with fewer words than this are skipped.

    Returns
    -------
    A dict with ``"conll_f1"`` (overall F1), ``"perfect_props_percent"`` and
    ``"label_f1s"`` (a dict mapping each label to its F1).
    """
    # Local import so this block does not depend on the module's import list.
    import subprocess

    pred_data = load_srl_data(pred_file)
    gold_data = load_srl_data(gold_file)

    conll_suffix = ".conll_" + str(min_length) if min_length > 0 else ".conll"
    pred_conll_file = pred_file + conll_suffix
    gold_conll_file = gold_file + conll_suffix

    assert len(pred_data) == len(gold_data)

    with open(pred_conll_file, mode='w') as pred_conll, open(gold_conll_file, mode='w') as gold_conll:
        for gold, pred in zip(gold_data, pred_data):
            try:
                # Most sentences have a verbal predicate, but not all.
                # NOTE(review): a missing dict key raises KeyError, not
                # ValueError, so this handler may never fire - confirm what
                # load_srl_data returns before changing it.
                verb_index = gold["target_verb_position"]
            except ValueError:
                verb_index = None

            gold_tags = gold["tags"]
            pred_tags = pred["tags"]
            sentence = gold["words"]

            if min_length > 0 and len(sentence) < min_length:
                continue

            write_to_conll_eval_file(pred_conll, gold_conll,
                                     verb_index, sentence, pred_tags, gold_tags)

    eval_script = "gcd/metrics/srl_perl/bin/srl-eval.pl"
    eval_lib = "gcd/metrics/srl_perl/lib"
    # Run perl with an argument list instead of a shell string, so file names
    # containing spaces or shell metacharacters are passed through verbatim.
    # As before, the exit status is not checked. Unlike the shell version, a
    # missing perl executable now raises FileNotFoundError here.
    completed = subprocess.run(
        ["perl", "-I", eval_lib, eval_script, gold_conll_file, pred_conll_file],
        stdout=subprocess.PIPE,
        universal_newlines=True,
    )
    result = completed.stdout.split('\n')

    if not silent:
        for r in result:
            print(r)

    # Example of the script output being parsed (index = position in `result`;
    # line 3 is presumably blank, which is what puts "Overall" at index 6):
    #
    #   [0] Number of Sentences    :        3248
    #   [1] Number of Propositions :        3221
    #   [2] Percentage of perfect props :  68.89
    #   [4]               corr.  excess  missed    prec.    rec.      F1
    #   [5] ------------------------------------------------------------
    #   [6]    Overall     4810     997    1081    82.83   81.65   82.24
    #   [7] ----------
    #   [8]         A0     1803     287     268    86.27   87.06   86.66
    #               A1     2448     521     525    82.45   82.34   82.40
    #               A2      450     163     218    73.41   67.37   70.26
    #               A3       67      16      45    80.72   59.82   68.72
    #               A4       41      10      24    80.39   63.08   70.69
    #               A5        1       0       1   100.00   50.00   66.67
    #       ------------------------------------------------------------
    #       ------------------------------------------------------------
    conll_f1 = float(result[6].strip().split()[-1])
    perfect_props_percent = float(result[2].strip().split(':')[-1])

    label_f1s = {}
    for r in result[8:]:
        columns = r.split()
        if not columns:
            # A blank line also ends the per-label table.
            break
        try:
            label_f1 = float(columns[-1])
        except ValueError:
            # The dashed separator after the last label row is not a number.
            break
        label_f1s[columns[0]] = label_f1

    return {
        "conll_f1": conll_f1,
        "perfect_props_percent": perfect_props_percent,
        "label_f1s": label_f1s
    }