def test_pyramids(self):
    """Do some basic sanity tests on the files."""
    pyramids = JsonlReader(_pyramids_file_path, Pyramid).read()
    annotations = JsonlReader(_pyramid_annotations_file_path, PyramidAnnotation).read()

    instance_id_to_pyramid = {}
    for pyramid in pyramids:
        instance_id_to_pyramid[pyramid.instance_id] = pyramid

    instance_id_to_annotations = defaultdict(list)
    for annotation in annotations:
        instance_id_to_annotations[annotation.instance_id].append(annotation)

    assert instance_id_to_pyramid.keys() == instance_id_to_annotations.keys()

    for instance_id, pyramid in instance_id_to_pyramid.items():
        assert len(pyramid.summaries) == 4
        assert len(pyramid.summarizer_ids) == 4
        for reference in pyramid.summaries:
            assert len(reference) > 0

        scu_ids = set([scu.scu_id for scu in pyramid.scus])
        for annotation in instance_id_to_annotations[instance_id]:
            assert len(annotation.summary) > 0, (instance_id, annotation.summarizer_id)
            for scu in annotation.scus:
                assert scu.scu_id in scu_ids, (scu.scu_id, scu_ids)
def read(self, documents_jsonl: str, summaries_jsonl: str) -> List[EvalInstance]:
    logger.info(f'Loading documents from {documents_jsonl}')
    documents_dict = {}
    with JsonlReader(documents_jsonl) as f:
        for data in f:
            instance_id = data['instance_id']
            if 'document' in data:
                documents = [data['document']['text']]
            else:
                documents = [document['text'] for document in data['documents']]
            documents = flatten_documents(documents)
            documents_dict[instance_id] = DocumentsField(documents)
    logger.info(f'Loaded {len(documents_dict)} document sets')

    logger.info(f'Loading summaries from {summaries_jsonl}')
    instances = []
    with JsonlReader(summaries_jsonl) as f:
        for data in f:
            fields = {}
            fields['summary'] = SummaryField(data['summary']['text'])

            instance_id = data['instance_id']
            fields['documents'] = documents_dict[instance_id]

            fields = Fields(fields)
            instance = EvalInstance(data['instance_id'], data['summarizer_id'],
                                    data['summarizer_type'], fields)
            instances.append(instance)
    logger.info(f'Loaded {len(instances)} instances')
    return instances
def test_pyramid_score(self):
    # This is a regression test, not necessarily a test for correctness
    pyramids = {
        pyramid.instance_id: pyramid
        for pyramid in JsonlReader(_pyramid_file_path, Pyramid).read()
    }
    annotations = JsonlReader(_annotation_file_path, PyramidAnnotation).read()
    annotation_pyramids = [
        pyramids[annotation.instance_id] for annotation in annotations
    ]

    metric = PyramidScore()
    actual_output = metric.score_all(annotations, annotation_pyramids)[:5]
    expected_output = [
        {'modified_pyramid_score': 0.2413793103448276},
        {'modified_pyramid_score': 0.0},
        {'modified_pyramid_score': 0.06896551724137931},
        {'modified_pyramid_score': 0.034482758620689655},
        {'modified_pyramid_score': 0.1724137931034483},
    ]
    for i, (expected, actual) in enumerate(zip(expected_output, actual_output)):
        assert actual.approx_equal(MetricsDict(expected), abs=1e-4), \
            f'Instance {i} not equal. Expected {expected}, actual {actual}'
def main(args):
    random.seed(args.random_seed)

    summaries = JsonlReader(args.summaries_jsonl).read()
    metrics_list = JsonlReader(args.metrics_jsonl).read()

    instance_ids = sample_instances(summaries, args.num_instances)
    summarizer_ids1 = sample_summarizers(summaries, args.num_summaries)
    summarizer_ids2 = sample_next_summarizers(summaries, summarizer_ids1,
                                              16 - args.num_summaries)
    union = summarizer_ids1 | summarizer_ids2
    print(summarizer_ids1)
    print(summarizer_ids2)

    save(summaries, instance_ids, union, args.output_summaries_jsonl)
    save(metrics_list, instance_ids, union, args.output_metrics_jsonl)
    save(summaries, instance_ids, summarizer_ids1, args.output_summaries_jsonl_1)
    save(metrics_list, instance_ids, summarizer_ids1, args.output_metrics_jsonl_1)
    save(summaries, instance_ids, summarizer_ids2, args.output_summaries_jsonl_2)
    save(metrics_list, instance_ids, summarizer_ids2, args.output_metrics_jsonl_2)
def main(args):
    expert_answers = JsonlReader(args.expert_answers_jsonl).read()
    model_answers = JsonlReader(args.model_answers_jsonl).read()

    expert_answers = filter_to_summarizer_type(expert_answers, args.summarizer_type)
    model_answers = filter_to_summarizer_type(model_answers, args.summarizer_type)

    annotations_dict = load_annotations(args.annotations_csv, expert_answers, model_answers)
    is_answerable_dict = get_ground_truth_answerable(expert_answers)

    process_expert_answers(expert_answers, annotations_dict, is_answerable_dict)
    print()
    process_model_answers(model_answers, annotations_dict, is_answerable_dict,
                          args.output_right_for_wrong_reasons_csv)
    print()

    print('Saving expert')
    save_human_judgments(expert_answers, annotations_dict, args.output_expert_jsonl)
    print('Saving model')
    save_human_judgments(model_answers, annotations_dict, args.output_model_jsonl)
def read(self) -> List[EvalInstance]:
    pyramids = JsonlReader(self.pyramid_jsonl, Pyramid).read()
    annotations = JsonlReader(self.annotation_jsonl, PyramidAnnotation).read()

    # Enumerate the peers
    instance_id_to_pyramid = {pyramid.instance_id: pyramid for pyramid in pyramids}
    eval_instances = []
    for annotation in annotations:
        summary = PyramidAnnotationField(annotation)
        pyramid = instance_id_to_pyramid[annotation.instance_id]
        fields = Fields({'pyramid': PyramidField(pyramid)})
        eval_instances.append(
            EvalInstance(annotation.instance_id, annotation.summarizer_id,
                         annotation.summarizer_type, summary, fields))

    # Enumerate the references
    for pyramid in pyramids:
        for i, summary in enumerate(pyramid.summaries):
            annotation = pyramid.get_annotation(i)
            summary = PyramidAnnotationField(annotation)
            fields = Fields({'pyramid': PyramidField(pyramid.remove_summary(i))})
            eval_instances.append(
                EvalInstance(annotation.instance_id, annotation.summarizer_id,
                             annotation.summarizer_type, summary, fields))

    return eval_instances
def test_sanity_checks(self):
    file_paths = glob(f'{_mds_dir}/??.train.jsonl')
    assert len(file_paths) == 10  # 10 languages

    # Make sure each one has 3 instances with non-empty documents and summaries
    for file_path in file_paths:
        instances = JsonlReader(file_path).read()
        assert len(instances) == 3
        for instance in instances:
            assert len(instance['documents']) > 0
            for document in instance['documents']:
                assert len(document['text']) > 0
            assert len(instance['summaries']) > 0
            for summary in instance['summaries']:
                assert len(summary['text']) > 0

    # The test split currently does not have summaries
    file_paths = glob(f'{_mds_dir}/??.test.jsonl')
    assert len(file_paths) == 10  # 10 languages

    # Make sure each one has 7 instances with non-empty documents and no summaries
    for file_path in file_paths:
        instances = JsonlReader(file_path).read()
        assert len(instances) == 7
        for instance in instances:
            assert len(instance['documents']) > 0
            for document in instance['documents']:
                assert len(document['text']) > 0
            assert 'summaries' not in instance
def setUpClass(cls) -> None:
    super().setUpClass()
    cls.instance_ids = []
    cls.summaries = []
    cls.references_list = []
    with JsonlReader(MULTILING_SUMMARIES) as f:
        for instance in f:
            cls.instance_ids.append(instance['instance_id'])
            cls.summaries.append(instance['summary']['text'])
            references = []
            for reference in instance['references']:
                references.append(reference['text'])
            cls.references_list.append(references)

    # Load the documents, grouped by instance id, then put them into a list
    # parallel with the instances
    cls.documents_dict = {}
    with JsonlReader(MULTILING_DOCUMENTS) as f:
        for instance in f:
            cls.documents_dict[instance['instance_id']] = [
                document['text'] for document in instance['documents']
            ]
    cls.documents_list = []
    for instance_id in cls.instance_ids:
        cls.documents_list.append(cls.documents_dict[instance_id])
def read(self, pyramid_jsonl: str, annotation_jsonl: str) -> List[EvalInstance]:
    logger.info(f'Loading Pyramids from {pyramid_jsonl}')
    pyramids = {}
    with JsonlReader(pyramid_jsonl, Pyramid) as f:
        for pyramid in f:
            pyramids[pyramid.instance_id] = pyramid
    logger.info(f'Loaded {len(pyramids)} pyramids')

    logger.info(f'Loading Pyramid annotations from {annotation_jsonl}')
    instances = []
    instance_ids = set()
    with JsonlReader(annotation_jsonl, PyramidAnnotation) as f:
        for annotation in f:
            fields = Fields({
                'annotation': PyramidAnnotationField(annotation),
                'pyramid': PyramidField(pyramids[annotation.instance_id])
            })
            instance = EvalInstance(annotation.instance_id,
                                    annotation.summarizer_id,
                                    annotation.summarizer_type, fields)
            instances.append(instance)
            instance_ids.add(annotation.instance_id)
    logger.info(f'Loaded {len(instances)} Pyramid annotations')

    if self.include_reference_annotations:
        logger.info('Generating Pyramid annotations for the reference summaries')
        reference_instances = []
        for instance_id in instance_ids:
            pyramid = pyramids[instance_id]
            # We can only do this if there are > 1 summaries used to construct the pyramid
            if len(pyramid.summarizer_ids) > 1:
                for i in range(len(pyramid.summarizer_ids)):
                    annotation = pyramid.get_annotation(i)
                    reduced_pyramid = pyramid.remove_summary(i)
                    fields = Fields({
                        'annotation': PyramidAnnotationField(annotation),
                        'pyramid': PyramidField(reduced_pyramid)
                    })
                    instance = EvalInstance(annotation.instance_id,
                                            annotation.summarizer_id,
                                            annotation.summarizer_type, fields)
                    reference_instances.append(instance)
        logger.info(f'Generated {len(reference_instances)} reference summary annotations')
        instances.extend(reference_instances)

    logger.info(f'Loaded a total of {len(instances)} instances')
    return instances
def main(args):
    metrics_list = JsonlReader(args.input_file, Metrics).read()
    aggregated_metrics = aggregate_metrics(metrics_list)

    with JsonlWriter(args.output_file) as out:
        # I'm being lazy and just iterating twice because the aggregate method edited the metrics in place
        for metrics in JsonlReader(args.input_file, Metrics).read():
            metrics.metrics['overall_responsiveness'] = \
                aggregated_metrics[metrics.summarizer_id]['overall_responsiveness']
            out.write(metrics)
def test_score_multi_all(self):
    instances = JsonlReader(_instances_file_path).read()
    summaries_lists, documents_lists, instance_id_to_index, summarizer_id_to_index = \
        self._setup_multi_all(instances)

    simetrix = SIMetrix()
    metrics_lists = simetrix.score_multi_all(summaries_lists, documents_lists)

    for expected_metrics in JsonlReader(_summary_metrics_file_path).read():
        instance_index = instance_id_to_index[expected_metrics['instance_id']]
        summarizer_index = summarizer_id_to_index[expected_metrics['summarizer_id']]
        actual_metrics = metrics_lists[instance_index][summarizer_index]
        assert actual_metrics == expected_metrics['metrics']
def test_pyramid_examples(self):
    """Pick some random examples and test them."""
    pyramids = JsonlReader(_pyramids_file_path, Pyramid).read()
    annotations = JsonlReader(_pyramid_annotations_file_path, PyramidAnnotation).read()

    instance_id_to_pyramid = {}
    for pyramid in pyramids:
        instance_id_to_pyramid[pyramid.instance_id] = pyramid

    instance_id_to_annotations = defaultdict(dict)
    for annotation in annotations:
        instance_id_to_annotations[annotation.instance_id][annotation.summarizer_id] = annotation

    pyramid = instance_id_to_pyramid['d1101-A']
    assert pyramid.summarizer_ids == ['A', 'B', 'C', 'D']

    annotation = instance_id_to_annotations['d1101-A']['1']
    label = 'Roberts entered the school'
    contrib_label = 'gunman inside their tiny one-room schoolhouse'
    assert len(annotation.scus) == 2
    assert annotation.scus[0].scu_id == 2
    assert len(annotation.scus[0].contributors) == 1
    assert annotation.scus[0].contributors[0].label == contrib_label
    assert len(annotation.scus[0].contributors[0].parts) == 1
    assert annotation.scus[0].contributors[0].parts[0].text == contrib_label
    start = annotation.scus[0].contributors[0].parts[0].start
    end = annotation.scus[0].contributors[0].parts[0].end
    assert annotation.summary[start:end] == contrib_label
    self._test_example(pyramid, annotation)

    # This is an example of a pyramid annotation that has SCUs that are
    # not present in the pyramid, therefore they are missing here.
    pyramid = instance_id_to_pyramid['d1112-B']
    assert pyramid.summarizer_ids == ['A', 'C', 'G', 'H']

    annotation = instance_id_to_annotations['d1112-B']['22']
    contrib_label = 'some jurors in the Metrolink train derailment case last month said they really didn\'t think Alvarez intended to kill anyone'
    part_label = 'some jurors in the Metrolink train derailment case last month said they really didn\'t think Alvarez intended to kill anyone'
    assert len(annotation.scus) == 1
    assert annotation.scus[0].scu_id == 41
    assert len(annotation.scus[0].contributors) == 1
    assert annotation.scus[0].contributors[0].label == contrib_label
    assert len(annotation.scus[0].contributors[0].parts) == 1
    assert annotation.scus[0].contributors[0].parts[0].text == part_label
    start = annotation.scus[0].contributors[0].parts[0].start
    end = annotation.scus[0].contributors[0].parts[0].end
    assert annotation.summary[start:end] == part_label
def main(args):
    random.seed(4)

    instances = JsonlReader(args.input_jsonl).read()
    instance_id_to_instances = defaultdict(list)
    for instance in instances:
        instance_id_to_instances[instance['instance_id']].append(instance)
    instance_ids = list(sorted(instance_id_to_instances.keys()))

    # Each point on the learning curve. Also make sure that the number of inputs isn't larger
    # than the number of instances and the max number of inputs is included.
    num_inputs_list = args.num_inputs
    while num_inputs_list[-1] >= len(instance_ids):
        num_inputs_list.pop()
    num_inputs_list.append(len(instance_ids))

    for num_inputs in tqdm(num_inputs_list):
        for sample_index in range(args.num_samples):
            with JsonlWriter(f'{args.output_dir}/{num_inputs}/{sample_index}.jsonl') as out:
                random.shuffle(instance_ids)
                sample = list(sorted(instance_ids[:num_inputs]))
                for instance_id in sample:
                    for instance in instance_id_to_instances[instance_id]:
                        out.write(instance)
def load_metrics(metrics_files: List[str]) -> List[Metrics]:
    logger.info(f'Loading metrics from {metrics_files}')
    metrics_list = []
    for metrics_file in metrics_files:
        metrics_list.extend(JsonlReader(metrics_file, Metrics).read())
    logger.info(f'Loaded {len(metrics_list)} metrics objects')
    return metrics_list
def _check_micro_list(self, micro_list: List[Metrics]) -> None:
    instances = JsonlReader(_summaries_file_path).read()
    assert len(micro_list) == len(instances)
    for micro, instance in zip(micro_list, instances):
        assert micro.instance_id == instance['instance_id']
        assert micro.summarizer_id == instance['summarizer_id']
        assert micro.summarizer_type == instance['summarizer_type']

    # Test a couple of cases. Again, I did not manually test these, but they
    # will catch if anything changes
    assert micro_list[1].metrics == {
        'python-rouge-1': {
            'precision': 30.412371134020617,
            'recall': 29.72292191435768,
            'f1': 30.06369426751592
        },
        'python-rouge-2': {
            'precision': 2.604166666666667,
            'recall': 2.5445292620865136,
            'f1': 2.5740025740025736
        }
    }
    assert micro_list[2848].metrics == {
        'python-rouge-1': {
            'precision': 29.207920792079207,
            'recall': 28.780487804878046,
            'f1': 28.99262899262899
        },
        'python-rouge-2': {
            'precision': 8.5,
            'recall': 8.374384236453201,
            'f1': 8.436724565756823
        }
    }
def test_fr_summary_level(self):
    # Test a few random metrics to ensure the data was parsed correctly
    metrics_list = JsonlReader(_fr_file_path, Metrics).read()
    metrics_dicts = self._convert_to_dicts(metrics_list)

    assert metrics_dicts['M004']['1']['rouge-1']['precision'] == pytest.approx(42.791, abs=1e-3)
    assert metrics_dicts['M004']['1']['rouge-1']['recall'] == pytest.approx(44.753, abs=1e-3)
    assert metrics_dicts['M004']['1']['rouge-1']['f1'] == pytest.approx(43.750, abs=1e-3)

    assert metrics_dicts['M009']['2']['rouge-2']['precision'] == pytest.approx(17.770, abs=1e-3)
    assert metrics_dicts['M009']['2']['rouge-2']['recall'] == pytest.approx(17.813, abs=1e-3)
    assert metrics_dicts['M009']['2']['rouge-2']['f1'] == pytest.approx(17.791, abs=1e-3)

    assert metrics_dicts['M002']['5']['MeMoG'] == pytest.approx(0.24885254, abs=1e-4)
    assert metrics_dicts['M004']['3']['MeMoG'] == pytest.approx(0.46445730, abs=1e-4)

    assert metrics_dicts['M000']['1']['grade'] == [3, 1, 3]
    assert metrics_dicts['M006']['4']['grade'] == [1, 3, 3]

    assert metrics_dicts['M000']['3']['length_aware_grade'] == [2.73, 2.73, 3.63]
    assert metrics_dicts['M006']['4']['length_aware_grade'] == [1.00, 3.00, 3.00]
def main(args):
    aqs = load_answered_questions(args.expert_answers_jsonl)

    with JsonlWriter(args.output_jsonl) as out:
        for instance in JsonlReader(args.questions_jsonl).read():
            instance_id = instance['instance_id']
            summarizer_id = instance['summarizer_id']
            new_references = []
            for reference in instance['references']:
                reference_id = reference['summarizer_id']
                new_questions = []
                for question in reference['questions']:
                    prompt_id = question['prompt_id']
                    key = (instance_id, summarizer_id, reference_id, prompt_id)
                    if key in aqs:
                        new_questions.append(question)
                if len(reference['questions']) != len(new_questions):
                    print(f'({instance_id}, {summarizer_id}, {reference_id}) only has {len(new_questions)} / {len(reference["questions"])} answered')
                if len(new_questions) > 0:
                    reference['questions'] = new_questions
                    new_references.append(reference)
            if len(instance['references']) != len(new_references):
                print(f'({instance_id}, {summarizer_id}) only has {len(new_references)} / {len(instance["references"])} references')
            if len(new_references) > 0:
                instance['references'] = new_references
                out.write(instance)
def load_contributions(scores_jsonl: str) -> Dict[str, float]:
    metrics_dicts = []
    for metrics in JsonlReader(scores_jsonl, Metrics).read():
        if metrics.summarizer_type != 'peer':
            continue
        metrics_dicts.append(metrics.metrics)
    # Average the metrics over all peer summaries (MetricsDict supports + and /)
    return sum(metrics_dicts) / len(metrics_dicts)
def _run_answer_questions(
        self, input_file: str,
        output_file: str) -> Dict[Tuple[str, str], Dict[str, float]]:
    logger.info('Running answering questions')
    commands = [f'cd {self.apes_root}/rc-cnn-dailymail']
    commands.append(f'source {os.environ["CONDA_INIT"]}')
    commands.append(f'conda activate {self.environment_name}')
    commands.append(
        f'python2.7 code/run_qa_model.py --input_file {input_file} --output_file {output_file} --train_path cnn_train.txt --dev_path cnn_dev.txt --glove_path glove.6B.100d.txt')
    command = ' && '.join(commands)

    logger.info(f'Running command: "{command}"')
    redirect = None if self.verbose else PIPE
    process = Popen(command, stdout=redirect, stderr=redirect, shell=True)
    process.communicate()

    ids_to_scores = {}
    with JsonlReader(output_file) as f:
        for data in f:
            summarizer_id = data['answering_doc']
            reference_id = data['questioning_doc']
            accuracy = data['acc']
            num_correct = data['num_correct']
            ids_to_scores[(summarizer_id, reference_id)] = {
                'accuracy': accuracy,
                'num_correct': num_correct
            }
    return ids_to_scores
def load_metrics_dicts(file_path: str) -> Dict[str, Dict[str, MetricsDict]]:
    metrics_dicts = defaultdict(dict)
    with JsonlReader(file_path, Metrics) as f:
        for instance in f:
            metrics_dicts[instance.instance_id][instance.summarizer_id] = instance.metrics
    return metrics_dicts
def _check_micro_list_arguments(self, micro_list: List[Metrics]) -> None:
    instances = JsonlReader(MULTILING_SUMMARIES).read()
    assert len(micro_list) == len(instances)
    for micro, instance in zip(micro_list, instances):
        assert micro.instance_id == instance['instance_id']
        assert micro.summarizer_id == instance['summarizer_id']
        assert micro.summarizer_type == instance['summarizer_type']

    assert micro_list[0].metrics == {
        'python-rouge-3': {
            'precision': 3.0952380952380953,
            'recall': 3.110047846889952,
            'f1': 3.1026252983293556
        },
        'python-rouge-l': {
            'precision': 20.657276995305164,
            'recall': 20.754716981132077,
            'f1': 20.705882352941174
        }
    }
    assert micro_list[11].metrics == {
        'python-rouge-3': {
            'precision': 4.273504273504273,
            'recall': 3.816793893129771,
            'f1': 4.03225806451613
        },
        'python-rouge-l': {
            'precision': 28.15126050420168,
            'recall': 25.18796992481203,
            'f1': 26.58730158730159
        }
    }
def _check_micro_list(self, micro_list: List[Metrics]) -> None:
    instances = JsonlReader(MULTILING_SUMMARIES).read()
    assert len(micro_list) == len(instances)
    for micro, instance in zip(micro_list, instances):
        assert micro.instance_id == instance['instance_id']
        assert micro.summarizer_id == instance['summarizer_id']
        assert micro.summarizer_type == instance['summarizer_type']

    assert micro_list[0].metrics == {
        'python-rouge-1': {
            'precision': 41.699867197875164,
            'recall': 40.516129032258064,
            'f1': 41.09947643979057
        },
        'python-rouge-2': {
            'precision': 10.533333333333333,
            'recall': 10.233160621761659,
            'f1': 10.38107752956636
        }
    }
    assert micro_list[11].metrics == {
        'python-rouge-1': {
            'precision': 45.34412955465587,
            'recall': 44.71057884231537,
            'f1': 45.0251256281407
        },
        'python-rouge-2': {
            'precision': 15.24390243902439,
            'recall': 15.030060120240481,
            'f1': 15.136226034308779
        }
    }
def test_en_summary_level(self):
    # Test a few random metrics to ensure the data was parsed correctly
    metrics_list = JsonlReader(_en_file_path, Metrics).read()
    metrics_dicts = self._convert_to_dicts(metrics_list)

    assert metrics_dicts['M004']['1']['rouge-1']['precision'] == pytest.approx(50.525, abs=1e-3)
    assert metrics_dicts['M004']['1']['rouge-1']['recall'] == pytest.approx(50.459, abs=1e-3)
    assert metrics_dicts['M004']['1']['rouge-1']['f1'] == pytest.approx(50.492, abs=1e-3)

    assert metrics_dicts['M009']['2']['rouge-2']['precision'] == pytest.approx(17.647, abs=1e-3)
    assert metrics_dicts['M009']['2']['rouge-2']['recall'] == pytest.approx(18.072, abs=1e-3)
    assert metrics_dicts['M009']['2']['rouge-2']['f1'] == pytest.approx(17.857, abs=1e-3)

    assert metrics_dicts['M002']['5']['MeMoG'] == pytest.approx(0.32643265, abs=1e-4)
    assert metrics_dicts['M004']['3']['MeMoG'] == pytest.approx(0.57679360, abs=1e-4)

    assert metrics_dicts['M000']['1']['grade'] == [2, 3, 3]
    assert metrics_dicts['M006']['4']['grade'] == [1, 3, 1]

    assert metrics_dicts['M000']['3']['length_aware_grade'] == [2.88, 3.83, 3.83]
    assert metrics_dicts['M006']['4']['length_aware_grade'] == [1.00, 3.00, 1.00]
def test_task2_metrics(self):
    summary_level_metrics = JsonlReader(_task2_metrics_file_path, Metrics).read()
    summary_level_metrics = self._convert_to_dicts(summary_level_metrics)

    # Check a few metrics to make sure they are equal to what's in the NIST files. Unlike
    # in "system_level_test.py", these metrics don't have aggregated values in the NIST data,
    # so we just check that a few of them were correctly parsed
    # updateEval/Pyramid/scoring/2007_modified_scores.txt
    assert summary_level_metrics['d0703-A']['39']['modified_pyramid_score'] == pytest.approx(0.2857, 1e-2)
    assert summary_level_metrics['d0711-B']['45']['modified_pyramid_score'] == pytest.approx(0.2353, 1e-2)
    assert summary_level_metrics['d0726-C']['51']['modified_pyramid_score'] == pytest.approx(0.1739, 1e-2)

    assert summary_level_metrics['d0703-A']['39']['num_scus'] == 3
    assert summary_level_metrics['d0711-B']['45']['num_scus'] == 1
    assert summary_level_metrics['d0726-C']['51']['num_scus'] == 2

    assert summary_level_metrics['d0703-A']['39']['num_repetitions'] == 0
    assert summary_level_metrics['d0711-B']['45']['num_repetitions'] == 0
    assert summary_level_metrics['d0726-C']['51']['num_repetitions'] == 0

    assert summary_level_metrics['d0703-A']['39']['content_responsiveness'] == 2
    assert summary_level_metrics['d0711-B']['45']['content_responsiveness'] == 2
    assert summary_level_metrics['d0726-C']['51']['content_responsiveness'] == 2
def load_metrics(scores_jsonl: str, summarizer_type: str) -> List[MetricsDict]:
    metrics_list = []
    with JsonlReader(scores_jsonl, Metrics) as f:
        for metrics in f:
            if summarizer_type == 'all' or metrics.summarizer_type == summarizer_type:
                metrics_list.append(metrics.metrics)
    return metrics_list
def test_system_level(self):
    summary_level_metrics = JsonlReader(_metrics_file_path, Metrics).read()
    summary_level_metrics = self._convert_to_dicts(summary_level_metrics)

    # Check a few metrics to make sure they are equal to what's in the NIST files
    # Pyramid/DUC2005/processed_pans.txt
    assert summary_level_metrics['d311']['14']['pyramid_score'] == pytest.approx(0.5521, 1e-2)
    assert summary_level_metrics['d345']['31']['pyramid_score'] == 0.0
    assert summary_level_metrics['d400']['A']['pyramid_score'] == pytest.approx(0.5094, 1e-2)

    assert summary_level_metrics['d311']['14']['modified_pyramid_score'] == pytest.approx(0.3869, 1e-2)
    assert summary_level_metrics['d345']['31']['modified_pyramid_score'] == 0.0
    assert summary_level_metrics['d400']['A']['modified_pyramid_score'] == pytest.approx(0.4576, 1e-2)

    assert summary_level_metrics['d311']['14']['num_scus'] == 10
    assert summary_level_metrics['d345']['31']['num_scus'] == 0
    assert summary_level_metrics['d400']['A']['num_scus'] == 23

    assert summary_level_metrics['d311']['14']['num_repetitions'] == 7
    assert summary_level_metrics['d345']['31']['num_repetitions'] == 0
    assert summary_level_metrics['d400']['A']['num_repetitions'] == 7

    assert summary_level_metrics['d311']['14']['responsiveness'] == 3
    assert summary_level_metrics['d345']['31']['responsiveness'] == 1
    assert summary_level_metrics['d400']['A']['responsiveness'] == 5
def test_system_level(self):
    summary_level_metrics = JsonlReader(_metrics_file_path, Metrics).read()
    summary_level_metrics = self._convert_to_dicts(summary_level_metrics)

    # Check a few metrics to make sure they are equal to what's in the NIST files
    # DUC2006pyramiddata/scoring/2006_modified_scores.txt
    assert summary_level_metrics['d0603']['32']['modified_pyramid_score'] == pytest.approx(0.1048, 1e-2)
    assert summary_level_metrics['d0616']['10']['modified_pyramid_score'] == pytest.approx(0.3103, 1e-2)
    assert summary_level_metrics['d0628']['6']['modified_pyramid_score'] == pytest.approx(0.0769, 1e-2)

    assert summary_level_metrics['d0603']['32']['num_scus'] == 6
    assert summary_level_metrics['d0616']['10']['num_scus'] == 7
    assert summary_level_metrics['d0628']['6']['num_scus'] == 2

    assert summary_level_metrics['d0603']['32']['num_repetitions'] == 0
    assert summary_level_metrics['d0616']['10']['num_repetitions'] == 0
    assert summary_level_metrics['d0628']['6']['num_repetitions'] == 0

    assert summary_level_metrics['d0603']['32']['content_responsiveness'] == 3
    assert summary_level_metrics['d0616']['10']['content_responsiveness'] == 4
    assert summary_level_metrics['d0628']['6']['content_responsiveness'] == 2
def test_rouge(self):
    # Test the first several instances in the TAC 2008 data to ensure that
    # our computation of ROUGE matches the values released by NIST
    instances = ReferenceBasedDatasetReader().read(_summaries_file_path)
    metrics_list = JsonlReader(_metrics_file_path, Metrics).read()

    metric_names = ['rouge-1', 'rouge-2', 'rouge-3', 'rouge-4',
                    'rouge-l', 'rouge-su4', 'rouge-w-1.2']
    rouge = Rouge(max_ngram=4,
                  use_porter_stemmer=True,
                  remove_stopwords=False,
                  compute_rouge_l=True,
                  skip_bigram_gap_length=4,
                  wlcs_weight=1.2)

    peer_instances, peer_metrics = self._filter_by_type(instances, metrics_list, 'peer')
    reference_instances, reference_metrics = self._filter_by_type(instances, metrics_list, 'reference')

    num_to_check = 25
    actual_metrics_dicts = score_instances(peer_instances[:num_to_check], [rouge])
    for expected_metrics in peer_metrics[:num_to_check]:
        instance_id = expected_metrics.instance_id
        summarizer_id = expected_metrics.summarizer_id
        actual_metrics = actual_metrics_dicts[instance_id][summarizer_id]
        for metric in metric_names:
            assert actual_metrics.metrics[metric] == expected_metrics.metrics[metric]
            assert actual_metrics.metrics[metric + '_jk'] == expected_metrics.metrics[metric + '_jk']

    actual_metrics_dicts = score_instances(reference_instances[:num_to_check], [rouge])
    for expected_metrics in reference_metrics[:num_to_check]:
        instance_id = expected_metrics.instance_id
        summarizer_id = expected_metrics.summarizer_id
        actual_metrics = actual_metrics_dicts[instance_id][summarizer_id]
        for metric in metric_names:
            assert actual_metrics.metrics[metric + '_jk'] == expected_metrics.metrics[metric + '_jk']
def load_peers(file_path: str):
    instance_to_peers = defaultdict(dict)
    with JsonlReader(file_path) as f:
        for instance in f:
            instance_to_peers[instance['instance_id']][instance['summarizer_id']] = instance
    return instance_to_peers
def load_questions(file_path: str):
    questions = {}
    for instance in JsonlReader(file_path).read():
        for reference in instance['references']:
            for question in reference['questions']:
                questions[question['question_id']] = question
    return questions