def main(args):
    """Convert highlight-prompt instances from a JSONL file into an MTurk input CSV.

    Instances belonging to the same summary (same ``instance_id`` and
    ``annotator``) are grouped together, sorted by highlight offset, and
    batched into assignments of ``args.num_prompts_per_assignment`` prompts.
    Each CSV row carries the batched prompt ids plus the summary text and
    highlight offsets for the HIT template.
    """
    dirname = os.path.dirname(args.output_csv)
    if dirname:
        os.makedirs(dirname, exist_ok=True)

    instances = JsonlReader(args.input_jsonl).read()

    # newline='' is required when handing a file to csv.writer; without it
    # the '\r\n' row terminators are translated again on Windows, producing
    # blank lines between rows (see the csv module documentation).
    with open(args.output_csv, 'w', newline='') as out:
        writer = csv.writer(out)
        writer.writerow(['prompt_ids', 'instancejson'])

        # Group instances by (instance_id, annotator), which should group
        # prompts together by the same summary. groupby only merges adjacent
        # items, so the list must be pre-sorted by the same key.
        def key(instance):
            return (instance['instance_id'], instance['annotator'])

        instances.sort(key=key)
        for _, group in itertools.groupby(instances, key):
            # Sort the group by offset so prompts appear in reading order
            group = list(group)
            group.sort(key=lambda member: member['answer_start'])

            prompt_ids = []
            current = None
            for instance in group:
                if current is None:
                    # Start a fresh assignment for this summary
                    prompt_ids = []
                    current = {'summary': instance['context'], 'offsets': []}
                prompt_ids.append(instance['prompt_id'])
                current['offsets'].append({
                    'foregroundStart': instance['sent_start'],
                    'foregroundEnd': instance['sent_end'],
                    'highlightStart': instance['answer_start'],
                    'highlightEnd': instance['answer_end'],
                })
                # Flush a full assignment's worth of prompts to the CSV
                if len(prompt_ids) == args.num_prompts_per_assignment:
                    writerow(writer, prompt_ids, current)
                    prompt_ids = []
                    current = None

            # Write any leftover prompts as a final, partially-filled assignment
            if prompt_ids:
                writerow(writer, prompt_ids, current)
def main(args):
    """Convert question instances from a JSONL file into an MTurk input CSV.

    Instances belonging to the same summary (same ``instance_id`` and
    ``summarizer_id``) are grouped together; each summary's reference
    questions are batched into assignments of
    ``args.num_questions_per_assignment`` questions. Each CSV row carries
    ids that uniquely identify the batched questions plus the summary text
    and question strings for the HIT template.
    """
    dirname = os.path.dirname(args.output_csv)
    if dirname:
        os.makedirs(dirname, exist_ok=True)

    instances = JsonlReader(args.input_jsonl).read()

    # newline='' is required when handing a file to csv.writer; without it
    # the '\r\n' row terminators are translated again on Windows, producing
    # blank lines between rows (see the csv module documentation).
    with open(args.output_csv, 'w', newline='') as out:
        writer = csv.writer(out)
        writer.writerow(['input_ids', 'instancejson'])

        # Group instances by (instance_id, summarizer_id), which should group
        # prompts together by the same summary. groupby only merges adjacent
        # items, so the list must be pre-sorted by the same key.
        def key(instance):
            return (instance['instance_id'], instance['summarizer_id'])

        instances.sort(key=key)
        for _, group in itertools.groupby(instances, key):
            for instance in group:
                summary = instance['summary']['text']
                input_ids = []
                current = None
                for reference in instance['references']:
                    for question_dict in reference['questions']:
                        if current is None:
                            # Start a fresh assignment for this summary
                            input_ids = []
                            current = {
                                'summary': summary,
                                'questions': []
                            }
                        # This tuple needs to uniquely identify both the
                        # question and the summary
                        input_ids.append((
                            instance['instance_id'],
                            instance['summarizer_id'],
                            question_dict['question_id'],
                        ))
                        current['questions'].append(question_dict['question'])
                        # Flush a full assignment's worth of questions
                        if len(input_ids) == args.num_questions_per_assignment:
                            writerow(writer, input_ids, current)
                            input_ids = []
                            current = None

                # Write any leftover questions as a final, partial assignment
                if input_ids:
                    writerow(writer, input_ids, current)
def test_evaluate(self):
    """Check SIMetrix.evaluate against precomputed system-level metrics.

    Instances are grouped per summarizer, evaluated as a batch, and each
    resulting metric is compared with the stored ``Avg``-prefixed value.
    """
    instances = JsonlReader(_instances_file_path).read()
    simetrix = SIMetrix()

    # Map each summarizer id to its expected (precomputed) metrics record
    expected_by_summarizer = {
        record['summarizer_id']: record
        for record in JsonlReader(_system_metrics_file_path).read()
    }

    def by_summarizer(instance):
        return instance['summarizer_id']

    # groupby requires the list to be sorted by the same key
    instances.sort(key=by_summarizer)
    for summarizer_id, members in itertools.groupby(instances,
                                                    key=by_summarizer):
        members = list(members)
        summaries = [m['summary'] for m in members]
        documents_list = [m['documents'] for m in members]

        actual_metrics, _ = simetrix.evaluate(summaries, documents_list)
        expected_metrics = expected_by_summarizer[summarizer_id]['metrics']

        assert len(actual_metrics) == len(expected_metrics)
        for name, value in actual_metrics.items():
            assert value == expected_metrics['Avg' + name]