Exemplo n.º 1
0
def main(args):
    """Convert a JSONL file of prompt instances into a CSV of assignments.

    Instances are read from ``args.input_jsonl``, grouped by
    ``(instance_id, annotator)`` and ordered by ``answer_start`` inside each
    group. Within a group, prompts are accumulated into a batch; the batch is
    handed to ``writerow`` as soon as it holds
    ``args.num_prompts_per_assignment`` prompts, and any leftover partial
    batch is written once the group is exhausted. Batches never span two
    groups.

    Args:
        args: Parsed arguments exposing ``input_jsonl``, ``output_csv`` and
            ``num_prompts_per_assignment``.
    """
    dirname = os.path.dirname(args.output_csv)
    if dirname:
        os.makedirs(dirname, exist_ok=True)

    instances = JsonlReader(args.input_jsonl).read()

    def group_key(instance):
        # Group instances by (instance_id, annotator), which should group
        # prompts together by the same summary
        return instance['instance_id'], instance['annotator']

    # The csv module requires newline='' on the underlying file; without it
    # the writer's own '\r\n' terminator is translated a second time on
    # platforms whose line separator is not '\n', producing blank rows.
    with open(args.output_csv, 'w', newline='') as out:
        writer = csv.writer(out)
        writer.writerow(['prompt_ids', 'instancejson'])

        # groupby only merges adjacent items, so sort by the same key first.
        instances.sort(key=group_key)
        for _, group in itertools.groupby(instances, group_key):
            # Sort the group by offset
            group = sorted(group, key=lambda member: member['answer_start'])

            prompt_ids = []
            current = None
            for instance in group:
                if current is None:
                    # Start a new batch; its summary text comes from the
                    # first instance placed in it.
                    current = {'summary': instance['context'], 'offsets': []}

                prompt_ids.append(instance['prompt_id'])
                current['offsets'].append({
                    'foregroundStart': instance['sent_start'],
                    'foregroundEnd': instance['sent_end'],
                    'highlightStart': instance['answer_start'],
                    'highlightEnd': instance['answer_end'],
                })

                if len(prompt_ids) == args.num_prompts_per_assignment:
                    writerow(writer, prompt_ids, current)
                    prompt_ids = []
                    current = None

            # Flush the final, possibly smaller, batch of this group.
            if prompt_ids:
                writerow(writer, prompt_ids, current)
def main(args):
    """Convert a JSONL file of summaries with questions into a CSV.

    Instances are read from ``args.input_jsonl`` and sorted by
    ``(instance_id, summarizer_id)``. For every instance, the questions found
    under each entry of ``instance['references']`` are accumulated into a
    batch; the batch is handed to ``writerow`` as soon as it holds
    ``args.num_questions_per_assignment`` questions, and any leftover partial
    batch is written once the instance's references are exhausted. Batches
    never span two instances.

    Args:
        args: Parsed arguments exposing ``input_jsonl``, ``output_csv`` and
            ``num_questions_per_assignment``.
    """
    dirname = os.path.dirname(args.output_csv)
    if dirname:
        os.makedirs(dirname, exist_ok=True)

    instances = JsonlReader(args.input_jsonl).read()

    def group_key(instance):
        # Group instances by (instance_id, summarizer_id), which should group
        # prompts together by the same summary
        return instance['instance_id'], instance['summarizer_id']

    # The csv module requires newline='' on the underlying file; without it
    # the writer's own '\r\n' terminator is translated a second time on
    # platforms whose line separator is not '\n', producing blank rows.
    with open(args.output_csv, 'w', newline='') as out:
        writer = csv.writer(out)
        writer.writerow(['input_ids', 'instancejson'])

        # groupby only merges adjacent items, so sort by the same key first.
        instances.sort(key=group_key)
        for _, group in itertools.groupby(instances, group_key):
            for instance in group:
                summary = instance['summary']['text']
                input_ids = []
                current = None
                for reference in instance['references']:
                    for question_dict in reference['questions']:
                        if current is None:
                            # Start a new batch for this instance's summary.
                            current = {
                                'summary': summary,
                                'questions': [],
                            }

                        # This tuple needs to uniquely identify both the
                        # question and the summary
                        input_ids.append((
                            instance['instance_id'],
                            instance['summarizer_id'],
                            question_dict['question_id'],
                        ))
                        current['questions'].append(question_dict['question'])

                        if len(input_ids) == args.num_questions_per_assignment:
                            writerow(writer, input_ids, current)
                            input_ids = []
                            current = None

                # Flush the final, possibly smaller, batch of this instance.
                if input_ids:
                    writerow(writer, input_ids, current)
Exemplo n.º 3
0
    def test_evaluate(self):
        """Compare SIMetrix results with the stored per-summarizer metrics.

        Instances are grouped by ``summarizer_id``; each group's summaries and
        documents are evaluated together, and every returned metric must equal
        the stored value found under the same name prefixed with ``'Avg'``.
        """
        instances = JsonlReader(_instances_file_path).read()
        simetrix = SIMetrix()

        # Index the expected records by summarizer for direct lookup below.
        expected_by_summarizer = {
            record['summarizer_id']: record
            for record in JsonlReader(_system_metrics_file_path).read()
        }

        def by_summarizer(instance):
            return instance['summarizer_id']

        # groupby only merges adjacent items, so sort by the same key first.
        instances.sort(key=by_summarizer)
        for summarizer_id, members in itertools.groupby(instances,
                                                        key=by_summarizer):
            summaries = []
            documents_list = []
            for member in members:
                summaries.append(member['summary'])
                documents_list.append(member['documents'])

            # Only the first element of the returned pair is checked here.
            actual, _ = simetrix.evaluate(summaries, documents_list)
            expected = expected_by_summarizer[summarizer_id]['metrics']

            assert len(actual) == len(expected)
            for name, value in actual.items():
                assert value == expected['Avg' + name]