    def test_default_features_dont_modify_condgen(self):

        condgen_processor = get_processor(TaskType.conditional_generation.value)
        mt_processor = get_processor(TaskType.machine_translation.value)

        condgen_features_1 = condgen_processor.default_features()
        mt_features = mt_processor.default_features()
        condgen_features_2 = condgen_processor.default_features()

        # MT features didn't change condgen features
        self.assertDictEqual(condgen_features_1, condgen_features_2)
        # condgen features are a subset of MT features
        self.assertDictEqual(mt_features, {**mt_features, **condgen_features_1})

    def test_no_user_defined_features(self):
        dataset = os.path.join(self.artifact_path, "no_custom_feature.json")
        loader = get_custom_dataset_loader(
            TaskType.kg_link_tail_prediction,
            dataset,
            dataset,
            Source.local_filesystem,
            Source.local_filesystem,
            FileType.json,
            FileType.json,
        )
        data = loader.load()
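        # the dataset declares no custom features, so the loaded metadata is empty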
        self.assertEqual(data.metadata, FileLoaderMetadata())

        metadata = {
            "task_name": TaskType.kg_link_tail_prediction.value,
            "dataset_name": "fb15k-237-subset",
            "metric_configs": [HitsConfig(name='Hits4', hits_k=4)],
        }

        processor = get_processor(TaskType.kg_link_tail_prediction.value)

        sys_info = processor.process(metadata, data.samples)

        self.assertIsNotNone(sys_info.results.fine_grained)
        self.assertGreater(len(sys_info.results.overall), 0)

    def test_custom_features(self):
        loader = get_custom_dataset_loader(
            TaskType.machine_translation,
            self.tsv_dataset,
            self.json_output_with_features,
            Source.local_filesystem,
            Source.local_filesystem,
            FileType.tsv,
            FileType.json,
        )
        data = loader.load()
        self.assertEqual(len(data), 4)
        self.assertEqual(
            data[0],
            {
                'source': 'Ak sa chcete dostať ešte hlbšie, môžete si všimnúť '
                + 'trhlinky.',
                'reference': 'Now just to get really deep in , you can really get to '
                + 'the cracks .',
                'id': '0',
                'hypothesis': 'If you want to get a deeper , you can see the forces .',
                'num_capital_letters': 1,
            },
        )

        processor = get_processor(TaskType.machine_translation.value)

        sys_info = processor.process(dataclasses.asdict(data.metadata), data.samples)
        self.assertTrue('num_capital_letters' in sys_info.results.fine_grained)

    def test_e2e(self):
        loader = get_custom_dataset_loader(
            TaskType.aspect_based_sentiment_classification,
            self.tsv_dataset,
            self.txt_output,
            Source.local_filesystem,
            Source.in_memory,
            FileType.tsv,
            FileType.text,
        )
        data = loader.load()
        self.assertEqual(len(data), 100)
        self.assertEqual(
            data[0],
            {
                'aspect': 'Boot time',
                'text': 'Boot time  is super fast, around anywhere from 35 seconds to '
                + '1 minute.',
                'true_label': 'positive',
                'id': '0',
                'predicted_label': 'positive',
            },
        )

        metadata = {
            "task_name": TaskType.aspect_based_sentiment_classification,
            "metric_names": ["Accuracy", "F1Score"],
        }
        processor = get_processor(TaskType.aspect_based_sentiment_classification)

        sys_info = processor.process(metadata, data)

        self.assertIsNotNone(sys_info.results.fine_grained)
        self.assertGreater(len(sys_info.results.overall), 0)

    def test_generate_system_analysis(self):
        loader = get_custom_dataset_loader(
            TaskType.machine_translation,
            self.tsv_dataset,
            self.txt_output,
            Source.local_filesystem,
            Source.local_filesystem,
            FileType.tsv,
            FileType.text,
        )
        data = loader.load()

        metadata = {
            "task_name": TaskType.machine_translation.value,
            "dataset_name": "ted_multi",
            "metric_names": ["bleu"],
        }

        processor = get_processor(TaskType.machine_translation.value)

        sys_info = processor.process(metadata, data)

        # analysis.write_to_directory("./")
        self.assertIsNotNone(sys_info.results.fine_grained)
        self.assertGreater(len(sys_info.results.overall), 0)
Example #6
    def test_generate_system_analysis(self):
        loader = get_custom_dataset_loader(
            TaskType.named_entity_recognition,
            self.conll_dataset,
            self.conll_output,
            Source.local_filesystem,
            Source.local_filesystem,
            FileType.conll,
            FileType.conll,
        )
        data = loader.load()

        metadata = {
            "task_name": TaskType.named_entity_recognition.value,
            # "dataset_name": "conll2003",
            # "sub_dataset_name":"ner",
            "metric_names": ["F1Score"],
        }
        processor = get_processor(TaskType.named_entity_recognition)
        sys_info = processor.process(metadata, data)

        self.assertIsNotNone(sys_info.results.fine_grained)
        self.assertGreater(len(sys_info.results.overall), 0)

        # ------ Deep Test --------

        # test: training set dependent features should be disabled when
        # training dataset is not provided
        activate_features = sys_info.results.fine_grained.keys()
        self.assertTrue("span_econ" not in activate_features
                        and "span_efre" not in activate_features)

    def test_multiple_qa_customized_feature(self):
        dataset_path = os.path.join(self.artifact_path, "dataset_fig_qa.json")
        output_path = os.path.join(self.artifact_path,
                                   "output_fig_qa_customized_features.json")
        loader = get_custom_dataset_loader(
            TaskType.qa_multiple_choice,
            dataset_path,
            output_path,
            Source.local_filesystem,
            Source.local_filesystem,
            FileType.json,
            FileType.json,
        )
        data = loader.load()
        self.assertIsInstance(data.samples[0]["commonsense_category"], list)
        self.assertEqual(data.samples[0]["commonsense_category"],
                         ["obj", "cul"])

        metadata = {
            "task_name": TaskType.qa_multiple_choice.value,
            "dataset_name": "fig_qa",
            "metric_names": ["Accuracy"],
            # don't forget this, otherwise the user-defined features will be ignored
            "user_defined_features_configs": data.metadata.custom_features,
        }

        processor = get_processor(TaskType.qa_multiple_choice.value)

        sys_info = processor.process(metadata, data.samples)

        self.assertIsNotNone(sys_info.results.fine_grained)
        self.assertGreater(len(sys_info.results.overall), 0)
Example #8
    def test_sort_buckets_by_value(self):
        loader = get_custom_dataset_loader(
            TaskType.kg_link_tail_prediction,
            self.test_data,
            self.dataset_no_custom_feature,
        )
        data = loader.load()
        self.assertEqual(data.metadata, FileLoaderMetadata())

        metadata = {
            "task_name": TaskType.kg_link_tail_prediction.value,
            "dataset_name": "fb15k-237",
            "metric_configs": [
                HitsConfig(name='Hits4', hits_k=4),
                MeanReciprocalRankConfig(name='MRR'),
                MeanRankConfig(name='MR'),
            ],
            "sort_by": "performance_value",
            "sort_by_metric": "first",
        }

        processor = get_processor(TaskType.kg_link_tail_prediction.value)
        sys_info = processor.process(metadata, data.samples)

        self.assertIsNotNone(sys_info.results.fine_grained)
        self.assertGreater(len(sys_info.results.overall), 0)

        symmetry_performances = sys_info.results.fine_grained['symmetry']
        if len(symmetry_performances) <= 1:  # can't sort if only 1 item
            return
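        # buckets should come back sorted in descending order of the first
        # metric's value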
        for i in range(len(symmetry_performances) - 1):
            first_item = symmetry_performances[i].performances[0].value
            second_item = symmetry_performances[i + 1].performances[0].value
            self.assertGreater(first_item, second_item)
Example #9
    def test_generate_system_analysis(self):
        loader = get_custom_dataset_loader(
            TaskType.word_segmentation,
            self.conll_dataset,
            self.conll_output,
            Source.local_filesystem,
            Source.local_filesystem,
            FileType.conll,
            FileType.conll,
        )
        data = loader.load()

        metadata = {
            "task_name": TaskType.word_segmentation.value,
            # "dataset_name": "conll2003",
            # "sub_dataset_name":"ner",
            "metric_names": ["F1Score"],
        }

        processor = get_processor(TaskType.word_segmentation)

        sys_info = processor.process(metadata, data)

        self.assertIsNotNone(sys_info.results.fine_grained)
        self.assertGreater(len(sys_info.results.overall), 0)
Example #10
    def test_no_user_defined_features(self):
        loader = get_custom_dataset_loader(
            TaskType.kg_link_tail_prediction,
            self.test_data,
            self.dataset_no_custom_feature,
            dataset_file_type=FileType.json,
            output_file_type=FileType.json,
        )
        data = loader.load()
        self.assertEqual(data.metadata, FileLoaderMetadata())

        metadata = {
            "task_name": TaskType.kg_link_tail_prediction.value,
            "dataset_name": "fb15k-237-subset",
            "metric_configs": [
                HitsConfig(name='Hits4', hits_k=4),  # you can modify k here
                MeanReciprocalRankConfig(name='MRR'),
                MeanRankConfig(name='MR'),
            ],
        }

        processor = get_processor(TaskType.kg_link_tail_prediction.value)

        sys_info = processor.process(metadata, data.samples)

        self.assertIsNotNone(sys_info.results.fine_grained)
        self.assertGreater(len(sys_info.results.overall), 0)
Example #11
    def test_extractive_qa_en(self):
        json_en_dataset = os.path.join(self.artifact_path,
                                       "dataset-xquad-en.json")
        json_en_output = os.path.join(self.artifact_path,
                                      "output-xquad-en.json")
        loader = get_custom_dataset_loader(
            TaskType.qa_extractive,
            json_en_dataset,
            json_en_output,
            Source.local_filesystem,
            Source.local_filesystem,
            FileType.json,
            FileType.json,
        )
        data = loader.load()
        self.assertEqual(len(data), 1190)
        sample = data[0]
        self.assertEqual(sample["predicted_answers"], {"text": "308"})
        self.assertEqual(sample["id"], "0")
        self.assertEqual(sample["answers"], {
            "answer_start": [-1],
            "text": ["308"]
        })
        self.assertEqual(
            sample["question"],
            "How many points did the Panthers defense surrender ?")
        self.assertTrue(sample["context"].startswith("The Panthers"))

        metadata = {
            "task_name": TaskType.qa_extractive,
            "dataset_name": "squad",
            "metric_names": ["F1ScoreQA", "ExactMatchQA"],
            # "language":"en"
        }

        processor = get_processor(TaskType.qa_extractive)
        sys_info = processor.process(metadata, data)

        # analysis.write_to_directory("./")
        self.assertIsNotNone(sys_info.results.fine_grained)
        self.assertGreater(len(sys_info.results.overall), 0)
        get_logger('test').info(f'OVERALL={sys_info.results.overall}')
        # should be 0.6974789915966386
        self.assertAlmostEqual(
            sys_info.results.overall["ExactMatch"].value,
            0.6974789915966386,
            2,
            "almost equal",
        )
        # should be 0.8235975260931867
        self.assertAlmostEqual(
            sys_info.results.overall["F1"].value,
            0.8235975260931867,
            2,
            "almost equal",
        )
Example #12
    def test_readme_custom_dataset(self):
        dataset = f"{top_path}/explainaboard/tests/artifacts/summarization/dataset.tsv"
        output = f"{top_path}/explainaboard/tests/artifacts/summarization/output.txt"
        loader = get_custom_dataset_loader(TaskType.summarization,
                                           dataset_data=dataset,
                                           output_data=output)
        data = loader.load()
        processor = get_processor(TaskType.summarization)
        analysis = processor.process(metadata={}, sys_output=data)
        analysis.write_to_directory("./")
Example #13
    def test_simple_example(self):
        # Load the data
        dataset = self.dataset_no_custom_feature
        task = TaskType.kg_link_tail_prediction
        loader = get_custom_dataset_loader(task, dataset, dataset)
        data = loader.load()
        # Initialize the processor and perform the processing
        processor = get_processor(TaskType.kg_link_tail_prediction.value)
        sys_info = processor.process(metadata={}, sys_output=data.samples)
        # If you want to write out to disk you can use
        sys_info.write_to_directory('./')

    def test_batch_processing(self):
        sys_out_dir = os.path.join(self.artifact_path, "CL-mt5base", "xnli")

        datasets = [
            os.path.join(sys_out_dir, "datasets", file)
            for file in os.listdir(os.path.join(sys_out_dir, "datasets"))
        ]

        outputs = [
            os.path.join(sys_out_dir, "outputs", file)
            for file in os.listdir(os.path.join(sys_out_dir, "outputs"))
        ]

        file_type = FileType.json
        task_dummy = TaskType.text_classification
        tasks = []
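        # first pass: read the task name for each system from the metadata
        # embedded in its output file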
        for dataset, output in zip(datasets, outputs):
            loader = get_custom_dataset_loader(
                task_dummy,
                dataset,
                output,
                dataset_file_type=file_type,
                output_file_type=file_type,
            )
            if not loader.user_defined_metadata_configs:
                raise ValueError(
                    f"user_defined_metadata_configs in system output {output} hasn't "
                    "been specified or task name should be specified")
            tasks.append(loader.user_defined_metadata_configs['task_name'])

        # Get loaders using real `task` and `file_type`
        loaders = [
            get_custom_dataset_loader(
                task,
                dataset,
                output,
                dataset_file_type=file_type,
                output_file_type=file_type,
            ) for dataset, output, task in zip(datasets, outputs, tasks)
        ]
        system_outputs = [loader.load() for loader in loaders]

        # Run analysis
        reports = []
        metadata = {}
        for loader, system_output, task in zip(loaders, system_outputs, tasks):

            metadata.update(loader.user_defined_metadata_configs)

            report = get_processor(task).process(metadata=metadata,
                                                 sys_output=system_output)
            reports.append(report)

        self.assertEqual(len(reports), 2)
Example #15
    def test_readme_datalab_dataset(self):
        loader = get_datalab_loader(
            TaskType.text_classification,
            dataset=DatalabLoaderOption("sst2"),
            output_data=f"{top_path}/explainaboard/tests/artifacts/"
            "text_classification/output_sst2.txt",
            output_source=Source.local_filesystem,
            output_file_type=FileType.text,
        )
        data = loader.load()
        processor = get_processor(TaskType.text_classification)
        analysis = processor.process(metadata={}, sys_output=data)
        analysis.write_to_directory("./")
Example #16
    def test_customized_metadata1(self):
        loader = get_datalab_loader(
            TaskType.named_entity_recognition,
            dataset=DatalabLoaderOption("conll2003", "ner"),
            output_data=self.json_output_customized,
            output_source=Source.local_filesystem,
            output_file_type=FileType.json,
        )
        data = loader.load()
        metadata = dataclasses.asdict(data.metadata)
        metadata.update({
            "task_name": TaskType.named_entity_recognition.value,
        })
        processor = get_processor(TaskType.named_entity_recognition)
        sys_info = processor.process(metadata, data.samples)
        self.assertIsNotNone(sys_info.results.fine_grained)
        self.assertGreater(len(sys_info.results.overall), 0)
Example #17
    def test_extractive_qa_zh(self):
        json_zh_dataset = os.path.join(self.artifact_path,
                                       "dataset-xquad-zh.json")
        json_zh_output = os.path.join(self.artifact_path,
                                      "output-xquad-zh.json")
        loader = get_custom_dataset_loader(
            TaskType.qa_extractive,
            json_zh_dataset,
            json_zh_output,
            Source.local_filesystem,
            Source.local_filesystem,
            FileType.json,
            FileType.json,
        )
        data = loader.load()
        metadata = {
            "task_name": TaskType.qa_extractive.value,
            "dataset_name": "squad",
            "metric_names": ["F1Score", "ExactMatch"],
            "source_language": "zh",
            "target_language": "zh",
        }

        processor = get_processor(TaskType.qa_extractive)

        sys_info = processor.process(metadata, data)
        get_logger('test').info(
            f'--------- sys_info.metric_configs {sys_info.metric_configs}')

        # analysis.write_to_directory("./")
        self.assertIsNotNone(sys_info.results.fine_grained)
        self.assertGreater(len(sys_info.results.overall), 0)
        # 0.6285714285714286
        self.assertAlmostEqual(
            sys_info.results.overall["ExactMatch"].value,
            0.6285714285714286,
            2,
            "almost equal",
        )
        # 0.7559651817716333
        self.assertAlmostEqual(
            sys_info.results.overall["F1"].value,
            0.7559651817716333,
            2,
            "almost equal",
        )
Example #18
    def test_qa_metrics(self):
        json_en_dataset = os.path.join(
            test_artifacts_path, "extractive_qa", "dataset-xquad-en.json"
        )
        json_en_output = os.path.join(
            test_artifacts_path, "extractive_qa", "output-xquad-en.json"
        )
        loader = get_custom_dataset_loader(
            TaskType.qa_extractive,
            json_en_dataset,
            json_en_output,
            Source.local_filesystem,
            Source.local_filesystem,
            FileType.json,
            FileType.json,
        )
        data = loader.load()

        metadata = {
            "task_name": TaskType.qa_extractive.value,
            "dataset_name": "squad",
            "metric_names": ["F1ScoreQA", "ExactMatchQA"],
        }

        processor = get_processor(TaskType.qa_extractive)

        sys_info = processor.process(metadata, data)

        # analysis.write_to_directory("./")
        self.assertIsNotNone(sys_info.results.fine_grained)
        self.assertGreater(len(sys_info.results.overall), 0)
        self.assertAlmostEqual(
            sys_info.results.overall["ExactMatch"].value,
            0.6974789915966386,
            2,
            "almost equal",
        )
        # should be 0.8235975260931867
        self.assertAlmostEqual(
            sys_info.results.overall["F1"].value,
            0.8235975260931867,
            2,
            "almost equal",
        )
Example #19
    def test_datalab_loader(self):
        loader = get_datalab_loader(
            TaskType.cloze_generative,
            dataset=DatalabLoaderOption("gaokao2018_np1", "cloze-hint"),
            output_data=self.json_output,
            output_source=Source.local_filesystem,
            output_file_type=FileType.json,
        )
        data = loader.load()

        metadata = {
            "task_name": TaskType.cloze_generative.value,
            "dataset_name": "gaokao2018_np1",
            "sub_dataset_name": "cloze-hint",
            "metric_names": ["CorrectCount"],
        }
        processor = get_processor(TaskType.cloze_generative.value)
        sys_info = processor.process(metadata, data)
        self.assertIsNotNone(sys_info.results.fine_grained)
Example #20
    def test_process_metadata_in_output_file(self):
        loader = get_custom_dataset_loader(
            TaskType.text_classification,
            self.json_dataset,
            self.json_output,
            Source.local_filesystem,
            Source.local_filesystem,
            FileType.json,
            FileType.json,
        )
        data = loader.load()
        self.assertNotEqual(data.metadata, FileLoaderMetadata())
        metadata = dataclasses.asdict(data.metadata)
        processor = get_processor(TaskType.text_classification)

        sys_info = processor.process(metadata, data.samples)

        self.assertIsNotNone(sys_info.results.fine_grained)
        self.assertGreater(len(sys_info.results.overall), 0)
Example #21
    def test_process(self):
        metadata = {
            "task_name": TaskType.text_classification,
            "metric_names": ["Accuracy", "F1Score"],
        }
        loader = get_custom_dataset_loader(
            TaskType.text_classification,
            load_file_as_str(self.tsv_dataset),
            load_file_as_str(self.txt_output),
            Source.in_memory,
            Source.in_memory,
            FileType.tsv,
            FileType.text,
        )
        data = loader.load()
        processor = get_processor(TaskType.text_classification)
        sys_info = processor.process(metadata, data)

        self.assertIsNotNone(sys_info.results.fine_grained)
        self.assertGreater(len(sys_info.results.overall), 0)
Example #22
def get_customized_results(dataset, customized_features):
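    """Run the KG link tail prediction processor on `dataset` and return the
    fine-grained results for the features named in `customized_features`."""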

    customized_features_performance = {}

    task = TaskType.kg_link_tail_prediction
    loader = get_custom_dataset_loader(task, dataset, dataset)
    data = loader.load()
    # Initialize the processor and perform the processing
    processor = get_processor(TaskType.kg_link_tail_prediction.value)
    metadata = {
        "task_name": TaskType.kg_link_tail_prediction.value,
        "custom_features": data.metadata.custom_features,
    }
    print(metadata)
    sys_info = processor.process(metadata=metadata, sys_output=data.samples)

    # print bucket information
    processor.print_bucket_info(sys_info.results.fine_grained)  # type: ignore

    # get overall results of different metrics
    for metric_name, metric_info in sys_info.results.overall.items(
    ):  # type: ignore

        metric_name = metric_info.metric_name
        value = metric_info.value
        confidence_score_low = metric_info.confidence_score_low
        confidence_score_high = metric_info.confidence_score_high

        print(f"metric_name:{metric_name}\n"
              f"value:{value }\n"
              f"confidence_score_low:{confidence_score_low}\n"
              f"confidence_score_high:{confidence_score_high}\n")

    # get fine-grained results
    for (
            feature_name,
            feature_info,
    ) in sys_info.results.fine_grained.items():  # type: ignore
        if feature_name in customized_features:
            customized_features_performance[feature_name] = feature_info
    return customized_features_performance
Example #23
    def test_snli(self):

        metadata = {
            "task_name": TaskType.text_classification.value,
            "metric_names": ["Accuracy"],
        }
        loader = get_custom_dataset_loader(
            TaskType.text_pair_classification,
            self.tsv_dataset,
            self.txt_output,
            Source.local_filesystem,
            Source.local_filesystem,
            FileType.tsv,
            FileType.text,
        )
        data = loader.load()
        processor = get_processor(TaskType.text_pair_classification)

        sys_info = processor.process(metadata, data)

        self.assertIsNotNone(sys_info.results.fine_grained)
        self.assertGreater(len(sys_info.results.overall), 0)

    def test_generate_system_analysis(self):
        loader = get_custom_dataset_loader(
            TaskType.qa_multiple_choice,
            self.json_dataset,
            self.json_output,
            Source.local_filesystem,
            Source.local_filesystem,
            FileType.json,
            FileType.json,
        )
        data = loader.load()
        metadata = {
            "task_name": TaskType.qa_multiple_choice.value,
            "dataset_name": "fig_qa",
            "metric_names": ["Accuracy"],
        }

        processor = get_processor(TaskType.qa_multiple_choice.value)
        sys_info = processor.process(metadata, data)

        self.assertIsNotNone(sys_info.results.fine_grained)
        self.assertGreater(len(sys_info.results.overall), 0)
Example #25
    def test_generate_system_analysis(self):
        path_system_details = os.path.join(
            test_artifacts_path, "test_system_details.json"
        )
        dataset_data = os.path.join(
            test_artifacts_path, "text_classification", "dataset.tsv"
        )
        output_data = os.path.join(
            test_artifacts_path, "text_classification", "output.txt"
        )

        with open(path_system_details) as fin:
            system_details = json.load(fin)

        metadata = {
            "task_name": TaskType.text_classification,
            "metric_names": ["Accuracy"],
            "system_details": system_details,
        }

        loader = get_custom_dataset_loader(
            TaskType.text_classification,
            dataset_data,
            output_data,
            Source.local_filesystem,
            Source.local_filesystem,
            FileType.tsv,
            FileType.text,
        )
        data = loader.load()
        processor = get_processor(TaskType.text_classification)

        sys_info = processor.process(metadata, data)

        # analysis.write_to_directory("./")
        self.assertEqual(
            sys_info.system_details, {"learning_rate": 0.0001, "number_of_layers": 10}
        )
Example #26
    def test_process_training_set_dependent_features(self):
        metadata = {
            "task_name": TaskType.text_classification.value,
            "metric_names": ["Accuracy", "F1Score"],
            "dataset_name": "ag_news",
            "reload_stat": False,
        }
        loader = get_custom_dataset_loader(
            TaskType.text_classification,
            self.json_dataset,
            self.json_output,
            Source.local_filesystem,
            Source.local_filesystem,
            FileType.json,
            FileType.json,
        )
        data = loader.load()

        processor = get_processor(TaskType.text_classification)
        sys_info = processor.process(metadata, data)

        self.assertIsNotNone(sys_info.results.fine_grained)
        self.assertGreater(len(sys_info.results.overall), 0)
Example #27
from explainaboard import get_custom_dataset_loader, get_processor, TaskType

# This code details (1) how to evaluate your systems using ExplainaBoard
# programmatically (2)how to collect different results
# Load the data
from explainaboard.utils.typing_utils import unwrap

dataset = (
    "../../explainaboard/tests/artifacts/kg_link_tail_prediction/no_custom_feature.json"
)
task = TaskType.kg_link_tail_prediction
loader = get_custom_dataset_loader(task, dataset, dataset)
data = loader.load()
# Initialize the processor and perform the processing
processor = get_processor(TaskType.kg_link_tail_prediction.value)
sys_info = processor.process(metadata={}, sys_output=data.samples)

fine_grained_res = unwrap(sys_info.results.fine_grained)
overall_res = unwrap(sys_info.results.overall)

# print bucket information
processor.print_bucket_info(fine_grained_res)

# save analysis report locally
sys_info.print_as_json(file=open("./report.json", 'w'))

# get overall results of different metrics
for metric_name, metric_info in unwrap(sys_info.results.overall).items():

    value = metric_info.value
    confidence_score_low = metric_info.confidence_score_low
    confidence_score_high = metric_info.confidence_score_high

    print(f"metric_name:{metric_name}\n"
          f"value:{value}\n"
          f"confidence_score_low:{confidence_score_low}\n"
          f"confidence_score_high:{confidence_score_high}\n")
Example #28
def main():
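    """CLI entry point: load the system outputs, run the per-task analysis,
    and write the resulting reports and figures to `output_dir`."""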
    args = create_parser().parse_args()

    reload_stat: bool = False if args.reload_stat == "0" else True
    system_outputs: list[str] = args.system_outputs

    reports: list[str] | None = args.reports
    metric_names: list[str] | None = args.metrics
    dataset_file_type: str | None = args.custom_dataset_file_type
    output_file_type: str | None = args.output_file_type
    output_dir: str = args.output_dir

    # If reports have been specified, ExplainaBoard cli will perform analysis
    # over report files.
    if args.reports:
        analyze_reports(args)
    else:

        def load_system_details_path():
            if args.system_details:
                try:
                    with open(args.system_details) as fin:
                        return json.load(fin)
                except ValueError as e:
                    raise ValueError(f'invalid json: {e} for system details')

        output_dir_figures = os.path.join(output_dir, "figures")
        output_dir_reports = os.path.join(output_dir, "reports")

        def setup_output_folders():
            """Setup for generated reports and figures"""
            # This part could be generalized
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            if not os.path.exists(output_dir_figures):
                os.makedirs(output_dir_figures)
            if not os.path.exists(output_dir_reports):
                os.makedirs(output_dir_reports)

        system_details: dict | None = load_system_details_path()
        setup_output_folders()

        # check for benchmark submission: explainaboard  --system_outputs ./data/
        # system_outputs/sst2/user_specified_metadata.json
        num_systems = len(system_outputs)
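        # apply the single user-specified file type to every system output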
        dataset_file_types: list[str
                                 | None] = [dataset_file_type] * num_systems
        output_file_types: list[str | None] = [output_file_type] * num_systems
        custom_dataset_paths: list[str] | None = args.custom_dataset_paths
        dataset: str | None = args.dataset
        sub_dataset: str | None = args.sub_dataset
        split: str = args.split
        target_language: str = args.target_language
        source_language: str = args.source_language or target_language
        tasks = get_tasks(args.task, system_outputs)

        # Some loaders need to know the language of the inputs and outputs
        loader_field_mapping = {
            FileLoaderField.SOURCE_LANGUAGE: source_language,
            FileLoaderField.TARGET_LANGUAGE: target_language,
        }
        if custom_dataset_paths:  # load custom datasets
            loaders = [
                get_custom_dataset_loader(
                    task,
                    dataset,
                    output,
                    Source.local_filesystem,
                    Source.local_filesystem,
                    dataset_file_type,
                    output_file_type,
                    field_mapping=loader_field_mapping,
                ) for task, dataset, output, dataset_file_type,
                output_file_type in zip(
                    tasks,
                    custom_dataset_paths,
                    system_outputs,
                    dataset_file_types,
                    output_file_types,
                )
            ]
        else:  # load from datalab
            if not dataset:
                raise ValueError(
                    "neither custom_dataset_paths or dataset is defined")
            loaders = [
                get_datalab_loader(
                    task,
                    DatalabLoaderOption(dataset, sub_dataset, split=split),
                    sys_output,
                    Source.local_filesystem,
                    output_file_type,
                    field_mapping=loader_field_mapping,
                ) for task, sys_output, output_file_type in zip(
                    tasks, system_outputs, output_file_types)
            ]
        system_datasets = [loader.load() for loader in loaders]

        # validation
        if len(system_datasets) == 2:
            if len(system_datasets[0]) != len(system_datasets[1]):
                num0 = len(system_datasets[0])
                num1 = len(system_datasets[1])
                raise ValueError(
                    f'Data must be the same length for pairwise analysis, but the '
                    f'two system outputs have {num0} != {num1} samples'
                )

        # TODO(gneubig): This gets metadata from the first system and assumes it's the
        #  same for other systems
        target_language = (target_language
                           or system_datasets[0].metadata.target_language
                           or 'en')
        source_language = (source_language
                           or system_datasets[0].metadata.source_language
                           or target_language)

        # Setup metadata
        metadata = {
            "dataset_name": dataset,
            "sub_dataset_name": sub_dataset,
            "split_name": split,
            "source_language": source_language,
            "target_language": target_language,
            "reload_stat": reload_stat,
            "conf_value": args.conf_value,
            "system_details": system_details,
            "custom_features": system_datasets[0].metadata.custom_features,
        }

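        # convert metric names from the command line into metric configs for
        # the given source/target languages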
        if metric_names is not None:
            if 'metric_configs' in metadata:
                raise ValueError(
                    'Cannot specify both metric names and metric configs')
            metric_configs = [
                metric_name_to_config(name, source_language, target_language)
                for name in metric_names
            ]
            metadata["metric_configs"] = metric_configs

        # Run analysis
        reports: list[SysOutputInfo] = []
        for loader, system_dataset, system_full_path, task in zip(
                loaders, system_datasets, system_outputs, tasks):

            # metadata.update(loader.user_defined_metadata_configs)
            # metadata[
            #     "user_defined_features_configs"
            # ] = loader.user_defined_features_configs
            metadata["task_name"] = task

            processor = get_processor(task=task)
            report = processor.process(metadata=metadata,
                                       sys_output=system_dataset)
            reports.append(report)

            # print to the console
            get_logger('report').info('--- Overall Performance')
            for metric_stat in report.results.overall.values():
                get_logger('report').info(
                    f'{metric_stat.metric_name}\t{metric_stat.value}')
            get_logger('report').info('')
            get_logger('report').info('--- Bucketed Performance')
            processor.print_bucket_info(report.results.fine_grained)

            # save report to `output_dir_reports`
            x_file_name = os.path.basename(system_full_path).split(".")[0]
            report.write_to_directory(output_dir_reports,
                                      f"{x_file_name}.json")

            # generate figures and save them into  `output_dir_figures`
            if not os.path.exists(f"{output_dir_figures}/{x_file_name}"):
                os.makedirs(f"{output_dir_figures}/{x_file_name}")
            draw_bar_chart_from_reports(
                [f"{output_dir_reports}/{x_file_name}.json"],
                f"{output_dir_figures}/{x_file_name}",
            )

        if args.report_json is not None:
            report_file = open(args.report_json, 'w')
        else:
            report_file = sys.stdout
        if len(system_outputs) == 1:  # individual system analysis
            reports[0].print_as_json(file=report_file)
        elif len(system_outputs) == 2:  # pairwise analysis
            compare_analysis = get_pairwise_performance_gap(
                reports[0], reports[1])
            compare_analysis.print_as_json(file=report_file)
        if args.report_json is not None:
            report_file.close()
Example #29
    def test_datalab_loader(self):
        loader = get_datalab_loader(
            TaskType.named_entity_recognition,
            dataset=DatalabLoaderOption("conll2003", "ner"),
            output_data=self.conll_output_full,
            output_source=Source.local_filesystem,
            output_file_type=FileType.conll,
        )
        data = loader.load()

        metadata = {
            "task_name": TaskType.named_entity_recognition.value,
            "dataset_name": "conll2003",
            "sub_dataset_name": "ner",
            "metric_names": ["F1Score"],
        }
        processor = get_processor(TaskType.named_entity_recognition)
        sys_info = processor.process(metadata, data)

        self.assertIsNotNone(sys_info.results.fine_grained)
        self.assertGreater(len(sys_info.results.overall), 0)

        # ---------------------------------------------------------------------------
        #                               Deep Test
        # ---------------------------------------------------------------------------

        # 1. Unittest: training set dependent features should be included
        # when the training dataset is available (here loaded from DataLab)
        activate_features = sys_info.results.fine_grained.keys()
        self.assertTrue("span_econ" in activate_features
                        and "span_efre" in activate_features)

        # 2. Unittest: test the number of buckets of training dependent features
        n_buckets = len(sys_info.results.fine_grained["span_econ"])
        self.assertEqual(n_buckets, 3)

        # 3. Unittest: test detailed bucket information: bucket interval
        # [0.007462686567164179,0.9565217391304348]
        second_bucket = sys_info.results.fine_grained["span_econ"][1]
        self.assertAlmostEqual(
            second_bucket.bucket_interval[0],
            0.007462686567164179,
            4,
            "almost equal",
        )
        self.assertAlmostEqual(
            second_bucket.bucket_interval[1],
            0.8571428571428571,
            4,
            "almost equal",
        )
        # 4. Unittest: test detailed bucket information: bucket samples
        self.assertEqual(second_bucket.n_samples, 1007)

        # 5. Unittest: test detailed bucket information: metric
        self.assertEqual(second_bucket.performances[0].metric_name, "F1")
        self.assertAlmostEqual(second_bucket.performances[0].value,
                               0.9203805708562846, 4, "almost equal")
        # 6 Unittest: test detailed bucket information: confidence interval
        self.assertGreater(second_bucket.performances[0].confidence_score_low,
                           0)

        # 7. Unittest: test if only fewer cases are printed (this is the expected
        # case, especially for sequence labeling tasks. Otherwise, the analysis report
        # files will be too large.)
        self.assertLess(
            len(second_bucket.bucket_samples),
            second_bucket.n_samples,
        )