def test_default_features_dont_modify_condgen(self):
    condgen_processor = get_processor(TaskType.conditional_generation.value)
    mt_processor = get_processor(TaskType.machine_translation.value)
    condgen_features_1 = condgen_processor.default_features()
    mt_features = mt_processor.default_features()
    condgen_features_2 = condgen_processor.default_features()
    # MT features didn't change condgen features
    self.assertDictEqual(condgen_features_1, condgen_features_2)
    # condgen features are a subset of MT features
    self.assertDictEqual(mt_features, {**mt_features, **condgen_features_1})

def test_no_user_defined_features(self):
    dataset = os.path.join(self.artifact_path, "no_custom_feature.json")
    loader = get_custom_dataset_loader(
        TaskType.kg_link_tail_prediction,
        dataset,
        dataset,
        Source.local_filesystem,
        Source.local_filesystem,
        FileType.json,
        FileType.json,
    )
    data = loader.load()
    self.assertEqual(data.metadata, FileLoaderMetadata())
    metadata = {
        "task_name": TaskType.kg_link_tail_prediction.value,
        "dataset_name": "fb15k-237-subset",
        "metric_configs": [HitsConfig(name='Hits4', hits_k=4)],
    }
    processor = get_processor(TaskType.kg_link_tail_prediction.value)
    sys_info = processor.process(metadata, data.samples)
    self.assertIsNotNone(sys_info.results.fine_grained)
    self.assertGreater(len(sys_info.results.overall), 0)

def test_custom_features(self):
    loader = get_custom_dataset_loader(
        TaskType.machine_translation,
        self.tsv_dataset,
        self.json_output_with_features,
        Source.local_filesystem,
        Source.local_filesystem,
        FileType.tsv,
        FileType.json,
    )
    data = loader.load()
    self.assertEqual(len(data), 4)
    self.assertEqual(
        data[0],
        {
            'source': 'Ak sa chcete dostať ešte hlbšie, môžete si všimnúť '
            + 'trhlinky.',
            'reference': 'Now just to get really deep in , you can really get to '
            + 'the cracks .',
            'id': '0',
            'hypothesis': 'If you want to get a deeper , you can see the forces .',
            'num_capital_letters': 1,
        },
    )
    processor = get_processor(TaskType.machine_translation.value)
    sys_info = processor.process(dataclasses.asdict(data.metadata), data.samples)
    self.assertTrue('num_capital_letters' in sys_info.results.fine_grained)

def test_e2e(self):
    loader = get_custom_dataset_loader(
        TaskType.aspect_based_sentiment_classification,
        self.tsv_dataset,
        self.txt_output,
        Source.local_filesystem,
        Source.in_memory,
        FileType.tsv,
        FileType.text,
    )
    data = loader.load()
    self.assertEqual(len(data), 100)
    self.assertEqual(
        data[0],
        {
            'aspect': 'Boot time',
            'text': 'Boot time is super fast, around anywhere from 35 seconds to '
            + '1 minute.',
            'true_label': 'positive',
            'id': '0',
            'predicted_label': 'positive',
        },
    )
    metadata = {
        "task_name": TaskType.aspect_based_sentiment_classification,
        "metric_names": ["Accuracy", "F1Score"],
    }
    processor = get_processor(TaskType.aspect_based_sentiment_classification)
    sys_info = processor.process(metadata, data)
    self.assertIsNotNone(sys_info.results.fine_grained)
    self.assertGreater(len(sys_info.results.overall), 0)

def test_generate_system_analysis(self):
    loader = get_custom_dataset_loader(
        TaskType.machine_translation,
        self.tsv_dataset,
        self.txt_output,
        Source.local_filesystem,
        Source.local_filesystem,
        FileType.tsv,
        FileType.text,
    )
    data = loader.load()
    metadata = {
        "task_name": TaskType.machine_translation.value,
        "dataset_name": "ted_multi",
        "metric_names": ["bleu"],
    }
    processor = get_processor(TaskType.machine_translation.value)
    sys_info = processor.process(metadata, data)
    # analysis.write_to_directory("./")
    self.assertIsNotNone(sys_info.results.fine_grained)
    self.assertGreater(len(sys_info.results.overall), 0)

def test_generate_system_analysis(self):
    loader = get_custom_dataset_loader(
        TaskType.named_entity_recognition,
        self.conll_dataset,
        self.conll_output,
        Source.local_filesystem,
        Source.local_filesystem,
        FileType.conll,
        FileType.conll,
    )
    data = loader.load()
    metadata = {
        "task_name": TaskType.named_entity_recognition.value,
        # "dataset_name": "conll2003",
        # "sub_dataset_name": "ner",
        "metric_names": ["F1Score"],
    }
    processor = get_processor(TaskType.named_entity_recognition)
    sys_info = processor.process(metadata, data)
    self.assertIsNotNone(sys_info.results.fine_grained)
    self.assertGreater(len(sys_info.results.overall), 0)

    # ------ Deep Test --------
    # Training-set-dependent features should be disabled when the training
    # dataset is not provided.
    active_features = sys_info.results.fine_grained.keys()
    self.assertTrue(
        "span_econ" not in active_features
        and "span_efre" not in active_features
    )

def test_multiple_qa_customized_feature(self):
    dataset_path = os.path.join(self.artifact_path, "dataset_fig_qa.json")
    output_path = os.path.join(
        self.artifact_path, "output_fig_qa_customized_features.json"
    )
    loader = get_custom_dataset_loader(
        TaskType.qa_multiple_choice,
        dataset_path,
        output_path,
        Source.local_filesystem,
        Source.local_filesystem,
        FileType.json,
        FileType.json,
    )
    data = loader.load()
    self.assertIsInstance(data.samples[0]["commonsense_category"], list)
    self.assertEqual(data.samples[0]["commonsense_category"], ["obj", "cul"])
    metadata = {
        "task_name": TaskType.qa_multiple_choice.value,
        "dataset_name": "fig_qa",
        "metric_names": ["Accuracy"],
        # Don't forget this; otherwise the user-defined features will be ignored
        "user_defined_features_configs": data.metadata.custom_features,
    }
    processor = get_processor(TaskType.qa_multiple_choice.value)
    sys_info = processor.process(metadata, data.samples)
    self.assertIsNotNone(sys_info.results.fine_grained)
    self.assertGreater(len(sys_info.results.overall), 0)

def test_sort_buckets_by_value(self):
    loader = get_custom_dataset_loader(
        TaskType.kg_link_tail_prediction,
        self.test_data,
        self.dataset_no_custom_feature,
    )
    data = loader.load()
    self.assertEqual(data.metadata, FileLoaderMetadata())
    metadata = {
        "task_name": TaskType.kg_link_tail_prediction.value,
        "dataset_name": "fb15k-237",
        "metric_configs": [
            HitsConfig(name='Hits4', hits_k=4),
            MeanReciprocalRankConfig(name='MRR'),
            MeanRankConfig(name='MR'),
        ],
        "sort_by": "performance_value",
        "sort_by_metric": "first",
    }
    processor = get_processor(TaskType.kg_link_tail_prediction.value)
    sys_info = processor.process(metadata, data.samples)
    self.assertIsNotNone(sys_info.results.fine_grained)
    self.assertGreater(len(sys_info.results.overall), 0)
    symmetry_performances = sys_info.results.fine_grained['symmetry']
    if len(symmetry_performances) <= 1:  # can't check sorting with only 1 item
        return
    # buckets should be sorted by the first metric's value, descending
    for i in range(len(symmetry_performances) - 1):
        first_item = symmetry_performances[i].performances[0].value
        second_item = symmetry_performances[i + 1].performances[0].value
        self.assertGreater(first_item, second_item)

def test_generate_system_analysis(self):
    loader = get_custom_dataset_loader(
        TaskType.word_segmentation,
        self.conll_dataset,
        self.conll_output,
        Source.local_filesystem,
        Source.local_filesystem,
        FileType.conll,
        FileType.conll,
    )
    data = loader.load()
    metadata = {
        "task_name": TaskType.word_segmentation.value,
        # "dataset_name": "conll2003",
        # "sub_dataset_name": "ner",
        "metric_names": ["F1Score"],
    }
    processor = get_processor(TaskType.word_segmentation)
    sys_info = processor.process(metadata, data)
    self.assertIsNotNone(sys_info.results.fine_grained)
    self.assertGreater(len(sys_info.results.overall), 0)

def test_no_user_defined_features(self):
    loader = get_custom_dataset_loader(
        TaskType.kg_link_tail_prediction,
        self.test_data,
        self.dataset_no_custom_feature,
        dataset_file_type=FileType.json,
        output_file_type=FileType.json,
    )
    data = loader.load()
    self.assertEqual(data.metadata, FileLoaderMetadata())
    metadata = {
        "task_name": TaskType.kg_link_tail_prediction.value,
        "dataset_name": "fb15k-237-subset",
        "metric_configs": [
            HitsConfig(name='Hits4', hits_k=4),  # you can modify k here
            MeanReciprocalRankConfig(name='MRR'),
            MeanRankConfig(name='MR'),
        ],
    }
    processor = get_processor(TaskType.kg_link_tail_prediction.value)
    sys_info = processor.process(metadata, data.samples)
    self.assertIsNotNone(sys_info.results.fine_grained)
    self.assertGreater(len(sys_info.results.overall), 0)

def test_extractive_qa_en(self):
    json_en_dataset = os.path.join(self.artifact_path, "dataset-xquad-en.json")
    json_en_output = os.path.join(self.artifact_path, "output-xquad-en.json")
    loader = get_custom_dataset_loader(
        TaskType.qa_extractive,
        json_en_dataset,
        json_en_output,
        Source.local_filesystem,
        Source.local_filesystem,
        FileType.json,
        FileType.json,
    )
    data = loader.load()
    self.assertEqual(len(data), 1190)
    sample = data[0]
    self.assertEqual(sample["predicted_answers"], {"text": "308"})
    self.assertEqual(sample["id"], "0")
    self.assertEqual(sample["answers"], {"answer_start": [-1], "text": ["308"]})
    self.assertEqual(
        sample["question"], "How many points did the Panthers defense surrender ?"
    )
    self.assertTrue(sample["context"].startswith("The Panthers"))

    metadata = {
        "task_name": TaskType.qa_extractive,
        "dataset_name": "squad",
        "metric_names": ["F1ScoreQA", "ExactMatchQA"],
        # "language": "en"
    }
    processor = get_processor(TaskType.qa_extractive)
    sys_info = processor.process(metadata, data)
    # analysis.write_to_directory("./")
    self.assertIsNotNone(sys_info.results.fine_grained)
    self.assertGreater(len(sys_info.results.overall), 0)
    get_logger('test').info(f'OVERALL={sys_info.results.overall}')
    # should be 0.6974789915966386
    self.assertAlmostEqual(
        sys_info.results.overall["ExactMatch"].value,
        0.6974789915966386,
        2,
        "almost equal",
    )
    # should be 0.8235975260931867
    self.assertAlmostEqual(
        sys_info.results.overall["F1"].value,
        0.8235975260931867,
        2,
        "almost equal",
    )

def test_readme_custom_dataset(self):
    dataset = f"{top_path}/explainaboard/tests/artifacts/summarization/dataset.tsv"
    output = f"{top_path}/explainaboard/tests/artifacts/summarization/output.txt"
    loader = get_custom_dataset_loader(
        TaskType.summarization, dataset_data=dataset, output_data=output
    )
    data = loader.load()
    processor = get_processor(TaskType.summarization)
    analysis = processor.process(metadata={}, sys_output=data)
    analysis.write_to_directory("./")

def test_simple_example(self):
    # Load the data
    dataset = self.dataset_no_custom_feature
    task = TaskType.kg_link_tail_prediction
    loader = get_custom_dataset_loader(task, dataset, dataset)
    data = loader.load()
    # Initialize the processor and perform the processing
    processor = get_processor(TaskType.kg_link_tail_prediction.value)
    sys_info = processor.process(metadata={}, sys_output=data.samples)
    # If you want to write the report out to disk, you can use
    sys_info.write_to_directory('./')

def test_batch_processing(self):
    sys_out_dir = os.path.join(self.artifact_path, "CL-mt5base", "xnli")
    datasets = [
        os.path.join(sys_out_dir, "datasets", file)
        for file in os.listdir(os.path.join(sys_out_dir, "datasets"))
    ]
    outputs = [
        os.path.join(sys_out_dir, "outputs", file)
        for file in os.listdir(os.path.join(sys_out_dir, "outputs"))
    ]
    file_type = FileType.json
    task_dummy = TaskType.text_classification
    tasks = []
    for dataset, output in zip(datasets, outputs):
        loader = get_custom_dataset_loader(
            task_dummy,
            dataset,
            output,
            dataset_file_type=file_type,
            output_file_type=file_type,
        )
        if not loader.user_defined_metadata_configs:
            raise ValueError(
                f"user_defined_metadata_configs in system output {output} has "
                "not been specified, or the task name should be specified"
            )
        tasks.append(loader.user_defined_metadata_configs['task_name'])

    # Get loaders using the real `task` and `file_type`
    loaders = [
        get_custom_dataset_loader(
            task,
            dataset,
            output,
            dataset_file_type=file_type,
            output_file_type=file_type,
        )
        for dataset, output, task in zip(datasets, outputs, tasks)
    ]
    system_outputs = [loader.load() for loader in loaders]

    # Run analysis
    reports = []
    metadata = {}
    for loader, system_output, task in zip(loaders, system_outputs, tasks):
        metadata.update(loader.user_defined_metadata_configs)
        report = get_processor(task).process(
            metadata=metadata, sys_output=system_output
        )
        reports.append(report)
    self.assertEqual(len(reports), 2)

def test_readme_datalab_dataset(self):
    loader = get_datalab_loader(
        TaskType.text_classification,
        dataset=DatalabLoaderOption("sst2"),
        output_data=f"{top_path}/explainaboard/tests/artifacts/text_classification/"
        "output_sst2.txt",
        output_source=Source.local_filesystem,
        output_file_type=FileType.text,
    )
    data = loader.load()
    processor = get_processor(TaskType.text_classification)
    analysis = processor.process(metadata={}, sys_output=data)
    analysis.write_to_directory("./")

def test_customized_metadata1(self):
    loader = get_datalab_loader(
        TaskType.named_entity_recognition,
        dataset=DatalabLoaderOption("conll2003", "ner"),
        output_data=self.json_output_customized,
        output_source=Source.local_filesystem,
        output_file_type=FileType.json,
    )
    data = loader.load()
    metadata = dataclasses.asdict(data.metadata)
    metadata.update({
        "task_name": TaskType.named_entity_recognition.value,
    })
    processor = get_processor(TaskType.named_entity_recognition)
    sys_info = processor.process(metadata, data.samples)
    self.assertIsNotNone(sys_info.results.fine_grained)
    self.assertGreater(len(sys_info.results.overall), 0)

def test_extractive_qa_zh(self):
    json_zh_dataset = os.path.join(self.artifact_path, "dataset-xquad-zh.json")
    json_zh_output = os.path.join(self.artifact_path, "output-xquad-zh.json")
    loader = get_custom_dataset_loader(
        TaskType.qa_extractive,
        json_zh_dataset,
        json_zh_output,
        Source.local_filesystem,
        Source.local_filesystem,
        FileType.json,
        FileType.json,
    )
    data = loader.load()
    metadata = {
        "task_name": TaskType.qa_extractive.value,
        "dataset_name": "squad",
        "metric_names": ["F1Score", "ExactMatch"],
        "source_language": "zh",
        "target_language": "zh",
    }
    processor = get_processor(TaskType.qa_extractive)
    sys_info = processor.process(metadata, data)
    get_logger('test').info(
        f'--------- sys_info.metric_configs {sys_info.metric_configs}'
    )
    # analysis.write_to_directory("./")
    self.assertIsNotNone(sys_info.results.fine_grained)
    self.assertGreater(len(sys_info.results.overall), 0)
    # should be 0.6285714285714286
    self.assertAlmostEqual(
        sys_info.results.overall["ExactMatch"].value,
        0.6285714285714286,
        2,
        "almost equal",
    )
    # should be 0.7559651817716333
    self.assertAlmostEqual(
        sys_info.results.overall["F1"].value,
        0.7559651817716333,
        2,
        "almost equal",
    )

def test_qa_metrics(self):
    json_en_dataset = os.path.join(
        test_artifacts_path, "extractive_qa", "dataset-xquad-en.json"
    )
    json_en_output = os.path.join(
        test_artifacts_path, "extractive_qa", "output-xquad-en.json"
    )
    loader = get_custom_dataset_loader(
        TaskType.qa_extractive,
        json_en_dataset,
        json_en_output,
        Source.local_filesystem,
        Source.local_filesystem,
        FileType.json,
        FileType.json,
    )
    data = loader.load()
    metadata = {
        "task_name": TaskType.qa_extractive.value,
        "dataset_name": "squad",
        "metric_names": ["F1ScoreQA", "ExactMatchQA"],
    }
    processor = get_processor(TaskType.qa_extractive)
    sys_info = processor.process(metadata, data)
    # analysis.write_to_directory("./")
    self.assertIsNotNone(sys_info.results.fine_grained)
    self.assertGreater(len(sys_info.results.overall), 0)
    # should be 0.6974789915966386
    self.assertAlmostEqual(
        sys_info.results.overall["ExactMatch"].value,
        0.6974789915966386,
        2,
        "almost equal",
    )
    # should be 0.8235975260931867
    self.assertAlmostEqual(
        sys_info.results.overall["F1"].value,
        0.8235975260931867,
        2,
        "almost equal",
    )

def test_datalab_loader(self):
    loader = get_datalab_loader(
        TaskType.cloze_generative,
        dataset=DatalabLoaderOption("gaokao2018_np1", "cloze-hint"),
        output_data=self.json_output,
        output_source=Source.local_filesystem,
        output_file_type=FileType.json,
    )
    data = loader.load()
    metadata = {
        "task_name": TaskType.cloze_generative.value,
        "dataset_name": "gaokao2018_np1",
        "sub_dataset_name": "cloze-hint",
        "metric_names": ["CorrectCount"],
    }
    processor = get_processor(TaskType.cloze_generative.value)
    sys_info = processor.process(metadata, data)
    self.assertIsNotNone(sys_info.results.fine_grained)

def test_process_metadata_in_output_file(self):
    loader = get_custom_dataset_loader(
        TaskType.text_classification,
        self.json_dataset,
        self.json_output,
        Source.local_filesystem,
        Source.local_filesystem,
        FileType.json,
        FileType.json,
    )
    data = loader.load()
    # compare against an empty instance, not the class itself
    self.assertNotEqual(data.metadata, FileLoaderMetadata())
    metadata = dataclasses.asdict(data.metadata)
    processor = get_processor(TaskType.text_classification)
    sys_info = processor.process(metadata, data.samples)
    self.assertIsNotNone(sys_info.results.fine_grained)
    self.assertGreater(len(sys_info.results.overall), 0)

def test_process(self):
    metadata = {
        "task_name": TaskType.text_classification,
        "metric_names": ["Accuracy", "F1Score"],
    }
    loader = get_custom_dataset_loader(
        TaskType.text_classification,
        load_file_as_str(self.tsv_dataset),
        load_file_as_str(self.txt_output),
        Source.in_memory,
        Source.in_memory,
        FileType.tsv,
        FileType.text,
    )
    data = loader.load()
    processor = get_processor(TaskType.text_classification)
    sys_info = processor.process(metadata, data)
    self.assertIsNotNone(sys_info.results.fine_grained)
    self.assertGreater(len(sys_info.results.overall), 0)

def get_customized_results(dataset, customized_features):
    customized_features_performance = {}
    task = TaskType.kg_link_tail_prediction
    loader = get_custom_dataset_loader(task, dataset, dataset)
    data = loader.load()

    # Initialize the processor and perform the processing
    processor = get_processor(TaskType.kg_link_tail_prediction.value)
    metadata = {
        "task_name": TaskType.kg_link_tail_prediction.value,
        "custom_features": data.metadata.custom_features,
    }
    print(metadata)
    sys_info = processor.process(metadata=metadata, sys_output=data.samples)

    # print bucket information
    processor.print_bucket_info(sys_info.results.fine_grained)  # type: ignore

    # get overall results of different metrics
    for metric_name, metric_info in sys_info.results.overall.items():  # type: ignore
        metric_name = metric_info.metric_name
        value = metric_info.value
        confidence_score_low = metric_info.confidence_score_low
        confidence_score_high = metric_info.confidence_score_high
        print(
            f"metric_name:{metric_name}\n"
            f"value:{value}\n"
            f"confidence_score_low:{confidence_score_low}\n"
            f"confidence_score_high:{confidence_score_high}\n"
        )

    # get fine-grained results
    for (
        feature_name,
        feature_info,
    ) in sys_info.results.fine_grained.items():  # type: ignore
        if feature_name in customized_features:
            customized_features_performance[feature_name] = feature_info
    return customized_features_performance

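# A minimal usage sketch for `get_customized_results` (not from the original
# source): the dataset path and the custom feature names below are hypothetical
# and would need to match a real system-output file whose metadata declares
# those custom features.
if __name__ == "__main__":
    customized = get_customized_results(
        "path/to/kg_output_with_custom_features.json",  # hypothetical path
        ["symmetry", "rel_type"],  # hypothetical custom feature names
    )
    for feature_name, feature_info in customized.items():
        print(feature_name, feature_info)
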
def test_snli(self):
    metadata = {
        "task_name": TaskType.text_pair_classification.value,
        "metric_names": ["Accuracy"],
    }
    loader = get_custom_dataset_loader(
        TaskType.text_pair_classification,
        self.tsv_dataset,
        self.txt_output,
        Source.local_filesystem,
        Source.local_filesystem,
        FileType.tsv,
        FileType.text,
    )
    data = loader.load()
    processor = get_processor(TaskType.text_pair_classification)
    sys_info = processor.process(metadata, data)
    self.assertIsNotNone(sys_info.results.fine_grained)
    self.assertGreater(len(sys_info.results.overall), 0)

def test_generate_system_analysis(self):
    loader = get_custom_dataset_loader(
        TaskType.qa_multiple_choice,
        self.json_dataset,
        self.json_output,
        Source.local_filesystem,
        Source.local_filesystem,
        FileType.json,
        FileType.json,
    )
    data = loader.load()
    metadata = {
        "task_name": TaskType.qa_multiple_choice.value,
        "dataset_name": "fig_qa",
        "metric_names": ["Accuracy"],
    }
    processor = get_processor(TaskType.qa_multiple_choice.value)
    sys_info = processor.process(metadata, data)
    self.assertIsNotNone(sys_info.results.fine_grained)
    self.assertGreater(len(sys_info.results.overall), 0)

def test_generate_system_analysis(self):
    path_system_details = os.path.join(
        test_artifacts_path, "test_system_details.json"
    )
    dataset_data = os.path.join(
        test_artifacts_path, "text_classification", "dataset.tsv"
    )
    output_data = os.path.join(
        test_artifacts_path, "text_classification", "output.txt"
    )
    with open(path_system_details) as fin:
        system_details = json.load(fin)
    metadata = {
        "task_name": TaskType.text_classification,
        "metric_names": ["Accuracy"],
        "system_details": system_details,
    }
    loader = get_custom_dataset_loader(
        TaskType.text_classification,
        dataset_data,
        output_data,
        Source.local_filesystem,
        Source.local_filesystem,
        FileType.tsv,
        FileType.text,
    )
    data = loader.load()
    processor = get_processor(TaskType.text_classification)
    sys_info = processor.process(metadata, data)
    # analysis.write_to_directory("./")
    # the system details from the file should be carried through to the report
    self.assertEqual(
        sys_info.system_details,
        {"learning_rate": 0.0001, "number_of_layers": 10},
    )

def test_process_training_set_dependent_features(self):
    metadata = {
        "task_name": TaskType.text_classification.value,
        "metric_names": ["Accuracy", "F1Score"],
        "dataset_name": "ag_news",
        "reload_stat": False,
    }
    loader = get_custom_dataset_loader(
        TaskType.text_classification,
        self.json_dataset,
        self.json_output,
        Source.local_filesystem,
        Source.local_filesystem,
        FileType.json,
        FileType.json,
    )
    data = loader.load()
    processor = get_processor(TaskType.text_classification)
    sys_info = processor.process(metadata, data)
    self.assertIsNotNone(sys_info.results.fine_grained)
    self.assertGreater(len(sys_info.results.overall), 0)

# This script shows (1) how to evaluate your systems using ExplainaBoard
# programmatically and (2) how to collect the different results.
from explainaboard import get_custom_dataset_loader, get_processor, TaskType
from explainaboard.utils.typing_utils import unwrap

# Load the data
dataset = (
    "../../explainaboard/tests/artifacts/kg_link_tail_prediction/no_custom_feature.json"
)
task = TaskType.kg_link_tail_prediction
loader = get_custom_dataset_loader(task, dataset, dataset)
data = loader.load()

# Initialize the processor and perform the processing
processor = get_processor(TaskType.kg_link_tail_prediction.value)
sys_info = processor.process(metadata={}, sys_output=data.samples)

fine_grained_res = unwrap(sys_info.results.fine_grained)
overall_res = unwrap(sys_info.results.overall)

# print bucket information
processor.print_bucket_info(fine_grained_res)

# save the analysis report locally
with open("./report.json", 'w') as report_file:
    sys_info.print_as_json(file=report_file)

# get overall results of different metrics (the loop body mirrors the
# pattern used in get_customized_results above)
for metric_name, metric_info in overall_res.items():
    value = metric_info.value
    confidence_score_low = metric_info.confidence_score_low
    confidence_score_high = metric_info.confidence_score_high
    print(
        f"metric_name:{metric_name}\n"
        f"value:{value}\n"
        f"confidence_score_low:{confidence_score_low}\n"
        f"confidence_score_high:{confidence_score_high}\n"
    )

def main():
    args = create_parser().parse_args()

    reload_stat: bool = args.reload_stat != "0"
    system_outputs: list[str] = args.system_outputs
    reports: list[str] | None = args.reports
    metric_names: list[str] | None = args.metrics
    dataset_file_type: str | None = args.custom_dataset_file_type
    output_file_type: str | None = args.output_file_type
    output_dir: str = args.output_dir

    # If reports have been specified, the ExplainaBoard CLI will perform
    # analysis over the report files.
    if args.reports:
        analyze_reports(args)
    else:

        def load_system_details_path():
            if args.system_details:
                try:
                    with open(args.system_details) as fin:
                        return json.load(fin)
                except ValueError as e:
                    raise ValueError(f'invalid json: {e} for system details')

        output_dir_figures = os.path.join(output_dir, "figures")
        output_dir_reports = os.path.join(output_dir, "reports")

        def setup_output_folders():
            """Set up folders for generated reports and figures."""
            # This part could be generalized
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            if not os.path.exists(output_dir_figures):
                os.makedirs(output_dir_figures)
            if not os.path.exists(output_dir_reports):
                os.makedirs(output_dir_reports)

        system_details: dict | None = load_system_details_path()
        setup_output_folders()

        # check for benchmark submission: explainaboard --system_outputs ./data/
        # system_outputs/sst2/user_specified_metadata.json
        num_systems = len(system_outputs)
        dataset_file_types: list[str | None] = [dataset_file_type] * num_systems
        output_file_types: list[str | None] = [output_file_type] * num_systems
        custom_dataset_paths: list[str] | None = args.custom_dataset_paths
        dataset: str | None = args.dataset
        sub_dataset: str | None = args.sub_dataset
        split: str = args.split
        target_language: str = args.target_language
        source_language: str = args.source_language or target_language
        tasks = get_tasks(args.task, system_outputs)

        # Some loaders need to know the language of the inputs and outputs
        loader_field_mapping = {
            FileLoaderField.SOURCE_LANGUAGE: source_language,
            FileLoaderField.TARGET_LANGUAGE: target_language,
        }
        if custom_dataset_paths:  # load custom datasets
            loaders = [
                get_custom_dataset_loader(
                    task,
                    dataset,
                    output,
                    Source.local_filesystem,
                    Source.local_filesystem,
                    dataset_file_type,
                    output_file_type,
                    field_mapping=loader_field_mapping,
                )
                for task, dataset, output, dataset_file_type, output_file_type
                in zip(
                    tasks,
                    custom_dataset_paths,
                    system_outputs,
                    dataset_file_types,
                    output_file_types,
                )
            ]
        else:  # load from datalab
            if not dataset:
                raise ValueError(
                    "neither custom_dataset_paths nor dataset is defined"
                )
            loaders = [
                get_datalab_loader(
                    task,
                    DatalabLoaderOption(dataset, sub_dataset, split=split),
                    sys_output,
                    Source.local_filesystem,
                    output_file_type,
                    field_mapping=loader_field_mapping,
                )
                for task, sys_output, output_file_type in zip(
                    tasks, system_outputs, output_file_types
                )
            ]
        system_datasets = [loader.load() for loader in loaders]

        # validation
        if len(system_datasets) == 2:
            if len(system_datasets[0]) != len(system_datasets[1]):
                num0 = len(system_datasets[0])
                num1 = len(system_datasets[1])
                raise ValueError(
                    f'Data must be identical for pairwise analysis, but the '
                    f'lengths of the files differ: '
                    f'{system_datasets[0]} ({num0}) != {system_datasets[1]} ({num1})'
                )

        # TODO(gneubig): This gets metadata from the first system and assumes
        # it's the same for the other systems
        target_language = (
            target_language or system_datasets[0].metadata.target_language or 'en'
        )
        source_language = (
            source_language
            or system_datasets[0].metadata.source_language
            or target_language
        )

        # Setup metadata
        metadata = {
"dataset_name": dataset, "sub_dataset_name": sub_dataset, "split_name": split, "source_language": source_language, "target_language": target_language, "reload_stat": reload_stat, "conf_value": args.conf_value, "system_details": system_details, "custom_features": system_datasets[0].metadata.custom_features, } if metric_names is not None: if 'metric_configs' in metadata: raise ValueError( 'Cannot specify both metric names and metric configs') metric_configs = [ metric_name_to_config(name, source_language, target_language) for name in metric_names ] metadata["metric_configs"] = metric_configs # Run analysis reports: list[SysOutputInfo] = [] for loader, system_dataset, system_full_path, task in zip( loaders, system_datasets, system_outputs, tasks): # metadata.update(loader.user_defined_metadata_configs) # metadata[ # "user_defined_features_configs" # ] = loader.user_defined_features_configs metadata["task_name"] = task processor = get_processor(task=task) report = processor.process(metadata=metadata, sys_output=system_dataset) reports.append(report) # print to the console get_logger('report').info('--- Overall Performance') for metric_stat in report.results.overall.values(): get_logger('report').info( f'{metric_stat.metric_name}\t{metric_stat.value}') get_logger('report').info('') get_logger('report').info('--- Bucketed Performance') processor.print_bucket_info(report.results.fine_grained) # save report to `output_dir_reports` x_file_name = os.path.basename(system_full_path).split(".")[0] report.write_to_directory(output_dir_reports, f"{x_file_name}.json") # generate figures and save them into `output_dir_figures` if not os.path.exists(f"{output_dir_figures}/{x_file_name}"): os.makedirs(f"{output_dir_figures}/{x_file_name}") draw_bar_chart_from_reports( [f"{output_dir_reports}/{x_file_name}.json"], f"{output_dir_figures}/{x_file_name}", ) if args.report_json is not None: report_file = open(args.report_json, 'w') else: report_file = sys.stdout if len(system_outputs) == 1: # individual system analysis reports[0].print_as_json(file=report_file) elif len(system_outputs) == 2: # pairwise analysis compare_analysis = get_pairwise_performance_gap( reports[0], reports[1]) compare_analysis.print_as_json(file=report_file) if args.report_json is not None: report_file.close()
def test_datalab_loader(self):
    loader = get_datalab_loader(
        TaskType.named_entity_recognition,
        dataset=DatalabLoaderOption("conll2003", "ner"),
        output_data=self.conll_output_full,
        output_source=Source.local_filesystem,
        output_file_type=FileType.conll,
    )
    data = loader.load()
    metadata = {
        "task_name": TaskType.named_entity_recognition.value,
        "dataset_name": "conll2003",
        "sub_dataset_name": "ner",
        "metric_names": ["F1Score"],
    }
    processor = get_processor(TaskType.named_entity_recognition)
    sys_info = processor.process(metadata, data)
    self.assertIsNotNone(sys_info.results.fine_grained)
    self.assertGreater(len(sys_info.results.overall), 0)

    # ---------------------------------------------------------------------------
    # Deep Test
    # ---------------------------------------------------------------------------
    # 1. Unittest: training-set-dependent features should be included, since the
    #    training dataset is available here (it is loaded from DataLab)
    active_features = sys_info.results.fine_grained.keys()
    self.assertTrue(
        "span_econ" in active_features and "span_efre" in active_features
    )

    # 2. Unittest: test the number of buckets of training-set-dependent features
    n_buckets = len(sys_info.results.fine_grained["span_econ"])
    self.assertEqual(n_buckets, 3)

    # 3. Unittest: test detailed bucket information: bucket interval
    #    [0.007462686567164179, 0.8571428571428571]
    second_bucket = sys_info.results.fine_grained["span_econ"][1]
    self.assertAlmostEqual(
        second_bucket.bucket_interval[0],
        0.007462686567164179,
        4,
        "almost equal",
    )
    self.assertAlmostEqual(
        second_bucket.bucket_interval[1],
        0.8571428571428571,
        4,
        "almost equal",
    )

    # 4. Unittest: test detailed bucket information: number of bucket samples
    self.assertEqual(second_bucket.n_samples, 1007)

    # 5. Unittest: test detailed bucket information: metric
    self.assertEqual(second_bucket.performances[0].metric_name, "F1")
    self.assertAlmostEqual(
        second_bucket.performances[0].value, 0.9203805708562846, 4, "almost equal"
    )

    # 6. Unittest: test detailed bucket information: confidence interval
    self.assertGreater(second_bucket.performances[0].confidence_score_low, 0)

    # 7. Unittest: test that only a subset of cases is stored per bucket (this is
    #    the expected behavior, especially for sequence labeling tasks; otherwise
    #    the analysis report files would be too large)
    self.assertLess(
        len(second_bucket.bucket_samples),
        second_bucket.n_samples,
    )

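# A small illustrative helper (a sketch, not part of the test suite) that walks
# the bucket structure asserted on above. The attribute names (bucket_interval,
# n_samples, performances, metric_name, confidence_score_low/high,
# bucket_samples) all appear in the tests in this section; everything else is
# an assumption.
def describe_buckets(fine_grained, feature_name):
    for bucket in fine_grained[feature_name]:
        low, high = bucket.bucket_interval
        perf = bucket.performances[0]
        print(
            f"{feature_name} [{low:.4f}, {high:.4f}]: "
            f"n_samples={bucket.n_samples}, "
            f"{perf.metric_name}={perf.value:.4f} "
            f"(CI {perf.confidence_score_low}..{perf.confidence_score_high}), "
            f"{len(bucket.bucket_samples)} stored samples"
        )

# e.g. describe_buckets(sys_info.results.fine_grained, "span_econ")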