def cache_online_file(
    online_path: str,
    local_path: str,
    lifetime: datetime.timedelta | None = None,
) -> str:
    """
    Caches an online file locally and returns the path to the local file.
    :param online_path: The path online
    :param local_path: The relative path to the file locally
    :param lifetime: How long this file should be cached before reloading
    :return: The absolute path to the locally cached file
    """
    sanitized_path = sanitize_path(local_path)
    file_path = os.path.join(get_cache_dir(), sanitized_path)

    # Use the cached file if it exists and is young enough
    if os.path.exists(file_path):
        mod_time = datetime.datetime.fromtimestamp(os.path.getmtime(file_path))
        age = datetime.datetime.now() - mod_time
        if lifetime is None or age <= lifetime:
            return file_path

    # Otherwise, download from online
    get_logger().info(f'Caching {online_path} to {file_path}')
    path_dir = Path(file_path).parent.absolute()
    if not os.path.exists(path_dir):
        os.makedirs(path_dir)
    urllib.request.urlretrieve(online_path, file_path)
    return file_path
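# A minimal usage sketch for cache_online_file. The URL, the relative cache
# path, and the `explainaboard.utils.cache_api` import path below are
# illustrative assumptions, not taken from the code above.
import datetime

from explainaboard.utils.cache_api import cache_online_file  # assumed module path

local_copy = cache_online_file(
    'https://example.com/example-dataset.json',  # hypothetical remote file
    'examples/example-dataset.json',             # path relative to the cache dir
    lifetime=datetime.timedelta(days=7),         # re-download once a week
)
print(local_copy)  # absolute path to the locally cached copy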
def test_extractive_qa_en(self):
    json_en_dataset = os.path.join(self.artifact_path, "dataset-xquad-en.json")
    json_en_output = os.path.join(self.artifact_path, "output-xquad-en.json")
    loader = get_custom_dataset_loader(
        TaskType.qa_extractive,
        json_en_dataset,
        json_en_output,
        Source.local_filesystem,
        Source.local_filesystem,
        FileType.json,
        FileType.json,
    )
    data = loader.load()
    self.assertEqual(len(data), 1190)
    sample = data[0]
    self.assertEqual(sample["predicted_answers"], {"text": "308"})
    self.assertEqual(sample["id"], "0")
    self.assertEqual(
        sample["answers"], {"answer_start": [-1], "text": ["308"]}
    )
    self.assertEqual(
        sample["question"],
        "How many points did the Panthers defense surrender ?",
    )
    self.assertTrue(sample["context"].startswith("The Panthers"))

    metadata = {
        "task_name": TaskType.qa_extractive,
        "dataset_name": "squad",
        "metric_names": ["F1ScoreQA", "ExactMatchQA"],
        # "language":"en"
    }

    processor = get_processor(TaskType.qa_extractive)
    sys_info = processor.process(metadata, data)

    # analysis.write_to_directory("./")
    self.assertIsNotNone(sys_info.results.fine_grained)
    self.assertGreater(len(sys_info.results.overall), 0)

    get_logger('test').info(f'OVERALL={sys_info.results.overall}')
    # should be 0.6974789915966386
    self.assertAlmostEqual(
        sys_info.results.overall["ExactMatch"].value,
        0.6974789915966386,
        2,
        "almost equal",
    )
    # should be 0.8235975260931867
    self.assertAlmostEqual(
        sys_info.results.overall["F1"].value,
        0.8235975260931867,
        2,
        "almost equal",
    )
def test_extractive_qa_zh(self):
    json_zh_dataset = os.path.join(self.artifact_path, "dataset-xquad-zh.json")
    json_zh_output = os.path.join(self.artifact_path, "output-xquad-zh.json")
    loader = get_custom_dataset_loader(
        TaskType.qa_extractive,
        json_zh_dataset,
        json_zh_output,
        Source.local_filesystem,
        Source.local_filesystem,
        FileType.json,
        FileType.json,
    )
    data = loader.load()
    metadata = {
        "task_name": TaskType.qa_extractive.value,
        "dataset_name": "squad",
        "metric_names": ["F1Score", "ExactMatch"],
        "source_language": "zh",
        "target_language": "zh",
    }

    processor = get_processor(TaskType.qa_extractive)
    sys_info = processor.process(metadata, data)
    get_logger('test').info(
        f'--------- sys_info.metric_configs {sys_info.metric_configs}')

    # analysis.write_to_directory("./")
    self.assertIsNotNone(sys_info.results.fine_grained)
    self.assertGreater(len(sys_info.results.overall), 0)

    # 0.6285714285714286
    self.assertAlmostEqual(
        sys_info.results.overall["ExactMatch"].value,
        0.6285714285714286,
        2,
        "almost equal",
    )
    # 0.7559651817716333
    self.assertAlmostEqual(
        sys_info.results.overall["F1"].value,
        0.7559651817716333,
        2,
        "almost equal",
    )
def print_score_tensor(score_tensor: dict):
    """
    print the score_tensor, for example,
    ----------------------------------------
    System: CL-mt5base, Dataset: xnli
    Language:   ar      bg      de      el      en      es      fr
    Accuracy:   0.679   0.714   0.721   0.722   0.768   0.738   0.721
    ----------------------------------------
    System: CL-mlpp15out1sum, Dataset: xnli
    Language:   ar      bg      de      el      en      es      fr
    Accuracy:   0.696   0.739   0.735   0.739   0.787   0.768   0.730
    ----------------------------------------
    System: CL-mlpp15out1sum, Dataset: marc
    Language:   de      en      es      fr      ja      zh
    Accuracy:   0.933   0.915   0.934   0.926   0.915   0.871
    """
    get_logger('report').info(score_tensor.keys())
    for system_name, m_value in score_tensor.items():
        for dataset_name, d_value in score_tensor[system_name].items():
            info_printed = (
                f"----------------------------------------\nSystem: "
                f"{system_name}, Dataset: "
                f"{dataset_name} \n"
            )
            info_printed += (
                "Language:\t"
                + "\t".join(score_tensor[system_name][dataset_name].keys())
                + "\n"
            )
            metric_name = list(score_tensor[system_name][dataset_name].values())[0][
                "metric_name"
            ]
            info_printed += (
                f"{metric_name}:\t"
                + "\t".join(
                    [
                        '{:.3f}'.format(score["value"])
                        for score in score_tensor[system_name][dataset_name].values()
                    ]
                )
                + "\n"
            )
            get_logger('report').info(info_printed)
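# A minimal sketch of the nested structure print_score_tensor expects:
# system name -> dataset name -> language -> {"metric_name", "value"}.
# The entries below reuse values from the docstring example above; the rest of
# the table is omitted for brevity.
example_score_tensor = {
    "CL-mt5base": {
        "xnli": {
            "ar": {"metric_name": "Accuracy", "value": 0.679},
            "bg": {"metric_name": "Accuracy", "value": 0.714},
            "de": {"metric_name": "Accuracy", "value": 0.721},
        },
    },
}
print_score_tensor(example_score_tensor)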
def print_bucket_perfs(
    bucket_perfs: list[BucketPerformance], print_information: str
):
    metric_names = [x.metric_name for x in bucket_perfs[0].performances]
    for i, metric_name in enumerate(metric_names):
        get_logger('report').info(f"the information of #{print_information}#")
        get_logger('report').info(f"bucket_interval\t{metric_name}\t#samples")
        for bucket_perf in bucket_perfs:
            get_logger('report').info(
                f"{bucket_perf.bucket_interval}\t"
                f"{bucket_perf.performances[i].value}\t"
                f"{bucket_perf.n_samples}"
            )
        get_logger('report').info('')
def _gen_external_stats(
    self, sys_info: SysOutputInfo, statistics_func: aggregating
):
    """Generate external statistics that are gathered from a relatively costly
    source, such as the training set. These are gathered once and then cached
    for future use.
    :param sys_info: Information about the system outputs
    :param statistics_func: The function used to get the statistics
    :return: Statistics from, usually, the training set that are used to
        calculate other features
    """
    statistics = None
    if sys_info.dataset_name is not None:
        split_name = "train"
        sub_dataset = (
            None
            if sys_info.sub_dataset_name == "default"
            else sys_info.sub_dataset_name
        )
        # read statistics from cache
        if sys_info.reload_stat:
            statistics = read_statistics_from_cache(
                sys_info.dataset_name, sub_dataset
            )
        if statistics is None:
            try:
                dataset = load_dataset(sys_info.dataset_name, sub_dataset)
            except Exception:
                dataset = None
            if dataset is None:
                get_logger().warning(
                    f"{sys_info.dataset_name} hasn't been supported by DataLab so"
                    " no training set dependent features will be supported by"
                    " ExplainaBoard. You can add the dataset by: https://github.com/ExpressAI/DataLab/blob/main/docs/SDK/add_new_datasets_into_sdk.md"  # noqa
                )
            elif not (
                isinstance(dataset, Dataset) or isinstance(dataset, DatasetDict)
            ):
                raise ValueError(
                    'Expecting type Dataset or DatasetDict, '
                    f'but got {type(dataset)}'
                )
            elif split_name not in dataset:
                get_logger().warning(
                    f"{sys_info.dataset_name} has no {split_name} split in DataLab "
                    "so training set dependent features will not be calculated"
                )
            else:
                self._statistics_func.resources = self._get_statistics_resources(
                    sys_info
                )
                new_train = dataset[split_name].apply(  # type: ignore
                    self._statistics_func, mode="local"
                )
                statistics = new_train._stat
                get_logger().info(
                    f"caching stats for {sys_info.dataset_name} {sub_dataset}"
                )
                write_statistics_to_cache(
                    statistics, sys_info.dataset_name, sub_dataset
                )
    return statistics
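# Sketch of how the caching above is typically exercised (the exact call site
# in a concrete processor is an assumption; the behaviour described follows
# the code):
#
#     stats = self._gen_external_stats(sys_info, self._statistics_func)
#     # `stats` is None when the dataset is unknown to DataLab or has no train
#     # split; otherwise it holds training-set statistics, read back from the
#     # local cache when sys_info.reload_stat is True and a cached copy exists.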
import json

import requests

from explainaboard.utils.logging import get_logger

if __name__ == "__main__":

    end_point_upload_dataset = (
        "https://datalab.nlpedia.ai/api/normal_dataset/read_stat"
    )
    data_info = {
        'dataset_name': 'sst2',
        'subset_name': None,
        'version': 'Hugging Face',
        'transformation': {'type': 'origin'},
    }
    response = requests.post(end_point_upload_dataset, json=data_info)
    message = json.loads(response.text.replace("null", ""))["message"]
    get_logger('test').info(message)
    """
    (1) success
    (2) dataset does not exist
    (3) the dataset does not include the information of _stat
    """
    return_content = json.loads(response.content)
    get_logger('test').info(return_content['content'])
def main():

    args = create_parser().parse_args()

    reload_stat: bool = False if args.reload_stat == "0" else True
    system_outputs: list[str] = args.system_outputs

    reports: list[str] | None = args.reports
    metric_names: list[str] | None = args.metrics
    dataset_file_type: str | None = args.custom_dataset_file_type
    output_file_type: str | None = args.output_file_type
    output_dir: str = args.output_dir

    # If reports have been specified, the ExplainaBoard CLI will perform
    # analysis over report files.
    if args.reports:
        analyze_reports(args)
    else:

        def load_system_details_path():
            if args.system_details:
                try:
                    with open(args.system_details) as fin:
                        return json.load(fin)
                except ValueError as e:
                    raise ValueError(f'invalid json: {e} for system details')

        output_dir_figures = os.path.join(output_dir, "figures")
        output_dir_reports = os.path.join(output_dir, "reports")

        def setup_output_folders():
            """Setup for generated reports and figures"""
            # This part could be generalized
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            if not os.path.exists(output_dir_figures):
                os.makedirs(output_dir_figures)
            if not os.path.exists(output_dir_reports):
                os.makedirs(output_dir_reports)

        system_details: dict | None = load_system_details_path()
        setup_output_folders()

        # check for benchmark submission: explainaboard --system_outputs ./data/
        # system_outputs/sst2/user_specified_metadata.json
        num_systems = len(system_outputs)
        dataset_file_types: list[str | None] = [dataset_file_type] * num_systems
        output_file_types: list[str | None] = [output_file_type] * num_systems
        custom_dataset_paths: list[str] | None = args.custom_dataset_paths
        dataset: str | None = args.dataset
        sub_dataset: str | None = args.sub_dataset
        split: str = args.split
        target_language: str = args.target_language
        source_language: str = args.source_language or target_language
        tasks = get_tasks(args.task, system_outputs)

        # Some loaders need to know the language of the inputs and outputs
        loader_field_mapping = {
            FileLoaderField.SOURCE_LANGUAGE: source_language,
            FileLoaderField.TARGET_LANGUAGE: target_language,
        }
        if custom_dataset_paths:  # load custom datasets
            loaders = [
                get_custom_dataset_loader(
                    task,
                    dataset,
                    output,
                    Source.local_filesystem,
                    Source.local_filesystem,
                    dataset_file_type,
                    output_file_type,
                    field_mapping=loader_field_mapping,
                )
                for task, dataset, output, dataset_file_type, output_file_type in zip(
                    tasks,
                    custom_dataset_paths,
                    system_outputs,
                    dataset_file_types,
                    output_file_types,
                )
            ]
        else:  # load from DataLab
            if not dataset:
                raise ValueError(
                    "neither custom_dataset_paths nor dataset is defined")
            loaders = [
                get_datalab_loader(
                    task,
                    DatalabLoaderOption(dataset, sub_dataset, split=split),
                    sys_output,
                    Source.local_filesystem,
                    output_file_type,
                    field_mapping=loader_field_mapping,
                )
                for task, sys_output, output_file_type in zip(
                    tasks, system_outputs, output_file_types)
            ]
        system_datasets = [loader.load() for loader in loaders]

        # validation
        if len(system_datasets) == 2:
            if len(system_datasets[0]) != len(system_datasets[1]):
                num0 = len(system_datasets[0])
                num1 = len(system_datasets[1])
                raise ValueError(
                    f'Data must be identical for pairwise analysis, but length of '
                    'files '
                    f'{system_datasets[0]} ({num0}) != {system_datasets[1]} ({num1})'
                )

        # TODO(gneubig): This gets metadata from the first system and assumes
        # it's the same for other systems
        target_language = (target_language
                           or system_datasets[0].metadata.target_language
                           or 'en')
        source_language = (source_language
                           or system_datasets[0].metadata.source_language
                           or target_language)

        # Setup metadata
        metadata = {
"dataset_name": dataset, "sub_dataset_name": sub_dataset, "split_name": split, "source_language": source_language, "target_language": target_language, "reload_stat": reload_stat, "conf_value": args.conf_value, "system_details": system_details, "custom_features": system_datasets[0].metadata.custom_features, } if metric_names is not None: if 'metric_configs' in metadata: raise ValueError( 'Cannot specify both metric names and metric configs') metric_configs = [ metric_name_to_config(name, source_language, target_language) for name in metric_names ] metadata["metric_configs"] = metric_configs # Run analysis reports: list[SysOutputInfo] = [] for loader, system_dataset, system_full_path, task in zip( loaders, system_datasets, system_outputs, tasks): # metadata.update(loader.user_defined_metadata_configs) # metadata[ # "user_defined_features_configs" # ] = loader.user_defined_features_configs metadata["task_name"] = task processor = get_processor(task=task) report = processor.process(metadata=metadata, sys_output=system_dataset) reports.append(report) # print to the console get_logger('report').info('--- Overall Performance') for metric_stat in report.results.overall.values(): get_logger('report').info( f'{metric_stat.metric_name}\t{metric_stat.value}') get_logger('report').info('') get_logger('report').info('--- Bucketed Performance') processor.print_bucket_info(report.results.fine_grained) # save report to `output_dir_reports` x_file_name = os.path.basename(system_full_path).split(".")[0] report.write_to_directory(output_dir_reports, f"{x_file_name}.json") # generate figures and save them into `output_dir_figures` if not os.path.exists(f"{output_dir_figures}/{x_file_name}"): os.makedirs(f"{output_dir_figures}/{x_file_name}") draw_bar_chart_from_reports( [f"{output_dir_reports}/{x_file_name}.json"], f"{output_dir_figures}/{x_file_name}", ) if args.report_json is not None: report_file = open(args.report_json, 'w') else: report_file = sys.stdout if len(system_outputs) == 1: # individual system analysis reports[0].print_as_json(file=report_file) elif len(system_outputs) == 2: # pairwise analysis compare_analysis = get_pairwise_performance_gap( reports[0], reports[1]) compare_analysis.print_as_json(file=report_file) if args.report_json is not None: report_file.close()
def draw_bar_chart_from_reports(
    reports: list[str], output_dir: str, sys_names: list[str] | None = None
) -> None:
    """
    Draw bar charts from report files generated by ExplainaBoard
    :param reports: Reports to plot
    :param output_dir: The directory where the figures will be written
    :param sys_names: Display names for the systems, defaulting to the report
        file names
    :return: None
    """
    # TODO(gneubig): This should get the system name from inside the report
    if sys_names is None:
        sys_names = [os.path.basename(x).replace('.json', '') for x in reports]
    elif len(sys_names) != len(reports):
        raise ValueError('Length of sys_names must equal that of reports')

    report_info: list[SysOutputInfo] = []
    for report in reports:
        with open(report) as fin:
            report_info.append(SysOutputInfo.from_dict(json.load(fin)))
    overall_results = [
        list(unwrap(x.results.overall).values()) for x in report_info
    ]
    overall_metric_names = list(unwrap(report_info[0].results.overall).keys())
    fg_results = [unwrap(x.results.fine_grained) for x in report_info]

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Overall performance
    ys = [[x.value for x in y] for y in overall_results]
    y_errs = None
    if overall_results[0][0].confidence_score_low is not None:
        y_errs = [
            (
                [x.value - unwrap(x.confidence_score_low) for x in y],
                [unwrap(x.confidence_score_high) - x.value for x in y],
            )
            for y in overall_results
        ]

    make_bar_chart(
        ys,
        output_dir,
        'overall',
        output_fig_format='png',
        fig_size=(8, 6),
        sys_names=sys_names,
        errs=y_errs,
        title=None,
        xticklabels=overall_metric_names,
        ylabel='metric value',
    )

    # Bucket performance: feature name, for example, sentence length
    for feature_name in progress(fg_results[0].keys()):
        # Make sure that buckets exist
        buckets: list[list[BucketPerformance]] = []
        for i, fg_result in enumerate(fg_results):
            if feature_name not in fg_result:
                get_logger().error(
                    f'error: feature {feature_name} not in {reports[i]}')
            else:
                buckets.append(fg_result[feature_name])
                bnames0, bnames = (
                    [x.bucket_interval for x in buckets[0]],
                    [x.bucket_interval for x in buckets[-1]],
                )
                if len(bnames0) != len(bnames):
                    get_logger().error(
                        f'error: different number of buckets for {feature_name} in '
                        f'{reports[0]} and {reports[i]}')
                    buckets = []
                elif bnames0 != bnames:
                    get_logger().warning(
                        f'warning: different bucket labels for {feature_name} in '
                        f'{reports[0]} and {reports[i]}')
            if len(buckets) != i + 1:
                break
        if len(buckets) != len(reports):
            continue
        bucket0_intervals = [x.bucket_interval for x in buckets[0]]
        bucket_metrics = [x.metric_name for x in buckets[0][0].performances]
        for metric_id, metric_name in enumerate(bucket_metrics):
            performances: list[list[Performance]] = [
                [x.performances[metric_id] for x in y] for y in buckets
            ]
            ys = [[x.value for x in y] for y in performances]
            y_errs = None
            if performances[0][0].confidence_score_low is not None:
                y_errs = [
                    (
                        [x.value - unwrap(x.confidence_score_low) for x in y],
                        [unwrap(x.confidence_score_high) - x.value for x in y],
                    )
                    for y in performances
                ]

            make_bar_chart(
                ys,
                output_dir,
                f'{feature_name}_{metric_name}',
                output_fig_format='png',
                fig_size=(8, 6),
                sys_names=sys_names,
                errs=y_errs,
                title=None,
                xlabel=feature_name,
                xticklabels=bucket0_intervals,
                ylabel=metric_name,
            )
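# A minimal usage sketch for draw_bar_chart_from_reports; the report paths,
# output directory, and system names below are hypothetical.
draw_bar_chart_from_reports(
    ["./output/reports/system-a.json", "./output/reports/system-b.json"],
    "./output/figures/comparison",
    sys_names=["system-a", "system-b"],
)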
import copy
import dataclasses
from dataclasses import dataclass, field
import json
import os
import sys
from typing import Any, Optional

from explainaboard import config
from explainaboard.feature import Features
from explainaboard.metrics.metric import MetricConfig, MetricStats
from explainaboard.utils.logging import get_logger
from explainaboard.utils.serialization import general_to_dict
from explainaboard.utils.tokenizer import Tokenizer

logger = get_logger(__name__)


@dataclass
class Table:
    table: Optional[dict] = None


@dataclass
class PaperInfo:
    """
    "year": "xx",
    "venue": "xx",
    "title": "xx",
    "author": "xx",
    "url": "xx",