Example #1
def cache_online_file(
    online_path: str, local_path: str, lifetime: datetime.timedelta | None = None
) -> str:
    """
    Caches an online file locally and returns the path to the local file.
    :param online_path: The path online
    :param local_path: The relative path to the file locally
    :param lifetime: How long this file should be cached before reloading
    :return: The absolute path to the cached file locally
    """
    sanitized_path = sanitize_path(local_path)
    file_path = os.path.join(get_cache_dir(), sanitized_path)
    # Use cached file if it exists and is young enough
    if os.path.exists(file_path):
        mod_time = datetime.datetime.fromtimestamp(os.path.getmtime(file_path))
        age = datetime.datetime.now() - mod_time
        if lifetime is None or age <= lifetime:
            return file_path
    # Else download from online
    get_logger().info(f'Caching {online_path} to {file_path}')
    path_dir = Path(file_path).parent.absolute()
    if not os.path.exists(path_dir):
        os.makedirs(path_dir)
    urllib.request.urlretrieve(online_path, file_path)
    return file_path
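
A minimal usage sketch for the caching helper above; the URL and relative cache path are hypothetical, and the call assumes cache_online_file and its helpers are importable from the surrounding module.

import datetime

# Reuse the cached copy unless it is more than a day old (paths are placeholders).
local_file = cache_online_file(
    "https://example.com/data/labels.json",
    "example/labels.json",
    lifetime=datetime.timedelta(days=1),
)
print(local_file)  # absolute path of the cached file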
Example #2
    def test_extractive_qa_en(self):
        json_en_dataset = os.path.join(self.artifact_path,
                                       "dataset-xquad-en.json")
        json_en_output = os.path.join(self.artifact_path,
                                      "output-xquad-en.json")
        loader = get_custom_dataset_loader(
            TaskType.qa_extractive,
            json_en_dataset,
            json_en_output,
            Source.local_filesystem,
            Source.local_filesystem,
            FileType.json,
            FileType.json,
        )
        data = loader.load()
        self.assertEqual(len(data), 1190)
        sample = data[0]
        self.assertEqual(sample["predicted_answers"], {"text": "308"})
        self.assertEqual(sample["id"], "0")
        self.assertEqual(sample["answers"], {
            "answer_start": [-1],
            "text": ["308"]
        })
        self.assertEqual(
            sample["question"],
            "How many points did the Panthers defense surrender ?")
        self.assertTrue(sample["context"].startswith("The Panthers"))

        metadata = {
            "task_name": TaskType.qa_extractive,
            "dataset_name": "squad",
            "metric_names": ["F1ScoreQA", "ExactMatchQA"],
            # "language":"en"
        }

        processor = get_processor(TaskType.qa_extractive)
        sys_info = processor.process(metadata, data)

        # analysis.write_to_directory("./")
        self.assertIsNotNone(sys_info.results.fine_grained)
        self.assertGreater(len(sys_info.results.overall), 0)
        get_logger('test').info(f'OVERALL={sys_info.results.overall}')
        # should be 0.6974789915966386
        self.assertAlmostEqual(
            sys_info.results.overall["ExactMatch"].value,
            0.6974789915966386,
            2,
            "almost equal",
        )
        # should be 0.8235975260931867
        self.assertAlmostEqual(
            sys_info.results.overall["F1"].value,
            0.8235975260931867,
            2,
            "almost equal",
        )
Example #3
    def test_extractive_qa_zh(self):
        json_zh_dataset = os.path.join(self.artifact_path,
                                       "dataset-xquad-zh.json")
        json_zh_output = os.path.join(self.artifact_path,
                                      "output-xquad-zh.json")
        loader = get_custom_dataset_loader(
            TaskType.qa_extractive,
            json_zh_dataset,
            json_zh_output,
            Source.local_filesystem,
            Source.local_filesystem,
            FileType.json,
            FileType.json,
        )
        data = loader.load()
        metadata = {
            "task_name": TaskType.qa_extractive.value,
            "dataset_name": "squad",
            "metric_names": ["F1Score", "ExactMatch"],
            "source_language": "zh",
            "target_language": "zh",
        }

        processor = get_processor(TaskType.qa_extractive)

        sys_info = processor.process(metadata, data)
        get_logger('test').info(
            f'--------- sys_info.metric_configs {sys_info.metric_configs}')

        # analysis.write_to_directory("./")
        self.assertIsNotNone(sys_info.results.fine_grained)
        self.assertGreater(len(sys_info.results.overall), 0)
        # 0.6285714285714286
        self.assertAlmostEqual(
            sys_info.results.overall["ExactMatch"].value,
            0.6285714285714286,
            2,
            "almost equal",
        )
        # 0.7559651817716333
        self.assertAlmostEqual(
            sys_info.results.overall["F1"].value,
            0.7559651817716333,
            2,
            "almost equal",
        )
Example #4
def print_score_tensor(score_tensor: dict):
    """
    Print the score_tensor. Example output:
     ----------------------------------------
    System: CL-mt5base, Dataset: xnli
    Language:       ar      bg      de      el      en      es      fr
    Accuracy:       0.679   0.714   0.721   0.722   0.768   0.738   0.721

    ----------------------------------------
    System: CL-mlpp15out1sum, Dataset: xnli
    Language:       ar      bg      de      el      en      es      fr
    Accuracy:       0.696   0.739   0.735   0.739   0.787   0.768   0.730

    ----------------------------------------
    System: CL-mlpp15out1sum, Dataset: marc
    Language:       de      en      es      fr      ja      zh
    Accuracy:       0.933   0.915   0.934   0.926   0.915   0.871

    """
    get_logger('report').info(score_tensor.keys())
    for system_name, datasets in score_tensor.items():
        for dataset_name, lang_scores in datasets.items():
            info_printed = (
                f"----------------------------------------\n"
                f"System: {system_name}, Dataset: {dataset_name}\n"
            )
            info_printed += (
                "Language:\t" + "\t".join(lang_scores.keys()) + "\n"
            )
            metric_name = list(lang_scores.values())[0]["metric_name"]
            info_printed += (
                f"{metric_name}:\t"
                + "\t".join(
                    f"{score['value']:.3f}" for score in lang_scores.values()
                )
                + "\n"
            )
            get_logger('report').info(info_printed)
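
The nested structure the function expects can be read directly off the key accesses above: system name → dataset name → language → a dict with "metric_name" and "value". A minimal hand-built sketch, using the numbers from the docstring example:

score_tensor = {
    "CL-mt5base": {
        "xnli": {
            "ar": {"metric_name": "Accuracy", "value": 0.679},
            "bg": {"metric_name": "Accuracy", "value": 0.714},
        },
    },
}
print_score_tensor(score_tensor)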
Example #5
def print_bucket_perfs(bucket_perfs: list[BucketPerformance],
                       print_information: str):
    metric_names = [x.metric_name for x in bucket_perfs[0].performances]
    for i, metric_name in enumerate(metric_names):
        get_logger('report').info(f"the information of #{print_information}#")
        get_logger('report').info(f"bucket_interval\t{metric_name}\t#samples")
        for bucket_perf in bucket_perfs:
            get_logger('report').info(f"{bucket_perf.bucket_interval}\t"
                                      f"{bucket_perf.performances[i].value}\t"
                                      f"{bucket_perf.n_samples}")
        get_logger('report').info('')
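
As a rough illustration of the expected input, the function only reads bucket_interval, n_samples, and the metric_name/value of each entry in performances. The stand-in dataclasses below are simplified assumptions made for this sketch, not the real explainaboard classes:

from dataclasses import dataclass, field

@dataclass
class FakePerformance:  # stand-in for explainaboard's Performance
    metric_name: str
    value: float

@dataclass
class FakeBucketPerformance:  # stand-in for explainaboard's BucketPerformance
    bucket_interval: tuple
    n_samples: int
    performances: list = field(default_factory=list)

bucket = FakeBucketPerformance(
    bucket_interval=(0, 10),
    n_samples=42,
    performances=[FakePerformance("Accuracy", 0.91)],
)
print_bucket_perfs([bucket], "sentence_length")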
Example #6
    def _gen_external_stats(
        self, sys_info: SysOutputInfo, statistics_func: aggregating
    ):
        """Generate external statistics that are gathered from a relatively costly
        source, such as the training set.
        These are gathered once and then cached for future use.
        :param sys_info: Information about the system outputs
        :param statistics_func: The function used to get the statistics
        :return: Statistics from, usually, the training set that are used to calculate
            other features
        """
        statistics = None
        if sys_info.dataset_name is not None:
            split_name = "train"
            sub_dataset = (
                None
                if sys_info.sub_dataset_name == "default"
                else sys_info.sub_dataset_name
            )
            # read statistics from cache
            if sys_info.reload_stat:
                statistics = read_statistics_from_cache(
                    sys_info.dataset_name, sub_dataset
                )
            if statistics is None:
                try:
                    dataset = load_dataset(sys_info.dataset_name, sub_dataset)
                except Exception:
                    dataset = None
                if dataset is None:
                    get_logger().warning(
                        f"{sys_info.dataset_name} hasn't been supported by DataLab so"
                        " no training set dependent features will be supported by"
                        " ExplainaBoard. You can add the dataset by: https://github.com/ExpressAI/DataLab/blob/main/docs/SDK/add_new_datasets_into_sdk.md"  # noqa
                    )
                elif not (
                    isinstance(dataset, Dataset) or isinstance(dataset, DatasetDict)
                ):
                    raise ValueError(
                        'Expecting type Dataset or DatasetDict, '
                        f'but got {type(dataset)}'
                    )
                elif split_name not in dataset:
                    get_logger().warning(
                        f"{sys_info.dataset_name} has no {split_name} split in DataLab "
                        "so training set dependent features will not be calculated"
                    )
                else:
                    self._statistics_func.resources = self._get_statistics_resources(
                        sys_info
                    )
                    new_train = dataset[split_name].apply(  # type: ignore
                        self._statistics_func, mode="local"
                    )
                    statistics = new_train._stat
                    get_logger().info(
                        f"caching stats for {sys_info.dataset_name} {sub_dataset}"
                    )
                    write_statistics_to_cache(
                        statistics, sys_info.dataset_name, sub_dataset
                    )
        return statistics
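
The branching above boils down to a "read from cache if allowed, otherwise recompute and write back" pattern. A stripped-down, self-contained sketch of that pattern; the stub helpers are hypothetical stand-ins for read_statistics_from_cache, load_dataset(...).apply(statistics_func), and write_statistics_to_cache:

_STAT_CACHE: dict = {}

def read_from_cache(key):  # stand-in for read_statistics_from_cache
    return _STAT_CACHE.get(key)

def write_to_cache(key, stats):  # stand-in for write_statistics_to_cache
    _STAT_CACHE[key] = stats

def compute_statistics(key):  # stand-in for running statistics_func over the train split
    return {"num_examples": 0}

def gen_external_stats(key, reload_stat=True):
    stats = read_from_cache(key) if reload_stat else None
    if stats is None:
        stats = compute_statistics(key)
        write_to_cache(key, stats)
    return stats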
Example #7
import json

import requests

from explainaboard.utils.logging import get_logger

if __name__ == "__main__":
    end_point_upload_dataset = "https://datalab.nlpedia.ai/api/normal_dataset/read_stat"
    data_info = {
        'dataset_name': 'sst2',
        'subset_name': None,
        'version': 'Hugging Face',
        'transformation': {'type': 'origin'},
    }
    response = requests.post(end_point_upload_dataset, json=data_info)

    message = json.loads(response.text)["message"]
    get_logger('test').info(message)
    """
    (1) success
    (2) dataset does not exist
    (3) the dataset does not include the information of _stat
    """
    return_content = json.loads(response.content)
    get_logger('test').info(return_content['content'])
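
The script assumes the request succeeds and the payload parses cleanly; a slightly more defensive variant of the same calls (a sketch, not part of the original, reusing the variables defined above) would check the HTTP status and let requests decode the JSON:

response = requests.post(end_point_upload_dataset, json=data_info, timeout=30)
response.raise_for_status()  # fail fast on HTTP errors
payload = response.json()    # JSON null values become None
get_logger('test').info(payload.get("message"))
get_logger('test').info(payload.get("content"))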
Example #8
def main():
    args = create_parser().parse_args()

    reload_stat: bool = args.reload_stat != "0"
    system_outputs: list[str] = args.system_outputs

    reports: list[str] | None = args.reports
    metric_names: list[str] | None = args.metrics
    dataset_file_type: str | None = args.custom_dataset_file_type
    output_file_type: str | None = args.output_file_type
    output_dir: str = args.output_dir

    # If reports have been specified, the ExplainaBoard CLI will perform analysis
    # over the report files.
    if args.reports:
        analyze_reports(args)
    else:

        def load_system_details_path():
            if args.system_details:
                try:
                    with open(args.system_details) as fin:
                        return json.load(fin)
                except ValueError as e:
                    raise ValueError(f'invalid json: {e} for system details')

        output_dir_figures = os.path.join(output_dir, "figures")
        output_dir_reports = os.path.join(output_dir, "reports")

        def setup_output_folders():
            """Setup for generated reports and figures"""
            # This part could be generalized
            if not os.path.exists(output_dir):
                os.makedirs(output_dir)
            if not os.path.exists(output_dir_figures):
                os.makedirs(output_dir_figures)
            if not os.path.exists(output_dir_reports):
                os.makedirs(output_dir_reports)

        system_details: dict | None = load_system_details_path()
        setup_output_folders()

        # check for benchmark submission: explainaboard  --system_outputs ./data/
        # system_outputs/sst2/user_specified_metadata.json
        num_systems = len(system_outputs)
        dataset_file_types: list[str
                                 | None] = [dataset_file_type] * num_systems
        output_file_types: list[str | None] = [output_file_type] * num_systems
        custom_dataset_paths: list[str] | None = args.custom_dataset_paths
        dataset: str | None = args.dataset
        sub_dataset: str | None = args.sub_dataset
        split: str = args.split
        target_language: str = args.target_language
        source_language: str = args.source_language or target_language
        tasks = get_tasks(args.task, system_outputs)

        # Some loaders need to know the language of the inputs and outputs
        loader_field_mapping = {
            FileLoaderField.SOURCE_LANGUAGE: source_language,
            FileLoaderField.TARGET_LANGUAGE: target_language,
        }
        if custom_dataset_paths:  # load custom datasets
            loaders = [
                get_custom_dataset_loader(
                    task,
                    dataset,
                    output,
                    Source.local_filesystem,
                    Source.local_filesystem,
                    dataset_file_type,
                    output_file_type,
                    field_mapping=loader_field_mapping,
                ) for task, dataset, output, dataset_file_type,
                output_file_type in zip(
                    tasks,
                    custom_dataset_paths,
                    system_outputs,
                    dataset_file_types,
                    output_file_types,
                )
            ]
        else:  # load from datalab
            if not dataset:
                raise ValueError(
                    "neither custom_dataset_paths nor dataset is defined")
            loaders = [
                get_datalab_loader(
                    task,
                    DatalabLoaderOption(dataset, sub_dataset, split=split),
                    sys_output,
                    Source.local_filesystem,
                    output_file_type,
                    field_mapping=loader_field_mapping,
                ) for task, sys_output, output_file_type in zip(
                    tasks, system_outputs, output_file_types)
            ]
        system_datasets = [loader.load() for loader in loaders]

        # validation
        if len(system_datasets) == 2:
            if len(system_datasets[0]) != len(system_datasets[1]):
                num0 = len(system_datasets[0])
                num1 = len(system_datasets[1])
                raise ValueError(
                    'System outputs must be the same length for pairwise analysis, '
                    f'but {system_outputs[0]} ({num0}) != '
                    f'{system_outputs[1]} ({num1})'
                )

        # TODO(gneubig): This gets metadata from the first system and assumes it's the
        #  same for other systems
        target_language = (target_language
                           or system_datasets[0].metadata.target_language
                           or 'en')
        source_language = (source_language
                           or system_datasets[0].metadata.source_language
                           or target_language)

        # Setup metadata
        metadata = {
            "dataset_name": dataset,
            "sub_dataset_name": sub_dataset,
            "split_name": split,
            "source_language": source_language,
            "target_language": target_language,
            "reload_stat": reload_stat,
            "conf_value": args.conf_value,
            "system_details": system_details,
            "custom_features": system_datasets[0].metadata.custom_features,
        }

        if metric_names is not None:
            if 'metric_configs' in metadata:
                raise ValueError(
                    'Cannot specify both metric names and metric configs')
            metric_configs = [
                metric_name_to_config(name, source_language, target_language)
                for name in metric_names
            ]
            metadata["metric_configs"] = metric_configs

        # Run analysis
        reports: list[SysOutputInfo] = []
        for loader, system_dataset, system_full_path, task in zip(
                loaders, system_datasets, system_outputs, tasks):

            # metadata.update(loader.user_defined_metadata_configs)
            # metadata[
            #     "user_defined_features_configs"
            # ] = loader.user_defined_features_configs
            metadata["task_name"] = task

            processor = get_processor(task=task)
            report = processor.process(metadata=metadata,
                                       sys_output=system_dataset)
            reports.append(report)

            # print to the console
            get_logger('report').info('--- Overall Performance')
            for metric_stat in report.results.overall.values():
                get_logger('report').info(
                    f'{metric_stat.metric_name}\t{metric_stat.value}')
            get_logger('report').info('')
            get_logger('report').info('--- Bucketed Performance')
            processor.print_bucket_info(report.results.fine_grained)

            # save report to `output_dir_reports`
            x_file_name = os.path.basename(system_full_path).split(".")[0]
            report.write_to_directory(output_dir_reports,
                                      f"{x_file_name}.json")

            # generate figures and save them into  `output_dir_figures`
            if not os.path.exists(f"{output_dir_figures}/{x_file_name}"):
                os.makedirs(f"{output_dir_figures}/{x_file_name}")
            draw_bar_chart_from_reports(
                [f"{output_dir_reports}/{x_file_name}.json"],
                f"{output_dir_figures}/{x_file_name}",
            )

        if args.report_json is not None:
            report_file = open(args.report_json, 'w')
        else:
            report_file = sys.stdout
        if len(system_outputs) == 1:  # individual system analysis
            reports[0].print_as_json(file=report_file)
        elif len(system_outputs) == 2:  # pairwise analysis
            compare_analysis = get_pairwise_performance_gap(
                reports[0], reports[1])
            compare_analysis.print_as_json(file=report_file)
        if args.report_json is not None:
            report_file.close()
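
For a single system output on a custom dataset, the CLI above reduces to the same loader and processor calls used in the extractive QA tests earlier in this section; a condensed sketch with placeholder paths:

loader = get_custom_dataset_loader(
    TaskType.qa_extractive,
    "path/to/dataset-xquad-en.json",  # placeholder dataset path
    "path/to/output-xquad-en.json",   # placeholder system output path
    Source.local_filesystem,
    Source.local_filesystem,
    FileType.json,
    FileType.json,
)
data = loader.load()
metadata = {
    "task_name": TaskType.qa_extractive,
    "dataset_name": "squad",
    "metric_names": ["F1ScoreQA", "ExactMatchQA"],
}
processor = get_processor(TaskType.qa_extractive)
report = processor.process(metadata=metadata, sys_output=data)
report.write_to_directory("./reports", "output-xquad-en.json")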
Example #9
def draw_bar_chart_from_reports(reports: list[str],
                                output_dir: str,
                                sys_names: list[str] | None = None) -> None:
    """
    Draw bar charts from report file generated from ExplainaBoard
    :param reports: Reports to plot
    :param output_dir:
    :return:
    """

    # TODO(gneubig): This should get the system name from inside the report
    if sys_names is None:
        sys_names = [os.path.basename(x).replace('.json', '') for x in reports]
    elif len(sys_names) != len(reports):
        raise ValueError('Length of sys_names must equal that of reports')

    report_info: list[SysOutputInfo] = []
    for report in reports:
        with open(report) as fin:
            report_info.append(SysOutputInfo.from_dict(json.load(fin)))
    overall_results = [
        list(unwrap(x.results.overall).values()) for x in report_info
    ]
    overall_metric_names = list(unwrap(report_info[0].results.overall).keys())
    fg_results = [unwrap(x.results.fine_grained) for x in report_info]

    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Overall performance
    ys = [[x.value for x in y] for y in overall_results]
    y_errs = None
    if overall_results[0][0].confidence_score_low is not None:
        y_errs = [(
            [x.value - unwrap(x.confidence_score_low) for x in y],
            [unwrap(x.confidence_score_high) - x.value for x in y],
        ) for y in overall_results]

    make_bar_chart(
        ys,
        output_dir,
        'overall',
        output_fig_format='png',
        fig_size=(8, 6),
        sys_names=sys_names,
        errs=y_errs,
        title=None,
        xticklabels=overall_metric_names,
        ylabel='metric value',
    )

    # Bucket performance: feature name, for example, sentence length
    for feature_name in progress(fg_results[0].keys()):
        # Make sure that buckets exist
        buckets: list[list[BucketPerformance]] = []
        for i, fg_result in enumerate(fg_results):
            if feature_name not in fg_result:
                get_logger().error(
                    f'error: feature {feature_name} not in {reports[i]}')
            else:
                buckets.append(fg_result[feature_name])
                bnames0 = [x.bucket_interval for x in buckets[0]]
                bnames = [x.bucket_interval for x in buckets[-1]]
                if len(bnames0) != len(bnames):
                    get_logger().error(
                        f'error: different number of buckets for {feature_name} in '
                        f'{reports[0]} and {reports[i]}')
                    buckets = []
                elif bnames0 != bnames:
                    get_logger().warning(
                        f'warning: different bucket labels for {feature_name} in '
                        f'{reports[0]} and {reports[i]}')
            if len(buckets) != i + 1:
                break
        if len(buckets) != len(reports):
            continue

        bucket0_intervals = [x.bucket_interval for x in buckets[0]]
        bucket_metrics = [x.metric_name for x in buckets[0][0].performances]
        for metric_id, metric_name in enumerate(bucket_metrics):

            performances: list[list[Performance]] = [
                [x.performances[metric_id] for x in y] for y in buckets
            ]
            ys = [[x.value for x in y] for y in performances]

            y_errs = None
            if performances[0][0].confidence_score_low is not None:
                y_errs = [(
                    [x.value - unwrap(x.confidence_score_low) for x in y],
                    [unwrap(x.confidence_score_high) - x.value for x in y],
                ) for y in performances]

            make_bar_chart(
                ys,
                output_dir,
                f'{feature_name}_{metric_name}',
                output_fig_format='png',
                fig_size=(8, 6),
                sys_names=sys_names,
                errs=y_errs,
                title=None,
                xlabel=feature_name,
                xticklabels=bucket0_intervals,
                ylabel=metric_name,
            )
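
A usage sketch mirroring the call made from the CLI in the previous example; the report paths and output directory are placeholders:

draw_bar_chart_from_reports(
    ["./reports/system-a.json", "./reports/system-b.json"],
    "./figures",
    sys_names=["system-a", "system-b"],
)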
Example #10
import copy
import dataclasses
from dataclasses import dataclass, field
import json
import os
import sys
from typing import Any, Optional

from explainaboard import config
from explainaboard.feature import Features
from explainaboard.metrics.metric import MetricConfig, MetricStats
from explainaboard.utils.logging import get_logger
from explainaboard.utils.serialization import general_to_dict
from explainaboard.utils.tokenizer import Tokenizer

logger = get_logger(__name__)


@dataclass
class Table:
    table: Optional[dict] = None


@dataclass
class PaperInfo:
    """
    "year": "xx",
    "venue": "xx",
    "title": "xx",
    "author": "xx",
    "url": "xx",