def __init__(self,
             dataset: WMTDataset,
             source_lang: Language,
             target_lang: Language,
             local_root: str = '.',
             source_dataset_filename: str = None,
             target_dataset_filename: str = None,
             model_name: str = None,
             paper_arxiv_id: str = None,
             paper_pwc_id: str = None,
             paper_results: dict = None,
             model_description: str = None,
             tokenization: Callable[[str], str] = None):
    """
    Set up an evaluator for a WMT translation benchmark.

    :param dataset: Dataset to evaluate on; stored as ``self.dataset``.
    :param source_lang: Source language; stored as ``self.source_lang``.
    :param target_lang: Target language; stored as ``self.target_lang``.
    :param local_root: Local dataset directory. It is passed through
        ``change_root_if_server`` together with the server root
        ``.data/nlp/wmt``.
    :param source_dataset_filename: Filename of the source documents inside
        the root. When None, or when ``is_server()`` is true, the default
        from ``self._get_source_dataset_filename()`` is used instead.
    :param target_dataset_filename: Filename of the reference documents
        inside the root. Falls back to the default under the same
        conditions as :param:`source_dataset_filename`.
    :param model_name: Forwarded to the base-class constructor.
    :param paper_arxiv_id: Forwarded to the base-class constructor.
    :param paper_pwc_id: Forwarded to the base-class constructor.
    :param paper_results: Forwarded to the base-class constructor.
    :param model_description: Forwarded to the base-class constructor.
    :param tokenization: Optional ``str -> str`` callable handed to
        ``TranslationMetrics`` (presumably used for tokenized scoring -
        confirm against ``TranslationMetrics``).
    """
    super().__init__(model_name, paper_arxiv_id, paper_pwc_id,
                     paper_results, model_description)

    self.root = change_root_if_server(root=local_root,
                                      server_root=".data/nlp/wmt")
    self.dataset = dataset
    self.source_lang = source_lang
    self.target_lang = target_lang

    # The defaults are looked up only after dataset and languages are stored
    # on the instance, exactly as in the original statement order.
    standard_source, standard_target = self._get_source_dataset_filename()

    # A caller-supplied filename is honoured only for local runs.
    source_name = (source_dataset_filename
                   if source_dataset_filename is not None and not is_server()
                   else standard_source)
    target_name = (target_dataset_filename
                   if target_dataset_filename is not None and not is_server()
                   else standard_target)

    root_dir = Path(self.root)
    self.source_dataset_path = root_dir / source_name
    self.target_dataset_path = root_dir / target_name
    self.metrics = TranslationMetrics(self.source_dataset_path,
                                      self.target_dataset_path,
                                      tokenization)
def __init__(self,
             root: str = '.',
             model_name: str = None,
             paper_arxiv_id: str = None,
             paper_pwc_id: str = None,
             paper_results: dict = None,
             model_description=None,):
    """Initialise the ImageNet benchmark evaluator.

    Args:
        root (string): Root directory of the ImageNet Dataset - the place
            where the label data lives (or will be downloaded to).
        model_name (str, optional): Name of the model from the paper, used
            to link your build to a model from a machine learning paper.
            Model names are listed on the ImageNet benchmark page,
            https://sotabench.com/benchmarks/image-classification-on-imagenet,
            e.g. on the paper leaderboard tab.
        paper_arxiv_id (str, optional): arXiv ID of the corresponding
            paper, for linking to papers on the leaderboard,
            e.g. '1611.05431'.
        paper_pwc_id (str, optional): Papers With Code URL slug of the
            corresponding paper,
            e.g. 'u-gat-it-unsupervised-generative-attentional'.
        paper_results (dict, optional): Paper results to report when the
            paper model you are reproducing has no results on
            sotabench.com. Keys are metric names, values are metric
            values, e.g.::

                {'Top 1 Accuracy': 0.543, 'Top 5 Accuracy': 0.654}.

            Metric names must match the sotabench leaderboard - for
            ImageNet these are 'Top 1 Accuracy' and 'Top 5 Accuracy'.
        model_description (str, optional): Optional model description.
    """
    # Resolve the (possibly server-side) root first, then expand "~".
    resolved_root = change_root_if_server(
        root=root, server_root="./.data/vision/imagenet")
    self.root = os.path.expanduser(resolved_root)

    # Paper / leaderboard linkage.
    self.model_name = model_name
    self.paper_arxiv_id = paper_arxiv_id
    self.paper_pwc_id = paper_pwc_id
    self.paper_results = paper_results
    self.model_description = model_description

    # Running accuracy meters, created before the targets are loaded.
    self.top1 = AverageMeter()
    self.top5 = AverageMeter()
    self.load_targets()

    # Evaluation state.
    self.outputs = {}
    self.results = None

    # Batch-hash / caching bookkeeping.
    self.first_batch_processed = False
    self.batch_hash = None
    self.cached_results = False

    # Speed and memory metrics; the timestamp is taken last.
    self.speed_mem_metrics = {}
    self.init_time = time.time()
def _get_path(self, local_root, local_unzip=False):
    """Return the path of ``wiki.test.tokens``, extracting the zip if needed.

    :param local_root: Local dataset directory; passed through
        ``change_root_if_server`` with a server root of ``.data/nlp/``
        followed by the lower-cased ``self.pwc_name``.
    :param local_unzip: Accepted but not used by this method.
    :return: ``Path`` to ``wiki.test.tokens`` inside the resolved root.
    """
    benchmark_dir = self.pwc_name.lower()
    root = Path(change_root_if_server(root=local_root,
                                      server_root=".data/nlp/" + benchmark_dir))
    dataset_path = root / "wiki.test.tokens"

    if dataset_path.exists():
        return dataset_path

    # The archive is unpacked into the parent of the root directory
    # (presumably the zip carries its own top-level folder - confirm).
    archive_path = root / (benchmark_dir + "-v1.zip")
    extract_archive(str(archive_path), to_path=root.parent)
    return dataset_path
def get_path(local_root, local_unzip=False):
    """Return the paths of the matched and mismatched MNLI dev files.

    If ``MNLI/dev_matched.tsv`` does not exist under the resolved root,
    ``MNLI.zip`` from that root is extracted into it first.

    :param local_root: Local dataset directory; passed through
        ``change_root_if_server`` with server root ``.data/nlp/multinli``.
    :param local_unzip: Accepted but not used by this function.
    :return: Tuple ``(dev_matched.tsv path, dev_mismatched.tsv path)``.
    """
    data_root = Path(change_root_if_server(root=local_root,
                                           server_root=".data/nlp/multinli"))
    mnli_dir = data_root / "MNLI"
    matched_path = mnli_dir / "dev_matched.tsv"
    mismatched_path = mnli_dir / "dev_mismatched.tsv"

    if not matched_path.exists():
        # Only the matched file is probed; the archive holds both files.
        extract_archive(str(data_root / "MNLI.zip"), to_path=data_root)

    return (matched_path, mismatched_path)
def __init__(self,
             local_root: str = '.',
             dataset_filename: str = None,
             model_name: str = None,
             paper_arxiv_id: str = None,
             paper_pwc_id: str = None,
             paper_results: dict = None,
             model_description=None,
             version: SQuADVersion = SQuADVersion.V20):
    """
    Build an evaluator for the SQuAD v1.1 or v2.0 Question Answering benchmark.

    :param local_root: Directory that holds the dataset files locally.
        Ignored when run on sotabench server.
    :param dataset_filename: Local filename of the JSON file with the SQuAD
        dataset. When None, the standard filename derived from
        :param:`version` is used. Ignored when run on sotabench server.
    :param model_name: Name of the model from the paper, used to link your
        build to a model from a machine learning paper. Model names are
        listed on the SQuAD benchmark pages (e.g.,
        https://sotabench.com/benchmarks/question-answering-on-squad11-dev)
        on the paper leaderboard or models yet to try tabs.
    :param paper_arxiv_id: arXiv ID of the corresponding paper, for linking
        to papers on the leaderboard, e.g. '1907.10529'.
    :param paper_pwc_id: Papers With Code URL slug of the corresponding
        paper, e.g. 'spanbert-improving-pre-training-by'
    :param paper_results: Paper results to report when the paper model you
        are reproducing has no results on sotabench.com. Keys are metric
        names, values are metric values, e.g: {'EM': 0.858, 'F1': 0.873}.
        Metric names must match the sotabench leaderboard - for SQuAD
        benchmarks use `EM` for exact match and `F1` for F1 score. Use
        results of evaluation on a development set.
    :param model_description: Optional model description.
    :param version: Dataset to evaluate on, either `SQuADVersion.V11` or
        `SQuADVersion.V20`.
    """
    super().__init__(model_name, paper_arxiv_id, paper_pwc_id,
                     paper_results, model_description)

    self.root = change_root_if_server(root=local_root,
                                      server_root=".data/nlp/squad")
    self.version = version

    # A custom filename is honoured only for local runs.
    if dataset_filename is not None and not is_server():
        chosen_filename = dataset_filename
    else:
        chosen_filename = "dev-{}.json".format(version.value)

    self.dataset_path = Path(self.root) / chosen_filename
    self.metrics = SQuADMetrics(self.dataset_path, version)
def __init__(self,
             local_root: str = '.',
             dataset_filename: str = None,
             model_name: str = None,
             paper_arxiv_id: str = None,
             paper_pwc_id: str = None,
             paper_results: dict = None,
             model_description=None,
             version: SQuADVersion = SQuADVersion.V20):
    """
    Set up a SQuAD evaluator for the given dataset version.

    :param local_root: Local dataset directory. It is passed through
        ``change_root_if_server`` together with the server root
        ``.data/nlp/squad``.
    :param dataset_filename: Filename of the dataset JSON inside the root.
        When None, or when ``is_server()`` is true, ``dev-<version>.json``
        (built from ``version.value``) is used instead.
    :param model_name: Forwarded to the base-class constructor.
    :param paper_arxiv_id: Forwarded to the base-class constructor.
    :param paper_pwc_id: Forwarded to the base-class constructor.
    :param paper_results: Forwarded to the base-class constructor.
    :param model_description: Forwarded to the base-class constructor.
    :param version: Dataset version; stored as ``self.version`` and handed
        to ``SQuADMetrics``.
    """
    super().__init__(model_name, paper_arxiv_id, paper_pwc_id,
                     paper_results, model_description)

    self.version = version
    self.root = change_root_if_server(root=local_root,
                                      server_root=".data/nlp/squad")

    # The caller's filename wins only for local runs; otherwise the
    # version-derived default name is used.
    resolved_name = (dataset_filename
                     if dataset_filename is not None and not is_server()
                     else "dev-{}.json".format(version.value))

    self.dataset_path = Path(self.root) / resolved_name
    self.metrics = SQuADMetrics(self.dataset_path, version)
def __init__(self,
             dataset: WMTDataset,
             source_lang: Language,
             target_lang: Language,
             local_root: str = '.',
             source_dataset_filename: str = None,
             target_dataset_filename: str = None,
             model_name: str = None,
             paper_arxiv_id: str = None,
             paper_pwc_id: str = None,
             paper_results: dict = None,
             model_description: str = None,
             tokenization: Callable[[str], str] = None):
    """
    Build an evaluator for one of the WMT benchmarks.

    :param dataset: Dataset to evaluate on, e.g., WMTDataset.News2014.
    :param source_lang: Language of the documents to translate.
    :param target_lang: Language into which the documents are translated.
    :param local_root: Directory that holds the dataset files locally.
        Ignored when run on sotabench server.
    :param source_dataset_filename: Local filename of the SGML file with
        the source documents. When None, the standard WMT filename derived
        from :param:`dataset`, :param:`source_lang` and
        :param:`target_lang` is used. Ignored when run on sotabench server.
    :param target_dataset_filename: Local filename of the SGML file with
        the reference documents. When None, the standard WMT filename
        derived from :param:`dataset`, :param:`source_lang` and
        :param:`target_lang` is used. Ignored when run on sotabench server.
    :param model_name: Name of the model from the paper, used to link your
        build to a model from a machine learning paper. Model names are
        listed on the WMT benchmark pages (e.g.,
        https://sotabench.com/benchmarks/machine-translation-on-wmt2014-english-german)
        on the paper leaderboard or models yet to try tabs.
    :param paper_arxiv_id: arXiv ID of the corresponding paper, for linking
        to papers on the leaderboard, e.g. '1907.06616'.
    :param paper_pwc_id: Papers With Code URL slug of the corresponding
        paper, e.g. 'facebook-fairs-wmt19-news-translation-task'
    :param paper_results: Paper results to report when the paper model you
        are reproducing has no results on sotabench.com. Keys are metric
        names, values are metric values,
        e.g: {'SacreBLEU': 42.7, 'BLEU score': 43.1}.
        Metric names must match the sotabench leaderboard - for WMT
        benchmarks use `SacreBLEU` for de-tokenized case sensitive BLEU
        score and `BLEU score` for tokenized BLEU.
    :param model_description: Optional model description.
    :param tokenization: Optional tokenization function used to compute
        the tokenized BLEU score. It receives a single string - a segment
        to tokenize - and returns a string with tokens separated by space,
        e.g.:
            tokenization = lambda seg: seg.replace("'s", " 's").replace("-", " - ")
        When None, only the de-tokenized SacreBLEU score is reported.
    """
    super().__init__(model_name, paper_arxiv_id, paper_pwc_id,
                     paper_results, model_description)

    self.root = change_root_if_server(root=local_root,
                                      server_root=".data/nlp/wmt")
    self.dataset = dataset
    self.source_lang = source_lang
    self.target_lang = target_lang

    # Standard filenames are looked up only after dataset and languages are
    # stored on the instance, exactly as in the original statement order.
    fallback_source, fallback_target = self._get_source_dataset_filename()

    # Custom filenames are honoured only for local runs.
    if source_dataset_filename is not None and not is_server():
        source_name = source_dataset_filename
    else:
        source_name = fallback_source

    if target_dataset_filename is not None and not is_server():
        target_name = target_dataset_filename
    else:
        target_name = fallback_target

    self.source_dataset_path = Path(self.root) / source_name
    self.target_dataset_path = Path(self.root) / target_name
    self.metrics = TranslationMetrics(self.source_dataset_path,
                                      self.target_dataset_path,
                                      tokenization)
def __init__(
        self,
        root: str = '.',
        split: str = "val",
        dataset_year: str = "2017",
        model_name: str = None,
        paper_arxiv_id: str = None,
        paper_pwc_id: str = None,
        paper_results: dict = None,
        model_description=None,
):
    """Initialise the COCO benchmark evaluator.

    Args:
        root (string): Root directory of the COCO Dataset - the place where
            the label data lives (or will be downloaded to).
        split (str): The COCO split to use, e.g. 'val'.
        dataset_year (str): The COCO dataset year to use.
        model_name (str, optional): Name of the model from the paper, used
            to link your build to a machine learning paper. Model names are
            listed on the COCO benchmark page,
            https://sotabench.com/benchmarks/object-detection-on-coco-minival,
            e.g. on the paper leaderboard tab.
        paper_arxiv_id (str, optional): arXiv ID of the corresponding
            paper, for linking to papers on the leaderboard,
            e.g. '1611.05431'.
        paper_pwc_id (str, optional): Papers With Code URL slug of the
            corresponding paper,
            e.g. 'u-gat-it-unsupervised-generative-attentional'.
        paper_results (dict, optional): Paper results to report when the
            paper you are reproducing has no model results on
            sotabench.com. Keys are metric names, values are metric
            values, e.g.::

                {'box AP': 0.349, 'AP50': 0.592, ...}.

            Metric names must match the sotabench leaderboard - for COCO
            these are 'box AP', 'AP50', 'AP75', 'APS', 'APM', 'APL'.
        model_description (str, optional): Optional model description.
    """
    self.root = change_root_if_server(
        root=root, server_root="./.data/vision/coco")

    # Paper / leaderboard linkage.
    self.model_name = model_name
    self.paper_arxiv_id = paper_arxiv_id
    self.paper_pwc_id = paper_pwc_id
    self.paper_results = paper_results
    self.model_description = model_description

    self.split = split

    # Annotation file for the chosen split and year, relative to the root.
    annotation_name = "annotations/instances_%s%s.json" % (self.split,
                                                           dataset_year)
    annotation_file = os.path.join(self.root, annotation_name)
    self._download(annotation_file)

    # COCO ground truth and the evaluator built on top of it.
    self.coco = COCO(annotation_file)
    self.iou_types = ['bbox']
    self.coco_evaluator = CocoEvaluator(self.coco, self.iou_types)

    # Evaluation state.
    self.detections = []
    self.results = None

    # Batch-hash / caching bookkeeping.
    self.first_batch_processed = False
    self.batch_hash = None
    self.cached_results = False

    # Speed and memory metrics; the timestamp is taken last.
    self.speed_mem_metrics = {}
    self.init_time = time.time()