def __init__(self, dataset: WMTDataset, source_lang: Language,
             target_lang: Language, local_root: str = '.',
             source_dataset_filename: str = None,
             target_dataset_filename: str = None,
             model_name: str = None, paper_arxiv_id: str = None,
             paper_pwc_id: str = None, paper_results: dict = None,
             model_description: str = None,
             tokenization: Callable[[str], str] = None):
    super().__init__(model_name, paper_arxiv_id, paper_pwc_id,
                     paper_results, model_description)
    self.root = change_root_if_server(root=local_root,
                                      server_root=".data/nlp/wmt")
    self.dataset = dataset
    self.source_lang = source_lang
    self.target_lang = target_lang

    default_src_fn, default_dst_fn = self._get_source_dataset_filename()
    if source_dataset_filename is None or is_server():
        source_dataset_filename = default_src_fn

    if target_dataset_filename is None or is_server():
        target_dataset_filename = default_dst_fn

    self.source_dataset_path = Path(self.root) / source_dataset_filename
    self.target_dataset_path = Path(self.root) / target_dataset_filename

    self.metrics = TranslationMetrics(self.source_dataset_path,
                                      self.target_dataset_path,
                                      tokenization)
def make_data(batch_size):
    print('Preparing data...', flush=True)
    if is_server():
        datadir = './.data/vision/imagenet'
    else:  # local settings
        datadir = '/fastwork/data/ilsvrc2012'

    # Set up the input pipeline.
    _, crop = bit_hyperrule.get_resolution_from_dataset('imagenet2012')
    input_tx = tv.transforms.Compose([
        tv.transforms.Resize((crop, crop)),
        tv.transforms.ToTensor(),
        tv.transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
    ])

    # valid_set = tv.datasets.ImageFolder(os.path.join(datadir, 'val'), input_tx)
    valid_set = tv.datasets.ImageNet(datadir, split='val', transform=input_tx)
    valid_loader = torch.utils.data.DataLoader(
        valid_set, batch_size=batch_size, shuffle=False,
        num_workers=8, pin_memory=True, drop_last=False)
    return valid_set, valid_loader
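# Usage sketch (assumption: `tv` and `torch` are imported as in the snippet
# above; the ResNet-50 stand-in below is illustrative, not the BiT model this
# script actually benchmarks).
valid_set, valid_loader = make_data(batch_size=256)
model = tv.models.resnet50(pretrained=True).cuda().eval()
with torch.no_grad():
    for images, targets in valid_loader:
        logits = model(images.cuda(non_blocking=True))
        break  # one batch is enough to illustrate the pipeline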
def cache_exists(self):
    """
    Checks whether the cache exists in the sotabench.com database - if so
    then sets self.results to cached results and returns True.

    You can use this property for control flow to break a for loop over a
    dataset after the first iteration. This prevents re-running the same
    calculation for the same model twice.

    Q: Why should the user use this?

    A: If you want fast "continuous evaluation" and want to avoid rerunning
    the same model over and over each time you commit something new to your
    repository.

    Examples:
        Breaking a for loop if the model is the same as the last time we ran

        .. code-block:: python

            ...

            with torch.no_grad():
                for i, (input, target) in enumerate(iterator):
                    ...
                    output = model(input)
                    # optional formatting of output here to be a list of detection dicts
                    evaluator.add(output)

                    if evaluator.cache_exists:
                        break

            evaluator.save()

    This logic is for the server; it will not break the loop if you
    evaluate locally.

    :return: bool or None (if not on server)
    """
    if not is_server():  # we only check the cache on the server
        return None

    if not self.first_batch_processed:
        return False

    if self._cache_exists is not None:
        return self._cache_exists

    client = Client.public()
    cached_res = client.get_results_by_run_hash(self.batch_hash)
    if cached_res:
        self.results = cached_res
        self.cached_results = True
        print("No model change detected (using the first batch run "
              f"hash {self.batch_hash}). Will use cached results.")
        self._cache_exists = True
    else:
        self._cache_exists = False
    return self._cache_exists
def cache_exists(self):
    """
    Checks whether the cache exists in the sotabench.com database - if so
    then sets self.results to cached results and returns True.

    You can use this property for control flow to break a for loop over a
    dataset after the first iteration. This prevents re-running the same
    calculation for the same model twice.

    Q: Why should the user use this?

    A: If you want fast "continuous evaluation" and want to avoid rerunning
    the same model over and over each time you commit something new to your
    repository.

    Examples:
        Breaking a for loop for a PyTorch evaluation

        .. code-block:: python

            ...

            with torch.no_grad():
                for i, (input, target) in enumerate(test_loader):
                    input = input.to(device=device, non_blocking=True)
                    target = target.to(device=device, non_blocking=True)
                    output = model(input)

                    image_ids = [img[0].split('/')[-1].replace('.JPEG', '')
                                 for img in test_loader.dataset.imgs[
                                     i * test_loader.batch_size:
                                     (i + 1) * test_loader.batch_size]]

                    evaluator.add(dict(zip(image_ids,
                                           list(output.cpu().numpy()))))

                    if evaluator.cache_exists:
                        break

            evaluator.save()  # uses the cached results

    This logic is for the server; it will not break the loop if you
    evaluate locally.

    :return: bool or None (if not on server)
    """
    if not self.first_batch_processed:
        raise ValueError(
            'No batches of data have been processed so no batch_hash exists')

    if not is_server():  # we only check the cache on the server
        return None

    client = Client.public()
    cached_res = client.get_results_by_run_hash(self.batch_hash)
    if cached_res:
        self.results = cached_res
        self.cached_results = True
        print("No model change detected (using the first batch run "
              "hash). Will use cached results.")
        return True

    return False
def cache_exists(self):
    """
    Checks whether the cache exists in the sotabench.com database - if so
    then sets self.results to cached results and returns True.

    You can use this property for control flow to break a for loop over a
    dataset after the first iteration. This prevents rerunning the same
    calculation for the same model twice.

    Examples:
        Breaking a for loop

        .. code-block:: python

            ...

            with torch.no_grad():
                for i, (input, target) in enumerate(iterator):
                    ...
                    output = model(input)
                    # optional formatting of output here to be a list of detection dicts
                    evaluator.add(output)

                    if evaluator.cache_exists:
                        break

            evaluator.save()

    :return: bool or None (if not on server)
    """
    if not is_server():  # we only check the cache on the server
        return None

    if not self.first_batch_processed:
        return False

    if self._cache_exists is not None:
        return self._cache_exists

    client = Client.public()
    cached_res = client.get_results_by_run_hash(self.batch_hash)
    if cached_res:
        self.results = cached_res
        self.cached_results = True
        print("No model change detected (using the first batch run "
              "hash). Will use cached results.")
        self._cache_exists = True
    else:
        self._cache_exists = False
    return self._cache_exists
def __init__(self,
             local_root: str = '.',
             dataset_filename: str = None,
             model_name: str = None,
             paper_arxiv_id: str = None,
             paper_pwc_id: str = None,
             paper_results: dict = None,
             model_description=None,
             version: SQuADVersion = SQuADVersion.V20):
    """
    Creates an evaluator for SQuAD v1.1 or v2.0 Question Answering
    benchmarks.

    :param local_root: Path to the directory where the dataset files are
        located locally. Ignored when run on sotabench server.
    :param dataset_filename: Local filename of the JSON file with the
        SQuAD dataset. If None, the standard filename is used, based on
        :param:`version`. Ignored when run on sotabench server.
    :param model_name: The name of the model from the paper - if you want
        to link your build to a model from a machine learning paper. See
        the SQuAD benchmarks pages for model names (e.g.,
        https://sotabench.com/benchmarks/question-answering-on-squad11-dev)
        on the paper leaderboard or models yet to try tabs.
    :param paper_arxiv_id: Optional linking to arXiv if you want to link
        to papers on the leaderboard; put in the corresponding paper's
        arXiv ID, e.g. '1907.10529'.
    :param paper_pwc_id: Optional linking to Papers With Code; put in the
        corresponding papers with code URL slug, e.g.
        'spanbert-improving-pre-training-by'
    :param paper_results: If the paper model you are reproducing does not
        have model results on sotabench.com, you can specify the paper
        results yourself through this argument, where keys are metric
        names and values are metric values, e.g.:

            {'EM': 0.858, 'F1': 0.873}

        Ensure that the metric names match those on the sotabench
        leaderboard - for SQuAD benchmarks it should be `EM` for exact
        match and `F1` for F1 score. Make sure to use results of
        evaluation on a development set.
    :param model_description: Optional model description.
    :param version: Which dataset to evaluate on, either
        `SQuADVersion.V11` or `SQuADVersion.V20`.
    """
    super().__init__(model_name, paper_arxiv_id, paper_pwc_id,
                     paper_results, model_description)
    self.root = change_root_if_server(root=local_root,
                                      server_root=".data/nlp/squad")
    self.version = version
    if dataset_filename is None or is_server():
        dataset_filename = "dev-{}.json".format(version.value)
    self.dataset_path = Path(self.root) / dataset_filename

    self.metrics = SQuADMetrics(self.dataset_path, version)
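# Usage sketch (assumptions: the question id and answer below are hypothetical
# placeholders; `add` takes a mapping from SQuAD question ids to predicted
# answer strings, as the metric names in the docstring above suggest).
evaluator = SQuADEvaluator(
    local_root="data/nlp/squad",
    paper_arxiv_id="1907.10529",
    version=SQuADVersion.V20,
)
evaluator.add({"<question-id>": "predicted answer"})
evaluator.save()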
def run_benchmark(model_url: str, model_name: str, version: SQuADVersion):
    evaluator = SQuADEvaluator(local_root="data/nlp/squad",
                               model_name=model_name,
                               paper_arxiv_id="1907.10529",
                               version=version)
    model = run_squad.BertForQuestionAnswering.from_pretrained(model_url)
    settings = get_default_settings(evaluator.version)
    tokenizer = run_squad.BertTokenizer.from_pretrained(
        "spanbert-large-cased", do_lower_case=False)
    device = torch.device("cuda")
    model.to(device)

    eval_examples = run_squad.read_squad_examples(
        input_file=evaluator.dataset_path, is_training=False,
        version_2_with_negative=settings.version_2_with_negative)

    # When on the sotabench server, run the pipeline on a small subset first
    # and compare the results with the cache to avoid recomputing on the
    # whole dataset.
    cache_exists = False
    if is_server():
        small_examples = eval_examples[::100]
        answers = evaluate(model, tokenizer, device, small_examples, settings)
        evaluator.add(answers)
        if evaluator.cache_exists:
            cache_exists = True
        else:
            evaluator.reset()
        evaluator.reset_time()

    if not cache_exists or not is_server():
        answers = evaluate(model, tokenizer, device, eval_examples, settings)
        evaluator.add(answers)

    evaluator.save()
    print(evaluator.results)
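# Usage sketch (assumption: the model URL/identifier and model name below are
# placeholders; substitute whichever SpanBERT checkpoint your copy of
# `run_squad` can load and the leaderboard name you want to link to).
run_benchmark(model_url="path/to/spanbert-large-squad2",
              model_name="SpanBERT (large)",
              version=SQuADVersion.V20)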
def cache_exists(self):
    """
    Checks whether the cache exists in the sotabench.com database - if so
    then sets self.results to cached results and returns True.

    You can use this property for control flow to break a for loop over a
    dataset after the first iteration. This prevents rerunning the same
    calculation for the same model twice.

    Examples:
        Breaking a for loop

        .. code-block:: python

            ...

            with torch.no_grad():
                for i, (input, target) in enumerate(iterator):
                    ...
                    output = model(input)
                    # output and target should then be flattened into 1D
                    # np.ndarrays and passed in below
                    evaluator.update(output=output, target=target)

                    if evaluator.cache_exists:
                        break

            evaluator.save()

    :return: bool or None (if not on server)
    """
    if not self.first_batch_processed:
        raise ValueError(
            'No batches of data have been processed so no batch_hash exists')

    if not is_server():
        return None

    client = Client.public()
    cached_res = client.get_results_by_run_hash(self.batch_hash)
    if cached_res:
        self.results = cached_res
        self.cached_results = True
        print("No model change detected (using the first batch run "
              "hash). Will use cached results.")
        return True

    return False
def __init__(self,
             local_root: str = '.',
             dataset_filename: str = None,
             model_name: str = None,
             paper_arxiv_id: str = None,
             paper_pwc_id: str = None,
             paper_results: dict = None,
             model_description=None,
             version: SQuADVersion = SQuADVersion.V20):
    super().__init__(model_name, paper_arxiv_id, paper_pwc_id,
                     paper_results, model_description)
    self.root = change_root_if_server(root=local_root,
                                      server_root=".data/nlp/squad")
    self.version = version
    if dataset_filename is None or is_server():
        dataset_filename = "dev-{}.json".format(version.value)
    self.dataset_path = Path(self.root) / dataset_filename

    self.metrics = SQuADMetrics(self.dataset_path, version)
def get_datasets(versions):
    squad_links = {
        SQuADVersion.V11:
            "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v1.1.json",
        SQuADVersion.V20:
            "https://rajpurkar.github.io/SQuAD-explorer/dataset/dev-v2.0.json",
    }
    filenames = {
        SQuADVersion.V11: "dev-v1.1.json",
        SQuADVersion.V20: "dev-v2.0.json",
    }
    data_dir = Path(".data") if is_server() else Path("data")
    datasets_path = data_dir / "nlp" / "squad"
    datasets_path.mkdir(parents=True, exist_ok=True)
    for version in versions:
        filename = datasets_path / filenames[version]
        if not filename.exists():
            download_url_to_file(squad_links[version], filename)
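# Usage sketch: fetch both dev sets before benchmarking (assumption:
# `download_url_to_file` is torch.hub.download_url_to_file, as the call
# signature above suggests).
get_datasets([SQuADVersion.V11, SQuADVersion.V20])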
def evaluate(pretrained_name):
    model = ULMFiT().from_pretrained_(pretrained_name)
    if is_server():
        wikitext_folder = WikiText103Evaluator.dataset.get_path(
            local_root="unused")
    else:
        wikitext_folder = untar_data(URLs.WIKITEXT)

    ds = model.arch.dataset(wikitext_folder,
                            tokenizer=model.pretrain_lm.tokenizer)
    test_df = ds.read_data(ds.tst_path)
    data_lm = ds.databunch_from_df(TextLMDataBunch, test_df, test_df,
                                   bs=20, bptt=70)
    learn = model.finetune_lm.get_learner(data_lm)
    full_data = np.concatenate(data_lm.valid_ds.items)

    evaluator = WikiText103Evaluator(
        model_name="Multifit (slim)",
        model_description=pretrained_name,
        paper_arxiv_id="1909.04761",
        local_root=str(wikitext_folder))

    learn.loss_func = None
    dev = torch.device("cuda")
    evaluator.reset()
    batches = iterate_over_batches(torch.tensor(full_data), bs=200, bptt=70)
    for x, y in progress_bar(batches, total=len(full_data) // 200 // 70):
        logits = learn.pred_batch(batch=[x.to(dev), y.to(dev)])
        log_probs = torch.log_softmax(logits, -1)
        evaluator.add(log_probs, y)
        if evaluator.cache_exists:
            break
    evaluator.save()
    print(pretrained_name)
    evaluator.print_results()
    return evaluator.results
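# Usage sketch (assumption: the checkpoint name below is a hypothetical
# placeholder for whatever `ULMFiT().from_pretrained_` resolves in this repo).
results = evaluate("multifit_slim_wikitext103")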
    _entry('regnety_064', 'RegNetY-6.4GF', '2003.13678'),
    _entry('regnety_080', 'RegNetY-8.0GF', '2003.13678'),
    _entry('regnety_120', 'RegNetY-12GF', '2003.13678'),
    _entry('regnety_160', 'RegNetY-16GF', '2003.13678'),
    _entry('regnety_320', 'RegNetY-32GF', '2003.13678',
           batch_size=BATCH_SIZE // 2),

    _entry('rexnet_100', 'ReXNet-1.0x', '2007.00992'),
    _entry('rexnet_130', 'ReXNet-1.3x', '2007.00992'),
    _entry('rexnet_150', 'ReXNet-1.5x', '2007.00992'),
    _entry('rexnet_200', 'ReXNet-2.0x', '2007.00992'),

    _entry('vit_small_patch16_224', 'ViT-S/16', None),
    _entry('vit_base_patch16_224', 'ViT-B/16', None),
]

if is_server():
    DATA_ROOT = './.data/vision/imagenet'
else:  # local settings
    DATA_ROOT = './'
DATA_FILENAME = 'ILSVRC2012_img_val.tar'
TAR_PATH = os.path.join(DATA_ROOT, DATA_FILENAME)

for m in model_list:
    model_name = m['model']

    # create model from name
    model = create_model(model_name, pretrained=True)
    param_count = sum([m.numel() for m in model.parameters()])
    print('Model %s, %s created. Param count: %d' %
          (model_name, m['paper_model_name'], param_count))

    dataset = DatasetTar(TAR_PATH)
def __init__(self, dataset: WMTDataset, source_lang: Language,
             target_lang: Language, local_root: str = '.',
             source_dataset_filename: str = None,
             target_dataset_filename: str = None,
             model_name: str = None, paper_arxiv_id: str = None,
             paper_pwc_id: str = None, paper_results: dict = None,
             model_description: str = None,
             tokenization: Callable[[str], str] = None):
    """
    Creates an evaluator for one of the WMT benchmarks.

    :param dataset: Which dataset to evaluate on, e.g.,
        WMTDataset.News2014.
    :param source_lang: Source language of the documents to translate.
    :param target_lang: Target language into which the documents are
        translated.
    :param local_root: Path to the directory where the dataset files are
        located locally. Ignored when run on sotabench server.
    :param source_dataset_filename: Local filename of the SGML file with
        the source documents. If None, the standard WMT filename is used,
        based on :param:`dataset`, :param:`source_lang` and
        :param:`target_lang`. Ignored when run on sotabench server.
    :param target_dataset_filename: Local filename of the SGML file with
        the reference documents. If None, the standard WMT filename is
        used, based on :param:`dataset`, :param:`source_lang` and
        :param:`target_lang`. Ignored when run on sotabench server.
    :param model_name: The name of the model from the paper - if you want
        to link your build to a model from a machine learning paper. See
        the WMT benchmarks pages for model names (e.g.,
        https://sotabench.com/benchmarks/machine-translation-on-wmt2014-english-german)
        on the paper leaderboard or models yet to try tabs.
    :param paper_arxiv_id: Optional linking to arXiv if you want to link
        to papers on the leaderboard; put in the corresponding paper's
        arXiv ID, e.g. '1907.06616'.
    :param paper_pwc_id: Optional linking to Papers With Code; put in the
        corresponding papers with code URL slug, e.g.
        'facebook-fairs-wmt19-news-translation-task'
    :param paper_results: If the paper model you are reproducing does not
        have model results on sotabench.com, you can specify the paper
        results yourself through this argument, where keys are metric
        names and values are metric values, e.g.:

            {'SacreBLEU': 42.7, 'BLEU score': 43.1}

        Ensure that the metric names match those on the sotabench
        leaderboard - for WMT benchmarks it should be `SacreBLEU` for
        de-tokenized case-sensitive BLEU score and `BLEU score` for
        tokenized BLEU.
    :param model_description: Optional model description.
    :param tokenization: An optional tokenization function to compute
        tokenized BLEU score. It takes a single string - a segment to
        tokenize - and returns a string with tokens separated by space,
        e.g.:

            tokenization = lambda seg: seg.replace("'s", " 's").replace("-", " - ")

        If None, only the de-tokenized SacreBLEU score is reported.
    """
    super().__init__(model_name, paper_arxiv_id, paper_pwc_id,
                     paper_results, model_description)
    self.root = change_root_if_server(root=local_root,
                                      server_root=".data/nlp/wmt")
    self.dataset = dataset
    self.source_lang = source_lang
    self.target_lang = target_lang

    default_src_fn, default_dst_fn = self._get_source_dataset_filename()
    if source_dataset_filename is None or is_server():
        source_dataset_filename = default_src_fn

    if target_dataset_filename is None or is_server():
        target_dataset_filename = default_dst_fn

    self.source_dataset_path = Path(self.root) / source_dataset_filename
    self.target_dataset_path = Path(self.root) / target_dataset_filename

    self.metrics = TranslationMetrics(self.source_dataset_path,
                                      self.target_dataset_path,
                                      tokenization)
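# Usage sketch (assumptions: the `Language` enum member names and the model
# name below are illustrative; only `WMTDataset.News2014` and the constructor
# arguments come from the docstring above).
evaluator = WMTEvaluator(
    dataset=WMTDataset.News2014,
    source_lang=Language.English,
    target_lang=Language.German,
    local_root="data/nlp/wmt",
    paper_arxiv_id="1907.06616",
)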
import numpy as np
import torch
import yaml
from sotabencheval.object_detection import COCOEvaluator
from sotabencheval.utils import is_server
from tqdm import tqdm

from models.experimental import attempt_load
from utils.datasets import create_dataloader
from utils.general import (coco80_to_coco91_class, check_dataset, check_file,
                           check_img_size, compute_loss, non_max_suppression,
                           scale_coords, xyxy2xywh, clip_coords, set_logging)
from utils.torch_utils import select_device, time_synchronized

DATA_ROOT = './.data/vision/coco' if is_server() else '../coco'  # sotabench data dir


def test(data,
         weights=None,
         batch_size=16,
         imgsz=640,
         conf_thres=0.001,
         iou_thres=0.6,  # for NMS
         save_json=False,
         single_cls=False,
         augment=False,
         verbose=False,
         model=None,
         dataloader=None,
import os

import numpy as np
import PIL
import torch
from torch.utils.data import DataLoader
import torchvision.transforms as transforms
from torchvision.datasets import ImageNet

from efficientnet_pytorch import EfficientNet
from sotabencheval.image_classification import ImageNetEvaluator
from sotabencheval.utils import is_server

if is_server():
    DATA_ROOT = './.data/vision/imagenet'
else:  # local settings
    # use .get so the assert below fires instead of a raw KeyError
    DATA_ROOT = os.environ.get('IMAGENET_DIR')
    assert DATA_ROOT, 'please set IMAGENET_DIR environment variable'
    print('Local data root: ', DATA_ROOT)

model_name = 'EfficientNet-B5'
model = EfficientNet.from_pretrained(model_name.lower())
image_size = EfficientNet.get_image_size(model_name.lower())

input_transform = transforms.Compose([
    transforms.Resize(image_size, PIL.Image.BICUBIC),
    transforms.CenterCrop(image_size),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406],
                         std=[0.229, 0.224, 0.225]),
])
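# Sketch of how the transform typically feeds evaluation (assumptions: the
# loop mirrors the ImageNetEvaluator pattern shown in the cache_exists
# docstrings above; batch size and worker count are illustrative).
model.cuda().eval()
dataset = ImageNet(DATA_ROOT, split='val', transform=input_transform)
loader = DataLoader(dataset, batch_size=128, shuffle=False,
                    num_workers=4, pin_memory=True)
evaluator = ImageNetEvaluator(model_name=model_name,
                              paper_arxiv_id='1905.11946')
with torch.no_grad():
    for i, (input, target) in enumerate(loader):
        output = model(input.cuda(non_blocking=True))
        image_ids = [img[0].split('/')[-1].replace('.JPEG', '')
                     for img in loader.dataset.imgs[
                         i * loader.batch_size:(i + 1) * loader.batch_size]]
        evaluator.add(dict(zip(image_ids, list(output.cpu().numpy()))))
        if evaluator.cache_exists:
            break
evaluator.save()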
    scale_coords,
    xyxy2xywh,
    clip_coords,
    plot_images,
    xywh2xyxy,
    box_iou,
    output_to_target,
    ap_per_class,
    set_logging,
)
from utils.torch_utils import select_device, time_synchronized

from sotabencheval.object_detection import COCOEvaluator
from sotabencheval.utils import is_server

DATA_ROOT = "./.data/vision/coco" if is_server() else "../coco"  # sotabench data dir


def test(data,
         weights=None,
         batch_size=16,
         imgsz=640,
         conf_thres=0.001,
         iou_thres=0.6,  # for NMS
         save_json=False,
         single_cls=False,
         augment=False,
         verbose=False,
         model=None,
         dataloader=None,