def __getitem__(self, key):
    """Resolve the download-config entry ``key`` into a ``Download``.

    An entry must provide either a ``url`` (fetched over HTTP, optionally
    preceded by a user-seeded local copy keyed by MD5) or ``instructions``
    (the user must place the file manually); anything else raises.
    """
    dlc = self.contents()[key]

    # Resolve where the artifact is cached on disk; relative paths are
    # anchored under the configured base path when one is set.
    cache_path = None
    if 'cache_path' in dlc:
        cache_path = os.path.join(self._base_path, dlc['cache_path']) if self._base_path else dlc['cache_path']

    sources = []
    if 'url' in dlc:
        offer_local = not dlc.get('skip_local') and dlc.get('expected_md5')
        if offer_local:
            # Users may pre-seed <home>/downloads/<md5> (e.g. via symlink)
            # to avoid re-downloading; try that location first.
            local_path = Path(util.home_path()) / 'downloads' / dlc['expected_md5']
            local_path.parent.mkdir(parents=True, exist_ok=True)
            local_msg = (
                f'If you have a local copy of {dlc["url"]}, you can symlink it here '
                f'to avoid downloading it again: {local_path}')
            sources.append(LocalDownload(local_path, local_msg))
        sources.append(RequestsDownload(dlc['url']))
    elif 'instructions' in dlc:
        # Manual acquisition: tell the user exactly where to put the file.
        if 'cache_path' in dlc:
            local_path = Path(cache_path)
        else:
            local_path = Path(util.home_path()) / 'downloads' / dlc['expected_md5']
        sources.append(LocalDownload(local_path, dlc['instructions'].format(path=local_path)))
    else:
        raise RuntimeError('Must either provide url or instructions')

    return Download(sources, expected_md5=dlc.get('expected_md5'), cache_path=cache_path, dua=self._dua)
def __getitem__(self, key):
    """Resolve the download-config entry ``key`` into a ``Download``.

    Supports ``url`` entries (plain HTTP, Google Drive, optional basic
    ``auth``, optional irds mirror fallback) and ``instructions`` entries
    (manually-acquired files); anything else raises RuntimeError.
    """
    dlc = self.contents()[key]
    sources = []
    cache_path = None
    # FIX: copy the dict. The previous code took a reference to the
    # 'download_args' dict stored in the contents entry and then assigned
    # download_args['auth'] into it, mutating the (potentially cached and
    # shared) config entry on every lookup.
    download_args = dict(dlc.get('download_args', {}))
    if 'auth' in dlc:
        download_args['auth'] = dlc['auth']
    if 'cache_path' in dlc:
        # Relative cache paths are anchored under the configured base path.
        if self._base_path:
            cache_path = os.path.join(self._base_path, dlc['cache_path'])
        else:
            cache_path = dlc['cache_path']
    if 'url' in dlc:
        # Files smaller than this threshold skip the local-symlink source;
        # they are cheap enough to just download.
        small_file_size = int(os.environ.get('IR_DATASETS_SMALL_FILE_SIZE', '5000000'))
        if not dlc.get('skip_local') and dlc.get('expected_md5') and not dlc.get('size_hint', small_file_size) < small_file_size:
            # Users may pre-seed <download_path>/<md5> to avoid re-downloading.
            local_path = Path(self.get_download_path()) / dlc['expected_md5']
            local_msg = (f'If you have a local copy of {dlc["url"]}, you can symlink it here '
                         f'to avoid downloading it again: {local_path}')
            sources.append(LocalDownload(local_path, local_msg, mkdir=False))
        if dlc['url'].startswith('https://drive.google.com/'):
            # Google Drive links need special handling (confirmation tokens, etc.)
            sources.append(GoogleDriveDownload(dlc['url'], **download_args))
        else:
            sources.append(RequestsDownload(dlc['url'], **download_args))
        if dlc.get('irds_mirror') and dlc.get('expected_md5'):
            # this file has the irds mirror to fall back on
            sources.append(RequestsDownload(f'https://mirror.ir-datasets.com/{dlc["expected_md5"]}'))
    elif 'instructions' in dlc:
        # Manual acquisition: point the user at the expected location.
        if 'cache_path' in dlc:
            local_path = Path(cache_path)
        else:
            local_path = Path(util.home_path()) / 'downloads' / dlc['expected_md5']
        sources.append(LocalDownload(local_path, dlc['instructions'].format(path=local_path)))
    else:
        raise RuntimeError('Must either provide url or instructions')
    return Download(sources, expected_md5=dlc.get('expected_md5'), cache_path=cache_path, dua=self._dua, stream=dlc.get('stream', False), size_hint=dlc.get('size_hint'))
def _init():
    """Build and register the ArgsMe base dataset and all its subsets.

    Returns a ``(base, datasets)`` pair: the documentation-only base
    dataset and a dict of named subset datasets.
    """
    base_path = home_path() / NAME
    documentation = YamlDocumentation(f"docs/{NAME}.yaml")
    download_config = DownloadConfig.context(NAME, base_path)
    base = Dataset(documentation('_'))

    # Arguments that can be loaded from Zenodo.
    zenodo_args: Dict[str, ArgsMeDocs] = {}
    for name, (count_hint, language, zip_path) in SUBSETS.items():
        cached_docs = Cache(
            ZipExtract(download_config[name], zip_path),
            base_path / f"{name}.json")
        zenodo_args[name] = ArgsMeDocs(
            cached_docs,
            namespace=f"{NAME}/{name}",
            language=language,
            count_hint=count_hint)

    # Arguments that are combined versions of other subsets.
    combined_args: Dict[str, ArgsMeCombinedArguments] = {}
    for name, (subset_names, count_hint, language) in COMBINED_SUBSETS.items():
        combined_args[name] = ArgsMeCombinedArguments(
            base_path / f"{name}.json",
            [zenodo_args[subset_name] for subset_name in subset_names],
            namespace=f"{NAME}/{name}",
            language=language,
            count_hint=count_hint)

    # Wrap each docs handler in a Dataset carrying its documentation.
    datasets = {}
    for name, docs in chain(zenodo_args.items(), combined_args.items()):
        datasets[name] = Dataset(docs, documentation(name))

    # NOTE: the following datasets are defined in touche.py:
    #  - argsme/1.0/touche-2020-task-1/uncorrected
    #  - argsme/2020-04-01/touche-2020-task-1
    #  - argsme/2020-04-01/touche-2020-task-1/uncorrected
    #  - argsme/2020-04-01/touche-2021-task-1

    # Register datasets.
    registry.register(NAME, base)
    for name, dataset in datasets.items():
        registry.register(f'{NAME}/{name}', dataset)

    return base, datasets
def _handle_auth(self, http_args):
    """Fill ``http_args['auth']`` with a ``(username, password)`` tuple.

    Credentials are read from ``<home>/auth/<self.auth>`` when that file
    exists (first line username, second line password); otherwise the user
    is prompted interactively and told how to create the file.
    """
    auth_dir = util.home_path() / 'auth'
    if not auth_dir.exists():
        auth_dir.mkdir(parents=True, exist_ok=True)
    auth_path = auth_dir / self.auth
    if auth_path.exists():
        with auth_path.open('rt') as fin:
            lines = fin.read().split('\n')
        if len(lines) < 2:
            raise RuntimeError(f'{str(auth_path)} in incorrect format. Set the first line as the username and the second line as the password.')
        uname, pwd = lines[0].strip(), lines[1].strip()
        http_args['auth'] = (uname, pwd)
    else:
        # FIX: the two adjacent string literals previously read
        # 'a file''named' (no separating space), which rendered as
        # "a filenamed {auth_path}" in the message shown to the user.
        _logger.info('To download {url}, you need to enter a username and password. To avoid this message in the future, you may '
                     'also set them in a file named {auth_path}, with the first line as the username and the second line as the '
                     'password.'.format(auth_path=str(auth_path), **http_args))
        uname = input('enter username for {url}: '.format(**http_args))
        # NOTE(review): getpass.getpass would avoid echoing the password to the
        # terminal, but it changes interactive behavior, so input() is kept.
        pwd = input('enter password for {url}: '.format(**http_args))
        http_args['auth'] = (uname, pwd)
def _init():
    """Build and register the Touché shared-task datasets.

    Returns ``(task_base_datasets, task_sub_datasets)``: the main per-year
    task datasets and the "uncorrected qrels" sub-variants. Docs handlers
    are borrowed from the already-registered argsme/clueweb12 datasets.
    """
    base_path = home_path() / NAME
    documentation = YamlDocumentation(f"docs/{NAME}.yaml")
    download_config = DownloadConfig.context(NAME, base_path)

    # Cache a raw download under base_path with the given extension.
    def cached_download(name: str, extension: str) -> Cache:
        return Cache(download_config[name], base_path / f"{name}.{extension}")

    # Cache a single member extracted from a downloaded zip archive.
    def cached_zip_download(name: str, zip_path: str, extension: str) -> Cache:
        return Cache(ZipExtract(download_config[name], zip_path), base_path / f"{name}.{extension}")

    # Define and create task datasets.
    task_base_datasets = {
        f"argsme/2020-04-01/{NAME}-2020-task-1": Dataset(
            registry["argsme/2020-04-01"].docs_handler(),
            ToucheQueries(
                cached_zip_download("2020/task-1/queries", "topics-task-1.xml", "xml"),
                namespace=f"argsme/2020-04-01/{NAME}-2020-task-1/queries",
                language="en",
            ),
            ToucheQrels(
                cached_download("2020/task-1/qrels", "qrels"),
                # NOTE(review): the 2020 task-1 (corrected) qrels reuse the
                # 2021 task-1 relevance definitions here; the 2020-specific
                # defs are used only for the uncorrected variants below —
                # looks intentional, but confirm against the task guidelines.
                QRELS_DEFS_2021_TASK_1_RELEVANCE,
            ),
            documentation("2020/task-1"),
        ),
        f"clueweb12/{NAME}-2020-task-2": Dataset(
            registry["clueweb12"].docs_handler(),
            ToucheQueries(
                cached_zip_download("2020/task-2/queries", "topics-task-2.xml", "xml"),
                namespace=f"clueweb12/{NAME}-2020-task-2/queries",
                language="en",
            ),
            ToucheQrels(
                cached_download("2020/task-2/qrels", "qrels"),
                QRELS_DEFS_2020_TASK_2,
            ),
            documentation("2020/task-2"),
        ),
        f"argsme/2020-04-01/{NAME}-2021-task-1": Dataset(
            registry["argsme/2020-04-01"].docs_handler(),
            ToucheQueries(
                cached_zip_download("2021/task-1/queries", "topics-task-1-only-titles.xml", "xml"),
                namespace=f"argsme/2020-04-01/{NAME}-2021-task-1/queries",
                language="en",
                # 2021 topics ship title-only (no description element).
                has_description=False,
            ),
            # 2021 tasks judge both relevance and (argument) quality.
            ToucheQualityQrels(
                ToucheQrels(
                    cached_download("2021/task-1/qrels-relevance", "qrels"),
                    QRELS_DEFS_2021_TASK_1_RELEVANCE,
                ),
                ToucheQrels(
                    cached_download("2021/task-1/qrels-quality", "qrels"),
                    QRELS_DEFS_2021_TASK_1_QUALITY,
                ),
            ),
            documentation("2021/task-1"),
        ),
        f"clueweb12/{NAME}-2021-task-2": Dataset(
            registry["clueweb12"].docs_handler(),
            ToucheQueries(
                cached_zip_download("2021/task-2/queries", "topics-task2-51-100.xml", "xml"),
                namespace=f"clueweb12/{NAME}-2021-task-2/queries",
                language="en",
            ),
            ToucheQualityQrels(
                ToucheQrels(
                    cached_download("2021/task-2/qrels-relevance", "qrels"),
                    QRELS_DEFS_2021_TASK_2_RELEVANCE,
                ),
                ToucheQrels(
                    cached_download("2021/task-2/qrels-quality", "qrels"),
                    QRELS_DEFS_2021_TASK_2_QUALITY,
                ),
            ),
            documentation("2021/task-2"),
        ),
    }
    for name, dataset in task_base_datasets.items():
        registry.register(name, dataset)

    # Define and create task sub-datasets.
    # These reuse the corrected datasets' queries but pair them with the
    # original (uncorrected) qrels releases.
    task_sub_datasets = {
        f"argsme/1.0/{NAME}-2020-task-1/uncorrected": Dataset(
            registry["argsme/1.0"].docs_handler(),
            registry[f"argsme/2020-04-01/{NAME}-2020-task-1"].queries_handler(
            ),
            ToucheQrels(
                cached_download("2020/task-1/qrels-argsme-1.0-uncorrected", "qrels"),
                QRELS_DEFS_2020_TASK_1,
            ),
            documentation("2020/task-1/argsme-1.0/uncorrected"),
        ),
        f"argsme/2020-04-01/{NAME}-2020-task-1/uncorrected": Dataset(
            registry["argsme/2020-04-01"].docs_handler(),
            registry[f"argsme/2020-04-01/{NAME}-2020-task-1"].queries_handler(
            ),
            ToucheQrels(
                cached_download(
                    "2020/task-1/qrels-argsme-2020-04-01-uncorrected", "qrels"),
                QRELS_DEFS_2020_TASK_1,
            ),
            documentation("2020/task-1/argsme-2020-04-01/uncorrected"),
        ),
    }
    for name, dataset in task_sub_datasets.items():
        registry.register(name, dataset)

    return task_base_datasets, task_sub_datasets
def get_home_path(self):
    """Lazily resolve the home path, memoizing it on ``self.home_path``."""
    resolved = self.home_path
    if resolved is None:
        # First access: ask util for the home directory and cache it.
        resolved = util.home_path()
        self.home_path = resolved
    return resolved