Пример #1
0
 def __getitem__(self, key):
     """Build the ``Download`` described by the config entry ``key``.

     An entry with a ``url`` gets a ``RequestsDownload`` source, preceded
     by a ``LocalDownload`` lookup (keyed on ``expected_md5``) unless
     ``skip_local`` is set or no checksum is given. An entry with only
     ``instructions`` gets a single ``LocalDownload`` source whose message
     is the instructions text formatted with the expected local path.

     Args:
         key: name of the entry to look up in ``self.contents()``.

     Returns:
         A ``Download`` built from the collected sources, the entry's
         ``expected_md5`` (if any), the resolved cache path and
         ``self._dua``.

     Raises:
         RuntimeError: if the entry has neither ``url`` nor
             ``instructions``.
     """
     entry = self.contents()[key]

     # Optional cache location, joined onto the base path when one is set.
     cache_path = None
     if 'cache_path' in entry:
         cache_path = entry['cache_path']
         if self._base_path:
             cache_path = os.path.join(self._base_path, cache_path)

     sources = []
     if 'url' in entry:
         url = entry['url']
         if entry.get('expected_md5') and not entry.get('skip_local'):
             # Give the user a well-known spot to symlink an existing copy.
             local_path = (Path(util.home_path()) / 'downloads'
                           / entry['expected_md5'])
             local_path.parent.mkdir(parents=True, exist_ok=True)
             hint = (f'If you have a local copy of {url}, you can symlink it here '
                     f'to avoid downloading it again: {local_path}')
             sources.append(LocalDownload(local_path, hint))
         sources.append(RequestsDownload(url))
     elif 'instructions' in entry:
         # No URL available: the file has to be supplied locally.
         if 'cache_path' in entry:
             local_path = Path(cache_path)
         else:
             local_path = (Path(util.home_path()) / 'downloads'
                           / entry['expected_md5'])
         message = entry['instructions'].format(path=local_path)
         sources.append(LocalDownload(local_path, message))
     else:
         raise RuntimeError('Must either provide url or instructions')

     return Download(
         sources,
         expected_md5=entry.get('expected_md5'),
         cache_path=cache_path,
         dua=self._dua,
     )
Пример #2
0
 def __getitem__(self, key):
     """Build the ``Download`` described by the config entry ``key``.

     For an entry with a ``url`` the sources are, in order:

     1. a ``LocalDownload`` keyed on ``expected_md5`` (skipped when
        ``skip_local`` is set, no checksum is given, or ``size_hint`` is
        below the ``IR_DATASETS_SMALL_FILE_SIZE`` threshold, which
        defaults to 5000000);
     2. a ``GoogleDriveDownload`` for ``https://drive.google.com/`` URLs,
        otherwise a ``RequestsDownload``;
     3. a ``RequestsDownload`` on the ir-datasets mirror when
        ``irds_mirror`` is set and a checksum is given.

     For an entry with only ``instructions`` a single ``LocalDownload`` is
     used, whose message is the instructions formatted with the expected
     local path.

     Args:
         key: name of the entry to look up in ``self.contents()``.

     Returns:
         A ``Download`` built from the collected sources and the entry's
         ``expected_md5``, cache path, ``stream`` and ``size_hint``
         settings, plus ``self._dua``.

     Raises:
         RuntimeError: if the entry has neither ``url`` nor
             ``instructions``.
     """
     dlc = self.contents()[key]
     sources = []
     cache_path = None
     # Work on a copy: adding ``auth`` below must not write back into the
     # config entry returned by ``self.contents()``.
     download_args = dict(dlc.get('download_args') or {})
     if 'auth' in dlc:
         download_args['auth'] = dlc['auth']
     if 'cache_path' in dlc:
         if self._base_path:
             cache_path = os.path.join(self._base_path, dlc['cache_path'])
         else:
             cache_path = dlc['cache_path']
     if 'url' in dlc:
         small_file_size = int(os.environ.get('IR_DATASETS_SMALL_FILE_SIZE', '5000000'))
         # Entries without a size hint are treated as not-small, so they
         # still get the local-copy lookup.
         if (not dlc.get('skip_local')
                 and dlc.get('expected_md5')
                 and not (dlc.get('size_hint', small_file_size) < small_file_size)):
             local_path = Path(self.get_download_path()) / dlc['expected_md5']
             local_msg = (f'If you have a local copy of {dlc["url"]}, you can symlink it here '
                          f'to avoid downloading it again: {local_path}')
             sources.append(LocalDownload(local_path, local_msg, mkdir=False))
         if dlc['url'].startswith('https://drive.google.com/'):
             sources.append(GoogleDriveDownload(dlc['url'], **download_args))
         else:
             sources.append(RequestsDownload(dlc['url'], **download_args))
         if dlc.get('irds_mirror') and dlc.get('expected_md5'):
             # this file has the irds mirror to fall back on
             sources.append(RequestsDownload(f'https://mirror.ir-datasets.com/{dlc["expected_md5"]}'))
     elif 'instructions' in dlc:
         if 'cache_path' in dlc:
             local_path = Path(cache_path)
         else:
             local_path = Path(util.home_path()) / 'downloads' / dlc['expected_md5']
         sources.append(LocalDownload(local_path, dlc['instructions'].format(path=local_path)))
     else:
         raise RuntimeError('Must either provide url or instructions')
     return Download(
         sources,
         expected_md5=dlc.get('expected_md5'),
         cache_path=cache_path,
         dua=self._dua,
         stream=dlc.get('stream', False),
         size_hint=dlc.get('size_hint'),
     )
Пример #3
0
def _init():
    """Build and register the base dataset and every subset dataset.

    One ``ArgsMeDocs`` is created per entry in ``SUBSETS`` and one
    ``ArgsMeCombinedArguments`` per entry in ``COMBINED_SUBSETS``. Each is
    wrapped in a documented ``Dataset`` and registered under
    ``{NAME}/{name}``.

    Returns:
        A ``(base, datasets)`` tuple: the base dataset and a dict mapping
        each subset name to its ``Dataset``.
    """
    base_path = home_path() / NAME

    documentation = YamlDocumentation(f"docs/{NAME}.yaml")
    download_config = DownloadConfig.context(NAME, base_path)

    base = Dataset(documentation('_'))

    # Arguments that can be loaded from Zenodo.
    arguments: Dict[str, ArgsMeDocs] = {}
    for name, (count_hint, language, zip_path) in SUBSETS.items():
        extracted = ZipExtract(download_config[name], zip_path)
        arguments[name] = ArgsMeDocs(
            Cache(extracted, base_path / f"{name}.json"),
            namespace=f"{NAME}/{name}",
            language=language,
            count_hint=count_hint,
        )

    # Arguments that are combined versions of other subsets.
    combined_arguments: Dict[str, ArgsMeCombinedArguments] = {}
    for name, (subset_names, count_hint, language) in COMBINED_SUBSETS.items():
        combined_path = base_path / f"{name}.json"
        parts = [arguments[subset_name] for subset_name in subset_names]
        combined_arguments[name] = ArgsMeCombinedArguments(
            combined_path,
            parts,
            namespace=f"{NAME}/{name}",
            language=language,
            count_hint=count_hint,
        )

    # Wrap in datasets with documentation.
    datasets = {}
    for name, docs in chain(arguments.items(), combined_arguments.items()):
        datasets[name] = Dataset(docs, documentation(name))

    # NOTE: the following datasets are defined in touche.py:
    #  - argsme/1.0/touche-2020-task-1/uncorrected
    #  - argsme/2020-04-01/touche-2020-task-1
    #  - argsme/2020-04-01/touche-2020-task-1/uncorrected
    #  - argsme/2020-04-01/touche-2021-task-1

    # Register datasets.
    registry.register(NAME, base)
    for name, dataset in datasets.items():
        registry.register(f'{NAME}/{name}', dataset)

    return base, datasets
Пример #4
0
 def _handle_auth(self, http_args):
     """Set ``http_args['auth']`` to a ``(username, password)`` tuple.

     Credentials are read from the file ``auth/<self.auth>`` under
     ``util.home_path()`` when it exists (first line: username, second
     line: password). Otherwise the user is prompted for both on the
     console. The ``auth`` directory is created if it is missing.

     Args:
         http_args: dict of request arguments, updated in place. It must
             contain a ``url`` key when prompting is needed, because the
             log message and prompts are formatted with it.

     Raises:
         RuntimeError: if the credentials file has fewer than two lines.
     """
     auth_dir = util.home_path() / 'auth'
     if not auth_dir.exists():
         auth_dir.mkdir(parents=True, exist_ok=True)
     auth_path = auth_dir / self.auth
     if auth_path.exists():
         with auth_path.open('rt') as fin:
             lines = fin.read().split('\n')
         if len(lines) < 2:
             raise RuntimeError(f'{str(auth_path)} in incorrect format. Set the first line as the username and the second line as the password.')
         uname, pwd = lines[0].strip(), lines[1].strip()
         http_args['auth'] = (uname, pwd)
     else:
         # The adjacent string literals are concatenated, so every piece must
         # carry its own separating space ("a file named", not "a filenamed").
         _logger.info('To download {url}, you need to enter a username and password. To avoid this message in the future, you may '
                      'also set them in a file named {auth_path}, with the first line as the username and the second line as the '
                      'password.'.format(auth_path=str(auth_path), **http_args))
         uname = input('enter username for {url}: '.format(**http_args))
         # NOTE(review): input() echoes the typed password to the terminal;
         # consider getpass.getpass if hidden entry is wanted.
         pwd = input('enter password for {url}: '.format(**http_args))
         http_args['auth'] = (uname, pwd)
Пример #5
0
def _init():
    """Build and register the task datasets and their sub-datasets.

    Each base dataset combines a docs handler taken from an already
    registered corpus (``argsme/2020-04-01`` or ``clueweb12``) with
    queries, qrels and documentation for one task. The sub-datasets reuse
    the queries handler of a registered base dataset, so the base datasets
    are registered first.

    Returns:
        A ``(task_base_datasets, task_sub_datasets)`` tuple of dicts
        mapping dataset id to ``Dataset``.
    """
    base_path = home_path() / NAME

    documentation = YamlDocumentation(f"docs/{NAME}.yaml")
    download_config = DownloadConfig.context(NAME, base_path)

    def cached_download(name: str, extension: str) -> Cache:
        # Cache the download for ``name`` at <base_path>/<name>.<extension>.
        source = download_config[name]
        return Cache(source, base_path / f"{name}.{extension}")

    def cached_zip_download(name: str, zip_path: str, extension: str) -> Cache:
        # Same as above, but caches the ``zip_path`` member of the archive.
        extracted = ZipExtract(download_config[name], zip_path)
        return Cache(extracted, base_path / f"{name}.{extension}")

    # Dataset ids, named once so keys and namespaces stay in sync.
    argsme_2020_task_1 = f"argsme/2020-04-01/{NAME}-2020-task-1"
    clueweb_2020_task_2 = f"clueweb12/{NAME}-2020-task-2"
    argsme_2021_task_1 = f"argsme/2020-04-01/{NAME}-2021-task-1"
    clueweb_2021_task_2 = f"clueweb12/{NAME}-2021-task-2"

    # Define and create task datasets.
    task_base_datasets = {
        argsme_2020_task_1: Dataset(
            registry["argsme/2020-04-01"].docs_handler(),
            ToucheQueries(
                cached_zip_download(
                    "2020/task-1/queries", "topics-task-1.xml", "xml"),
                namespace=f"{argsme_2020_task_1}/queries",
                language="en",
            ),
            # NOTE(review): this uses the 2021 relevance definitions, while
            # the "uncorrected" 2020 task-1 variants below use
            # QRELS_DEFS_2020_TASK_1 -- confirm this is intended.
            ToucheQrels(
                cached_download("2020/task-1/qrels", "qrels"),
                QRELS_DEFS_2021_TASK_1_RELEVANCE,
            ),
            documentation("2020/task-1"),
        ),
        clueweb_2020_task_2: Dataset(
            registry["clueweb12"].docs_handler(),
            ToucheQueries(
                cached_zip_download(
                    "2020/task-2/queries", "topics-task-2.xml", "xml"),
                namespace=f"{clueweb_2020_task_2}/queries",
                language="en",
            ),
            ToucheQrels(
                cached_download("2020/task-2/qrels", "qrels"),
                QRELS_DEFS_2020_TASK_2,
            ),
            documentation("2020/task-2"),
        ),
        argsme_2021_task_1: Dataset(
            registry["argsme/2020-04-01"].docs_handler(),
            ToucheQueries(
                cached_zip_download(
                    "2021/task-1/queries",
                    "topics-task-1-only-titles.xml",
                    "xml"),
                namespace=f"{argsme_2021_task_1}/queries",
                language="en",
                has_description=False,
            ),
            ToucheQualityQrels(
                ToucheQrels(
                    cached_download("2021/task-1/qrels-relevance", "qrels"),
                    QRELS_DEFS_2021_TASK_1_RELEVANCE,
                ),
                ToucheQrels(
                    cached_download("2021/task-1/qrels-quality", "qrels"),
                    QRELS_DEFS_2021_TASK_1_QUALITY,
                ),
            ),
            documentation("2021/task-1"),
        ),
        clueweb_2021_task_2: Dataset(
            registry["clueweb12"].docs_handler(),
            ToucheQueries(
                cached_zip_download(
                    "2021/task-2/queries", "topics-task2-51-100.xml", "xml"),
                namespace=f"{clueweb_2021_task_2}/queries",
                language="en",
            ),
            ToucheQualityQrels(
                ToucheQrels(
                    cached_download("2021/task-2/qrels-relevance", "qrels"),
                    QRELS_DEFS_2021_TASK_2_RELEVANCE,
                ),
                ToucheQrels(
                    cached_download("2021/task-2/qrels-quality", "qrels"),
                    QRELS_DEFS_2021_TASK_2_QUALITY,
                ),
            ),
            documentation("2021/task-2"),
        ),
    }
    for dataset_id, dataset in task_base_datasets.items():
        registry.register(dataset_id, dataset)

    # Define and create task sub-datasets. They look up a base dataset in
    # the registry, which is why the base datasets are registered above.
    argsme_1_0_uncorrected = f"argsme/1.0/{NAME}-2020-task-1/uncorrected"
    argsme_2020_uncorrected = f"{argsme_2020_task_1}/uncorrected"
    task_sub_datasets = {
        argsme_1_0_uncorrected: Dataset(
            registry["argsme/1.0"].docs_handler(),
            registry[argsme_2020_task_1].queries_handler(),
            ToucheQrels(
                cached_download(
                    "2020/task-1/qrels-argsme-1.0-uncorrected", "qrels"),
                QRELS_DEFS_2020_TASK_1,
            ),
            documentation("2020/task-1/argsme-1.0/uncorrected"),
        ),
        argsme_2020_uncorrected: Dataset(
            registry["argsme/2020-04-01"].docs_handler(),
            registry[argsme_2020_task_1].queries_handler(),
            ToucheQrels(
                cached_download(
                    "2020/task-1/qrels-argsme-2020-04-01-uncorrected",
                    "qrels"),
                QRELS_DEFS_2020_TASK_1,
            ),
            documentation("2020/task-1/argsme-2020-04-01/uncorrected"),
        ),
    }
    for dataset_id, dataset in task_sub_datasets.items():
        registry.register(dataset_id, dataset)

    return task_base_datasets, task_sub_datasets
Пример #6
0
 def get_home_path(self):
     """Return the home path, resolving and storing it on first use.

     When ``self.home_path`` is ``None`` it is filled in from
     ``util.home_path()``; later calls return the stored value.
     """
     current = self.home_path
     if current is not None:
         return current
     self.home_path = util.home_path()
     return self.home_path