def wrapped(name, base_path, logger):
    """Ensure the named model directory exists locally and return its path.

    On a cache miss, download the tarball (``url`` comes from the enclosing
    scope) into the directory, unpack the bundled ``weights.tar.gz`` in
    place, and delete the archive afterwards.
    """
    target = os.path.join(base_path, name)
    if os.path.exists(target):
        # Already materialized on disk; nothing to fetch.
        return target
    _download_tarball(url, target, logger)
    archive = os.path.join(target, 'weights.tar.gz')
    util.extract_tarball(archive, target, logger, reset_permissions=True)
    os.remove(archive)
    return target
def wrapped(name, base_path, logger):
    """Ensure the named BERT model directory exists locally and return its path.

    On a cache miss: download the tarball (``url`` and ``expected_md5`` come
    from the enclosing scope, the checksum being verified by the downloader),
    unpack the bundled ``weights.tar.gz``, delete the archive, and rename
    ``bert_config.json`` to ``config.json``.
    """
    target = os.path.join(base_path, name)
    if os.path.exists(target):
        # Already materialized on disk; nothing to fetch.
        return target
    _download_tarball(url, target, logger, expected_md5=expected_md5)
    archive = os.path.join(target, 'weights.tar.gz')
    util.extract_tarball(archive, target, logger, reset_permissions=True)
    os.remove(archive)
    os.rename(os.path.join(target, 'bert_config.json'),
              os.path.join(target, 'config.json'))
    return target
def _init_iter_collection(self):
    """Yield each document as a RawDoc, with text pulled from a public Anserini index.

    Uses the trick from capreolus of extracting document content out of a
    prebuilt public index:
    <https://github.com/capreolus-ir/capreolus/blob/d6ae210b24c32ff817f615370a9af37b06d2da89/capreolus/collection/robust04.yaml#L15>
    """
    with util.download_tmp(**_FILES['index']) as tmp:
        # Extract the downloaded index tarball next to the temp file.
        extract_dir = f'{tmp.name}.d'
        util.extract_tarball(tmp.name, extract_dir, self.logger, reset_permissions=True)
        index = indices.AnseriniIndex(f'{extract_dir}/index-robust04-20191213')
        for doc_id in self.logger.pbar(index.docids(), desc='documents'):
            yield indices.RawDoc(doc_id, index.get_raw(doc_id))
def _download_tarball(url, path, logger, expected_md5=None):
    """Fetch a .tar.gz, unpack it into ``path``, and flatten one directory level.

    The archive is downloaded to ``path + '.tar.gz'`` (with optional MD5
    verification by the downloader), extracted into ``path``, and removed.
    Entries nested exactly one directory deep — dotfiles included — are then
    moved up into ``path`` itself.
    """
    tarball = path + '.tar.gz'
    util.download_if_needed(url, tarball, expected_md5=expected_md5)
    util.extract_tarball(tarball, path, logger, reset_permissions=True)
    os.remove(tarball)
    # Archives wrap their contents in a single top-level directory; hoist
    # everything (including hidden files) up one level.
    nested = glob(path + '/*/*') + glob(path + '/*/.*')
    for entry in nested:
        shutil.move(entry, path)