Example No. 1
def convert_path(path, loc=None):
    """If the provided path doesn't exist search for it relative to loc (or this file)."""
    if os.path.isfile(path) or path.startswith("$") or validate_url(path):
        return path
    if loc is None:
        loc = os.path.dirname(os.path.realpath(__file__))
    return os.path.join(loc, path)
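A minimal usage sketch (assuming convert_path and the validate_url helper it calls are importable from the surrounding module; the base directory and file names below are made up):

import os

base = os.path.expanduser("~/project/configs")      # hypothetical base directory
print(convert_path("config.json", loc=base))        # joined onto base if config.json is not a local file
print(convert_path("$DATA_DIR/config.json"))        # "$"-prefixed paths are returned unchanged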
Example No. 2
    def download(self):
        dload_bundle = self.dataset_desc.get("download", None)
        if dload_bundle is not None:  # download a zip/tar/tar.gz bundle and look for the train, dev, test files inside it
            dcache_path = os.path.join(self.data_download_cache, DATA_CACHE_CONF)
            dcache = read_json(dcache_path)
            if dload_bundle in dcache and \
                    is_dir_correct(dcache[dload_bundle], self.dataset_desc, self.data_download_cache, dload_bundle,
                                   self.enc_dec) and not self.cache_ignore:
                download_dir = dcache[dload_bundle]
                logger.info("files for {} found in cache, not downloading".format(dload_bundle))
                return {k: os.path.join(download_dir, self.dataset_desc[k]) for k in self.dataset_desc
                        if k.endswith("_file")}
            else:  # try to download the bundle and unzip
                if not validate_url(dload_bundle):
                    raise RuntimeError("can not download from the given url")
                else:
                    cache_dir = self.data_download_cache
                    temp_file = web_downloader(dload_bundle)

                    download_dir = extractor(filepath=temp_file, cache_dir=cache_dir,
                                             extractor_func=Downloader.ZIPD.get(mime_type(temp_file), None))
                    if "sha1" in self.dataset_desc:
                        if os.path.split(download_dir)[-1] != self.dataset_desc["sha1"]:
                            raise RuntimeError("The sha1 of the downloaded file does not match with the provided one")
                    dcache.update({dload_bundle: download_dir})
                    write_json(dcache, os.path.join(self.data_download_cache, DATA_CACHE_CONF))
                    return {k: os.path.join(download_dir, self.dataset_desc[k]) for k in self.dataset_desc
                            if k.endswith("_file")}
        else:  # we have download links for each file, or the files already exist locally
            if not self.enc_dec:
                return {k: SingleFileDownloader(self.dataset_desc[k], self.data_download_cache).download()
                        for k in self.dataset_desc if k.endswith("_file") and self.dataset_desc[k]}
            else:
                return {k: self.dataset_desc[k] for k in self.dataset_desc if k.endswith("_file")}
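For reference, a sketch of the dataset description this method consumes, inferred from the keys it reads (download, sha1, and entries ending in _file); the URL, paths, and hash below are placeholders rather than a real dataset index:

dataset_desc = {
    "download": "https://example.com/datasets/sst2.tar.gz",  # bundle to fetch and extract
    "train_file": "sst2/train.txt",                          # paths inside the extracted bundle
    "dev_file": "sst2/dev.txt",
    "test_file": "sst2/test.txt",
    "sha1": "placeholder-sha1-of-extracted-dir",             # optional check against the extracted directory name
}
# download() then returns a dict such as {"train_file": "<cache dir>/<extracted dir>/sst2/train.txt", ...}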
Example No. 3
    def download(self):
        if is_file_correct(self.embedding_file):
            logger.info("embedding file location: {}".format(self.embedding_file))
            return self.embedding_file
        dcache_path = os.path.join(self.data_download_cache, DATA_CACHE_CONF)
        dcache = read_json(dcache_path)
        if self.embedding_file in dcache and not self.cache_ignore:
            download_loc = dcache[self.embedding_file]
            logger.info("files for {} found in cache".format(self.embedding_file))
            return self._get_embedding_file(download_loc, self.embedding_key)
        else:  # try to download the bundle and unzip
            url = self.embedding_file
            if not validate_url(url):
                raise RuntimeError("can not download from the given url")
            else:
                cache_dir = self.data_download_cache
                temp_file = web_downloader(url)
                download_loc = extractor(filepath=temp_file, cache_dir=cache_dir,
                                         extractor_func=Downloader.ZIPD.get(mime_type(temp_file), None))
                if self.sha1 is not None:
                    if os.path.split(download_loc)[-1] != self.sha1:
                        raise RuntimeError("The sha1 of the downloaded file does not match with the provided one")
                dcache.update({url: download_loc})
                write_json(dcache, os.path.join(self.data_download_cache, DATA_CACHE_CONF))
                return self._get_embedding_file(download_loc, self.embedding_key)
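The cache consulted above is a flat JSON file (DATA_CACHE_CONF inside data_download_cache) mapping each URL to the directory it was extracted into; a sketch with made-up entries:

download_cache = {
    "https://example.com/embeddings/glove.6B.zip": "/data/cache/ab12cd34ef",  # url -> extraction directory
}
# On a cache hit the method skips the download entirely and goes straight to
# _get_embedding_file(download_loc, embedding_key).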
Example No. 4
    def download(self):
        file_loc = self.dataset_file
        if is_file_correct(file_loc):
            return file_loc
        elif validate_url(file_loc):  # is it a web URL? check whether it is already in the cache
            url = file_loc
            dcache_path = os.path.join(self.data_download_cache, DATA_CACHE_CONF)
            dcache = read_json(dcache_path)
            if url in dcache and is_file_correct(dcache[url], self.data_download_cache, url) and not self.cache_ignore:
                logger.info("file for {} found in cache, not downloading".format(url))
                return dcache[url]
            else:  # download the file into the cache and update the json
                cache_dir = self.data_download_cache
                logger.info("using {} as data/embeddings cache".format(cache_dir))
                temp_file = web_downloader(url)
                dload_file = extractor(filepath=temp_file, cache_dir=cache_dir,
                                       extractor_func=Downloader.ZIPD.get(mime_type(temp_file), None))
                dcache.update({url: dload_file})
                write_json(dcache, os.path.join(self.data_download_cache, DATA_CACHE_CONF))
                return dload_file
        raise RuntimeError("the file [{}] is not in cache and can not be downloaded".format(file_loc))
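A usage sketch, assuming this method belongs to the SingleFileDownloader class referenced in Example No. 2, constructed with the file (or URL) and the cache directory as used there; names and paths are illustrative:

remote = SingleFileDownloader("https://example.com/data/train.txt", "/data/cache").download()
# -> served from the cache if already recorded in DATA_CACHE_CONF, otherwise downloaded and extracted

local = SingleFileDownloader("/data/train.txt", "/data/cache").download()
# -> an existing, valid local file is returned unchanged
# Anything that is neither an existing file nor a valid URL raises RuntimeError.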