def fetch_archive_from_http(url: str, output_dir: str, proxies: Optional[dict] = None) -> bool:
    """
    Fetch an archive (zip, tar.gz or gz) from a url via http and extract content to an output directory.

    :param url: http address
    :type url: str
    :param output_dir: local path
    :type output_dir: str
    :param proxies: proxies details as required by requests library
    :type proxies: dict
    :return: bool if anything got fetched
    """
    # verify & prepare local directory
    path = Path(output_dir)
    if not path.exists():
        path.mkdir(parents=True)

    # Skip the download entirely if the target directory already holds anything.
    is_not_empty = len(list(path.rglob("*"))) > 0
    if is_not_empty:
        logger.info(
            f"Found data stored in `{output_dir}`. Delete this first if you really want to fetch new data."
        )
        return False

    logger.info(f"Fetching from {url} to `{output_dir}`")

    # download & extract
    with tempfile.NamedTemporaryFile() as temp_file:
        http_get(url, temp_file, proxies=proxies)
        temp_file.flush()
        temp_file.seek(0)  # making tempfile accessible
        # Dispatch on the url suffix; endswith is clearer and safer than
        # slicing fixed offsets (url[-4:] etc.). Check ".tar.gz" before ".gz".
        if url.endswith(".zip"):
            with zipfile.ZipFile(temp_file.name) as zip_archive:
                zip_archive.extractall(output_dir)
        elif url.endswith(".tar.gz"):
            # NOTE(review): extractall on an untrusted archive is vulnerable to
            # path traversal; consider the `filter="data"` argument (Py 3.12+).
            with tarfile.open(temp_file.name) as tar_archive:
                tar_archive.extractall(output_dir)
        elif url.endswith(".gz"):
            # Plain gzip file: decompress into a file named after the url,
            # with the ".gz" suffix stripped.
            filename = url.split("/")[-1].replace(".gz", "")
            output_filename = Path(output_dir) / filename
            with gzip.open(temp_file.name) as f, open(output_filename, "wb") as output:
                for line in f:
                    output.write(line)
        else:
            logger.warning(
                'Skipped url {0} as file type is not supported here. '
                'See haystack documentation for support of more file types'
                .format(url))
        # temp_file gets deleted here
    return True
def fetch_archive_from_http(url: str, output_dir: str, proxies: Optional[dict] = None) -> bool:
    """
    Fetch an archive (zip, tar.gz or gz) from a url via http and extract content to an output directory.

    :param url: http address
    :type url: str
    :param output_dir: local path
    :type output_dir: str
    :param proxies: proxies details as required by requests library
    :type proxies: dict
    :return: bool if anything got fetched
    """
    # verify & prepare local directory
    path = Path(output_dir)
    if not path.exists():
        path.mkdir(parents=True)

    # Skip the download entirely if the target directory already holds anything.
    is_not_empty = len(list(path.rglob("*"))) > 0
    if is_not_empty:
        logger.info(
            f"Found data stored in `{output_dir}`. Delete this first if you really want to fetch new data."
        )
        return False

    logger.info(f"Fetching from {url} to `{output_dir}`")

    # download & extract
    with tempfile.NamedTemporaryFile() as temp_file:
        http_get(url, temp_file, proxies=proxies)
        temp_file.flush()
        temp_file.seek(0)  # making tempfile accessible
        # Dispatch on the url suffix; endswith is clearer and safer than
        # slicing fixed offsets. Check ".tar.gz" before ".gz".
        # Previously this variant silently did nothing for .gz and
        # unsupported suffixes while still returning True.
        if url.endswith(".zip"):
            with zipfile.ZipFile(temp_file.name) as zip_archive:
                zip_archive.extractall(output_dir)
        elif url.endswith(".tar.gz"):
            # NOTE(review): extractall on an untrusted archive is vulnerable to
            # path traversal; consider the `filter="data"` argument (Py 3.12+).
            with tarfile.open(temp_file.name) as tar_archive:
                tar_archive.extractall(output_dir)
        elif url.endswith(".gz"):
            # Plain gzip file: decompress into a file named after the url,
            # with the ".gz" suffix stripped.
            filename = url.split("/")[-1].replace(".gz", "")
            output_filename = Path(output_dir) / filename
            with gzip.open(temp_file.name) as f, open(output_filename, "wb") as output:
                for line in f:
                    output.write(line)
        else:
            logger.warning(
                f"Skipped url {url} as file type is not supported here. "
                f"See haystack documentation for support of more file types"
            )
        # temp_file gets deleted here
    return True