def _download_extract_downstream_data(input_file, proxies=None):
    """
    Download a downstream-task dataset archive and extract it next to *input_file*.

    The path layout is assumed to be ``<datadir>/<taskname>/<file>``: the task
    name is taken from the parent directory of *input_file* and the archive is
    extracted into the grandparent directory.

    :param input_file: Path to a (possibly not yet existing) dataset file; only
        its directory structure is used to derive task name and target dir.
    :param proxies: Optional proxies dict passed through to ``http_get``.
    :return: None. Logs an error (does not raise) on unknown tasks or checksum
        mismatches.
    """
    # Derive <taskname> and <datadir> from the input file's path.
    full_path = Path(os.path.realpath(input_file))
    directory = full_path.parent
    taskname = directory.stem
    datadir = directory.parent
    logger.info("downloading and extracting file {} to dir {}".format(taskname, datadir))

    if "conll03-" in taskname:
        # conll03 is copyrighted, but luckily somebody put it on github. Kudos!
        if not os.path.exists(directory):
            os.makedirs(directory)
        for dataset in ["train", "dev", "test"]:
            if "de" in taskname:
                _conll03get(dataset, directory, "de")
            elif "en" in taskname:
                _conll03get(dataset, directory, "en")
            else:
                logger.error("Cannot download {}. Unknown data source.".format(taskname))
    elif taskname not in DOWNSTREAM_TASK_MAP:
        logger.error("Cannot download {}. Unknown data source.".format(taskname))
    else:
        # On Windows an open NamedTemporaryFile cannot be reopened by name,
        # so it must be created with delete=False and removed manually below.
        delete_tmp_file = os.name != "nt"
        # Known-good md5 checksums, keyed by a substring of the task name.
        # Replaces four copy-pasted if/elif branches with a single table.
        expected_checksums = {
            "germeval14": "2c9d5337d7a25b9a4bf6f5672dd091bc",
            "germeval18": "23244fa042dcc39e844635285c455205",
            "gnad": "ef62fe3f59c1ad54cf0271d8532b8f22",
            "germeval17": "f1bf67247dcfe7c3c919b7b20b3f736e",
        }
        with tempfile.NamedTemporaryFile(delete=delete_tmp_file) as temp_file:
            try:
                http_get(DOWNSTREAM_TASK_MAP[taskname], temp_file, proxies=proxies)
                temp_file.flush()
                temp_file.seek(0)  # making tempfile accessible
                # checking files for correctness with md5sum.
                for key, md5 in expected_checksums.items():
                    if key in taskname:
                        if md5 != _get_md5checksum(temp_file.name):
                            logger.error(
                                f"Someone has changed the file for {taskname}. Please make sure the correct file is used and update the md5sum in farm/data_handler/utils.py"
                            )
                        break
                # Close the archive handle deterministically (was leaked before).
                with tarfile.open(temp_file.name) as tfile:
                    # NOTE(review): extractall on a downloaded archive trusts the
                    # archive's member paths — consider a path-traversal check.
                    tfile.extractall(datadir)
            finally:
                # Bug fix: on Windows (delete=False) the temp file used to be
                # left behind on disk; remove it explicitly.
                if not delete_tmp_file:
                    temp_file.close()
                    if os.path.exists(temp_file.name):
                        os.unlink(temp_file.name)
def _download_extract_downstream_data(input_file, proxies=None):
    """
    Download a downstream-task dataset archive and extract it next to *input_file*.

    The path layout is assumed to be ``<datadir>/<taskname>/<file>``: the task
    name is taken from the parent directory of *input_file* and the archive is
    extracted into the grandparent directory.

    :param input_file: Path to a (possibly not yet existing) dataset file; only
        its directory structure is used to derive task name and target dir.
    :param proxies: Optional proxies dict passed through to ``http_get``.
    :return: None. Logs an error (does not raise) for unknown tasks.
    """
    full_path = os.path.realpath(input_file)
    directory = os.path.dirname(full_path)
    # Bug fix: splitting on "/" breaks on Windows paths — use os.path.split,
    # which yields the same (parent, leaf) pair portably.
    datadir, taskname = os.path.split(directory)
    logger.info("downloading and extracting file {} to dir {}".format(taskname, datadir))

    if "conll03" in taskname:
        # conll03 is copyrighted, but luckily somebody put it on github. Kudos!
        if not os.path.exists(directory):
            os.makedirs(directory)
        for dataset in ["train", "dev", "test"]:
            if "de" in taskname:
                _conll03get(dataset, directory, "de")
            elif "en" in taskname:
                _conll03get(dataset, directory, "en")
            else:
                logger.error("Cannot download {}. Unknown data source.".format(taskname))
    elif taskname not in DOWNSTREAM_TASK_MAP:
        logger.error("Cannot download {}. Unknown data source.".format(taskname))
    else:
        with tempfile.NamedTemporaryFile() as temp_file:
            http_get(DOWNSTREAM_TASK_MAP[taskname], temp_file, proxies=proxies)
            temp_file.flush()
            temp_file.seek(0)  # making tempfile accessible
            # Close the archive handle deterministically (was leaked before).
            with tarfile.open(temp_file.name) as tfile:
                tfile.extractall(datadir)
def _download_extract_downstream_data(input_file, proxies=None):
    """
    Download a downstream-task dataset archive and extract it next to *input_file*.

    The path layout is assumed to be ``<datadir>/<taskname>/<file>``: the task
    name is taken from the parent directory of *input_file* and the archive is
    extracted into the grandparent directory.

    :param input_file: Path to a (possibly not yet existing) dataset file; only
        its directory structure is used to derive task name and target dir.
    :param proxies: Optional proxies dict passed through to ``http_get``.
    :return: None. Logs an error (does not raise) for unknown tasks.
    """
    full_path = Path(os.path.realpath(input_file))
    directory = full_path.parent
    taskname = directory.stem
    datadir = directory.parent
    logger.info(
        "downloading and extracting file {} to dir {}".format(taskname, datadir)
    )

    if "conll03" in taskname:
        # conll03 is copyrighted, but luckily somebody put it on github. Kudos!
        if not os.path.exists(directory):
            os.makedirs(directory)
        for dataset in ["train", "dev", "test"]:
            if "de" in taskname:
                _conll03get(dataset, directory, "de")
            elif "en" in taskname:
                _conll03get(dataset, directory, "en")
            else:
                logger.error("Cannot download {}. Unknown data source.".format(taskname))
    elif taskname not in DOWNSTREAM_TASK_MAP:
        logger.error("Cannot download {}. Unknown data source.".format(taskname))
    else:
        # On Windows an open NamedTemporaryFile cannot be reopened by name,
        # so it must be created with delete=False and removed manually below.
        delete_tmp_file = os.name != "nt"
        with tempfile.NamedTemporaryFile(delete=delete_tmp_file) as temp_file:
            try:
                http_get(DOWNSTREAM_TASK_MAP[taskname], temp_file, proxies=proxies)
                temp_file.flush()
                temp_file.seek(0)  # making tempfile accessible
                # Close the archive handle deterministically (was leaked before).
                with tarfile.open(temp_file.name) as tfile:
                    tfile.extractall(datadir)
            finally:
                # Bug fix: on Windows (delete=False) the temp file used to be
                # left behind on disk; remove it explicitly.
                if not delete_tmp_file:
                    temp_file.close()
                    if os.path.exists(temp_file.name):
                        os.unlink(temp_file.name)
def download_file(s3_url: str, out_dir: str, file_name: str):
    """
    Download *s3_url* to ``<out_dir>/<file_name>``, skipping if it already exists.

    :param s3_url: Full URL to download from.
    :param out_dir: Existing local directory to store the file in.
    :param file_name: Name of the file inside *out_dir*.
    :return: None.
    """
    print('Loading from ', s3_url)
    local_file = os.path.join(out_dir, file_name)
    if os.path.exists(local_file):
        print('File already exist ', local_file)
        return
    # Bug fix: the original opened the target in TEXT mode ("w") — which
    # corrupts binary payloads — and then downloaded the same URL a second
    # time with wget.download, clobbering the file. Download once, in
    # binary mode, via the project's http_get helper.
    with open(local_file, "wb") as file:
        http_get(s3_url, temp_file=file)
    print('Saved to ', local_file)
def download_from_url(url: str, filepath: Union[str, Path]):
    """
    Download from a url to a local file. Skip already existing files.

    :param url: Url
    :param filepath: local path where the url content shall be stored
    :return: local path of the downloaded file
    """
    logger.info(f"Downloading {url}")
    # Create the local folder. Bug fix: when filepath is a bare filename,
    # os.path.split returns folder == "" and makedirs("") would raise;
    # exist_ok=True also removes the racy exists()/makedirs() pair.
    folder, filename = os.path.split(filepath)
    if folder:
        os.makedirs(folder, exist_ok=True)
    # Download file if not present locally
    if os.path.exists(filepath):
        logger.info(f"Skipping {url} (exists locally)")
    else:
        logger.info(f"Downloading {url} to {filepath} ")
        with open(filepath, "wb") as file:
            http_get(url=url, temp_file=file)
    return filepath