def fetch_file(config: Config, local: LocalStorage, path: str, sha256: Optional[str] = None, source_url: Optional[str] = None):
    """
    Top level function that fetches the file, either from local or remote storage
    and copies it to `path`.

    If fetching remotely, it always goes to the local storage first, then copied.

    :param config: project Config instance
    :param local: LocalStorage instance
    :param path: where the file should be copied to
    :param sha256: hash of the file we need
    :param source_url: URL of the source of the file we need
    :return: None
    :raises RuntimeError: if neither sha256 nor source_url is specified
    """
    if sha256 is None and source_url is None:
        # BUG FIX: the message contained a %s placeholder but `path` was
        # never interpolated, so the error always showed a literal `%s`.
        raise RuntimeError("Fetching file `%s`: neither sha256 nor source_url was specified." % path)

    if sha256 is not None:
        # fast path: the file may already be present in the local cache
        local_copy_success = local.copy_file_to(sha256, path)
    else:
        local_copy_success = False

    if not local_copy_success:
        # fall back to the configured source URL when none was given
        if source_url is None:
            source_url = config.source_url(sha256=sha256)

        # a direct URL wins; otherwise use the project's remote backend
        if source_url is None:
            remote = RemoteStorage.get_from_config(config)
        else:
            remote = UrlRemoteStorage()

        remote.download_to_local(config=config, local=local, sha256=sha256, source_url=source_url, path=path)

        # the download lands in the local cache first; copy it out to `path`
        if sha256 is not None:
            local.copy_file_to(sha256, path)
def handle(self, args):
    """
    Pull tracked files from storage.

    With no artefacts given, pulls every tracked file. Otherwise each
    artefact is resolved, in order, as: (1) a tracked file path,
    (2) a script whose tracked-file usages should be pulled, or
    (3) a directory containing tracked files.
    """
    config = Config()
    local = LocalStorage()

    # ROBUSTNESS: `not args.artefacts` also covers None, where the original
    # `== []` would fall through and crash iterating over None.
    if not args.artefacts:
        # pull everything
        for e in config.config["files"]:
            file_abs_path = config.abs_path(e["path"])
            fetch_file(config=config, local=local, path=str(file_abs_path), sha256=e["hash"])
        return

    for artefact in args.artefacts:
        # 1) check if the artefact is a file we are tracking
        latest, _ = config.get_latest_and_all_file_entries(artefact)
        if latest is not None:
            # pull the latest version of this file
            fetch_file(config=config, local=local, path=artefact, sha256=latest["hash"])
            continue

        # 2) Check for usage
        used_entries = config.tracked_files_used_in(artefact)
        if used_entries:
            for e in used_entries:
                file_abs_path = config.abs_path(e["path"])
                fetch_file(config=config, local=local, path=str(file_abs_path), sha256=e["hash"])
            continue

        # 3) check for a directory
        dir_path = None
        try:
            # might fail on python 3.5...
            dir_path = Path(artefact).resolve()
        except Exception:
            pass

        if dir_path and dir_path.exists() and dir_path.is_dir():
            dir_entries = config.abs_path_matches_prefix(str(dir_path))
            if dir_entries:
                for e in dir_entries:
                    file_abs_path = config.abs_path(e["path"])
                    fetch_file(config=config, local=local, path=str(file_abs_path), sha256=e["hash"])
                continue

        # ROBUSTNESS: previously an unrecognised artefact was skipped
        # silently; tell the user nothing matched.
        print("WARNING: `%s` is not a tracked file, a script using tracked "
              "files, or a directory containing tracked files. Skipping." % artefact)
def handle(self, args):
    """Push all locally cached tracked files to the project's remote storage."""
    config = Config()

    # Guard clause: without a configured remote there is nothing to push to.
    if "remote" not in config.config:
        print(
            "ERROR: Remote not specified for this lazydata project. Use `lazydata add-remote` to add it."
        )
        return

    remote = RemoteStorage.get_from_config(config)
    local = LocalStorage()
    remote.upload(local, config)
def download_to_local(config: Config, local: LocalStorage, sha256: Optional[str] = None, source_url: Optional[str] = None, path: Optional[str] = None):
    """
    Download a file into the local cache and record it.

    :param config: project Config instance
    :param local: LocalStorage instance
    :param sha256: expected hash of the file; used to pick the cache location
                   and to verify the downloaded content
    :param source_url: URL to download from; looked up in config when omitted
    :param path: project-relative destination; looked up in config when omitted
    :raises RuntimeError: if neither sha256 nor source_url is given, if a
                          needed source_url/path cannot be resolved, or if the
                          downloaded file's hash does not match `sha256`
    """
    if sha256 is not None:
        # download straight into the content-addressed cache slot
        local_path = local.hash_to_file(sha256)
        # BUG FIX: only consult the config when the caller did not supply a
        # source_url. Previously an explicit source_url was unconditionally
        # overwritten, inconsistent with the fallback pattern in fetch_file.
        if source_url is None:
            source_url = config.source_url(sha256=sha256)
        if source_url is None:
            raise RuntimeError(
                "Cannot find source_url for file with hash `%s`. "
                "See `lazydata add-source` command." % sha256)
        if path is None:
            path = config.path(sha256=sha256)
        if path is None:
            raise RuntimeError(
                "Cannot find path for downloading a file.")
    elif source_url is not None:
        if path is None:
            path = config.path(source_url=source_url)
        if path is None:
            raise RuntimeError(
                "Cannot find path for downloading a file.")
        # no hash known: download directly to the destination path
        local_path = Path(path)
    else:
        raise RuntimeError(
            "Cannot download a file without sha256 and source_url specified."
        )

    local_path.parent.mkdir(parents=True, exist_ok=True)

    f = SmartDL(urls=source_url, dest=str(local_path), progress_bar=False)
    print("Downloading `%s`" % path)
    f.start()

    # make sure the sha256 of the just downloaded file is correct
    downloaded_sha256 = calculate_file_sha256(str(local_path))
    if sha256 is not None and sha256 != downloaded_sha256:
        raise RuntimeError(
            "Hash for the downloaded file `%s` is incorrect. "
            "File might be corrupted in the remote storage backend." % str(local_path))

    # register the downloaded file with the local metadata/cache
    local.store_file(path=path)
def handle(self, args):
    """
    Add a remote storage backend (S3) to the project config.

    Verifies the storage location is reachable before saving it. When AWS
    credentials are missing, offers to configure them interactively and
    retries; declining ends the command.
    """
    url = args.url
    endpoint_url = args.endpoint_url

    if not url.startswith("s3://"):
        print(
            "ERROR: Only S3 URLs are currently supported. For example: `s3://mybucket` or `s3://mybucket/myfolder`"
        )
        sys.exit(1)

    # Retry loop: each pass either succeeds, reports an error, or prompts
    # for credentials and tries again.
    # FIX: removed the dead `success = False; if not success:` flag dance —
    # the flag was assigned and immediately tested, so the branch was
    # unconditional; plain loop control expresses the same behavior.
    while True:
        remote = RemoteStorage.get_from_url(url, endpoint_url=endpoint_url)
        try:
            if remote.check_storage_exists():
                config = Config()
                config.add_remote(url, endpoint_url=endpoint_url)
            else:
                print(
                    "ERROR: The remote storage location you specified does not exist or is not accessible to you"
                )
            return
        except botocore.exceptions.NoCredentialsError:
            print("ERROR: No valid AWS credentials found.")
            config_now = input(
                "Do you want to configure AWS credentials now? [y/n] ")
            if config_now.strip() == "y":
                setup_aws_credentials()
                print(
                    "Credentials successfully stored. Trying again with these new credentials..."
                )
                # loop around and retry with the new credentials
            else:
                print(
                    "Alright, will not configure credentials now. Re-run this command to try again, "
                    "or configure using the AWS CLI: `aws configure`.")
                return
def track(path: str) -> str:
    """
    Track a file using lazydata.

    :param path: a path to the file to be tracked
    :return: Returns the path string that is now tracked
    :raises NotImplementedError: if `path` is a directory
    :raises RuntimeError: if `path` does not exist and is not tracked
    """
    # record where track() was called from, for usage bookkeeping
    stack = traceback.extract_stack()
    script_location = ""
    if len(stack) >= 2:
        script_location = stack[-2].filename

    # remove the ipython hash because it's going to be changing all the time
    if script_location.startswith("<ipython-input") or script_location.startswith("<stdin"):
        script_location = ""

    path_obj = Path(path)

    # 1) Check if the path exists
    path_exists = path_obj.exists()

    if path_exists and path_obj.is_dir():
        raise NotImplementedError("Tracking directories is not currently supported: `%s`" % path)

    # 2) Check it's present in the config file
    config = Config()
    latest, older = config.get_latest_and_all_file_entries(path)

    if path_exists and latest is None:
        # CASE: Start tracking a new file
        print("LAZYDATA: Tracking new file `%s`" % path)
        local = LocalStorage()
        local.store_file(path)
        config.add_file_entry(path, script_location)
    elif path_exists and latest:
        # CASE: Check for change or stale version
        # check if it has changed
        local = LocalStorage()
        cached_sha256 = local.get_file_sha256(path)

        # compare with the value in config
        if latest["hash"] in cached_sha256:
            # file is at the latest version!
            # just make sure the usage is recorded
            config.add_usage(latest, script_location)
            return path

        # check if it's one of the stale versions
        matching_old = [e for e in older if e["hash"] in cached_sha256]
        if matching_old:
            print("LAZYDATA: Detected an old version of `%s`, updating to the latest..." % path)
            # BUG FIX: fetch_file's signature is (config, local, path, sha256,
            # source_url) — the original positional call passed the hash as
            # `path` and the path as `sha256`; keyword args pin the intent.
            fetch_file(config=config, local=local, path=path, sha256=latest["hash"])
            # make sure usage is recorded
            config.add_usage(latest, script_location)
        else:
            # It's not a stale version...
            # So now recalculate the SHA256 to see if the file really changed
            path_sha256 = calculate_file_sha256(path)
            if latest["hash"] != path_sha256:
                print("LAZYDATA: Tracked file `%s` changed, recording a new version..."
                      % path)
                local.store_file(path)
                config.add_file_entry(path, script_location)
                # make sure usage is recorded
                config.add_usage(latest, script_location)
            else:
                # the file hasn't changed but the metadata was missing locally, so add it...
                local.store_file(path)
                # make sure usage is recorded
                config.add_usage(latest, script_location)
    elif not path_exists and latest:
        # CASE: Remote download
        print("LAZYDATA: Getting latest version of tracked file `%s`..." % path)
        local = LocalStorage()
        # BUG FIX: same swapped positional arguments as above.
        fetch_file(config=config, local=local, path=path, sha256=latest["hash"])
        # make sure usage is recorded
        config.add_usage(latest, script_location)
    elif not path_exists and not latest:
        # CASE: Trying to track non-existing
        raise RuntimeError("Cannot track file, because file is not found: %s" % path)

    return path
def handle(self, args):
    """Attach a source URL to the latest tracked version of the given file."""
    config = Config()
    latest_entry, _ = config.get_latest_and_all_file_entries(path=args.path)
    config.add_source(entry=latest_entry, source_url=args.source_url)