def add_file_entry(self, path: str, script_path: str, source_url: Optional[str] = None) -> Dict[str, str]:
    """
    Append a new file entry to the config and persist the config.

    :param path: The path to the data file
    :param script_path: The path to the script that used the data file
    :param source_url: Optional URL the file can be downloaded from; the
        ``source_url`` key is only written when this is not ``None``
    :return: The entry dict that was appended to ``self.config["files"]``
    """
    # Both paths are stored relative to the config file.
    data_rel = self.path_relative_to_config(path)
    usage_rel = self.path_relative_to_config(script_path)
    digest = calculate_file_sha256(path)

    # Key order is kept as path / hash / usage (/ source_url).
    entry = {"path": str(data_rel)}
    entry["hash"] = digest
    entry["usage"] = str(usage_rel)
    if source_url is not None:
        entry["source_url"] = source_url

    self.config["files"].append(entry)
    self.save_config()
    return entry
def download_to_local(self, config: Config, local: LocalStorage, sha256: str, **kwargs) -> None:
    """
    Download the file with the given hash from the S3 bucket into the local cache.

    The object key is ``self.path_prefix`` joined with
    ``local.hash_to_remote_path(sha256)``; the destination is
    ``local.hash_to_file(sha256)``. After the transfer the file is re-hashed
    and compared against ``sha256``.

    :param config: The config whose ``files`` entries are searched for a
        display path matching ``sha256``
    :param local: The local storage that maps hashes to cache/remote paths
    :param sha256: The expected SHA256 of the file to download
    :param kwargs: Accepted and ignored
    :raises RuntimeError: If AWS credentials are not found, or if the
        downloaded file's hash does not match ``sha256``
    :return:
    """
    local_path = local.hash_to_file(sha256)
    remote_path = local.hash_to_remote_path(sha256)
    s3_key = str(PurePosixPath(self.path_prefix, remote_path))
    local_path.parent.mkdir(parents=True, exist_ok=True)

    # Only used for the progress message: the most recent config entry with
    # this hash. The file may no longer be in the config; this shouldn't
    # happen, but don't fail because of it.
    matching_paths = [e["path"] for e in config.config["files"] if e["hash"] == sha256]
    display_path = matching_paths[-1] if matching_paths else ""
    print("Downloading `%s`" % display_path)

    # Keep the try body limited to the boto calls that can hit the
    # credentials error.
    try:
        transfer = boto3.s3.transfer.S3Transfer(self.client)
        transfer.download_file(self.bucket_name, s3_key, str(local_path))
    except botocore.exceptions.NoCredentialsError as err:
        raise RuntimeError(
            "Download failed. AWS credentials not found. Run `lazydata config aws` to configure them."
        ) from err

    # make sure the sha256 of the just downloaded file is correct
    downloaded_sha256 = calculate_file_sha256(str(local_path))
    if sha256 != downloaded_sha256:
        # Do not leave a file with the wrong content at the path reserved
        # for this hash. Removal is best effort; the error below is raised
        # either way.
        try:
            local_path.unlink()
        except OSError:
            pass
        raise RuntimeError(
            "Hash for the downloaded file `%s` is incorrect. "
            "File might be corrupted in the remote storage backend." % str(local_path))
def store_file(self, path: str):
    """
    Copy a file into the local backend and record it in the metadata DB.

    :param path: The path to the file to store
    :return:
    """
    file_stat = os.stat(path)
    source = Path(path).resolve()
    digest = calculate_file_sha256(path)

    # Location inside the backend derived from the file's hash.
    cache_path = self.hash_to_file(digest)

    # Copy the file into the cache (always copies, even if already present).
    # TODO: option to hardlink
    cache_path.parent.mkdir(parents=True, exist_ok=True)
    shutil.copyfile(str(source), str(cache_path))

    # Add a metadata row unless one with the same path, hash, mtime and size
    # already exists.
    same_file = (
        (DataFile.abspath == source)
        & (DataFile.sha256 == digest)
        & (DataFile.mtime == file_stat.st_mtime)
        & (DataFile.size == file_stat.st_size)
    )
    if DataFile.select().where(same_file).count() == 0:
        DataFile.create(
            abspath=source,
            sha256=digest,
            mtime=file_stat.st_mtime,
            size=file_stat.st_size,
        )
def download_to_local(config: Config, local: LocalStorage, sha256: Optional[str] = None, source_url: Optional[str] = None, path: Optional[str] = None) -> None:
    """
    Download a file from its source URL and store it in the local backend.

    Either ``sha256`` or ``source_url`` must be given. When ``sha256`` is
    given it takes precedence: the URL is looked up in the config (any
    ``source_url`` argument is replaced), the download goes to
    ``local.hash_to_file(sha256)`` and the result is verified against
    ``sha256``. When only ``source_url`` is given, the download goes to
    ``path`` and no hash verification is done.

    :param config: The config used to look up the source URL and path
    :param local: The local storage backend
    :param sha256: The expected SHA256 of the file, if known
    :param source_url: The URL to download from (used when ``sha256`` is None)
    :param path: The path of the file; looked up in the config when None
    :raises RuntimeError: If neither ``sha256`` nor ``source_url`` is given,
        if the source URL or path cannot be found in the config, or if the
        downloaded file's hash does not match ``sha256``
    :return:
    """
    if sha256 is not None:
        local_path = local.hash_to_file(sha256)
        source_url = config.source_url(sha256=sha256)
        if source_url is None:
            raise RuntimeError(
                "Cannot find source_url for file with hash `%s`. "
                "See `lazydata add-source` command." % sha256)
        if path is None:
            path = config.path(sha256=sha256)
        if path is None:
            raise RuntimeError("Cannot find path for downloading a file.")
    elif source_url is not None:
        if path is None:
            path = config.path(source_url=source_url)
        if path is None:
            raise RuntimeError("Cannot find path for downloading a file.")
        local_path = Path(path)
    else:
        raise RuntimeError(
            "Cannot download a file without sha256 and source_url specified."
        )

    local_path.parent.mkdir(parents=True, exist_ok=True)
    downloader = SmartDL(urls=source_url, dest=str(local_path), progress_bar=False)
    print("Downloading `%s`" % path)
    downloader.start()

    # make sure the sha256 of the just downloaded file is correct
    # (only possible, and only computed, when an expected hash was given)
    if sha256 is not None and sha256 != calculate_file_sha256(str(local_path)):
        # local_path is the location reserved for this hash here; do not
        # leave a file with the wrong content behind. Removal is best
        # effort; the error below is raised either way.
        try:
            local_path.unlink()
        except OSError:
            pass
        raise RuntimeError(
            "Hash for the downloaded file `%s` is incorrect. "
            "File might be corrupted in the remote storage backend." % str(local_path))

    # NOTE(review): in the sha256 branch the download went to the hash-derived
    # location, not to `path` -- confirm `path` is expected to exist here.
    local.store_file(path=path)
def add_file_entry(self, path: str, script_path: str):
    """
    Append a new file entry to the config and persist the config.

    :param path: The path to the data file
    :param script_path: The path to the script that used the data file
    :return:
    """
    # Both paths are stored relative to the config file.
    data_rel = self.path_relative_to_config(path)
    usage_rel = self.path_relative_to_config(script_path)
    digest = calculate_file_sha256(path)

    # Key order is kept as path / hash / usage.
    entry = {"path": str(data_rel)}
    entry["hash"] = digest
    entry["usage"] = str(usage_rel)

    self.config["files"].append(entry)
    self.save_config()
def track(path: str) -> str:
    """
    Track a file using lazydata.

    Depending on whether the file exists on disk and whether the config
    already has an entry for it, this starts tracking it, records a new
    version, refreshes a stale copy, fetches a missing copy, or just records
    the usage by the calling script.

    :param path: a path to the file to be tracked
    :raises NotImplementedError: If ``path`` is a directory
    :raises RuntimeError: If the file does not exist and is not in the config
    :return: Returns the path string that is now tracked
    """
    # The caller of track() is the second-to-last stack entry. This lookup
    # must stay inline: moving it into a helper would shift the frame index.
    frames = traceback.extract_stack()
    caller = frames[-2].filename if len(frames) >= 2 else ""
    # remove the ipython hash because it's going to be changing all the time
    if caller.startswith(("<ipython-input", "<stdin")):
        caller = ""

    # 1) Check if the path exists
    target = Path(path)
    exists = target.exists()
    if exists and target.is_dir():
        raise NotImplementedError("Tracking directories is not currently supported: `%s`" % path)

    # 2) Check it's present in the config file
    config = Config()
    latest, older = config.get_latest_and_all_file_entries(path)

    if not exists:
        if not latest:
            # CASE: Trying to track non-existing
            raise RuntimeError("Cannot track file, because file is not found: %s" % path)
        # CASE: Remote download
        print("LAZYDATA: Getting latest version of tracked file `%s`..." % path)
        local = LocalStorage()
        fetch_file(config, local, latest["hash"], path)
        config.add_usage(latest, caller)
        return path

    if latest is None:
        # CASE: Start tracking a new file
        print("LAZYDATA: Tracking new file `%s`" % path)
        local = LocalStorage()
        local.store_file(path)
        config.add_file_entry(path, caller)
        return path

    if not latest:
        # An entry that is falsy but not None matches none of the cases.
        return path

    # CASE: Check for change or stale version
    local = LocalStorage()
    known_hashes = local.get_file_sha256(path)
    latest_hash = latest["hash"]

    if latest_hash in known_hashes:
        # file is at the latest version, just make sure the usage is recorded
        config.add_usage(latest, caller)
        return path

    stale_matches = [entry for entry in older if entry["hash"] in known_hashes]
    if stale_matches:
        print("LAZYDATA: Detected an old version of `%s`, updating to the latest..." % path)
        fetch_file(config, local, latest_hash, path)
        config.add_usage(latest, caller)
        return path

    # Not a known stale version, so recalculate the SHA256 to see if the
    # file really changed.
    changed = latest_hash != calculate_file_sha256(path)
    if changed:
        print("LAZYDATA: Tracked file `%s` changed, recording a new version..." % path)
    # Stored in both cases: either a new version, or an unchanged file whose
    # metadata was missing locally.
    local.store_file(path)
    if changed:
        config.add_file_entry(path, caller)
    # make sure usage is recorded
    config.add_usage(latest, caller)
    return path