예제 #1
0
    def add_file_entry(self, path:str, script_path:str, source_url: Optional[str] = None) -> Dict[str, str]:
        """
        Append an entry for a data file to the config and save the config.

        :param path: The path to the data file
        :param script_path: The path to the script that used it
        :param source_url: The source to download the file from; the
            ``source_url`` key is only written when this is not None
        :return: The entry dict that was appended to ``config["files"]``
        """
        # both paths are stored relative to the config file
        data_rel = str(self.path_relative_to_config(path))
        usage_rel = str(self.path_relative_to_config(script_path))

        digest = calculate_file_sha256(path)

        entry = {"path": data_rel, "hash": digest, "usage": usage_rel}
        if source_url is not None:
            entry["source_url"] = source_url

        self.config["files"].append(entry)
        self.save_config()

        return entry
예제 #2
0
    def download_to_local(self, config: Config, local: LocalStorage,
                          sha256: str, **kwargs):
        """
        Download the file with the given hash from the S3 bucket into local storage.

        The downloaded file is re-hashed and compared with ``sha256``. On a
        mismatch the downloaded file is removed again, so a corrupted download
        is not left behind at the hash-derived local path.

        :param config: Config whose ``config["files"]`` entries are searched for
            the path shown in the "Downloading" message
        :param local: Local storage used to map the hash to a local file path
            and to a remote path
        :param sha256: The expected SHA256 of the file to download
        :param kwargs: Accepted but not used by this method
        :raises RuntimeError: If AWS credentials are not found, or if the hash
            of the downloaded file does not match ``sha256``
        """
        try:
            transfer = boto3.s3.transfer.S3Transfer(self.client)

            local_path = local.hash_to_file(sha256)
            remote_path = local.hash_to_remote_path(sha256)
            s3_key = str(PurePosixPath(self.path_prefix, remote_path))

            local_path.parent.mkdir(parents=True, exist_ok=True)

            # the path is only used for the progress message below
            matching_paths = [
                e["path"] for e in config.config["files"]
                if e["hash"] == sha256
            ]
            # file no longer in config? this shouldn't happen but don't fail.
            real_path = matching_paths[-1] if matching_paths else ""

            print("Downloading `%s`" % real_path)

            transfer.download_file(self.bucket_name, s3_key, str(local_path))

            # make sure the sha256 of the just downloaded file is correct
            downloaded_sha256 = calculate_file_sha256(str(local_path))
            if sha256 != downloaded_sha256:
                # Don't keep a file whose content doesn't match the hash its
                # path was derived from. Removal is best-effort: the hash
                # mismatch is the error that must reach the caller.
                try:
                    local_path.unlink()
                except OSError:
                    pass
                raise RuntimeError(
                    "Hash for the downloaded file `%s` is incorrect. File might be corrupted in the remote storage backend."
                    % str(local_path))
        except botocore.exceptions.NoCredentialsError as err:
            # chain the original error so the underlying cause stays visible
            raise RuntimeError(
                "Download failed. AWS credentials not found. Run `lazydata config aws` to configure them."
            ) from err
예제 #3
0
    def store_file(self, path:str):
        """
        Copy a file into the local backend and record its metadata.

        The file is copied to the location ``hash_to_file`` returns for its
        SHA256. A ``DataFile`` row is then created unless one with the same
        absolute path, hash, mtime and size already exists.

        :param path: The path to the file to store
        :return:
        """
        file_stat = os.stat(path)
        resolved = Path(path).resolve()

        digest = calculate_file_sha256(path)

        # location of this content in the cache
        cache_file = self.hash_to_file(digest)

        # the copy into the cache is unconditional
        # TODO: option to hardlink
        cache_file.parent.mkdir(parents=True, exist_ok=True)
        shutil.copyfile(str(resolved), str(cache_file))

        # only add a row to the metadata DB when an identical one is missing
        matches = DataFile.select().where(
            (DataFile.abspath == resolved)
            & (DataFile.sha256 == digest)
            & (DataFile.mtime == file_stat.st_mtime)
            & (DataFile.size == file_stat.st_size)
        )
        if matches.count() != 0:
            return

        record = {
            "abspath": resolved,
            "sha256": digest,
            "mtime": file_stat.st_mtime,
            "size": file_stat.st_size,
        }
        DataFile.create(**record)
예제 #4
0
    def download_to_local(config: Config,
                          local: LocalStorage,
                          sha256: Optional[str] = None,
                          source_url: Optional[str] = None,
                          path: Optional[str] = None):
        """
        Download a file from its source URL and store it in the local backend.

        At least one of ``sha256`` and ``source_url`` must be given. When
        ``sha256`` is given, the URL is looked up in the config and the
        download goes to the location ``local.hash_to_file`` returns for that
        hash. Otherwise the download goes to ``path``.

        :param config: Config used to look up the source URL and/or the path
        :param local: Local storage the file is stored in after downloading
        :param sha256: Expected SHA256 of the file; when given, the downloaded
            file is verified against it
        :param source_url: URL to download from; replaced by the config lookup
            when ``sha256`` is given
        :param path: Path of the file; looked up in the config when None
        :raises RuntimeError: If neither ``sha256`` nor ``source_url`` is given,
            if the source URL or the path cannot be found in the config, or if
            the hash of the downloaded file does not match ``sha256``
        """
        if sha256 is None and source_url is None:
            raise RuntimeError(
                "Cannot download a file without sha256 and source_url specified."
            )

        missing_path_message = "Cannot find path for downloading a file."

        if sha256 is not None:
            # hash given: the config decides the URL, the hash decides the target
            local_path = local.hash_to_file(sha256)
            source_url = config.source_url(sha256=sha256)
            if source_url is None:
                raise RuntimeError(
                    "Cannot find source_url for file with hash `%s`. "
                    "See `lazydata add-source` command." % sha256)
            if path is None:
                path = config.path(sha256=sha256)
                if path is None:
                    raise RuntimeError(missing_path_message)
        else:
            # only a URL given: download straight to the file's path
            if path is None:
                path = config.path(source_url=source_url)
                if path is None:
                    raise RuntimeError(missing_path_message)
            local_path = Path(path)

        local_path.parent.mkdir(parents=True, exist_ok=True)

        downloader = SmartDL(urls=source_url, dest=str(local_path), progress_bar=False)
        print("Downloading `%s`" % path)
        downloader.start()

        # make sure the sha256 of the just downloaded file is correct
        actual_sha256 = calculate_file_sha256(str(local_path))
        hash_mismatch = sha256 is not None and sha256 != actual_sha256
        if hash_mismatch:
            raise RuntimeError(
                "Hash for the downloaded file `%s` is incorrect. "
                "File might be corrupted in the remote storage backend." %
                str(local_path))

        local.store_file(path=path)
예제 #5
0
    def add_file_entry(self, path:str, script_path:str):
        """
        Append an entry for a data file to the config and save the config.

        :param path: The path to the data file
        :param script_path: The path to the script that used it
        :return:
        """
        # both paths are stored relative to the config file
        data_rel = str(self.path_relative_to_config(path))
        usage_rel = str(self.path_relative_to_config(script_path))

        digest = calculate_file_sha256(path)

        entry = {"path": data_rel, "hash": digest, "usage": usage_rel}
        self.config["files"].append(entry)

        self.save_config()
예제 #6
0
def track(path:str) -> str:
    """
    Track a file using lazydata.

    What happens depends on whether the file exists on disk and whether the
    config already has an entry for it: the file is newly tracked, recorded as
    a new version, replaced by the latest version, fetched, or only its usage
    is recorded.

    :param path: a path to the file to be tracked
    :return: Returns the path string that is now tracked
    :raises NotImplementedError: if ``path`` is an existing directory
    :raises RuntimeError: if the file does not exist and has no config entry
    """
    # The caller is the second-to-last frame; the last one is this function.
    # This must stay inline: moving it into a helper would shift the index.
    frames = traceback.extract_stack()
    script_location = frames[-2].filename if len(frames) >= 2 else ""

    # interactive sources get an ever-changing pseudo filename, so drop it
    if script_location.startswith(("<ipython-input", "<stdin")):
        script_location = ""

    target = Path(path)

    # 1) Check if the path exists
    path_exists = target.exists()

    if path_exists and target.is_dir():
        raise NotImplementedError("Tracking directories is not currently supported: `%s`" % path)

    # 2) Check it's present in the config file
    config = Config()
    latest, older = config.get_latest_and_all_file_entries(path)

    if not path_exists:
        if not latest:
            # CASE: Trying to track non-existing
            raise RuntimeError("Cannot track file, because file is not found: %s" % path)

        # CASE: Remote download
        print("LAZYDATA: Getting latest version of tracked file `%s`..." % path)
        local = LocalStorage()
        fetch_file(config, local, latest["hash"], path)
        # make sure usage is recorded
        config.add_usage(latest, script_location)
        return path

    if latest is None:
        # CASE: Start tracking a new file
        print("LAZYDATA: Tracking new file `%s`" % path)
        local = LocalStorage()
        local.store_file(path)
        config.add_file_entry(path, script_location)
        return path

    if not latest:
        # an entry that is falsy but not None matches no case: nothing to do
        return path

    # CASE: Check for change or stale version
    local = LocalStorage()
    # presumably the hashes the local store has on record for this path
    cached_sha256 = local.get_file_sha256(path)

    if latest["hash"] in cached_sha256:
        # file is at the latest version, just make sure the usage is recorded
        config.add_usage(latest, script_location)
        return path

    stale_matches = [entry for entry in older if entry["hash"] in cached_sha256]
    if stale_matches:
        print("LAZYDATA: Detected an old version of `%s`, updating to the latest..." % path)
        fetch_file(config, local, latest["hash"], path)
    else:
        # Not a known stale version, so recalculate the SHA256 to see whether
        # the file really changed.
        path_sha256 = calculate_file_sha256(path)

        if latest["hash"] != path_sha256:
            print("LAZYDATA: Tracked file `%s` changed, recording a new version..." % path)
            local.store_file(path)
            config.add_file_entry(path, script_location)
        else:
            # the file hasn't changed but the metadata was missing locally, so add it...
            local.store_file(path)

    # every remaining case ends by recording the usage on the `latest` entry
    config.add_usage(latest, script_location)
    return path