Exemplo n.º 1
0
def fetch_file(config: Config, local: LocalStorage, path: str, sha256: Optional[str] = None,
               source_url: Optional[str] = None):
    """
    Top level function that fetches a file, either from local or remote storage,
    and copies it to `path`.

    The local storage is tried first (only possible when `sha256` is known).
    If that does not succeed, the file is downloaded through a remote storage
    backend via `download_to_local`, and then copied from the local storage
    to `path` when a hash is available.

    :param config: project Config instance
    :param local: LocalStorage instance
    :param path: where the file should be copied to
    :param sha256: hash of the file we need
    :param source_url: URL of the source of the file we need
    :return: None
    :raises RuntimeError: if neither `sha256` nor `source_url` is specified
    """
    if sha256 is None and source_url is None:
        # Interpolate the path so the message identifies the offending file.
        raise RuntimeError(
            "Fetching file `%s`: neither sha256 nor source_url was specified." % path)

    # The local storage is addressed by hash, so it can only be consulted when one is given.
    if sha256 is not None and local.copy_file_to(sha256, path):
        return

    # Prefer an explicitly supplied URL; otherwise ask the config for one.
    if source_url is None:
        source_url = config.source_url(sha256=sha256)

    # Without any source URL fall back to the remote configured for the project.
    if source_url is None:
        remote = RemoteStorage.get_from_config(config)
    else:
        remote = UrlRemoteStorage()
    remote.download_to_local(config=config, local=local, sha256=sha256, source_url=source_url, path=path)

    if sha256 is not None:
        local.copy_file_to(sha256, path)
Exemplo n.º 2
0
    def handle(self, args):
        """
        Pull tracked files into place.

        With no artefacts given, every entry under the config's "files" key is
        fetched. Otherwise each artefact is resolved, in order, as:
        1) a tracked file (its latest version is fetched to the artefact path),
        2) something that uses tracked files (all of those are fetched),
        3) an existing directory (entries matching its absolute path prefix are fetched).
        Artefacts matching none of these are skipped silently.

        :param args: parsed command arguments; reads `args.artefacts`
        :return: None
        """
        config = Config()
        local = LocalStorage()

        def pull_entries(entries):
            # Fetch each config entry to its absolute path at the recorded hash.
            for entry in entries:
                target = config.abs_path(entry["path"])
                fetch_file(config=config, local=local, path=str(target), sha256=entry["hash"])

        if args.artefacts == []:
            # nothing specified: pull everything
            pull_entries(config.config["files"])
            return

        for artefact in args.artefacts:
            # 1) is the artefact a file we are tracking?
            latest, _ = config.get_latest_and_all_file_entries(artefact)
            if latest is not None:
                # pull the latest version of this file
                fetch_file(config=config, local=local, path=artefact, sha256=latest["hash"])
                continue

            # 2) is the artefact something that uses tracked files?
            used_entries = config.tracked_files_used_in(artefact)
            if used_entries:
                pull_entries(used_entries)
                continue

            # 3) is the artefact a directory?
            try:
                dir_path = Path(artefact).resolve()
            except Exception:
                # resolving might fail on python 3.5...; treat as "not a directory"
                continue
            if not (dir_path.exists() and dir_path.is_dir()):
                continue

            dir_entries = config.abs_path_matches_prefix(str(dir_path))
            if dir_entries:
                pull_entries(dir_entries)
Exemplo n.º 3
0
 def handle(self, args):
     """
     Upload the local storage to the project's configured remote.

     Prints an error and does nothing else when the config has no "remote" key.

     :param args: parsed command arguments (unused)
     :return: None
     """
     config = Config()
     if "remote" not in config.config:
         print(
             "ERROR: Remote not specified for this lazydata project. Use `lazydata add-remote` to add it."
         )
         return

     remote = RemoteStorage.get_from_config(config)
     local = LocalStorage()
     remote.upload(local, config)
Exemplo n.º 4
0
    def download_to_local(config: Config,
                          local: LocalStorage,
                          sha256: Optional[str] = None,
                          source_url: Optional[str] = None,
                          path: Optional[str] = None):
        """
        Download a file from its source URL and hand it to the local storage.

        When `sha256` is given, the download target is `local.hash_to_file(sha256)`
        and the downloaded content is verified against that hash. When only
        `source_url` is given, the file is downloaded directly to `path`.
        Missing `source_url` / `path` values are looked up in the config.

        :param config: project Config instance
        :param local: LocalStorage instance
        :param sha256: expected hash of the file, if known
        :param source_url: URL to download from; looked up in the config by hash when omitted
        :param path: path of the file in the project; looked up in the config when omitted
        :return: None
        :raises RuntimeError: if no source URL or path can be determined, if neither
            `sha256` nor `source_url` is given, or if the downloaded file's hash
            does not match `sha256`
        """
        if sha256 is not None:
            local_path = local.hash_to_file(sha256)
            # Only consult the config when the caller did not supply a URL, so an
            # explicitly passed source_url is honoured rather than overwritten.
            if source_url is None:
                source_url = config.source_url(sha256=sha256)
            if source_url is None:
                raise RuntimeError(
                    "Cannot find source_url for file with hash `%s`. "
                    "See `lazydata add-source` command." % sha256)
            if path is None:
                path = config.path(sha256=sha256)
                if path is None:
                    raise RuntimeError(
                        "Cannot find path for downloading a file.")
        elif source_url is not None:
            if path is None:
                path = config.path(source_url=source_url)
                if path is None:
                    raise RuntimeError(
                        "Cannot find path for downloading a file.")
            # No hash known: download straight to the destination path.
            local_path = Path(path)
        else:
            raise RuntimeError(
                "Cannot download a file without sha256 and source_url specified."
            )

        # Make sure the download target directory exists.
        local_path.parent.mkdir(parents=True, exist_ok=True)

        f = SmartDL(urls=source_url, dest=str(local_path), progress_bar=False)
        print("Downloading `%s`" % path)
        f.start()
        # make sure the sha256 of the just downloaded file is correct
        downloaded_sha256 = calculate_file_sha256(str(local_path))
        if sha256 is not None and sha256 != downloaded_sha256:
            raise RuntimeError(
                "Hash for the downloaded file `%s` is incorrect. "
                "File might be corrupted in the remote storage backend." %
                str(local_path))
        # NOTE(review): store_file is given the project path, not local_path - confirm
        # this is intended when the download went to the hash location.
        local.store_file(path=path)
Exemplo n.º 5
0
    def handle(self, args):
        """
        Register a remote storage location for the project.

        Only `s3://` URLs are accepted; anything else prints an error and exits
        with status 1. If the storage check raises a missing-credentials error,
        the user is offered to configure AWS credentials and the check is retried.

        :param args: parsed command arguments; reads `args.url` and `args.endpoint_url`
        :return: None
        """
        url = args.url
        endpoint_url = args.endpoint_url

        if not url.startswith("s3://"):
            print(
                "ERROR: Only S3 URLs are currently supported. For example: `s3://mybucket` or `s3://mybucket/myfolder`"
            )
            sys.exit(1)

        # Keep retrying until the check completes or the user declines to set up credentials.
        while True:
            remote = RemoteStorage.get_from_url(url, endpoint_url=endpoint_url)
            try:
                if remote.check_storage_exists():
                    Config().add_remote(url, endpoint_url=endpoint_url)
                else:
                    print(
                        "ERROR: The remote storage location you specified does not exist or is not accessible to you"
                    )
            except botocore.exceptions.NoCredentialsError:
                # fall through to the credentials prompt below
                pass
            else:
                # the check ran to completion (whether or not the storage exists)
                break

            print("ERROR: No valid AWS credentials found.")
            answer = input(
                "Do you want to configure AWS credentials now? [y/n] ")
            if answer.strip() != "y":
                print(
                    "Alright, will not configure credentials now. Re-run this command to try again, "
                    "or configure using the AWS CLI: `aws configure`.")
                break

            setup_aws_credentials()
            print(
                "Credentials successfully stored. Trying again with these new credentials..."
            )
Exemplo n.º 6
0
def track(path:str) -> str:
    """
    Track a file using lazydata.

    Depending on whether the file exists on disk and whether the config already
    has an entry for it, this starts tracking a new file, records a new version,
    restores the latest version, or downloads the file.

    :param path: a path to the file to be tracked
    :return: Returns the path string that is now tracked
    :raises NotImplementedError: if `path` is an existing directory
    :raises RuntimeError: if the file does not exist and is not tracked
    """

    stack = traceback.extract_stack()

    # The caller's frame is the one just below this function on the stack.
    script_location = ""
    if len(stack) >= 2:
        script_location = stack[-2].filename

    # remove the ipython hash because it's going to be changing all the time
    if script_location.startswith(("<ipython-input", "<stdin")):
        script_location = ""

    path_obj = Path(path)

    # 1) Check if the path exists
    path_exists = path_obj.exists()

    if path_exists and path_obj.is_dir():
        raise NotImplementedError("Tracking directories is not currently supported: `%s`" % path)

    # 2) Check it's present in the config file
    config = Config()
    latest, older = config.get_latest_and_all_file_entries(path)

    if path_exists and latest is None:
        # CASE: Start tracking a new file
        print("LAZYDATA: Tracking new file `%s`" % path)
        local = LocalStorage()
        local.store_file(path)
        config.add_file_entry(path, script_location)
    elif path_exists and latest:
        # CASE: Check for change or stale version
        # check if it has changed
        local = LocalStorage()
        cached_sha256 = local.get_file_sha256(path)

        # compare with the value in config
        if latest["hash"] in cached_sha256:
            # file is at the latest version!
            # just make sure the usage is recorded
            config.add_usage(latest, script_location)
            return path

        # check if it's one of the stale versions
        matching_old = [e for e in older if e["hash"] in cached_sha256]
        if matching_old:
            print("LAZYDATA: Detected an old version of `%s`, updating to the latest..." % path)
            # Keyword arguments so the hash and the path cannot be mixed up.
            fetch_file(config, local, path=path, sha256=latest["hash"])
            # make sure usage is recorded
            config.add_usage(latest, script_location)
        else:
            # It's not a stale version...
            # So now recalculate the SHA256 to see if the file really changed
            path_sha256 = calculate_file_sha256(path)

            if latest["hash"] != path_sha256:
                print("LAZYDATA: Tracked file `%s` changed, recording a new version..." % path)
                local.store_file(path)
                config.add_file_entry(path, script_location)
                # make sure usage is recorded
                # NOTE(review): usage is recorded against the previous latest entry,
                # not the one just added - confirm this is intended.
                config.add_usage(latest, script_location)
            else:
                # the file hasn't changed but the metadata was missing locally, so add it...
                local.store_file(path)
                # make sure usage is recorded
                config.add_usage(latest, script_location)

    elif not path_exists and latest:
        # CASE: Remote download
        print("LAZYDATA: Getting latest version of tracked file `%s`..." % path)
        local = LocalStorage()
        # Keyword arguments so the hash and the path cannot be mixed up.
        fetch_file(config, local, path=path, sha256=latest["hash"])
        # make sure usage is recorded
        config.add_usage(latest, script_location)
    elif not path_exists and not latest:
        # CASE: Trying to track non-existing
        raise RuntimeError("Cannot track file, because file is not found: %s" % path)

    return path
Exemplo n.º 7
0
 def handle(self, args):
     """
     Attach a source URL to the latest config entry of a tracked file.

     :param args: parsed command arguments; reads `args.source_url` and `args.path`
     :return: None
     """
     config = Config()
     # Only the latest entry is needed; the second element of the result is ignored.
     latest_entry, _older = config.get_latest_and_all_file_entries(path=args.path)
     config.add_source(entry=latest_entry, source_url=args.source_url)