def download_file(self, blob, local_path, container_name=None, use_basename=True):
    """
    Downloads a file from Azure Blob service.

    Args:
        blob: `str`. blob to download.
        local_path: `str`. the path to download to.
        container_name: `str`. the name of the container.
        use_basename: `bool`. whether or not to use the basename of the blob.
    """
    if not container_name:
        container_name, _, blob = self.parse_wasbs_url(blob)

    local_path = os.path.abspath(local_path)

    if use_basename:
        local_path = append_basename(local_path, blob)

    try:
        check_dirname_exists(local_path)
    except PolyaxonPathException as e:
        raise PolyaxonStoresException("Connection error: %s" % e) from e

    client = self.connection.get_container_client(container_name)
    try:
        with open(local_path, "wb") as file:
            client.download_blob(blob).readinto(file)
    except HttpResponseError as e:
        raise PolyaxonStoresException("Connection error: %s" % e) from e
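# Usage sketch (hypothetical names): assuming the enclosing Azure store class is
# instantiated as `store` with valid credentials, the blob can be given as a
# full wasbs:// URL, in which case the container name is parsed out of it:
#
#   store.download_file(
#       blob="wasbs://container@account.blob.core.windows.net/outputs/model.h5",
#       local_path="/tmp/artifacts",  # use_basename=True => /tmp/artifacts/model.h5
#   )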
def download_file(self, key, local_path, bucket_name=None, use_basename=True):
    """
    Download a file from S3.

    Args:
        key: `str`. S3 key that will point to the file.
        local_path: `str`. the path to download to.
        bucket_name: `str`. the name of the bucket to download from.
        use_basename: `bool`. whether or not to use the basename of the key.
    """
    if not bucket_name:
        bucket_name, key = self.parse_s3_url(key)

    local_path = os.path.abspath(local_path)

    if use_basename:
        local_path = append_basename(local_path, key)

    try:
        check_dirname_exists(local_path)
    except PolyaxonPathException as e:
        raise PolyaxonStoresException("Connection error: %s" % e) from e

    try:
        self.connection.download_file(bucket_name, key, local_path)
    except ClientError as e:
        raise PolyaxonStoresException("Connection error: %s" % e) from e
def download_dir(
    self, blob, local_path, container_name=None, use_basename=True, workers=0
):
    """
    Download a directory from Azure Blob service.

    Args:
        blob: `str`. blob to download.
        local_path: `str`. the path to download to.
        container_name: `str`. the name of the container.
        use_basename: `bool`. whether or not to use the basename of the blob.
        workers: `int`. number of worker threads to use for parallel execution.
    """
    if not container_name:
        container_name, _, blob = self.parse_wasbs_url(blob)

    local_path = os.path.abspath(local_path)

    if use_basename:
        local_path = append_basename(local_path, blob)

    try:
        check_dirname_exists(local_path, is_dir=True)
    except PolyaxonPathException:
        os.makedirs(local_path)

    results = self.list(container_name=container_name, key=blob, delimiter="/")

    # Create directories
    for prefix in sorted(results["prefixes"]):
        dirname = os.path.join(local_path, prefix)
        prefix = os.path.join(blob, prefix)
        # Download files under the prefix recursively
        self.download_dir(
            blob=prefix,
            local_path=dirname,
            container_name=container_name,
            use_basename=False,
        )

    pool, future_results = self.init_pool(workers)

    # Download files
    for file_key in results["blobs"]:
        file_key = file_key[0]
        filename = os.path.join(local_path, file_key)
        file_key = os.path.join(blob, file_key)
        future_results = self.submit_pool(
            workers=workers,
            pool=pool,
            future_results=future_results,
            fn=self.download_file,
            blob=file_key,
            local_path=filename,
            container_name=container_name,
            use_basename=False,
        )

    if workers:
        futures.wait(future_results)
        self.close_pool(pool=pool)
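# Usage sketch (hypothetical names): workers > 0 fans the per-file downloads
# out over a thread pool via init_pool/submit_pool; workers=0 stays sequential.
#
#   store.download_dir(
#       blob="wasbs://container@account.blob.core.windows.net/outputs",
#       local_path="/tmp/outputs",
#       workers=4,  # download up to 4 blobs concurrently
#   )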
def download_file(self, blob, local_path, container_name=None, use_basename=True):
    """
    Downloads a file from Azure Blob service.

    Args:
        blob: `str`. blob to download.
        local_path: `str`. the path to download to.
        container_name: `str`. the name of the container.
        use_basename: `bool`. whether or not to use the basename of the blob.
    """
    if not container_name:
        container_name, _, blob = self.parse_wasbs_url(blob)

    local_path = os.path.abspath(local_path)

    if use_basename:
        local_path = append_basename(local_path, blob)

    try:
        check_dirname_exists(local_path)
    except PolyaxonPathException as e:
        raise PolyaxonStoresException("Connection error: %s" % e) from e

    try:
        self.connection.get_blob_to_path(container_name, blob, local_path)
    except AzureHttpError as e:
        raise PolyaxonStoresException("Connection error: %s" % e) from e
def download_file(self, blob, local_path, bucket_name=None, use_basename=True):
    """
    Downloads a file from Google Cloud Storage.

    Args:
        blob: `str`. blob to download.
        local_path: `str`. the path to download to.
        bucket_name: `str`. the name of the bucket.
        use_basename: `bool`. whether or not to use the basename of the blob.
    """
    if not bucket_name:
        bucket_name, blob = self.parse_gcs_url(blob)

    local_path = os.path.abspath(local_path)

    if use_basename:
        local_path = append_basename(local_path, blob)

    try:
        check_dirname_exists(local_path)
    except PolyaxonPathException as e:
        raise PolyaxonStoresException("Connection error: %s" % e) from e

    try:
        blob = self.get_blob(blob=blob, bucket_name=bucket_name)
        blob.download_to_filename(local_path)
    except (NotFound, GoogleAPIError) as e:
        raise PolyaxonStoresException("Connection error: %s" % e) from e
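# Usage sketch (hypothetical names): assuming a GCS store instance `store`, the
# blob can be addressed with a gs:// URL, and the bucket name is parsed from it:
#
#   store.download_file(
#       blob="gs://my-bucket/outputs/metrics.json",
#       local_path="/tmp/artifacts",
#   )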
def upload_dir(
    self,
    dirname,
    blob,
    container_name=None,
    use_basename=True,
    workers=0,
    last_time=None,
    exclude: List[str] = None,
):
    """
    Uploads a local directory to Azure Blob service.

    Args:
        dirname: `str`. name of the directory to upload.
        blob: `str`. blob to upload to.
        container_name: `str`. the name of the container.
        use_basename: `bool`. whether or not to use the basename of the directory.
        workers: `int`. number of worker threads to use for parallel execution.
        last_time: `datetime`. if provided, it will only upload the file if
            changed after last_time.
        exclude: `list`. List of paths to exclude.
    """
    if not container_name:
        container_name, _, blob = self.parse_wasbs_url(blob)

    if use_basename:
        blob = append_basename(blob, dirname)

    pool, future_results = self.init_pool(workers)

    # Turn the path into an absolute path
    dirname = os.path.abspath(dirname)
    with get_files_in_path_context(dirname, exclude=exclude) as files:
        for f in files:
            # If last_time is provided, only re-upload files modified since then
            if last_time and not file_modified_since(
                filepath=f, last_time=last_time
            ):
                continue
            file_blob = os.path.join(blob, os.path.relpath(f, dirname))
            future_results = self.submit_pool(
                workers=workers,
                pool=pool,
                future_results=future_results,
                fn=self.upload_file,
                filename=f,
                blob=file_blob,
                container_name=container_name,
                use_basename=False,
            )

    if workers:
        futures.wait(future_results)
        self.close_pool(pool=pool)
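# Usage sketch (hypothetical names): `last_time` and `exclude` make repeated
# uploads incremental, skipping unchanged files and ignored paths:
#
#   store.upload_dir(
#       dirname="/tmp/outputs",
#       blob="wasbs://container@account.blob.core.windows.net/runs/42",
#       workers=4,
#       last_time=previous_sync_time,  # hypothetical datetime of the last sync
#       exclude=["__pycache__"],
#   )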
def download_file(self, path_from, local_path, use_basename=True, **kwargs):
    """Copies a file from a local store path to `local_path`."""
    local_path = os.path.abspath(local_path)

    if use_basename:
        local_path = append_basename(local_path, path_from)

    if local_path == path_from:
        return

    check_or_create_path(local_path, is_dir=False)

    if os.path.exists(path_from) and os.path.isfile(path_from):
        shutil.copy(path_from, local_path)
def download_dir(self, blob, local_path, bucket_name=None, use_basename=True, workers=0):
    """
    Download a directory from Google Cloud Storage.

    Args:
        blob: `str`. blob to download.
        local_path: `str`. the path to download to.
        bucket_name: `str`. the name of the bucket.
        use_basename: `bool`. whether or not to use the basename of the blob.
        workers: `int`. number of worker threads to use for parallel execution.
    """
    if not bucket_name:
        bucket_name, blob = self.parse_gcs_url(blob)

    local_path = os.path.abspath(local_path)

    if use_basename:
        local_path = append_basename(local_path, blob)

    file_blobs = list(self.connection.list_blobs(bucket_name, prefix=blob))
    subdirs = {
        os.path.dirname(os.path.relpath(file_blob.name, blob))
        for file_blob in file_blobs
    }

    os.makedirs(local_path, exist_ok=True)
    for subdir in sorted(subdirs):
        os.makedirs(os.path.join(local_path, subdir), exist_ok=True)

    pool, future_results = self.init_pool(workers)

    # Download files
    for file_blob in file_blobs:
        filename = os.path.join(local_path, os.path.relpath(file_blob.name, blob))
        future_results = self.submit_pool(
            workers=workers,
            pool=pool,
            future_results=future_results,
            fn=_download_blob,
            blob=file_blob,
            local_path=filename,
        )

    if workers:
        futures.wait(future_results)
        self.close_pool(pool=pool)
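# The pool above submits the module-level `_download_blob` helper rather than a
# bound method, so each worker thread only touches its own blob object. A
# minimal sketch of what that helper needs to do, assuming `blob` is a
# google.cloud.storage Blob (the real helper may add retries or error mapping):
def _download_blob(blob, local_path):
    # Stream a single blob to disk; safe in a worker thread, no shared state.
    blob.download_to_filename(local_path)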
def upload_dir(
    self,
    dirname,
    blob,
    bucket_name=None,
    use_basename=True,
    workers=0,
    last_time=None,
):
    """
    Uploads a local directory to Google Cloud Storage.

    Args:
        dirname: `str`. name of the directory to upload.
        blob: `str`. blob to upload to.
        bucket_name: `str`. the name of the bucket.
        use_basename: `bool`. whether or not to use the basename of the directory.
        workers: `int`. number of worker threads to use for parallel execution.
        last_time: `datetime`. if provided, it will only upload the file if
            changed after last_time.
    """
    if not bucket_name:
        bucket_name, blob = self.parse_gcs_url(blob)

    if use_basename:
        blob = append_basename(blob, dirname)

    pool, future_results = self.init_pool(workers)

    # Turn the path into an absolute path
    dirname = os.path.abspath(dirname)
    with get_files_in_path_context(dirname) as files:
        for f in files:
            # If last_time is provided, only re-upload files modified since then
            if last_time and not file_modified_since(filepath=f, last_time=last_time):
                continue
            file_blob = os.path.join(blob, os.path.relpath(f, dirname))
            future_results = self.submit_pool(
                workers=workers,
                pool=pool,
                future_results=future_results,
                fn=self.upload_file,
                filename=f,
                blob=file_blob,
                bucket_name=bucket_name,
                use_basename=False,
            )

    if workers:
        futures.wait(future_results)
        self.close_pool(pool=pool)
def upload_file(
    self,
    filename,
    key,
    bucket_name=None,
    overwrite=True,
    encrypt=False,
    acl=None,
    use_basename=True,
):
    """
    Uploads a local file to S3.

    Args:
        filename: `str`. name of the file to upload.
        key: `str`. S3 key that will point to the file.
        bucket_name: `str`. Name of the bucket in which to store the file.
        overwrite: `bool`. A flag to decide whether or not to overwrite the key
            if it already exists. If overwrite is False and the key exists,
            an error will be raised.
        encrypt: `bool`. If True, the file will be encrypted on the server-side
            by S3 and will be stored in an encrypted form while at rest in S3.
        acl: `str`. ACL to use for uploading, e.g. "public-read".
        use_basename: `bool`. whether or not to use the basename of the filename.
    """
    if not bucket_name:
        bucket_name, key = self.parse_s3_url(key)

    if use_basename:
        key = append_basename(key, filename)

    if not overwrite and self.check_key(key, bucket_name):
        raise PolyaxonStoresException("The key {} already exists.".format(key))

    extra_args = {}
    if encrypt:
        extra_args["ServerSideEncryption"] = self.ENCRYPTION
    if acl:
        extra_args["ACL"] = acl

    self.connection.upload_file(filename, bucket_name, key, ExtraArgs=extra_args)
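# Usage sketch (hypothetical names): server-side encryption and ACLs are
# forwarded to boto3's upload_file through ExtraArgs:
#
#   store.upload_file(
#       filename="/tmp/model.h5",
#       key="s3://my-bucket/outputs/model.h5",  # bucket name parsed from the URL
#       encrypt=True,       # adds ServerSideEncryption to ExtraArgs
#       acl="public-read",  # adds ACL to ExtraArgs
#   )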
def upload_file(self, filename, blob, bucket_name=None, use_basename=True):
    """
    Uploads a local file to Google Cloud Storage.

    Args:
        filename: `str`. the file to upload.
        blob: `str`. blob to upload to.
        bucket_name: `str`. the name of the bucket.
        use_basename: `bool`. whether or not to use the basename of the filename.
    """
    if not bucket_name:
        bucket_name, blob = self.parse_gcs_url(blob)

    if use_basename:
        blob = append_basename(blob, filename)

    bucket = self.get_bucket(bucket_name)
    bucket.blob(blob).upload_from_filename(filename)
def upload_file(self, filename, blob, container_name=None, use_basename=True):
    """
    Uploads a local file to Azure Blob service.

    Args:
        filename: `str`. the file to upload.
        blob: `str`. blob to upload to.
        container_name: `str`. the name of the container.
        use_basename: `bool`. whether or not to use the basename of the filename.
    """
    if not container_name:
        container_name, _, blob = self.parse_wasbs_url(blob)

    if use_basename:
        blob = append_basename(blob, filename)

    client = self.connection.get_container_client(container_name)
    with open(filename, "rb") as file:
        client.upload_blob(blob, file, overwrite=True)
def upload_dir(
    self,
    dirname,
    path_to,
    use_basename=True,
    workers=0,
    last_time=None,
    exclude: List[str] = None,
):
    """Copies a local directory to `path_to` on the local store."""
    if use_basename:
        path_to = append_basename(path_to, dirname)

    if dirname == path_to:
        return

    check_or_create_path(path_to, is_dir=True)

    pool, future_results = self.init_pool(workers)

    # Turn the path into an absolute path
    dirname = os.path.abspath(dirname)
    with get_files_in_path_context(dirname, exclude=exclude) as files:
        for f in files:
            # If last_time is provided, only re-upload files modified since then
            if last_time and not file_modified_since(
                filepath=f, last_time=last_time
            ):
                continue
            file_blob = os.path.join(path_to, os.path.relpath(f, dirname))
            future_results = self.submit_pool(
                workers=workers,
                pool=pool,
                future_results=future_results,
                fn=self.upload_file,
                filename=f,
                path_to=file_blob,
                use_basename=False,
            )

    if workers:
        futures.wait(future_results)
        self.close_pool(pool=pool)
def upload_file(self, filename, blob, container_name=None, use_basename=True):
    """
    Uploads a local file to Azure Blob service.

    Args:
        filename: `str`. the file to upload.
        blob: `str`. blob to upload to.
        container_name: `str`. the name of the container.
        use_basename: `bool`. whether or not to use the basename of the filename.
    """
    if not container_name:
        container_name, _, blob = self.parse_wasbs_url(blob)

    if use_basename:
        blob = append_basename(blob, filename)

    self.connection.create_blob_from_path(container_name, blob, filename)
def download_dir(
    self,
    key: str,
    local_path: str,
    bucket_name: str = None,
    use_basename: bool = True,
    workers: int = 0,
):
    """
    Download a directory from S3.

    Args:
        key: `str`. S3 key that will point to the directory.
        local_path: `str`. the path to download to.
        bucket_name: `str`. the name of the bucket to download from.
        use_basename: `bool`. whether or not to use the basename of the key.
        workers: `int`. number of worker threads to use for parallel execution.
    """
    if not bucket_name:
        bucket_name, key = self.parse_s3_url(key)

    local_path = os.path.abspath(local_path)

    if use_basename:
        local_path = append_basename(local_path, key)

    try:
        check_dirname_exists(local_path, is_dir=True)
    except PolyaxonPathException:
        os.makedirs(local_path)

    results = self.list(bucket_name=bucket_name, prefix=key, delimiter="/")

    # Create directories
    for prefix in sorted(results["prefixes"]):
        dirname = os.path.join(local_path, prefix)
        prefix = os.path.join(key, prefix)
        # Download files under the prefix recursively
        self.download_dir(
            key=prefix,
            local_path=dirname,
            bucket_name=bucket_name,
            use_basename=False,
        )

    pool, future_results = self.init_pool(workers)

    # Download files
    for file_key in results["keys"]:
        file_key = file_key[0]
        filename = os.path.join(local_path, file_key)
        file_key = os.path.join(key, file_key)
        future_results = self.submit_pool(
            workers=workers,
            pool=pool,
            future_results=future_results,
            fn=self.download_file,
            key=file_key,
            local_path=filename,
            bucket_name=bucket_name,
            use_basename=False,
        )

    if workers:
        futures.wait(future_results)
        self.close_pool(pool=pool)
def upload_dir(
    self,
    dirname,
    key,
    bucket_name=None,
    overwrite=False,
    encrypt=False,
    acl=None,
    use_basename=True,
    workers=0,
    last_time=None,
):
    """
    Uploads a local directory to S3.

    Args:
        dirname: `str`. name of the directory to upload.
        key: `str`. S3 key that will point to the file.
        bucket_name: `str`. Name of the bucket in which to store the file.
        overwrite: `bool`. A flag to decide whether or not to overwrite the key
            if it already exists. If overwrite is False and the key exists,
            an error will be raised.
        encrypt: `bool`. If True, the file will be encrypted on the server-side
            by S3 and will be stored in an encrypted form while at rest in S3.
        acl: `str`. ACL to use for uploading, e.g. "public-read".
        use_basename: `bool`. whether or not to use the basename of the directory.
        workers: `int`. number of worker threads to use for parallel execution.
        last_time: `datetime`. if provided, it will only upload the file if
            changed after last_time.
    """
    if not bucket_name:
        bucket_name, key = self.parse_s3_url(key)

    if use_basename:
        key = append_basename(key, dirname)

    pool, future_results = self.init_pool(workers)

    # Turn the path into an absolute path
    dirname = os.path.abspath(dirname)
    with get_files_in_path_context(dirname) as files:
        for f in files:
            # If last_time is provided, only re-upload files modified since then
            if last_time and not file_modified_since(
                filepath=f, last_time=last_time
            ):
                continue
            file_key = os.path.join(key, os.path.relpath(f, dirname))
            future_results = self.submit_pool(
                workers=workers,
                pool=pool,
                future_results=future_results,
                fn=self.upload_file,
                filename=f,
                key=file_key,
                bucket_name=bucket_name,
                overwrite=overwrite,
                encrypt=encrypt,
                acl=acl,
                use_basename=False,
            )

    if workers:
        futures.wait(future_results)
        self.close_pool(pool=pool)
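# Usage sketch (hypothetical names): an incremental, parallel directory sync
# that only re-uploads files modified since the previous run:
#
#   store.upload_dir(
#       dirname="/tmp/outputs",
#       key="s3://my-bucket/runs/42",
#       workers=8,
#       last_time=previous_sync_time,  # hypothetical datetime of the last sync
#   )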
def test_append_basename(self):
    assert append_basename("foo", "bar") == "foo/bar"
    assert append_basename("foo", "moo/bar") == "foo/bar"
    assert append_basename("/foo", "bar") == "/foo/bar"
    assert append_basename("/foo/moo", "bar") == "/foo/moo/bar"
    assert append_basename("/foo/moo", "boo/bar.txt") == "/foo/moo/bar.txt"
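# A minimal sketch of the behavior these assertions pin down (the real
# append_basename lives elsewhere in the codebase): join the first path with
# the basename of the second, discarding any directory components.
import os

def append_basename(path, filename):
    # e.g. append_basename("foo", "moo/bar") -> "foo/bar"
    return os.path.join(path, os.path.basename(filename))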