def _initiate_upload(self):
    if not self.autocommit and not self.append_block and self.tell() < self.blocksize:
        # only happens when closing a small file; a one-shot PUT is used instead
        return
    logger.debug("Initiate upload for %s" % self)
    self.parts = []
    try:
        self.mpu = self._call_s3(self.fs.s3.create_multipart_upload,
                                 Bucket=self.bucket, Key=self.key,
                                 ACL=self.acl)
    except ClientError as e:
        raise translate_boto_error(e)
    except ParamValidationError as e:
        raise ValueError('Initiating write to %r failed: %s' % (self.path, e))

    if self.append_block:
        # use existing data in key when appending,
        # and block is big enough
        out = self.fs._call_s3(self.fs.s3.upload_part_copy,
                               self.s3_additional_kwargs,
                               Bucket=self.bucket,
                               Key=self.key,
                               PartNumber=1,
                               UploadId=self.mpu['UploadId'],
                               CopySource=self.path)
        self.parts.append({
            'PartNumber': 1,
            'ETag': out['CopyPartResult']['ETag']
        })

def wrapper(*args, **kwargs):
    try:
        # return the wrapped result so the decorator does not swallow it
        return func(*args, **kwargs)
    except Exception as exc:
        from s3fs.errors import translate_boto_error
        raise translate_boto_error(exc)

def bulk_delete(self, pathlist, **kwargs):
    """
    Remove multiple keys with one call

    Parameters
    ----------
    pathlist : list of strings
        The keys to remove; must all be in the same bucket.
    """
    if not pathlist:
        return
    buckets = {split_path(path)[0] for path in pathlist}
    if len(buckets) > 1:
        raise ValueError("Bulk delete files should refer to only one bucket")
    bucket = buckets.pop()
    if len(pathlist) > 1000:
        for i in range((len(pathlist) // 1000) + 1):
            self.bulk_delete(pathlist[i * 1000:(i + 1) * 1000])
        return
    delete_keys = {
        'Objects': [{'Key': split_path(path)[1]} for path in pathlist]
    }
    for path in pathlist:
        self.invalidate_cache(self._parent(path))
    try:
        self._call_s3(self.s3.delete_objects, kwargs,
                      Bucket=bucket, Delete=delete_keys)
    except ClientError as e:
        raise translate_boto_error(e)

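# Hedged usage sketch for bulk_delete above. The S3FileSystem instance and the
# bucket/key names are placeholders; the points illustrated are that all paths
# must share one bucket and that lists longer than 1000 keys are re-batched
# automatically (1000 is the per-call limit of the S3 DeleteObjects API).
from s3fs import S3FileSystem

fs = S3FileSystem()  # assumes AWS credentials are configured
fs.bulk_delete(['example-bucket/data/part-0000.csv',
                'example-bucket/data/part-0001.csv'])
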
def info(self, path, version_id=None):
    if path in ['/', '']:
        return {'name': path, 'size': 0, 'type': 'directory'}
    kwargs = self.kwargs.copy()
    if version_id is not None:
        if not self.version_aware:
            raise ValueError("version_id cannot be specified if the "
                             "filesystem is not version aware")
        kwargs['VersionId'] = version_id
    if self.version_aware:
        try:
            bucket, key = split_path(path)
            out = self._call_s3(self.s3.head_object, kwargs, Bucket=bucket,
                                Key=key, **self.req_kw)
            return {
                'ETag': out['ETag'],
                'Key': '/'.join([bucket, key]),
                'LastModified': out['LastModified'],
                'Size': out['ContentLength'],
                'size': out['ContentLength'],
                'path': '/'.join([bucket, key]),
                'StorageClass': "STANDARD",
                'VersionId': out.get('VersionId')
            }
        except ClientError as e:
            raise translate_boto_error(e)
        except ParamValidationError as e:
            raise ValueError('Failed to head path %r: %s' % (path, e))
    return super().info(path)

def _initiate_upload(self):
    if self.acl and self.acl not in key_acls:
        raise ValueError('ACL not in %s' % key_acls)
    self.parts = []
    self.size = 0
    if self.blocksize < 5 * 2**20:
        raise ValueError('Block size must be >=5MB')
    try:
        self.mpu = self._call_s3(self.fs.s3.create_multipart_upload,
                                 Bucket=self.bucket, Key=self.key,
                                 ACL=self.acl)
    except ClientError as e:
        raise translate_boto_error(e)
    except ParamValidationError as e:
        raise ValueError('Initiating write to %r failed: %s' % (self.path, e))

    if 'a' in self.mode and self.fs.exists(self.path):
        if self.append_block:
            # use existing data in key when appending,
            # and block is big enough
            out = self.fs._call_s3(self.fs.s3.upload_part_copy,
                                   self.s3_additional_kwargs,
                                   Bucket=self.bucket,
                                   Key=self.key,
                                   PartNumber=1,
                                   UploadId=self.mpu['UploadId'],
                                   CopySource=self.path)
            self.parts.append({
                'PartNumber': 1,
                'ETag': out['CopyPartResult']['ETag']
            })

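# A minimal sketch (not s3fs code) of the multipart-upload lifecycle that the
# _initiate_upload methods above begin, written with plain boto3. Bucket and
# key names are placeholders. S3 requires every part except the last to be at
# least 5 MB, which is why the block-size check above rejects smaller values.
import boto3

s3 = boto3.client('s3')
mpu = s3.create_multipart_upload(Bucket='example-bucket', Key='example-key')
part = s3.upload_part(Bucket='example-bucket', Key='example-key',
                      PartNumber=1, UploadId=mpu['UploadId'],
                      Body=b'x' * (5 * 2 ** 20))
s3.complete_multipart_upload(
    Bucket='example-bucket', Key='example-key', UploadId=mpu['UploadId'],
    MultipartUpload={'Parts': [{'PartNumber': 1, 'ETag': part['ETag']}]})
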
def _lsdir(self, path, refresh=False, max_items=None):
    if path.startswith('s3://'):
        path = path[len('s3://'):]
    path = path.rstrip('/')
    bucket, prefix = split_path(path)
    prefix = prefix + '/' if prefix else ""
    if path not in self.dircache or refresh:
        try:
            pag = self.s3.get_paginator('list_objects_v2')
            config = {}
            if max_items is not None:
                config.update(MaxItems=max_items, PageSize=2 * max_items)
            it = pag.paginate(Bucket=bucket, Prefix=prefix, Delimiter='/',
                              PaginationConfig=config, **self.req_kw)
            files = []
            dircache = []
            for i in it:
                dircache.extend(i.get('CommonPrefixes', []))
                for c in i.get('Contents', []):
                    c['type'] = 'file'
                    c['size'] = c['Size']
                    files.append(c)
            if dircache:
                files.extend([{'Key': l['Prefix'][:-1], 'Size': 0,
                               'StorageClass': "DIRECTORY",
                               'type': 'directory', 'size': 0}
                              for l in dircache])
            for f in files:
                f['Key'] = '/'.join([bucket, f['Key']])
                f['name'] = f['Key']
        except ClientError as e:
            raise translate_boto_error(e)

        self.dircache[path] = files
    return self.dircache[path]

def _fetch_range(client, bucket, key, version_id, start, end,
                 max_attempts=10, req_kw=None):
    if req_kw is None:
        req_kw = {}
    logger.debug("Fetch: %s/%s, %s-%s", bucket, key, start, end)
    for i in range(max_attempts):
        try:
            if version_id is not None:
                kwargs = dict({'VersionId': version_id}, **req_kw)
            else:
                kwargs = req_kw
            resp = client.get_object(Bucket=bucket, Key=key,
                                     Range='bytes=%i-%i' % (start, end - 1),
                                     **kwargs)
            return resp['Body'].read()
        except S3_RETRYABLE_ERRORS as e:
            logger.debug('Exception %r on S3 download, retrying', e,
                         exc_info=True)
            continue
        except ConnectionError as e:
            logger.debug('ConnectionError %r on S3 download, retrying', e,
                         exc_info=True)
            continue
        except ClientError as e:
            if e.response['Error'].get('Code', 'Unknown') in ['416',
                                                              'InvalidRange']:
                return b''
            raise translate_boto_error(e)
        except Exception as e:
            if 'time' in str(e).lower():  # Actual exception type changes often
                continue
            else:
                raise
    raise RuntimeError("Max number of S3 retries exceeded")

def rmdir(self, path):
    path = self._strip_protocol(path).rstrip('/')
    if not self._parent(path):
        try:
            self.s3.delete_bucket(Bucket=path)
        except ClientError as e:
            raise translate_boto_error(e)
        self.invalidate_cache(path)
        self.invalidate_cache('')

def _fetch_range(client, bucket, key, version_id, start, end,
                 max_attempts=10, req_kw=None):
    if req_kw is None:
        req_kw = {}
    if start == end:
        # When these match, we would make a request with `range=start-end - 1`
        # According to RFC2616, servers are supposed to ignore the Range
        # field when it's invalid like this. S3 does ignore it, moto doesn't.
        # To avoid differences in behavior under mocking, we just avoid
        # making these requests. It's hoped that since we're being called
        # from a caching object, this won't end up mattering.
        logger.debug(
            'skip fetch for negative range - bucket=%s,key=%s,start=%d,end=%d',
            bucket, key, start, end)
        return b''
    logger.debug("Fetch: %s/%s, %s-%s", bucket, key, start, end)
    for i in range(max_attempts):
        try:
            if version_id is not None:
                kwargs = dict({'VersionId': version_id}, **req_kw)
            else:
                kwargs = req_kw
            resp = client.get_object(Bucket=bucket, Key=key,
                                     Range='bytes=%i-%i' % (start, end - 1),
                                     **kwargs)
            return resp['Body'].read()
        except S3_RETRYABLE_ERRORS as e:
            logger.debug('Exception %r on S3 download, retrying', e,
                         exc_info=True)
            time.sleep(1.7**i * 0.1)
            continue
        except ConnectionError as e:
            logger.debug('ConnectionError %r on S3 download, retrying', e,
                         exc_info=True)
            time.sleep(1.7**i * 0.1)
            continue
        except ClientError as e:
            if e.response['Error'].get('Code', 'Unknown') in ['416',
                                                              'InvalidRange']:
                return b''
            raise translate_boto_error(e)
        except Exception as e:
            if 'time' in str(e).lower():  # Actual exception type changes often
                continue
            else:
                raise
    raise RuntimeError("Max number of S3 retries exceeded")

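# Illustration of the retry schedule used in _fetch_range above: the wait
# before retrying attempt i is 1.7**i * 0.1 seconds, i.e. roughly 0.10s,
# 0.17s, 0.29s, 0.49s, ... growing geometrically up to max_attempts tries.
for i in range(5):
    print("retry %d: sleep %.2fs" % (i, 1.7 ** i * 0.1))
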
def touch(self, path, truncate=True, data=None, **kwargs):
    """Create empty file or truncate"""
    bucket, key = split_path(path)
    if not truncate and self.exists(path):
        raise ValueError("S3 does not support touching existent files")
    try:
        self._call_s3(self.s3.put_object, kwargs, Bucket=bucket, Key=key)
    except ClientError as ex:
        raise translate_boto_error(ex)
    self.invalidate_cache(self._parent(path))

def export_artifacts(experiment: Dict[str, str], report_path: str,
                     experiment_output_directory: str,
                     export_base_path: str) -> None:
    """Save the experiment artifacts to the `bench_export_directory`.

    experiment: experiment dict that contains "dataset_name" (e.g. ames_housing),
        "experiment_name" (specified by user), and "config_path" (path to experiment
        config. Relative to ludwig/benchmarks/configs).
    report_path: path where the experiment metrics report is saved.
    experiment_output_directory: path where the model, data, and logs of the
        experiment are saved.
    export_base_path: remote or local path (directory) where artifacts are exported.
        (e.g. s3://benchmarking.us-west-2.ludwig.com/bench/ or your/local/bench/)
    """
    protocol, _ = fsspec.core.split_protocol(export_base_path)
    fs, _ = get_fs_and_path(export_base_path)
    try:
        export_full_path = os.path.join(export_base_path, experiment["dataset_name"],
                                        experiment["experiment_name"])
        fs.put(report_path, os.path.join(export_full_path, REPORT_JSON), recursive=True)
        fs.put(
            os.path.join("configs", experiment["config_path"]),
            os.path.join(export_full_path, CONFIG_YAML),
            recursive=True,
        )
        fs.put(
            os.path.join(experiment["dataset_name"], EXPERIMENT_RUN, "model",
                         MODEL_HYPERPARAMETERS_FILE_NAME),
            os.path.join(export_full_path, MODEL_HYPERPARAMETERS_FILE_NAME),
            recursive=True,
        )

        # zip experiment directory to export
        try:
            shutil.make_archive("artifacts", "zip", experiment_output_directory)
            fs.put("artifacts.zip", os.path.join(export_full_path, "artifacts.zip"),
                   recursive=True)
            os.remove("artifacts.zip")
        except Exception as e:
            logging.error(f"Couldn't export '{experiment_output_directory}' to bucket")
            logging.error(e)

        print("Uploaded metrics report and experiment config to\n\t", export_full_path)
    except ClientError as e:
        logging.error(translate_boto_error(e))

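# Hedged usage sketch for export_artifacts above; the dataset name, experiment
# name, and paths are placeholders that mirror the examples in the docstring.
experiment = {
    "dataset_name": "ames_housing",
    "experiment_name": "baseline",
    "config_path": "ames_housing.yaml",
}
export_artifacts(experiment,
                 report_path="report.json",
                 experiment_output_directory="ames_housing/experiment_run",
                 export_base_path="s3://benchmarking.us-west-2.ludwig.com/bench/")
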
def copy_managed(self, path1, path2, **kwargs):
    buc1, key1 = split_path(path1)
    buc2, key2 = split_path(path2)
    copy_source = {'Bucket': buc1, 'Key': key1}
    try:
        self.s3.copy(CopySource=copy_source, Bucket=buc2, Key=key2,
                     ExtraArgs=self._get_s3_method_kwargs(
                         self.s3.copy_object, kwargs))
    except ClientError as e:
        raise translate_boto_error(e)
    except ParamValidationError as e:
        raise ValueError('Copy failed (%r -> %r): %s' % (path1, path2, e))

def copy_basic(self, path1, path2, **kwargs):
    """ Copy file between locations on S3 """
    buc1, key1 = split_path(path1)
    buc2, key2 = split_path(path2)
    try:
        self._call_s3(self.s3.copy_object, kwargs,
                      Bucket=buc2, Key=key2,
                      CopySource='/'.join([buc1, key1]))
    except ClientError as e:
        raise translate_boto_error(e)
    except ParamValidationError as e:
        raise ValueError('Copy failed (%r -> %r): %s' % (path1, path2, e))

def rm(self, path, recursive=False, **kwargs):
    """
    Remove keys and/or bucket.

    Parameters
    ----------
    path : string
        The location to remove.
    recursive : bool (False)
        Whether to also remove all entries below, i.e., those returned
        by `walk()`.
    """
    bucket, key = split_path(path)
    if recursive:
        files = self.find(path, maxdepth=None)
        if key and not files:
            raise FileNotFoundError(path)
        self.bulk_delete(files, **kwargs)
        if not key:
            self.rmdir(bucket)
        return
    if key:
        if not self.exists(path):
            raise FileNotFoundError(path)
        try:
            self._call_s3(self.s3.delete_object, kwargs, Bucket=bucket, Key=key)
        except ClientError as e:
            raise translate_boto_error(e)
        self.invalidate_cache(self._parent(path))
    else:
        if self.exists(bucket):
            try:
                self.s3.delete_bucket(Bucket=bucket)
            except BotoCoreError as e:
                raise IOError('Delete bucket %r failed: %s' % (bucket, e))
            self.invalidate_cache(bucket)
            self.invalidate_cache('')
        else:
            raise FileNotFoundError(path)

def mkdir(self, path, acl="", **kwargs): path = self._strip_protocol(path).rstrip('/') if not self._parent(path): if acl and acl not in buck_acls: raise ValueError('ACL not in %s', buck_acls) try: params = {"Bucket": path, 'ACL': acl} region_name = (kwargs.get("region_name", None) or self.client_kwargs.get("region_name", None)) if region_name: params['CreateBucketConfiguration'] = { 'LocationConstraint': region_name } self.s3.create_bucket(**params) self.invalidate_cache('') self.invalidate_cache(path) except ClientError as e: raise translate_boto_error(e) except ParamValidationError as e: raise ValueError('Bucket create failed %r: %s' % (path, e))