def __init__(
    self,
    simple_links=True,
    block_size=None,
    same_scheme=True,
    size_policy=None,
    cache_type="bytes",
    cache_options=None,
    asynchronous=False,
    loop=None,
    client_kwargs=None,
    **storage_options,
):
    """
    NB: if this is called async, you must await set_session

    Parameters
    ----------
    block_size: int
        Blocks to read bytes; if 0, will default to raw requests file-like
        objects instead of HTTPFile instances
    simple_links: bool
        If True, will consider both HTML <a> tags and anything that looks
        like a URL; if False, will consider only the former.
    same_scheme: bool
        When doing ls/glob, if this is True, only consider paths that have
        http/https matching the input URLs.
    size_policy: this argument is deprecated
    client_kwargs: dict
        Passed to aiohttp.ClientSession, see
        https://docs.aiohttp.org/en/stable/client_reference.html
        For example, ``{'auth': aiohttp.BasicAuth('user', 'pass')}``
    storage_options: key-value
        Any other parameters passed on to requests
    cache_type, cache_options: defaults used in open
    """
    super().__init__(self, asynchronous=asynchronous, loop=loop, **storage_options)
    self.block_size = block_size if block_size is not None else DEFAULT_BLOCK_SIZE
    self.simple_links = simple_links
    self.same_schema = same_scheme
    self.cache_type = cache_type
    self.cache_options = cache_options
    self.client_kwargs = client_kwargs or {}
    self.kwargs = storage_options
    self._session = None

    # Clean caching-related parameters from `storage_options`
    # before propagating them as `request_options` through `self.kwargs`.
    # TODO: Maybe rename `self.kwargs` to `self.request_options` to make
    # it clearer.
    request_options = copy(storage_options)
    self.use_listings_cache = request_options.pop("use_listings_cache", False)
    request_options.pop("listings_expiry_time", None)
    request_options.pop("max_paths", None)
    request_options.pop("skip_instance_cache", None)
    self.kwargs = request_options

    if not asynchronous:
        sync(self.loop, self.set_session)
def close_session(loop, session):
    if loop is not None and session is not None:
        if loop.is_running():
            try:
                sync(loop, session.close, timeout=0.1)
            except fsspec.FSTimeoutError:
                pass
        else:
            # The loop is no longer running, so the session cannot be
            # closed cleanly; leave it to be garbage collected.
            pass
def close_session(loop, session):
    if loop is not None and loop.is_running():
        try:
            sync(loop, session.close, timeout=0.1)
            return
        except (TimeoutError, FSTimeoutError):
            pass
    if session._connector is not None:
        # close after loop is dead
        session._connector._close()
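Most of the snippets in this section funnel through fsspec.asyn.sync(loop, coroutine_fn, *args, timeout=..., **kwargs), which submits the coroutine to fsspec's dedicated IO loop (running in a background thread) and blocks for the result. The following is a minimal, self-contained sketch of that calling convention; it assumes a recent fsspec that exposes get_loop and FSTimeoutError, and fetch_status/make_session are illustrative helpers, not part of any snippet above.

# Sketch of the sync(loop, coro_fn, ...) pattern used throughout these snippets.
import aiohttp
import fsspec
from fsspec.asyn import get_loop, sync


async def fetch_status(url):
    # a throwaway session created and closed entirely on the IO loop
    async with aiohttp.ClientSession() as session:
        async with session.get(url) as resp:
            return resp.status


loop = get_loop()  # fsspec's long-running IO loop in a background thread
status = sync(loop, fetch_status, "https://example.com", timeout=10)


async def make_session(**kwargs):
    # aiohttp sessions should be created on the loop that will use them
    return aiohttp.ClientSession(**kwargs)


session = sync(loop, make_session)
try:
    # mirror close_session above: give the close coroutine a short deadline
    sync(loop, session.close, timeout=0.1)
except fsspec.FSTimeoutError:
    pass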
def test_write_small_secure(s3):
    # Unfortunately moto does not yet support enforcing SSE policies. It also
    # does not return the correct objects that can be used to test the results
    # effectively.
    # This test is left as a placeholder in case moto eventually supports this.
    sse_params = SSEParams(server_side_encryption='aws:kms')
    with s3.open(secure_bucket_name + '/test', 'wb', writer_kwargs=sse_params) as f:
        f.write(b'hello')
    assert s3.cat(secure_bucket_name + '/test') == b'hello'

    sync(s3.loop, s3.s3.head_object, Bucket=secure_bucket_name, Key='test')
def __init__( self, fs, url, mode="rb", asynchronous=False, session=None, loop=None, **kwargs ): path = fs._strip_protocol(url) url = URL(fs.webdav_url) / path self.url = url.as_uri() self.details = {"name": self.url, "size": None} self.asynchronous = asynchronous self.session = session self.loop = loop super(HTTPStreamFile, self).__init__( fs=fs, path=path, mode=mode, block_size=0, cache_type="none", cache_options={}, **kwargs) if self.mode == "rb": self.r = sync(self.loop, self.session.get, self.url, **self.kwargs) elif self.mode == "wb": pass else: raise ValueError
def _simple_upload(self):
    """One-shot upload, less than 5MB"""
    self.buffer.seek(0)
    data = self.buffer.read()
    sync(
        self.gcsfs.loop,
        simple_upload,
        self.gcsfs,
        self.bucket,
        self.key,
        data,
        self.metadata,
        self.consistency,
        self.content_type,
        timeout=self.timeout,
    )
def __init__(
    self,
    api_url=None,
    webdav_url=None,
    username=None,
    password=None,
    token=None,
    block_size=None,
    asynchronous=False,
    loop=None,
    client_kwargs=None,
    **storage_options
):
    """
    NB: if this is called async, you must await set_client

    Parameters
    ----------
    block_size: int
        Blocks to read bytes; if 0, will default to raw requests file-like
        objects instead of HTTPFile instances
    client_kwargs: dict
        Passed to aiohttp.ClientSession, see
        https://docs.aiohttp.org/en/stable/client_reference.html
        For example, ``{'auth': aiohttp.BasicAuth('user', 'pass')}``
    storage_options: key-value
        Any other parameters passed on to requests
    """
    super().__init__(
        self, asynchronous=asynchronous, loop=loop, **storage_options
    )
    self.api_url = api_url
    self.webdav_url = webdav_url
    self.client_kwargs = client_kwargs or {}
    if (username is None) ^ (password is None):
        raise ValueError('Username or password not provided')
    if (username is not None) and (password is not None):
        self.client_kwargs.update(
            auth=aiohttp.BasicAuth(username, password)
        )
    if token is not None:
        if password is not None:
            raise ValueError('Provide either token or username/password')
        headers = self.client_kwargs.get('headers', {})
        headers.update(Authorization=f'Bearer {token}')
        self.client_kwargs.update(headers=headers)
    block_size = DEFAULT_BLOCK_SIZE if block_size is None else block_size
    self.block_size = block_size
    self.kwargs = storage_options
    if not asynchronous:
        self._session = sync(self.loop, get_client, **self.client_kwargs)
        weakref.finalize(self, sync, self.loop, self.session.close)
    else:
        self._session = None
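A hedged usage sketch for the constructor above. The enclosing class is not shown in this excerpt, so WebDAVFileSystem is a hypothetical placeholder name, and the URLs and credentials are illustrative; the two calls trace the credential handling the code implements: username/password become an aiohttp.BasicAuth on the client session, a token becomes a Bearer Authorization header, and mixing a token with a password raises ValueError.

# `WebDAVFileSystem` is a hypothetical name for the class defining the
# constructor above; values are illustrative only.

# username/password -> aiohttp.BasicAuth attached via client_kwargs
fs = WebDAVFileSystem(
    api_url="https://example.org/api/v1",
    webdav_url="https://example.org/webdav",
    username="alice",
    password="secret",
)

# token -> "Authorization: Bearer <token>" header; passing a token together
# with a password raises ValueError
fs = WebDAVFileSystem(
    api_url="https://example.org/api/v1",
    webdav_url="https://example.org/webdav",
    token="my-oidc-token",
)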
def __init__(self, fs, url, mode="rb", loop=None, session=None, **kwargs): self.asynchronous = kwargs.pop("asynchronous", False) self.url = url self.loop = loop self.session = session if mode != "rb": raise ValueError self.details = {"name": url, "size": None} super().__init__(fs=fs, path=url, mode=mode, cache_type="none", **kwargs) self.r = sync(self.loop, get, self.session, url, **kwargs)
def write(self, data):
    if self.mode != "wb":
        raise ValueError("File not in write mode")
    self.r = sync(
        self.loop, self.session.put, self.url, data=data, **self.kwargs
    )
    self.r.raise_for_status()
def _initiate_upload(self):
    """Create multi-upload"""
    self.location = sync(
        self.gcsfs.loop,
        initiate_upload,
        self.gcsfs,
        self.bucket,
        self.key,
        self.content_type,
        self.metadata,
        timeout=self.timeout,
    )
def _open(
    self,
    path,
    mode="rb",
    block_size=None,
    autocommit=None,  # XXX: This differs from the base class.
    cache_type=None,
    cache_options=None,
    size=None,
    **kwargs,
):
    """Make a file-like object

    Parameters
    ----------
    path: str
        Full URL with protocol
    mode: string
        must be "rb"
    block_size: int or None
        Bytes to download in one request; use instance value if None. If
        zero, will return a streaming Requests file-like instance.
    kwargs: key-value
        Any other parameters, passed to requests calls
    """
    if mode != "rb":
        raise NotImplementedError
    block_size = block_size if block_size is not None else self.block_size
    kw = self.kwargs.copy()
    kw["asynchronous"] = self.asynchronous
    kw.update(kwargs)
    size = size or self.size(path)
    session = sync(self.loop, self.set_session)
    if block_size and size:
        return HTTPFile(
            self,
            path,
            session=session,
            block_size=block_size,
            mode=mode,
            size=size,
            cache_type=cache_type or self.cache_type,
            cache_options=cache_options or self.cache_options,
            loop=self.loop,
            **kw,
        )
    else:
        return HTTPStreamFile(
            self, path, mode=mode, loop=self.loop, session=session, **kw
        )
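A small usage sketch for _open above, assuming the standard fsspec entry point where fsspec.filesystem("http") returns an HTTPFileSystem instance; the URL is illustrative.

# Hedged sketch: default block_size vs. block_size=0 (streaming), per the
# docstring of _open above.
import fsspec

fs = fsspec.filesystem("http")

# Default path: a non-zero block_size plus a discoverable size yields a
# random-access HTTPFile that fetches byte ranges on demand.
with fs.open("https://example.com/data.bin", mode="rb") as f:
    header = f.read(1024)

# block_size=0 takes the else branch and returns HTTPStreamFile, which reads
# the body sequentially (also what you get when the server reports no size).
with fs.open("https://example.com/data.bin", mode="rb", block_size=0) as f:
    first_chunk = f.read(2**20)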
def __init__(
    self,
    simple_links=True,
    block_size=None,
    same_scheme=True,
    size_policy=None,
    cache_type="bytes",
    cache_options=None,
    asynchronous=False,
    loop=None,
    client_kwargs=None,
    **storage_options,
):
    """
    NB: if this is called async, you must await set_client

    Parameters
    ----------
    block_size: int
        Blocks to read bytes; if 0, will default to raw requests file-like
        objects instead of HTTPFile instances
    simple_links: bool
        If True, will consider both HTML <a> tags and anything that looks
        like a URL; if False, will consider only the former.
    same_scheme: True
        When doing ls/glob, if this is True, only consider paths that have
        http/https matching the input URLs.
    size_policy: this argument is deprecated
    client_kwargs: dict
        Passed to aiohttp.ClientSession, see
        https://docs.aiohttp.org/en/stable/client_reference.html
        For example, ``{'auth': aiohttp.BasicAuth('user', 'pass')}``
    storage_options: key-value
        Any other parameters passed on to requests
    cache_type, cache_options: defaults used in open
    """
    super().__init__(self, asynchronous=asynchronous, loop=loop, **storage_options)
    self.block_size = block_size if block_size is not None else DEFAULT_BLOCK_SIZE
    self.simple_links = simple_links
    self.same_schema = same_scheme
    self.cache_type = cache_type
    self.cache_options = cache_options
    self.client_kwargs = client_kwargs or {}
    self.kwargs = storage_options
    if not asynchronous:
        self._session = sync(self.loop, get_client, **self.client_kwargs)
        weakref.finalize(self, sync, self.loop, self.session.close)
    else:
        self._session = None
def __init__(
    self,
    project=DEFAULT_PROJECT,
    access="full_control",
    token=None,
    block_size=None,
    consistency="none",
    cache_timeout=None,
    secure_serialize=True,
    check_connection=False,
    requests_timeout=None,
    requester_pays=False,
    asynchronous=False,
    session_kwargs=None,
    loop=None,
    timeout=None,
    **kwargs,
):
    super().__init__(
        self,
        listings_expiry_time=cache_timeout,
        asynchronous=asynchronous,
        loop=loop,
        **kwargs,
    )
    if access not in self.scopes:
        raise ValueError("access must be one of {}".format(self.scopes))
    if project is None:
        warnings.warn("GCS project not set - cannot list or create buckets")
    if block_size is not None:
        self.default_block_size = block_size
    self.requester_pays = requester_pays
    self.consistency = consistency
    self.cache_timeout = cache_timeout or kwargs.pop("listings_expiry_time", None)
    self.requests_timeout = requests_timeout
    self.timeout = timeout
    self._session = None
    self.session_kwargs = session_kwargs or {}
    self.credentials = GoogleCredentials(project, access, token, check_connection)

    if not self.asynchronous:
        self._session = sync(
            self.loop, get_client, timeout=self.timeout, **self.session_kwargs
        )
        weakref.finalize(self, self.close_session, self.loop, self._session)
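A hedged usage sketch for the GCSFileSystem constructor above. The parameter names come from the signature; token="anon" is gcsfs's anonymous-access mode, and the project and bucket names are purely illustrative.

# Sketch only: constructing GCSFileSystem synchronously and asynchronously.
import gcsfs

# Synchronous construction: the aiohttp client session is created here via
# sync(self.loop, get_client, ...).
fs = gcsfs.GCSFileSystem(project="my-project", token="anon")
print(fs.ls("my-bucket"))  # illustrative bucket name

# With asynchronous=True the constructor leaves _session as None; the session
# must be set up later on the event loop before the filesystem is used.
afs = gcsfs.GCSFileSystem(project="my-project", asynchronous=True)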
def test_xattr(s3):
    bucket, key = (test_bucket_name, 'tmp/test/xattr')
    filename = bucket + '/' + key
    body = b'aaaa'
    public_read_acl = {
        'Permission': 'READ',
        'Grantee': {
            'URI': 'http://acs.amazonaws.com/groups/global/AllUsers',
            'Type': 'Group',
        },
    }

    sync(
        s3.loop,
        s3.s3.put_object,
        Bucket=bucket,
        Key=key,
        ACL='public-read',
        Metadata=test_xattr_sample_metadata,
        Body=body,
    )

    # save etag for later
    etag = s3.info(filename)['ETag']
    assert public_read_acl in sync(
        s3.loop, s3.s3.get_object_acl, Bucket=bucket, Key=key
    )['Grants']

    assert s3.getxattr(
        filename, 'test_xattr') == test_xattr_sample_metadata['test_xattr']
    assert s3.metadata(filename) == {'test-xattr': '1'}  # note _ became -

    s3file = s3.open(filename)
    assert s3file.getxattr(
        'test_xattr') == test_xattr_sample_metadata['test_xattr']
    assert s3file.metadata() == {'test-xattr': '1'}  # note _ became -

    s3file.setxattr(test_xattr='2')
    assert s3file.getxattr('test_xattr') == '2'
    s3file.setxattr(**{'test_xattr': None})
    assert s3file.metadata() == {}
    assert s3.cat(filename) == body

    # check that ACL and ETag are preserved after updating metadata
    assert public_read_acl in sync(
        s3.loop, s3.s3.get_object_acl, Bucket=bucket, Key=key
    )['Grants']
    assert s3.info(filename)['ETag'] == etag
def __init__(self, fs, url, mode="rb", loop=None, session=None, **kwargs): self.asynchronous = kwargs.pop("asynchronous", False) self.url = url self.loop = loop self.session = session if mode != "rb": raise ValueError self.details = {"name": url, "size": None} super().__init__(fs=fs, path=url, mode=mode, cache_type="none", **kwargs) async def cor(): r = await self.session.get(url, **kwargs).__aenter__() self.fs._raise_not_found_for_status(r, url) return r self.r = sync(self.loop, cor)
def test_copy_managed(s3):
    data = b'abc' * 12 * 2**20
    fn = test_bucket_name + '/test/biggerfile'
    with s3.open(fn, 'wb') as f:
        f.write(data)
    sync(s3.loop, s3._copy_managed, fn, fn + '2', size=len(data), block=5 * 2**20)
    assert s3.cat(fn) == s3.cat(fn + '2')
    with pytest.raises(ValueError):
        sync(s3.loop, s3._copy_managed, fn, fn + '3', size=len(data), block=4 * 2**20)
    with pytest.raises(ValueError):
        sync(s3.loop, s3._copy_managed, fn, fn + '3', size=len(data), block=6 * 2**30)
def __init__(
    self,
    simple_links=True,
    block_size=None,
    same_scheme=True,
    size_policy=None,
    cache_type="bytes",
    cache_options=None,
    asynchronous=False,
    loop=None,
    **storage_options,
):
    """
    NB: if this is called async, you must await set_client

    Parameters
    ----------
    block_size: int
        Blocks to read bytes; if 0, will default to raw requests file-like
        objects instead of HTTPFile instances
    simple_links: bool
        If True, will consider both HTML <a> tags and anything that looks
        like a URL; if False, will consider only the former.
    same_scheme: True
        When doing ls/glob, if this is True, only consider paths that have
        http/https matching the input URLs.
    size_policy: this argument is deprecated
    storage_options: key-value
        May be credentials, e.g., `{'auth': ('username', 'pword')}` or any
        other parameters passed on to requests
    cache_type, cache_options: defaults used in open
    """
    super().__init__(self, asynchronous=asynchronous, loop=loop, **storage_options)
    self.block_size = block_size if block_size is not None else DEFAULT_BLOCK_SIZE
    self.simple_links = simple_links
    self.same_schema = same_scheme
    self.cache_type = cache_type
    self.cache_options = cache_options
    self.kwargs = storage_options
    if not asynchronous:
        self._session = sync(self.loop, get_client)
        weakref.finalize(self, sync, self.loop, self.session.close)
    else:
        self._session = None
def _directory_model_from_path(self, path, content=False):
    def s3_detail_to_model(s3_detail):
        model_path = s3_detail["Key"]
        model = base_model(self.fs.unprefix(model_path))
        if s3_detail["StorageClass"] == 'DIRECTORY':
            model["created"] = model["last_modified"] = DUMMY_CREATED_DATE
            model["type"] = "directory"
            lstat = self.fs.lstat(model_path)
            if "ST_MTIME" in lstat and lstat["ST_MTIME"]:
                model["last_modified"] = model["created"] = lstat["ST_MTIME"]
        else:
            model["last_modified"] = s3_detail.get("LastModified").replace(
                microsecond=0, tzinfo=tzutc()
            )
            model["created"] = model["last_modified"]
            # model["size"] = s3_detail.get("Size")
            model["type"] = "notebook" if model_path.endswith(".ipynb") else "file"
        return model

    self.log.debug(
        "S3contents.GenericManager._directory_model_from_path: path('%s') type(%s)",
        path,
        content,
    )
    model = base_directory_model(path)
    if self.fs.isdir(path):
        lstat = self.fs.lstat(path)
        if "ST_MTIME" in lstat and lstat["ST_MTIME"]:
            model["last_modified"] = model["created"] = lstat["ST_MTIME"]
    if content:
        if not self.dir_exists(path):
            self.no_such_entity(path)
        model["format"] = "json"
        prefixed_path = self.fs.path(path)
        files_s3_detail = sync(self.fs.fs.loop, self.fs.fs._lsdir, prefixed_path)
        filtered_files_s3_detail = list(
            filter(
                lambda detail: os.path.basename(detail['Key']) != self.fs.dir_keep_file,
                files_s3_detail,
            )
        )
        model["content"] = list(map(s3_detail_to_model, filtered_files_s3_detail))
    return model
def _close_session(looplocal):
    loop = getattr(looplocal, "loop", None)
    session = getattr(looplocal, "_session", None)
    if loop is not None and session is not None:
        sync(loop, session.close)
def buckets(self):
    """Return list of available project buckets."""
    return [
        b["name"]
        for b in sync(self.loop, self._list_buckets, timeout=self.timeout)
    ]
def __init__(
    self,
    simple_links=True,
    block_size=None,
    same_scheme=True,
    size_policy=None,
    cache_type="bytes",
    cache_options=None,
    asynchronous=False,
    loop=None,
    client_kwargs=None,
    get_client=get_client,
    **storage_options,
):
    """
    NB: if this is called async, you must await set_session

    Parameters
    ----------
    block_size: int
        Blocks to read bytes; if 0, will default to raw requests file-like
        objects instead of HTTPFile instances
    simple_links: bool
        If True, will consider both HTML <a> tags and anything that looks
        like a URL; if False, will consider only the former.
    same_scheme: bool
        When doing ls/glob, if this is True, only consider paths that have
        http/https matching the input URLs.
    size_policy: this argument is deprecated
    client_kwargs: dict
        Passed to aiohttp.ClientSession, see
        https://docs.aiohttp.org/en/stable/client_reference.html
        For example, ``{'auth': aiohttp.BasicAuth('user', 'pass')}``
    get_client: Callable[..., aiohttp.ClientSession]
        A callable which takes keyword arguments and constructs an
        aiohttp.ClientSession. Its state will be managed by the
        HTTPFileSystem class.
    storage_options: key-value
        Any other parameters passed on to requests
    cache_type, cache_options: defaults used in open
    """
    super().__init__(
        simple_links=simple_links,
        block_size=block_size,
        same_scheme=same_scheme,
        size_policy=size_policy,
        cache_type=cache_type,
        cache_options=cache_options,
        asynchronous=asynchronous,
        loop=loop,
        client_kwargs=client_kwargs,
        get_client=get_client,
        **storage_options,
    )
    request_options = copy(storage_options)
    self.use_listings_cache = request_options.pop("use_listings_cache", False)
    request_options.pop("listings_expiry_time", None)
    request_options.pop("max_paths", None)
    request_options.pop("skip_instance_cache", None)
    listings_cache_type = request_options.pop("listings_cache_type", None)
    listings_cache_location = request_options.pop("listings_cache_location", None)
    if self.use_listings_cache:
        if listings_cache_type == "filedircache":
            logger.info(f"Dircache located at {listings_cache_location}")
    self.kwargs = request_options

    if not asynchronous:
        sync(self.loop, self.set_session)
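A hedged sketch of the get_client hook documented above. In fsspec's default implementation this is a coroutine function that receives keyword arguments (the client_kwargs, and in some versions the loop) and returns an aiohttp.ClientSession; here it merges in an illustrative default header and timeout before passing everything else through.

# Sketch only: a custom get_client that decorates the session construction.
import aiohttp
import fsspec


async def get_custom_client(**kwargs):
    # illustrative defaults; any caller-provided headers take precedence
    headers = {"User-Agent": "my-downloader/1.0", **kwargs.pop("headers", {})}
    timeout = aiohttp.ClientTimeout(total=60)
    return aiohttp.ClientSession(headers=headers, timeout=timeout, **kwargs)


# Wired in the usual fsspec way, assuming the class above is the registered
# "http" implementation.
fs = fsspec.filesystem("http", get_client=get_custom_client)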
def close_session(loop, session):
    if loop is not None and loop.is_running():
        sync(loop, session.close)
    elif session._connector is not None:
        # close after loop is dead
        session._connector._close()
def test_checksum(s3):
    bucket = test_bucket_name
    d = "checksum"
    prefix = d + "/e"
    o1 = prefix + "1"
    o2 = prefix + "2"
    path1 = bucket + "/" + o1
    path2 = bucket + "/" + o2
    client = s3.s3

    # init client and files
    sync(s3.loop, client.put_object, Bucket=bucket, Key=o1, Body="")
    sync(s3.loop, client.put_object, Bucket=bucket, Key=o2, Body="")

    # change one file, using cache
    sync(s3.loop, client.put_object, Bucket=bucket, Key=o1, Body="foo")
    checksum = s3.checksum(path1)
    s3.ls(path1)  # force caching
    sync(s3.loop, client.put_object, Bucket=bucket, Key=o1, Body="bar")
    # refresh == False => checksum doesn't change
    assert checksum == s3.checksum(path1)

    # change one file, without cache
    sync(s3.loop, client.put_object, Bucket=bucket, Key=o1, Body="foo")
    checksum = s3.checksum(path1, refresh=True)
    s3.ls(path1)  # force caching
    sync(s3.loop, client.put_object, Bucket=bucket, Key=o1, Body="bar")
    # refresh == True => checksum changes
    assert checksum != s3.checksum(path1, refresh=True)

    # Test for nonexistent file
    sync(s3.loop, client.put_object, Bucket=bucket, Key=o1, Body="bar")
    s3.ls(path1)  # force caching
    sync(s3.loop, client.delete_object, Bucket=bucket, Key=o1)
    with pytest.raises(FileNotFoundError):
        s3.checksum(o1, refresh=True)

    # Test multipart upload
    upload_id = sync(
        s3.loop,
        client.create_multipart_upload,
        Bucket=bucket,
        Key=o1,
    )["UploadId"]
    etag1 = sync(
        s3.loop,
        client.upload_part,
        Bucket=bucket,
        Key=o1,
        UploadId=upload_id,
        PartNumber=1,
        Body="0" * (5 * 1024 * 1024),
    )['ETag']
    etag2 = sync(
        s3.loop,
        client.upload_part,
        Bucket=bucket,
        Key=o1,
        UploadId=upload_id,
        PartNumber=2,
        Body="0",
    )['ETag']
    sync(
        s3.loop,
        client.complete_multipart_upload,
        Bucket=bucket,
        Key=o1,
        UploadId=upload_id,
        MultipartUpload={'Parts': [
            {'PartNumber': 1, 'ETag': etag1},
            {'PartNumber': 2, 'ETag': etag2},
        ]},
    )
    s3.checksum(path1, refresh=True)