def __init__(
    self,
    account_name: str,
    account_key: str = None,
    custom_domain: str = None,
    is_emulated: bool = False,
    sas_token: str = None,
    protocol=DEFAULT_PROTOCOL,
    endpoint_suffix=SERVICE_HOST_BASE,
    request_session=None,
    connection_string: str = None,
    socket_timeout=None,
    token_credential=None,
    blocksize=BlockBlobService.MAX_BLOCK_SIZE,
):
    AbstractFileSystem.__init__(self)
    self.account_name = account_name
    self.account_key = account_key
    self.custom_domain = custom_domain
    self.is_emulated = is_emulated
    self.sas_token = sas_token
    self.protocol = protocol
    self.endpoint_suffix = endpoint_suffix
    self.request_session = request_session
    self.connection_string = connection_string
    self.socket_timeout = socket_timeout
    self.token_credential = token_credential
    self.blocksize = blocksize
    self.do_connect()

def __init__(self, simple_links=True, block_size=None, same_scheme=True,
             size_policy=None, **storage_options):
    """
    Parameters
    ----------
    block_size: int
        Blocks to read bytes; if 0, will default to raw requests file-like
        objects instead of HTTPFile instances
    simple_links: bool
        If True, will consider both HTML <a> tags and anything that looks
        like a URL; if False, will consider only the former.
    same_scheme: True
        When doing ls/glob, if this is True, only consider paths that have
        http/https matching the input URLs.
    size_policy: this argument is deprecated
    storage_options: key-value
        May be credentials, e.g., `{'auth': ('username', 'pword')}` or any
        other parameters passed on to requests
    """
    AbstractFileSystem.__init__(self)
    self.block_size = block_size if block_size is not None else DEFAULT_BLOCK_SIZE
    self.simple_links = simple_links
    self.same_schema = same_scheme
    self.kwargs = storage_options
    self.session = requests.Session()

def __init__(
    self,
    account_name: str,
    account_key: str = None,
    connection_string: str = None,
    credential: str = None,
    sas_token: str = None,
    request_session=None,
    socket_timeout: int = None,
    blocksize: int = create_configuration(storage_sdk="blob").max_block_size,
    client_id: str = None,
    client_secret: str = None,
    tenant_id: str = None,
):
    AbstractFileSystem.__init__(self)
    self.account_name = account_name
    self.account_key = account_key
    self.connection_string = connection_string
    self.credential = credential
    self.sas_token = sas_token
    self.request_session = request_session
    self.socket_timeout = socket_timeout
    self.blocksize = blocksize
    self.client_id = client_id
    self.client_secret = client_secret
    self.tenant_id = tenant_id
    if (
        self.credential is None
        and self.account_key is None
        and self.sas_token is None
        and self.client_id is not None
    ):
        self.credential = self._get_credential_from_service_principal()
    self.do_connect()

def __init__(self, fo="", mode="r", **storage_options):
    """
    Parameters
    ----------
    fo: str or file-like
        Contains ZIP, and must exist. If a str, will fetch file using
        `open_files()`, which must return one file exactly.
    mode: str
        Currently, only 'r' accepted
    storage_options: key-value
        May be credentials, e.g., `{'auth': ('username', 'pword')}` or any
        other parameters for requests
    """
    AbstractFileSystem.__init__(self)
    if mode != "r":
        raise ValueError("Only read from zip files accepted")
    self.in_fo = fo
    if isinstance(fo, str):
        files = open_files(fo)
        if len(files) != 1:
            raise ValueError(
                'Path "{}" did not resolve to exactly '
                'one file: "{}"'.format(fo, files)
            )
        fo = files[0]
    self.fo = fo.__enter__()  # the whole instance is a context
    self.zip = zipfile.ZipFile(self.fo)
    self.block_size = storage_options.pop("block_size", DEFAULT_BLOCK_SIZE)
    self.kwargs = storage_options
    self.dir_cache = None

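# Minimal usage sketch for the zip filesystem above, assuming fsspec is
# installed and that this implementation is registered under the "zip"
# protocol (as in fsspec); "archive.zip" and "data/table.csv" are
# hypothetical paths used only for illustration.
import fsspec

zfs = fsspec.filesystem("zip", fo="archive.zip")
print(zfs.ls("/", detail=False))        # member names in the archive
with zfs.open("data/table.csv") as f:   # read-only access to one member
    contents = f.read()
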
def __init__(self, account_name: str, container_name: str, account_key: str):
    AbstractFileSystem.__init__(self)
    self.account_name = account_name
    self.account_key = account_key
    self.container_name = container_name
    self.do_connect()

def __init__(self, tenant_id, client_id, client_secret, store_name):
    AbstractFileSystem.__init__(self)
    self.tenant_id = tenant_id
    self.client_id = client_id
    self.client_secret = client_secret
    self.store_name = store_name
    self.do_connect()

def test_created(fs: AbstractFileSystem, temp_file):
    try:
        fs.touch(temp_file)
        created = fs.created(path=temp_file)
        assert isinstance(created, datetime.datetime)
    finally:
        if not isinstance(fs, tuple(READ_ONLY_FILESYSTEMS)):
            fs.rm(temp_file)

def upload_feature(feature_name: str, df: pd.DataFrame, fs: AbstractFileSystem) -> None:
    containers = fs.ls(".")
    # Issue with exist_ok flag: see https://github.com/dask/adlfs/issues/130
    if not any(
        c.startswith(settings.features_container_name.strip("/")) for c in containers
    ):
        fs.mkdir(settings.features_container_name)
    with fs.open(settings.feature_location(feature_name), mode="wb") as f:
        df.to_parquet(f)

def read_feather(path: str, fs: fsspec.AbstractFileSystem = None):
    fs = fs or fsspec.filesystem("file")
    if not fs.exists(path):
        return
    try:
        with fs.open(path) as f:
            reader = pa.ipc.open_stream(f)
            return reader.read_pandas()
    except (pa.ArrowInvalid, FileNotFoundError):
        return

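# Hedged usage sketch for read_feather above: write a tiny Arrow IPC stream
# to a local file and read it back. pa.ipc.new_stream/open_stream are
# standard pyarrow APIs; "table.arrow" is an illustrative path.
import fsspec
import pyarrow as pa

fs = fsspec.filesystem("file")
tbl = pa.table({"x": [1, 2, 3]})
with fs.open("table.arrow", "wb") as f:
    with pa.ipc.new_stream(f, tbl.schema) as writer:
        writer.write_table(tbl)

df = read_feather("table.arrow", fs=fs)  # pandas DataFrame, or None on failure
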
def _get_merged_time_coordinate(
    source: str, target: str, dim: str, fs: fsspec.AbstractFileSystem
) -> xr.DataArray:
    source_ds = xr.open_zarr(source, consolidated=True)
    if dim in source_ds.coords:
        if fs.exists(target):
            target_ds = xr.open_zarr(fs.get_mapper(target), consolidated=True)
            time = xr.concat([target_ds[dim], source_ds[dim]], dim=dim)
        else:
            time = source_ds[dim]
        return time

def __init__(
    self,
    account_name: str,
    container_name: str,
    account_key: str,
    custom_domain: str = None,
    is_emulated: bool = False,
):
    AbstractFileSystem.__init__(self)
    self.account_name = account_name
    self.account_key = account_key
    self.container_name = container_name
    self.custom_domain = custom_domain
    self.is_emulated = is_emulated
    self.do_connect()

def _numpy_saver(
    fs: fsspec.AbstractFileSystem, filepath: str, array: np.ndarray, codec: BaseCodec
):
    """Saves a single numpy array into filepath given specific filesystem"""
    with fs.open(filepath, "wb") as f:
        f.write(codec.encode(array))

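# Illustrative sketch only: a hypothetical codec with the encode/decode
# interface assumed by _numpy_saver/_numpy_load, backed by numpy's .npy
# serialization, plus a call against the local filesystem. NpyCodec and
# "array.npy" are not part of the original code.
import io
import fsspec
import numpy as np


class NpyCodec:  # stand-in for BaseCodec
    def encode(self, array: np.ndarray) -> bytes:
        buf = io.BytesIO()
        np.save(buf, array)
        return buf.getvalue()

    def decode(self, data: bytes) -> np.ndarray:
        return np.load(io.BytesIO(data))


local_fs = fsspec.filesystem("file")
_numpy_saver(local_fs, "array.npy", np.arange(10), NpyCodec())
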
def test_modified(fs: AbstractFileSystem, temp_file):
    try:
        fs.touch(temp_file)
        created = fs.created(path=temp_file)
        time.sleep(0.05)
        fs.touch(temp_file)
        modified = fs.modified(path=temp_file)
        assert isinstance(modified, datetime.datetime)
        assert modified > created
    finally:
        fs.rm(temp_file)

def detect_folders(
    bucket: str,
    fs: fsspec.AbstractFileSystem,
) -> Mapping[str, DiagnosticFolder]:
    diag_ncs = fs.glob(os.path.join(bucket, "*", "diags.nc"))
    return {
        Path(url).parent.name: DiagnosticFolder(fs, Path(url).parent.as_posix())
        for url in diag_ncs
    }

def __init__(self, **storage_options):
    """
    Parameters
    ----------
    block_size: int
        Blocks to read bytes; if 0, will default to raw requests file-like
        objects instead of HTTPFile instances
    simple_links: bool
        If True, will consider both HTML <a> tags and anything that looks
        like a URL; if False, will consider only the former.
    storage_options: key-value
        May be credentials, e.g., `{'auth': ('username', 'pword')}` or any
        other parameters passed on to requests
    """
    AbstractFileSystem.__init__(self)
    self.block_size = storage_options.pop('block_size', DEFAULT_BLOCK_SIZE)
    self.simple_links = storage_options.pop('simple_links', True)
    self.kwargs = storage_options
    self.session = requests.Session()

def __init__(self, simple_links=True, block_size=None, same_scheme=True,
             size_policy=None, cache_type="bytes", cache_options=None,
             asynchronous=False, loop=None, **storage_options):
    """
    NB: if this is called async, you must await set_client

    Parameters
    ----------
    block_size: int
        Blocks to read bytes; if 0, will default to raw requests file-like
        objects instead of HTTPFile instances
    simple_links: bool
        If True, will consider both HTML <a> tags and anything that looks
        like a URL; if False, will consider only the former.
    same_scheme: True
        When doing ls/glob, if this is True, only consider paths that have
        http/https matching the input URLs.
    size_policy: this argument is deprecated
    storage_options: key-value
        May be credentials, e.g., `{'auth': ('username', 'pword')}` or any
        other parameters passed on to requests
    cache_type, cache_options: defaults used in open
    """
    AbstractFileSystem.__init__(self, asynchronous=asynchronous, loop=loop)
    self.block_size = block_size if block_size is not None else DEFAULT_BLOCK_SIZE
    self.simple_links = simple_links
    self.same_schema = same_scheme
    self.cache_type = cache_type
    self.cache_options = cache_options
    self.kwargs = storage_options
    if not asynchronous:
        self._session = sync(self.loop, get_client)
        weakref.finalize(self, sync, self.loop, self.session.close)
    else:
        self._session = None

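# Minimal usage sketch for the HTTP filesystem variants above, assuming
# fsspec's HTTP implementation is available (the async variant also needs
# aiohttp); the URL is illustrative.
import fsspec

http_fs = fsspec.filesystem("http")
# block_size=0 falls back to a raw streaming file, per the docstring above
with http_fs.open("https://example.com/data.csv", block_size=0) as f:
    head = f.read(1024)
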
def get_timestep(
    key: Key, fs: fsspec.AbstractFileSystem, url: str
) -> Iterable[Tuple[Key, xr.DataArray]]:
    time, category, tile = key
    location = _file(url, time, category, tile)
    logging.info(f"Opening {location}")
    with fs.open(location, "rb") as f:
        ds = xr.open_dataset(f).load()

    ds = vcm.standardize_metadata(ds)
    for variable in ds:
        yield key, ds[variable]

def _get_block_size(bufsize: int, fs: AbstractFileSystem, filepath: str):
    """Instead of checking for an S3 file system, just be mindful of the
    S3 minimum block size.
    """
    if bufsize < 0:
        # block size is the file size unless min block size is bigger
        filesize = fs.size(filepath)
        blocksize = max(filesize, Pipe.min_s3_blocksize)
    else:
        # block size is buffer size unless min block size is bigger
        blocksize = max(bufsize, Pipe.min_s3_blocksize)
    return blocksize

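# Illustration of the two branches above, using a hypothetical Pipe stand-in
# with an assumed 5 MiB minimum (S3 multipart uploads require parts of at
# least 5 MiB); the real constant lives on the Pipe class in the original
# codebase.
import fsspec


class Pipe:  # stand-in so the helper can be exercised locally
    min_s3_blocksize = 5 * 1024 * 1024


fs = fsspec.filesystem("file")
print(_get_block_size(-1, fs, __file__))                 # file size vs. 5 MiB floor
print(_get_block_size(16 * 1024 * 1024, fs, __file__))   # 16 MiB buffer wins
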
def _numpy_load(
    fs: fsspec.AbstractFileSystem, filepath: str, codec: BaseCodec
) -> np.ndarray:
    """Given filesystem and filepath, loads numpy array"""
    # assert fs.exists(
    #     filepath
    # ), f"Dataset file {filepath} does not exist. Your dataset data is likely to be corrupted"
    try:
        with fs.open(filepath, "rb") as f:
            return codec.decode(f.read())
    except Exception as e:
        logger.error(traceback.format_exc() + str(e))
        raise Exception(
            f"Dataset file {filepath} does not exist. Your dataset data is likely to be corrupted"
        )

def get_prefix_time_mapping(
    fs: fsspec.AbstractFileSystem, url: str
) -> Mapping[str, cftime.DatetimeJulian]:
    """Return a dictionary mapping restart file prefixes to times

    Args:
        fs: fsspec filesystem object
        url: url to the run-directory

    Returns:
        a mapping from file prefix (e.g. "INPUT" or "RESTART/YYYYMMDD.HHMMSS")
        to parsed datetime objects
    """
    times = _get_restart_times(fs, url)
    prefixes = _get_prefixes(fs.walk(url))
    return dict(zip(prefixes, times))

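# Hedged usage sketch: map restart-file prefixes in a run directory to their
# parsed times. "gs://bucket/run-dir" is an illustrative URL; gcsfs must be
# installed for the "gs" protocol.
import fsspec

gcs = fsspec.filesystem("gs")
prefix_times = get_prefix_time_mapping(gcs, "gs://bucket/run-dir")
for prefix, time in prefix_times.items():
    print(prefix, time)
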
def prepare_fs(cls, fs: fsspec.AbstractFileSystem, root: str):
    if fs.isdir(root):
        # print(f'{fs.protocol}: deleting {root}')
        fs.delete(root, recursive=True)
    # print(f'{fs.protocol}: making root {root}')
    fs.mkdirs(root)

    # Write a text file into each subdirectory, so
    # we also test that store.get_data_ids() scans
    # recursively.
    dir_path = root
    for subdir_name in DATA_PATH.split('/'):
        dir_path += '/' + subdir_name
        # print(f'{fs.protocol}: making {dir_path}')
        fs.mkdir(dir_path)
        file_path = dir_path + '/README.md'
        # print(f'{fs.protocol}: writing {file_path}')
        with fs.open(file_path, 'w') as fp:
            fp.write('\n')

def dataset_batches(
    file_meta: FileMeta, fs: fsspec.AbstractFileSystem, n_rows: int
) -> Iterator[pd.DataFrame]:
    try:
        d: ds.Dataset = ds.dataset(file_meta.filename, filesystem=fs)
    except ArrowInvalid:
        return
    for fn in sorted(map(str, d.files)):
        f = pq.ParquetFile(fs.open(fn))
        for batch in f.iter_batches(batch_size=n_rows):
            if batch.num_rows == 0:
                break
            df = batch.to_pandas()
            df = df[(df["ts_init"] >= file_meta.start) & (df["ts_init"] <= file_meta.end)]
            if df.empty:
                return
            if file_meta.instrument_id:
                df.loc[:, "instrument_id"] = file_meta.instrument_id
            yield df

def append_zarr_along_time(
    source_path: str, target_path: str, fs: fsspec.AbstractFileSystem, dim: str = "time"
):
    """Append local zarr store at source_path to zarr store at target_path along time.

    Args:
        source_path: Local path to zarr store that represents an xarray dataset.
        target_path: Local or remote url for zarr store to be appended to.
        fs: Filesystem for target_path.
        dim: (optional) name of time dimension. Defaults to "time".

    Raises:
        ValueError: If the chunk size in time does not evenly divide length of time
            dimension for zarr stores at source_path.

    Warning:
        The zarr store at source_path will be modified in place.
    """
    merged_time = _get_merged_time_coordinate(source_path, target_path, dim, fs)
    if fs.exists(target_path):
        source_store = zarr.open(source_path, mode="r+")
        target_store = zarr.open_consolidated(fsspec.get_mapper(target_path))
        _assert_chunks_match(source_store, target_store, dim)
        _set_time_units_like(source_store, target_store)
        _shift_store(source_store, dim, _get_dim_size(target_store, dim))
    elif fs.protocol == "file":
        os.makedirs(target_path)

    upload_dir(source_path, target_path)
    _overwrite_time_array_with_single_chunk(target_path, merged_time, dim)

    _, _, absolute_target_paths = fsspec.get_fs_token_paths(target_path)
    consolidate_metadata(fs, absolute_target_paths[0])

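# Usage sketch, assuming a local zarr store "new_segment.zarr" should be
# appended to a remote store "gs://bucket/output.zarr" (both paths are
# illustrative; gcsfs is required for the "gs" protocol).
import fsspec

target = "gs://bucket/output.zarr"
target_fs, _, _ = fsspec.get_fs_token_paths(target)
append_zarr_along_time("new_segment.zarr", target, target_fs)
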
def find_available_groups(
    product_files: T.Dict[str, T.Tuple[str, str, str, str, str]],
    product_path: str,
    check_files_exist: bool = False,
    fs: fsspec.AbstractFileSystem = fsspec.filesystem("file"),
) -> T.Dict[str, T.List[str]]:
    groups: T.Dict[str, T.List[str]] = {}
    for path, (type, _, swath, polarization, _) in product_files.items():
        swath_pol_group = f"{swath}/{polarization}".upper()
        abspath = os.path.join(product_path, os.path.normpath(path))
        if check_files_exist:
            if not fs.exists(abspath):
                continue
        if type == "s1Level1ProductSchema":
            groups[swath.upper()] = [""]
            groups[swath_pol_group] = [abspath] + groups.get(swath_pol_group, [])
            for metadata_group in [
                "orbit",
                "attitude",
                "azimuth_fm_rate",
                "dc_estimate",
                "gcp",
                "coordinate_conversion",
            ]:
                groups[f"{swath_pol_group}/{metadata_group}"] = [abspath]
        elif type == "s1Level1CalibrationSchema":
            groups[f"{swath_pol_group}/calibration"] = [abspath]
        elif type == "s1Level1NoiseSchema":
            groups[f"{swath_pol_group}/noise_range"] = [abspath]
            groups[f"{swath_pol_group}/noise_azimuth"] = [abspath]
        elif type == "s1Level1MeasurementSchema":
            groups[swath_pol_group] = [abspath] + groups.get(swath_pol_group, [])

    return groups

def load_raw_data(location: str, fs: AbstractFileSystem) -> pd.DataFrame:
    with fs.open(location) as f:
        df = pd.read_csv(f, usecols=["beds", "accommodates"])
    return df

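# Minimal usage sketch: read the two selected columns from a local CSV. The
# path is illustrative; any fsspec filesystem (such as the Azure ones above)
# works the same way.
import fsspec

listings = load_raw_data("listings.csv", fsspec.filesystem("file"))
print(listings.head())
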
def get_schema(fs: fsspec.AbstractFileSystem, url: str) -> xr.Dataset:
    logging.info(f"Grabbing schema from {url}")
    with fs.open(url, "rb") as f:
        return vcm.standardize_metadata(xr.open_dataset(f))

def _load_restart(fs: fsspec.AbstractFileSystem, path: str) -> xr.Dataset:
    with fs.open(path) as f:
        return xr.open_dataset(f).compute()

def write_parquet(
    fs: fsspec.AbstractFileSystem,
    path: str,
    df: pd.DataFrame,
    partition_cols: Optional[List[str]],
    schema: pa.Schema,
    **kwargs,
):
    """
    Write a single dataframe to parquet.
    """
    # Check partition values are valid before writing to parquet
    mappings = check_partition_columns(df=df, partition_columns=partition_cols)
    df = clean_partition_cols(df=df, mappings=mappings)

    # Dataframe -> pyarrow Table
    table = pa.Table.from_pandas(df, schema=schema)

    if "basename_template" not in kwargs and "ts_init" in df.columns:
        kwargs["basename_template"] = (
            f"{df['ts_init'].min()}-{df['ts_init'].max()}" + "-{i}.parquet"
        )

    # Write the actual file
    partitions = (
        ds.partitioning(
            schema=pa.schema(fields=[table.schema.field(c) for c in partition_cols]),
            flavor="hive",
        )
        if partition_cols
        else None
    )
    if pa.__version__ >= "6.0.0":
        kwargs.update(existing_data_behavior="overwrite_or_ignore")
    files = set(fs.glob(f"{path}/**"))
    ds.write_dataset(
        data=table,
        base_dir=path,
        filesystem=fs,
        partitioning=partitions,
        format="parquet",
        **kwargs,
    )

    # Ensure data written by write_dataset is sorted
    new_files = set(fs.glob(f"{path}/**/*.parquet")) - files
    del df
    for fn in new_files:
        ndf = pd.read_parquet(fs.open(fn))
        # assert ndf.shape[0] == shape
        if "ts_init" in ndf.columns:
            ndf = ndf.sort_values("ts_init").reset_index(drop=True)
        pq.write_table(
            table=pa.Table.from_pandas(ndf),
            where=fn,
            filesystem=fs,
        )

    # Write the ``_common_metadata`` parquet file without row groups statistics
    pq.write_metadata(table.schema, f"{path}/_common_metadata", version="2.6", filesystem=fs)

    # Write out any partition columns we had to modify due to filesystem requirements
    if mappings:
        write_partition_column_mappings(fs=fs, path=path, mappings=mappings)

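# Hedged usage sketch for write_parquet: write a small frame to a local
# dataset directory. "quotes" is an illustrative path, and the schema is
# derived from the frame itself here, whereas the original callers pass a
# curated one.
import fsspec
import pandas as pd
import pyarrow as pa

frame = pd.DataFrame({"ts_init": [1, 2, 3], "price": [1.0, 1.5, 2.0]})
write_parquet(
    fs=fsspec.filesystem("file"),
    path="quotes",
    df=frame,
    partition_cols=None,
    schema=pa.Schema.from_pandas(frame, preserve_index=False),
)
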
def get_file_count(fs: fsspec.AbstractFileSystem, path):
    return len(fs.listdir(path, detail=False))

def write_feather(table: pa.Table, path: str, file_system: AbstractFileSystem, **kwargs):
    with file_system.open(path, "wb") as f:
        paf.write_feather(table, f, **kwargs)

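# Usage sketch for write_feather: build a small Arrow table and write it via
# the local filesystem. "stats.feather" is an illustrative path; paf is
# assumed to be pyarrow.feather, matching the call above.
import fsspec
import pyarrow as pa

stats = pa.table({"name": ["a", "b"], "count": [10, 20]})
write_feather(stats, "stats.feather", fsspec.filesystem("file"), compression="zstd")
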