Example #1
File: core.py Project: bemgreem/adlfs
 def __init__(
     self,
     account_name: str,
     account_key: str = None,
     custom_domain: str = None,
     is_emulated: bool = False,
     sas_token: str = None,
     protocol=DEFAULT_PROTOCOL,
     endpoint_suffix=SERVICE_HOST_BASE,
     request_session=None,
     connection_string: str = None,
     socket_timeout=None,
     token_credential=None,
     blocksize=BlockBlobService.MAX_BLOCK_SIZE,
 ):
     AbstractFileSystem.__init__(self)
     self.account_name = account_name
     self.account_key = account_key
     self.custom_domain = custom_domain
     self.is_emulated = is_emulated
     self.sas_token = sas_token
     self.protocol = protocol
     self.endpoint_suffix = endpoint_suffix
     self.request_session = request_session
     self.connection_string = connection_string
     self.socket_timeout = socket_timeout
     self.token_credential = token_credential
     self.blocksize = blocksize
     self.do_connect()
Example #2
 def __init__(self,
              simple_links=True,
              block_size=None,
              same_scheme=True,
              size_policy=None,
              **storage_options):
     """
     Parameters
     ----------
     block_size: int
         Blocks to read bytes; if 0, will default to raw requests file-like
         objects instead of HTTPFile instances
     simple_links: bool
         If True, will consider both HTML <a> tags and anything that looks
         like a URL; if False, will consider only the former.
     same_scheme: bool
         When doing ls/glob, if this is True, only consider paths that have
         http/https matching the input URLs.
     size_policy: this argument is deprecated
     storage_options: key-value
         May be credentials, e.g., `{'auth': ('username', 'pword')}` or any
         other parameters passed on to requests
     """
     AbstractFileSystem.__init__(self)
     self.block_size = block_size if block_size is not None else DEFAULT_BLOCK_SIZE
     self.simple_links = simple_links
     self.same_schema = same_scheme
     self.kwargs = storage_options
     self.session = requests.Session()
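
Instances like the one above are normally created through fsspec's protocol registry rather than by calling __init__ directly. A minimal usage sketch, assuming the class is registered under the "http" protocol (the URLs are illustrative):

import fsspec

fs = fsspec.filesystem("http", simple_links=True, block_size=0)
links = fs.ls("https://example.com/files/")           # link targets found on the page
with fs.open("https://example.com/files/data.csv") as f:
    head = f.read(1024)                               # block_size=0 means a raw streamed file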
Example #3
    def __init__(
        self,
        account_name: str,
        account_key: str = None,
        connection_string: str = None,
        credential: str = None,
        sas_token: str = None,
        request_session=None,
        socket_timeout: int = None,
        blocksize: int = create_configuration(
            storage_sdk="blob").max_block_size,
        client_id: str = None,
        client_secret: str = None,
        tenant_id: str = None,
    ):
        AbstractFileSystem.__init__(self)
        self.account_name = account_name
        self.account_key = account_key
        self.connection_string = connection_string
        self.credential = credential
        self.sas_token = sas_token
        self.request_session = request_session
        self.socket_timeout = socket_timeout
        self.blocksize = blocksize
        self.client_id = client_id
        self.client_secret = client_secret
        self.tenant_id = tenant_id

        if (self.credential is None and self.account_key is None
                and self.sas_token is None and self.client_id is not None):
            self.credential = self._get_credential_from_service_principal()
        self.do_connect()
Example #4
 def __init__(self, fo="", mode="r", **storage_options):
     """
     Parameters
     ----------
     fo: str or file-like
         Contains ZIP, and must exist. If a str, will fetch file using
         `open_files()`, which must return one file exactly.
     mode: str
         Currently, only 'r' accepted
     storage_options: key-value
         May be credentials, e.g., `{'auth': ('username', 'pword')}` or any
         other parameters for requests
     """
     AbstractFileSystem.__init__(self)
     if mode != "r":
         raise ValueError("Only read from zip files accepted")
     self.in_fo = fo
     if isinstance(fo, str):
         files = open_files(fo)
         if len(files) != 1:
             raise ValueError('Path "{}" did not resolve to exactly '
                              'one file: "{}"'.format(fo, files))
         fo = files[0]
     self.fo = fo.__enter__()  # the whole instance is a context
     self.zip = zipfile.ZipFile(self.fo)
     self.block_size = storage_options.pop("block_size", DEFAULT_BLOCK_SIZE)
     self.kwargs = storage_options
     self.dir_cache = None
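
A minimal usage sketch, assuming this is the read-only ZIP filesystem registered with fsspec under the "zip" protocol (the archive path and member name are illustrative):

import fsspec

fs = fsspec.filesystem("zip", fo="archive.zip")   # any mode other than "r" would raise ValueError
print(fs.ls("/"))                                 # list the archive members
with fs.open("data/table.csv", "rb") as f:
    payload = f.read()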
Example #5
File: core.py Project: hayesgb/adlfs
 def __init__(self, account_name: str, container_name: str,
              account_key: str):
     AbstractFileSystem.__init__(self)
     self.account_name = account_name
     self.account_key = account_key
     self.container_name = container_name
     self.do_connect()
Example #6
File: core.py Project: hayesgb/adlfs
 def __init__(self, tenant_id, client_id, client_secret, store_name):
     AbstractFileSystem.__init__(self)
     self.tenant_id = tenant_id
     self.client_id = client_id
     self.client_secret = client_secret
     self.store_name = store_name
     self.do_connect()
Example #7
def test_created(fs: AbstractFileSystem, temp_file):
    try:
        fs.touch(temp_file)
        created = fs.created(path=temp_file)
        assert isinstance(created, datetime.datetime)
    finally:
        if not isinstance(fs, tuple(READ_ONLY_FILESYSTEMS)):
            fs.rm(temp_file)
Example #8
def upload_feature(feature_name: str, df: pd.DataFrame, fs: AbstractFileSystem) -> None:
    containers = fs.ls(".")

    # Issue with exist_ok flag: see https://github.com/dask/adlfs/issues/130
    if not any(c.startswith(settings.features_container_name.strip("/")) for c in containers):
        fs.mkdir(settings.features_container_name)

    with fs.open(settings.feature_location(feature_name), mode="wb") as f:
        df.to_parquet(f)
Example #9
def read_feather(path: str, fs: fsspec.AbstractFileSystem = None):
    fs = fs or fsspec.filesystem("file")
    if not fs.exists(path):
        return
    try:
        with fs.open(path) as f:
            reader = pa.ipc.open_stream(f)
            return reader.read_pandas()
    except (pa.ArrowInvalid, FileNotFoundError):
        return
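
A usage sketch for read_feather above (paths are illustrative; pyarrow and fsspec are assumed to be installed):

import fsspec

df_local = read_feather("features.feather")              # defaults to the local filesystem
df_mem = read_feather("/cache/features.feather",
                      fs=fsspec.filesystem("memory"))    # missing file, so this returns None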
Example #10
def _get_merged_time_coordinate(source: str, target: str, dim: str,
                                fs: fsspec.AbstractFileSystem) -> xr.DataArray:
    source_ds = xr.open_zarr(source, consolidated=True)
    if dim in source_ds.coords:
        if fs.exists(target):
            target_ds = xr.open_zarr(fs.get_mapper(target), consolidated=True)
            time = xr.concat([target_ds[dim], source_ds[dim]], dim=dim)
        else:
            time = source_ds[dim]
        return time
Example #11
 def __init__(
     self,
     account_name: str,
     container_name: str,
     account_key: str,
     custom_domain: str = None,
     is_emulated: bool = False,
 ):
     AbstractFileSystem.__init__(self)
     self.account_name = account_name
     self.account_key = account_key
     self.container_name = container_name
     self.custom_domain = custom_domain
     self.is_emulated = is_emulated
     self.do_connect()
Example #12
File: core.py Project: 40a/Hub-1
def _numpy_saver(
    fs: fsspec.AbstractFileSystem, filepath: str, array: np.ndarray, codec: BaseCodec
):
    """ Saves a single numpy array into filepath given specific filesystem
    """
    with fs.open(filepath, "wb") as f:
        f.write(codec.encode(array))
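
The codec argument is a project-specific BaseCodec from Hub. A standalone sketch of the same pattern, substituting numpy's own serialization for the codec (the filesystem and path are illustrative):

import io

import fsspec
import numpy as np


def save_array(fs: fsspec.AbstractFileSystem, filepath: str, array: np.ndarray) -> None:
    """Write a single numpy array to filepath on the given filesystem."""
    buf = io.BytesIO()
    np.save(buf, array)
    with fs.open(filepath, "wb") as f:
        f.write(buf.getvalue())


save_array(fsspec.filesystem("memory"), "/a.npy", np.arange(10))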
Example #13
def test_modified(fs: AbstractFileSystem, temp_file):
    try:
        fs.touch(temp_file)
        created = fs.created(path=temp_file)
        time.sleep(0.05)
        fs.touch(temp_file)
        modified = fs.modified(path=temp_file)
        assert isinstance(modified, datetime.datetime)
        assert modified > created
    finally:
        fs.rm(temp_file)
Example #14
def detect_folders(
    bucket: str,
    fs: fsspec.AbstractFileSystem,
) -> Mapping[str, DiagnosticFolder]:
    diag_ncs = fs.glob(os.path.join(bucket, "*", "diags.nc"))
    return {
        Path(url).parent.name: DiagnosticFolder(fs,
                                                Path(url).parent.as_posix())
        for url in diag_ncs
    }
Example #15
 def __init__(self, **storage_options):
     """
     Parameters
     ----------
     block_size: int
         Blocks to read bytes; if 0, will default to raw requests file-like
         objects instead of HTTPFile instances
     simple_links: bool
         If True, will consider both HTML <a> tags and anything that looks
         like a URL; if False, will consider only the former.
     storage_options: key-value
         May be credentials, e.g., `{'auth': ('username', 'pword')}` or any
         other parameters passed on to requests
     """
     AbstractFileSystem.__init__(self)
     self.block_size = storage_options.pop('block_size', DEFAULT_BLOCK_SIZE)
     self.simple_links = storage_options.pop('simple_links', True)
     self.kwargs = storage_options
     self.session = requests.Session()
Example #16
    def __init__(self,
                 simple_links=True,
                 block_size=None,
                 same_scheme=True,
                 size_policy=None,
                 cache_type="bytes",
                 cache_options=None,
                 asynchronous=False,
                 loop=None,
                 **storage_options):
        """
        NB: if this is called async, you must await set_client

        Parameters
        ----------
        block_size: int
            Blocks to read bytes; if 0, will default to raw requests file-like
            objects instead of HTTPFile instances
        simple_links: bool
            If True, will consider both HTML <a> tags and anything that looks
            like a URL; if False, will consider only the former.
        same_scheme: bool
            When doing ls/glob, if this is True, only consider paths that have
            http/https matching the input URLs.
        size_policy: this argument is deprecated
        storage_options: key-value
            May be credentials, e.g., `{'auth': ('username', 'pword')}` or any
            other parameters passed on to requests
        cache_type, cache_options: defaults used in open
        """
        AbstractFileSystem.__init__(self, asynchronous=asynchronous, loop=loop)
        self.block_size = block_size if block_size is not None else DEFAULT_BLOCK_SIZE
        self.simple_links = simple_links
        self.same_schema = same_scheme
        self.cache_type = cache_type
        self.cache_options = cache_options
        self.kwargs = storage_options
        if not asynchronous:
            self._session = sync(self.loop, get_client)
            weakref.finalize(self, sync, self.loop, self.session.close)
        else:
            self._session = None
Example #17
def get_timestep(key: Key, fs: fsspec.AbstractFileSystem,
                 url: str) -> Iterable[Tuple[Key, xr.DataArray]]:
    time, category, tile = key
    location = _file(url, time, category, tile)
    logging.info(f"Opening {location}")
    with fs.open(location, "rb") as f:
        ds = xr.open_dataset(f).load()
    ds = vcm.standardize_metadata(ds)

    for variable in ds:
        yield key, ds[variable]
Example #18
 def _get_block_size(bufsize: int, fs: AbstractFileSystem, filepath: str):
     """Instead of checking for an S3 file system, just be mindful of the S3
     minimum block size.
     """
     if bufsize < 0:
         # block size is the file size unless min block size is bigger
         filesize = fs.size(filepath)
         blocksize = max(filesize, Pipe.min_s3_blocksize)
     else:
         # block size is buffer size unless min block size is bigger
         blocksize = max(bufsize, Pipe.min_s3_blocksize)
     return blocksize
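
Pipe.min_s3_blocksize is project-specific; S3 multipart uploads commonly require parts of at least 5 MiB, so a standalone sketch of the same sizing rule might look like this (the constant is an assumption, not taken from the project):

import fsspec

MIN_S3_BLOCKSIZE = 5 * 1024 * 1024  # assumed stand-in for Pipe.min_s3_blocksize


def pick_block_size(bufsize: int, fs: fsspec.AbstractFileSystem, filepath: str) -> int:
    if bufsize < 0:
        # negative buffer size: use the whole file, but never go below the minimum
        return max(fs.size(filepath), MIN_S3_BLOCKSIZE)
    # otherwise honour the requested buffer size, again respecting the minimum
    return max(bufsize, MIN_S3_BLOCKSIZE)


fs = fsspec.filesystem("memory")
with fs.open("/blob.bin", "wb") as f:
    f.write(b"\x00" * 1024)
print(pick_block_size(-1, fs, "/blob.bin"))  # 5242880: the 1 KiB file is below the minimum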
Example #19
def _numpy_load(fs: fsspec.AbstractFileSystem, filepath: str,
                codec: BaseCodec) -> np.ndarray:
    """Given filesystem and filepath, loads numpy array"""
    # assert fs.exists(
    #    filepath
    # ), f"Dataset file {filepath} does not exists. Your dataset data is likely to be corrupted"

    try:
        with fs.open(filepath, "rb") as f:
            return codec.decode(f.read())
    except Exception as e:
        logger.error(traceback.format_exc() + str(e))
        raise Exception(
            f"Dataset file {filepath} does not exists. Your dataset data is likely to be corrupted"
        )
Example #20
def get_prefix_time_mapping(
    fs: fsspec.AbstractFileSystem, url: str
) -> Mapping[str, cftime.DatetimeJulian]:
    """Return a dictionary mapping restart file prefixes to times

    Args:
        fs: fsspec filesystem object
        url: url to the run-directory
    Returns:
        a mapping from file prefix (e.g. "INPUT" or a "RESTART/YYYYMMDD.HHMMSS"
        timestamp) to parsed datetime objects

    """
    times = _get_restart_times(fs, url)
    prefixes = _get_prefixes(fs.walk(url))
    return dict(zip(prefixes, times))
Example #21
    def prepare_fs(cls, fs: fsspec.AbstractFileSystem, root: str):
        if fs.isdir(root):
            # print(f'{fs.protocol}: deleting {root}')
            fs.delete(root, recursive=True)

        # print(f'{fs.protocol}: making root {root}')
        fs.mkdirs(root)

        # Write a text file into each subdirectory, so
        # we also test that store.get_data_ids() scans
        # recursively.
        dir_path = root
        for subdir_name in DATA_PATH.split('/'):
            dir_path += '/' + subdir_name
            # print(f'{fs.protocol}: making {dir_path}')
            fs.mkdir(dir_path)
            file_path = dir_path + '/README.md'
            # print(f'{fs.protocol}: writing {file_path}')
            with fs.open(file_path, 'w') as fp:
                fp.write('\n')
Example #22
def dataset_batches(
    file_meta: FileMeta, fs: fsspec.AbstractFileSystem, n_rows: int
) -> Iterator[pd.DataFrame]:
    try:
        d: ds.Dataset = ds.dataset(file_meta.filename, filesystem=fs)
    except ArrowInvalid:
        return
    for fn in sorted(map(str, d.files)):
        f = pq.ParquetFile(fs.open(fn))
        for batch in f.iter_batches(batch_size=n_rows):
            if batch.num_rows == 0:
                break
            df = batch.to_pandas()
            df = df[(df["ts_init"] >= file_meta.start) & (df["ts_init"] <= file_meta.end)]
            if df.empty:
                return
            if file_meta.instrument_id:
                df.loc[:, "instrument_id"] = file_meta.instrument_id
            yield df
Example #23
def append_zarr_along_time(source_path: str,
                           target_path: str,
                           fs: fsspec.AbstractFileSystem,
                           dim: str = "time"):
    """Append local zarr store at source_path to zarr store at target_path along time.
    
    Args:
        source_path: Local path to zarr store that represents an xarray dataset.
        target_path: Local or remote url for zarr store to be appended to.
        fs: Filesystem for target_path.
        dim: (optional) name of time dimension. Defaults to "time".

    Raises:
        ValueError: If the chunk size in time does not evenly divide length of time
            dimension for zarr stores at source_path.

    Warning:
        The zarr store at source_path will be modified in place.
    """

    merged_time = _get_merged_time_coordinate(source_path, target_path, dim,
                                              fs)
    if fs.exists(target_path):
        source_store = zarr.open(source_path, mode="r+")
        target_store = zarr.open_consolidated(fsspec.get_mapper(target_path))
        _assert_chunks_match(source_store, target_store, dim)
        _set_time_units_like(source_store, target_store)
        _shift_store(source_store, dim, _get_dim_size(target_store, dim))
    elif fs.protocol == "file":
        os.makedirs(target_path)

    upload_dir(source_path, target_path)
    _overwrite_time_array_with_single_chunk(target_path, merged_time, dim)

    _, _, absolute_target_paths = fsspec.get_fs_token_paths(target_path)
    consolidate_metadata(fs, absolute_target_paths[0])
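
A usage sketch for append_zarr_along_time above (paths and protocol are illustrative; upload_dir, _get_merged_time_coordinate and the other helpers come from the surrounding project):

import fsspec

target = "gs://my-bucket/run/output.zarr"       # illustrative remote zarr store
fs, _, _ = fsspec.get_fs_token_paths(target)    # filesystem matching the target protocol
append_zarr_along_time("local_segment.zarr", target, fs, dim="time")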
Example #24
def find_available_groups(
    product_files: T.Dict[str, T.Tuple[str, str, str, str, str]],
    product_path: str,
    check_files_exist: bool = False,
    fs: fsspec.AbstractFileSystem = fsspec.filesystem("file"),
) -> T.Dict[str, T.List[str]]:
    groups: T.Dict[str, T.List[str]] = {}
    for path, (type, _, swath, polarization, _) in product_files.items():
        swath_pol_group = f"{swath}/{polarization}".upper()
        abspath = os.path.join(product_path, os.path.normpath(path))
        if check_files_exist:
            if not fs.exists(abspath):
                continue
        if type == "s1Level1ProductSchema":
            groups[swath.upper()] = [""]
            groups[swath_pol_group] = [abspath] + groups.get(
                swath_pol_group, [])
            for metadata_group in [
                    "orbit",
                    "attitude",
                    "azimuth_fm_rate",
                    "dc_estimate",
                    "gcp",
                    "coordinate_conversion",
            ]:
                groups[f"{swath_pol_group}/{metadata_group}"] = [abspath]
        elif type == "s1Level1CalibrationSchema":
            groups[f"{swath_pol_group}/calibration"] = [abspath]
        elif type == "s1Level1NoiseSchema":
            groups[f"{swath_pol_group}/noise_range"] = [abspath]
            groups[f"{swath_pol_group}/noise_azimuth"] = [abspath]
        elif type == "s1Level1MeasurementSchema":
            groups[swath_pol_group] = [abspath] + groups.get(
                swath_pol_group, [])

    return groups
Example #25
def load_raw_data(location: str, fs: AbstractFileSystem) -> pd.DataFrame:
    with fs.open(location) as f:
        df = pd.read_csv(f, usecols=["beds", "accommodates"])
    return df
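
A usage sketch for load_raw_data above (the CSV path is illustrative and must contain the expected columns):

from fsspec.implementations.local import LocalFileSystem

df = load_raw_data("listings.csv", fs=LocalFileSystem())
print(df.head())    # only the "beds" and "accommodates" columns are read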
Example #26
def get_schema(fs: fsspec.AbstractFileSystem, url: str) -> xr.Dataset:
    logging.info(f"Grabbing schema from {url}")
    with fs.open(url, "rb") as f:
        return vcm.standardize_metadata(xr.open_dataset(f))
Example #27
def _load_restart(fs: fsspec.AbstractFileSystem, path: str) -> xr.Dataset:
    with fs.open(path) as f:
        return xr.open_dataset(f).compute()
Example #28
def write_parquet(
    fs: fsspec.AbstractFileSystem,
    path: str,
    df: pd.DataFrame,
    partition_cols: Optional[List[str]],
    schema: pa.Schema,
    **kwargs,
):
    """
    Write a single dataframe to parquet.
    """
    # Check partition values are valid before writing to parquet
    mappings = check_partition_columns(df=df, partition_columns=partition_cols)
    df = clean_partition_cols(df=df, mappings=mappings)

    # Dataframe -> pyarrow Table
    table = pa.Table.from_pandas(df, schema=schema)

    if "basename_template" not in kwargs and "ts_init" in df.columns:
        kwargs["basename_template"] = (
            f"{df['ts_init'].min()}-{df['ts_init'].max()}" + "-{i}.parquet"
        )

    # Write the actual file
    partitions = (
        ds.partitioning(
            schema=pa.schema(fields=[table.schema.field(c) for c in partition_cols]),
            flavor="hive",
        )
        if partition_cols
        else None
    )
    if pa.__version__ >= "6.0.0":
        kwargs.update(existing_data_behavior="overwrite_or_ignore")
    files = set(fs.glob(f"{path}/**"))
    ds.write_dataset(
        data=table,
        base_dir=path,
        filesystem=fs,
        partitioning=partitions,
        format="parquet",
        **kwargs,
    )

    # Ensure data written by write_dataset is sorted
    new_files = set(fs.glob(f"{path}/**/*.parquet")) - files
    del df
    for fn in new_files:
        ndf = pd.read_parquet(fs.open(fn))
        # assert ndf.shape[0] == shape
        if "ts_init" in ndf.columns:
            ndf = ndf.sort_values("ts_init").reset_index(drop=True)
        pq.write_table(
            table=pa.Table.from_pandas(ndf),
            where=fn,
            filesystem=fs,
        )

    # Write the ``_common_metadata`` parquet file without row groups statistics
    pq.write_metadata(table.schema, f"{path}/_common_metadata", version="2.6", filesystem=fs)

    # Write out any partition columns we had to modify due to filesystem requirements
    if mappings:
        write_partition_column_mappings(fs=fs, path=path, mappings=mappings)
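
A usage sketch for write_parquet above (the data is illustrative; check_partition_columns, clean_partition_cols and write_partition_column_mappings come from the surrounding project):

import pandas as pd
import pyarrow as pa
from fsspec.implementations.local import LocalFileSystem

df = pd.DataFrame({
    "ts_init": [1, 2, 3],
    "venue": ["A", "A", "B"],
    "price": [1.0, 2.0, 3.0],
})
write_parquet(
    fs=LocalFileSystem(),
    path="/tmp/quotes.parquet",
    df=df,
    partition_cols=["venue"],
    schema=pa.Schema.from_pandas(df),
)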
Example #29
def get_file_count(fs: fsspec.AbstractFileSystem, path):
    return len(fs.listdir(path, detail=False))
Example #30
def write_feather(table: pa.Table, path: str, file_system: AbstractFileSystem,
                  **kwargs):

    with file_system.open(path, "wb") as f:
        paf.write_feather(table, f, **kwargs)
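
A usage sketch for write_feather above (the output path is illustrative; paf is assumed to be pyarrow.feather):

import pyarrow as pa
from fsspec.implementations.local import LocalFileSystem

table = pa.table({"beds": [1, 2], "accommodates": [2, 4]})
write_feather(table, "/tmp/listings.feather", LocalFileSystem())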