Example #1
def open_aws_url(config, _, storage, url):
    reader_impl = SourceFile.extract_reader_impl(config)
    use_aws_account = (
        "aws_access_key_id" in config["provider"]
        and "aws_secret_access_key" in config["provider"]
        and storage == "s3://"
    )
    if reader_impl == "s3fs":
        if use_aws_account:
            # Authenticated access through s3fs.
            aws_access_key_id = config["provider"].get("aws_access_key_id")
            aws_secret_access_key = config["provider"].get("aws_secret_access_key")
            s3 = S3FileSystem(anon=False, key=aws_access_key_id, secret=aws_secret_access_key)
            result = s3.open(f"s3://{url}", mode="r")
        else:
            # Anonymous access through s3fs.
            s3 = S3FileSystem(anon=True)
            result = s3.open(f"s3://{url}", mode="r")
    else:
        if use_aws_account:
            # Credentials embedded in the URL for a smart_open-style open().
            aws_access_key_id = config["provider"].get("aws_access_key_id", "")
            aws_secret_access_key = config["provider"].get("aws_secret_access_key", "")
            result = open(f"s3://{aws_access_key_id}:{aws_secret_access_key}@{url}")
        else:
            # Unsigned (anonymous) requests via a botocore Config.
            config = Config(signature_version=UNSIGNED)
            params = {
                "resource_kwargs": {"config": config},
            }
            result = open(f"{storage}{url}", transport_params=params)
    return result
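A hedged usage sketch: the config layout is inferred from the function above, the credential values are placeholders, and SourceFile.extract_reader_impl() is assumed to resolve to "s3fs" for this config.

config = {"provider": {"aws_access_key_id": "AKIA...", "aws_secret_access_key": "..."}}  # placeholders
f = open_aws_url(config, None, "s3://", "my-bucket/data.csv")  # hypothetical bucket/key
print(f.read())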
Example #2
    def __init__(self,
                 obs_file_system: s3fs.S3FileSystem,
                 dir_path: str,
                 zarr_kwargs: Dict[str, Any] = None,
                 ds_id: str = None,
                 exception_type: type = ValueError):

        level_paths = {}
        for entry in obs_file_system.walk(dir_path, directories=True):
            level_dir = entry.split("/")[-1]
            basename, ext = os.path.splitext(level_dir)
            if basename.isdigit():
                level = int(basename)
                if entry.endswith(".zarr") and obs_file_system.isdir(entry):
                    level_paths[level] = (ext, dir_path + "/" + level_dir)
                elif entry.endswith(".link") and obs_file_system.isfile(entry):
                    level_paths[level] = (ext, dir_path + "/" + level_dir)

        num_levels = len(level_paths)
        # Consistency check
        for level in range(num_levels):
            if level not in level_paths:
                raise exception_type(
                    f"Invalid multi-level dataset {ds_id!r}: missing level {level} in {dir_path}"
                )

        super().__init__(ds_id=ds_id, parameters=zarr_kwargs)
        self._obs_file_system = obs_file_system
        self._dir_path = dir_path
        self._level_paths = level_paths
        self._num_levels = num_levels
Example #3
def __init__(self,
             key=None,
             username=None,
             secret=None,
             password=None,
             path=None,
             host=None,
             s3=None,
             **kwargs):
    if username is not None:
        if key is not None:
            raise KeyError("S3 storage options got secrets argument "
                           "collision. Please, use either `key` "
                           "storage option or password field in URLpath, "
                           "not both options together.")
        key = username
    if key is not None:
        kwargs['key'] = key
    if password is not None:
        if secret is not None:
            raise KeyError("S3 storage options got secrets argument "
                           "collision. Please, use either `secret` "
                           "storage option or password field in URLpath, "
                           "not both options together.")
        secret = password
    if secret is not None:
        kwargs['secret'] = secret
    # S3FileSystem.__init__(self, kwargs)  # not sure what to do here
    S3FileSystem.__init__(self, **kwargs)
Example #4
    def _write_test_data(cls, s3: s3fs.S3FileSystem):
        if not s3.isdir(cls.BUCKET_NAME):
            s3.mkdir(cls.BUCKET_NAME)

        data = helpers.make_test_store()
        s3map = s3fs.S3Map(root=cls.BUCKET_NAME + '/cube_1.zarr',
                           s3=s3,
                           create=True)
        s3map.update(data)
        s3map = s3fs.S3Map(root=cls.BUCKET_NAME + '/cube_2.zarr',
                           s3=s3,
                           create=True)
        s3map.update(data)
Example #5
def _s3_open_file_with_retries(fs: s3fs.S3FileSystem, path: str,
                               retries: int) -> Any:
    for _ in range(retries):
        try:
            logger.info(f"opening {path}")
            file = fs.open(path)
            return file
        except Exception as ex:
            logger.warning(f"could not open {path}: {ex}")
            # If the file has just been uploaded, it might not be visible immediately,
            # but the failed open may already be cached by s3fs, so invalidate the cache
            fs.invalidate_cache(path)
            # and give S3 some time to settle the file status.
            sleep(1)
    # All retries failed: raise instead of silently returning None.
    raise IOError(f"could not open {path} after {retries} retries")
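A brief usage sketch for the retry helper above; the bucket and key are placeholders, and the filesystem uses whatever AWS credentials are configured in the environment.

fs = s3fs.S3FileSystem()
with _s3_open_file_with_retries(fs, "my-bucket/data/table.parquet", retries=3) as f:
    payload = f.read()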
Example #6
def resolve_filesystem_and_path(uri: str,
                                **kwargs) -> Tuple[EnhancedFileSystem, str]:
    parsed_uri = urlparse(uri)
    fs_path = parsed_uri.path
    # from https://github.com/apache/arrow/blob/master/python/pyarrow/filesystem.py#L419
    # with viewfs support
    if parsed_uri.scheme == 'hdfs' or parsed_uri.scheme == 'viewfs':
        netloc_split = parsed_uri.netloc.split(':')
        host = netloc_split[0]
        if host == '':
            host = 'default'
        else:
            host = parsed_uri.scheme + "://" + host
        port = 0
        if len(netloc_split) == 2 and netloc_split[1].isnumeric():
            port = int(netloc_split[1])

        fs = EnhancedFileSystem(pyarrow.hdfs.connect(host=host, port=port))
    elif parsed_uri.scheme == 's3' or parsed_uri.scheme == 's3a':
        fs = EnhancedFileSystem(
            pyarrow.filesystem.S3FSWrapper(S3FileSystem(**kwargs)))
    else:
        # Input is local path such as /home/user/myfile.parquet
        fs = EnhancedFileSystem(
            pyarrow.filesystem.LocalFileSystem.get_instance())

    _logger.info(f"Resolved base filesystem: {type(fs.base_fs)}")
    return fs, fs_path
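A hedged usage sketch; the URIs are illustrative, and EnhancedFileSystem is assumed to come from the same project as the function above.

fs, path = resolve_filesystem_and_path("s3a://my-bucket/events/part-0.parquet", anon=True)
fs, path = resolve_filesystem_and_path("hdfs://namenode:8020/user/me/data.parquet")
fs, path = resolve_filesystem_and_path("/home/user/myfile.parquet")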
Example #7
def filesystem() -> AbstractFileSystem:
    fs = LocalFileSystem()

    endpoint_url = os.getenv("LIGHTNING_BUCKET_ENDPOINT_URL", "")
    bucket_name = os.getenv("LIGHTNING_BUCKET_NAME", "")
    if endpoint_url != "" and bucket_name != "":
        key = os.getenv("LIGHTNING_AWS_ACCESS_KEY_ID", "")
        secret = os.getenv("LIGHTNING_AWS_SECRET_ACCESS_KEY", "")
        # TODO: Remove when updated on the platform side.
        if key == "" or secret == "":
            key = os.getenv("AWS_ACCESS_KEY_ID", "")
            secret = os.getenv("AWS_SECRET_ACCESS_KEY", "")
        if key == "" or secret == "":
            raise RuntimeError("missing S3 bucket credentials")

        fs = S3FileSystem(key=key,
                          secret=secret,
                          use_ssl=False,
                          client_kwargs={"endpoint_url": endpoint_url})

        app_id = os.getenv("LIGHTNING_CLOUD_APP_ID", "")
        if app_id == "":
            raise RuntimeError("missing LIGHTNING_CLOUD_APP_ID")

        if not fs.exists(shared_storage_path()):
            raise RuntimeError(
                f"shared filesystem {shared_storage_path()} does not exist")

    return fs
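A hedged sketch of the environment this helper expects; the endpoint, bucket, credentials and app id below are placeholders, and shared_storage_path() must already exist in the bucket.

import os

os.environ["LIGHTNING_BUCKET_ENDPOINT_URL"] = "http://localhost:9000"   # placeholder endpoint
os.environ["LIGHTNING_BUCKET_NAME"] = "my-bucket"                       # placeholder bucket
os.environ["LIGHTNING_AWS_ACCESS_KEY_ID"] = "test-key"                  # placeholder credentials
os.environ["LIGHTNING_AWS_SECRET_ACCESS_KEY"] = "test-secret"
os.environ["LIGHTNING_CLOUD_APP_ID"] = "app-123"                        # placeholder app id

fs = filesystem()  # returns an S3FileSystem when the bucket variables are set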
Example #8
def retrieve_puf(aws_access_key_id=AWS_ACCESS_KEY_ID,
                 aws_secret_access_key=AWS_SECRET_ACCESS_KEY):
    """
    Function for retrieving the PUF from the OSPC S3 bucket
    """
    s3_reader_installed = S3FileSystem is not None
    has_credentials = (aws_access_key_id is not None
                       and aws_secret_access_key is not None)
    if has_credentials and s3_reader_installed:
        print("Reading puf from S3 bucket.")
        fs = S3FileSystem(
            key=aws_access_key_id,
            secret=aws_secret_access_key,
        )
        with fs.open("s3://ospc-data-files/puf.csv.gz") as f:
            # Skips over header from top of file.
            puf_df = pd.read_csv(f, compression="gzip")
        return puf_df
    elif Path("puf.csv.gz").exists():
        print("Reading puf from puf.csv.gz.")
        return pd.read_csv("puf.csv.gz", compression="gzip")
    elif Path("puf.csv").exists():
        print("Reading puf from puf.csv.")
        return pd.read_csv("puf.csv")
    else:
        warnings.warn(
            f"PUF file not available (has_credentials={has_credentials}, "
            f"s3_reader_installed={s3_reader_installed})")
        return None
Example #9
def write_df_to_parquet_to_s3(df: pd.DataFrame,
                              filename: str,
                              s3_bucketname: str,
                              s3_bucketkey=None):
    # TODO: Avoid writing the parquet file into the current working directory and
    # then uploading it; ideally this should write directly to S3 instead.

    assert 's3://' not in s3_bucketname, 'prefix "s3://" not required'
    assert filename.endswith('.parquet'), 'filename must have suffix ".parquet"'

    s3_bucketname = 's3://' + s3_bucketname

    table = pa.Table.from_pandas(df)
    pq.write_table(table, filename)

    if s3_bucketkey is not None:
        key_to_use = s3_bucketkey + '/' + filename
    else:
        key_to_use = filename

    outputfile = s3_bucketname + '/' + key_to_use

    s3 = S3FileSystem()
    pq.write_to_dataset(table=table, root_path=outputfile, filesystem=s3)
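A short usage sketch for the function above; the bucket name and key prefix are placeholders, and AWS credentials are assumed to be available to S3FileSystem().

df = pd.DataFrame({"pickup_date": ["2021-01-01"], "fare_amount": [12.5]})
write_df_to_parquet_to_s3(df, "trips.parquet", "my-bucket", s3_bucketkey="daily")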
Example #10
    def path_exists(self, path):
        if 's3://' in path:
            path_in_s3 = path.replace("s3://", "")

            return S3FileSystem(anon=False).exists(path_in_s3)
        else:
            return os.path.exists(path)
Example #11
def s3_connect(self):
    """
    Wrapper that creates an AWS S3 session with the given authorization key.
    """
    session = boto3.Session()
    self.s3_conn = session.resource("s3")
    self.s3_fs = S3FileSystem()
Example #12
def s3_service(self):
    try:
        return self._tls.s3_service
    except (AttributeError, KeyError):
        from s3fs import S3FileSystem
        self._tls.s3_service = S3FileSystem(**self.s3_args)
        return self._tls.s3_service
Example #13
def _get_s3(key=None, username=None, secret=None, password=None, **kwargs):
    """ Reuse ``s3`` instance or construct a new S3FileSystem from storage_options.

    >>> isinstance(_get_s3(), S3FileSystem)
    True
    >>> s3 = _get_s3(anon=False)
    >>> s3.anon
    False
    """
    if username is not None:
        if key is not None:
            raise KeyError("S3 storage options got secrets argument "
                           "collision. Please, use either `key` "
                           "storage option or password field in URLpath, "
                           "not both options together.")
        key = username
    if key is not None:
        kwargs['key'] = key
    if password is not None:
        if secret is not None:
            raise KeyError("S3 storage options got secrets argument "
                           "collision. Please, use either `secret` "
                           "storage option or password field in URLpath, "
                           "not both options together.")
        secret = password
    if secret is not None:
        kwargs['secret'] = secret
    return S3FileSystem(**kwargs)
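A hedged usage sketch for _get_s3; the bucket name and credential values are placeholders.

# Anonymous access to a public bucket.
s3 = _get_s3(anon=True)
# Credentials supplied via the username/password aliases taken from a URLpath.
s3 = _get_s3(username="AKIA...", password="...")
listing = s3.ls("my-bucket")  # hypothetical bucket name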
Example #14
    def __init__(self,
                 s3_file_system: s3fs.S3FileSystem,
                 dir_path: str,
                 zarr_kwargs: Dict[str, Any] = None,
                 ds_id: str = None,
                 chunk_cache_capacity: int = None,
                 exception_type: type = ValueError):

        level_paths = {}
        entries = s3_file_system.ls(dir_path, detail=False)
        for entry in entries:
            level_dir = entry.split("/")[-1]
            basename, ext = os.path.splitext(level_dir)
            if basename.isdigit():
                level = int(basename)
                if entry.endswith(".zarr") and s3_file_system.isdir(entry):
                    level_paths[level] = (ext, dir_path + "/" + level_dir)
                elif entry.endswith(".link") and s3_file_system.isfile(entry):
                    level_paths[level] = (ext, dir_path + "/" + level_dir)

        num_levels = len(level_paths)
        # Consistency check
        for level in range(num_levels):
            if level not in level_paths:
                raise exception_type(
                    f"Invalid multi-level dataset {ds_id!r}: missing level {level} in {dir_path}"
                )

        super().__init__(ds_id=ds_id, parameters=zarr_kwargs)
        self._s3_file_system = s3_file_system
        self._dir_path = dir_path
        self._level_paths = level_paths
        self._num_levels = num_levels

        self._chunk_cache_capacities = None
        if chunk_cache_capacity:
            # Split the total capacity across levels: level i gets weight
            # 4 ** (num_levels - 1 - i), so lower level indices receive a
            # proportionally larger share.
            weights = []
            weight_sum = 0
            for level in range(num_levels):
                weight = 2**(num_levels - 1 - level)
                weight *= weight
                weight_sum += weight
                weights.append(weight)
            self._chunk_cache_capacities = [
                round(chunk_cache_capacity * weight / weight_sum)
                for weight in weights
            ]
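As a worked check of the weighting above: with num_levels = 3 and chunk_cache_capacity = 1_000_000, the per-level weights are 16, 4 and 1 (sum 21), so the capacities become roughly 761905, 190476 and 47619, i.e. level 0 receives the largest share.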
Example #15
def write_run_config_to_s3(self, config_string):
    s3_key = self._project_parameters.compile_path({}, 'run_config',
                                                   'toml')
    s3_path = Path(s3_key)
    backup_path = Path(*s3_path.parts[:-1],
                       f'run_config_until_{datetime.now()}.toml')
    with S3FileSystem().open(
            f'{self._aws_parameters.s3_config_bucket}/{backup_path}',
            'wb') as f:
        f.write(toml.dumps(self._run_config).encode('utf-8'))
    with S3FileSystem().open(
            f'{self._aws_parameters.s3_config_bucket}/{s3_key}',
            'wb') as f:
        f.write(config_string.encode('utf-8'))
    message = f'New config written to {self._aws_parameters.s3_config_bucket}/{s3_key}'
    print(message)
    return message
Example #16
def run(self):
    table = pq.read_table(self.input().path)
    pq.write_to_dataset(
        table,
        root_path='s3://aws-meetup-almaty/yellow-taxi-ds',
        partition_cols=['pickup_date'],
        filesystem=S3FileSystem(),
    )
Example #17
File: s3.py Project: fortizc/dask
def __init__(self, key=None, username=None, secret=None, password=None,
             path=None, host=None, s3=None, **kwargs):
    if username is not None:
        if key is not None:
            raise KeyError("S3 storage options got secrets argument "
                           "collision. Please, use either `key` "
                           "storage option or password field in URLpath, "
                           "not both options together.")
        key = username
    if key is not None:
        kwargs['key'] = key
    if password is not None:
        if secret is not None:
            raise KeyError("S3 storage options got secrets argument "
                           "collision. Please, use either `secret` "
                           "storage option or password field in URLpath, "
                           "not both options together.")
        secret = password
    if secret is not None:
        kwargs['secret'] = secret
    # S3FileSystem.__init__(self, kwargs)  # not sure what to do here
    S3FileSystem.__init__(self, **kwargs)
Example #18
File: s3.py Project: fortizc/dask
def open(self, path, mode='rb'):
    s3_path = self._trim_filename(path)
    f = S3FileSystem.open(self, s3_path, mode=mode)
    return f
Example #19
def open(self, path, mode='rb', **kwargs):
    bucket = kwargs.pop('host', '')
    s3_path = bucket + path
    return S3FileSystem.open(self, s3_path, mode=mode)
Example #20
def glob(self, path, **kwargs):
    bucket = kwargs.pop('host', '')
    s3_path = bucket + path
    return S3FileSystem.glob(self, s3_path)
Example #21
File: s3.py Project: fortizc/dask
def glob(self, path):
    s3_path = self._trim_filename(path)
    return ['s3://%s' % s for s in S3FileSystem.glob(self, s3_path)]