def open_aws_url(config, _, storage, url):
    reader_impl = SourceFile.extract_reader_impl(config)
    use_aws_account = (
        "aws_access_key_id" in config["provider"]
        and "aws_secret_access_key" in config["provider"]
        and storage == "s3://"
    )

    if reader_impl == "s3fs":
        if use_aws_account:
            aws_access_key_id = config["provider"].get("aws_access_key_id")
            aws_secret_access_key = config["provider"].get("aws_secret_access_key")
            s3 = S3FileSystem(anon=False, key=aws_access_key_id, secret=aws_secret_access_key)
        else:
            s3 = S3FileSystem(anon=True)
        result = s3.open(f"s3://{url}", mode="r")
    else:
        # Non-s3fs path: `open` here is presumably smart_open's open(), given the
        # transport_params usage below; credentials are embedded in the URL,
        # anonymous access goes through an unsigned botocore client config.
        if use_aws_account:
            aws_access_key_id = config["provider"].get("aws_access_key_id", "")
            aws_secret_access_key = config["provider"].get("aws_secret_access_key", "")
            result = open(f"s3://{aws_access_key_id}:{aws_secret_access_key}@{url}")
        else:
            config = Config(signature_version=UNSIGNED)
            params = {
                "resource_kwargs": {"config": config},
            }
            result = open(f"{storage}{url}", transport_params=params)
    return result
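# Usage sketch for open_aws_url above. The config layout is an assumption
# inferred from the lookups in the function ("provider" holding the AWS keys);
# the bucket, credentials and reader selection are hypothetical placeholders.
example_config = {
    "provider": {
        "aws_access_key_id": "AKIA...",        # hypothetical credential
        "aws_secret_access_key": "wJalr...",   # hypothetical credential
    },
    # assumed to make SourceFile.extract_reader_impl(config) return "s3fs"
    "reader_impl": "s3fs",
}
# file_like = open_aws_url(example_config, None, "s3://", "my-bucket/data/file.csv")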
def __init__(self,
             obs_file_system: s3fs.S3FileSystem,
             dir_path: str,
             zarr_kwargs: Dict[str, Any] = None,
             ds_id: str = None,
             exception_type: type = ValueError):
    level_paths = {}
    for entry in obs_file_system.walk(dir_path, directories=True):
        level_dir = entry.split("/")[-1]
        basename, ext = os.path.splitext(level_dir)
        if basename.isdigit():
            level = int(basename)
            if entry.endswith(".zarr") and obs_file_system.isdir(entry):
                level_paths[level] = (ext, dir_path + "/" + level_dir)
            elif entry.endswith(".link") and obs_file_system.isfile(entry):
                level_paths[level] = (ext, dir_path + "/" + level_dir)

    num_levels = len(level_paths)
    # Consistency check
    for level in range(num_levels):
        if level not in level_paths:
            raise exception_type(
                f"Invalid multi-level dataset {ds_id!r}: missing level {level} in {dir_path}"
            )

    super().__init__(ds_id=ds_id, parameters=zarr_kwargs)
    self._obs_file_system = obs_file_system
    self._dir_path = dir_path
    self._level_paths = level_paths
    self._num_levels = num_levels
def __init__(self, key=None, username=None, secret=None, password=None,
             path=None, host=None, s3=None, **kwargs):
    if username is not None:
        if key is not None:
            raise KeyError("S3 storage options got secrets argument "
                           "collision. Please, use either `key` "
                           "storage option or password field in URLpath, "
                           "not both options together.")
        key = username
    if key is not None:
        kwargs['key'] = key

    if password is not None:
        if secret is not None:
            raise KeyError("S3 storage options got secrets argument "
                           "collision. Please, use either `secret` "
                           "storage option or password field in URLpath, "
                           "not both options together.")
        secret = password
    if secret is not None:
        kwargs['secret'] = secret

    # S3FileSystem.__init__(self, kwargs)  # not sure what to do here
    S3FileSystem.__init__(self, **kwargs)
def _write_test_data(cls, s3: s3fs.S3FileSystem):
    if not s3.isdir(cls.BUCKET_NAME):
        s3.mkdir(cls.BUCKET_NAME)
    data = helpers.make_test_store()
    s3map = s3fs.S3Map(root=cls.BUCKET_NAME + '/cube_1.zarr', s3=s3, create=True)
    s3map.update(data)
    s3map = s3fs.S3Map(root=cls.BUCKET_NAME + '/cube_2.zarr', s3=s3, create=True)
    s3map.update(data)
def _s3_open_file_with_retries(fs: s3fs.S3FileSystem, path: str, retries: int) -> Any:
    for _ in range(retries):
        try:
            logger.info(f"opening {path}")
            file = fs.open(path)
            return file
        except Exception as ex:
            logger.warning(f"could not open {path}: {ex}")
            # If the file has just been uploaded, it might not be visible yet,
            # and the failed open has been cached by s3fs, so we invalidate
            # the cache ...
            fs.invalidate_cache(path)
            # ... and give S3 some time to settle the file status.
            sleep(1)
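# Usage sketch for _s3_open_file_with_retries; the bucket and object key are
# hypothetical placeholders, and anonymous access is assumed for the example.
import s3fs

fs = s3fs.S3FileSystem(anon=True)
handle = _s3_open_file_with_retries(fs, "my-bucket/path/to/object.parquet", retries=3)
# Note: if every attempt fails, the function falls through and returns None.
if handle is not None:
    with handle:
        first_bytes = handle.read(16)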
def resolve_filesystem_and_path(uri: str, **kwargs) -> Tuple[EnhancedFileSystem, str]:
    parsed_uri = urlparse(uri)
    fs_path = parsed_uri.path
    # from https://github.com/apache/arrow/blob/master/python/pyarrow/filesystem.py#L419
    # with viewfs support
    if parsed_uri.scheme == 'hdfs' or parsed_uri.scheme == 'viewfs':
        netloc_split = parsed_uri.netloc.split(':')
        host = netloc_split[0]
        if host == '':
            host = 'default'
        else:
            host = parsed_uri.scheme + "://" + host
        port = 0
        if len(netloc_split) == 2 and netloc_split[1].isnumeric():
            port = int(netloc_split[1])
        fs = EnhancedFileSystem(pyarrow.hdfs.connect(host=host, port=port))
    elif parsed_uri.scheme == 's3' or parsed_uri.scheme == 's3a':
        fs = EnhancedFileSystem(
            pyarrow.filesystem.S3FSWrapper(S3FileSystem(**kwargs)))
    else:
        # Input is a local path such as /home/user/myfile.parquet
        fs = EnhancedFileSystem(
            pyarrow.filesystem.LocalFileSystem.get_instance())
    _logger.info(f"Resolved base filesystem: {type(fs.base_fs)}")
    return fs, fs_path
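# Usage sketch for resolve_filesystem_and_path; the URIs below are hypothetical
# examples. Extra keyword arguments are forwarded to S3FileSystem for the
# s3/s3a schemes (e.g. anon=True for public buckets).
local_fs, local_path = resolve_filesystem_and_path("/tmp/example/myfile.parquet")
# hdfs_fs, hdfs_path = resolve_filesystem_and_path("hdfs://namenode:8020/user/alice/data.parquet")
# s3_fs, s3_path = resolve_filesystem_and_path("s3://my-bucket/data.parquet", anon=True)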
def filesystem() -> AbstractFileSystem:
    fs = LocalFileSystem()
    endpoint_url = os.getenv("LIGHTNING_BUCKET_ENDPOINT_URL", "")
    bucket_name = os.getenv("LIGHTNING_BUCKET_NAME", "")
    if endpoint_url != "" and bucket_name != "":
        key = os.getenv("LIGHTNING_AWS_ACCESS_KEY_ID", "")
        secret = os.getenv("LIGHTNING_AWS_SECRET_ACCESS_KEY", "")

        # TODO: Remove when updated on the platform side.
        if key == "" or secret == "":
            key = os.getenv("AWS_ACCESS_KEY_ID", "")
            secret = os.getenv("AWS_SECRET_ACCESS_KEY", "")

        if key == "" or secret == "":
            raise RuntimeError("missing S3 bucket credentials")

        fs = S3FileSystem(key=key, secret=secret, use_ssl=False,
                          client_kwargs={"endpoint_url": endpoint_url})

        app_id = os.getenv("LIGHTNING_CLOUD_APP_ID", "")
        if app_id == "":
            raise RuntimeError("missing LIGHTNING_CLOUD_APP_ID")

        if not fs.exists(shared_storage_path()):
            raise RuntimeError(
                f"shared filesystem {shared_storage_path()} does not exist")
    return fs
def retrieve_puf(aws_access_key_id=AWS_ACCESS_KEY_ID,
                 aws_secret_access_key=AWS_SECRET_ACCESS_KEY):
    """
    Function for retrieving the PUF from the OSPC S3 bucket
    """
    s3_reader_installed = S3FileSystem is not None
    has_credentials = (aws_access_key_id is not None
                       and aws_secret_access_key is not None)
    if has_credentials and s3_reader_installed:
        print("Reading puf from S3 bucket.")
        # Use the credentials passed as arguments (rather than the module-level
        # constants) so callers can override them.
        fs = S3FileSystem(key=aws_access_key_id, secret=aws_secret_access_key)
        with fs.open("s3://ospc-data-files/puf.csv.gz") as f:
            puf_df = pd.read_csv(f, compression="gzip")
        return puf_df
    elif Path("puf.csv.gz").exists():
        print("Reading puf from puf.csv.gz.")
        return pd.read_csv("puf.csv.gz", compression="gzip")
    elif Path("puf.csv").exists():
        print("Reading puf from puf.csv.")
        return pd.read_csv("puf.csv")
    else:
        warnings.warn(
            f"PUF file not available (has_credentials={has_credentials}, "
            f"s3_reader_installed={s3_reader_installed})")
        return None
def write_df_to_parquet_to_s3(df: pd.DataFrame, filename: str,
                              s3_bucketname: str, s3_bucketkey=None):
    # TODO: Upload directly to S3 instead of writing the parquet file into the
    # current working directory first and then uploading it.
    assert 's3://' not in s3_bucketname, 'prefix "s3://" not required'
    assert filename[-8:] == '.parquet', 'filename must have suffix ".parquet"'

    if 's3://' not in s3_bucketname:
        s3_bucketname = 's3://' + s3_bucketname

    table = pa.Table.from_pandas(df)
    pq.write_table(table, filename)

    if s3_bucketkey is not None:
        key_to_use = s3_bucketkey + '/' + filename
    else:
        key_to_use = filename

    outputfile = s3_bucketname + '/' + key_to_use
    s3 = S3FileSystem()
    pq.write_to_dataset(table=table, root_path=outputfile, filesystem=s3)
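# Usage sketch for write_df_to_parquet_to_s3; the bucket name and key prefix
# are hypothetical, and AWS credentials are assumed to be resolvable by s3fs
# (environment variables, shared credentials file, or instance profile).
import pandas as pd

df = pd.DataFrame({"trip_id": [1, 2, 3], "fare": [7.5, 12.0, 3.25]})
write_df_to_parquet_to_s3(df, "trips.parquet", "my-example-bucket",
                          s3_bucketkey="daily/2020-01-01")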
def path_exists(self, path):
    if 's3://' in path:
        path_in_s3 = path.replace("s3://", "")
        return S3FileSystem(anon=False).exists(path_in_s3)
    else:
        return os.path.exists(path)
def s3_connect(self):
    """
    Wrapper to create an AWS S3 session with the given authorization key
    """
    session = boto3.Session()
    self.s3_conn = session.resource("s3")
    self.s3_fs = S3FileSystem()
def s3_service(self):
    try:
        return self._tls.s3_service
    except (AttributeError, KeyError):
        from s3fs import S3FileSystem
        self._tls.s3_service = S3FileSystem(**self.s3_args)
        return self._tls.s3_service
def _get_s3(key=None, username=None, secret=None, password=None, **kwargs):
    """ Reuse ``s3`` instance or construct a new S3FileSystem from storage_options.

    >>> isinstance(_get_s3(), S3FileSystem)
    True
    >>> s3 = _get_s3(anon=False)
    >>> s3.anon
    False
    """
    if username is not None:
        if key is not None:
            raise KeyError("S3 storage options got secrets argument "
                           "collision. Please, use either `key` "
                           "storage option or password field in URLpath, "
                           "not both options together.")
        key = username
    if key is not None:
        kwargs['key'] = key

    if password is not None:
        if secret is not None:
            raise KeyError("S3 storage options got secrets argument "
                           "collision. Please, use either `secret` "
                           "storage option or password field in URLpath, "
                           "not both options together.")
        secret = password
    if secret is not None:
        kwargs['secret'] = secret

    return S3FileSystem(**kwargs)
def __init__(self,
             s3_file_system: s3fs.S3FileSystem,
             dir_path: str,
             zarr_kwargs: Dict[str, Any] = None,
             ds_id: str = None,
             chunk_cache_capacity: int = None,
             exception_type: type = ValueError):
    level_paths = {}
    entries = s3_file_system.ls(dir_path, detail=False)
    for entry in entries:
        level_dir = entry.split("/")[-1]
        basename, ext = os.path.splitext(level_dir)
        if basename.isdigit():
            level = int(basename)
            if entry.endswith(".zarr") and s3_file_system.isdir(entry):
                level_paths[level] = (ext, dir_path + "/" + level_dir)
            elif entry.endswith(".link") and s3_file_system.isfile(entry):
                level_paths[level] = (ext, dir_path + "/" + level_dir)

    num_levels = len(level_paths)
    # Consistency check
    for level in range(num_levels):
        if level not in level_paths:
            raise exception_type(
                f"Invalid multi-level dataset {ds_id!r}: missing level {level} in {dir_path}"
            )

    super().__init__(ds_id=ds_id, parameters=zarr_kwargs)
    self._s3_file_system = s3_file_system
    self._dir_path = dir_path
    self._level_paths = level_paths
    self._num_levels = num_levels

    self._chunk_cache_capacities = None
    if chunk_cache_capacity:
        # Weight each pyramid level by the square of its downsampling factor,
        # then split the total capacity proportionally.
        weights = []
        weight_sum = 0
        for level in range(num_levels):
            weight = 2 ** (num_levels - 1 - level)
            weight *= weight
            weight_sum += weight
            weights.append(weight)
        self._chunk_cache_capacities = [
            round(chunk_cache_capacity * weight / weight_sum)
            for weight in weights
        ]
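# Worked example (not from the original source) of the chunk_cache_capacity
# weighting above: with num_levels = 3 the per-level weights are
# (2**2)**2, (2**1)**2, (2**0)**2 = 16, 4, 1, so level 0 (the full-resolution
# level) gets 16/21 of the total capacity, level 1 gets 4/21 and level 2 gets 1/21.
num_levels = 3
chunk_cache_capacity = 1024 ** 3  # hypothetical 1 GiB budget
weights = [(2 ** (num_levels - 1 - level)) ** 2 for level in range(num_levels)]  # [16, 4, 1]
capacities = [round(chunk_cache_capacity * w / sum(weights)) for w in weights]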
def write_run_config_to_s3(self, config_string):
    s3_key = self._project_parameters.compile_path({}, 'run_config', 'toml')
    s3_path = Path(s3_key)
    backup_path = Path(*s3_path.parts[:-1],
                       f'run_config_until_{datetime.now()}.toml')

    # Back up the current config before overwriting it with the new one.
    with S3FileSystem().open(
            f'{self._aws_parameters.s3_config_bucket}/{backup_path}', 'wb') as f:
        f.write(toml.dumps(self._run_config).encode('utf-8'))

    with S3FileSystem().open(
            f'{self._aws_parameters.s3_config_bucket}/{s3_key}', 'wb') as f:
        f.write(config_string.encode('utf-8'))

    message = f'New config written to {self._aws_parameters.s3_config_bucket}/{s3_key}'
    print(message)
    return message
def run(self):
    table = pq.read_table(self.input().path)
    pq.write_to_dataset(
        table,
        root_path='s3://aws-meetup-almaty/yellow-taxi-ds',
        partition_cols=['pickup_date'],
        filesystem=S3FileSystem(),
    )
def open(self, path, mode='rb'):
    s3_path = self._trim_filename(path)
    f = S3FileSystem.open(self, s3_path, mode=mode)
    return f
def open(self, path, mode='rb', **kwargs):
    bucket = kwargs.pop('host', '')
    s3_path = bucket + path
    return S3FileSystem.open(self, s3_path, mode=mode)
def glob(self, path, **kwargs):
    bucket = kwargs.pop('host', '')
    s3_path = bucket + path
    return S3FileSystem.glob(self, s3_path)
def glob(self, path):
    s3_path = self._trim_filename(path)
    return ['s3://%s' % s for s in S3FileSystem.glob(self, s3_path)]