def load_model(filesystem: S3FS, config: PredictionConfig) -> ResUnetA:
    """ Copy the model locally if it does not exist yet and load it """
    if not os.path.exists(f'{config.temp_model_path}/{config.model_name}'):
        if not filesystem.exists(f'{config.model_path}/{config.model_name}/checkpoints/'):
            filesystem.makedirs(f'{config.model_path}/{config.model_name}/checkpoints/')

        copy_dir(filesystem,
                 f'{config.model_path}/{config.model_name}/checkpoints/',
                 f'{config.temp_model_path}/{config.model_name}',
                 'checkpoints')

        copy_file(filesystem,
                  f'{config.model_path}/{config.model_name}/model_cfg.json',
                  f'{config.temp_model_path}/{config.model_name}',
                  'model_cfg.json')

    input_shape = dict(features=[None, config.height, config.width, config.n_channels])

    with open(f'{config.temp_model_path}/{config.model_name}/model_cfg.json', 'r') as jfile:
        model_cfg = json.load(jfile)

    # Initialise model from config, build, compile and load trained weights
    model = ResUnetA(model_cfg)
    model.build(input_shape)
    model.net.compile()
    model.net.load_weights(f'{config.temp_model_path}/{config.model_name}/checkpoints/model.ckpt')

    return model
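# Hypothetical driver for load_model above. PredictionConfig is assumed here to
# be a simple config object exposing the attributes the function reads
# (model_path, temp_model_path, model_name, height, width, n_channels); the
# bucket, paths and model name below are placeholders.
filesystem = S3FS('field-delineation-data')
config = PredictionConfig(model_path='models',
                          temp_model_path='/tmp/models',
                          model_name='resunet-a-v1',
                          height=256, width=256, n_channels=4)
model = load_model(filesystem, config)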
def load_metadata(filesystem: S3FS, config: PredictionConfig) -> pd.DataFrame:
    """ Load DataFrame with info about normalisation factors """
    metadata_dir = os.path.dirname(config.metadata_path)
    if not filesystem.exists(metadata_dir):
        filesystem.makedirs(metadata_dir)

    df = pd.read_csv(filesystem.open(f'{config.metadata_path}'))

    normalisation_factors = df.groupby(pd.to_datetime(df.timestamp).dt.to_period("M")).max()
    normalisation_factors['month'] = pd.to_datetime(normalisation_factors.timestamp).dt.month

    return normalisation_factors
def _configure_backing_store(self):
    try:
        backing_stores = []
        for bs in self.config['Backing Store']:
            if 'Type' in bs:
                for key, item in bs.items():
                    bs[key] = _get_from_env(item)
                if bs['Type'].lower() == 's3':
                    backing_stores.append(S3FS(
                        bs['Bucket'],
                        strict=False,
                        aws_access_key_id=bs.get('Key ID', None),
                        aws_secret_access_key=bs.get('Secret Key', None),
                        endpoint_url=bs.get('Endpoint URL', None)
                    ))
                elif 'dav' in bs['Type'].lower():
                    if not webdav_available:
                        raise exceptions.NoWebdav("no webdavfs module was found")
                    if bs['Root'][0] != '/':
                        bs['Root'] = '/' + bs['Root']
                    backing_stores.append(WebDAVFS(
                        url=bs['Base URL'],
                        login=bs['Username'],
                        password=bs['Password'],
                        root=bs['Root']
                    ))
                else:
                    _config_error("Unknown filesystem type.")
            else:
                backing_stores.append(fs.open_fs(bs['URI'], create=True))
    except (KeyError, OSError, CreateFailed) as err:
        _config_error(err)
    return backing_stores
def _construct_norm_arrays(file_path: str, metadata_path: str, fold: int = None,
                           filesystem: S3FS = None) -> Tuple[np.ndarray, np.ndarray, np.ndarray, np.ndarray]:
    """ Return arrays with normalisation factors to be used """
    chunk_name = os.path.basename(file_path)

    df = pd.read_csv(filesystem.open(metadata_path)) if filesystem is not None else pd.read_csv(metadata_path)
    df = df[df.chunk == chunk_name]

    if fold is not None:
        df = df[df.fold == fold]

    perc99 = df[['norm_perc99_b0', 'norm_perc99_b1', 'norm_perc99_b2', 'norm_perc99_b3']].values
    meanstd_mean = df[['norm_meanstd_mean_b0', 'norm_meanstd_mean_b1',
                       'norm_meanstd_mean_b2', 'norm_meanstd_mean_b3']].values
    meanstd_median = df[['norm_meanstd_median_b0', 'norm_meanstd_median_b1',
                         'norm_meanstd_median_b2', 'norm_meanstd_median_b3']].values
    meanstd_std = df[['norm_meanstd_std_b0', 'norm_meanstd_std_b1',
                      'norm_meanstd_std_b2', 'norm_meanstd_std_b3']].values

    return perc99, meanstd_mean, meanstd_median, meanstd_std
def load_s3_filesystem(path, strict=False, config=None):
    """ Loads an AWS s3 filesystem from a path

    :param path: A path to a folder on an s3 bucket that will be the base folder in this filesystem
    :type path: str
    :param strict: If `True` the filesystem will make additional checks against s3. Default is `False`.
    :type strict: bool
    :param config: A configuration object with AWS credentials. By default it is set to None, in which case the
        default configuration will be taken.
    :type config: SHConfig or None
    :return: An S3 filesystem object
    :rtype: fs_s3fs.S3FS
    """
    if not path.startswith('s3://'):
        raise ValueError("AWS path has to start with s3:// but found '{}'".format(path))

    if config is None:
        config = SHConfig()

    path_chunks = path.split('/', 3)[2:]
    bucket_name = path_chunks[0]
    dir_path = path_chunks[1] if len(path_chunks) > 1 else '/'

    return S3FS(bucket_name=bucket_name,
                dir_path=dir_path,
                aws_access_key_id=config.aws_access_key_id if config.aws_access_key_id else None,
                aws_secret_access_key=config.aws_secret_access_key if config.aws_secret_access_key else None,
                strict=strict)
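# Hedged usage sketch for load_s3_filesystem above; the bucket and folder names
# are placeholders, and credentials are assumed to come from the default SHConfig.
s3_filesystem = load_s3_filesystem('s3://my-bucket/eopatches')

# The result is a plain fs_s3fs.S3FS, so the usual pyfilesystem calls apply.
for name in s3_filesystem.listdir('/'):
    print(name)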
def test_upload_args(self):
    s3 = S3FS("foo", acl="acl", cache_control="cc")
    self.assertDictEqual(
        s3._get_upload_args("test.jpg"),
        {"ACL": "acl", "CacheControl": "cc", "ContentType": "image/jpeg"},
    )
    self.assertDictEqual(
        s3._get_upload_args("test.mp3"),
        {"ACL": "acl", "CacheControl": "cc", "ContentType": "audio/mpeg"},
    )
    self.assertDictEqual(
        s3._get_upload_args("test.json"),
        {"ACL": "acl", "CacheControl": "cc", "ContentType": "application/json"},
    )
    self.assertDictEqual(
        s3._get_upload_args("unknown.unknown"),
        {"ACL": "acl", "CacheControl": "cc", "ContentType": "binary/octet-stream"},
    )
def get_services(**options):
    """Instantiate an S3 filesystem service for loading and saving files from the ETL."""
    return {
        'fs': S3FS(options["bucket"],
                   aws_access_key_id=options["key"],
                   aws_secret_access_key=options["secret_key"],
                   endpoint_url=options["endpoint_url"])
    }
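# Minimal sketch of calling get_services above from an ETL entry point; all
# option values are placeholders, not real credentials or endpoints.
services = get_services(bucket='etl-staging-data',
                        key='AKIAEXAMPLE',
                        secret_key='secret-placeholder',
                        endpoint_url=None)
etl_fs = services['fs']
print(etl_fs.listdir('/'))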
def get_s3fs(namespace):
    """
    Helper method to get_filesystem for a file system on S3
    """
    key_id = DJFS_SETTINGS.get('aws_access_key_id', None)
    key_secret = DJFS_SETTINGS.get('aws_secret_access_key', None)

    fullpath = namespace
    if 'prefix' in DJFS_SETTINGS:
        fullpath = os.path.join(DJFS_SETTINGS['prefix'], fullpath)

    s3fs = S3FS(DJFS_SETTINGS['bucket'], fullpath,
                aws_secret_access_key=key_secret,
                aws_access_key_id=key_id,
                acl=DJFS_SETTINGS.get('acl', None))

    def get_s3_url(self, filename, timeout=60):
        """
        Patch method that returns a signed S3 url for the given filename

        Note that this will return a url whether or not the requested file exists.

        Arguments:
            self (obj): S3FS instance that this function has been patched onto
            filename (str): The name of the file we are retrieving a url for
            timeout (int): How long the url should be valid for; S3 enforces this limit

        Returns:
            str: A signed url to the requested file in S3
        """
        global S3CONN
        try:
            if not S3CONN:
                S3CONN = S3Connection(aws_access_key_id=key_id, aws_secret_access_key=key_secret)
            return S3CONN.generate_url(timeout, 'GET', bucket=DJFS_SETTINGS['bucket'],
                                       key=os.path.join(fullpath, filename))
        except Exception:  # pylint: disable=broad-except
            # Retry on error; typically the connection has timed out, but the
            # broad except covers all errors.
            S3CONN = S3Connection(aws_access_key_id=key_id, aws_secret_access_key=key_secret)
            return S3CONN.generate_url(timeout, 'GET', bucket=DJFS_SETTINGS['bucket'],
                                       key=os.path.join(fullpath, filename))

    s3fs = patch_fs(s3fs, namespace, get_s3_url)
    return s3fs
def load_dates(filesystem: S3FS, tile_name: str) -> List[datetime]:
    """ Load a json file with dates from the bucket and parse out dates """
    path = f'/{tile_name}/userdata.json'

    with filesystem.open(path, 'r') as fp:
        userdata = json.load(fp)

    dates_list = json.loads(userdata['dates'])

    return [parse(date) for date in dates_list]
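# Hypothetical call of load_dates above; the bucket and tile name are
# placeholders, and the tile folder is assumed to contain a userdata.json file.
filesystem = S3FS('sentinel-tiles-bucket')
dates = load_dates(filesystem, '33TUM')
print(f'{len(dates)} timestamps, first: {dates[0].isoformat() if dates else "n/a"}')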
def _resolve_neural_files_bom(neural_files_or_bom: list = None):
    """
    This function is typically used internally by map_video_to_neural_and_sleep_state(...);
    use save_neural_files_bom to create the CSV once.

    This function resolves a list of neural filenames, or a CSV bill of materials containing a list of the
    neural files with their sizes and ecube timestamps, to a list of (ecube_time, file_size, neural_filename).

    :param neural_files_or_bom: a list of neural files (non-globs), or a list containing a single CSV file
        which is the bill of materials (BOM) CSV file listing all neural data files in the format:
        ecube_time, file_size, neural_filename
    :return: list in the form [(ecube_time, file_size, neural_filename), (...), ...]
    """
    assert neural_files_or_bom is not None and len(neural_files_or_bom) > 0, 'No neural files found.'

    uses_s3 = any([f.startswith('s3://') for f in neural_files_or_bom])
    if uses_s3:
        _verify_s3_support()

    if len(neural_files_or_bom) == 1 and neural_files_or_bom[0].endswith('.csv'):
        with open(neural_files_or_bom[0], 'r') as csv_file:
            csv_reader = csv.reader(csv_file)
            result = [tuple(row) for row in csv_reader]
    else:
        result = []
        for nfile in neural_files_or_bom:
            if nfile.startswith('s3://'):
                o = urllib.parse.urlparse(nfile)
                bucket = o.netloc
                key = o.path
                with S3FS(bucket, endpoint_url=os.environ.get('ENDPOINT_URL', None), strict=False) as s3fs:
                    s3f = s3fs.openbin(key)
                    ecube_time = np.frombuffer(s3f.read(8), dtype=np.uint64)[0]
                    file_size = s3f.size
            else:
                with open(nfile, 'rb') as f:
                    ecube_time = np.fromfile(f, dtype=np.uint64, count=1)[0]
                    file_size = os.fstat(f.fileno()).st_size
            filename = os.path.split(nfile)[-1]
            result.append((ecube_time, file_size, filename))

    return result
def _create_new_s3_fs():
    """Creates a new empty mocked s3 bucket. If one such bucket already exists it deletes it first."""
    bucket_name = "mocked-test-bucket"
    s3resource = boto3.resource("s3", region_name="eu-central-1")
    bucket = s3resource.Bucket(bucket_name)

    if bucket.creation_date:  # If the bucket already exists
        for key in bucket.objects.all():
            key.delete()
        bucket.delete()

    s3resource.create_bucket(Bucket=bucket_name,
                             CreateBucketConfiguration={"LocationConstraint": "eu-central-1"})

    return S3FS(bucket_name=bucket_name)
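# Sketch of driving _create_new_s3_fs above from a test, assuming the mocked S3
# backend is provided by the moto library (the decorator is mock_s3 in moto < 5
# and mock_aws in moto >= 5).
from moto import mock_s3


@mock_s3
def test_mocked_bucket_roundtrip():
    mocked_fs = _create_new_s3_fs()
    mocked_fs.writetext('hello.txt', 'hello world')
    assert mocked_fs.readtext('hello.txt') == 'hello world'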
def _resolve_glob(file_glob):
    if file_glob.startswith('s3://'):
        _verify_s3_support()
        o = urllib.parse.urlparse(file_glob)
        bucket = o.netloc
        key = o.path
        s3fs = S3FS(bucket, endpoint_url=os.environ.get('ENDPOINT_URL', None), strict=False)
        result = ['s3://{}{}'.format(bucket, match.path) for match in s3fs.glob(key)]
    else:
        result = glob.glob(file_glob)
    return result
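# Hypothetical inputs for _resolve_glob above; the local directory, bucket and
# file pattern are placeholders.
local_files = _resolve_glob('/data/ecube/Headstages_64_Channels_*.bin')
s3_files = _resolve_glob('s3://my-neural-bucket/recording/Headstages_64_Channels_*.bin')
print(len(local_files), len(s3_files))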
def __enter__(self) -> S3FS:
    AwsSessionHook.__enter__(self)

    self.bucket = self.conn_params.extra['bucket']
    self.base_path = self.conn_params.extra.get('base_path')

    if self.conn_params.login and self.conn_params.password:
        kwargs = {
            'aws_access_key_id': self.conn_params.login,
            'aws_secret_access_key': self.conn_params.password
        }
    elif self.session:
        # Get a session token
        client = self.session.client('sts')
        session_token = client.get_session_token()
        kwargs = {'aws_session_token': session_token}
    else:
        kwargs = {}

    self.conn = S3FS(self.bucket, dir_path=self.base_path, **kwargs)
    return self.conn
def _configure_backing_store(self):
    try:
        bs = self.config['Backing Store']
        if 'Type' in bs:
            for key, item in bs.items():
                bs[key] = _get_from_env(item)
            if bs['Type'].lower() == 's3':
                return S3FS(
                    bs['Bucket'],
                    strict=False,
                    aws_access_key_id=bs.get('Key ID', None),
                    aws_secret_access_key=bs.get('Secret Key', None),
                    endpoint_url=bs.get('Endpoint URL', None)
                )
        else:
            return fs.open_fs(bs['URI'], create=True)
    except (KeyError, OSError, CreateFailed) as err:
        _config_error(err)
def open_fs(fs_url, **kwargs):
    """Open a pyfs filesystem.

    Like fs.open_fs, this will simply return the FS instance if one is given as
    the fs_url parameter.
    """
    if isinstance(fs_url, fs.base.FS):
        return fs_url
    # Now assume a string that may be a path (no ://) or else a filesystem URL
    if "://" not in fs_url:
        # A path, assume this is not URI escaped which is what the OSFS(..)
        # creator assumes (as opposed to open_fs(..))
        return OSFS(fs_url, **kwargs)
    # We have a URL, parse it
    parse_result = fs.opener.parse(fs_url)
    if parse_result.protocol == 's3':
        # An S3 URL: mostly repeat
        # https://github.com/PyFilesystem/s3fs/blob/master/fs_s3fs/opener.py
        # but adjust the handling of strict to default to strict=False
        bucket_name, _, dir_path = parse_result.resource.partition("/")
        if not bucket_name:
            raise fs.opener.errors.OpenerError("invalid bucket name in '{}'".format(fs_url))
        # Instead of allowing this to be turned on by a strict=1 in the
        # URL query params, allow it to be turned off by strict!=1
        strict = (parse_result.params["strict"] == "1"
                  if "strict" in parse_result.params else False)
        s3fs = S3FS(bucket_name,
                    dir_path=dir_path or "/",
                    aws_access_key_id=parse_result.username or None,
                    aws_secret_access_key=parse_result.password or None,
                    endpoint_url=parse_result.params.get("endpoint_url", None),
                    acl=parse_result.params.get("acl", None),
                    cache_control=parse_result.params.get("cache_control", None),
                    strict=strict)
        # Patch in a version of the getinfo method that doesn't check the parent directory
        s3fs.getinfo = s3fs._getinfo  # pylint: disable=protected-access
        return s3fs
    # Non-S3 URL
    return fs.open_fs(fs_url, **kwargs)
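# Usage sketch for open_fs above: a bare path goes through OSFS, while an s3://
# URL (bucket and endpoint below are placeholders) takes the S3 branch with
# strict defaulting to False unless strict=1 is passed in the query string.
local_fs = open_fs('/tmp/ocfl-root')
remote_fs = open_fs('s3://ocfl-demo-bucket/objects?endpoint_url=https://s3.example.org')
print(type(local_fs).__name__, type(remote_fs).__name__)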
def ocfl_opendir(pyfs, dir, **kwargs):
    """Open directory while handling the case of S3 without directory objects.

    FIXME - DIRTY HACK
    """
    if isinstance(pyfs, S3FS):
        # Hack for S3 because the standard opendir(..) fails when there
        # isn't a directory object (even with strict=False)
        new_dir_path = fs.path.join(pyfs.dir_path, dir)
        s3fs = S3FS(pyfs._bucket_name,  # pylint: disable=protected-access
                    dir_path=new_dir_path,
                    aws_access_key_id=pyfs.aws_access_key_id,
                    aws_secret_access_key=pyfs.aws_secret_access_key,
                    endpoint_url=pyfs.endpoint_url,
                    # acl=pyfs.acl,
                    # cache_control=pyfs.cache_control,
                    strict=pyfs.strict)
        # Patch in a version of the getinfo method that doesn't check the parent directory
        s3fs.getinfo = s3fs._getinfo  # pylint: disable=protected-access
        return s3fs
    # Not S3, just use regular opendir(..)
    return pyfs.opendir(dir, **kwargs)
def test_upload_args(self):
    s3 = S3FS('foo', acl='acl', cache_control='cc')
    self.assertDictEqual(s3._get_upload_args('test.jpg'),
                         {'ACL': 'acl', 'CacheControl': 'cc', 'ContentType': 'image/jpeg'})
    self.assertDictEqual(s3._get_upload_args('test.mp3'),
                         {'ACL': 'acl', 'CacheControl': 'cc', 'ContentType': 'audio/mpeg'})
    self.assertDictEqual(s3._get_upload_args('test.json'),
                         {'ACL': 'acl', 'CacheControl': 'cc', 'ContentType': 'application/json'})
    self.assertDictEqual(s3._get_upload_args('unknown.unknown'),
                         {'ACL': 'acl', 'CacheControl': 'cc', 'ContentType': 'binary/octet-stream'})
def _open_fs(directory):
    if directory.startswith("s3://"):
        # Manually fetch the credentials from the environment.
        # Requires the following env variables:
        #   - S3_ACCESS_KEY
        #   - S3_SECRET_KEY
        #   - S3_URL
        from fs_s3fs import S3FS
        if not directory.endswith("/"):
            directory += "/"
        bucket, fpath = directory[len("s3://"):].split("/", 1)
        return S3FS(bucket,
                    dir_path=fpath,
                    aws_access_key_id=os.environ.get("S3_ACCESS_KEY", None),
                    aws_secret_access_key=os.environ.get('S3_SECRET_KEY', None),
                    strict=False,
                    endpoint_url=os.environ.get('S3_URL', None))
    else:
        return open_fs(directory)
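# Hedged example of calling _open_fs above with an S3 location; the bucket,
# prefix and the environment variable values are placeholders for illustration.
os.environ.setdefault('S3_ACCESS_KEY', 'minio')
os.environ.setdefault('S3_SECRET_KEY', 'minio-secret')
os.environ.setdefault('S3_URL', 'http://localhost:9000')

store = _open_fs('s3://experiments/run-42')
store.makedirs('artifacts', recreate=True)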
def load_s3_filesystem(path: str, strict: bool = False, config: Optional[SHConfig] = None,
                       aws_profile: Optional[str] = None) -> S3FS:
    """Loads an AWS s3 filesystem from a path.

    :param path: A path to a folder on an s3 bucket that will be the base folder in this filesystem
    :type path: str
    :param strict: If `True` the filesystem will make additional checks against s3. Default is `False`.
    :type strict: bool
    :param config: A configuration object with AWS credentials. By default it is set to None, in which case the
        default configuration will be taken.
    :type config: SHConfig or None
    :param aws_profile: The name of an AWS profile. If given, AWS credentials will be taken from there.
    :return: An S3 filesystem object
    :rtype: fs_s3fs.S3FS
    """
    if not is_s3_path(path):
        raise ValueError(f"AWS path has to start with s3:// but found '{path}'.")

    config = config or SHConfig()
    if aws_profile:
        config = get_aws_credentials(aws_profile, config=config)

    path_chunks = path.split("/", 3)[2:]
    bucket_name = path_chunks[0]
    dir_path = path_chunks[1] if len(path_chunks) > 1 else "/"

    return S3FS(
        bucket_name=bucket_name,
        dir_path=dir_path,
        aws_access_key_id=config.aws_access_key_id or None,
        aws_secret_access_key=config.aws_secret_access_key or None,
        aws_session_token=config.aws_session_token or None,
        strict=strict,
    )
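# Hypothetical call of the profile-aware load_s3_filesystem above; the profile
# name and path are placeholders for values defined in a local AWS configuration.
filesystem = load_s3_filesystem('s3://my-bucket/project/data', aws_profile='research')
print(filesystem.isdir('/'))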
def test_path_to_key_subdir(self):
    s3 = S3FS("foo", "/dir")
    self.assertEqual(s3._path_to_key("foo.bar"), "dir/foo.bar")
    self.assertEqual(s3._path_to_key("foo/bar"), "dir/foo/bar")
def test_path_to_key_subdir(self):
    s3 = S3FS('foo', '/dir')
    self.assertEqual(s3._path_to_key('foo.bar'), 'dir/foo.bar')
    self.assertEqual(s3._path_to_key('foo/bar'), 'dir/foo/bar')
def test_path_to_key(self):
    s3 = S3FS('foo')
    self.assertEqual(s3._path_to_key('foo.bar'), 'foo.bar')
    self.assertEqual(s3._path_to_key('foo/bar'), 'foo/bar')
def make_fs(self):
    self._delete_bucket_contents()
    self.s3.Object(self.bucket_name, 'subdirectory').put()
    return S3FS(self.bucket_name, dir_path='subdirectory')
def make_fs(self):
    self._delete_bucket_contents()
    return S3FS(self.bucket_name)
def prepare_filesystem(config: BaseConfig) -> S3FS:
    return S3FS(bucket_name=config.bucket_name,
                aws_access_key_id=config.aws_access_key_id,
                aws_secret_access_key=config.aws_secret_access_key,
                region=config.aws_region)
def npz_dir_dataset(file_dir_or_list: Union[str, List[str]], features: dict, metadata_path: str,
                    fold: int = None, randomize: bool = True, num_parallel: int = 5,
                    shuffle_size: int = 500, filesystem: S3FS = None,
                    npz_from_s3: bool = False) -> tf.data.Dataset:
    """ Creates a tf.data.Dataset from a directory containing numpy .npz files.

    Files are loaded lazily when needed. `num_parallel` files are read in parallel and interleaved together.

    :param file_dir_or_list: directory containing .npz files or a list of paths to .npz files
    :param features: dict of (`field` -> `feature_name`) mappings, where `field` is the field in the .npz array
        and `feature_name` is the name of the feature it is saved to.
    :param metadata_path: path to the input csv file with patchlet information
    :param fold: in k-fold validation, the fold to consider when querying the patchlet info dataframe
    :param randomize: whether to shuffle the samples of the dataset or not, defaults to `True`
    :param num_parallel: number of files to read in parallel and interleave, defaults to 5
    :param shuffle_size: buffer size for shuffling file order, defaults to 500
    :param filesystem: filesystem to access the bucket, defaults to None
    :param npz_from_s3: if True, npz files are loaded from the S3 bucket, otherwise from local disk
    :return: dataset containing examples merged from files
    """
    files = file_dir_or_list

    if npz_from_s3:
        assert filesystem is not None

    # If a directory is given, list the .npz files in it
    if isinstance(file_dir_or_list, str):
        if filesystem and not filesystem.isdir(file_dir_or_list):
            filesystem.makedirs(file_dir_or_list)
        dir_list = os.listdir(file_dir_or_list) if not npz_from_s3 else filesystem.listdir(file_dir_or_list)
        files = [os.path.join(file_dir_or_list, f) for f in dir_list]

    fields = list(features.keys())

    # Read one file for shape info
    file = next(iter(files))
    data = np.load(file) if not npz_from_s3 else np.load(filesystem.openbin(file))
    np_arrays = [data[f] for f in fields]

    # Append norm arrays
    perc99, meanstd_mean, meanstd_median, meanstd_std = _construct_norm_arrays(file, metadata_path,
                                                                               fold, filesystem)
    np_arrays.append(perc99)
    np_arrays.append(meanstd_mean)
    np_arrays.append(meanstd_median)
    np_arrays.append(meanstd_std)

    # Read shape and type info
    # types = tuple(arr.dtype for arr in np_arrays)
    types = (tf.uint16, tf.float32, tf.float32, tf.float32, tf.float64, tf.float64, tf.float64, tf.float64)
    shapes = tuple(arr.shape[1:] for arr in np_arrays)

    # Create datasets
    datasets = [_npz_file_lazy_dataset(file, fields, types, shapes, metadata_path, fold=fold,
                                       filesystem=filesystem, npz_from_s3=npz_from_s3) for file in files]
    ds = tf.data.Dataset.from_tensor_slices(datasets)

    # Shuffle files and interleave multiple files in parallel
    if randomize:
        ds = ds.shuffle(shuffle_size)

    ds = ds.interleave(lambda x: x, cycle_length=num_parallel)

    return ds
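# Sketch of building a training dataset with npz_dir_dataset above; the paths,
# feature-name mapping, fold and batch size are placeholders for illustration.
dataset = npz_dir_dataset(file_dir_or_list='/local/patchlets/npz',
                          features={'X': 'features', 'y_extent': 'labels_extent'},
                          metadata_path='/local/patchlets/patchlet-info.csv',
                          fold=1,
                          randomize=True,
                          npz_from_s3=False)

for batch in dataset.batch(8).take(1):
    print(batch)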
def _open_fs(self, user_context):
    props = self._serialization_props(user_context)
    handle = S3FS(**props)
    return handle
from fs_s3fs import S3FS

s3fs = S3FS(u'fsexample')
print(s3fs)

with s3fs.openbin(u'test.bin', u'w') as f:
    f.write(b'a')
    f.write(b'b')
    f.write(b'c')

print(s3fs.getinfo(u'test.bin', namespaces=['s3']).raw)

import io
f = io.BytesIO(b'Hello, World')
s3fs.setbinfile(u'b', f)
print(s3fs.geturl(u'b'))

s3fs.makedir(u'foo', recreate=True)
print(s3fs.geturl(u'/foo'))
s3fs.settext(u'/foo/bar', u'Hello')

s3fs = S3FS(u'fsexample', dir_path='foo')
print(s3fs)
print(s3fs._prefix)
print(s3fs.listdir(u'/'))
print(s3fs._path_to_dir_key(u'/'))
print(s3fs._path_to_dir_key(u''))
print(s3fs._path_to_dir_key(u'bar'))
print(s3fs._path_to_dir_key(u'/bar'))
# f = s3fs.openbin(u'newfile', 'ab')
def test_path_to_key(self):
    s3 = S3FS("foo")
    self.assertEqual(s3._path_to_key("foo.bar"), "foo.bar")
    self.assertEqual(s3._path_to_key("foo/bar"), "foo/bar")
def _prepare_filesystem(sampling_config: SamplingConfig) -> S3FS:
    return S3FS(bucket_name=sampling_config.bucket_name,
                aws_access_key_id=sampling_config.aws_access_key_id,
                aws_secret_access_key=sampling_config.aws_secret_access_key,
                region=sampling_config.aws_region)