class GcsUnstructuredProvider(UnstructuredStorageProvider):
    """This class allows you to upload arbitrary bytes to GCS.

    They will be stored under ``bucket_name/base_path/filename``.
    """

    file_system: GCSFileSystem

    def __init__(
        self,
        project: str,
        bucket_name: str,
        base_path: str,
        token: str = None,
    ) -> None:
        """
        Args:
            project: GCP project that owns the bucket.
            bucket_name: Target GCS bucket.
            base_path: Prefix under which all blobs are stored.
            token: Optional credential token forwarded to GCSFileSystem.
        """
        super().__init__()
        self.project = project
        self.bucket_name = bucket_name
        self.token = token
        # BUG FIX: the placeholder in the template was garbled. It must be
        # the literal "{filename}" (escaped as {{filename}} inside the
        # f-string) so that self.base_path.format(filename=...) in
        # store_blob produces the real target path.
        self.base_path = f"{bucket_name}/{base_path}/{{filename}}"
        # The set of all filenames ever uploaded, checked before uploading.
        self.file_name_cache: Set[str] = set()
        self.logger = logging.getLogger("openwpm")

    async def init(self) -> None:
        await super(GcsUnstructuredProvider, self).init()
        self.file_system = GCSFileSystem(
            project=self.project, token=self.token, access="read_write"
        )

    async def store_blob(
        self, filename: str, blob: bytes, overwrite: bool = False
    ) -> None:
        """Upload ``blob`` under ``filename``, skipping existing files.

        A local name cache avoids a round trip to GCS for files this
        process has already uploaded.
        """
        target_path = self.base_path.format(filename=filename)
        if not overwrite and (
            filename in self.file_name_cache
            or self.file_system.exists(target_path)
        ):
            self.logger.info("Not saving out file %s as it already exists", filename)
            return
        self.file_system.start_transaction()
        with self.file_system.open(target_path, mode="wb") as f:
            f.write(blob)
        self.file_system.end_transaction()
        self.file_name_cache.add(filename)

    async def flush_cache(self) -> None:
        # Nothing buffered locally; each store_blob commits immediately.
        pass

    async def shutdown(self) -> None:
        # No persistent resources to release.
        pass
def load_model_from_path(path, project_name=None, key=None):
    """Load a Keras model from a local path or a ``gs://`` URI.

    Args:
        path: Local filesystem path or ``gs://`` URI of the saved model.
        project_name: Optional GCP project name for GCSFileSystem.
        key: Optional auth token forwarded to GCSFileSystem.

    Returns:
        The loaded model, with the custom ``Swish`` and
        ``InstanceNormalization`` layers registered.
    """
    if path[:5] == 'gs://':
        # BUG FIX: `key` was accepted but never forwarded; pass it as the
        # auth token, matching load_npz elsewhere in this codebase.
        if project_name is None:
            fs = GCSFileSystem(token=key)
        else:
            fs = GCSFileSystem(project_name, token=key)
        file = fs.open(path)
    else:
        file = path
    return load_model(
        file,
        custom_objects={
            'Swish': Swish,
            'InstanceNormalization': InstanceNormalization,
        },
    )
def main(month, type_, outfile):
    """Build a tweet- or user-level graph for one month of rehydrated
    tweets and write it to GCS as GraphML.

    ``type_`` selects the graph: 'tweets' links tweet ids, 'users' links
    user names. Any other value raises TypeError.
    """
    spark = build_spark()
    raw_dat = spark.read.parquet('gs://spain-tweets/rehydrated/lake').where(f'month = {month}')
    dat = get_dat(spark, raw_dat)
    tweets = get_tweets(dat)

    # Pick the graph builder and the node-key column together.
    if type_ == 'tweets':
        graph_nodes, graph_edges = build_tweet_graph(tweets, dat)
        key_column = 'id_str'
    elif type_ == 'users':
        graph_nodes, graph_edges = build_user_graph(tweets)
        key_column = 'user'
    else:
        raise TypeError(f'Unrecognized type_ parameter: {type_}')

    G = create_graph(graph_nodes, graph_edges, key_column)

    fs = GCSFileSystem(project = 'trollhunters')
    with fs.open(outfile, 'wb') as f:
        nx.write_graphml(G, f)
def load_npz(path, project_name=None, key=None):
    """Load sunset-image data from a local or ``gs://`` .npz archive.

    Opens the archive (through GCSFileSystem for gs:// URIs), reads its
    first member, appends a trailing axis, and returns the
    ``'sunset_ims'`` field of the first record.
    """
    if path.startswith('gs://'):
        fs = (GCSFileSystem(token=key) if project_name is None
              else GCSFileSystem(project_name, token=key))
        file = fs.open(path)
    else:
        file = path

    print(f'Loading file {path.rsplit("/", 1)[-1]}')
    with np.load(file, allow_pickle=True) as npz:
        print(f'Available files: {npz.files}')
        first_member = npz[npz.files[0]]
        return np.expand_dims(first_member, -1)[0]['sunset_ims']
class GCSFS(Operations):
    """FUSE operations backed by a GCSFileSystem.

    File handles are the integer keys of ``self.cache``, which maps
    handle -> open GCS file object. ``self.root`` is prepended to every
    FUSE-relative path.
    """

    def __init__(self, path='.', gcs=None, **fsargs):
        if gcs is None:
            self.gcs = GCSFileSystem(**fsargs)
        else:
            self.gcs = gcs
        self.cache = {}      # fh (int) -> open file object
        self.counter = 0     # next file handle to hand out
        self.root = path

    def getattr(self, path, fh=None):
        """Return stat-like metadata; directories get fixed zero times."""
        try:
            info = self.gcs.info(''.join([self.root, path]))
        except FileNotFoundError:
            raise FuseOSError(ENOENT)
        data = {'st_uid': 1000, 'st_gid': 1000}
        perm = 0o777
        if info['storageClass'] == 'DIRECTORY' or 'bucket' in info['kind']:
            data['st_atime'] = 0
            data['st_ctime'] = 0
            data['st_mtime'] = 0
            data['st_mode'] = (stat.S_IFDIR | perm)
            data['st_size'] = 0
            data['st_blksize'] = 0
        else:
            data['st_atime'] = str_to_time(info['timeStorageClassUpdated'])
            data['st_ctime'] = str_to_time(info['timeCreated'])
            data['st_mtime'] = str_to_time(info['updated'])
            data['st_mode'] = (stat.S_IFREG | perm)
            data['st_size'] = info['size']
            data['st_blksize'] = 5 * 2**20
        data['st_nlink'] = 1
        return data

    def readdir(self, path, fh):
        path = ''.join([self.root, path])
        files = self.gcs.ls(path)
        files = [f.rstrip('/').rsplit('/', 1)[1] for f in files]
        return ['.', '..'] + files

    def mkdir(self, path, mode):
        bucket, key = core.split_path(path)
        if not self.gcs.info(path):
            # NOTE(review): the literal key 'bucket' looks suspicious —
            # presumably this should index gcs.dirs by the bucket name;
            # confirm against the structure of gcs.dirs before changing.
            self.gcs.dirs['bucket'].append({
                'bucket': bucket,
                'kind': 'storage#object',
                'size': 0,
                'storageClass': 'DIRECTORY',
                'name': path.rstrip('/') + '/'
            })

    def rmdir(self, path):
        info = self.gcs.info(path)
        # BUG FIX: original used dict slice syntax
        # info['storageClass':'DIRECTORY'], which raises TypeError on a
        # dict; the intent is an equality test.
        if info['storageClass'] == 'DIRECTORY':
            self.gcs.rm(path, False)

    def read(self, path, size, offset, fh):
        print('read', path, size, offset, fh)
        fn = ''.join([self.root, path])
        # BUG FIX: the cache is keyed by the integer file handle (see
        # create/open/write), not by filename — self.cache[fn] always
        # raised KeyError.
        f = self.cache[fh]
        f.seek(offset)
        out = f.read(size)
        return out

    def write(self, path, data, offset, fh):
        print('write', path, offset, fh)
        f = self.cache[fh]
        f.write(data)
        return len(data)

    def create(self, path, flags):
        print('create', path, oct(flags))
        fn = ''.join([self.root, path])
        self.gcs.touch(fn)  # this makes sure directory entry exists - wasteful!
        # write (but ignore creation flags)
        f = self.gcs.open(fn, 'wb')
        self.cache[self.counter] = f
        self.counter += 1
        return self.counter - 1

    def open(self, path, flags):
        print('open', path, oct(flags))
        fn = ''.join([self.root, path])
        if flags % 2 == 0:
            # read
            f = self.gcs.open(fn, 'rb')
        else:
            # write (but ignore creation flags)
            f = self.gcs.open(fn, 'wb')
        self.cache[self.counter] = f
        self.counter += 1
        return self.counter - 1

    def truncate(self, path, length, fh=None):
        print('truncate', path, length, fh)
        fn = ''.join([self.root, path])
        if length != 0:
            raise NotImplementedError
        # maybe should be no-op since open with write sets size to zero anyway
        self.gcs.touch(fn)

    def unlink(self, path):
        print('delete', path)
        fn = ''.join([self.root, path])
        try:
            self.gcs.rm(fn, False)
        except (IOError, FileNotFoundError):
            raise FuseOSError(EIO)

    def release(self, path, fh):
        print('close', path, fh)
        try:
            f = self.cache[fh]
            f.close()
            self.cache.pop(fh, None)  # should release any cache memory
        except Exception as e:
            print(e)
        return 0

    def chmod(self, path, mode):
        raise NotImplementedError
def read_schema(path):
    """Unpickle and return the schema object stored at ``path`` on GCS."""
    gcs = GCSFileSystem(project='trollhunters')
    with gcs.open(path, 'rb') as handle:
        return pickle.load(handle)
class GCSFS(Operations):
    """FUSE operations backed by GCSFileSystem, with read-side caching.

    Reads go through a ``SmallChunkCacher``; writes stream through
    ``write_cache``, which maps integer file handles to open GCS file
    objects. ``self.root`` is prepended to every FUSE-relative path.
    """

    def __init__(self, path='.', gcs=None, nfiles=10, **fsargs):
        if gcs is None:
            # minimum block size: still read on 5MB boundaries.
            self.gcs = GCSFileSystem(block_size=30 * 2 ** 20,
                                     cache_timeout=6000, **fsargs)
        else:
            self.gcs = gcs
        self.cache = SmallChunkCacher(self.gcs, nfiles=nfiles)
        self.write_cache = {}   # fh (int) -> open file object (write mode)
        self.counter = 0        # next file handle to hand out
        self.root = path

    @_tracemethod
    def getattr(self, path, fh=None):
        """Return stat-like metadata; implicit prefixes count as dirs."""
        path = ''.join([self.root, path])
        try:
            info = self.gcs.info(path)
        except FileNotFoundError:
            parent = path.rsplit('/', 1)[0]
            if path in self.gcs.ls(parent):
                # exists only as a listing prefix -> treat as a directory
                info = True
            else:
                raise FuseOSError(ENOENT)
        data = {'st_uid': 1000, 'st_gid': 1000}
        perm = 0o777
        if (info is True or info['storageClass'] == 'DIRECTORY'
                or 'bucket' in info['kind']):
            data['st_atime'] = 0
            data['st_ctime'] = 0
            data['st_mtime'] = 0
            data['st_mode'] = (stat.S_IFDIR | perm)
            data['st_size'] = 0
            data['st_blksize'] = 0
        else:
            data['st_atime'] = str_to_time(info['timeStorageClassUpdated'])
            data['st_ctime'] = str_to_time(info['timeCreated'])
            data['st_mtime'] = str_to_time(info['updated'])
            data['st_mode'] = (stat.S_IFREG | perm)
            data['st_size'] = info['size']
            data['st_blksize'] = 5 * 2**20
        data['st_nlink'] = 1
        return data

    @_tracemethod
    def readdir(self, path, fh):
        path = ''.join([self.root, path])
        logger.info("List {}, {}".format(path, fh))
        files = self.gcs.ls(path)
        files = [os.path.basename(f.rstrip('/')) for f in files]
        return ['.', '..'] + files

    @_tracemethod
    def mkdir(self, path, mode):
        path = ''.join([self.root, path])
        logger.info("Mkdir {}".format(path))
        parent, name = path.rsplit('/', 1)
        prefixes = self.gcs._listing_cache[parent + '/'][1]['prefixes']
        if name not in prefixes:
            prefixes.append(name)
        return 0

    @_tracemethod
    def rmdir(self, path):
        info = self.gcs.info(path)
        # BUG FIX: original used dict slice syntax
        # info['storageClass': 'DIRECTORY'], which raises TypeError on a
        # dict; the intent is an equality test.
        if info['storageClass'] == 'DIRECTORY':
            self.gcs.rm(path, False)

    @_tracemethod
    def read(self, path, size, offset, fh):
        fn = ''.join([self.root, path])
        logger.info('read #{} ({}) offset: {}, size: {}'.format(
            fh, fn, offset, size))
        out = self.cache.read(fn, offset, size)
        return out

    @_tracemethod
    def write(self, path, data, offset, fh):
        fn = ''.join([self.root, path])
        # BUG FIX: the format string was missing the offset placeholder,
        # so the third argument was silently dropped.
        logger.info('write #{} ({}) offset: {}'.format(fh, fn, offset))
        f = self.write_cache[fh]
        f.write(data)
        return len(data)

    @_tracemethod
    def create(self, path, flags):
        fn = ''.join([self.root, path])
        logger.info('create {} {}'.format(fn, oct(flags)))
        self.gcs.touch(fn)  # this makes sure directory entry exists - wasteful!
        # write (but ignore creation flags)
        f = self.gcs.open(fn, 'wb')
        self.write_cache[self.counter] = f
        logger.info('-> fh #{}'.format(self.counter))
        self.counter += 1
        return self.counter - 1

    @_tracemethod
    def open(self, path, flags):
        fn = ''.join([self.root, path])
        logger.info('open {} {}'.format(fn, oct(flags)))
        if flags % 2 == 0:
            # read
            self.cache.open(fn)
        else:
            # write (but ignore creation flags)
            # BUG FIX: the open file object was never bound to `f`,
            # so the write_cache assignment raised NameError.
            f = self.gcs.open(fn, 'wb')
            self.write_cache[self.counter] = f
        logger.info('-> fh #{}'.format(self.counter))
        self.counter += 1
        return self.counter - 1

    @_tracemethod
    def truncate(self, path, length, fh=None):
        fn = ''.join([self.root, path])
        logger.info('truncate #{} ({}) to {}'.format(fh, fn, length))
        if length != 0:
            raise NotImplementedError
        # maybe should be no-op since open with write sets size to zero anyway
        self.gcs.touch(fn)

    @_tracemethod
    def unlink(self, path):
        fn = ''.join([self.root, path])
        # BUG FIX: logger.info('delete', fn) passed fn as a %-format
        # argument with no placeholder in the message, which raises a
        # logging formatting error.
        logger.info('delete {}'.format(fn))
        try:
            self.gcs.rm(fn, False)
        except (IOError, FileNotFoundError):
            raise FuseOSError(EIO)

    @_tracemethod
    def release(self, path, fh):
        fn = ''.join([self.root, path])
        logger.info('close #{} ({})'.format(fh, fn))
        try:
            if fh in self.write_cache:
                # write mode
                f = self.write_cache[fh]
                f.close()
                self.write_cache.pop(fh, None)
        except Exception as e:
            logger.exception("exception on release:" + str(e))
        return 0

    @_tracemethod
    def chmod(self, path, mode):
        raise NotImplementedError