class GCSHook(FileSystemHookInterface):
    conn_type = 'gcs'
    credentials_env_var = 'GOOGLE_APPLICATION_CREDENTIALS'

    def __init__(self, conn_params: ConnectionParams):
        self.conn_params = conn_params

    def __enter__(self) -> 'GCSFS':
        from fs_gcsfs import GCSFS

        # Temporarily point GOOGLE_APPLICATION_CREDENTIALS at the configured key file,
        # remembering whatever was set before so it can be restored on exit.
        self.saved_credentials = os.environ.get(GCSHook.credentials_env_var)
        credentials_path = self.conn_params.extra.get('credentials_path')
        if credentials_path:
            os.environ[GCSHook.credentials_env_var] = credentials_path

        self.bucket = self.conn_params.extra['bucket']
        self.base_path = self.conn_params.extra.get('base_path')
        self.create = self.conn_params.extra.get('create', False)
        self.strict = self.conn_params.extra.get('strict', True)
        self.conn = GCSFS(bucket_name=self.bucket,
                          root_path=self.base_path,
                          create=self.create,
                          strict=self.strict)
        return self.conn

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Restore the previous credentials variable, or remove it if it was not set
        # before __enter__ overrode it.
        if self.saved_credentials:
            os.environ[GCSHook.credentials_env_var] = self.saved_credentials
        else:
            os.environ.pop(GCSHook.credentials_env_var, None)
        self.conn.close()
        self.conn = None
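# Hypothetical usage of the hook above (not from the original source). ConnectionParams'
# constructor is unknown here, so a SimpleNamespace exposing the same `extra` dict stands
# in, and the bucket name is a placeholder.
from types import SimpleNamespace

params = SimpleNamespace(extra={'bucket': 'my-example-bucket',
                                'base_path': 'data',
                                'strict': True})
with GCSHook(params) as gcs:
    print(gcs.listdir('/'))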
def test_path_to_key(path, root_path, expected, client_mock):
    gcs_fs = GCSFS(bucket_name=TEST_BUCKET, root_path=root_path, client=client_mock, strict=False)
    assert gcs_fs._path_to_key(path) == expected
    assert gcs_fs._path_to_dir_key(path) == expected + GCSFS.DELIMITER
from ftplib import FTP

from fs_gcsfs import GCSFS


def getfiles(request):
    # set gcs params
    gcs_bucket_name = 'your-gcs-bucket-name'
    gcsfs = GCSFS(bucket_name=gcs_bucket_name)
    gcs_bucket_string = 'gs://' + gcs_bucket_name + '/'

    # set ftp params
    ftp = FTP('ftp.your-ftp-site.com')
    ftp.login('your-username', 'your-password')

    # get the files within the root directory into a list
    filenames = ftp.nlst()

    # loop through each file and copy it into GCS if it is a .zip file (your specific
    # use case may differ). Here you can look for specific file names, or remove the
    # if statement if you are pulling everything from the FTP source.
    for filename in filenames:
        if '.zip' in filename:
            gcs_target_filename = gcs_bucket_string + filename
            print('Retrieving file: ' + filename)
            # Pull the file from the FTP site and write it to a blob in the GCS bucket.
            with gcsfs.open(filename, 'wb') as gcs_file:
                ftp.retrbinary('RETR ' + filename, gcs_file.write)
            print('Successfully created: ' + gcs_target_filename)
            ftp.delete(filename)
            print('Deleted from remote host: ' + filename)
        else:
            # write line for Stackdriver logging
            print('No .zip files found on remote host.')

    # close the ftp connection
    ftp.quit()
    # write line for Stackdriver logging
    print('done')
import os

import pandas as pd
from fs_gcsfs import GCSFS


def read_all_shards(partition, data_dir, bucket_name):
    """Combines different CSVs into a single dataframe."""
    shards = []
    gcsfs = GCSFS(bucket_name)
    for fn in gcsfs.listdir(os.path.join(data_dir, partition)):
        with gcsfs.open(os.path.join(data_dir, partition, fn)) as f:
            shards.append(pd.read_csv(f, index_col=None))
    return pd.concat(shards)
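# Hypothetical call to read_all_shards (bucket and directory names below are placeholders,
# not from the original source): concatenate every CSV shard under
# <data_dir>/<partition> in the bucket into one dataframe.
train_df = read_all_shards(partition='train',
                           data_dir='pfam/random_split',
                           bucket_name='my-example-bucket')
print(train_df.shape)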
def test_create_property_does_not_create_file_if_emptyish_root_path(root_path, client):
    """Regression test for a bug fixed in 0.4.1"""
    gcs_fs = GCSFS(bucket_name=TEST_BUCKET, root_path=root_path, client=client, create=True)
    assert gcs_fs.bucket.get_blob(root_path + GCSFS.DELIMITER) is None
def tmp_gcsfs(bucket, client):
    """Yield a temporary `GCSFS` at a unique 'root-blob' within the test bucket."""
    path = "gcsfs/" + str(uuid.uuid4())
    yield GCSFS(bucket_name=bucket.name, root_path=path, client=client, create=True)
    # Clean up all blobs created under the temporary root path.
    for blob in bucket.list_blobs(prefix=path):
        blob.delete()
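# tmp_gcsfs reads like a pytest fixture; a minimal sketch of how it might be registered
# and consumed (the @pytest.fixture wrapper and the test below are assumptions, and the
# `bucket` and `client` arguments are expected to be fixtures themselves):
import pytest


@pytest.fixture
def gcs_fs(bucket, client):
    yield from tmp_gcsfs(bucket, client)


def test_roundtrip(gcs_fs):
    gcs_fs.writetext("hello.txt", "hello")
    assert gcs_fs.readtext("hello.txt") == "hello"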
def test_path_to_key_fails_if_path_is_parent_of_root_path(client_mock):
    gcs_fs = GCSFS(bucket_name=TEST_BUCKET, client=client_mock, strict=False)
    with pytest.raises(IllegalBackReference):
        gcs_fs._path_to_key("..")

    gcs_fs_with_root_path = GCSFS(bucket_name="bucket", root_path="root_path",
                                  client=client_mock, strict=False)
    with pytest.raises(IllegalBackReference):
        gcs_fs_with_root_path._path_to_key("..")
def _open_fs(self, user_context):
    props = self._serialization_props(user_context)
    bucket_name = props.pop('bucket_name', None)
    root_path = props.pop('root_path', None)
    project = props.pop('project', None)
    args = {}
    # Use an anonymous client for public buckets, otherwise build credentials from the token.
    if props.get('anonymous'):
        args['client'] = Client.create_anonymous_client()
    elif props.get('token'):
        args['client'] = Client(project=project, credentials=Credentials(**props))
    handle = GCSFS(bucket_name, root_path=root_path, retry=0, **args)
    return handle
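# A standalone sketch of the anonymous-client branch above (not from the original source;
# "some-public-bucket" is a placeholder and must be publicly readable for this to work):
from google.cloud.storage import Client
from fs_gcsfs import GCSFS

public_fs = GCSFS(bucket_name="some-public-bucket",
                  client=Client.create_anonymous_client(),
                  strict=False)
print(public_fs.listdir("/"))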
def make_fs(self):
    return GCSFS(bucket_name=TEST_BUCKET, root_path=self.root_path, client=self.client, create=True)
def test_instantiation_with_create_false_fails_for_non_existing_root_path():
    with pytest.raises(CreateFailed):
        GCSFS(bucket_name=TEST_BUCKET, root_path=str(uuid.uuid4()), create=False)
def test_instantiation_fails_if_no_access_to_bucket():
    with pytest.raises(CreateFailed):
        GCSFS(bucket_name=str(uuid.uuid4()))
def test_listdir_works_on_bucket_as_root_directory(client):
    """Regression test for a bug fixed in 0.2.1"""
    gcs_fs = GCSFS(bucket_name=TEST_BUCKET, client=client, create=True)

    blob = str(uuid.uuid4())
    directory = str(uuid.uuid4())
    gcs_fs.touch(blob)
    gcs_fs.makedir(directory)

    result = gcs_fs.listdir("")

    # Manual clean-up of the created blobs
    gcs_fs.remove(blob)
    gcs_fs.removedir(directory)

    assert blob in result
    assert directory in result
def main(_):

    if FLAGS.use_transformer:
        assert (FLAGS.encoder_fn_name == 'transformer'
                ), 'encoder_fn_name must be transformer if use_transformer is True!'

    assert (FLAGS.epochs % FLAGS.measurements == 0
            ), 'Number of measurements must divide number of epochs!'
    measurement_epochs = FLAGS.epochs // FLAGS.measurements

    assert FLAGS.results_save_dir != '', 'Specify results_save_dir!'
    assert FLAGS.label != '', 'Specify label!'

    if FLAGS.load_model:
        assert FLAGS.load_model_dir != '', 'Specify load_model_dir!'
        assert FLAGS.load_model_step > 0, 'Loaded model must have been trained for more than 0 steps.'

    if FLAGS.save_model:
        assert FLAGS.save_model_dir != '', 'Specify save_model_dir!'

    # Record the full run configuration; accuracy measurements are appended to this
    # dict later and the whole thing is written to GCS as a single-row CSV.
    datum = {
        'label': FLAGS.label,
        'encoder_fn_name': FLAGS.encoder_fn_name,
        'encoder_fn_kwargs_path': FLAGS.encoder_fn_kwargs_path,
        'reduce_fn_name': FLAGS.reduce_fn_name,
        'reduce_fn_kwargs_path': FLAGS.reduce_fn_kwargs_path,
        'epochs': FLAGS.epochs,
        'measurements': FLAGS.measurements,
        'lens_batch_size': FLAGS.lens_batch_size,
        'knn_batch_size': FLAGS.knn_batch_size,
        'encoder_lr': FLAGS.encoder_lr,
        'lens_lr': FLAGS.lens_lr,
        'predictor_lr': FLAGS.predictor_lr,
        'encoder_wd': FLAGS.encoder_wd,
        'lens_wd': FLAGS.lens_wd,
        'predictor_wd': FLAGS.predictor_wd,
        'train_families': FLAGS.train_families,
        'lens_train_samples': FLAGS.lens_train_samples,
        'first_test_family': FLAGS.first_test_family,
        'last_test_family': FLAGS.last_test_family,
        'lens_shuffle_seed': FLAGS.lens_shuffle_seed,
        'lens_sample_random_state': FLAGS.lens_sample_random_state,
        'knn_shuffle_seed': FLAGS.knn_shuffle_seed,
        'knn_sample_random_state': FLAGS.knn_sample_random_state,
        'random_key': FLAGS.random_key,
        'use_transformer': FLAGS.use_transformer,
        'use_bert': FLAGS.use_bert,
        'restore_transformer_dir': FLAGS.restore_transformer_dir,
        'gcs_bucket': FLAGS.gcs_bucket,
        'data_partitions_dirpath': FLAGS.data_partitions_dirpath,
        'results_save_dir': FLAGS.results_save_dir,
        'load_model': FLAGS.load_model,
        'load_model_dir': FLAGS.load_model_dir,
        'load_model_step': FLAGS.load_model_step,
        'save_model': FLAGS.save_model,
        'save_model_dir': FLAGS.save_model_dir
    }

    gcsfs = GCSFS(FLAGS.gcs_bucket)

    print(datum)
    df = pd.DataFrame([datum])
    with gcsfs.open(os.path.join(FLAGS.results_save_dir, FLAGS.label + '.csv'),
                    'w') as gcs_file:
        df.to_csv(gcs_file, index=False)

    knn_train_samples_ = [1, 5, 10, 50]

    num_families = len(family_ids)
    loss_fn_kwargs = {'num_classes': num_families}

    lens_knn_train_family_accessions = []
    for _ in range(1, FLAGS.train_families + 1):
        family_name = 'PF%05d' % _
        lens_knn_train_family_accessions.append(family_name)

    knn_test_family_accessions = []
    for _ in range(FLAGS.first_test_family, FLAGS.last_test_family + 1):
        family_name = 'PF%05d' % _
        knn_test_family_accessions.append(family_name)

    encoder_fn = encoder_fn_name_to_fn(FLAGS.encoder_fn_name)
    encoder_fn_kwargs = json.load(
        open(
            resource_filename(
                'contextual_lenses.resources',
                os.path.join('encoder_fn_kwargs_resources',
                             FLAGS.encoder_fn_kwargs_path + '.json'))))

    reduce_fn = reduce_fn_name_to_fn(FLAGS.reduce_fn_name)
    reduce_fn_kwargs = json.load(
        open(
            resource_filename(
                'contextual_lenses.resources',
                os.path.join('reduce_fn_kwargs_resources',
                             FLAGS.reduce_fn_kwargs_path + '.json'))))

    layers, trainable_encoder = architecture_to_layers(FLAGS.encoder_fn_name,
                                                       FLAGS.reduce_fn_name)

    # Embedding-only model used for the nearest-neighbor (kNN) measurements.
    embedding_model = create_model(
        use_transformer=FLAGS.use_transformer,
        use_bert=FLAGS.use_bert,
        restore_transformer_dir=FLAGS.restore_transformer_dir,
        encoder_fn=encoder_fn,
        encoder_fn_kwargs=encoder_fn_kwargs,
        reduce_fn=reduce_fn,
        reduce_fn_kwargs=reduce_fn_kwargs,
        layers=layers,
        output='embedding')

    # Baseline kNN accuracy with the untrained lens.
    datum.update(
        measure_nearest_neighbor_performance(
            accuracy_label='train_knn_accuracy_untrained_lens_1_knn_train_samples',
            encoder=embedding_model,
            family_accessions=lens_knn_train_family_accessions,
            batch_size=FLAGS.knn_batch_size,
            train_samples=1,
            shuffle_seed=FLAGS.knn_shuffle_seed,
            sample_random_state=FLAGS.knn_sample_random_state))

    for knn_train_samples in knn_train_samples_:
        datum.update(
            measure_nearest_neighbor_performance(
                accuracy_label='test_knn_accuracy_untrained_lens_' +
                str(knn_train_samples) + '_knn_train_samples',
                encoder=embedding_model,
                family_accessions=knn_test_family_accessions,
                batch_size=FLAGS.knn_batch_size,
                train_samples=knn_train_samples,
                shuffle_seed=FLAGS.knn_shuffle_seed,
                sample_random_state=FLAGS.knn_sample_random_state))

    encoder_fn_params = None
    reduce_fn_params = None
    predict_fn_params = None

    # Full prediction model and per-layer optimizer used for lens training.
    model = create_model(
        use_transformer=FLAGS.use_transformer,
        use_bert=FLAGS.use_bert,
        restore_transformer_dir=FLAGS.restore_transformer_dir,
        encoder_fn=encoder_fn,
        encoder_fn_kwargs=encoder_fn_kwargs,
        reduce_fn=reduce_fn,
        reduce_fn_kwargs=reduce_fn_kwargs,
        layers=layers,
        output='prediction',
        encoder_fn_params=encoder_fn_params,
        reduce_fn_params=reduce_fn_params,
        predict_fn_params=predict_fn_params)

    optimizer = create_optimizer(
        model=model,
        learning_rate=[FLAGS.encoder_lr, FLAGS.lens_lr, FLAGS.predictor_lr],
        weight_decay=[FLAGS.encoder_wd, FLAGS.lens_wd, FLAGS.predictor_wd],
        layers=layers)

    if FLAGS.load_model:
        optimizer = checkpoints.restore_checkpoint(
            ckpt_dir=os.path.join('gs://' + FLAGS.gcs_bucket, FLAGS.load_model_dir),
            target=optimizer,
            step=FLAGS.load_model_step)
        trained_params = optimizer.target.params
        embedding_model = set_model_parameters(model=embedding_model,
                                               params=trained_params)

    if FLAGS.save_model:
        checkpoints.save_checkpoint(
            ckpt_dir=os.path.join('gs://' + FLAGS.gcs_bucket, FLAGS.save_model_dir),
            target=optimizer,
            step=FLAGS.load_model_step)

    # Train the lens in chunks of `measurement_epochs`, measuring accuracy after each chunk.
    for i in range(FLAGS.measurements):

        train_batches, train_indexes = create_pfam_batches(
            family_accessions=lens_knn_train_family_accessions,
            batch_size=FLAGS.lens_batch_size,
            samples=FLAGS.lens_train_samples,
            epochs=measurement_epochs,
            drop_remainder=True,
            shuffle_seed=FLAGS.lens_shuffle_seed + i,
            sample_random_state=FLAGS.lens_sample_random_state)

        optimizer = train(
            model=optimizer.target,
            train_data=train_batches,
            loss_fn=cross_entropy_loss,
            loss_fn_kwargs=loss_fn_kwargs,
            learning_rate=[FLAGS.encoder_lr, FLAGS.lens_lr, FLAGS.predictor_lr],
            weight_decay=[FLAGS.encoder_wd, FLAGS.lens_wd, FLAGS.predictor_wd],
            layers=layers)

        results, preds = pfam_evaluate(
            predict_fn=optimizer.target,
            test_family_accessions=lens_knn_train_family_accessions,
            title=None,
            loss_fn_kwargs=loss_fn_kwargs,
            batch_size=FLAGS.lens_batch_size,
            data_partitions_dirpath=FLAGS.data_partitions_dirpath,
            gcs_bucket=FLAGS.gcs_bucket)

        lens_accuracy = results['accuracy']
        datum['lens_accuracy' + '_measurement_' + str(i)] = lens_accuracy

        lens_cross_entropy = float(results['cross_entropy'])
        datum['lens_cross_entropy' + '_measurement_' + str(i)] = lens_cross_entropy

        trained_params = optimizer.target.params
        embedding_model = set_model_parameters(model=embedding_model,
                                               params=trained_params)

        datum.update(
            measure_nearest_neighbor_performance(
                accuracy_label='train_knn_accuracy_trained_lens_1_knn_train_samples'
                + '_measurement_' + str(i),
                encoder=embedding_model,
                family_accessions=lens_knn_train_family_accessions,
                batch_size=FLAGS.knn_batch_size,
                train_samples=1,
                shuffle_seed=FLAGS.knn_shuffle_seed,
                sample_random_state=FLAGS.knn_sample_random_state))

        for knn_train_samples in knn_train_samples_:
            datum.update(
                measure_nearest_neighbor_performance(
                    accuracy_label='test_knn_accuracy_trained_lens_' +
                    str(knn_train_samples) + '_knn_train_samples' +
                    '_measurement_' + str(i),
                    encoder=embedding_model,
                    family_accessions=knn_test_family_accessions,
                    batch_size=FLAGS.knn_batch_size,
                    train_samples=knn_train_samples,
                    shuffle_seed=FLAGS.knn_shuffle_seed,
                    sample_random_state=FLAGS.knn_sample_random_state))

        # Write the (growing) results row back to GCS after every measurement.
        print(datum)
        df = pd.DataFrame([datum])
        with gcsfs.open(os.path.join(FLAGS.results_save_dir, FLAGS.label + '.csv'),
                        'w') as gcs_file:
            df.to_csv(gcs_file, index=False)

    if FLAGS.save_model:
        checkpoints.save_checkpoint(
            ckpt_dir=os.path.join('gs://' + FLAGS.gcs_bucket, FLAGS.save_model_dir),
            target=optimizer,
            step=FLAGS.load_model_step + FLAGS.epochs)
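# A small companion sketch (not from the original script): read the results CSV that
# main() writes back out of the bucket for inspection. The bucket name and path below
# are placeholders standing in for FLAGS.gcs_bucket, FLAGS.results_save_dir and FLAGS.label.
import pandas as pd
from fs_gcsfs import GCSFS

results_fs = GCSFS("my-example-bucket")
with results_fs.open("results/my-label.csv", "r") as gcs_file:
    results_df = pd.read_csv(gcs_file)
print(results_df.T)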
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output, State
import plotly.graph_objs as go
from plotly import tools
from fs_gcsfs import GCSFS
import pandas as pd
import json

gcsfs = GCSFS(bucket_name="pockets-data")

external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']


def download_csv_to_df(source_name):
    """Opens csv from google cloud and returns a dataframe"""
    with gcsfs.open(source_name, 'r') as f:
        df = pd.read_csv(f)
    return df


dash_app = dash.Dash(__name__, external_stylesheets=external_stylesheets)
dash_app.title = 'Simple Pockets'
app = dash_app.server

DAYS = 255
options = []
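# A hedged sketch (not from the original app) of how download_csv_to_df could feed a
# Dash callback; the component ids, the file name 'sample.csv', and the CSV column
# names ('date', 'price') are assumptions for illustration only.
dash_app.layout = html.Div([
    dcc.Dropdown(id='source-dropdown',
                 options=[{'label': 'Sample', 'value': 'sample.csv'}],
                 value='sample.csv'),
    dcc.Graph(id='price-graph'),
])


@dash_app.callback(Output('price-graph', 'figure'),
                   [Input('source-dropdown', 'value')])
def update_graph(source_name):
    # Pull the selected CSV from the GCS bucket and plot it.
    df = download_csv_to_df(source_name)
    return {'data': [go.Scatter(x=df['date'], y=df['price'])],
            'layout': go.Layout(title=source_name)}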
import fs
from fs.zipfs import ZipFS
from fs_gcsfs import GCSFS

# Already fixed:
# data/train/real
# data/train/masked
# data/train/reference
# data/validation

with fs.open_fs('mem://') as mem_fs:
    # with fs.open_fs('gs://two-face-inpainting-mlengine/sample-data?strict=False') as gcsfs:
    with GCSFS(bucket_name="two-face-inpainting-mlengine", root_path='data/validation') as gcsfs:
        # with gcsfs.open('sample.zip', 'rb') as zip_file:
        #     with ZipFS(zip_file) as zip_fs:
        #         fs.copy.copy_dir(zip_fs, '.', mem_fs, '.')
        gcsfs.fix_storage()
        # gcsfs.tree()
    # mem_fs.tree()

# walker = Walker()
# for path in walker.files(mem_fs):
#     print(path)
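# Why fix_storage() is called above: GCSFS represents directories with empty "<prefix>/"
# marker blobs, and blobs uploaded by other tools (gsutil, the Cloud Console, etc.) do not
# create them, so such paths are invisible to pyfilesystem operations until the markers
# exist. A minimal sketch, assuming a bucket named "my-example-bucket" that already
# contains blobs uploaded outside of GCSFS:
from fs_gcsfs import GCSFS

with GCSFS(bucket_name="my-example-bucket", strict=False) as gcs_fs:
    gcs_fs.fix_storage()          # create any missing directory marker blobs
    print(gcs_fs.listdir("/"))    # directories now show up for pyfilesystem operations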