class GCSHook(FileSystemHookInterface):
    conn_type = 'gcs'
    credentials_env_var = 'GOOGLE_APPLICATION_CREDENTIALS'

    def __init__(self, conn_params: ConnectionParams):
        self.conn_params = conn_params

    def __enter__(self) -> 'GCSFS':
        from fs_gcsfs import GCSFS

        self.saved_credentials = os.environ.get(GCSHook.credentials_env_var)
        credentials_path = self.conn_params.extra.get('credentials_path')
        if credentials_path:
            os.environ[GCSHook.credentials_env_var] = credentials_path

        self.bucket = self.conn_params.extra['bucket']
        self.base_path = self.conn_params.extra.get('base_path')
        self.create = self.conn_params.extra.get('create', False)
        self.strict = self.conn_params.extra.get('strict', True)
        self.conn = GCSFS(bucket_name=self.bucket,
                          root_path=self.base_path,
                          create=self.create,
                          strict=self.strict)
        return self.conn

    def __exit__(self, exc_type, exc_val, exc_tb):
        # Restore the credentials environment variable to its previous state so the
        # override applied in __enter__ does not leak into the rest of the process.
        if self.saved_credentials:
            os.environ[GCSHook.credentials_env_var] = self.saved_credentials
        else:
            os.environ.pop(GCSHook.credentials_env_var, None)
        self.conn.close()
        self.conn = None
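
A minimal usage sketch for the hook (the ConnectionParams shape and all values below are assumed for illustration):

# Hypothetical usage of GCSHook; the ConnectionParams fields shown are assumptions.
params = ConnectionParams(extra={
    'bucket': 'my-example-bucket',                         # required
    'credentials_path': '/secrets/service-account.json',   # optional env-var override
    'base_path': 'exports/daily',                          # optional root inside the bucket
})

with GCSHook(params) as remote_fs:
    # remote_fs is a fs_gcsfs.GCSFS instance, i.e. a pyfilesystem2 FS
    print(remote_fs.listdir('/'))
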
Example 2
def test_path_to_key(path, root_path, expected, client_mock):
    gcs_fs = GCSFS(bucket_name=TEST_BUCKET,
                   root_path=root_path,
                   client=client_mock,
                   strict=False)
    assert gcs_fs._path_to_key(path) == expected
    assert gcs_fs._path_to_dir_key(path) == expected + GCSFS.DELIMITER
def getfiles(request):

    #set gcs params
    gcs_bucket_name = 'your-gcs-bucket-name'
    gcsfs = GCSFS(bucket_name=gcs_bucket_name)
    gcs_bucket_string = 'gs://' + gcs_bucket_name + '/'

    #set ftp params
    ftp = FTP('ftp.your-ftp-site.com')
    ftp.login('your-username', 'your-password')

    # get files within the root directory into list
    filenames = ftp.nlst()

    #loop through each file and copy into GCS if it is a .zip file (your specific use case may be different)
    #here you can look for specific file names or remove the if statement if you are pulling everything from the FTP source
    for filename in filenames:
        if '.zip' in filename:
            gcs_target_filename = gcs_bucket_string + filename
            print('Retrieving file: ' + filename)
            #pull the file from the FTP site and stream it into a blob in the GCS bucket;
            #opening the GCS file in a with block ensures it is closed (and the upload flushed)
            #once the transfer finishes
            with gcsfs.open(filename, 'wb') as gcs_file:
                ftp.retrbinary('RETR ' + filename, gcs_file.write)
            print('Successfully created: ' + gcs_target_filename)
            ftp.delete(filename)
            print('Deleted from remote host: ' + filename)
        else:
            #write line for stackdriver logging
            print('Skipping non-zip file: ' + filename)

    #close ftp connection
    ftp.quit()

    #write line for stackdriver logging
    print('done')
Example 4
def read_all_shards(partition, data_dir, bucket_name):
    """Combines different CSVs into a single dataframe."""

    shards = []
    gcsfs = GCSFS(bucket_name)
    for fn in gcsfs.listdir(os.path.join(data_dir, partition)):
        with gcsfs.open(os.path.join(data_dir, partition, fn)) as f:
            shards.append(pd.read_csv(f, index_col=None))

    return pd.concat(shards)
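
A hedged usage example (partition, directory, and bucket names are made up):

# Hypothetical call: concatenates every CSV shard under gs://my-example-bucket/splits/train
train_df = read_all_shards(partition='train',
                           data_dir='splits',
                           bucket_name='my-example-bucket')
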
Example 5
def test_create_property_does_not_create_file_if_emptyish_root_path(
        root_path, client):
    """Regression test for a bug fixed in 0.4.1"""
    gcs_fs = GCSFS(bucket_name=TEST_BUCKET,
                   root_path=root_path,
                   client=client,
                   create=True)
    assert gcs_fs.bucket.get_blob(root_path + GCSFS.DELIMITER) is None
Example 7
def tmp_gcsfs(bucket, client):
    """Yield a temporary `GCSFS` at a unique 'root-blob' within the test bucket."""
    path = "gcsfs/" + str(uuid.uuid4())
    yield GCSFS(bucket_name=bucket.name,
                root_path=path,
                client=client,
                create=True)
    for blob in bucket.list_blobs(prefix=path):
        blob.delete()
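
A sketch of a test that consumes the fixture above (pytest injects it by parameter name; the file name is illustrative):

# Hypothetical test; writetext/readtext are standard pyfilesystem2 FS methods.
def test_roundtrip(tmp_gcsfs):
    tmp_gcsfs.writetext("hello.txt", "hello world")
    assert tmp_gcsfs.readtext("hello.txt") == "hello world"
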
Example 8
def test_path_to_key_fails_if_path_is_parent_of_root_path(client_mock):
    gcs_fs = GCSFS(bucket_name=TEST_BUCKET, client=client_mock, strict=False)
    with pytest.raises(IllegalBackReference):
        gcs_fs._path_to_key("..")

    gcs_fs_with_root_path = GCSFS(bucket_name="bucket",
                                  root_path="root_path",
                                  client=client_mock,
                                  strict=False)
    with pytest.raises(IllegalBackReference):
        gcs_fs_with_root_path._path_to_key("..")
Example 9
 def _open_fs(self, user_context):
     props = self._serialization_props(user_context)
     bucket_name = props.pop('bucket_name', None)
     root_path = props.pop('root_path', None)
     project = props.pop('project', None)
     args = {}
     if props.get('anonymous'):
         args['client'] = Client.create_anonymous_client()
     elif props.get('token'):
         args['client'] = Client(project=project,
                                 credentials=Credentials(**props))
     handle = GCSFS(bucket_name, root_path=root_path, retry=0, **args)
     return handle
Example 10
 def make_fs(self):
     return GCSFS(bucket_name=TEST_BUCKET,
                  root_path=self.root_path,
                  client=self.client,
                  create=True)
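
This make_fs is presumably the hook for pyfilesystem2's shared filesystem test suite; a sketch of how such a suite is typically wired up (the class name, uuid-based root path, and bucket handling are assumptions):

# Hypothetical test-case class: fs.test.FSTestCases calls make_fs() to build the
# filesystem under test and runs the standard pyfilesystem2 conformance tests on it.
import unittest
import uuid

from fs.test import FSTestCases
from fs_gcsfs import GCSFS

class TestGCSFS(FSTestCases, unittest.TestCase):
    def make_fs(self):
        # Unique root path per test so parallel runs do not collide in the shared bucket.
        return GCSFS(bucket_name=TEST_BUCKET,
                     root_path="tests/" + str(uuid.uuid4()),
                     create=True)
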
Example 11
def test_instantiation_with_create_false_fails_for_non_existing_root_path():
    with pytest.raises(CreateFailed):
        GCSFS(bucket_name=TEST_BUCKET,
              root_path=str(uuid.uuid4()),
              create=False)
Example 12
def test_instantiation_fails_if_no_access_to_bucket():
    with pytest.raises(CreateFailed):
        GCSFS(bucket_name=str(uuid.uuid4()))
Example 13
def test_listdir_works_on_bucket_as_root_directory(client):
    """Regression test for a bug fixed in 0.2.1"""
    gcs_fs = GCSFS(bucket_name=TEST_BUCKET, client=client, create=True)

    blob = str(uuid.uuid4())
    directory = str(uuid.uuid4())

    gcs_fs.touch(blob)
    gcs_fs.makedir(directory)

    result = gcs_fs.listdir("")

    # Manual clean-up of the created blobs
    gcs_fs.remove(blob)
    gcs_fs.removedir(directory)

    assert blob in result
    assert directory in result
def main(_):

    if FLAGS.use_transformer:
        assert (
            FLAGS.encoder_fn_name == 'transformer'
        ), 'encoder_fn_name must be transformer if use_transformer is True!'

    assert (FLAGS.epochs % FLAGS.measurements == 0
            ), 'Number of measurements must divide number of epochs!'
    measurement_epochs = FLAGS.epochs // FLAGS.measurements

    assert FLAGS.results_save_dir != '', 'Specify results_save_dir!'

    assert FLAGS.label != '', 'Specify label!'

    if FLAGS.load_model:
        assert FLAGS.load_model_dir != '', 'Specify load_model_dir!'
        assert FLAGS.load_model_step > 0, 'Loaded model must have been trained for more than 0 steps.'

    if FLAGS.save_model:
        assert FLAGS.save_model_dir != '', 'Specify save_model_dir!'

    datum = {
        'label': FLAGS.label,
        'encoder_fn_name': FLAGS.encoder_fn_name,
        'encoder_fn_kwargs_path': FLAGS.encoder_fn_kwargs_path,
        'reduce_fn_name': FLAGS.reduce_fn_name,
        'reduce_fn_kwargs_path': FLAGS.reduce_fn_kwargs_path,
        'epochs': FLAGS.epochs,
        'measurements': FLAGS.measurements,
        'lens_batch_size': FLAGS.lens_batch_size,
        'knn_batch_size': FLAGS.knn_batch_size,
        'encoder_lr': FLAGS.encoder_lr,
        'lens_lr': FLAGS.lens_lr,
        'predictor_lr': FLAGS.predictor_lr,
        'encoder_wd': FLAGS.encoder_wd,
        'lens_wd': FLAGS.lens_wd,
        'predictor_wd': FLAGS.predictor_wd,
        'train_families': FLAGS.train_families,
        'lens_train_samples': FLAGS.lens_train_samples,
        'first_test_family': FLAGS.first_test_family,
        'last_test_family': FLAGS.last_test_family,
        'lens_shuffle_seed': FLAGS.lens_shuffle_seed,
        'lens_sample_random_state': FLAGS.lens_sample_random_state,
        'knn_shuffle_seed': FLAGS.knn_shuffle_seed,
        'knn_sample_random_state': FLAGS.knn_sample_random_state,
        'random_key': FLAGS.random_key,
        'use_transformer': FLAGS.use_transformer,
        'use_bert': FLAGS.use_bert,
        'restore_transformer_dir': FLAGS.restore_transformer_dir,
        'gcs_bucket': FLAGS.gcs_bucket,
        'data_partitions_dirpath': FLAGS.data_partitions_dirpath,
        'results_save_dir': FLAGS.results_save_dir,
        'load_model': FLAGS.load_model,
        'load_model_dir': FLAGS.load_model_dir,
        'load_model_step': FLAGS.load_model_step,
        'save_model': FLAGS.save_model,
        'save_model_dir': FLAGS.save_model_dir
    }

    gcsfs = GCSFS(FLAGS.gcs_bucket)

    print(datum)
    df = pd.DataFrame([datum])
    with gcsfs.open(os.path.join(FLAGS.results_save_dir, FLAGS.label + '.csv'),
                    'w') as gcs_file:
        df.to_csv(gcs_file, index=False)

    knn_train_samples_ = [1, 5, 10, 50]

    num_families = len(family_ids)
    loss_fn_kwargs = {'num_classes': num_families}

    lens_knn_train_family_accessions = []
    for family_index in range(1, FLAGS.train_families + 1):
        lens_knn_train_family_accessions.append('PF%05d' % family_index)

    knn_test_family_accessions = []
    for family_index in range(FLAGS.first_test_family, FLAGS.last_test_family + 1):
        knn_test_family_accessions.append('PF%05d' % family_index)

    encoder_fn = encoder_fn_name_to_fn(FLAGS.encoder_fn_name)
    with open(
            resource_filename(
                'contextual_lenses.resources',
                os.path.join('encoder_fn_kwargs_resources',
                             FLAGS.encoder_fn_kwargs_path + '.json'))) as f:
        encoder_fn_kwargs = json.load(f)

    reduce_fn = reduce_fn_name_to_fn(FLAGS.reduce_fn_name)
    with open(
            resource_filename(
                'contextual_lenses.resources',
                os.path.join('reduce_fn_kwargs_resources',
                             FLAGS.reduce_fn_kwargs_path + '.json'))) as f:
        reduce_fn_kwargs = json.load(f)

    layers, trainable_encoder = architecture_to_layers(FLAGS.encoder_fn_name,
                                                       FLAGS.reduce_fn_name)

    embedding_model = create_model(
        use_transformer=FLAGS.use_transformer,
        use_bert=FLAGS.use_bert,
        restore_transformer_dir=FLAGS.restore_transformer_dir,
        encoder_fn=encoder_fn,
        encoder_fn_kwargs=encoder_fn_kwargs,
        reduce_fn=reduce_fn,
        reduce_fn_kwargs=reduce_fn_kwargs,
        layers=layers,
        output='embedding')

    datum.update(
        measure_nearest_neighbor_performance(
            accuracy_label=
            'train_knn_accuracy_untrained_lens_1_knn_train_samples',
            encoder=embedding_model,
            family_accessions=lens_knn_train_family_accessions,
            batch_size=FLAGS.knn_batch_size,
            train_samples=1,
            shuffle_seed=FLAGS.knn_shuffle_seed,
            sample_random_state=FLAGS.knn_sample_random_state))

    for knn_train_samples in knn_train_samples_:

        datum.update(
            measure_nearest_neighbor_performance(
                accuracy_label='test_knn_accuracy_untrained_lens_' +
                str(knn_train_samples) + '_knn_train_samples',
                encoder=embedding_model,
                family_accessions=knn_test_family_accessions,
                batch_size=FLAGS.knn_batch_size,
                train_samples=knn_train_samples,
                shuffle_seed=FLAGS.knn_shuffle_seed,
                sample_random_state=FLAGS.knn_sample_random_state))

    encoder_fn_params = None
    reduce_fn_params = None
    predict_fn_params = None

    model = create_model(use_transformer=FLAGS.use_transformer,
                         use_bert=FLAGS.use_bert,
                         restore_transformer_dir=FLAGS.restore_transformer_dir,
                         encoder_fn=encoder_fn,
                         encoder_fn_kwargs=encoder_fn_kwargs,
                         reduce_fn=reduce_fn,
                         reduce_fn_kwargs=reduce_fn_kwargs,
                         layers=layers,
                         output='prediction',
                         encoder_fn_params=encoder_fn_params,
                         reduce_fn_params=reduce_fn_params,
                         predict_fn_params=predict_fn_params)

    optimizer = create_optimizer(
        model=model,
        learning_rate=[FLAGS.encoder_lr, FLAGS.lens_lr, FLAGS.predictor_lr],
        weight_decay=[FLAGS.encoder_wd, FLAGS.lens_wd, FLAGS.predictor_wd],
        layers=layers)

    if FLAGS.load_model:
        optimizer = checkpoints.restore_checkpoint(ckpt_dir=os.path.join(
            'gs://' + FLAGS.gcs_bucket, FLAGS.load_model_dir),
                                                   target=optimizer,
                                                   step=FLAGS.load_model_step)

        trained_params = optimizer.target.params
        embedding_model = set_model_parameters(model=embedding_model,
                                               params=trained_params)

    if FLAGS.save_model:
        checkpoints.save_checkpoint(ckpt_dir=os.path.join(
            'gs://' + FLAGS.gcs_bucket, FLAGS.save_model_dir),
                                    target=optimizer,
                                    step=FLAGS.load_model_step)

    for i in range(FLAGS.measurements):

        train_batches, train_indexes = create_pfam_batches(
            family_accessions=lens_knn_train_family_accessions,
            batch_size=FLAGS.lens_batch_size,
            samples=FLAGS.lens_train_samples,
            epochs=measurement_epochs,
            drop_remainder=True,
            shuffle_seed=FLAGS.lens_shuffle_seed + i,
            sample_random_state=FLAGS.lens_sample_random_state)

        optimizer = train(
            model=optimizer.target,
            train_data=train_batches,
            loss_fn=cross_entropy_loss,
            loss_fn_kwargs=loss_fn_kwargs,
            learning_rate=[
                FLAGS.encoder_lr, FLAGS.lens_lr, FLAGS.predictor_lr
            ],
            weight_decay=[FLAGS.encoder_wd, FLAGS.lens_wd, FLAGS.predictor_wd],
            layers=layers)

        results, preds = pfam_evaluate(
            predict_fn=optimizer.target,
            test_family_accessions=lens_knn_train_family_accessions,
            title=None,
            loss_fn_kwargs=loss_fn_kwargs,
            batch_size=FLAGS.lens_batch_size,
            data_partitions_dirpath=FLAGS.data_partitions_dirpath,
            gcs_bucket=FLAGS.gcs_bucket)

        lens_accuracy = results['accuracy']
        datum['lens_accuracy' + '_measurement_' + str(i)] = lens_accuracy

        lens_cross_entropy = float(results['cross_entropy'])
        datum['lens_cross_entropy' + '_measurement_' +
              str(i)] = lens_cross_entropy

        trained_params = optimizer.target.params
        embedding_model = set_model_parameters(model=embedding_model,
                                               params=trained_params)

        datum.update(
            measure_nearest_neighbor_performance(
                accuracy_label=
                'train_knn_accuracy_trained_lens_1_knn_train_samples' +
                '_measurement_' + str(i),
                encoder=embedding_model,
                family_accessions=lens_knn_train_family_accessions,
                batch_size=FLAGS.knn_batch_size,
                train_samples=1,
                shuffle_seed=FLAGS.knn_shuffle_seed,
                sample_random_state=FLAGS.knn_sample_random_state))

        for knn_train_samples in knn_train_samples_:

            datum.update(
                measure_nearest_neighbor_performance(
                    accuracy_label='test_knn_accuracy_trained_lens_' +
                    str(knn_train_samples) + '_knn_train_samples' +
                    '_measurement_' + str(i),
                    encoder=embedding_model,
                    family_accessions=knn_test_family_accessions,
                    batch_size=FLAGS.knn_batch_size,
                    train_samples=knn_train_samples,
                    shuffle_seed=FLAGS.knn_shuffle_seed,
                    sample_random_state=FLAGS.knn_sample_random_state))

    print(datum)
    df = pd.DataFrame([datum])
    with gcsfs.open(os.path.join(FLAGS.results_save_dir, FLAGS.label + '.csv'),
                    'w') as gcs_file:
        df.to_csv(gcs_file, index=False)

    if FLAGS.save_model:
        checkpoints.save_checkpoint(ckpt_dir=os.path.join(
            'gs://' + FLAGS.gcs_bucket, FLAGS.save_model_dir),
                                    target=optimizer,
                                    step=FLAGS.load_model_step + FLAGS.epochs)
Example 15
import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output, State

import plotly.graph_objs as go
from plotly import tools

from fs_gcsfs import GCSFS
import pandas as pd
import json

gcsfs = GCSFS(bucket_name="pockets-data")
external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']


def download_csv_to_df(source_name):
    """Opens csv from google cloud and returns a dataframe"""
    with gcsfs.open(source_name, 'r') as f:
        df = pd.read_csv(f)

    return df
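
A hedged usage example (the object name is made up):

# Hypothetical call: reads gs://pockets-data/portfolio.csv into a DataFrame
df = download_csv_to_df('portfolio.csv')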


dash_app = dash.Dash(__name__, external_stylesheets=external_stylesheets)
dash_app.title = 'Simple Pockets'

app = dash_app.server
DAYS = 255

options = []
Example 16
import fs
from fs.zipfs import ZipFS
from fs_gcsfs import GCSFS

# Already fixed:
# data/train/real
# data/train/masked
# data/train/reference
# data/validation

with fs.open_fs('mem://') as mem_fs:
    # with fs.open_fs('gs://two-face-inpainting-mlengine/sample-data?strict=False') as gcsfs:
    with GCSFS(bucket_name="two-face-inpainting-mlengine",
               root_path='data/validation') as gcsfs:
        # with gcsfs.open('sample.zip', 'rb') as zip_file:
        #   with ZipFS(zip_file) as zip_fs:
        #     fs.copy.copy_dir(zip_fs, '.', mem_fs, '.')
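        # fix_storage() scans the bucket and adds the empty "<dir>/" marker blobs that
        # fs_gcsfs relies on to recognize directories created outside of pyfilesystem2.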
        gcsfs.fix_storage()
        # gcsfs.tree()
    # mem_fs.tree()
    # walker = Walker()
    # for path in walker.files(mem_fs):
    #   print(path)