Example #1
# Assumed imports for this excerpt (not shown in the original snippet):
import os
import tempfile
import time

import numpy as np
from os.path import isfile
from pandas import read_csv
from tensorflow import gfile
from keras.preprocessing.text import one_hot  # assumed source of one_hot

# `prep_data_path` is expected to be a module-level constant naming the output CSV.


def preprocess(source, chunksize):
    """Reads `source` in chunks, encodes categorical columns, and writes to prep_data_path."""
    reader = read_csv(source, header=0, chunksize=chunksize)
    if gfile.Exists(prep_data_path):
        gfile.Remove(prep_data_path)

    for data in reader:
        data = data.fillna(0)
        data.replace(('yes', 'no'), (1, 0), inplace=True)
        product_type, sub_area, ecology = \
          data['product_type'].values, data['sub_area'].values, data['ecology'].values
        data['product_type'] = np.reshape([
            one_hot(x, n=np.unique(product_type).shape[0] + 1, filters='')
            for x in product_type
        ], product_type.shape)
        sub_area = np.array([
            s.replace(' ', '').replace('-', '').replace('\'',
                                                        '').replace(',', '')
            for s in sub_area
        ])
        data['sub_area'] = np.reshape(
            [one_hot(x, n=np.unique(sub_area).shape[0] + 1) for x in sub_area],
            sub_area.shape)
        ecology = np.array([
            s.replace(' ', '').replace('-', '').replace('\'',
                                                        '').replace(',', '')
            for s in ecology
        ])
        data['ecology'] = np.reshape(
            [one_hot(x, n=np.unique(ecology).shape[0] + 1) for x in ecology],
            ecology.shape)
        # Write the first chunk with a header; append subsequent chunks without one.
        if not isfile(prep_data_path):
            data.to_csv(prep_data_path)
        else:
            data.to_csv(prep_data_path, mode='a', header=False)
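
# Hypothetical usage sketch (not from the original snippet): stream a large CSV
# in chunks and append each encoded chunk to prep_data_path. Both file names
# below are placeholders.
prep_data_path = 'prepared_train.csv'            # assumed output file
preprocess('raw_train.csv', chunksize=10000)     # assumed source file
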
def load_model(net, output_path, file_name):
    """Loads HDF5 weights from output_path into `net` via a local temporary copy."""
    weights_path = os.path.join(output_path, file_name + '.h5')
    local_filename = weights_path.split('/')[-1]
    tmp_filename = os.path.join(tempfile.gettempdir(),
                                str(int(time.time())) + '_' + local_filename)
    gfile.Copy(weights_path, tmp_filename)
    net.load_weights(tmp_filename)
    gfile.Remove(tmp_filename)
Example #3
# Assumed imports for this excerpt (not shown in the original snippet):
import os
import tempfile
import time

from tensorflow import gfile


def copy_to_experiment_dir(config_file):
    # copy config file to the experiment directory
    saved_config_file_path = _config_file_path_to_copy(config_file)

    # HACK: workaround for a TensorFlow bug.
    # We can remove the following two lines once it is resolved upstream.
    # Issue link: https://github.com/tensorflow/tensorflow/issues/28508
    if gfile.Exists(saved_config_file_path):
        gfile.Remove(saved_config_file_path)

    gfile.Copy(config_file, saved_config_file_path)
def save_model(net, model_json, output_path, file_name):
    """Serializes the model architecture to JSON and its weights to HDF5."""
    with gfile.Open(os.path.join(output_path, file_name + '.json'), 'w') as json_file:
        json_file.write(model_json)
    # serialize weights to HDF5 via a local temp file, then copy it to output_path
    weight_path = os.path.join(output_path, file_name + '.h5')
    local_filename = weight_path.split('/')[-1]
    tmp_filename = os.path.join(tempfile.gettempdir(),
                                str(int(time.time())) + '_' + local_filename)
    net.save_weights(tmp_filename)
    gfile.Copy(tmp_filename, weight_path, overwrite=True)
    gfile.Remove(tmp_filename)
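
# Hypothetical usage sketch for the save_model/load_model pair above. The Keras
# model, output path, and file name are assumptions, not part of the original code.
from tensorflow.keras import layers, models

net = models.Sequential([layers.Dense(8, activation='relu', input_shape=(4,)),
                         layers.Dense(1)])
save_model(net, net.to_json(), 'gs://my-bucket/models', 'demo_net')   # assumed path
restored = models.model_from_json(net.to_json())
load_model(restored, 'gs://my-bucket/models', 'demo_net')             # weights now restored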
Example #5
    def run(self, corner, subvol_size, reset_counters=True):
        """Runs FFN inference over a subvolume.

    Args:
      corner: start of the subvolume (z, y, x)
      subvol_size: size of the subvolume (z, y, x)
      reset_counters: whether to reset the counters

    Returns:
      Canvas object with the segmentation or None if the canvas could not
      be created or the segmentation subvolume already exists.
    """
        if reset_counters:
            self.counters.reset()

        seg_path = storage.segmentation_path(
            self.request.segmentation_output_dir, corner)
        prob_path = storage.object_prob_path(
            self.request.segmentation_output_dir, corner)
        cpoint_path = storage.checkpoint_path(
            self.request.segmentation_output_dir, corner)

        if gfile.Exists(seg_path):
            if pyborgletinfo.RunningUnderBorglet():
                pywrapborgletlib.BorgletLib.SetStatusMsg(
                    'Segmentation already complete; exiting.')
            return None

        canvas, alignment = self.make_canvas(corner, subvol_size)
        if canvas is None:
            return None

        if gfile.Exists(cpoint_path):
            canvas.restore_checkpoint(cpoint_path)

        if self.request.alignment_options.save_raw:
            image_path = storage.subvolume_path(
                self.request.segmentation_output_dir, corner, 'align')
            with storage.atomic_file(image_path) as fd:
                np.savez_compressed(fd, im=canvas.image)

        canvas.segment_all(
            seed_policy=self.get_seed_policy(corner, subvol_size))
        self.save_segmentation(canvas, alignment, seg_path, prob_path)

        # Attempt to remove the checkpoint file now that we no longer need it.
        try:
            gfile.Remove(cpoint_path)
        except:  # pylint: disable=bare-except
            pass

        return canvas
Example #6
def evaluate_model(
    dataset_path: str, model_path: str, metric_name: str
) -> NamedTuple('Outputs', [('metric_name', str), ('metric_value', float),
                            ('mlpipeline_metrics', 'Metrics')]):
    """Evaluates a trained sklearn model."""
    import joblib
    import json
    import pandas as pd
    import subprocess
    import sys

    from tensorflow import gfile
    from sklearn.metrics import accuracy_score, recall_score

    df_test = pd.read_csv(dataset_path)

    X_test = df_test.drop('Cover_Type', axis=1)
    y_test = df_test['Cover_Type']

    # Copy the model from GCS
    model_filename = 'model.pkl'
    gcs_model_filepath = '{}/{}'.format(model_path, model_filename)
    print(gcs_model_filepath)

    if gfile.Exists(model_filename):
        gfile.Remove(model_filename)

    gfile.Copy(gcs_model_filepath, model_filename)

    with open(model_filename, 'rb') as model_file:
        model = joblib.load(model_file)

    y_hat = model.predict(X_test)

    if metric_name == 'accuracy':
        metric_value = accuracy_score(y_test, y_hat)
    elif metric_name == 'recall':
        metric_value = recall_score(y_test, y_hat)
    else:
        metric_name = 'N/A'
        metric_value = 0

    # Export the metric
    metrics = {
        'metrics': [{
            'name': metric_name,
            'numberValue': float(metric_value)
        }]
    }

    return metric_name, metric_value, json.dumps(metrics)
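
# Hypothetical invocation of evaluate_model (the dataset and model locations are
# placeholders, not taken from the original component).
metric, value, metrics_json = evaluate_model(
    dataset_path='gs://my-bucket/data/test.csv',   # assumed test set
    model_path='gs://my-bucket/models',            # assumed dir containing model.pkl
    metric_name='accuracy')
print(metric, value)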
Example #7
# Assumed imports for this excerpt (not shown in the original snippet):
import os

from tensorflow import gfile


def save_config_file(config_file, dest_dir):
    if not gfile.Exists(dest_dir):
        gfile.MkDir(dest_dir)

    config_file_dest = os.path.join(dest_dir, 'blueoil_config.yaml')

    # HACK: workaround for a TensorFlow bug.
    # We can remove the following two lines once it is resolved upstream.
    # Issue link: https://github.com/tensorflow/tensorflow/issues/28508
    if gfile.Exists(config_file_dest):
        gfile.Remove(config_file_dest)

    return gfile.Copy(config_file, config_file_dest)
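
# Hypothetical usage sketch: copy a local blueoil config into an experiment
# directory. Both paths below are placeholders.
save_config_file('blueoil_config.yaml', 'gs://my-bucket/experiments/run-001')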
Example #8
def write_production():
    """Copies staged templates to production directory.

  This function assumes that the template and associated metadata files are
  stored in a folder of the form gs://<template_staging_bucket>/<release_name>.
  It copies the templates from the <release_name> folder to two new locations:
  gs://<prod_bucket>/<release_name> and gs://<prod_bucket>/latest. Both
  folders contain identical contents; the <release_name> bucket is to allow
  customers to pin to a specific release and the `latest` folder gives the UI
  a location at which to point.

  Raises:
    GOSError if there was an error reading or writing a file.
  """
    prod_root = FLAGS.template_prod_bucket
    template_staging_root = FLAGS.template_staging_bucket

    template_dir = os.path.join(template_staging_root, FLAGS.candidate_name)
    if not gfile.IsDirectory(template_dir):
        logging.fatal(
            'Template staging directory %s does not exist or is not a '
            'directory.', template_dir)

    release_dir = os.path.join(prod_root, FLAGS.release_name)
    if gfile.IsDirectory(release_dir):
        logging.fatal(
            'Template release directory %s already exists. Aborting.',
            release_dir)

    logging.info('Copying folder from %s to %s.', template_dir, release_dir)
    gfile.MkDir(release_dir)
    CopyRecursively(template_dir, release_dir)

    # TODO: If we ever delete templates, they will stick around in
    # `latest`; evaluate something rsync-like in the future.
    latest_dir = os.path.join(prod_root, LATEST_FOLDER_NAME)
    if gfile.Exists(latest_dir):
        if not gfile.IsDirectory(latest_dir):
            gfile.Remove(latest_dir)
            gfile.MkDir(latest_dir)
    else:
        gfile.MkDir(latest_dir)

    logging.info('Copying folder from %s to %s.', template_dir, latest_dir)
    CopyRecursively(template_dir, latest_dir, overwrite=True)
def embed_data(x, dset, path):
    """embeds x into the code space using the autoencoder."""

    if x:
        return np.zeros(shape=(0, 10))
    # load model and weights
    json_path = os.path.join(path, 'ae_{}.json'.format(dset))
    print('load model from json file:', json_path)
    with gfile.Open(json_path) as f:
        pt_ae = model_from_json(f.read())
    weights_path = os.path.join(path, 'ae_{}_weights.h5'.format(dset))
    print('load code space from:', weights_path)
    local_filename = weights_path.split('/')[-1]
    tmp_filename = os.path.join(tempfile.gettempdir(),
                                str(int(time.time())) + '_' + local_filename)
    gfile.Copy(weights_path, tmp_filename)
    pt_ae.load_weights(tmp_filename)
    gfile.Remove(tmp_filename)

    print('***********************', x.shape)
    x = x.reshape(-1, np.prod(x.shape[1:]))
    print('***********************', x.shape)

    get_embeddings = K.function([pt_ae.input], [pt_ae.layers[3].output])

    get_reconstruction = K.function([pt_ae.layers[4].input], [pt_ae.output])
    x_embedded = predict_with_k_fn(get_embeddings, x)[0]
    x_recon = predict_with_k_fn(get_reconstruction, x_embedded)[0]
    reconstruction_mse = np.mean(np.square(x - x_recon))
    print(
        'using pretrained embeddings; sanity check, total reconstruction error:',
        np.mean(reconstruction_mse))

    del pt_ae

    return x_embedded
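
# Hypothetical call to embed_data, assuming a pretrained autoencoder saved as
# ae_<dset>.json / ae_<dset>_weights.h5 under `path`. The input shape, dataset
# name, and path are placeholders.
import numpy as np

x = np.random.rand(256, 28, 28).astype('float32')
x_codes = embed_data(x, dset='mnist', path='gs://my-bucket/pretrained')
print(x_codes.shape)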
Example #10
# Assumed imports for this excerpt (not shown in the original snippet):
import pickle

import hypertune
import pandas as pd
from tensorflow import gfile
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


def train_evaluate(job_dir, training_dataset_path, validation_dataset_path,
                   alpha, max_iter, hptune):
    with gfile.Open(training_dataset_path, 'r') as f:
        # Assume there is no header
        df_train = pd.read_csv(f, nrows=1000)

    with gfile.Open(validation_dataset_path, 'r') as f:
        # Assume there is no header
        df_validation = pd.read_csv(f, nrows=100)

    if not hptune:
        df_train = pd.concat([df_train, df_validation])

    numeric_feature_indexes = slice(0, 10)
    categorical_feature_indexes = slice(10, 12)

    preprocessor = ColumnTransformer(
        transformers=[('num', StandardScaler(), numeric_feature_indexes
                       ), ('cat', OneHotEncoder(),
                           categorical_feature_indexes)])

    pipeline = Pipeline([('preprocessor', preprocessor),
                         ('classifier', SGDClassifier(loss='log'))])

    num_features_type_map = {
        feature: 'float64'
        for feature in df_train.columns[numeric_feature_indexes]
    }
    df_train = df_train.astype(num_features_type_map)
    df_validation = df_validation.astype(num_features_type_map)

    print('Starting training: alpha={}, max_iter={}'.format(alpha, max_iter))
    X_train = df_train.drop('Cover_Type', axis=1)
    y_train = df_train['Cover_Type']

    pipeline.set_params(classifier__alpha=alpha, classifier__max_iter=max_iter)
    pipeline.fit(X_train, y_train)

    if hptune:
        X_validation = df_validation.drop('Cover_Type', axis=1)
        y_validation = df_validation['Cover_Type']
        accuracy = pipeline.score(X_validation, y_validation)
        print('Model accuracy: {}'.format(accuracy))
        # Log it with hypertune
        hpt = hypertune.HyperTune()
        hpt.report_hyperparameter_tuning_metric(
            hyperparameter_metric_tag='accuracy', metric_value=accuracy)

    # Save the model
    if not hptune:

        model_filename = 'model.pkl'
        gcs_model_path = "{}/{}".format(job_dir, model_filename)

        if gfile.Exists(gcs_model_path):
            gfile.Remove(gcs_model_path)

        with gfile.Open(gcs_model_path, 'wb') as wf:
            pickle.dump(pipeline, wf)

        print("Saved model in: {}".format(gcs_model_path))
Example #11
def remove_tmp_files():
    """Removes temporary files created by the profiler."""
    for file_name in gfile.ListDirectory(PROFILER_LOG_DIR):
        if 'profiler-ui.' in file_name:
            gfile.Remove(os.path.join(PROFILER_LOG_DIR, file_name))
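
# Minimal sketch of wiring up the cleanup helper above. PROFILER_LOG_DIR, the
# imports, and the directory value are assumptions about the surrounding module.
import os
from tensorflow import gfile

PROFILER_LOG_DIR = '/tmp/profiler_logs'   # hypothetical profiler output directory

remove_tmp_files()   # removes any leftover 'profiler-ui.*' files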