def preprocess(source, chunksize):
    reader = read_csv(source, header=0, chunksize=chunksize)
    if gfile.Exists(prep_data_path):
        gfile.Remove(prep_data_path)
    for data in reader:
        data = data.fillna(0)
        data.replace(('yes', 'no'), (1, 0), inplace=True)
        product_type, sub_area, ecology = \
            data['product_type'].values, data['sub_area'].values, data['ecology'].values
        data['product_type'] = np.reshape(
            [one_hot(x, n=np.unique(product_type).shape[0] + 1, filters='')
             for x in product_type],
            product_type.shape)
        sub_area = np.array([
            s.replace(' ', '').replace('-', '').replace('\'', '').replace(',', '')
            for s in sub_area
        ])
        data['sub_area'] = np.reshape(
            [one_hot(x, n=np.unique(sub_area).shape[0] + 1) for x in sub_area],
            sub_area.shape)
        ecology = np.array([
            s.replace(' ', '').replace('-', '').replace('\'', '').replace(',', '')
            for s in ecology
        ])
        data['ecology'] = np.reshape(
            [one_hot(x, n=np.unique(ecology).shape[0] + 1) for x in ecology],
            ecology.shape)
        if not isfile(prep_data_path):
            data.to_csv(prep_data_path)
        else:
            data.to_csv(prep_data_path, mode='a', header=False)
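
# Hedged usage sketch for preprocess(): it streams the source CSV in chunks and appends
# the encoded rows to the module-level `prep_data_path`. The file name and chunk size
# below are assumptions, not values from the original code; `read_csv`, `one_hot`,
# `isfile`, `np`, `gfile`, and `prep_data_path` must already be defined by the module.
preprocess('train.csv', chunksize=10000)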
def load_model(net, output_path, file_name):
    """Loads HDF5 weights from output_path into an existing Keras model."""
    weights_path = os.path.join(output_path, file_name + '.h5')
    local_filename = weights_path.split('/')[-1]
    tmp_filename = os.path.join(tempfile.gettempdir(),
                                str(int(time.time())) + '_' + local_filename)
    # Stage the weights in a local temp file, since Keras cannot read remote paths directly.
    gfile.Copy(weights_path, tmp_filename)
    net.load_weights(tmp_filename)
    gfile.Remove(tmp_filename)
def copy_to_experiment_dir(config_file):
    # copy config file to the experiment directory
    saved_config_file_path = _config_file_path_to_copy(config_file)

    # HACK: Workaround for a TensorFlow bug; the following two lines can be removed
    # once it is resolved upstream.
    # issue link: https://github.com/tensorflow/tensorflow/issues/28508
    if gfile.Exists(saved_config_file_path):
        gfile.Remove(saved_config_file_path)

    gfile.Copy(config_file, saved_config_file_path)
def save_model(net, model_json, output_path, file_name):
    """Serializes the model architecture to JSON and the weights to HDF5."""
    with gfile.Open(output_path + file_name + '.json', 'w') as json_file:
        json_file.write(model_json)

    # serialize weights to HDF5
    weight_path = os.path.join(output_path, file_name + '.h5')
    local_filename = weight_path.split('/')[-1]
    tmp_filename = os.path.join(tempfile.gettempdir(),
                                str(int(time.time())) + '_' + local_filename)
    # Write the weights locally first, then copy them into place
    # (output_path may be a remote location such as GCS).
    net.save_weights(tmp_filename)
    gfile.Copy(tmp_filename, weight_path, overwrite=True)
    gfile.Remove(tmp_filename)
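
# Hedged usage sketch for the save/load pair above: `net` is assumed to be a compiled
# Keras model, '/tmp/models/' an output directory reachable by tf.gfile, and
# 'example_net' an illustrative file name.
model_json = net.to_json()
save_model(net, model_json, output_path='/tmp/models/', file_name='example_net')
load_model(net, output_path='/tmp/models/', file_name='example_net')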
def run(self, corner, subvol_size, reset_counters=True):
    """Runs FFN inference over a subvolume.

    Args:
      corner: start of the subvolume (z, y, x)
      subvol_size: size of the subvolume (z, y, x)
      reset_counters: whether to reset the counters

    Returns:
      Canvas object with the segmentation or None if the canvas could not be
      created or the segmentation subvolume already exists.
    """
    if reset_counters:
        self.counters.reset()

    seg_path = storage.segmentation_path(
        self.request.segmentation_output_dir, corner)
    prob_path = storage.object_prob_path(
        self.request.segmentation_output_dir, corner)
    cpoint_path = storage.checkpoint_path(
        self.request.segmentation_output_dir, corner)

    if gfile.Exists(seg_path):
        if pyborgletinfo.RunningUnderBorglet():
            pywrapborgletlib.BorgletLib.SetStatusMsg(
                'Segmentation already complete; exiting.')
        return None

    canvas, alignment = self.make_canvas(corner, subvol_size)
    if canvas is None:
        return None

    if gfile.Exists(cpoint_path):
        canvas.restore_checkpoint(cpoint_path)

    if self.request.alignment_options.save_raw:
        image_path = storage.subvolume_path(
            self.request.segmentation_output_dir, corner, 'align')
        with storage.atomic_file(image_path) as fd:
            np.savez_compressed(fd, im=canvas.image)

    canvas.segment_all(seed_policy=self.get_seed_policy(corner, subvol_size))
    self.save_segmentation(canvas, alignment, seg_path, prob_path)

    # Attempt to remove the checkpoint file now that we no longer need it.
    try:
        gfile.Remove(cpoint_path)
    except:  # pylint: disable=bare-except
        pass

    return canvas
def evaluate_model(
    dataset_path: str, model_path: str, metric_name: str
) -> NamedTuple('Outputs', [('metric_name', str), ('metric_value', float),
                            ('mlpipeline_metrics', 'Metrics')]):
    """Evaluates a trained sklearn model."""
    import joblib
    # import pickle
    import json
    import pandas as pd
    import subprocess
    import sys

    from tensorflow import gfile
    from sklearn.metrics import accuracy_score, recall_score

    df_test = pd.read_csv(dataset_path)
    X_test = df_test.drop('Cover_Type', axis=1)
    y_test = df_test['Cover_Type']

    # Copy the model from GCS
    model_filename = 'model.pkl'
    gcs_model_filepath = '{}/{}'.format(model_path, model_filename)
    print(gcs_model_filepath)
    if gfile.Exists(model_filename):
        gfile.Remove(model_filename)
    gfile.Copy(gcs_model_filepath, model_filename)

    with open(model_filename, 'rb') as model_file:
        model = joblib.load(model_file)

    y_hat = model.predict(X_test)

    if metric_name == 'accuracy':
        metric_value = accuracy_score(y_test, y_hat)
    elif metric_name == 'recall':
        metric_value = recall_score(y_test, y_hat)
    else:
        metric_name = 'N/A'
        metric_value = 0

    # Export the metric
    metrics = {
        'metrics': [{
            'name': metric_name,
            'numberValue': float(metric_value)
        }]
    }

    return metric_name, metric_value, json.dumps(metrics)
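
# Hedged usage sketch: the GCS paths below are assumptions; `dataset_path` is expected
# to be a CSV with a 'Cover_Type' column and `model_path` a directory holding 'model.pkl'.
name, value, metrics_json = evaluate_model(
    dataset_path='gs://my-bucket/data/testing.csv',
    model_path='gs://my-bucket/models/run-001',
    metric_name='accuracy')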
def save_config_file(config_file, dest_dir):
    if not gfile.Exists(dest_dir):
        gfile.MkDir(dest_dir)

    config_file_dest = os.path.join(dest_dir, 'blueoil_config.yaml')

    # HACK: Workaround for a TensorFlow bug; the following two lines can be removed
    # once it is resolved upstream.
    # issue link: https://github.com/tensorflow/tensorflow/issues/28508
    if gfile.Exists(config_file_dest):
        gfile.Remove(config_file_dest)

    return gfile.Copy(config_file, config_file_dest)
def write_production():
    """Copies staged templates to the production directory.

    This function assumes that the template and associated metadata files are
    stored in a folder of the form gs://<template_staging_bucket>/<release_name>.
    It copies the templates from the <release_name> folder to two new locations:
    gs://<prod_bucket>/<release_name> and gs://<prod_bucket>/latest. Both folders
    contain identical contents; the <release_name> folder allows customers to pin
    to a specific release, and the `latest` folder gives the UI a location at
    which to point.

    Raises:
      GOSError if there was an error reading or writing a file.
    """
    prod_root = FLAGS.template_prod_bucket
    template_staging_root = FLAGS.template_staging_bucket
    template_dir = os.path.join(template_staging_root, FLAGS.candidate_name)
    if not gfile.IsDirectory(template_dir):
        logging.fatal(
            'Template staging directory %s does not exist or is not a '
            'directory.', template_dir)

    release_dir = os.path.join(prod_root, FLAGS.release_name)
    if gfile.IsDirectory(release_dir):
        logging.fatal(
            'Template release directory %s already exists. Aborting.', release_dir)

    logging.info('Copying folder from %s to %s.', template_dir, release_dir)
    gfile.MkDir(release_dir)
    CopyRecursively(template_dir, release_dir)

    # TODO: If we ever delete templates, they will stick around in
    # `latest`; evaluate something rsync-like in the future.
    latest_dir = os.path.join(prod_root, LATEST_FOLDER_NAME)
    if gfile.Exists(latest_dir):
        # Replace a stray file with a directory; leave an existing directory alone.
        if not gfile.IsDirectory(latest_dir):
            gfile.Remove(latest_dir)
            gfile.MkDir(latest_dir)
    else:
        gfile.MkDir(latest_dir)

    logging.info('Copying folder from %s to %s.', template_dir, latest_dir)
    CopyRecursively(template_dir, latest_dir, overwrite=True)
def embed_data(x, dset, path):
    """Embeds x into the code space using the autoencoder."""
    # Nothing to embed: return an empty array in the 10-dimensional code space.
    if not x.shape[0]:
        return np.zeros(shape=(0, 10))

    # load model and weights
    json_path = os.path.join(path, 'ae_{}.json'.format(dset))
    print('load model from json file:', json_path)
    with gfile.Open(json_path) as f:
        pt_ae = model_from_json(f.read())
    weights_path = os.path.join(path, 'ae_{}_weights.h5'.format(dset))
    print('load code space from:', weights_path)
    local_filename = weights_path.split('/')[-1]
    tmp_filename = os.path.join(tempfile.gettempdir(),
                                str(int(time.time())) + '_' + local_filename)
    gfile.Copy(weights_path, tmp_filename)
    pt_ae.load_weights(tmp_filename)
    gfile.Remove(tmp_filename)

    print('***********************', x.shape)
    x = x.reshape(-1, np.prod(x.shape[1:]))
    print('***********************', x.shape)
    get_embeddings = K.function([pt_ae.input], [pt_ae.layers[3].output])
    get_reconstruction = K.function([pt_ae.layers[4].input], [pt_ae.output])
    x_embedded = predict_with_k_fn(get_embeddings, x)[0]
    x_recon = predict_with_k_fn(get_reconstruction, x_embedded)[0]
    reconstruction_mse = np.mean(np.square(x - x_recon))
    print('using pretrained embeddings; sanity check, total reconstruction error:',
          np.mean(reconstruction_mse))

    del pt_ae

    return x_embedded
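
# Hedged usage sketch: `x_train`, the dataset tag 'mnist', and the 'pretrained/' directory
# are assumptions; the directory is expected to contain 'ae_mnist.json' and
# 'ae_mnist_weights.h5' for the pretrained autoencoder.
x_embedded = embed_data(x_train, dset='mnist', path='pretrained/')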
def train_evaluate(job_dir, training_dataset_path, validation_dataset_path,
                   alpha, max_iter, hptune):
    with gfile.Open(training_dataset_path, 'r') as f:
        # Read a sample of the training data (first 1000 rows)
        df_train = pd.read_csv(f, nrows=1000)

    with gfile.Open(validation_dataset_path, 'r') as f:
        # Read a sample of the validation data (first 100 rows)
        df_validation = pd.read_csv(f, nrows=100)

    if not hptune:
        df_train = pd.concat([df_train, df_validation])

    numeric_feature_indexes = slice(0, 10)
    categorical_feature_indexes = slice(10, 12)

    preprocessor = ColumnTransformer(
        transformers=[('num', StandardScaler(), numeric_feature_indexes),
                      ('cat', OneHotEncoder(), categorical_feature_indexes)])

    pipeline = Pipeline([('preprocessor', preprocessor),
                         ('classifier', SGDClassifier(loss='log'))])

    num_features_type_map = {
        feature: 'float64'
        for feature in df_train.columns[numeric_feature_indexes]
    }
    df_train = df_train.astype(num_features_type_map)
    df_validation = df_validation.astype(num_features_type_map)

    print('Starting training: alpha={}, max_iter={}'.format(alpha, max_iter))
    X_train = df_train.drop('Cover_Type', axis=1)
    y_train = df_train['Cover_Type']

    pipeline.set_params(classifier__alpha=alpha, classifier__max_iter=max_iter)
    pipeline.fit(X_train, y_train)

    if hptune:
        X_validation = df_validation.drop('Cover_Type', axis=1)
        y_validation = df_validation['Cover_Type']
        accuracy = pipeline.score(X_validation, y_validation)
        print('Model accuracy: {}'.format(accuracy))
        # Log it with hypertune
        hpt = hypertune.HyperTune()
        hpt.report_hyperparameter_tuning_metric(
            hyperparameter_metric_tag='accuracy', metric_value=accuracy)

    # Save the model
    if not hptune:
        model_filename = 'model.pkl'
        gcs_model_path = '{}/{}'.format(job_dir, model_filename)
        if gfile.Exists(gcs_model_path):
            gfile.Remove(gcs_model_path)
        # Open in binary mode so pickle can write bytes.
        with gfile.Open(gcs_model_path, 'wb') as wf:
            pickle.dump(pipeline, wf)
        print('Saved model in: {}'.format(gcs_model_path))
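
# Hedged usage sketch: the job directory, dataset paths, and hyperparameters below are
# assumptions, not values from the original training script.
train_evaluate(
    job_dir='gs://my-bucket/jobs/run-001',
    training_dataset_path='gs://my-bucket/data/training.csv',
    validation_dataset_path='gs://my-bucket/data/validation.csv',
    alpha=0.0001,
    max_iter=500,
    hptune=False)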
def remove_tmp_files():
    """Removes temporary files created by the profiler."""
    for file_name in gfile.ListDirectory(PROFILER_LOG_DIR):
        if 'profiler-ui.' in file_name:
            gfile.Remove(os.path.join(PROFILER_LOG_DIR, file_name))