def train(_):
  training_dir = pjoin(FLAGS.training_set, FLAGS.train_subdir)
  feature_context = Datasets.get_context(training_dir)
  (feature_names, label_names) = feature_context.multispec_feature_groups
  training_dataset = Datasets.dict.read_dataset(training_dir)
  (feature_train_data, labels_train_data) = transform_dataset(
      feature_context, training_dataset)

  params = {
      'objective': 'multi:softprob',
      'verbose': False,
      'num_class': len(label_names),
      'max_depth': 6,
      'nthread': 4,
      'silent': 1
  }
  xg_train = xgb.DMatrix(feature_train_data, label=labels_train_data)
  xg_model = xgb.train(params, xg_train, FLAGS.rounds)

  model_path = pjoin(FLAGS.local_dir, "iterator.model")
  xg_model.save_model(model_path)
  output_path = pjoin(FLAGS.training_set, "xgboost/iterator.model")
  file_io.copy(model_path, output_path, overwrite=True)
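xgboost can only write its model file to the local filesystem, so the example above saves locally and then pushes the artifact to the bucket with file_io.copy. A minimal standalone sketch of that save-then-copy pattern (the helper name and paths are illustrative, not from the original):

import os
import tempfile

from tensorflow.python.lib.io import file_io


def save_via_local_copy(write_fn, remote_path):
  # write_fn is any callable that writes to a local path (e.g. a model's
  # save method); remote_path may be a gs:// or s3:// URI.
  local_path = os.path.join(tempfile.mkdtemp(), os.path.basename(remote_path))
  write_fn(local_path)
  file_io.copy(local_path, remote_path, overwrite=True)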
def _save_and_write_assets(self, assets_collection_to_add=None):
  """Saves asset to the meta graph and writes asset files to disk.

  Args:
    assets_collection_to_add: The collection where the asset paths are setup.
  """
  asset_source_filepath_list = _maybe_save_assets(assets_collection_to_add)

  # Return if there are no assets to write.
  if not asset_source_filepath_list:
    tf_logging.info("No assets to write.")
    return

  assets_destination_dir = os.path.join(
      compat.as_bytes(self._export_dir),
      compat.as_bytes(constants.ASSETS_DIRECTORY))

  if not file_io.file_exists(assets_destination_dir):
    file_io.recursive_create_dir(assets_destination_dir)

  # Copy each asset from source path to destination path.
  for asset_source_filepath in asset_source_filepath_list:
    asset_source_filename = os.path.basename(asset_source_filepath)

    asset_destination_filepath = os.path.join(
        compat.as_bytes(assets_destination_dir),
        compat.as_bytes(asset_source_filename))

    # Only copy the asset file to the destination if it does not already
    # exist. This is to ensure that an asset with the same name defined as
    # part of multiple graphs is only copied the first time.
    if not file_io.file_exists(asset_destination_filepath):
      file_io.copy(asset_source_filepath, asset_destination_filepath)

  tf_logging.info("Assets written to: %s", assets_destination_dir)
def testCopyOverwriteFalse(self):
  file_path = os.path.join(self._base_dir, "temp_file")
  file_io.FileIO(file_path, mode="w").write("testing")
  copy_path = os.path.join(self._base_dir, "copy_file")
  file_io.FileIO(copy_path, mode="w").write("copy")
  with self.assertRaises(errors.AlreadyExistsError):
    file_io.copy(file_path, copy_path, overwrite=False)
def _save_and_write_assets(self, assets_collection_to_add=None):
  """Saves asset to the meta graph and writes asset files to disk.

  Args:
    assets_collection_to_add: The collection where the asset paths are setup.
  """
  asset_filename_map = _maybe_save_assets(assets_collection_to_add)

  # Return if there are no assets to write.
  if not asset_filename_map:
    tf_logging.info("No assets to write.")
    return

  assets_destination_dir = saved_model_utils.get_or_create_assets_dir(
      self._export_dir)

  # Copy each asset from source path to destination path.
  for asset_basename, asset_source_filepath in asset_filename_map.items():
    asset_destination_filepath = os.path.join(
        compat.as_bytes(assets_destination_dir),
        compat.as_bytes(asset_basename))

    # Only copy the asset file to the destination if it does not already
    # exist. This is to ensure that an asset with the same name defined as
    # part of multiple graphs is only copied the first time.
    if not file_io.file_exists(asset_destination_filepath):
      file_io.copy(asset_source_filepath, asset_destination_filepath)

  tf_logging.info("Assets written to: %s",
                  compat.as_text(assets_destination_dir))
def down(id, cloud_path=None):
  shard = get_shard(id)
  to_path = "%s/%d" % (emb_path, shard)
  if not os.path.exists(to_path):
    try:
      os.mkdir(to_path)
    except OSError as e:
      if e.errno != 17:  # 17 == EEXIST: directory already exists
        raise e
  to_filepath = "%s/%d.emb" % (to_path, id)
  url = 'http://ml.daangn.com/articles/image_embeddings/%s' % id_to_path(id)
  logging.info('down: %s', url)
  result = call(
      ['curl', '-f', '--connect-timeout', '2', '-o', to_filepath, url])
  if not os.path.exists(to_filepath):
    return 0
  if os.stat(to_filepath).st_size < 1:
    os.remove(to_filepath)
    return 0
  if cloud_path:
    to_gs_filepath = '%s/%s' % (cloud_path, to_filepath)
    if file_io.file_exists(to_gs_filepath):
      return 0
    to_gs_path = '%s/%s' % (cloud_path, to_path)
    if not file_io.is_directory(to_gs_path):
      file_io.create_dir(to_gs_path)
    file_io.copy(to_filepath, to_gs_filepath)
  return 1
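The try/except around os.mkdir tolerates a directory that already exists (errno 17 is EEXIST). On Python 3 the same guard can be written more directly; a small sketch:

import os


def ensure_dir(path):
  # Equivalent to catching OSError with errno 17 (EEXIST) around os.mkdir.
  os.makedirs(path, exist_ok=True)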
def _save_and_write_assets(self, assets_collection_to_add=None):
  """Saves asset to the meta graph and writes asset files to disk.

  Args:
    assets_collection_to_add: The collection where the asset paths are setup.
  """
  asset_source_filepath_list = self._save_assets(assets_collection_to_add)

  # Return if there are no assets to write.
  if not asset_source_filepath_list:
    tf_logging.info("No assets to write.")
    return

  assets_destination_dir = os.path.join(
      compat.as_bytes(self._export_dir),
      compat.as_bytes(constants.ASSETS_DIRECTORY))

  if not file_io.file_exists(assets_destination_dir):
    file_io.recursive_create_dir(assets_destination_dir)

  # Copy each asset from source path to destination path.
  for asset_source_filepath in asset_source_filepath_list:
    asset_source_filename = os.path.basename(asset_source_filepath)

    asset_destination_filepath = os.path.join(
        compat.as_bytes(assets_destination_dir),
        compat.as_bytes(asset_source_filename))
    file_io.copy(
        asset_source_filepath, asset_destination_filepath, overwrite=True)

  tf_logging.info("Assets written to: %s", assets_destination_dir)
def testCopy(self):
  file_path = os.path.join(self._base_dir, "temp_file")
  file_io.FileIO(file_path, mode="w").write("testing")
  copy_path = os.path.join(self._base_dir, "copy_file")
  file_io.copy(file_path, copy_path)
  self.assertTrue(file_io.file_exists(copy_path))
  self.assertEqual(b"testing", file_io.read_file_to_string(file_path))
def copy_file(src, dest):
  if not file_io.file_exists(src):
    raise Exception("Src file doesn't exist at %s" % src)
  if file_io.is_directory(src):
    copy_dir(src, dest)
    return
  file_io.copy(src, dest, overwrite=True)
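copy_dir is referenced above but not shown; a plausible sketch of it, modeled on the recursive_copy helper that appears later in this collection (the implementation is an assumption, not the original):

def copy_dir(src, dest):
  # Hypothetical recursive counterpart to copy_file above.
  file_io.recursive_create_dir(dest)
  for name in file_io.list_directory(src):
    old_path = os.path.join(src, name)
    new_path = os.path.join(dest, name)
    if file_io.is_directory(old_path):
      copy_dir(old_path, new_path)
    else:
      file_io.copy(old_path, new_path, overwrite=True)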
def _load_tf_custom_op(model_path):
  """Loads a custom TF OP (in .so format) from /assets.extra directory."""
  assets_dir = os.path.join(model_path, _CUSTOM_OP_DIRECTORY_NAME)
  if file_io.is_directory(assets_dir):
    custom_ops_pattern = os.path.join(assets_dir, _CUSTOM_OP_SUFFIX)
    for custom_op_path_original in file_io.get_matching_files(
        custom_ops_pattern):
      logging.info("Found custom op file: %s", custom_op_path_original)

      if custom_op_path_original.startswith("gs://"):
        if not os.path.isdir(_CUSTOM_OP_LOCAL_DIR):
          os.makedirs(_CUSTOM_OP_LOCAL_DIR)
        custom_op_path_local = os.path.join(
            _CUSTOM_OP_LOCAL_DIR, os.path.basename(custom_op_path_original))
        logging.info("Copying custom op from: %s to: %s",
                     custom_op_path_original, custom_op_path_local)
        file_io.copy(custom_op_path_original, custom_op_path_local, True)
      else:
        custom_op_path_local = custom_op_path_original

      try:
        import tensorflow as tf  # pylint: disable=g-import-not-at-top
        logging.info("Loading custom op: %s", custom_op_path_local)
        logging.info("TF Version: %s", tf.__version__)
        tf.load_op_library(custom_op_path_local)
      except RuntimeError as e:
        logging.exception(
            "Failed to load custom op: %s with error: %s. Prediction "
            "will likely fail due to missing operations.",
            custom_op_path_local, e)
def testCopyOverwriteFalse(self):
  file_path = os.path.join(self._base_dir, "temp_file")
  file_io.write_string_to_file(file_path, "testing")
  copy_path = os.path.join(self._base_dir, "copy_file")
  file_io.write_string_to_file(copy_path, "copy")
  with self.assertRaises(errors.AlreadyExistsError):
    file_io.copy(file_path, copy_path, overwrite=False)
def testCopyOverwrite(self):
  file_path = os.path.join(self._base_dir, "temp_file")
  file_io.FileIO(file_path, mode="w").write("testing")
  copy_path = os.path.join(self._base_dir, "copy_file")
  file_io.FileIO(copy_path, mode="w").write("copy")
  file_io.copy(file_path, copy_path, overwrite=True)
  self.assertTrue(file_io.file_exists(copy_path))
  self.assertEqual("testing", file_io.FileIO(file_path, mode="r").read())
def testCopyOverwrite(self):
  file_path = os.path.join(self._base_dir, "temp_file")
  file_io.write_string_to_file(file_path, "testing")
  copy_path = os.path.join(self._base_dir, "copy_file")
  file_io.write_string_to_file(copy_path, "copy")
  file_io.copy(file_path, copy_path, overwrite=True)
  self.assertTrue(file_io.file_exists(copy_path))
  self.assertEqual(b"testing", file_io.read_file_to_string(file_path))
def testCopyOverwrite(self):
  file_path = os.path.join(self._base_dir, "temp_file")
  file_io.FileIO(file_path, mode="w").write("testing")
  copy_path = os.path.join(self._base_dir, "copy_file")
  file_io.FileIO(copy_path, mode="w").write("copy")
  file_io.copy(file_path, copy_path, overwrite=True)
  self.assertTrue(file_io.file_exists(copy_path))
  self.assertEqual(b"testing", file_io.FileIO(file_path, mode="r").read())
def testCopy(self):
  file_path = os.path.join(self.get_temp_dir(), "temp_file")
  file_io.write_string_to_file(file_path, "testing")
  copy_path = os.path.join(self.get_temp_dir(), "copy_file")
  file_io.copy(file_path, copy_path)
  self.assertTrue(file_io.file_exists(copy_path))
  self.assertEqual(b"testing", file_io.read_file_to_string(file_path))
  file_io.delete_file(file_path)
  file_io.delete_file(copy_path)
def testCopy(self):
  file_path = os.path.join(self._base_dir, "temp_file")
  file_io.FileIO(file_path, mode="w").write("testing")
  copy_path = os.path.join(self._base_dir, "copy_file")
  file_io.copy(file_path, copy_path)
  self.assertTrue(file_io.file_exists(copy_path))
  f = file_io.FileIO(file_path, mode="r")
  self.assertEqual("testing", f.read())
  self.assertEqual(7, f.tell())
def preprocess(train_dataset, output_dir, eval_dataset, checkpoint,
               pipeline_option):
  """Preprocess data in Cloud with DataFlow."""
  import apache_beam as beam
  import google.datalab.utils
  from . import _preprocess

  if checkpoint is None:
    checkpoint = _util._DEFAULT_CHECKPOINT_GSURL

  job_name = ('preprocess-image-classification-' +
              datetime.datetime.now().strftime('%y%m%d-%H%M%S'))
  staging_package_url = _util.repackage_to_staging(output_dir)
  tmpdir = tempfile.mkdtemp()
  # suppress DataFlow warnings about wheel package as extra package.
  original_level = logging.getLogger().getEffectiveLevel()
  logging.getLogger().setLevel(logging.ERROR)
  try:
    # Workaround for DataFlow 2.0, which doesn't work well with extra packages in GCS.
    # Remove when the issue is fixed and new version of DataFlow is included in Datalab.
    extra_packages = [staging_package_url, _TF_GS_URL, _PROTOBUF_GS_URL]
    local_packages = [os.path.join(tmpdir, os.path.basename(p))
                      for p in extra_packages]
    for source, dest in zip(extra_packages, local_packages):
      file_io.copy(source, dest, overwrite=True)

    options = {
        'staging_location': os.path.join(output_dir, 'tmp', 'staging'),
        'temp_location': os.path.join(output_dir, 'tmp'),
        'job_name': job_name,
        'project': _util.default_project(),
        'extra_packages': local_packages,
        'teardown_policy': 'TEARDOWN_ALWAYS',
        'no_save_main_session': True
    }
    if pipeline_option is not None:
      options.update(pipeline_option)

    opts = beam.pipeline.PipelineOptions(flags=[], **options)
    p = beam.Pipeline('DataflowRunner', options=opts)
    _preprocess.configure_pipeline(p, train_dataset, eval_dataset, checkpoint,
                                   output_dir, job_name)
    job_results = p.run()
  finally:
    shutil.rmtree(tmpdir)
    logging.getLogger().setLevel(original_level)

  if (_util.is_in_IPython()):
    import IPython
    dataflow_url = ('https://console.developers.google.com/dataflow?project=%s' %
                    _util.default_project())
    html = 'Job "%s" submitted.' % job_name
    html += ('<p>Click <a href="%s" target="_blank">here</a> to track '
             'preprocessing job. <br/>' % dataflow_url)
    IPython.display.display_html(html, raw=True)
  return google.datalab.utils.DataflowJob(job_results)
def testCopy(self, join):
  file_path = join(self._base_dir, "temp_file")
  file_io.FileIO(file_path, mode="w").write("testing")
  copy_path = join(self._base_dir, "copy_file")
  file_io.copy(file_path, copy_path)
  self.assertTrue(file_io.file_exists(copy_path))
  f = file_io.FileIO(file_path, mode="r")
  self.assertEqual("testing", f.read())
  self.assertEqual(7, f.tell())
def batch_predict(dataset, model_dir, output_csv, output_bq_table,
                  pipeline_option):
  """Batch predict running in cloud."""
  import apache_beam as beam
  import google.datalab.utils
  from . import _predictor

  if output_csv is None and output_bq_table is None:
    raise ValueError('output_csv and output_bq_table cannot both be None.')
  if 'temp_location' not in pipeline_option:
    raise ValueError('"temp_location" is not set in cloud.')

  job_name = ('batch-predict-image-classification-' +
              datetime.datetime.now().strftime('%y%m%d-%H%M%S'))
  staging_package_url = _util.repackage_to_staging(
      pipeline_option['temp_location'])
  tmpdir = tempfile.mkdtemp()
  # suppress DataFlow warnings about wheel package as extra package.
  original_level = logging.getLogger().getEffectiveLevel()
  logging.getLogger().setLevel(logging.ERROR)
  try:
    # Workaround for DataFlow 2.0, which doesn't work well with extra packages in GCS.
    # Remove when the issue is fixed and new version of DataFlow is included in Datalab.
    extra_packages = [staging_package_url, _TF_GS_URL, _PROTOBUF_GS_URL]
    local_packages = [os.path.join(tmpdir, os.path.basename(p))
                      for p in extra_packages]
    for source, dest in zip(extra_packages, local_packages):
      file_io.copy(source, dest, overwrite=True)

    options = {
        'staging_location': os.path.join(pipeline_option['temp_location'],
                                         'staging'),
        'job_name': job_name,
        'project': _util.default_project(),
        'extra_packages': local_packages,
        'teardown_policy': 'TEARDOWN_ALWAYS',
        'no_save_main_session': True
    }
    options.update(pipeline_option)

    opts = beam.pipeline.PipelineOptions(flags=[], **options)
    p = beam.Pipeline('DataflowRunner', options=opts)
    _predictor.configure_pipeline(p, dataset, model_dir, output_csv,
                                  output_bq_table)
    job_results = p.run()
  finally:
    shutil.rmtree(tmpdir)
    logging.getLogger().setLevel(original_level)

  if (_util.is_in_IPython()):
    import IPython
    dataflow_url = ('https://console.developers.google.com/dataflow?project=%s' %
                    _util.default_project())
    html = 'Job "%s" submitted.' % job_name
    html += ('<p>Click <a href="%s" target="_blank">here</a> to track '
             'batch prediction job. <br/>' % dataflow_url)
    IPython.display.display_html(html, raw=True)
  return google.datalab.utils.DataflowJob(job_results)
def run_analysis(args):
  """Builds analysis files for training."""
  # Read the schema and input feature types
  schema_list = json.loads(file_io.read_file_to_string(args.schema_file))

  run_numerical_categorical_analysis(args, schema_list)

  # Also save a copy of the schema in the output folder.
  file_io.copy(args.schema_file,
               os.path.join(args.output_dir, SCHEMA_FILE),
               overwrite=True)
def session(
    cls,
    session,  # type: tf.Session
    path,  # type: str
    network  # type: Union[tf.Tensor, str, List[str]]
):
  # type: (...) -> str
  """
  Freeze a graph by taking a session and a network and storing
  the results into a pb file at the given path. This function will
  convert variables to constants, which is necessary for JVM serving.

  :param session: TF Session
  :param path: Where the graph will be written, this can be local filesystem or GCS
  :param network: Tensor, Operation name, or list of Operation names
  :return: Path to the written graph
  """
  input_graph_def = tf.get_default_graph().as_graph_def()
  time = timeit.default_timer()
  logger.info("Freezing model at {}".format(time))

  if isinstance(network, tf.Tensor):
    output_node_names = [t.op.name for t in [network]]
  elif isinstance(network, str):
    output_node_names = [network]
  elif isinstance(network, list):
    output_node_names = network
  else:
    raise ValueError("Network must be a Tensor, String or List of Strings")

  output_graph_def = tf.graph_util.convert_variables_to_constants(
      session,
      input_graph_def,
      output_node_names,
      variable_names_blacklist=["global_step"])

  if FreezeGraph.__is_gcs(path):
    import tempfile
    local_path = tempfile.mktemp("local_temp_graph")
    file_io.write_string_to_file(local_path,
                                 output_graph_def.SerializeToString())
    file_io.copy(local_path, path, overwrite=True)
  else:
    file_io.write_string_to_file(path, output_graph_def.SerializeToString())

  logger.info("Froze graph in %4d seconds" % (timeit.default_timer() - time))
  return path
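A usage sketch for the freeze helper above, written against the TF 1.x API the snippet assumes; the toy graph and output path are illustrative:

import tensorflow as tf

x = tf.placeholder(tf.float32, shape=[None, 2], name="x")
w = tf.Variable(tf.ones([2, 1]), name="w")
y = tf.matmul(x, w, name="y")

with tf.Session() as sess:
  sess.run(tf.global_variables_initializer())
  # Freezes the graph rooted at tensor `y` into a single .pb file.
  FreezeGraph.session(sess, "/tmp/frozen_graph.pb", y)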
@contextmanager
def open(self, filename: str, mode: str):
  if is_external_location(filename):
    # there seems to be an issue with GzipFile and fileobj
    with tempfile.TemporaryDirectory(suffix='-gzip') as gzip_dir:
      local_gzip_file = os.path.join(gzip_dir, os.path.basename(filename))
      with ClosingGzipFile(filename=local_gzip_file, mode=mode) as local_fp:
        yield local_fp
      tf_file_io.copy(local_gzip_file, filename, overwrite=True)
  else:
    with ClosingGzipFile(filename=filename, mode=mode) as local_fp:
      yield local_fp
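Used as a context manager (hence the @contextmanager decorator on the generator above), the method writes the gzip stream to a local temp file and only copies it to the external location after the file is closed. A hypothetical use, assuming an instance named writer and a writable gs:// path:

with writer.open('gs://my-bucket/output/data.jsonl.gz', mode='wb') as fp:
  fp.write(b'{"id": 1}\n')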
def main(_):
  input_url = 's3://' + args.inputbucket + "/"
  output_url = 's3://' + args.outputbucket + "/"

  os.makedirs(args.datadir)

  # first, we copy files from pachyderm into a convenient
  # local directory for processing.
  input_uri = os.path.join(input_url, args.trainingdata)
  training_data_path = os.path.join(args.datadir, args.trainingdata)
  print("copying {} to {}".format(input_uri, training_data_path))
  file_io.copy(input_uri, training_data_path, True)

  (train_images, train_labels), (test_images, test_labels) = \
      tf.keras.datasets.mnist.load_data(path=training_data_path)

  train_labels = train_labels[:1000]
  test_labels = test_labels[:1000]

  train_images = train_images[:1000].reshape(-1, 28 * 28) / 255.0
  test_images = test_images[:1000].reshape(-1, 28 * 28) / 255.0

  # Returns a short sequential model
  def create_model():
    model = tf.keras.models.Sequential([
        keras.layers.Dense(512, activation=tf.keras.activations.relu,
                           input_shape=(784,)),
        keras.layers.Dropout(0.2),
        keras.layers.Dense(10, activation=tf.keras.activations.softmax)
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(),
                  loss=tf.keras.losses.sparse_categorical_crossentropy,
                  metrics=['accuracy'])
    return model

  # Create a basic model instance
  model = create_model()
  model.summary()

  model.fit(train_images, train_labels, batch_size=32, epochs=5,
            validation_data=(test_images, test_labels))

  # Save entire model to a HDF5 file
  model_file = os.path.join(args.datadir, args.modelfile)
  model.save(model_file)

  # Copy file over to Pachyderm
  output_uri = os.path.join(output_url, args.modelfile)
  print("copying {} to {}".format(model_file, output_uri))
  file_io.copy(model_file, output_uri, True)
def recursive_copy(src_dir, dest_dir):
  """Copy the contents of src_dir into the folder dest_dir.

  Args:
    src_dir: gcs or local path.
    dest_dir: gcs or local path.
  """
  file_io.recursive_create_dir(dest_dir)
  for file_name in file_io.list_directory(src_dir):
    old_path = os.path.join(src_dir, file_name)
    new_path = os.path.join(dest_dir, file_name)

    if file_io.is_directory(old_path):
      recursive_copy(old_path, new_path)
    else:
      file_io.copy(old_path, new_path, overwrite=True)
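A hypothetical call, along the lines of how the export_fn examples later in this collection publish a SavedModel directory:

# Mirror a local export directory into a GCS model folder (paths illustrative).
recursive_copy('/tmp/export/1523038400', 'gs://my-bucket/models/current')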
def _recursive_copy(src_dir, dest_dir):
  """Copy the contents of src_dir into the folder dest_dir.

  Args:
    src_dir: gcs or local path.
    dest_dir: gcs or local path. When called, dest_dir should exist.
  """
  src_dir = python_portable_string(src_dir)
  dest_dir = python_portable_string(dest_dir)

  file_io.recursive_create_dir(dest_dir)
  for file_name in file_io.list_directory(src_dir):
    old_path = os.path.join(src_dir, file_name)
    new_path = os.path.join(dest_dir, file_name)

    if file_io.is_directory(old_path):
      _recursive_copy(old_path, new_path)
    else:
      file_io.copy(old_path, new_path, overwrite=True)
def _copy_assets_to_destination_dir(self, asset_filename_map):
  """Copy all assets from source path to destination path."""
  assets_destination_dir = saved_model_utils.get_or_create_assets_dir(
      self._export_dir)

  # Copy each asset from source path to destination path.
  for asset_basename, asset_source_filepath in asset_filename_map.items():
    asset_destination_filepath = os.path.join(
        compat.as_bytes(assets_destination_dir),
        compat.as_bytes(asset_basename))

    # Only copy the asset file to the destination if it does not already
    # exist. This is to ensure that an asset with the same name defined as
    # part of multiple graphs is only copied the first time.
    if not file_io.file_exists(asset_destination_filepath):
      file_io.copy(asset_source_filepath, asset_destination_filepath)

  tf_logging.info("Assets written to: %s",
                  compat.as_text(assets_destination_dir))
def copy_assets_to_destination_dir(asset_filename_map, destination_dir):
  """Copy all assets from source path to destination path."""
  assets_destination_dir = saved_model_utils.get_or_create_assets_dir(
      destination_dir)

  # Copy each asset from source path to destination path.
  for asset_basename, asset_source_filepath in asset_filename_map.items():
    asset_destination_filepath = os.path.join(
        compat.as_bytes(assets_destination_dir),
        compat.as_bytes(asset_basename))

    # Only copy the asset file to the destination if it does not already
    # exist. This is to ensure that an asset with the same name defined as
    # part of multiple graphs is only copied the first time.
    if not file_io.file_exists(asset_destination_filepath):
      file_io.copy(asset_source_filepath, asset_destination_filepath)

  tf_logging.info("Assets written to: %s",
                  compat.as_text(assets_destination_dir))
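The map passed in keys each asset by its basename; a hypothetical invocation with illustrative paths:

asset_filename_map = {
    'vocab.txt': '/tmp/assets_staging/vocab.txt',
}
copy_assets_to_destination_dir(asset_filename_map, '/tmp/export/1')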
def datahtml(bucket_name, commit_sha, train_file_path):
  import json
  import seaborn as sns
  import matplotlib.pyplot as plt
  import os
  image_path = os.path.join(bucket_name, commit_sha, 'visualization.png')
  image_url = os.path.join(
      'https://storage.googleapis.com',
      bucket_name.replace('gs://', '', 1),  # strip the gs:// scheme
      commit_sha, 'visualization.png')
  html_path = os.path.join(bucket_name, 'kaggle.html')
  # output visualization to a file
  import pandas as pd
  df_train = pd.read_csv(train_file_path)
  sns.set()
  cols = ['SalePrice', 'OverallQual', 'GrLivArea', 'GarageCars',
          'TotalBsmtSF', 'FullBath', 'YearBuilt']
  sns.pairplot(df_train[cols], size=3)
  plt.savefig('visualization.png')
  from tensorflow.python.lib.io import file_io
  file_io.copy('visualization.png', image_path)
  rendered_template = """
  <html>
      <head>
          <title>correlation image</title>
      </head>
      <body>
          <img src={}>
      </body>
  </html>""".format(image_url)
  file_io.write_string_to_file(html_path, rendered_template)

  metadata = {
      'outputs': [{
          'type': 'web-app',
          'storage': 'gcs',
          'source': html_path,
      }]
  }
  with file_io.FileIO('/mlpipeline-ui-metadata.json', 'w') as f:
    json.dump(metadata, f)
def load(self) -> T5ForConditionalGeneration:
  try:
    if not self.flush_cache:
      return self._fix_t5_model(
          T5ForConditionalGeneration.from_pretrained(
              str(self.model_cache_dir), from_tf=True, force_download=False))
  except (RuntimeError, OSError):
    logging.info('T5 model weights not in cache.')

  m = re.search(r'model_checkpoint_path: "(.+?)"', self.ckpt_prefix)
  assert m is not None, 'checkpoint file malformed'

  # Copy over checkpoint data
  ckpt_patt = re.compile(rf'^{m.group(1)}\.(data-\d+-of-\d+|index|meta)$')
  for name in file_io.list_directory(self.url):
    if not ckpt_patt.match(name):
      continue
    url = os.path.join(self.url, name)
    url_stat = file_io.stat(url)
    cache_file_path = self.model_cache_dir / ckpt_patt.sub(
        rf'{TRANSFO_PREFIX}.\1', name)
    try:
      cs = os.stat(str(cache_file_path))
      if (cs.st_size == url_stat.length and
          cs.st_mtime_ns > url_stat.mtime_nsec and not self.flush_cache):
        logging.info(f'Skipping {name}...')
        continue
    except FileNotFoundError:
      pass
    logging.info(f'Caching {name}...')
    file_io.copy(url, str(cache_file_path), overwrite=True)

  # Transformers expects a model config.json
  config = T5Config.from_pretrained(self.model_type)
  with open(str(self.model_cache_dir / 'config.json'), 'w') as f:
    json.dump(config.__dict__, f, indent=4)
  return self._fix_t5_model(
      T5ForConditionalGeneration.from_pretrained(
          str(self.model_cache_dir), from_tf=True, force_download=False))
def main(argv=None):
  args = parse_arguments(sys.argv if argv is None else argv)

  if args.cloud:
    tmpdir = tempfile.mkdtemp()
    try:
      local_packages = [os.path.join(tmpdir, os.path.basename(p))
                        for p in args.extra_package]
      for source, dest in zip(args.extra_package, local_packages):
        file_io.copy(source, dest, overwrite=True)

      options = {
          'staging_location': os.path.join(args.output_dir, 'tmp', 'staging'),
          'temp_location': os.path.join(args.output_dir, 'tmp', 'staging'),
          'job_name': args.job_name,
          'project': args.project_id,
          'no_save_main_session': True,
          'extra_packages': local_packages,
          'teardown_policy': 'TEARDOWN_ALWAYS',
      }
      opts = beam.pipeline.PipelineOptions(flags=[], **options)
      # Or use BlockingDataflowPipelineRunner
      p = beam.Pipeline('DataflowRunner', options=opts)
      make_prediction_pipeline(p, args)
      print(('Dataflow Job submitted, see Job %s at '
             'https://console.developers.google.com/dataflow?project=%s') %
            (options['job_name'], args.project_id))
      sys.stdout.flush()
      runner_results = p.run()
    finally:
      shutil.rmtree(tmpdir)
  else:
    p = beam.Pipeline('DirectRunner')
    make_prediction_pipeline(p, args)
    runner_results = p.run()

  return runner_results
def main(_):
  # The Tensorflow file_io.walk() function has an issue
  # with iterating over the top level of a bucket.
  # It requires a directory within the bucket.
  # So, we give it one.
  input_url = 's3://' + args.inputbucket + "/data/"
  output_url = 's3://' + args.outputbucket + "/data/"

  os.makedirs(args.datadir)

  # first, we copy files from pachyderm into a convenient
  # local directory for processing. The files have been
  # placed into the inputpath directory in the s3path bucket.
  print("walking {} for copying files".format(input_url))
  for dirpath, dirs, files in file_io.walk(input_url, True):
    for file in files:
      uri = os.path.join(dirpath, file)
      newpath = os.path.join(args.datadir, file)
      print("copying {} to {}".format(uri, newpath))
      file_io.copy(uri, newpath, True)

  # here is where you would apply your training to the data in args.datadir
  # it might operate on the data directly, or place additional
  # data in the same directory

  # finally, we copy the output from those operations to
  # another pachyderm repo
  print("walking {} for copying to {}".format(args.datadir, output_url))
  for dirpath, dirs, files in os.walk(args.datadir, topdown=True):
    for file in files:
      uri = os.path.join(dirpath, file)
      newpath = output_url + file
      print("copying {} to {}".format(uri, newpath))
      file_io.copy(uri, newpath, True)
def _copy_all(src_files, dest_dir):
  # file_io.copy does not copy files into folders directly.
  for src_file in src_files:
    file_name = os.path.basename(src_file)
    new_file_location = os.path.join(dest_dir, file_name)
    file_io.copy(src_file, new_file_location)
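Because file_io.copy expects an explicit destination file path rather than a directory, the helper joins each source basename onto dest_dir. For example (paths illustrative):

_copy_all(['/tmp/in/a.txt', '/data/b.txt'], '/tmp/out')
# -> copies to /tmp/out/a.txt and /tmp/out/b.txt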
def test_local_bigquery_transform(self):
  """Test transform locally, but the data comes from bigquery."""
  # Make a BQ table, and insert 1 row.
  try:
    bucket_name = 'temp_pydatalab_test_%s' % uuid.uuid4().hex
    bucket_root = 'gs://%s' % bucket_name
    bucket = storage.Bucket(bucket_name)
    bucket.create()

    project_id = dl.Context.default().project_id
    dataset_name = 'test_transform_raw_data_%s' % uuid.uuid4().hex
    table_name = 'tmp_table'

    dataset = bq.Dataset((project_id, dataset_name)).create()
    table = bq.Table((project_id, dataset_name, table_name))
    table.create([{'name': 'key_col', 'type': 'INTEGER'},
                  {'name': 'target_col', 'type': 'FLOAT'},
                  {'name': 'cat_col', 'type': 'STRING'},
                  {'name': 'num_col', 'type': 'FLOAT'},
                  {'name': 'img_col', 'type': 'STRING'}])

    img1_file = os.path.join(self.source_dir, 'img1.jpg')
    dest_file = os.path.join(bucket_root, 'img1.jpg')
    file_io.copy(img1_file, dest_file)

    data = [
        {
            'key_col': 1,
            'target_col': 1.0,
            'cat_col': 'Monday',
            'num_col': 23.0,
            'img_col': dest_file,
        },
    ]
    table.insert(data=data)

    cmd = ['python ' + os.path.join(CODE_PATH, 'transform.py'),
           '--bigquery=%s.%s.%s' % (project_id, dataset_name, table_name),
           '--analysis=' + self.analysis_dir,
           '--prefix=features',
           '--project-id=' + project_id,
           '--output=' + self.output_dir]
    print('cmd ', ' '.join(cmd))
    subprocess.check_call(' '.join(cmd), shell=True)

    # Read the tf record file. There should only be one file.
    record_filepath = os.path.join(self.output_dir,
                                   'features-00000-of-00001.tfrecord.gz')
    options = tf.python_io.TFRecordOptions(
        compression_type=tf.python_io.TFRecordCompressionType.GZIP)
    serialized_examples = list(
        tf.python_io.tf_record_iterator(record_filepath, options=options))
    self.assertEqual(len(serialized_examples), 1)

    example = tf.train.Example()
    example.ParseFromString(serialized_examples[0])

    transformed_number = example.features.feature['num_col'].float_list.value[0]
    self.assertAlmostEqual(transformed_number, 23.0)
    transformed_category = example.features.feature['cat_col'].int64_list.value[0]
    self.assertEqual(transformed_category, 2)
    image_bytes = example.features.feature['img_col'].float_list.value
    self.assertEqual(len(image_bytes), 2048)
    self.assertTrue(any(x != 0.0 for x in image_bytes))
  finally:
    dataset.delete(delete_contents=True)
    for obj in bucket.objects():
      obj.delete()
    bucket.delete()
def export_fn(estimator, export_dir_base, checkpoint_path=None,
              eval_result=None):
  with ops.Graph().as_default() as g:
    contrib_variables.create_global_step(g)

    input_ops = feature_transforms.build_csv_serving_tensors(
        args.output_dir_from_analysis_step, features, schema, stats,
        keep_target)
    model_fn_ops = estimator._call_model_fn(
        input_ops.features, None, model_fn_lib.ModeKeys.INFER)
    output_fetch_tensors = make_prediction_output_tensors(
        args=args,
        features=features,
        input_ops=input_ops,
        model_fn_ops=model_fn_ops,
        keep_target=keep_target)

    # Don't use signature_def_utils.predict_signature_def as that renames
    # tensor names if there is only 1 input/output tensor!
    signature_inputs = {
        key: tf.saved_model.utils.build_tensor_info(tensor)
        for key, tensor in six.iteritems(input_ops.default_inputs)
    }
    signature_outputs = {
        key: tf.saved_model.utils.build_tensor_info(tensor)
        for key, tensor in six.iteritems(output_fetch_tensors)
    }
    signature_def_map = {
        'serving_default':
            signature_def_utils.build_signature_def(
                signature_inputs, signature_outputs,
                tf.saved_model.signature_constants.PREDICT_METHOD_NAME)
    }

    if not checkpoint_path:
      # Locate the latest checkpoint
      checkpoint_path = saver.latest_checkpoint(estimator._model_dir)
    if not checkpoint_path:
      raise ValueError("Couldn't find trained model at %s." %
                       estimator._model_dir)

    export_dir = saved_model_export_utils.get_timestamped_export_dir(
        export_dir_base)

    with tf_session.Session('') as session:
      variables.local_variables_initializer()
      data_flow_ops.tables_initializer()
      saver_for_restore = saver.Saver(variables.global_variables(),
                                      sharded=True)
      saver_for_restore.restore(session, checkpoint_path)

      init_op = control_flow_ops.group(
          variables.local_variables_initializer(),
          data_flow_ops.tables_initializer())

      # Perform the export
      builder = saved_model_builder.SavedModelBuilder(export_dir)
      builder.add_meta_graph_and_variables(
          session, [tag_constants.SERVING],
          signature_def_map=signature_def_map,
          assets_collection=ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS),
          legacy_init_op=init_op)
      builder.save(False)

    # Add the extra assets
    if assets_extra:
      assets_extra_path = os.path.join(
          compat.as_bytes(export_dir), compat.as_bytes('assets.extra'))
      for dest_relative, source in assets_extra.items():
        dest_absolute = os.path.join(
            compat.as_bytes(assets_extra_path), compat.as_bytes(dest_relative))
        dest_path = os.path.dirname(dest_absolute)
        file_io.recursive_create_dir(dest_path)
        file_io.copy(source, dest_absolute)

  # only keep the last 3 models
  saved_model_export_utils.garbage_collect_exports(
      export_dir_base, exports_to_keep=3)

  # save the last model to the model folder.
  # export_dir_base = A/B/intermediate_models/
  if keep_target:
    final_dir = os.path.join(args.job_dir, 'evaluation_model')
  else:
    final_dir = os.path.join(args.job_dir, 'model')
  if file_io.is_directory(final_dir):
    file_io.delete_recursively(final_dir)
  file_io.recursive_create_dir(final_dir)
  recursive_copy(export_dir, final_dir)

  return export_dir
def copy(cls, oldpath, newpath, overwrite=False):
  file_io.copy(oldpath, newpath, overwrite)
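A thin classmethod passthrough; overwrite is forwarded positionally, so the default (False) matches file_io.copy, and an existing destination raises AlreadyExistsError, as the tests above show. Hypothetical usage, assuming the enclosing class is named FileSystem:

FileSystem.copy('/tmp/a.txt', 'gs://my-bucket/a.txt', overwrite=True)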
def export_fn(estimator, export_dir_base, checkpoint_path=None,
              eval_result=None):
  with ops.Graph().as_default() as g:
    contrib_variables.create_global_step(g)

    input_ops = feature_transforms.build_csv_serving_tensors_for_training_step(
        args.analysis, features, schema, stats, keep_target)
    model_fn_ops = estimator._call_model_fn(
        input_ops.features, None, model_fn_lib.ModeKeys.INFER)
    output_fetch_tensors = make_prediction_output_tensors(
        args=args,
        features=features,
        input_ops=input_ops,
        model_fn_ops=model_fn_ops,
        keep_target=keep_target)

    # Don't use signature_def_utils.predict_signature_def as that renames
    # tensor names if there is only 1 input/output tensor!
    signature_inputs = {
        key: tf.saved_model.utils.build_tensor_info(tensor)
        for key, tensor in six.iteritems(input_ops.default_inputs)
    }
    signature_outputs = {
        key: tf.saved_model.utils.build_tensor_info(tensor)
        for key, tensor in six.iteritems(output_fetch_tensors)
    }
    signature_def_map = {
        'serving_default':
            signature_def_utils.build_signature_def(
                signature_inputs, signature_outputs,
                tf.saved_model.signature_constants.PREDICT_METHOD_NAME)
    }

    if not checkpoint_path:
      # Locate the latest checkpoint
      checkpoint_path = saver.latest_checkpoint(estimator._model_dir)
    if not checkpoint_path:
      raise ValueError("Couldn't find trained model at %s." %
                       estimator._model_dir)

    export_dir = saved_model_export_utils.get_timestamped_export_dir(
        export_dir_base)

    if (model_fn_ops.scaffold is not None and
        model_fn_ops.scaffold.saver is not None):
      saver_for_restore = model_fn_ops.scaffold.saver
    else:
      saver_for_restore = saver.Saver(sharded=True)

    with tf_session.Session('') as session:
      saver_for_restore.restore(session, checkpoint_path)
      init_op = control_flow_ops.group(
          variables.local_variables_initializer(),
          resources.initialize_resources(resources.shared_resources()),
          tf.tables_initializer())

      # Perform the export
      builder = saved_model_builder.SavedModelBuilder(export_dir)
      builder.add_meta_graph_and_variables(
          session, [tag_constants.SERVING],
          signature_def_map=signature_def_map,
          assets_collection=ops.get_collection(ops.GraphKeys.ASSET_FILEPATHS),
          legacy_init_op=init_op)
      builder.save(False)

    # Add the extra assets
    if assets_extra:
      assets_extra_path = os.path.join(
          compat.as_bytes(export_dir), compat.as_bytes('assets.extra'))
      for dest_relative, source in assets_extra.items():
        dest_absolute = os.path.join(
            compat.as_bytes(assets_extra_path), compat.as_bytes(dest_relative))
        dest_path = os.path.dirname(dest_absolute)
        file_io.recursive_create_dir(dest_path)
        file_io.copy(source, dest_absolute)

  # only keep the last 3 models
  saved_model_export_utils.garbage_collect_exports(
      export_dir_base, exports_to_keep=3)

  # save the last model to the model folder.
  # export_dir_base = A/B/intermediate_models/
  if keep_target:
    final_dir = os.path.join(args.job_dir, 'evaluation_model')
  else:
    final_dir = os.path.join(args.job_dir, 'model')
  if file_io.is_directory(final_dir):
    file_io.delete_recursively(final_dir)
  file_io.recursive_create_dir(final_dir)
  recursive_copy(export_dir, final_dir)

  return export_dir