def download_dir_contents(self, prefix, local_dir):
    util.mkdir_p(local_dir)
    prefix = util.ensure_suffix(prefix, "/")
    for key in self._get_matching_s3_keys_generator(prefix):
        rel_path = util.trim_prefix(key, prefix)
        local_dest_path = os.path.join(local_dir, rel_path)
        self.download_file(key, local_dest_path)
def _get_dir(self, prefix, local_dir):
    prefix = util.add_suffix_unless_present(prefix, "/")
    util.mkdir_p(local_dir)
    for key in self._get_matching_s3_keys_generator(prefix):
        rel_path = util.remove_prefix_if_present(key, prefix)
        local_dest_path = os.path.join(local_dir, rel_path)
        self.download_file(key, local_dest_path)
def download_dir_contents(self, prefix: str, local_dir: str):
    util.mkdir_p(local_dir)
    prefix = util.ensure_suffix(prefix, "/")
    for blob in self.gcs.list_blobs(prefix=prefix):
        if blob.name.endswith("/"):
            continue  # skip directory placeholder objects
        relative_path = util.trim_prefix(blob.name, prefix)
        local_dest_path = os.path.join(local_dir, relative_path)
        self.download_file(blob.name, local_dest_path)
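# Hypothetical usage of download_dir_contents (the bucket contents and paths
# below are illustrative only): if the bucket holds
# "models/linear/1/saved_model.pb", then
#
#   storage.download_dir_contents("models/linear", "/tmp/linear")
#
# trims the "models/linear/" prefix and writes /tmp/linear/1/saved_model.pb.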
def download_file(self, key, local_path):
    util.mkdir_p(os.path.dirname(local_path))
    try:
        self.s3.download_file(self.bucket, key, local_path)
        return local_path
    except Exception as e:
        raise CortexException(
            'key "{}" in bucket "{}" could not be accessed; '.format(key, self.bucket)
            + "it may not exist, or you may not have sufficient permissions"
        ) from e
def download_file_external(self, s3_path, local_path):
    util.mkdir_p(os.path.dirname(local_path))
    bucket, key = self.deconstruct_s3_path(s3_path)
    try:
        self.s3.download_file(bucket, key, local_path)
        return local_path
    except Exception as e:
        raise CortexException(
            'key "{}" in bucket "{}" could not be accessed; '.format(key, bucket)
            + "it may not exist, or you may not have sufficient permissions"
        ) from e
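# A minimal sketch of the deconstruct_s3_path helper used above, which is not
# shown in this excerpt. It is assumed to split an "s3://bucket/key" path into
# a (bucket, key) tuple, and to be a static method (it is called elsewhere as
# S3.deconstruct_s3_path); the real implementation may differ.
@staticmethod
def deconstruct_s3_path(s3_path):
    path = util.trim_prefix(s3_path, "s3://")
    bucket, _, key = path.partition("/")
    return bucket, key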
def download_file(self, key: str, local_path: str):
    """
    Download file to the specified local path.
    """
    blob = self.gcs.get_blob(blob_name=key)
    if not isinstance(blob, storage.blob.Blob):
        raise CortexException(f'key "{key}" in bucket "{self.gcs.name}" not found')
    util.mkdir_p(os.path.dirname(local_path))
    try:
        blob.download_to_filename(local_path)
    except gexp.NotFound:
        raise CortexException(f'key "{key}" in bucket "{self.gcs.name}" not found')
def __init__(self, **kwargs):
    if "cache_dir" in kwargs:
        self.cache_dir = kwargs["cache_dir"]
    elif "local_path" in kwargs:
        local_path_dir = os.path.dirname(os.path.abspath(kwargs["local_path"]))
        self.cache_dir = os.path.join(local_path_dir, "cache")
    else:
        raise ValueError("cache_dir must be specified (or inferred from local_path)")
    util.mkdir_p(self.cache_dir)

    if "local_path" in kwargs:
        self.ctx = util.read_msgpack(kwargs["local_path"])
    elif "obj" in kwargs:
        self.ctx = kwargs["obj"]
    elif "raw_obj" in kwargs:
        self.ctx = kwargs["raw_obj"]
    elif "s3_path" in kwargs:
        local_ctx_path = os.path.join(self.cache_dir, "context.msgpack")
        bucket, key = S3.deconstruct_s3_path(kwargs["s3_path"])
        S3(bucket, client_config={}).download_file(key, local_ctx_path)
        self.ctx = util.read_msgpack(local_ctx_path)
    else:
        raise ValueError("invalid context args: " + str(kwargs))

    self.workload_id = kwargs.get("workload_id")

    self.id = self.ctx["id"]
    self.key = self.ctx["key"]
    self.metadata_root = self.ctx["metadata_root"]
    self.cortex_config = self.ctx["cortex_config"]
    self.deployment_version = self.ctx["deployment_version"]
    self.root = self.ctx["root"]
    self.status_prefix = self.ctx["status_prefix"]
    self.app = self.ctx["app"]
    self.apis = self.ctx["apis"] or {}
    self.api_version = self.cortex_config["api_version"]
    self.monitoring = None
    self.project_id = self.ctx["project_id"]
    self.project_key = self.ctx["project_key"]

    if "local_storage_path" in kwargs:
        self.storage = LocalStorage(base_dir=kwargs["local_storage_path"])
    else:
        self.storage = S3(
            bucket=self.cortex_config["bucket"],
            region=self.cortex_config["region"],
            client_config={},
        )

    host_ip = os.environ["HOST_IP"]
    datadog.initialize(statsd_host=host_ip, statsd_port=8125)
    self.statsd = datadog.statsd

    if self.api_version != consts.CORTEX_VERSION:
        raise ValueError(
            "API version mismatch (Context: {}, Image: {})".format(
                self.api_version, consts.CORTEX_VERSION
            )
        )

    # This affects TensorFlow S3 access
    os.environ["AWS_REGION"] = self.cortex_config.get("region", "")

    # ID maps
    self.apis_id_map = ResourceMap(self.apis) if self.apis else None
    self.id_map = self.apis_id_map
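# Hypothetical construction of this Context (the bucket, key, and workload ID
# below are illustrative only, not values from this codebase):
#
#   ctx = Context(
#       s3_path="s3://my-bucket/apps/my-app/context.msgpack",
#       cache_dir="/mnt/context",
#       workload_id="abc123",
#   )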
def download_and_unzip_external(self, s3_path, local_dir):
    util.mkdir_p(local_dir)
    local_zip = os.path.join(local_dir, "zip.zip")
    self.download_file_external(s3_path, local_zip)
    util.extract_zip(local_zip, delete_zip_file=True)
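# Hypothetical usage (the path and directory are illustrative only): download
# a zipped archive from S3 and expand it into local_dir, deleting the zip
# after extraction:
#
#   storage.download_and_unzip_external("s3://my-bucket/project.zip", "/mnt/project")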
def __init__(self, **kwargs):
    if "cache_dir" in kwargs:
        self.cache_dir = kwargs["cache_dir"]
    elif "local_path" in kwargs:
        local_path_dir = os.path.dirname(os.path.abspath(kwargs["local_path"]))
        self.cache_dir = os.path.join(local_path_dir, "cache")
    else:
        raise ValueError("cache_dir must be specified (or inferred from local_path)")
    util.mkdir_p(self.cache_dir)

    if "local_path" in kwargs:
        ctx_raw = util.read_msgpack(kwargs["local_path"])
        self.ctx = _deserialize_raw_ctx(ctx_raw)
    elif "obj" in kwargs:
        self.ctx = kwargs["obj"]
    elif "raw_obj" in kwargs:
        ctx_raw = kwargs["raw_obj"]
        self.ctx = _deserialize_raw_ctx(ctx_raw)
    elif "s3_path" in kwargs:
        local_ctx_path = os.path.join(self.cache_dir, "context.msgpack")
        bucket, key = S3.deconstruct_s3_path(kwargs["s3_path"])
        S3(bucket, client_config={}).download_file(key, local_ctx_path)
        ctx_raw = util.read_msgpack(local_ctx_path)
        self.ctx = _deserialize_raw_ctx(ctx_raw)
    else:
        raise ValueError("invalid context args: " + str(kwargs))

    self.workload_id = kwargs.get("workload_id")

    self.id = self.ctx["id"]
    self.key = self.ctx["key"]
    self.cortex_config = self.ctx["cortex_config"]
    self.dataset_version = self.ctx["dataset_version"]
    self.root = self.ctx["root"]
    self.raw_dataset = self.ctx["raw_dataset"]
    self.status_prefix = self.ctx["status_prefix"]
    self.app = self.ctx["app"]
    self.environment = self.ctx["environment"]
    self.python_packages = self.ctx["python_packages"] or {}
    self.raw_columns = self.ctx["raw_columns"] or {}
    self.transformed_columns = self.ctx["transformed_columns"] or {}
    self.transformers = self.ctx["transformers"] or {}
    self.aggregators = self.ctx["aggregators"] or {}
    self.aggregates = self.ctx["aggregates"] or {}
    self.constants = self.ctx["constants"] or {}
    self.models = self.ctx["models"] or {}
    self.estimators = self.ctx["estimators"] or {}
    self.apis = self.ctx["apis"] or {}

    self.training_datasets = {k: v["dataset"] for k, v in self.models.items()}
    self.api_version = self.cortex_config["api_version"]

    if "local_storage_path" in kwargs:
        self.storage = LocalStorage(base_dir=kwargs["local_storage_path"])
    else:
        self.storage = S3(
            bucket=self.cortex_config["bucket"],
            region=self.cortex_config["region"],
            client_config={},
        )

    if self.api_version != consts.CORTEX_VERSION:
        raise ValueError(
            "API version mismatch (Context: {}, Image: {})".format(
                self.api_version, consts.CORTEX_VERSION
            )
        )

    self.columns = util.merge_dicts_overwrite(self.raw_columns, self.transformed_columns)

    self.raw_column_names = list(self.raw_columns.keys())
    self.transformed_column_names = list(self.transformed_columns.keys())
    self.column_names = list(self.columns.keys())

    # Internal caches
    self._transformer_impls = {}
    self._aggregator_impls = {}
    self._estimator_impls = {}
    self._metadatas = {}
    self._obj_cache = {}
    self.spark_uploaded_impls = {}

    # This affects TensorFlow S3 access
    os.environ["AWS_REGION"] = self.cortex_config.get("region", "")

    # ID maps
    self.pp_id_map = ResourceMap(self.python_packages) if self.python_packages else None
    self.rf_id_map = ResourceMap(self.raw_columns) if self.raw_columns else None
    self.ag_id_map = ResourceMap(self.aggregates) if self.aggregates else None
    self.tf_id_map = ResourceMap(self.transformed_columns) if self.transformed_columns else None
    self.td_id_map = ResourceMap(self.training_datasets) if self.training_datasets else None
    self.models_id_map = ResourceMap(self.models) if self.models else None
    self.apis_id_map = ResourceMap(self.apis) if self.apis else None
    self.constants_id_map = ResourceMap(self.constants) if self.constants else None

    self.id_map = util.merge_dicts_overwrite(
        self.pp_id_map,
        self.rf_id_map,
        self.ag_id_map,
        self.tf_id_map,
        self.td_id_map,
        self.models_id_map,
        self.apis_id_map,
        self.constants_id_map,
    )
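# Note on the merge above: util.merge_dicts_overwrite is assumed to apply
# later arguments on top of earlier ones (and to tolerate None entries, since
# any of the per-resource ID maps may be None). Under that assumption, a
# resource ID present in two maps resolves to the later map's entry, e.g.:
#
#   util.merge_dicts_overwrite({"a": 1}, None, {"a": 2})  # -> {"a": 2}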
def train(model_name, estimator_impl, ctx, model_dir):
    model = ctx.models[model_name]

    util.mkdir_p(model_dir)
    util.rm_dir(model_dir)

    tf_lib.set_logging_verbosity(ctx.environment["log_level"]["tensorflow"])

    run_config = tf.estimator.RunConfig(
        tf_random_seed=model["training"]["tf_random_seed"],
        save_summary_steps=model["training"]["save_summary_steps"],
        save_checkpoints_secs=model["training"]["save_checkpoints_secs"],
        save_checkpoints_steps=model["training"]["save_checkpoints_steps"],
        log_step_count_steps=model["training"]["log_step_count_steps"],
        keep_checkpoint_max=model["training"]["keep_checkpoint_max"],
        keep_checkpoint_every_n_hours=model["training"]["keep_checkpoint_every_n_hours"],
        model_dir=model_dir,
    )

    train_input_fn = generate_input_fn(model_name, ctx, "training", estimator_impl)
    eval_input_fn = generate_input_fn(model_name, ctx, "evaluation", estimator_impl)
    serving_input_fn = generate_json_serving_input_fn(model_name, ctx, estimator_impl)
    exporter = tf.estimator.FinalExporter("estimator", serving_input_fn, as_text=False)

    train_num_steps = model["training"]["num_steps"]
    dataset_metadata = ctx.get_metadata(model["dataset"]["id"])
    if model["training"]["num_epochs"]:
        train_num_steps = (
            math.ceil(dataset_metadata["training_size"] / float(model["training"]["batch_size"]))
            * model["training"]["num_epochs"]
        )

    train_spec = tf.estimator.TrainSpec(train_input_fn, max_steps=train_num_steps)

    eval_num_steps = model["evaluation"]["num_steps"]
    if model["evaluation"]["num_epochs"]:
        eval_num_steps = (
            math.ceil(dataset_metadata["eval_size"] / float(model["evaluation"]["batch_size"]))
            * model["evaluation"]["num_epochs"]
        )

    eval_spec = tf.estimator.EvalSpec(
        eval_input_fn,
        steps=eval_num_steps,
        exporters=[exporter],
        name="estimator-eval",
        start_delay_secs=model["evaluation"]["start_delay_secs"],
        throttle_secs=model["evaluation"]["throttle_secs"],
    )

    model_config = ctx.model_config(model_name)

    try:
        tf_estimator = estimator_impl.create_estimator(run_config, model_config)
    except Exception as e:
        raise UserRuntimeException("model " + model_name) from e

    target_col_name = util.get_resource_ref(model["target_column"])
    if ctx.get_inferred_column_type(target_col_name) == consts.COLUMN_TYPE_FLOAT:
        tf_estimator = tf.contrib.estimator.add_metrics(tf_estimator, get_regression_eval_metrics)

    tf.estimator.train_and_evaluate(tf_estimator, train_spec, eval_spec)

    return model_dir
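# Worked example of the epochs-to-steps conversion above, with illustrative
# numbers: training_size=1000, batch_size=64, num_epochs=3 gives
# train_num_steps = ceil(1000 / 64) * 3 = 16 * 3 = 48.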