def test_upload_env():
    """Check the arguments upload_env hands to the pex packer when the archive is rebuilt.

    The archive is reported as out of date, so the packer must be called once
    with the requirements (plus ``additional_packages``) and the list of
    ignored packages, and the resulting archive must be put on the filesystem.
    """
    installed_packages = [
        {"name": "a", "version": "1.0"},
        {"name": "b", "version": "2.0"},
    ]

    with contextlib.ExitStack() as stack:

        def _patch(target):
            # Patch an attribute of the module under test until the stack exits.
            return stack.enter_context(mock.patch(f"{MODULE_TO_TEST}.{target}"))

        # Mock all objects
        is_up_to_date = _patch("_is_archive_up_to_date")
        get_packages = _patch("packaging.get_non_editable_requirements")
        resolve_fs = _patch("filesystem.resolve_filesystem_and_path")
        fs = mock.MagicMock()
        resolve_fs.return_value = (fs, "")
        _patch("_dump_archive_metadata")
        _patch("shutil.rmtree")
        packer = _patch("packaging.pack_in_pex")

        # Regenerate archive
        is_up_to_date.return_value = False
        get_packages.return_value = installed_packages
        packer.return_value = MYARCHIVE_FILENAME

        # Plain call: every requirement is packed, nothing is ignored
        cluster_pack.upload_env(MYARCHIVE_FILENAME, cluster_pack.PEX_PACKER)
        packer.assert_called_once_with({"a": "1.0", "b": "2.0"}, Any(str), [])
        fs.put.assert_called_once_with(MYARCHIVE_FILENAME, MYARCHIVE_FILENAME)

        # Call with extra and ignored packages
        packer.reset_mock()
        cluster_pack.upload_env(
            MYARCHIVE_FILENAME,
            cluster_pack.PEX_PACKER,
            additional_packages={"c": "3.0"},
            ignored_packages=["a"],
        )
        packer.assert_called_once_with({"c": "3.0", "b": "2.0"}, Any(str), ["a"])
def launch_remote_check(file: str) -> Tuple[bool, str]:
    """Run ``check_hadoop_env.py`` remotely through skein and report its outcome.

    The current environment is uploaded as a pex, then a single-instance skein
    service executes ``check_hadoop_env.py --file {file}`` with that pex.

    Parameters
    ----------
    file
        Value forwarded to the remote script through its ``--file`` option.

    Returns
    -------
    A pair ``(success, app_id)``: ``success`` is True when the ``result`` key
    published by the application decodes to ``"True"``; ``app_id`` is the id
    of the submitted application.
    """
    # Local import: the original block only mentioned uuid inside a string literal,
    # so the module may not be imported at file level.
    import uuid

    logging.info('Launching remote check')
    zip_hdfs, _ = cluster_pack.upload_env(packer=cluster_pack.PEX_PACKER)
    archive_name = os.path.basename(zip_hdfs)
    with skein.Client() as client:
        files = {
            archive_name: zip_hdfs,
            'check_hadoop_env.py': __file__,
        }
        editable_packages = cluster_pack.get_editable_requirements()
        if 'tf_yarn' in editable_packages:
            # Ship the editable tf_yarn package as a zip next to the script
            tf_yarn_zip = cluster_pack.zip_path(editable_packages['tf_yarn'], False)
            logger.info(f"zip path for editable tf_yarn is {tf_yarn_zip}")
            files['tf_yarn'] = tf_yarn_zip
        service = skein.Service(
            script=f'./{archive_name} check_hadoop_env.py --file {file}',
            resources=skein.Resources(2 * 1024, 1),
            env={
                # Must be an f-string: without the prefix every run used the literal
                # directory '/tmp/{uuid.uuid4()}/' instead of a unique one.
                'PEX_ROOT': f'/tmp/{uuid.uuid4()}/',
                'PYTHONPATH': '.:',
            },
            files=files,
            instances=1)
        spec = skein.ApplicationSpec(
            {'HADOOP_ENV_CHECKER': service},
            acls=skein.model.ACLs(enable=True, view_users=['*']),
        )
        app = client.submit_and_connect(spec)
        logging.info('Remote check started')
        # Blocks until the application publishes the 'result' key
        result = app.kv.wait('result').decode()
        app_id = app.id
        app.shutdown()
        return result == "True", app_id
def main():
    """Run the experiment function once locally, then launch the experiment on YARN."""
    # Calling the experiment function here forces the call to
    # model_to_estimator._save_first_checkpoint l457
    # https://github.com/tensorflow/estimator/blob/ \
    # 1d55f01d8af871a35ef83fc3354b9feaa671cbe1/tensorflow_estimator/python/estimator/keras.py
    # otherwise there is a race condition
    # when all workers try to save the first checkpoint at the same time
    experiment_fn(HDFS_DIR)

    # Second element of the returned pair is not needed here
    env_archive, _ = cluster_pack.upload_env()
    editable_reqs = cluster_pack.get_editable_requirements()

    safe_experiment_fn = get_safe_exp_fn()
    cluster_layout = {
        "chief": TaskSpec(memory="2 GiB", vcores=4),
        "worker": TaskSpec(memory="2 GiB", vcores=4, instances=4),
        "ps": TaskSpec(memory="2 GiB", vcores=4, instances=2),
        "evaluator": TaskSpec(memory="2 GiB", vcores=1),
    }
    # Editable packages plus the two modules used by this example
    shipped_files = {
        **editable_reqs,
        os.path.basename(winequality.__file__): winequality.__file__,
        os.path.basename(__file__): __file__,
    }

    run_on_yarn(
        env_archive,
        safe_experiment_fn,
        task_specs=cluster_layout,
        files=shipped_files,
    )
def upload_env_to_hdfs(
        archive_on_hdfs: str = None,
        packer=None,
        additional_packages: Dict[str, str] = None,
        ignored_packages: Collection[str] = None) -> Tuple[str, str]:
    """Forward to ``cluster_pack.upload_env`` with the same four arguments.

    Parameters
    ----------
    archive_on_hdfs
        Passed through unchanged as the first argument.
    packer
        Passed through unchanged as the second argument.
    additional_packages
        Mapping forwarded as the third argument; a new empty dict when omitted.
    ignored_packages
        Collection forwarded as the fourth argument; a new empty list when omitted.

    Returns
    -------
    Whatever ``cluster_pack.upload_env`` returns.
    """
    # Fresh containers on every call: the former ``{}`` / ``[]`` defaults were
    # created once at definition time and shared by all calls.
    if additional_packages is None:
        additional_packages = {}
    if ignored_packages is None:
        ignored_packages = []
    return cluster_pack.upload_env(
        archive_on_hdfs, packer, additional_packages, ignored_packages)
def launch_pyspark():
    """Upload the environment, run ``env`` in a one-element Spark job and print its result."""
    from pyspark.sql import SparkSession
    import cluster_pack
    from cluster_pack.spark import spark_config_builder

    # Second element of the returned pair is not needed here
    archive, _ = cluster_pack.upload_env()

    # Session builder for yarn in client deploy mode, using the uploaded archive
    builder = SparkSession.builder.master("yarn")
    builder = builder.config("spark.submit.deployMode", "client")
    spark_config_builder.add_packaged_environment(builder, archive)

    spark_context = builder.getOrCreate().sparkContext
    # One element in one partition: ``env`` is applied exactly once
    single_item_rdd = spark_context.parallelize([1], numSlices=1)
    outputs = single_item_rdd.map(env).collect()
    print(f"pyspark result:{outputs[0]}")
def test_upload_env_in_a_pex():
    """Check upload_env when the code is reported as running from a pex.

    The local pex must be put at the requested remote path, the existing
    metadata file must be removed, and the returned env name must be 'myapp'.
    """
    local_home = '/home/j.doe'
    remote_home = '/user/j.doe'
    local_pex = f'{local_home}/myapp.pex'
    remote_pex = f'{remote_home}/blah.pex'
    remote_metadata = f"{remote_home}/blah.json"

    with contextlib.ExitStack() as stack:

        def _patch(target):
            # Patch an attribute of the module under test until the stack exits.
            return stack.enter_context(mock.patch(f"{MODULE_TO_TEST}.{target}"))

        _patch("packaging._running_from_pex").return_value = True
        _patch("packaging.get_current_pex_filepath").return_value = local_pex

        resolve_fs = _patch("filesystem.resolve_filesystem_and_path")
        fs = mock.MagicMock()
        resolve_fs.return_value = (fs, "")

        _patch("_get_archive_metadata_path").return_value = remote_metadata

        # metadata & pex already exists on fs
        fs.exists.return_value = True

        pex_info = _patch("PexInfo")

        def _from_pex(arg):
            # The local pex gets a code hash different from any other path
            code_hash = 1 if arg == local_pex else 2
            return PexInfo({"code_hash": code_hash})

        pex_info.from_pex.side_effect = _from_pex

        result = cluster_pack.upload_env(remote_pex)

        # Check copy pex to remote
        fs.put.assert_any_call(local_pex, remote_pex)
        # Check metadata has been cleaned
        fs.rm.assert_called_once_with(remote_metadata)
        # check envname
        assert result[1] == 'myapp'
def main():
    """Upload the environment and launch the experiment on YARN.

    Tasks run through the custom task module ``tf_yarn.tasks.gloo_allred_task``.
    """
    # Second element of the returned pair is not needed here
    env_archive, _ = cluster_pack.upload_env()
    editable_reqs = cluster_pack.get_editable_requirements()

    safe_experiment_fn = get_safe_exp_fn()
    # The chief is counted apart from the workers, hence HVD_SIZE - 1 worker instances
    cluster_layout = {
        "chief": TaskSpec(memory="2 GiB", vcores=4),
        "worker": TaskSpec(memory="2 GiB", vcores=4, instances=(HVD_SIZE - 1)),
        "evaluator": TaskSpec(memory="2 GiB", vcores=1),
    }
    # Editable packages plus the two modules used by this example
    shipped_files = {
        **editable_reqs,
        os.path.basename(winequality.__file__): winequality.__file__,
        os.path.basename(__file__): __file__,
    }

    run_on_yarn(
        env_archive,
        safe_experiment_fn,
        task_specs=cluster_layout,
        files=shipped_files,
        custom_task_module="tf_yarn.tasks.gloo_allred_task",
    )
import logging
import skein
import tempfile
import cluster_pack
from cluster_pack.skein import skein_config_builder, skein_launcher

if __name__ == "__main__":
    logging.basicConfig(level="INFO")

    # Second element of the returned pair is not needed here
    package_path, _ = cluster_pack.upload_env()

    with tempfile.TemporaryDirectory() as tmp_dir:
        # Files and launch script for the "skein_project.worker" module
        skein_config = skein_config_builder.build(
            module_name="skein_project.worker",
            package_path=package_path,
            tmp_dir=tmp_dir,
        )

        with skein.Client() as client:
            worker_resources = skein.model.Resources("1 GiB", 1)
            worker_service = skein.Service(
                resources=worker_resources,
                files=skein_config.files,
                script=skein_config.script,
            )
            application_spec = skein.ApplicationSpec(
                services={"service": worker_service})

            # Submit, then wait for the application to finish
            app_id = client.submit(application_spec)
            skein_launcher.wait_for_finished(client, app_id)
def main():
    """Define a Keras-based experiment, run its builder once locally, then launch it on YARN."""

    def experiment_fn() -> Experiment:
        # To mitigate issue https://github.com/tensorflow/tensorflow/issues/32159 for tf >= 1.15
        import tensorflow as tf

        def convert_to_tensor(x, y):
            x_tensor = tf.convert_to_tensor(value=list(x.values()), dtype=tf.float32)
            y_tensor = tf.convert_to_tensor(value=y, dtype=tf.int32)
            return (x_tensor, y_tensor)

        def _shuffled_batches(split):
            # Shared pipeline of both input functions: map, shuffle, batch
            dataset = winequality.get_dataset(WINE_EQUALITY_FILE, split=split)
            return dataset.map(convert_to_tensor).shuffle(1000).batch(128)

        def train_input_fn():
            return _shuffled_batches("train").repeat()

        def eval_input_fn():
            return _shuffled_batches("test")

        # Three dense layers: 300 and 100 relu units, then 10 softmax units
        model = keras.Sequential()
        first_layer = keras.layers.Dense(units=300, activation="relu", input_shape=(11, ))
        model.add(first_layer)
        second_layer = keras.layers.Dense(units=100, activation="relu")
        model.add(second_layer)
        output_layer = keras.layers.Dense(units=10, activation="softmax")
        model.add(output_layer)
        model.summary()
        model.compile(
            loss='sparse_categorical_crossentropy',
            optimizer="sgd",
            metrics=['accuracy'],
        )

        run_config = tf.estimator.RunConfig(model_dir=HDFS_DIR)
        estimator = tf.keras.estimator.model_to_estimator(model, config=run_config)

        train_spec = tf.estimator.TrainSpec(train_input_fn, max_steps=1000)
        eval_spec = tf.estimator.EvalSpec(
            eval_input_fn,
            steps=10,
            start_delay_secs=0,
            throttle_secs=30,
        )
        return Experiment(estimator, train_spec, eval_spec)

    # Calling the experiment function here forces the call to
    # model_to_estimator._save_first_checkpoint l457
    # https://github.com/tensorflow/estimator/blob/ \
    # 1d55f01d8af871a35ef83fc3354b9feaa671cbe1/tensorflow_estimator/python/estimator/keras.py
    # otherwise there is a race condition
    # when all workers try to save the first checkpoint at the same time
    experiment_fn()

    # Second element of the returned pair is not needed here
    env_archive, _ = cluster_pack.upload_env()
    editable_reqs = cluster_pack.get_editable_requirements()

    cluster_layout = {
        "chief": TaskSpec(memory="2 GiB", vcores=4),
        "worker": TaskSpec(memory="2 GiB", vcores=4, instances=4),
        "ps": TaskSpec(memory="2 GiB", vcores=4, instances=2),
        "evaluator": TaskSpec(memory="2 GiB", vcores=1),
    }
    shipped_files = {
        **editable_reqs,
        os.path.basename(winequality.__file__): winequality.__file__,
    }

    run_on_yarn(
        env_archive,
        experiment_fn,
        task_specs=cluster_layout,
        files=shipped_files,
    )
def test_upload_env_should_throw_error_if_wrong_extension():
    """upload_env must raise ValueError for 'myarchive.tar.gz' with the conda packer."""
    archive_name = "myarchive.tar.gz"
    with pytest.raises(ValueError):
        cluster_pack.upload_env(archive_name, packer=cluster_pack.CONDA_PACKER)
n_classes=winequality.get_n_classes(), optimizer=lambda: hvd.DistributedOptimizer(tf.train.AdamOptimizer())) return Experiment( estimator, tf.estimator.TrainSpec(train_input_fn, max_steps=10, hooks=[hvd.BroadcastGlobalVariablesHook(0)]), tf.estimator.EvalSpec(eval_input_fn, steps=10, start_delay_secs=0, throttle_secs=30)) if __name__ == "__main__": pyenv_zip_path, _ = cluster_pack.upload_env() editable_requirements = cluster_pack.get_editable_requirements() run_on_yarn(pyenv_zip_path, experiment_fn, task_specs={ "chief": TaskSpec(memory="2 GiB", vcores=4), "worker": TaskSpec(memory="2 GiB", vcores=4, instances=1), "evaluator": TaskSpec(memory="2 GiB", vcores=1), "tensorboard": TaskSpec(memory="2 GiB", vcores=1, tb_model_dir=HDFS_DIR) }, files={
return (dataset.shuffle(1000).batch(128)) estimator = tf.estimator.LinearClassifier( feature_columns=winequality.get_feature_columns(), model_dir=HDFS_DIR, n_classes=winequality.get_n_classes()) return Experiment( estimator, tf.estimator.TrainSpec(train_input_fn, max_steps=100), tf.estimator.EvalSpec(eval_input_fn, steps=10, start_delay_secs=0, throttle_secs=30)) if __name__ == "__main__": pyenv_zip_path, env_name = cluster_pack.upload_env() editable_requirements = cluster_pack.get_editable_requirements() run_on_yarn(pyenv_zip_path, experiment_fn, task_specs={ "chief": TaskSpec(memory="2 GiB", vcores=4), "evaluator": TaskSpec(memory="2 GiB", vcores=1), "tensorboard": TaskSpec(memory="2 GiB", vcores=1, tb_model_dir=HDFS_DIR) }, files={ **editable_requirements, os.path.basename(winequality.__file__): winequality.__file__,
def run_on_yarn(
    experiment_fn: ExperimentFn,
    task_specs: Dict[str, topologies.TaskSpec],
    *,
    pyenv_zip_path: Union[str, Dict[topologies.NodeLabel, str]] = None,
    skein_client: skein.Client = None,
    files: Dict[str, str] = None,
    env: Optional[Dict[str, str]] = None,
    queue: str = "default",
    acls: ACLs = _default_acls_all_access(),
    file_systems: List[str] = None,
    eval_monitor_log_thresholds: Dict[str, Tuple[float, float]] = None,
    nb_retries: int = 0,
    custom_task_module: Optional[str] = None,
    name: str = "RunOnYarn",
    pre_script_hook: Optional[str] = None
) -> Optional[metrics.Metrics]:
    """Run an experiment on YARN.

    The implementation allocates a service with the requested number
    of instances for each distributed task type. Each instance runs
    ``_dispatch_task`` which roughly does the following.

    1. Reserve a TCP port and communicate the resulting socket address
       (host/port pair) to other instances using the "init" barrier.
    2. Spawn ``train_and_evaluate`` in a separate thread.
    3. Synchronize the "ps" tasks on the "stop" barrier.
       The barrier compensates for the fact that "ps" tasks never
       terminate, and therefore should be killed once all other
       tasks are finished.

    Parameters
    ----------
    experiment_fn
        A function constructing the estimator alongside the train
        and eval specs.

    task_specs
        Resources to allocate for each task type. The keys
        must be a subset of ``"chief"``, ``"worker"``, ``"ps"``, and
        ``"evaluator"``. The minimal spec must contain at least
        ``"chief"``.

    pyenv_zip_path
        Path to an archive of a python environment to be deployed
        It can be a zip conda env or a pex archive
        In case of GPU/CPU cluster, provide a dictionary with both
        environments. If none is provided, the current python environment
        will be packaged in a pex

    skein_client
        Skein client used to submit yarn jobs

    files
        Local files or directories to upload to the container.
        The keys are the target locations of the resources relative
        to the container root, while the values -- their
        corresponding local sources. Note that container root is
        appended to ``PYTHONPATH``. Therefore, any listed Python
        module or package is automatically importable.

    env
        Environment variables to forward to the containers.
        Optional, defaults to an empty mapping.

    queue
        YARN queue to use.

    acls
        Configures the application-level Access Control Lists (ACLs).
        Optional, defaults to ACLs all access.

        See `ACLs <https://jcrist.github.io/skein/specification.html#acls>` for details.

    file_systems
        A list of namenode URIs to acquire delegation tokens for
        in addition to ``fs.defaultFS``.

    eval_monitor_log_thresholds
        optional dictionary of string to (float 1, float 2).
        Each couple (key, value) corresponds to an evaluation
        monitored metric and an associated range. The evaluation monitored
        metric is logged if it is in [float 1; float 2]. If the lower bound is None
        it is set to 0. If the upper bound is None, it is set to maximum value
        A monitored metric with no range is always logged. List of monitored metrics:
        'awake_time_ratio': 'Awake/idle ratio',
        'eval_step_mean_duration': 'Eval step mean duration (in sec)',
        'last_training_step': 'Training set of last checkpoint',
        'nb_eval_steps': 'Number of evaluation steps done'

    nb_retries
        Number of times the yarn application is retried in case of failures

    custom_task_module
        Provide the full module name of a custom task that is executed on each worker
        None by default
        (Module will be invoked with python -m {custom_task_module} on the cluster)
        Only for advanced use cases, can be useful for example,
        to bypass/tweak the existing estimator.train_and_evaluate pattern

    name
        Name of the yarn application

    pre_script_hook
        bash command to prepare Hadoop environment

    Raises
    ------
    ValueError
        If ``nb_retries`` is negative.
    RunFailed
        If the final status of the YARN application is ``"FAILED"``.
    """
    # Validate first so that an invalid call fails before anything is
    # packaged or uploaded.
    if nb_retries < 0:
        raise ValueError(f'nb_retries must be greater or equal to 0. Got {nb_retries}')

    # Fresh dict per call: a ``{}`` default would be shared by every call.
    if env is None:
        env = {}

    updated_files = _add_editable_requirements(files)
    # Package and upload the current environment only when no archive was given
    _pyenv_zip_path = pyenv_zip_path if pyenv_zip_path else cluster_pack.upload_env()[0]
    pyenvs = _setup_pyenvs(_pyenv_zip_path)

    n_try = 0
    while True:
        try:
            skein_cluster = _setup_skein_cluster(
                pyenvs=pyenvs,
                task_specs=task_specs,
                skein_client=skein_client,
                files=updated_files,
                env=env,
                queue=queue,
                acls=acls,
                file_systems=file_systems,
                name=name,
                n_try=n_try,
                custom_task_module=custom_task_module,
                pre_script_hook=pre_script_hook)
            with _shutdown_on_exception(skein_cluster.app):
                _setup_cluster_spec(skein_cluster.tasks, skein_cluster.app)

                return _run_on_cluster(experiment_fn, skein_cluster,
                                       eval_monitor_log_thresholds, n_try)
        except Exception:
            n_try += 1
            # One initial attempt plus nb_retries retries; re-raise once exhausted
            if n_try == nb_retries + 1:
                raise
            logger.exception(f"Retrying user application ... "
                             f"{nb_retries + 1 - n_try} remaining attempts")

    # Necessary for type checking
    return None
def main():
    """Define a KerasExperiment and launch it on YARN.

    Tasks run through the custom task module ``tf_yarn.tasks.gloo_allred_task``.
    """

    def experiment_fn() -> KerasExperiment:
        def convert_to_tensor(x, y):
            x_tensor = tf.convert_to_tensor(value=list(x.values()), dtype=tf.float32)
            y_tensor = tf.convert_to_tensor(value=y, dtype=tf.int32)
            return (x_tensor, y_tensor)

        def _shuffled_batches(split):
            # Shared pipeline of both data functions: map, shuffle, batch
            dataset = winequality.get_dataset(WINE_EQUALITY_FILE, split=split)
            return dataset.map(convert_to_tensor).shuffle(1000).batch(128)

        def input_data_fn():
            return _shuffled_batches("train").repeat()

        def validation_data_fn():
            return _shuffled_batches("test")

        # Three dense layers: 300 and 100 relu units, then 10 softmax units
        model = tf.keras.Sequential()
        first_layer = tf.keras.layers.Dense(
            units=300, activation="relu", input_shape=(11, ))
        model.add(first_layer)
        second_layer = tf.keras.layers.Dense(units=100, activation="relu")
        model.add(second_layer)
        output_layer = tf.keras.layers.Dense(units=10, activation="softmax")
        model.add(output_layer)
        model.summary()

        # Adadelta constructed with 1.0 * HVD_SIZE, wrapped by horovod
        optimizer = hvd.DistributedOptimizer(
            tf.keras.optimizers.Adadelta(1.0 * HVD_SIZE))
        model.compile(
            loss='sparse_categorical_crossentropy',
            optimizer=optimizer,
            metrics=['accuracy'],
        )

        # "{epoch}" is left as a literal placeholder in the checkpoint path
        path_to_checkpoint = f"{HDFS_DIR}" + "/checkpoint-{epoch}"
        checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(path_to_checkpoint)
        broadcast_callback = hvd.keras.callbacks.BroadcastGlobalVariablesCallback(0)
        train_params = {
            "steps_per_epoch": 1000,
            "callbacks": [checkpoint_callback, broadcast_callback],
        }

        return KerasExperiment(
            model=model,
            model_dir=HDFS_DIR,
            train_params=train_params,
            input_data_fn=input_data_fn,
            target_data_fn=None,
            validation_data_fn=validation_data_fn,
        )

    # Second element of the returned pair is not needed here
    env_archive, _ = cluster_pack.upload_env()
    editable_reqs = cluster_pack.get_editable_requirements()

    # The chief is counted apart from the workers, hence HVD_SIZE - 1 worker instances
    cluster_layout = {
        "chief": TaskSpec(memory="2 GiB", vcores=4),
        "worker": TaskSpec(memory="2 GiB", vcores=4, instances=(HVD_SIZE - 1)),
        "evaluator": TaskSpec(memory="2 GiB", vcores=1),
    }
    shipped_files = {
        **editable_reqs,
        os.path.basename(winequality.__file__): winequality.__file__,
    }

    run_on_yarn(
        env_archive,
        experiment_fn,
        task_specs=cluster_layout,
        files=shipped_files,
        custom_task_module="tf_yarn.tasks.gloo_allred_task",
    )
logging.basicConfig(level="INFO") _logger = logging.getLogger(__name__) if __name__ == "__main__": # use local minio S3 instance # allowed parameters are here: # https://s3fs.readthedocs.io/en/latest/api.html#s3fs.core.S3FileSystem s3_args = { "use_ssl": False, "client_kwargs": { 'endpoint_url': "http://*****:*****@pandas_udf("double", PandasUDFType.GROUPED_AGG) def mean_udf(v: pd.Series) -> float: