Example #1
def _execute_and_await_termination(
        skein_cluster: SkeinCluster,
        serialized_fn: bytes,
        eval_monitor_log_thresholds: Optional[Dict[str, Tuple[float, float]]] = None,
        n_try: int = 0,
        poll_every_secs: int = 10) -> Optional[metrics.Metrics]:
    skein_cluster.app.kv[constants.KV_EXPERIMENT_FN] = serialized_fn
    eval_metrics_logger = evaluator_metrics.EvaluatorMetricsLogger([
        task for task in _internal.iter_tasks(skein_cluster.tasks)
        if task.startswith('evaluator')
    ], skein_cluster.app, eval_monitor_log_thresholds)

    tensorboard_url_event_name = tensorboard.url_event_name(
        _internal.iter_tasks(skein_cluster.tasks))
    tensorboard_url_logger = metrics.OneShotMetricsLogger(
        skein_cluster.app,
        [(tensorboard_url_event_name,
          tensorboard.URL_EVENT_LABEL)] if tensorboard_url_event_name else [],
        n_try)

    state = None
    while True:
        report = skein_cluster.client.application_report(skein_cluster.app.id)
        logger.info(
            f"Application report for {skein_cluster.app.id} (state: {report.state})"
        )
        if state != report.state:
            logger.info(_format_app_report(report))

        if report.final_status != "undefined":
            skein_cluster.event_listener.join()
            log_events, result_metrics, container_status = _handle_events(
                skein_cluster.events, n_try)
            logger.info(log_events)

            containers = container_status.by_container_id()
            # add one for AM container
            wait_for_nb_logs = sum(
                instances for _, instances in skein_cluster.tasks) + 1
            logs = _get_app_logs(skein_cluster.client, skein_cluster.app,
                                 wait_for_nb_logs)
            _save_logs_to_mlflow(logs, containers, n_try)

            if report.final_status == "failed":
                raise RunFailed
            else:
                break
        else:
            eval_metrics_logger.log()
            tensorboard_url_logger.log()
        time.sleep(poll_every_secs)
        state = report.state

    result_metrics.log_mlflow(n_try)
    return result_metrics
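
The `wait_for_nb_logs` arithmetic above is easy to check in isolation: one log set per task instance, plus one for the YARN ApplicationMaster container. A standalone sketch (the task list is invented for illustration):

# Illustrative task list, not from the project.
tasks = [("chief", 1), ("worker", 4), ("evaluator", 1)]
wait_for_nb_logs = sum(instances for _, instances in tasks) + 1
assert wait_for_nb_logs == 7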
Example #2
File: client.py Project: rom1504/tf-yarn
def _setup_cluster_spec(
    task_instances: List[Tuple[str, int]],
    app: skein.ApplicationClient
) -> tf.train.ClusterSpec:
    tasks_not_in_cluster = ['evaluator', 'tensorboard']
    cluster_instances = [t for t in task_instances if t[0] not in tasks_not_in_cluster]
    app.kv[constants.KV_CLUSTER_INSTANCES] = json.dumps(cluster_instances).encode()
    return tf.train.ClusterSpec(
        cluster.aggregate_spec(app, list(_internal.iter_tasks(cluster_instances)))
    )
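
A quick standalone check of the filtering step above, with an invented task list; only task types outside `tasks_not_in_cluster` make it into the cluster spec:

task_instances = [("chief", 1), ("worker", 4), ("evaluator", 1), ("tensorboard", 1)]
tasks_not_in_cluster = ["evaluator", "tensorboard"]
cluster_instances = [t for t in task_instances if t[0] not in tasks_not_in_cluster]
assert cluster_instances == [("chief", 1), ("worker", 4)]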
Example #3
def _prepare_container(
) -> Tuple[skein.ApplicationClient, Dict[str, List[str]], List[str]]:
    tf.logging.info("Python " + sys.version)
    tf.logging.info("Skein " + skein.__version__)
    tf.logging.info(f"TensorFlow {tf.GIT_VERSION} {tf.VERSION}")
    client = skein.ApplicationClient.from_current()
    _setup_container_logs(client)
    cluster_tasks = list(
        iter_tasks(json.loads(client.kv.wait(KV_CLUSTER_INSTANCES).decode())))
    cluster_spec = cluster.start_cluster(client, cluster_tasks)
    return client, cluster_spec, cluster_tasks
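
`iter_tasks` itself is not shown on this page. A plausible sketch of its behavior, assuming it expands (type, count) pairs into per-instance "type:index" names, which would be consistent with the `task.startswith('evaluator')` check in Example #1:

from typing import Iterator, List, Tuple

def iter_tasks(task_instances: List[Tuple[str, int]]) -> Iterator[str]:
    # Assumed behavior: yield one "type:index" name per requested instance.
    for task_type, count in task_instances:
        for task_id in range(count):
            yield f"{task_type}:{task_id}"

assert list(iter_tasks([("worker", 2), ("chief", 1)])) == ["worker:0", "worker:1", "chief:0"]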
Example #4
def _setup_cluster_spec(
    task_instances: List[Tuple[str, int]],
    app: skein.ApplicationClient,
    standalone_client_mode: bool
) -> tf.train.ClusterSpec:
    tasks_not_in_cluster = ['evaluator', 'tensorboard']
    # In standalone client mode the chief is also not part of the cluster
    if standalone_client_mode:
        tasks_not_in_cluster.append('chief')
    cluster_instances = [t for t in task_instances if t[0] not in tasks_not_in_cluster]
    app.kv[constants.KV_CLUSTER_INSTANCES] = json.dumps(cluster_instances).encode()
    return tf.train.ClusterSpec(
        cluster.aggregate_spec(app, list(_internal.iter_tasks(cluster_instances)))
    )
Example #5
def test__prepare_container():
    with contextlib.ExitStack() as stack:
        # mock modules
        mocked_client_call = stack.enter_context(
            patch(f"{MODULE_TO_TEST}.skein.ApplicationClient.from_current"))
        mocked_logs = stack.enter_context(patch(f'{MODULE_TO_TEST}._setup_container_logs'))
        mocked_cluster_spec = stack.enter_context(patch(f'{MODULE_TO_TEST}.cluster.start_cluster'))

        # fill client mock
        mocked_client = mock.MagicMock(spec=skein.ApplicationClient)
        host_port = ('localhost', 1234)
        instances = [('worker', 10), ('chief', 1)]
        mocked_client.kv.wait.return_value = json.dumps(instances).encode()
        mocked_client_call.return_value = mocked_client
        (client, cluster_spec, cluster_tasks) = _prepare_container(host_port)

        # checks
        mocked_logs.assert_called_once()
        mocked_cluster_spec.assert_called_once_with(host_port, mocked_client, cluster_tasks)
        assert client == mocked_client
        assert cluster_tasks == list(iter_tasks(instances))
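
The test leans on contextlib.ExitStack to manage a variable number of patches in a single with-block; they are all unwound together, in reverse order of entry. A minimal self-contained sketch of the pattern (the patch targets are chosen only for illustration):

import contextlib
import json
from unittest.mock import patch

with contextlib.ExitStack() as stack:
    stack.enter_context(patch("json.dumps", return_value="{}"))
    stack.enter_context(patch("json.loads", return_value={}))
    assert json.dumps({"a": 1}) == "{}" and json.loads("{}") == {}
# Both patches are undone here, in reverse order of entry.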
Example #6
File: client.py Project: rom1504/tf-yarn
def _setup_skein_cluster(
        pyenvs: Dict[topologies.NodeLabel, _env.PythonEnvDescription],
        task_specs: Dict[str, topologies.TaskSpec] = TASK_SPEC_NONE,
        *,
        custom_task_module: Optional[str] = None,
        skein_client: Optional[skein.Client] = None,
        files: Optional[Dict[str, str]] = None,
        env: Dict[str, str] = {},
        queue: str = "default",
        acls: Optional[ACLs] = None,
        file_systems: Optional[List[str]] = None,
        name: str = "RunOnYarn",
        n_try: int = 0,
        pre_script_hook: Optional[str] = None
) -> SkeinCluster:
    os.environ["JAVA_TOOL_OPTIONS"] = \
        "-XX:ParallelGCThreads=1 -XX:CICompilerCount=2 "\
        f"{os.environ.get('JAVA_TOOL_OPTIONS', '')}"

    pre_script_hook = pre_script_hook if pre_script_hook else ""
    with tempfile.TemporaryDirectory() as tempdir:
        task_files, task_env = _setup_task_env(tempdir, files, env, n_try)
        services = {}
        for task_type, task_spec in list(task_specs.items()):
            pyenv = pyenvs[task_spec.label]
            service_env = task_env.copy()
            if task_spec.tb_termination_timeout_seconds >= 0:
                service_env["TB_TERMINATION_TIMEOUT_SECONDS"] = \
                    str(task_spec.tb_termination_timeout_seconds)
            if task_spec.tb_model_dir:
                service_env["TB_MODEL_DIR"] = str(task_spec.tb_model_dir)
            if task_spec.tb_extra_args:
                service_env["TB_EXTRA_ARGS"] = str(task_spec.tb_extra_args)

            services[task_type] = skein.Service(
                script=f'''
                            set -x
                            {pre_script_hook}
                            {_env.gen_task_cmd(
                                pyenv,
                                task_type,
                                custom_task_module)}
                        ''',
                resources=skein.model.Resources(task_spec.memory, task_spec.vcores),
                max_restarts=0,
                instances=task_spec.instances,
                node_label=task_spec.label.value,
                files={
                    **task_files,
                    pyenv.dest_path: pyenv.path_to_archive
                },
                env=service_env)

        # on the cluster we don't ask again for delegation tokens
        if "HADOOP_TOKEN_FILE_LOCATION" in os.environ:
            file_systems = None

        spec = skein.ApplicationSpec(
            services,
            queue=queue,
            acls=acls,
            file_systems=file_systems,
            name=name
        )

        if skein_client is None:
            skein_client = skein.Client()

        task_instances = [(task_type, spec.instances) for task_type, spec in task_specs.items()]
        events: Dict[str, Dict[str, str]] = \
            {task: {} for task in _internal.iter_tasks(task_instances)}
        app = skein_client.submit_and_connect(spec)

        # Start a thread which collects all events posted by all tasks in kv store
        event_listener = Thread(target=_aggregate_events, args=(app.kv, events))
        event_listener.start()

        return SkeinCluster(skein_client, app, task_instances, event_listener, events)
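
The thread started at the end drains task events into the shared `events` dict while the application runs. A stripped-down sketch of that listener pattern, with a hypothetical consume() standing in for _aggregate_events (the real helper reads from the skein key-value store):

from threading import Thread

def consume(source, sink):
    # Hypothetical stand-in for _aggregate_events: copy each posted event
    # into the shared dict owned by the main thread.
    for key, value in source:
        sink[key] = value

events: dict = {}
listener = Thread(target=consume, args=([("worker:0/init", "ok")], events))
listener.start()
listener.join()
assert events == {"worker:0/init": "ok"}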
Example #7
def _get_cluster_tasks(client: skein.ApplicationClient) -> List[str]:
    return list(
        iter_tasks(
            json.loads(
                client.kv.wait(constants.KV_CLUSTER_INSTANCES).decode())))
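
This helper is the consumer side of the producer code in Examples #2 and #4, where the instances are written with json.dumps(...).encode(). A round-trip sketch with a plain dict standing in for the skein key-value store (illustration only); note that the JSON round trip turns the (name, count) tuples into lists:

import json

kv = {}
cluster_instances = [("worker", 2), ("chief", 1)]
kv["cluster_instances"] = json.dumps(cluster_instances).encode()  # producer side
decoded = json.loads(kv["cluster_instances"].decode())            # consumer side
assert decoded == [["worker", 2], ["chief", 1]]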