def __init__(self):
    self.client = skein.ApplicationClient.from_current()
    self.task = get_task()
    self.step_counter = 0
    self.eval_start_time = 0.0
    self.eval_step_dur_accu = 0.0
    self.start_time = time.time()
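# A minimal sketch (an assumption, not the original class) of how the counters
# initialized above could drive an evaluation-timing ``SessionRunHook``: the
# hook accumulates per-step durations and derives steps/sec from them.
import time
import tensorflow as tf

class _EvalTimingHookSketch(tf.estimator.SessionRunHook):
    def __init__(self):
        self.step_counter = 0
        self.eval_step_dur_accu = 0.0

    def before_run(self, run_context):
        self._step_start = time.time()

    def after_run(self, run_context, run_values):
        # Mirror the accumulators set up in ``__init__`` above.
        self.eval_step_dur_accu += time.time() - self._step_start
        self.step_counter += 1

    def steps_per_second(self) -> float:
        if self.eval_step_dur_accu == 0.0:
            return 0.0
        return self.step_counter / self.eval_step_dur_accu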
def _gen_monitored_train_and_evaluate(client: skein.ApplicationClient):
    task = get_task()

    def train_and_evaluate(
            estimator: tf.estimator.Estimator,
            train_spec: tf.estimator.TrainSpec,
            eval_spec: tf.estimator.EvalSpec):
        event.broadcast_train_eval_start_timer(client, task)
        tf.estimator.train_and_evaluate(estimator, train_spec, eval_spec)
        event.broadcast_train_eval_stop_timer(client, task)

    return train_and_evaluate
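# Usage sketch: the generated closure has the same signature as
# ``tf.estimator.train_and_evaluate`` and is what ``_execute_dispatched_function``
# below hands to a MonitoredThread, unpacking an Experiment tuple. Illustrative:
#
#     run = _gen_monitored_train_and_evaluate(client)
#     run(experiment.estimator, experiment.train_spec, experiment.eval_spec)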
def main():
    client = skein.ApplicationClient.from_current()
    task = get_task()
    task_type, task_id = get_task_description()
    event.init_event(client, task, "127.0.0.1:0")
    _task_commons._setup_container_logs(client)

    if task_type == "evaluator":
        evaluator_fn(client)
    else:
        logger.info(f"{task_type}:{task_id}: nothing to do")

    event.stop_event(client, task, None)
def start_cluster(
        host_port: typing.Tuple[str, int],
        client: skein.ApplicationClient,
        all_tasks: typing.List[str]) -> typing.Dict[str, typing.List[str]]:
    # There is a race condition between acquiring a TCP port for
    # ``tf.train.Server``, and calling ``train_and_evaluate``.
    # There is no TensorFlow API to get rid of the race condition
    # completely, but the window of opportunity can be reduced by
    # preempting the server.
    # See https://github.com/tensorflow/tensorflow/issues/21492
    host, port = host_port
    event.init_event(client, get_task(), f"{socket.gethostbyname(host)}:{port}")
    cluster_spec: typing.Dict = aggregate_spec(client, all_tasks)
    return cluster_spec
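# A minimal sketch, assuming the port-reservation trick referenced above: bind
# a throwaway socket to pick a free port, publish the address while the socket
# is held, then release it for ``tf.train.Server`` to rebind. This illustrates
# the idea; it is not tf-yarn's actual ``_internal.reserve_sock_addr``.
import socket
from contextlib import contextmanager

@contextmanager
def reserve_sock_addr_sketch():
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    sock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)
    sock.bind(("", 0))  # let the OS pick any free port
    try:
        _, port = sock.getsockname()
        yield (socket.getfqdn(), port)
    finally:
        # The race window opens here: the port is free until the server binds it.
        sock.close()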
def main():
    client = skein.ApplicationClient.from_current()
    task_type, task_id = get_task_description()
    task = get_task()
    event.init_event(client, task, "127.0.0.1:0")
    _task_commons._setup_container_logs(client)
    net_if = get_net_if()

    if task_type == 'chief':
        _driver_fn(client, net_if)
    if task_type in ['worker', 'chief']:
        _worker_fn(client, task, net_if)
    elif task_type == 'evaluator':
        evaluator_fn(client)
    else:
        logger.error(f'Unknown task type {task_type}')

    event.stop_event(client, task, None)
def _shutdown_container(
        client: skein.ApplicationClient,
        cluster_tasks: List[str],
        session_config: tf.compat.v1.ConfigProto,
        thread: Optional[MonitoredThread]) -> None:
    # Wait for all tasks connected to this one. The set of tasks to
    # wait for contains all tasks in the cluster, or the ones
    # matching ``device_filters`` if set. The implementation assumes
    # that ``device_filters`` are symmetric.
    exception = thread.exception \
        if thread is not None and isinstance(thread, MonitoredThread) \
        else None
    task = get_task()
    event.stop_event(client, task, exception)
    _wait_for_connected_tasks(
        client, cluster_tasks, getattr(session_config, "device_filters", []))
    event.broadcast_container_stop_time(client, task)

    if exception is not None:
        raise exception from None
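# A minimal sketch (an assumption, not the real class) of the MonitoredThread
# contract relied on above: run the target, capture any exception instead of
# propagating it, and expose it so the parent can re-raise after the stop event.
import threading
from typing import Optional

class MonitoredThreadSketch(threading.Thread):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.exception: Optional[BaseException] = None

    def run(self) -> None:
        try:
            super().run()
        except BaseException as exc:
            self.exception = exc  # surfaced later via ``thread.exception``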
def _execute_dispatched_function(
        client: skein.ApplicationClient,
        experiment: Union[Experiment, KerasExperiment]) -> MonitoredThread:
    task_type, task_id = get_task_description()
    _logger.info(f"Starting execution {task_type}:{task_id}")

    if isinstance(experiment, Experiment):
        thread = MonitoredThread(
            name=f"{task_type}:{task_id}",
            target=_gen_monitored_train_and_evaluate(client),
            args=tuple(experiment),
            daemon=True)
    elif isinstance(experiment, KerasExperiment):
        raise ValueError("KerasExperiment using parameter strategy is unsupported")
    else:
        raise ValueError("experiment must be an Experiment or a KerasExperiment")

    thread.start()
    task = get_task()
    event.start_event(client, task)
    return thread
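# ``args=tuple(experiment)`` above relies on ``Experiment`` iterating as
# (estimator, train_spec, eval_spec). A NamedTuple sketch of that assumed shape:
import typing

class ExperimentSketch(typing.NamedTuple):
    estimator: "tf.estimator.Estimator"
    train_spec: "tf.estimator.TrainSpec"
    eval_spec: "tf.estimator.EvalSpec"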
def start_tf_board(client: skein.ApplicationClient, tf_board_model_dir: str):
    task = get_task()
    os.environ['GCS_READ_CACHE_DISABLED'] = '1'
    os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'cpp'
    os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION_VERSION'] = '2'
    try:
        program.setup_environment()
        tensorboard = program.TensorBoard()
        with _internal.reserve_sock_addr() as (h, p):
            tensorboard_url = f"http://{h}:{p}"
            argv = ['tensorboard',
                    f"--logdir={tf_board_model_dir}",
                    f"--port={p}"]
            tb_extra_args = os.getenv('TB_EXTRA_ARGS', "")
            if tb_extra_args:
                argv += tb_extra_args.split(' ')
            tensorboard.configure(argv)
        tensorboard.launch()
        event.start_event(client, task)
        event.url_event(client, task, tensorboard_url)
    except Exception as e:
        _logger.exception("Cannot start tensorboard")
        event.stop_event(client, task, e)
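# Usage sketch: extra TensorBoard flags are taken from the TB_EXTRA_ARGS
# environment variable and split on single spaces, so they must be
# space-separated, e.g. (illustrative flag value):
#
#     os.environ['TB_EXTRA_ARGS'] = "--reload_interval 10"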
def main() -> None:
    _log_sys_info()
    task_type, task_id = get_task_description()
    task = get_task()
    client = skein.ApplicationClient.from_current()
    _setup_container_logs(client)
    cluster_tasks = _get_cluster_tasks(client)

    model_dir = os.getenv('TB_MODEL_DIR', "")
    if not model_dir:
        _logger.info("Read model_dir from estimator config")
        experiment = _get_experiment(client)
        if isinstance(experiment, Experiment):
            model_dir = experiment.estimator.config.model_dir
        elif isinstance(experiment, KerasExperiment):
            model_dir = experiment.model_dir
        else:
            raise ValueError("experiment must be an Experiment or a KerasExperiment")

    _logger.info(f"Starting tensorboard on {model_dir}")

    thread = _internal.MonitoredThread(
        name=f"{task_type}:{task_id}",
        target=tensorboard.start_tf_board,
        args=(client, model_dir),
        daemon=True)
    thread.start()

    for cluster_task in cluster_tasks:
        event.wait(client, f"{cluster_task}/stop")

    timeout = tensorboard.get_termination_timeout()
    thread.join(timeout)

    event.stop_event(client, task, thread.exception)
    event.broadcast_container_stop_time(client, task)
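# A sketch of the assumed ``event.wait`` semantics used above: each task
# publishes a "<task>/stop" key to skein's key-value store when it finishes,
# and waiting blocks until that key appears. The helper name is hypothetical;
# ``client.kv.wait`` is skein's blocking key-value lookup.
def wait_sketch(client: skein.ApplicationClient, key: str) -> bytes:
    return client.kv.wait(key)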