예제 #1
0
def _get_experiment(client: skein.ApplicationClient) -> Experiment:
    """Fetch the experiment factory from the key-value store and call it.

    Waits on the ``KV_EXPERIMENT_FN`` key of ``client.kv``, deserializes the
    returned payload with ``dill`` and invokes the resulting callable with no
    arguments.

    Args:
        client: Skein application client whose ``kv`` store is read.

    Returns:
        Whatever the deserialized callable returns.

    Raises:
        Exception: any error raised while waiting, deserializing or calling
            is re-raised unchanged after a start event and a stop event
            (carrying the error) have been emitted for the current task.
    """
    try:
        payload = client.kv.wait(KV_EXPERIMENT_FN)
        # NOTE(review): dill.loads runs whatever the payload encodes;
        # presumably the KV store is only written by the trusted driver.
        experiment_fn = dill.loads(payload)
        return experiment_fn()
    except Exception as error:
        failed_task = cluster.get_task()
        # Emit start before stop so the failure is reported as a pair.
        event.start_event(client, failed_task)
        event.stop_event(client, failed_task, error)
        raise
예제 #2
0
def _get_experiment(client: skein.ApplicationClient) -> NamedTuple:
    """Load the experiment factory published in the KV store and run it.

    Blocks on ``constants.KV_EXPERIMENT_FN`` in ``client.kv``, unpickles the
    value with ``cloudpickle`` and calls the resulting object without
    arguments.

    Args:
        client: Skein application client whose ``kv`` store is read.

    Returns:
        The value produced by the unpickled callable.

    Raises:
        Exception: any failure in the steps above is re-raised as-is, after
            a start event and a stop event (with the error attached) have
            been sent for the current task.
    """
    try:
        serialized_fn = client.kv.wait(constants.KV_EXPERIMENT_FN)
        # NOTE(review): unpickling executes code from the payload; this
        # presumably relies on the KV store being trusted.
        build_experiment = cloudpickle.loads(serialized_fn)
        return build_experiment()
    except Exception as error:
        current_task = get_task()
        # A stop event is preceded by a start event for the same task.
        event.start_event(client, current_task)
        event.stop_event(client, current_task, error)
        raise
예제 #3
0
def _execute_dispatched_function(client: skein.ApplicationClient,
                                 experiment: Experiment) -> MonitoredThread:
    """Start the experiment in a daemon ``MonitoredThread``.

    The thread is named ``"<task_type>:<task_id>"``, runs the callable
    returned by ``_gen_monitored_train_and_evaluate(client)`` and receives
    ``tuple(experiment)`` as its positional arguments.

    Args:
        client: Skein application client, passed to the target factory and
            to the start event.
        experiment: Experiment whose items become the thread arguments.

    Returns:
        The thread, which has already been started.
    """
    task_type, task_id = cluster.get_task_description()
    _logger.info(f"Starting execution {task_type}:{task_id}")

    worker_name = f"{task_type}:{task_id}"
    worker_target = _gen_monitored_train_and_evaluate(client)
    worker_args = tuple(experiment)
    worker = MonitoredThread(
        name=worker_name,
        target=worker_target,
        args=worker_args,
        daemon=True,
    )
    worker.start()

    # The start event is emitted only once the thread has been started.
    running_task = cluster.get_task()
    event.start_event(client, running_task)
    return worker
예제 #4
0
def _execute_dispatched_function(
        client: skein.ApplicationClient,
        experiment: Union[Experiment, KerasExperiment]) -> MonitoredThread:
    """Start an ``Experiment`` in a daemon ``MonitoredThread``.

    Only ``Experiment`` instances are executed; a ``KerasExperiment`` is
    rejected explicitly, and so is any other type.

    Args:
        client: Skein application client, passed to the target factory and
            to the start event.
        experiment: Experiment whose items become the positional arguments
            of the thread target.

    Returns:
        The thread, which has already been started.

    Raises:
        ValueError: if ``experiment`` is a ``KerasExperiment`` or is neither
            an ``Experiment`` nor a ``KerasExperiment``.
    """
    task_type, task_id = get_task_description()
    _logger.info(f"Starting execution {task_type}:{task_id}")

    # Reject unsupported inputs up front; the log line above is still
    # emitted before validation.
    if not isinstance(experiment, Experiment):
        if isinstance(experiment, KerasExperiment):
            raise ValueError(
                "KerasExperiment using parameter strategy is unsupported")
        raise ValueError(
            "experiment must be an Experiment or a KerasExperiment")

    worker_name = f"{task_type}:{task_id}"
    worker_target = _gen_monitored_train_and_evaluate(client)
    worker = MonitoredThread(
        name=worker_name,
        target=worker_target,
        args=tuple(experiment),
        daemon=True,
    )
    worker.start()

    # The start event is emitted only once the thread has been started.
    running_task = get_task()
    event.start_event(client, running_task)
    return worker
예제 #5
0
def start_tf_board(client: skein.ApplicationClient, tf_board_model_dir: str):
    """Launch TensorBoard on a reserved address and publish its URL.

    Sets three process environment variables, configures TensorBoard with
    ``--logdir`` and ``--port`` (plus any whitespace-separated arguments
    from the ``TB_EXTRA_ARGS`` environment variable), launches it, then
    emits a start event and a URL event for the current task.

    Any exception raised during setup or launch is logged with its
    traceback and reported through a stop event; it is not re-raised.

    Args:
        client: Skein application client used to emit the events.
        tf_board_model_dir: Directory passed to TensorBoard as ``--logdir``.
    """
    task = cluster.get_task()
    os.environ['GCS_READ_CACHE_DISABLED'] = '1'
    os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION'] = 'cpp'
    os.environ['PROTOCOL_BUFFERS_PYTHON_IMPLEMENTATION_VERSION'] = '2'
    try:
        program.setup_environment()
        tensorboard = program.TensorBoard()
        with _internal.reserve_sock_addr() as (h, p):
            tensorboard_url = f"http://{h}:{p}"
            argv = ['tensorboard', f"--logdir={tf_board_model_dir}",
                    f"--port={p}"]
            # split() without a separator drops the empty entries that
            # repeated, leading or trailing whitespace would otherwise
            # inject into argv; an unset/empty variable adds nothing.
            argv.extend(os.getenv('TB_EXTRA_ARGS', "").split())
            tensorboard.configure(argv)
        tensorboard.launch()
        event.start_event(client, task)
        event.url_event(client, task, tensorboard_url)
    except Exception as e:
        # The exception must not be passed as a %-format argument: the
        # message has no placeholder, so logging would fail to format the
        # record and the real error would be lost.
        _logger.exception("Cannot start tensorboard")
        event.stop_event(client, task, e)