Exemplo n.º 1
0
def main():
    """Run the work assigned to the current task container.

    Builds a skein application client for the running application,
    looks up the current task and its ``(type, id)`` description, and
    calls ``event.init_event`` before and ``event.stop_event`` after the
    work. Only a task whose type is ``"evaluator"`` runs
    ``evaluator_fn``; every other type just logs that it has nothing to
    do.

    ``event.stop_event`` is only reached when the work returns normally.
    """
    client = skein.ApplicationClient.from_current()
    task = cluster.get_task()
    task_type, task_id = cluster.get_task_description()

    # NOTE(review): "127.0.0.1:0" looks like a placeholder address for a
    # task that does not serve anything -- confirm against init_event.
    event.init_event(client, task, "127.0.0.1:0")
    _task_commons._setup_container_logs(client)

    if task_type != "evaluator":
        logger.info(f"{task_type}:{task_id}: nothing to do")
    else:
        evaluator_fn(client)

    # NOTE(review): the trailing None is presumably "no exception" --
    # confirm against event.stop_event.
    event.stop_event(client, task, None)
Exemplo n.º 2
0
def main():
    """Dispatch the current task container to its role-specific function.

    Builds a skein application client, reads the task's ``(type, id)``
    description and task name, and calls ``event.init_event`` before the
    work and ``event.stop_event`` after it.

    Dispatch by task type:
      * ``'chief'`` / ``'worker'`` -> ``_worker_fn(task_type, task_id, client)``
      * ``'evaluator'``            -> ``_evaluator_fn(client)``
      * anything else              -> an error is logged and nothing runs

    ``event.stop_event`` is only reached when the dispatched function
    returns normally.
    """
    client = skein.ApplicationClient.from_current()
    task_type, task_id = cluster.get_task_description()
    task = cluster.get_task()
    # NOTE(review): "127.0.0.1:0" looks like a placeholder address --
    # confirm against event.init_event.
    event.init_event(client, task, "127.0.0.1:0")
    _task_commons._setup_container_logs(client)

    if task_type in ('chief', 'worker'):
        _worker_fn(task_type, task_id, client)
    elif task_type == 'evaluator':
        _evaluator_fn(client)
    else:
        logger.error(f'Unknown task type {task_type}')

    # NOTE(review): the trailing None is presumably "no exception" --
    # confirm against event.stop_event.
    event.stop_event(client, task, None)
Exemplo n.º 3
0
def start_cluster(
        host_port: typing.Tuple[str, int], client: skein.ApplicationClient,
        all_tasks: typing.List[str]) -> typing.Dict[str, typing.List[str]]:
    """Register this task's address and return the aggregated cluster spec.

    Args:
        host_port: ``(host, port)`` pair for the current task. The host
            is resolved with ``socket.gethostbyname`` before being
            passed on.
        client: skein application client, forwarded to
            ``event.init_event`` and ``aggregate_spec``.
        all_tasks: task names forwarded to ``aggregate_spec``.

    Returns:
        Whatever ``aggregate_spec(client, all_tasks)`` returns; per the
        annotation, a mapping from a string key to a list of strings.

    Raises:
        socket.gaierror: if ``host`` cannot be resolved.
    """
    # There is a race condition between acquiring a TCP port for
    # ``tf.train.Server``, and calling ``train_and_evaluate``.
    # There is no TensorFlow API to get rid of the race condition
    # completely, but the window of opportunity can be reduced by
    # preempting the server.
    # See https://github.com/tensorflow/tensorflow/issues/21492
    host, port = host_port
    event.init_event(client, get_task(),
                     f"{socket.gethostbyname(host)}:{port}")
    return aggregate_spec(client, all_tasks)
Exemplo n.º 4
0
def main():
    """Dispatch the current task container to its role-specific function.

    Builds a skein application client, reads the task type and task
    name, calls ``event.init_event``, sets up container logs and looks
    up the network interface via ``get_net_if``.

    Dispatch by task type:
      * ``'chief'``     -> ``_driver_fn`` first, then ``_worker_fn``
      * ``'worker'``    -> ``_worker_fn``
      * ``'evaluator'`` -> ``evaluator_fn``
      * anything else   -> an error is logged and nothing runs

    ``event.stop_event`` is only reached when the dispatched functions
    return normally.
    """
    client = skein.ApplicationClient.from_current()
    # Only the task type is used here; the id is unpacked and discarded.
    task_type, _ = get_task_description()
    task = get_task()
    # NOTE(review): "127.0.0.1:0" looks like a placeholder address --
    # confirm against event.init_event.
    event.init_event(client, task, "127.0.0.1:0")
    _task_commons._setup_container_logs(client)
    net_if = get_net_if()

    # The chief runs the driver in addition to (and before) the worker.
    is_chief = task_type == 'chief'
    if is_chief:
        _driver_fn(client, net_if)

    if is_chief or task_type == 'worker':
        _worker_fn(client, task, net_if)
    elif task_type == 'evaluator':
        evaluator_fn(client)
    else:
        logger.error(f'Unknown task type {task_type}')

    # NOTE(review): the trailing None is presumably "no exception" --
    # confirm against event.stop_event.
    event.stop_event(client, task, None)