Example #1
0
def get_config(args=None,
               is_chief=True,
               task_index=0,
               chief_worker_hostname="",
               n_workers=1):
    """Assemble the distributed-training ``TrainConfig`` for one worker.

    Wires together the simulator processes, the data pipeline, the
    (optionally sync-replicated) optimizer and the monitoring callbacks.
    Relies on module-level state defined elsewhere in this file
    (``cluster``, ``cluster_spec``, ``server``, ``BATCH_SIZE``,
    ``STEP_PER_EPOCH``, ``EVAL_EPISODE``, ``my_task_index``, ...).

    Args:
        args: parsed command-line arguments (argparse-style namespace).
        is_chief: whether this worker is the chief replica.
        task_index: index of this worker task within the cluster.
        chief_worker_hostname: host running the neptune mp server.
        n_workers: total number of worker replicas.

    Returns:
        A ``TrainConfig`` combining dataflow, optimizer, callbacks,
        session configuration and the ``extra_arg`` parameter bag.

    Raises:
        ValueError: if ``args.optimizer`` names no supported optimizer.
    """
    # Per-worker, timestamped log dir so concurrent/repeated runs don't
    # collide.
    logger.set_logger_dir(args.train_log_path +
                          datetime.now().strftime('%Y-%m-%d_%H-%M-%S') + '_' +
                          str(task_index))

    # Split model parameters between the parameter servers by byte size
    # rather than round-robin, to balance PS memory/traffic.
    ps_strategy = tf.contrib.training.GreedyLoadBalancingStrategy(
        len(cluster['ps']), tf.contrib.training.byte_size_load_fn)
    device_function = tf.train.replica_device_setter(
        worker_device='/job:worker/task:{}/cpu:0'.format(task_index),
        cluster=cluster_spec,
        ps_strategy=ps_strategy)

    M = Model(device_function)

    # Unique IPC endpoint names so several runs can share one machine.
    name_base = str(uuid.uuid1()).replace('-', '')[:16]
    PIPE_DIR = os.environ.get('TENSORPACK_PIPEDIR', '.').rstrip('/')
    namec2s = 'ipc://{}/sim-c2s-{}'.format(PIPE_DIR, name_base)
    names2c = 'ipc://{}/sim-s2c-{}'.format(PIPE_DIR, name_base)
    procs = [
        MySimulatorWorker(k, namec2s, names2c)
        for k in range(args.simulator_procs)
    ]
    ensure_proc_terminate(procs)
    start_proc_mask_signal(procs)

    neptune_client = neptune_mp_server.Client(
        server_host=chief_worker_hostname, server_port=args.port)

    master = MySimulatorMaster(task_index,
                               neptune_client,
                               namec2s,
                               names2c,
                               M,
                               dummy=args.dummy,
                               predictor_threads=args.nr_predict_towers,
                               predict_batch_size=args.predict_batch_size,
                               do_train=args.do_train)

    # here's the data passed to the repeated data source
    dataflow = BatchData(DataFromQueue(master.queue), BATCH_SIZE)

    # The learning rate is created under device_function (i.e. on a PS)
    # so all workers share one mutable value.
    with tf.device(device_function):
        with tf.variable_scope(tf.get_variable_scope(), reuse=None):
            lr = tf.Variable(args.learning_rate,
                             trainable=False,
                             name='learning_rate')
    tf.summary.scalar('learning_rate', lr)

    intra_op_par = args.intra_op_par
    inter_op_par = args.inter_op_par

    session_config = get_default_sess_config(0.5)
    print("{} {}".format(intra_op_par, type(intra_op_par)))
    if intra_op_par is not None:
        session_config.intra_op_parallelism_threads = intra_op_par

    if inter_op_par is not None:
        session_config.inter_op_parallelism_threads = inter_op_par

    session_config.log_device_placement = False
    # Grab-bag of run parameters forwarded to the trainer via TrainConfig.
    extra_arg = {
        'dummy_predictor': args.dummy_predictor,
        'intra_op_par': intra_op_par,
        'inter_op_par': inter_op_par,
        'max_steps': args.max_steps,
        'device_count': {
            'CPU': args.cpu_device_count
        },
        'threads_to_trace': args.threads_to_trace,
        'dummy': args.dummy,
        'cpu': args.cpu,
        'queue_size': args.queue_size,
        'worker_host': server.target,
        'is_chief': is_chief,
        'device_function': device_function,
        'n_workers': n_workers,
        'use_sync_opt': args.use_sync_opt,
        'port': args.port,
        'batch_size': BATCH_SIZE,
        'debug_charts': args.debug_charts,
        'adam_debug': args.adam_debug,
        'task_index': task_index,
        'lr': lr,
        'schedule_hyper': args.schedule_hyper,
        'experiment_dir': args.experiment_dir
    }

    print("\n\n worker host: {} \n\n".format(extra_arg['worker_host']))

    with tf.device(device_function):
        if args.optimizer == 'adam':
            # Build exactly one Adam implementation. Previously the stock
            # AdamOptimizer was constructed and then immediately discarded
            # whenever adam_debug was set.
            if args.adam_debug:
                optimizer = MyAdamOptimizer(lr,
                                            epsilon=args.epsilon,
                                            beta1=args.beta1,
                                            beta2=args.beta2)
            else:
                optimizer = tf.train.AdamOptimizer(lr,
                                                   epsilon=args.epsilon,
                                                   beta1=args.beta1,
                                                   beta2=args.beta2)
        elif args.optimizer == 'gd':
            optimizer = tf.train.GradientDescentOptimizer(lr)
        elif args.optimizer == 'adagrad':
            optimizer = tf.train.AdagradOptimizer(lr)
        elif args.optimizer == 'adadelta':
            optimizer = tf.train.AdadeltaOptimizer(lr, epsilon=1e-3)
        elif args.optimizer == 'momentum':
            optimizer = tf.train.MomentumOptimizer(lr, momentum=0.9)
        elif args.optimizer == 'rms':
            optimizer = tf.train.RMSPropOptimizer(lr)
        else:
            # Fail fast: an unknown name previously surfaced only later as
            # a NameError on the unbound `optimizer` at TrainConfig(...).
            raise ValueError(
                'unsupported optimizer: {!r}'.format(args.optimizer))

        # wrap in SyncReplicasOptimizer so gradients from `num_grad`
        # replicas are aggregated before each update
        if args.use_sync_opt == 1:
            if not args.adam_debug:
                optimizer = tf.train.SyncReplicasOptimizer(
                    optimizer,
                    replicas_to_aggregate=args.num_grad,
                    total_num_replicas=n_workers)
            else:
                optimizer = MySyncReplicasOptimizer(
                    optimizer,
                    replicas_to_aggregate=args.num_grad,
                    total_num_replicas=n_workers)
            # The hook initializes the sync queues and (on the chief)
            # runs the per-step aggregation bookkeeping.
            extra_arg['hooks'] = optimizer.make_session_run_hook(is_chief)

    callbacks = [
        StatPrinter(), master,
        DebugLogCallback(neptune_client,
                         worker_id=task_index,
                         nr_send=args.send_debug_every,
                         debug_charts=args.debug_charts,
                         adam_debug=args.adam_debug,
                         schedule_hyper=args.schedule_hyper)
    ]

    if args.debug_charts:
        callbacks.append(
            HeartPulseCallback('heart_pulse_{}.log'.format(
                os.environ['SLURMD_NODENAME'])))

    # NOTE(review): the checks below use the module-level `my_task_index`
    # while the function receives `task_index` — presumably these are the
    # same value; confirm and unify.
    if args.early_stopping is not None:
        args.early_stopping = float(args.early_stopping)

        if my_task_index == 1 and not args.eval_node:
            # only one worker does evaluation
            callbacks.append(
                PeriodicCallback(
                    Evaluator(EVAL_EPISODE, ['state'], ['logits'],
                              neptune_client,
                              worker_id=task_index,
                              solved_score=args.early_stopping), 2))
    elif my_task_index == 1 and not args.eval_node:
        # only 1 worker does evaluation
        callbacks.append(
            PeriodicCallback(
                Evaluator(EVAL_EPISODE, ['state'], ['logits'],
                          neptune_client,
                          worker_id=task_index), 2))

    if args.save_every != 0:
        callbacks.append(
            PeriodicPerStepCallback(
                ModelSaver(var_collections=M.vars_for_save,
                           models_dir=args.models_dir), args.save_every))

    if args.schedule_hyper and my_task_index == 2:
        callbacks.append(
            HyperParameterScheduler('learning_rate', [(20, 0.0005),
                                                      (60, 0.0001)]))
        callbacks.append(
            HyperParameterScheduler('entropy_beta', [(40, 0.005),
                                                     (80, 0.001)]))

    return TrainConfig(dataset=dataflow,
                       optimizer=optimizer,
                       callbacks=Callbacks(callbacks),
                       extra_threads_procs=[master],
                       session_config=session_config,
                       model=M,
                       step_per_epoch=STEP_PER_EPOCH,
                       max_epoch=args.max_epoch,
                       extra_arg=extra_arg)
Example #2
0
                input_queues[worker_id].put(dist)

    for t in threads:
        t.join()

    scores = []
    for _ in range(worker_num):
        scores.append(score_queue.get())
    return np.mean(scores), np.max(scores)


# Module-level evaluation graph: a float32 state placeholder (leading None
# dim = batch) feeding the policy network built by `build_graph`.
# NOTE(review): presumably IMAGE_SHAPE3 is the per-frame (H, W, C) tuple
# defined elsewhere in this file — confirm.
eval_model.state = tf.placeholder(tf.float32, shape=(None, ) + IMAGE_SHAPE3)
eval_model.policy = build_graph(eval_model.state)

if __name__ == '__main__':
    neptune_client = neptune_mp_server.Client(server_host=args.server_host,
                                              server_port=args.server_port)

    i = 0
    dir_path = os.path.join(args.models_dir, 'iter_{}')
    # print("WAITING FOR DIR {}".format(dir_path.format(i)))
    while True:
        if os.path.isdir(dir_path.format(i)):
            dir_path_i = dir_path.format(i)
            time.sleep(0.1)
            for f in os.listdir(dir_path_i):
                if f[:5] == 'model':
                    model_path = os.path.join(dir_path_i, f)
                    break
            model_time = float(model_path.split('-')[1])
            model_steps = float(model_path.split('-')[2])
            model_mean, model_max = eval_model(model_path, EVAL_EPISODE)