def run_training(ctx):
    """
    :type ctx: neptune.Context
    """
    global ENV_NAME, PREDICTOR_THREAD, EXPERIMENT_MODEL, HISTORY_LOGS, DEBUGING_INFO, FRAME_HISTORY

    ENV_NAME = ctx.params.env
    assert ENV_NAME

    DEBUGING_INFO = hasattr(
        ctx.params, "debuging_info") and ctx.params.debuging_info == "True"

    # print "DEBUGGING INFO:{}".format(DEBUGING_INFO)
    FRAME_HISTORY = int(get_atribute(ctx, "frame_history", 4))

    # module_name, function_name = ctx.params.featureExtractor.split(".")
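    # Split the dotted "package.module.ClassName" path from the params and
    # import the experiment model class dynamically.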
    full_class_name = ctx.params.experimentModelClass
    module_name = full_class_name[:full_class_name.rfind('.')]
    class_name = full_class_name[full_class_name.rfind('.') + 1:]
    experiment_model_class = importlib.import_module(
        module_name).__dict__[class_name]

    if hasattr(ctx.params, "stage"):
        # That's not the most elegant solution but well ;)
        stage = int(ctx.params.stage)
        EXPERIMENT_MODEL = experiment_model_class(
            ctx.params.experimentModelParameters, stage)
    else:
        EXPERIMENT_MODEL = experiment_model_class(
            ctx.params.experimentModelParameters)

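    # Creating (and immediately discarding) a player populates the global
    # NUM_ACTIONS as a side effect.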
    p = get_player()
    del p  # set NUM_ACTIONS
    EXPERIMENT_MODEL.set_number_of_actions(NUM_ACTIONS)

    if ctx.params.gpu:
        # print "CUDA_VISIBLE_DEVICES:{}".format(os.environ['CUDA_VISIBLE_DEVICES'])
        print "Set GPU:{}".format(ctx.params.gpu)
        os.environ['CUDA_VISIBLE_DEVICES'] = ctx.params.gpu

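    # With more than one GPU, the upper half of the GPUs is reserved for
    # prediction and the rest for training (Python 2 integer division).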
    nr_gpu = get_nr_gpu()
    if nr_gpu > 1:
        predict_tower = range(nr_gpu)[-nr_gpu / 2:]
    else:
        predict_tower = [0]
    PREDICTOR_THREAD = len(predict_tower) * PREDICTOR_THREAD_PER_GPU

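    # Optionally restore the experiment's history logs from a JSON-encoded
    # parameter; an empty string means start with empty logs.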
    if hasattr(ctx.params, "history_logs"):
        if ctx.params.history_logs == "":
            HISTORY_LOGS = ([], [], [])
        else:
            HISTORY_LOGS = json.loads(ctx.params.history_logs)

    config = get_config(ctx)

    if ctx.params.load != "":
        config.session_init = SaverRestore(ctx.params.load)

    if hasattr(ctx.params, "load_previous_stage"):
        if ctx.params.load_previous_stage != "":
            config.session_init = SaverRestore(ctx.params.load_previous_stage)

    config.tower = range(nr_gpu)[:-nr_gpu / 2] or [0]
    logger.info("[BA3C] Train on gpu {} and infer on gpu {}".format(
        ','.join(map(str, config.tower)), ','.join(map(str, predict_tower))))
    AsyncMultiGPUTrainer(config, predict_tower=predict_tower).train()

    # For the moment this is a hack:
    # the neptune interface does not allow returning values.

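    # Results are handed to the parent ("mother") experiment by writing a JSON
    # file named after it into this experiment's dump directory.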
    if hasattr(ctx.params, "mother_experiment_id"):
        experiment_dir = ctx.dump_dir_url
        json_path = os.path.join(experiment_dir,
                                 ctx.params.mother_experiment_id + ".json")
        info_to_pass = {}
        info_to_pass["previous_experiment_id"] = ctx.job.id
        print "Experiment history logs to save:{}".format(HISTORY_LOGS)
        info_to_pass["history_logs"] = json.dumps(HISTORY_LOGS)
        with open(json_path, 'w') as outfile:
            json.dump(info_to_pass, outfile)
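
# --- Illustrative follow-up, not part of the original snippet ---
# A minimal sketch of how the parent ("mother") experiment could read the
# handoff file written above. The helper name `load_child_results` is made up
# for illustration; only the file layout and key names mirror the code above,
# and os/json are assumed to be imported as in the surrounding module.
def load_child_results(experiment_dir, mother_experiment_id):
    json_path = os.path.join(experiment_dir,
                             mother_experiment_id + ".json")
    with open(json_path) as infile:
        info = json.load(infile)
    # history_logs was JSON-encoded a second time above, so decode it again.
    history_logs = json.loads(info["history_logs"])
    return info["previous_experiment_id"], history_logs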
Example #2
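# Distributed variant: choose the sync or async multi-GPU trainer and, in the
# async case, restart training if it raises (logging the failing SLURM node).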
chief_worker_hostname = _chief_worker_hostname(cluster)
config = get_config(args, is_chief, my_task_index,
                    chief_worker_hostname, len(cluster['worker']))
if args.load:
    config.session_init = SaverRestore(args.load)
config.tower = range(nr_towers)

logger.info("[BA3C] Train on gpu {} and infer on gpu {}".format(
    ','.join(map(str, config.tower)), ','.join(map(str, predict_towers))))

if args.sync:
    logger.info('using sync version')
    SyncMultiGPUTrainer(config, predict_tower=predict_towers).train()
else:
    logger.info('using async version')
    while True:
        try:
            trainer = AsyncMultiGPUTrainer(
                config, predict_tower=predict_towers)
            trainer.train()
        except Exception as e:
            print('===== EXCEPTION IN TRAIN-ATARI.PY [{}] ======'.format(
                os.environ['SLURMD_NODENAME']))
            traceback.print_exc()
Example #3
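    # Entry-point fragment: 'play' and 'eval' build a prediction-only config
    # from a checkpoint, while 'train' sets up multi-GPU training as above.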
    if args.gpu:
        os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
    if args.task != 'train':
        assert args.load is not None

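    # For play/eval, restore the model from the checkpoint and map the 'state'
    # input to the 'logits' output.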
    if args.task != 'train':
        cfg = PredictConfig(model=Model(),
                            session_init=SaverRestore(args.load),
                            input_var_names=['state'],
                            output_var_names=['logits'])
        if args.task == 'play':
            play_model(cfg)
        elif args.task == 'eval':
            eval_model_multithread(cfg, EVAL_EPISODE)
    else:
        nr_gpu = get_nr_gpu()
        if nr_gpu > 1:
            predict_tower = range(nr_gpu)[-nr_gpu / 2:]
        else:
            predict_tower = [0]
        PREDICTOR_THREAD = len(predict_tower) * PREDICTOR_THREAD_PER_GPU
        config = get_config()
        if args.load:
            config.session_init = SaverRestore(args.load)
        config.tower = range(nr_gpu)[:-nr_gpu / 2] or [0]
        logger.info("[BA3C] Train on gpu {} and infer on gpu {}".format(
            ','.join(map(str, config.tower)), ','.join(map(str,
                                                           predict_tower))))
        AsyncMultiGPUTrainer(config, predict_tower=predict_tower).train()