def run_training(ctx):
    """Configure module-level training state from a neptune context and run BA3C training.

    Reads experiment parameters from ``ctx.params``, instantiates the experiment
    model class named by ``experimentModelClass``, splits the available GPUs
    between training and prediction towers, and launches AsyncMultiGPUTrainer.
    If ``mother_experiment_id`` is set, dumps a hand-off JSON for the parent
    experiment afterwards (hack: the neptune interface cannot return values).

    :type ctx: neptune.Context
    """
    global ENV_NAME, PREDICTOR_THREAD, EXPERIMENT_MODEL, HISTORY_LOGS, DEBUGING_INFO, FRAME_HISTORY
    ENV_NAME = ctx.params.env
    assert ENV_NAME
    # BUG FIX: was assigned to a typo name ("DEBUGING_INFOING_INFO"), which left
    # the declared global DEBUGING_INFO unset and created a dead local instead.
    DEBUGING_INFO = hasattr(ctx.params, "debuging_info") and ctx.params.debuging_info == "True"
    FRAME_HISTORY = int(get_atribute(ctx, "frame_history", 4))

    # "pkg.module.ClassName" -> import pkg.module, then pull ClassName from it.
    dotted = ctx.params.experimentModelClass
    module_name = dotted[:dotted.rfind('.')]
    class_name = dotted[dotted.rfind('.') + 1:]
    experiment_model_class = importlib.import_module(module_name).__dict__[class_name]

    if hasattr(ctx.params, "stage"):
        # That's not the most elegant solution but well ;)
        stage = int(ctx.params.stage)
        EXPERIMENT_MODEL = experiment_model_class(ctx.params.experimentModelParameters, stage)
    else:
        EXPERIMENT_MODEL = experiment_model_class(ctx.params.experimentModelParameters)

    # Creating (and discarding) a player has the side effect of setting the
    # module-level NUM_ACTIONS.
    p = get_player()
    del p
    EXPERIMENT_MODEL.set_number_of_actions(NUM_ACTIONS)

    if ctx.params.gpu:
        print("Set GPU:{}".format(ctx.params.gpu))
        os.environ['CUDA_VISIBLE_DEVICES'] = ctx.params.gpu
    nr_gpu = get_nr_gpu()
    # With >1 GPU, reserve the upper half for prediction; otherwise share GPU 0.
    # NOTE: integer division here is Python 2 semantics (nr_gpu / 2).
    if nr_gpu > 1:
        predict_tower = range(nr_gpu)[-nr_gpu / 2:]
    else:
        predict_tower = [0]
    PREDICTOR_THREAD = len(predict_tower) * PREDICTOR_THREAD_PER_GPU

    if hasattr(ctx.params, "history_logs"):
        if ctx.params.history_logs == "":
            HISTORY_LOGS = ([], [], [])
        else:
            HISTORY_LOGS = json.loads(ctx.params.history_logs)

    config = get_config(ctx)
    if ctx.params.load != "":
        config.session_init = SaverRestore(ctx.params.load)
    if hasattr(ctx.params, "load_previous_stage"):
        if ctx.params.load_previous_stage != "":
            config.session_init = SaverRestore(ctx.params.load_previous_stage)
    # Training towers are the lower half of the GPUs (the complement of
    # predict_tower); fall back to [0] when the slice is empty.
    config.tower = range(nr_gpu)[:-nr_gpu / 2] or [0]
    logger.info("[BA3C] Train on gpu {} and infer on gpu {}".format(
        ','.join(map(str, config.tower)), ','.join(map(str, predict_tower))))
    AsyncMultiGPUTrainer(config, predict_tower=predict_tower).train()

    # For the moment this is a hack.
    # The neptune interface does not allow to return values.
    if hasattr(ctx.params, "mother_experiment_id"):
        experiment_dir = ctx.dump_dir_url
        json_path = os.path.join(experiment_dir, ctx.params.mother_experiment_id + ".json")
        info_to_pass = {}
        info_to_pass["previous_experiment_id"] = ctx.job.id
        print("Experiment history logs to save:{}".format(HISTORY_LOGS))
        info_to_pass["history_logs"] = json.dumps(HISTORY_LOGS)
        with open(json_path, 'w') as outfile:
            json.dump(info_to_pass, outfile)
# NOTE(review): this is the interior of a distributed-training entry point whose
# enclosing def is outside this view — `cluster`, `args`, `my_task_index`,
# `is_chief`, `nr_towers` and `predict_towers` are presumably its parameters/locals;
# confirm against the full function.
chief_worker_hostname = _chief_worker_hostname(cluster)
config = get_config(args, is_chief, my_task_index, chief_worker_hostname,
                    len(cluster['worker']))
# Optionally warm-start from a checkpoint.
if args.load:
    config.session_init = SaverRestore(args.load)
# All towers train here; prediction towers were chosen by the caller.
config.tower = range(nr_towers)
logger.info(
    "[BA3C] Train on gpu {} and infer on gpu {}".
    format(','.join(map(str, config.tower)),
           ','.join(map(str, predict_towers))))
if args.sync:
    logger.info('using sync version')
    SyncMultiGPUTrainer(config, predict_tower=predict_towers).train()
else:
    logger.info('using async version')
    # Retry loop: restart training after any exception instead of dying.
    # NOTE(review): there is no break, so this retries forever on repeated
    # failures; reads SLURMD_NODENAME, so it assumes a SLURM environment —
    # verify both are intended.
    while True:
        try:
            trainer = AsyncMultiGPUTrainer(config, predict_tower=predict_towers)
            trainer.train()
        except Exception as e:
            print('===== EXCEPTION IN TRAIN-ATARI.PY [{}] ======'
                  .format(os.environ['SLURMD_NODENAME']))
            traceback.print_exc()
# Dispatch on the requested task: 'play' / 'eval' run a trained model from a
# checkpoint; anything else ('train') configures GPU towers and trains.
if args.gpu:
    os.environ['CUDA_VISIBLE_DEVICES'] = args.gpu
if args.task != 'train':
    # Non-training tasks require a checkpoint to restore from.
    assert args.load is not None
    cfg = PredictConfig(model=Model(),
                        session_init=SaverRestore(args.load),
                        input_var_names=['state'],
                        output_var_names=['logits'])
    if args.task == 'play':
        play_model(cfg)
    elif args.task == 'eval':
        eval_model_multithread(cfg, EVAL_EPISODE)
else:
    nr_gpu = get_nr_gpu()
    # Give the upper half of the GPUs to prediction (Python 2 integer
    # division), or share GPU 0 when there is only one.
    predict_tower = range(nr_gpu)[-nr_gpu / 2:] if nr_gpu > 1 else [0]
    PREDICTOR_THREAD = len(predict_tower) * PREDICTOR_THREAD_PER_GPU
    config = get_config()
    if args.load:
        config.session_init = SaverRestore(args.load)
    # Remaining (lower-half) GPUs train; default to [0] if the slice is empty.
    config.tower = range(nr_gpu)[:-nr_gpu / 2] or [0]
    logger.info("[BA3C] Train on gpu {} and infer on gpu {}".format(
        ','.join(map(str, config.tower)),
        ','.join(map(str, predict_tower))))
    AsyncMultiGPUTrainer(config, predict_tower=predict_tower).train()