def train_copos(args):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
        rank = 0
        configure_logger(args.log_path)
    else:
        rank = MPI.COMM_WORLD.Get_rank()
        configure_logger(args.log_path, format_strs=[])

    workerseed = args.seed + 10000 * MPI.COMM_WORLD.Get_rank()

    def policy_fn(name, ob_space, ac_space):
        return CompatibleMlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                   hid_size=32, num_hid_layers=2)

    set_global_seeds(workerseed)
    env = build_env(args, normalize_ob=True)
    #env = gym.make(args.env)
    #env.seed(workerseed)

    timesteps_per_batch = 10000
    #timesteps_per_batch=2048
    beta = -1
    if beta < 0:
        nr_episodes = int(args.num_timesteps) // timesteps_per_batch
        # Automatically compute beta based on initial entropy and number of iterations
        tmp_pi = policy_fn("tmp_pi", env.observation_space, env.action_space)
        sess.run(tf.global_variables_initializer())

        tmp_ob = np.zeros((1,) + env.observation_space.shape)
        entropy = sess.run(tmp_pi.pd.entropy(), feed_dict={tmp_pi.ob: tmp_ob})
        beta = 2 * entropy / nr_episodes
        print("Initial entropy: " + str(entropy) + ", episodes: " + str(nr_episodes))
        print("Automatically set beta: " + str(beta))

    copos_mpi.learn(env, policy_fn, timesteps_per_batch=timesteps_per_batch,
                    epsilon=0.01, beta=beta, cg_iters=10, cg_damping=0.1,
                    max_timesteps=int(args.num_timesteps), gamma=0.99, lam=0.98,
                    vf_iters=5, vf_stepsize=1e-3)
    env.close()
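# Hedged usage sketch (not part of the original script): shows how train_copos
# could be driven from an argparse-style namespace. The attribute names mirror the
# fields the function reads above (seed, num_timesteps, log_path, env); the
# environment id, values, and any extra fields build_env may require are
# illustrative assumptions.
def _example_train_copos_invocation():
    from argparse import Namespace
    example_args = Namespace(
        env='Hopper-v2',            # assumed Gym environment id
        seed=0,
        num_timesteps=int(1e6),
        log_path='/tmp/copos_log',
    )
    train_copos(example_args)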
def main(args):
    # configure logger, disable logging in child MPI processes (with rank > 0)
    arg_parser = common_arg_parser()
    args, unknown_args = arg_parser.parse_known_args(args)
    extra_args = parse_cmdline_kwargs(unknown_args)
    print(args)

    if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
        #rank = 0
        #logger.configure()
        #logger.configure(dir=extra_args['logdir'])
        rank = 0
        configure_logger(args.log_path)
    else:
        rank = MPI.COMM_WORLD.Get_rank()
        configure_logger(args.log_path, format_strs=[])

    model, env = train(args, extra_args)

    if args.save_path is not None and rank == 0:
        save_path = osp.expanduser(args.save_path)
        model.save(save_path)

    if args.play:
        logger.log("Running trained model")
        obs = env.reset()

        state = model.initial_state if hasattr(model, 'initial_state') else None
        dones = np.zeros((1,))

        episode_rew = 0
        while True:
            if state is not None:
                actions, _, state, _ = model.step(obs, S=state, M=dones)
            else:
                actions, _, _, _ = model.step(obs)

            obs, rew, done, _ = env.step(actions)
            episode_rew += rew[0] if isinstance(env, VecEnv) else rew
            env.render()
            done = done.any() if isinstance(done, np.ndarray) else done
            if done:
                print('episode_rew={}'.format(episode_rew))
                episode_rew = 0
                obs = env.reset()

    env.close()
    return model
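# Hedged usage sketch (not part of the original script): with the
# parse_known_args(args) fix above, main() can be driven with an argv-style list.
# The flags shown (--env, --alg, --num_timesteps) are standard in baselines'
# common_arg_parser; the environment id and values are illustrative assumptions.
def _example_main_invocation():
    return main(['--env', 'CartPole-v1', '--alg', 'ppo2', '--num_timesteps', '1e4'])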
def nm_main(env, env_type, seed, alg, num_timesteps, network, gamestate, num_env,
            reward_scale, save_path, save_video_interval, save_video_length, play,
            log_path, env_args, alg_args):
    # Pack the keyword arguments into a baselines-style args namespace.
    bl_args = bl_arg_class()
    bl_args.env = env
    bl_args.env_type = env_type
    bl_args.seed = seed
    bl_args.alg = alg
    bl_args.num_timesteps = num_timesteps
    bl_args.network = network
    bl_args.gamestate = gamestate
    bl_args.num_env = num_env
    bl_args.reward_scale = reward_scale
    bl_args.save_path = save_path
    bl_args.save_video_interval = save_video_interval
    bl_args.save_video_length = save_video_length
    bl_args.log_path = log_path
    bl_args.play = play
    bl_args.env_args = env_args

    if MPI is None or MPI.COMM_WORLD.Get_rank() == 0:
        rank = 0
        blr.configure_logger(bl_args.log_path)
    else:
        rank = MPI.COMM_WORLD.Get_rank()
        blr.configure_logger(bl_args.log_path, format_strs=[])

    model, env = blr.train(bl_args, alg_args)

    #if save_path is not None and rank == 0:
    #    save_path = os.path.expanduser(save_path)
    #    model.save(save_path)

    env.close()
    print('\nThe very, very, very last line...\n')
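# Hedged usage sketch (not part of the original script): nm_main only repackages
# its keyword arguments into a bl_arg_class namespace before calling blr.train,
# so a call mirrors the usual baselines command-line flags. The environment id,
# algorithm, and values below are illustrative assumptions; env_args and alg_args
# depend on what blr.train expects in this repository.
def _example_nm_main_invocation():
    nm_main(
        env='CartPole-v1', env_type='classic_control', seed=0, alg='ppo2',
        num_timesteps=1e5, network='mlp', gamestate=None, num_env=1,
        reward_scale=1.0, save_path=None, save_video_interval=0,
        save_video_length=200, play=False, log_path='/tmp/nm_main_log',
        env_args=None, alg_args={},
    )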
def main(args): # print("\n\n\n\n\nXXX") # print(sys.path) # import baselines # print(baselines.__file__()) # for varname in ['PMI_RANK', 'OMPI_COMM_WORLD_RANK']: # if varname in os.environ: # print(varname, int(os.environ[varname])) # print("parsing args...") arg_parser = init_arg_parser() args, unknown_args = arg_parser.parse_known_args(args) # if args.num_cpu > 1: if args.allow_run_as_root: whoami = mpi_fork_run_as_root(args.num_cpu, bind_to_core=args.bind_to_core) else: whoami = mpi_fork(args.num_cpu, bind_to_core=args.bind_to_core) if whoami == 'parent': print('parent exiting with code 0...') sys.exit(0) U.single_threaded_session().__enter__() rank = MPI.COMM_WORLD.Get_rank() # assert MPI.COMM_WORLD.Get_size() == args.num_cpu, MPI.COMM_WORLD.Get_size() # configure logger # rank = MPI.COMM_WORLD.Get_rank() # FIXME: how to log when rank != 0?? # if rank == 0: configure_logger(args.log_path, format_strs=[]) logger.info(f"main: {rank} / {MPI.COMM_WORLD.Get_size()}") logger.info(f"logger dir: {logger.get_dir()}") extra_args = parse_cmdline_kwargs(unknown_args) logger.info(args, extra_args) # else: # configure_logger(log_path=None) # or still args.log_path? # raise RuntimeError(f"tf session: {tf.get_default_session()}, {MPI.COMM_WORLD.Get_rank()} / {MPI.COMM_WORLD.Get_size()}") def make_wrapped_env(): env = gym.make(args.env) if args.env_type == 'maze': pass elif args.env_type == 'robotics': from baselines.envs.goal_sampler_env_wrapper import GoalSamplerEnvWrapper env = GoalSamplerEnvWrapper(env) elif args.env_type == 'ant': env = GoalExplorationEnv(env=env, only_feasible=True, extend_dist_rew=0, inner_weight=0, goal_weight=1) else: raise NotImplementedError(args.env_type) # FIXME: if resample space is feasible, can set only_feasible = False to avoid unnecessary computation return env venv_kwargs = dict( make_wrapped_env=make_wrapped_env, seed=args.seed, reward_scale=args.reward_scale, flatten_dict_observations=False, mpi_rank=rank, monitor_log_dir=args.log_path, # FIXME ) venv = make_vec_env(num_env=args.num_env, **venv_kwargs) eval_venv = make_vec_env(num_env=args.num_env, **venv_kwargs) if args.debug: plotter_venv = make_vec_env(num_env=1, **venv_kwargs) else: plotter_venv = None # Seed everything. rank_seed = args.seed + 1000000 * rank if args.seed is not None else None set_global_seeds(rank_seed) logger.info(f'setting global rank: {rank_seed} ') # Prepare params. params = dict() params.update(config.DEFAULT_PARAMS) params.update(config.DEFAULT_ENV_PARAMS[args.env]) params.update(**extra_args) # makes it possible to override any parameter # if args.debug: # params['n_cycles'] = 2 # params['n_batches'] = 2 # params['ve_n_batches'] = 2 # params['size_ensemble'] = 2 # env settings params['env_name'] = args.env params['num_cpu'] = args.num_cpu params['rollout_batch_size'] = args.num_env params['timesteps_per_cpu'] = int(args.num_timesteps) with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f: json.dump(params, f) params['make_env'] = make_wrapped_env learn_fun_return = learn( venv=venv, eval_venv=eval_venv, plotter_venv=plotter_venv, params=params, save_path=args.log_path, save_interval=args.save_interval, ) if rank == 0: save_path = os.path.expanduser(logger.get_dir()) for k, v in learn_fun_return.items(): v.save(os.path.join(save_path, f"final-{k}.joblib")) venv.close() eval_venv.close() if plotter_venv is not None: plotter_venv.close()