def train(logdir, env, expert_path, seed, batch_size, lr, traj_limitation):
    env_id = env
    logdir = logdir + '/bc/' + env_id + '/s-{}/l-{}-b-{}/seed-{}'.format(
        traj_limitation, lr, batch_size, seed)
    print(logdir, env, expert_path, seed)
    logger.configure(logdir, format_strs=['stdout', 'log', 'json', 'tensorboard'])
    # Load expert trajectories for behavior cloning.
    expert = MADataSet(expert_path, ret_threshold=-10, traj_limitation=traj_limitation)

    def create_env(rank):
        def _thunk():
            env = make_env.make_env(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(env,
                                logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
                                allow_early_resets=True)
            gym.logger.setLevel(logging.WARN)
            return env
        return _thunk

    # A single environment suffices here: behavior cloning needs no parallel rollouts.
    env = SubprocVecEnv([create_env(i) for i in range(1)], is_multi_agent=True)
    policy_fn = CategoricalPolicy
    learn(policy_fn, env, expert, seed, int(2e7), batch_size=batch_size, lr=lr)
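# Example invocation (hypothetical paths and hyperparameters; the scenario name
# and expert pickle are placeholders, not fixed by this code):
#
#   train(logdir='./results', env='simple_spread',
#         expert_path='./expert/simple_spread.pkl',
#         seed=1, batch_size=32, lr=1e-4, traj_limitation=200)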
def train(logdir, env_id, num_timesteps, lr, timesteps_per_batch, seed, num_cpu):
    def create_env(rank):
        def _thunk():
            env = make_env.make_env(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(env,
                                logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
                                allow_early_resets=True)
            gym.logger.setLevel(logging.WARN)
            return env
        return _thunk

    logger.configure(logdir, format_strs=['stdout', 'log', 'json', 'tensorboard'])
    set_global_seeds(seed)
    env = SubprocVecEnv([create_env(i) for i in range(num_cpu)], is_multi_agent=True)
    policy_fn = CategoricalPolicy
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1),
          nprocs=num_cpu, nsteps=timesteps_per_batch // num_cpu, lr=lr,
          ent_coef=0.00, identical=make_env.get_identical(env_id))
    env.close()
def train(logdir, env_id, num_timesteps, lr, timesteps_per_batch, seed, num_cpu,
          expert_path, traj_limitation, ret_threshold, dis_lr,
          disc_type='decentralized', bc_iters=500, l2=0.1, d_iters=1,
          rew_scale=0.1):
    def create_env(rank):
        def _thunk():
            env = make_env.make_env(env_id)
            env.seed(seed + rank)
            env = bench.Monitor(env,
                                logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
                                allow_early_resets=True)
            gym.logger.setLevel(logging.WARN)
            return env
        return _thunk

    logger.configure(logdir, format_strs=['stdout', 'log', 'json', 'tensorboard'])
    set_global_seeds(seed)
    env = SubprocVecEnv([create_env(i) for i in range(num_cpu)], is_multi_agent=True)
    print(num_cpu)
    policy_fn = CategoricalPolicy
    # Expert demonstrations (nobs_flag=True also loads next observations).
    expert = MADataSet(expert_path, ret_threshold=ret_threshold,
                       traj_limitation=traj_limitation, nobs_flag=True)
    learn(policy_fn, expert, env, env_id, seed,
          total_timesteps=int(num_timesteps * 1.1), nprocs=num_cpu,
          nsteps=timesteps_per_batch // num_cpu, lr=lr, ent_coef=0.0,
          dis_lr=dis_lr, disc_type=disc_type, bc_iters=bc_iters,
          identical=make_env.get_identical(env_id), l2=l2, d_iters=d_iters,
          rew_scale=rew_scale)
    env.close()
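# Example invocation (hypothetical values; disc_type='decentralized' presumably
# trains one discriminator per agent, per the default above):
#
#   train(logdir='./results/gail', env_id='simple_spread', num_timesteps=1e7,
#         lr=1e-4, timesteps_per_batch=1000, seed=1, num_cpu=4,
#         expert_path='./expert/simple_spread.pkl', traj_limitation=200,
#         ret_threshold=-10, dis_lr=1e-4)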
def main():
    parser = mujoco_arg_parser()
    parser.add_argument('--cpu', type=int, default=1)
    parser.add_argument('--lr', type=float, default=3e-4)
    parser.add_argument('--batch', type=int, default=2048)
    args = parser.parse_args()
    logdir = './results/mappo/' + args.env + '/l-{}-b-{}/seed-{}'.format(
        args.lr, args.batch, args.seed)
    try:
        logger.configure(logdir, format_strs=['stdout', 'log', 'json', 'tensorboard'])
    except Exception:
        # Fall back to the default logger configuration if logdir is unusable.
        logger.configure()
    train(args.env, num_timesteps=1e7, seed=args.seed, num_cpu=args.cpu,
          batch=args.batch, lr=args.lr)
def train(logdir, env_id, lr, num_timesteps, seed, timesteps_per_batch, cont=False):
    from sandbox.ppo_sgd import mlp_policy
    from sandbox.ppo_sgd import pposgd_simple
    from rl import logger
    from rl.common import set_global_seeds, tf_util as U
    from rl import bench
    from gym.envs.registration import register
    import multiagent
    import make_env

    logger.configure(logdir, format_strs=['log', 'json', 'tensorboard'])
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    env = make_env.make_env(env_id)

    def policy_fn(name, ob_space, ac_space, id):
        pi = mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                  hid_size=64, num_hid_layers=2, id=id)
        return pi

    env = bench.Monitor(
        env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_batch=timesteps_per_batch,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=lr, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear', cont=cont)
    env.close()
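# Example invocation (hypothetical scenario name and hyperparameters; runs a
# single-process PPO session with the clip/GAE defaults hard-coded above):
#
#   train(logdir='./results/ppo', env_id='simple_spread', lr=3e-4,
#         num_timesteps=int(1e7), seed=0, timesteps_per_batch=2048)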
def train(logdir, env_id, num_timesteps, lr, timesteps_per_batch, seed, num_cpu,
          max_episode_len):
    def create_env(rank):
        def _thunk():
            env = make_env.make_env(env_id, max_episode_len=max_episode_len)
            env.discrete_action_input = True
            env.seed(seed + rank)
            env = bench.Monitor(env,
                                logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
                                allow_early_resets=True)
            gym.logger.setLevel(logging.WARN)
            return env
        return _thunk

    logger.configure(logdir, format_strs=['json'])
    set_global_seeds(seed)
    env = SubprocVecEnv([create_env(i) for i in range(num_cpu)], is_multi_agent=True)
    policy_fn = CategoricalPolicy
    learn(policy_fn, env, seed, total_timesteps=int(num_timesteps * 1.1),
          nprocs=num_cpu, nsteps=timesteps_per_batch // num_cpu, lr=lr,
          ent_coef=0.00, identical=make_env.get_identical(env_id),
          log_interval=50, save_interval=int(num_timesteps / timesteps_per_batch),
          max_episode_len=max_episode_len)
    logger.Logger.CURRENT.close()
    env.close()
def train(logdir, env, expert_path, seed, max_episode_len):
    print(logdir, env, expert_path, seed, max_episode_len)
    logger.configure(logdir, format_strs=['stdout', 'log', 'json', 'tensorboard'])
    expert = MADataSet(expert_path, ret_threshold=-10, traj_limitation=200)
    env_id = env

    def create_env(rank):
        def _thunk():
            env = make_env.make_env(env_id, max_episode_len=max_episode_len)
            env.seed(seed + rank)
            env = bench.Monitor(env,
                                logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
                                allow_early_resets=True)
            gym.logger.setLevel(logging.WARN)
            return env
        return _thunk

    env = SubprocVecEnv([create_env(i) for i in range(1)], is_multi_agent=True)
    policy_fn = CategoricalPolicy
    learn(policy_fn, env, expert, seed)
def train_trpo(game, num_timesteps, eval_episodes, seed, horizon, out_dir='.',
               load_path=None, checkpoint_path_in=None, gamma=0.99,
               timesteps_per_batch=500, num_layers=0, num_hidden=32,
               checkpoint_freq=20, max_kl=0.01):
    start_time = time.time()
    clip = None
    dir = 'game'
    game_params = {}

    # Accept a custom grid if the environment requires it
    # (note: `args.grid` refers to the module-level CLI arguments).
    if game == 'Taxi' or game == 'TaxiEasy':
        game_params['grid'] = args.grid
        game_params['box'] = True
    if game in ['RaceStrategy-v0', 'Cliff-v0']:
        game_params['horizon'] = horizon

    # env = Race(gamma=gamma, horizon=horizon)
    # env_eval = Race(gamma=gamma, horizon=horizon)
    env = make_game(game, game_params)
    env_eval = make_game(game, game_params)

    directory_output = (dir + '/trpo_' + str(num_layers) + '_' + str(num_hidden)
                        + '_' + str(max_kl) + '/')

    def eval_policy_closure(**kwargs):
        return eval_policy(env=env_eval, gamma=gamma, **kwargs)

    tf.set_random_seed(seed)
    sess = U.single_threaded_session()
    sess.__enter__()

    # Only MPI rank 0 writes logs; the other workers stay silent.
    rank = MPI.COMM_WORLD.Get_rank()
    time_str = str(start_time)
    if rank == 0:
        logger.configure(dir=out_dir + '/' + directory_output + '/logs',
                         format_strs=['stdout', 'csv'], suffix=time_str)
    else:
        logger.configure(format_strs=[])
        logger.set_level(logger.DISABLED)

    network = mlp(num_hidden=num_hidden, num_layers=num_layers)
    optimized_policy = trpo_mpi.learn(
        network=network, env=env, eval_policy=eval_policy_closure,
        timesteps_per_batch=timesteps_per_batch, max_kl=max_kl,
        cg_iters=10, cg_damping=1e-3, total_timesteps=num_timesteps,
        gamma=gamma, lam=1.0, vf_iters=3, vf_stepsize=1e-4,
        checkpoint_freq=checkpoint_freq,
        checkpoint_dir_out=out_dir + '/' + directory_output + '/models/' + time_str + '/',
        load_path=load_path, checkpoint_path_in=checkpoint_path_in,
        eval_episodes=eval_episodes, init_std=1,
        trainable_variance=True, trainable_bias=True, clip=clip)

    # Sweep a 1-D grid over the state space and plot the learned action at
    # each state (assumes a 1-D observation; env.dim[0] is the upper bound).
    env.reset()
    states = []
    actions = []
    s = 0
    delta_state = 0.2
    while s < env.dim[0]:
        a, _, _, _ = optimized_policy.step([s])
        states.append(s)
        actions.append(a[0])
        s += delta_state
    env.reset()

    plt.plot(states, actions)
    plt.show()

    elapsed = time.time() - start_time
    print("Time taken: %f sec" % elapsed)
    print("Time taken: %f hours" % (elapsed / 3600))
    env.close()
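# Example invocation (hypothetical hyperparameters; typically launched under
# MPI, e.g. `mpirun -np 4 python <script>.py`, since logging is rank-aware):
#
#   train_trpo(game='Cliff-v0', num_timesteps=int(1e6), eval_episodes=10,
#              seed=0, horizon=100, out_dir='./results')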