def _thunk():
    env = gym.make(env_id)
    env.seed(seed + rank)
    if logger.get_dir():
        env = bench.Monitor(env, os.path.join(logger.get_dir(), "{}.monitor.json".format(rank)))
    gym.logger.setLevel(logging.WARN)
    return wrap_deepmind(env)
def train(env_id, num_frames, seed):
    from baselines.ppo1 import pposgd_simple, cnn_policy
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):  # pylint: disable=W0613
        return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)

    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)
    env = wrap_train(env)
    num_timesteps = int(num_frames / 4 * 1.1)
    env.seed(workerseed)

    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_batch=256,
                        clip_param=0.2, entcoeff=0.01,
                        optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64,
                        gamma=0.99, lam=0.95,
                        schedule='linear')
    env.close()
def train(env_id, num_frames, seed):
    from baselines.trpo_mpi.nosharing_cnn_policy import CnnPolicy
    from baselines.trpo_mpi import trpo_mpi
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):  # pylint: disable=W0613
        return CnnPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space)

    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)
    env = wrap_train(env)
    num_timesteps = int(num_frames / 4 * 1.1)
    env.seed(workerseed)

    trpo_mpi.learn(env, policy_fn,
                   timesteps_per_batch=512, max_kl=0.001, cg_iters=10, cg_damping=1e-3,
                   max_timesteps=num_timesteps, gamma=0.98, lam=1.0,
                   vf_iters=3, vf_stepsize=1e-4, entcoeff=0.00)
    env.close()
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import pposgd_simple, cnn_policy
    import baselines.common.tf_util as U
    rank = MPI.COMM_WORLD.Get_rank()
    sess = U.single_threaded_session()
    sess.__enter__()
    if rank == 0:
        logger.configure()
    else:
        logger.configure(format_strs=[])
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank() if seed is not None else None
    set_global_seeds(workerseed)
    env = make_atari(env_id)

    def policy_fn(name, ob_space, ac_space):  # pylint: disable=W0613
        return cnn_policy.CnnPolicy(name=name, ob_space=ob_space, ac_space=ac_space)

    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), str(rank)))
    env.seed(workerseed)

    env = wrap_deepmind(env)
    env.seed(workerseed)

    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=int(num_timesteps * 1.1),
                        timesteps_per_actorbatch=256,
                        clip_param=0.2, entcoeff=0.01,
                        optim_epochs=4, optim_stepsize=1e-3, optim_batchsize=64,
                        gamma=0.99, lam=0.95,
                        schedule='linear')
    env.close()
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    reuse=reuse, hid_size=args.policy_hidden_size, num_hid_layers=2)

    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)
    dataset = Mujoco_Dset(expert_path=args.expert_path, traj_limitation=args.traj_limitation)
    savedir_fname = learn(env, policy_fn, dataset,
                          max_iters=args.BC_max_iter,
                          ckpt_dir=args.checkpoint_dir,
                          log_dir=args.log_dir,
                          task_name=task_name,
                          verbose=True)
    avg_len, avg_ret = runner(env, policy_fn, savedir_fname,
                              timesteps_per_batch=1024,
                              number_trajs=10,
                              stochastic_policy=args.stochastic_policy,
                              save=args.save_sample,
                              reuse=True)
def train(policy, rollout_worker, evaluator,
          n_epochs, n_test_rollouts, n_cycles, n_batches, policy_save_interval,
          save_policies, **kwargs):
    rank = MPI.COMM_WORLD.Get_rank()

    latest_policy_path = os.path.join(logger.get_dir(), 'policy_latest.pkl')
    best_policy_path = os.path.join(logger.get_dir(), 'policy_best.pkl')
    periodic_policy_path = os.path.join(logger.get_dir(), 'policy_{}.pkl')

    logger.info("Training...")
    best_success_rate = -1
    for epoch in range(n_epochs):
        # train
        rollout_worker.clear_history()
        for _ in range(n_cycles):
            episode = rollout_worker.generate_rollouts()
            policy.store_episode(episode)
            for _ in range(n_batches):
                policy.train()
            policy.update_target_net()

        # test
        evaluator.clear_history()
        for _ in range(n_test_rollouts):
            evaluator.generate_rollouts()

        # record logs
        logger.record_tabular('epoch', epoch)
        for key, val in evaluator.logs('test'):
            logger.record_tabular(key, mpi_average(val))
        for key, val in rollout_worker.logs('train'):
            logger.record_tabular(key, mpi_average(val))
        for key, val in policy.logs():
            logger.record_tabular(key, mpi_average(val))

        if rank == 0:
            logger.dump_tabular()

        # save the policy if it's better than the previous ones
        success_rate = mpi_average(evaluator.current_success_rate())
        if rank == 0 and success_rate >= best_success_rate and save_policies:
            best_success_rate = success_rate
            logger.info('New best success rate: {}. Saving policy to {} ...'.format(best_success_rate, best_policy_path))
            evaluator.save_policy(best_policy_path)
            evaluator.save_policy(latest_policy_path)
        if rank == 0 and policy_save_interval > 0 and epoch % policy_save_interval == 0 and save_policies:
            policy_path = periodic_policy_path.format(epoch)
            logger.info('Saving periodic policy to {} ...'.format(policy_path))
            evaluator.save_policy(policy_path)

        # make sure that different threads have different seeds
        local_uniform = np.random.uniform(size=(1,))
        root_uniform = local_uniform.copy()
        MPI.COMM_WORLD.Bcast(root_uniform, root=0)
        if rank != 0:
            assert local_uniform[0] != root_uniform[0]
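# The logging loop above assumes an mpi_average helper that reduces per-worker
# scalars across MPI ranks before they are written to the logger. A minimal
# sketch of such a helper (assumption: mpi4py is available; this is not
# necessarily the exact implementation used alongside the function above):
import numpy as np
from mpi4py import MPI

def mpi_average(value):
    """Average a scalar or list of scalars over all MPI workers."""
    vals = np.asarray(value if isinstance(value, (list, tuple)) else [value], dtype=np.float64)
    local = np.array([vals.sum(), vals.size], dtype=np.float64)
    total = np.zeros_like(local)
    MPI.COMM_WORLD.Allreduce(local, total, op=MPI.SUM)
    return total[0] / max(total[1], 1.0)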
def _thunk():
    if env_id == "TestEnv":
        env = TestEnv(renderer=renderer)  # gym.make(env_id)
    else:
        env = gym.make(env_id)
    env.seed(seed + rank)
    env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    gym.logger.setLevel(logging.WARN)
    # only clip rewards when not evaluating
    return env
def make_robotics_env(env_id, seed, rank=0):
    """
    Create a wrapped, monitored gym.Env for MuJoCo.
    """
    set_global_seeds(seed)
    env = gym.make(env_id)
    env = FlattenDictWrapper(env, ['observation', 'desired_goal'])
    env = Monitor(
        env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
        info_keywords=('is_success',))
    env.seed(seed)
    return env
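# Hypothetical usage sketch for the helper above (environment id and seed are
# illustrative, not taken from the original code): FlattenDictWrapper turns the
# dict observation into a flat vector, and the Monitor records 'is_success' from
# the info dict into the monitor file under logger.get_dir().
env = make_robotics_env('FetchReach-v1', seed=0, rank=0)
obs = env.reset()
obs, reward, done, info = env.step(env.action_space.sample())
env.close()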
def make_mujoco_env(env_id, seed, reward_scale=1.0):
    """
    Create a wrapped, monitored gym.Env for MuJoCo.
    """
    rank = MPI.COMM_WORLD.Get_rank()
    myseed = seed + 1000 * rank if seed is not None else None
    set_global_seeds(myseed)
    env = gym.make(env_id)
    logger_path = None if logger.get_dir() is None else os.path.join(logger.get_dir(), str(rank))
    env = Monitor(env, logger_path, allow_early_resets=True)
    env.seed(seed)
    if reward_scale != 1.0:
        from baselines.common.retro_wrappers import RewardScaler
        env = RewardScaler(env, reward_scale)
    return env
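# Hypothetical usage sketch (env id, seed and scale are illustrative): with
# reward_scale != 1.0 the returned env multiplies every reward by the given
# factor via RewardScaler, which matters for algorithms that are sensitive to
# reward magnitude.
env = make_mujoco_env('HalfCheetah-v2', seed=0, reward_scale=0.1)
obs = env.reset()
obs, reward, done, info = env.step(env.action_space.sample())
env.close()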
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)

    def policy_fn(name, ob_space, ac_space, reuse=False):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    reuse=reuse, hid_size=args.policy_hidden_size, num_hid_layers=2)

    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)

    if args.task == 'train':
        dataset = Mujoco_Dset(expert_path=args.expert_path, traj_limitation=args.traj_limitation)
        reward_giver = TransitionClassifier(env, args.adversary_hidden_size, entcoeff=args.adversary_entcoeff)
        train(env, args.seed, policy_fn, reward_giver, dataset, args.algo,
              args.g_step, args.d_step, args.policy_entcoeff, args.num_timesteps,
              args.save_per_iter, args.checkpoint_dir, args.log_dir,
              args.pretrained, args.BC_max_iter, task_name)
    elif args.task == 'evaluate':
        runner(env, policy_fn, args.load_model_path,
               timesteps_per_batch=1024,
               number_trajs=10,
               stochastic_policy=args.stochastic_policy,
               save=args.save_sample)
    else:
        raise NotImplementedError
    env.close()
def learn(policy, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval=1, nprocs=32, nsteps=20,
          ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0, lr=0.25, max_grad_norm=0.5,
          kfac_clip=0.001, save_interval=None, lrschedule='linear'):
    tf.reset_default_graph()
    set_global_seeds(seed)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    make_model = lambda: Model(policy, ob_space, ac_space, nenvs, total_timesteps, nprocs=nprocs,
                               nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                               vf_fisher_coef=vf_fisher_coef, lr=lr, max_grad_norm=max_grad_norm,
                               kfac_clip=kfac_clip, lrschedule=lrschedule)
    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()

    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
    nbatch = nenvs * nsteps
    tstart = time.time()
    coord = tf.train.Coordinator()
    enqueue_threads = model.q_runner.create_threads(model.sess, coord=coord, start=True)
    for update in range(1, total_timesteps // nbatch + 1):
        obs, states, rewards, masks, actions, values = runner.run()
        policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
        model.old_obs = obs
        nseconds = time.time() - tstart
        fps = int((update * nbatch) / nseconds)
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("policy_loss", float(policy_loss))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.dump_tabular()

        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
            savepath = osp.join(logger.get_dir(), 'checkpoint%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)
    coord.request_stop()
    coord.join(enqueue_threads)
    env.close()
def main():
    logger.configure()
    env = make_atari('PongNoFrameskip-v4')
    env = bench.Monitor(env, logger.get_dir())
    env = deepq.wrap_atari_dqn(env)

    model = deepq.learn(
        env,
        "conv_only",
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        dueling=True,
        lr=1e-4,
        total_timesteps=int(1e7),
        buffer_size=10000,
        exploration_fraction=0.1,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000,
        gamma=0.99,
    )

    model.save('pong_model.pkl')
    env.close()
def train(args, extra_args):
    env_type, env_id = get_env_type(args)
    print('env_type: {}'.format(env_type))

    total_timesteps = int(args.num_timesteps)
    seed = args.seed

    learn = get_learn_function(args.alg)
    alg_kwargs = get_learn_function_defaults(args.alg, env_type)
    alg_kwargs.update(extra_args)

    env = build_env(args)
    if args.save_video_interval != 0:
        env = VecVideoRecorder(env, osp.join(logger.get_dir(), "videos"),
                               record_video_trigger=lambda x: x % args.save_video_interval == 0,
                               video_length=args.save_video_length)

    if args.network:
        alg_kwargs['network'] = args.network
    else:
        if alg_kwargs.get('network') is None:
            alg_kwargs['network'] = get_default_network(env_type)

    print('Training {} on {}:{} with arguments \n{}'.format(args.alg, env_type, env_id, alg_kwargs))

    model = learn(
        env=env,
        seed=seed,
        total_timesteps=total_timesteps,
        **alg_kwargs
    )

    return model, env
def make_vec_env(env_id, env_type, num_env, seed,
                 wrapper_kwargs=None,
                 start_index=0,
                 reward_scale=1.0,
                 flatten_dict_observations=True,
                 gamestate=None):
    """
    Create a wrapped, monitored SubprocVecEnv for Atari and MuJoCo.
    """
    wrapper_kwargs = wrapper_kwargs or {}
    mpi_rank = MPI.COMM_WORLD.Get_rank() if MPI else 0
    seed = seed + 10000 * mpi_rank if seed is not None else None
    logger_dir = logger.get_dir()

    def make_thunk(rank):
        return lambda: make_env(
            env_id=env_id,
            env_type=env_type,
            mpi_rank=mpi_rank,
            subrank=rank,
            seed=seed,
            reward_scale=reward_scale,
            gamestate=gamestate,
            flatten_dict_observations=flatten_dict_observations,
            wrapper_kwargs=wrapper_kwargs,
            logger_dir=logger_dir
        )

    set_global_seeds(seed)
    if num_env > 1:
        return SubprocVecEnv([make_thunk(i + start_index) for i in range(num_env)])
    else:
        return DummyVecEnv([make_thunk(start_index)])
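# Hypothetical usage sketch (ids and counts are illustrative): build four Atari
# workers; with num_env > 1 they run in subprocesses (SubprocVecEnv) and are
# stepped in lockstep, each writing its own monitor file under logger.get_dir().
venv = make_vec_env('BreakoutNoFrameskip-v4', 'atari', num_env=4, seed=0)
obs = venv.reset()  # batched: one observation per environment copy
actions = [venv.action_space.sample() for _ in range(venv.num_envs)]
obs, rewards, dones, infos = venv.step(actions)
venv.close()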
def make_mujoco_env(env_id, seed):
    """
    Create a wrapped, monitored gym.Env for MuJoCo.
    """
    set_global_seeds(seed)
    env = gym.make(env_id)
    env = Monitor(env, logger.get_dir())
    env.seed(seed)
    return env
def make_env(subrank=None):
    env = gym.make(env_name)
    if subrank is not None and logger.get_dir() is not None:
        try:
            from mpi4py import MPI
            mpi_rank = MPI.COMM_WORLD.Get_rank()
        except ImportError:
            MPI = None
            mpi_rank = 0
            logger.warn('Running with a single MPI process. This should work, but the results may differ from the ones published in Plappert et al.')

        max_episode_steps = env._max_episode_steps
        env = Monitor(env,
                      os.path.join(logger.get_dir(), str(mpi_rank) + '.' + str(subrank)),
                      allow_early_resets=True)
        # hack to re-expose _max_episode_steps (ideally should replace reliance on it downstream)
        env = gym.wrappers.TimeLimit(env, max_episode_steps=max_episode_steps)
    return env
def make_mujoco_env(env_id, seed):
    """
    Create a wrapped, monitored gym.Env for MuJoCo.
    """
    rank = MPI.COMM_WORLD.Get_rank()
    set_global_seeds(seed + 10000 * rank)
    env = gym.make(env_id)
    env = Monitor(env, os.path.join(logger.get_dir(), str(rank)))
    env.seed(seed)
    return env
def train(env_id, num_timesteps, seed):
    from baselines.ppo1 import mlp_policy, pposgd_simple
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(seed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):
        return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                    hid_size=64, num_hid_layers=2)

    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)
    pposgd_simple.learn(env, policy_fn,
                        max_timesteps=num_timesteps,
                        timesteps_per_batch=2048,
                        clip_param=0.2, entcoeff=0.0,
                        optim_epochs=10, optim_stepsize=3e-4, optim_batchsize=64,
                        gamma=0.99, lam=0.95, schedule='linear',
                        )
    env.close()
def train(env_id, num_timesteps, seed):
    env = gym.make(env_id)
    if logger.get_dir():
        env = bench.Monitor(env, os.path.join(logger.get_dir(), "monitor.json"))
    set_global_seeds(seed)
    env.seed(seed)
    gym.logger.setLevel(logging.WARN)

    with tf.Session(config=tf.ConfigProto()) as session:
        ob_dim = env.observation_space.shape[0]
        ac_dim = env.action_space.shape[0]
        with tf.variable_scope("vf"):
            vf = NeuralNetValueFunction(ob_dim, ac_dim)
        with tf.variable_scope("pi"):
            policy = GaussianMlpPolicy(ob_dim, ac_dim)

        learn(env, policy=policy, vf=vf,
              gamma=0.99, lam=0.97, timesteps_per_batch=2500,
              desired_kl=0.002,
              num_timesteps=num_timesteps, animate=False)

        env.close()
def train(env_id, num_timesteps, seed):
    import baselines.common.tf_util as U
    sess = U.single_threaded_session()
    sess.__enter__()

    rank = MPI.COMM_WORLD.Get_rank()
    if rank != 0:
        logger.set_level(logger.DISABLED)
    workerseed = seed + 10000 * MPI.COMM_WORLD.Get_rank()
    set_global_seeds(workerseed)
    env = gym.make(env_id)

    def policy_fn(name, ob_space, ac_space):
        return MlpPolicy(name=name, ob_space=env.observation_space, ac_space=env.action_space,
                         hid_size=32, num_hid_layers=2)

    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "%i.monitor.json" % rank))
    env.seed(workerseed)
    gym.logger.setLevel(logging.WARN)

    trpo_mpi.learn(env, policy_fn,
                   timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, cg_damping=0.1,
                   max_timesteps=num_timesteps, gamma=0.99, lam=0.98,
                   vf_iters=5, vf_stepsize=1e-3)
    env.close()
def save(self, path=None):
    """Save model to a pickle located at `path`"""
    if path is None:
        path = os.path.join(logger.get_dir(), "model.pkl")

    with tempfile.TemporaryDirectory() as td:
        save_state(os.path.join(td, "model"))
        arc_name = os.path.join(td, "packed.zip")
        with zipfile.ZipFile(arc_name, 'w') as zipf:
            for root, dirs, files in os.walk(td):
                for fname in files:
                    file_path = os.path.join(root, fname)
                    if file_path != arc_name:
                        zipf.write(file_path, os.path.relpath(file_path, td))
        with open(arc_name, "rb") as f:
            model_data = f.read()
    with open(path, "wb") as f:
        cloudpickle.dump((model_data, self._act_params), f)
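# For reference, a minimal sketch of the inverse operation (assumption: the
# pickle layout written by save() above; load_saved_model is a hypothetical
# helper, not part of the original code):
import io
import os
import zipfile
import cloudpickle

def load_saved_model(path, extract_dir):
    """Unpack (model_data, act_params) as written by save() above."""
    with open(path, "rb") as f:
        model_data, act_params = cloudpickle.load(f)
    # model_data holds the bytes of the packed.zip archive; restore the checkpoint files
    with zipfile.ZipFile(io.BytesIO(model_data), 'r') as zipf:
        zipf.extractall(extract_dir)
    return os.path.join(extract_dir, "model"), act_params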
def main():
    parser = argparse.ArgumentParser(formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument('--env', help='environment ID', default='BreakoutNoFrameskip-v4')
    parser.add_argument('--seed', help='RNG seed', type=int, default=0)
    parser.add_argument('--prioritized', type=int, default=1)
    parser.add_argument('--prioritized-replay-alpha', type=float, default=0.6)
    parser.add_argument('--dueling', type=int, default=1)
    parser.add_argument('--num-timesteps', type=int, default=int(10e6))
    parser.add_argument('--checkpoint-freq', type=int, default=10000)
    parser.add_argument('--checkpoint-path', type=str, default=None)

    args = parser.parse_args()
    logger.configure()
    set_global_seeds(args.seed)
    env = make_atari(args.env)
    env = bench.Monitor(env, logger.get_dir())
    env = deepq.wrap_atari_dqn(env)

    model = deepq.models.cnn_to_mlp(
        convs=[(32, 8, 4), (64, 4, 2), (64, 3, 1)],
        hiddens=[256],
        dueling=bool(args.dueling),
    )

    deepq.learn(
        env,
        q_func=model,
        lr=1e-4,
        max_timesteps=args.num_timesteps,
        buffer_size=10000,
        exploration_fraction=0.1,
        exploration_final_eps=0.01,
        train_freq=4,
        learning_starts=10000,
        target_network_update_freq=1000,
        gamma=0.99,
        prioritized_replay=bool(args.prioritized),
        prioritized_replay_alpha=args.prioritized_replay_alpha,
        checkpoint_freq=args.checkpoint_freq,
        checkpoint_path=args.checkpoint_path,
    )

    env.close()
def main():
    logger.configure()
    parser = mujoco_arg_parser()
    parser.add_argument('--model-path', default=os.path.join(logger.get_dir(), 'humanoid_policy'))
    parser.set_defaults(num_timesteps=int(2e7))

    args = parser.parse_args()

    if not args.play:
        # train the model
        train(num_timesteps=args.num_timesteps, seed=args.seed, model_path=args.model_path)
    else:
        # construct the model object, load pre-trained model and render
        pi = train(num_timesteps=1, seed=args.seed)
        U.load_state(args.model_path)
        env = make_mujoco_env('Humanoid-v2', seed=0)

        ob = env.reset()
        while True:
            action = pi.act(stochastic=False, ob=ob)[0]
            ob, _, done, _ = env.step(action)
            env.render()
            if done:
                ob = env.reset()
def main():
    parser = arg_parser()
    add_env_params(parser)
    parser.add_argument('--num-timesteps', type=int, default=int(1e12))
    parser.add_argument('--num_env', type=int, default=16)
    parser.add_argument('--use_news', type=int, default=0)
    parser.add_argument('--gamma', type=float, default=0.99)
    parser.add_argument('--gamma_ext', type=float, default=0.999)
    parser.add_argument('--lam', type=float, default=0.95)
    parser.add_argument('--update_ob_stats_every_step', type=int, default=0)
    parser.add_argument('--update_ob_stats_independently_per_gpu', type=int, default=0)
    parser.add_argument('--update_ob_stats_from_random_agent', type=int, default=1)
    parser.add_argument('--proportion_of_exp_used_for_predictor_update', type=float, default=1.)
    parser.add_argument('--tag', type=str, default='')
    parser.add_argument('--policy', type=str, default='rnn', choices=['cnn', 'rnn'])
    parser.add_argument('--int_coeff', type=float, default=1.)
    parser.add_argument('--ext_coeff', type=float, default=2.)
    parser.add_argument('--dynamics_bonus', type=int, default=0)

    args = parser.parse_args()
    logger.configure(dir=logger.get_dir(),
                     format_strs=['stdout', 'log', 'csv'] if MPI.COMM_WORLD.Get_rank() == 0 else [])
    if MPI.COMM_WORLD.Get_rank() == 0:
        with open(os.path.join(logger.get_dir(), 'experiment_tag.txt'), 'w') as f:
            f.write(args.tag)
        # shutil.copytree(os.path.dirname(os.path.abspath(__file__)), os.path.join(logger.get_dir(), 'code'))

    mpi_util.setup_mpi_gpus()

    seed = 10000 * args.seed + MPI.COMM_WORLD.Get_rank()
    set_global_seeds(seed)

    hps = dict(
        frame_stack=4,
        nminibatches=4,
        nepochs=4,
        lr=0.0001,
        max_grad_norm=0.0,
        use_news=args.use_news,
        gamma=args.gamma,
        gamma_ext=args.gamma_ext,
        max_episode_steps=args.max_episode_steps,
        lam=args.lam,
        update_ob_stats_every_step=args.update_ob_stats_every_step,
        update_ob_stats_independently_per_gpu=args.update_ob_stats_independently_per_gpu,
        update_ob_stats_from_random_agent=args.update_ob_stats_from_random_agent,
        proportion_of_exp_used_for_predictor_update=args.proportion_of_exp_used_for_predictor_update,
        policy=args.policy,
        int_coeff=args.int_coeff,
        ext_coeff=args.ext_coeff,
        dynamics_bonus=args.dynamics_bonus,
    )

    tf_util.make_session(make_default=True)
    print('Executing train from outside &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&')
    train(env_id=args.env, num_env=args.num_env, seed=seed,
          num_timesteps=args.num_timesteps, hps=hps,
          use_reward=args.use_reward, ep_path=args.ep_path, dmlab=args.dmlab)
def learn(*, network, env, total_timesteps, eval_env=None, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4,
          vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95,
          log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2,
          save_interval=0, load_path=None, model_fn=None, update_fn=None, init_fn=None,
          mpi_rank_weight=1, comm=None, **network_kwargs):
    '''
    Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347)

    Parameters:
    ----------

    network: policy network architecture. Either a string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small,
        conv_only - see baselines.common/models.py for the full list) specifying a standard network
        architecture, or a function that takes a tensorflow tensor as input and returns a tuple
        (output_tensor, extra_feed), where output_tensor is the last network layer output and extra_feed
        is None for feed-forward nets or a dictionary describing how to feed state into the network for
        recurrent nets. See common/models.py/lstm for more details on using recurrent nets in policies.

    env: baselines.common.vec_env.VecEnv  environment. Needs to be vectorized for parallel environment
        simulation. The environments produced by gym.make can be wrapped using
        baselines.common.vec_env.DummyVecEnv.

    nsteps: int  number of steps of the vectorized environment per update (i.e. batch size is
        nsteps * nenv where nenv is the number of environment copies simulated in parallel)

    total_timesteps: int  number of timesteps (i.e. number of actions taken in the environment)

    ent_coef: float  policy entropy coefficient in the optimization objective

    lr: float or function  learning rate, constant or a schedule function [0,1] -> R+ where 1 is the
        beginning of training and 0 is the end

    vf_coef: float  value function loss coefficient in the optimization objective

    max_grad_norm: float or None  gradient norm clipping coefficient

    gamma: float  discounting factor

    lam: float  advantage estimation discounting factor (lambda in the paper)

    log_interval: int  number of timesteps between logging events

    nminibatches: int  number of training minibatches per update. For recurrent policies, should be
        smaller than or equal to the number of environments run in parallel.

    noptepochs: int  number of training epochs per update

    cliprange: float or function  clipping range, constant or schedule function [0,1] -> R+ where 1 is
        the beginning of training and 0 is the end

    save_interval: int  number of timesteps between saving events

    load_path: str  path to load the model from

    **network_kwargs:  keyword arguments to the policy / network builder. See
        baselines.common/policies.py/build_policy and the arguments to a particular type of network.
        For instance, the 'mlp' network architecture has arguments num_hidden and num_layers.
    '''
    set_global_seeds(seed)

    if isinstance(lr, float):
        lr = constfn(lr)
    else:
        assert callable(lr)
    if isinstance(cliprange, float):
        cliprange = constfn(cliprange)
    else:
        assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    # Get the nb of env
    nenvs = env.num_envs

    # Get state_space and action_space
    ob_space_list = env.observation_space
    ac_space_list = env.action_space
    num_agent = len(ob_space_list)

    # Calculate the batch_size
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches
    is_mpi_root = (MPI is None or MPI.COMM_WORLD.Get_rank() == 0)

    # Instantiate the model object (that creates act_model and train_model)
    if model_fn is None:
        from interative_ppo.model import Model
        model_fn = Model
    from interative_ppo.act_model import ActModel

    model_list = []
    sample_model_list = []
    for i, i_obs_space in enumerate(ob_space_list):
        i_action_space = ac_space_list[i]
        print("network", network, network_kwargs)
        policy = build_policy(i_obs_space, i_action_space, network, **network_kwargs)
        model = model_fn(policy=policy, ob_space=i_obs_space, ac_space=i_action_space,
                         nbatch_act=nenvs, nbatch_train=nbatch_train,
                         nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef,
                         max_grad_norm=max_grad_norm, comm=comm,
                         mpi_rank_weight=mpi_rank_weight, model_index=i)
        model_list.append(model)

    g = tf.Graph()
    with g.as_default():
        sample_sess = tf.Session(graph=g)
        for i, i_obs_space in enumerate(ob_space_list):
            i_action_space = ac_space_list[i]
            policy = build_policy(i_obs_space, i_action_space, network, **network_kwargs)
            model = ActModel(policy=policy, ob_space=i_obs_space, ac_space=i_action_space,
                             nbatch_act=nenvs, sample_sess=sample_sess, model_index=i)
            sample_model_list.append(model)
    print("after_load")

    if load_path is not None:
        for i, i_load_path in enumerate(load_path):
            model_list[i].load(i_load_path)

    # Instantiate the runner object
    runner = Runner(env=env, model_list=model_list, sample_model_list=sample_model_list,
                    nsteps=nsteps, gamma=gamma, lam=lam)
    if eval_env is not None:
        eval_runner = Runner(env=eval_env, model_list=model_list, nsteps=nsteps, gamma=gamma, lam=lam)

    epinfobuf = [deque(maxlen=100) for _ in range(num_agent)]
    if eval_env is not None:
        eval_epinfobuf = deque(maxlen=100)

    if init_fn is not None:
        init_fn()

    # Start total timer
    tfirststart = time.perf_counter()

    nupdates = total_timesteps // nbatch
    interative_interval = 3
    interative_Away_Now = 50
    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0
        # Start timer
        tstart = time.perf_counter()
        frac = 1.0 - (update - 1.0) / nupdates
        # Calculate the learning rate
        lrnow = lr(frac)
        # Calculate the cliprange
        cliprangenow = cliprange(frac)

        # if update % log_interval == 0 and is_mpi_root: logger.info('Stepping environment...')

        # Get minibatch
        if update > interative_interval and update % interative_interval == 0:
            for iLoad in range(num_agent):
                # iStartSample = max(interative_interval, (update - interative_Away_Now) // 3 * 3)
                # iSample = np.random.choice(range(iStartSample, update, 3))
                iSample = 1
                sample_path = osp.join(logger.get_dir(), "checkpoints", "%s-%.5i" % (iLoad, iSample))
                oSample_Model = sample_model_list[iLoad]
                oSample_Model.load(sample_path)

        mblossvals = [[] for _ in range(num_agent)]
        for iTrainAgent in range(num_agent):
            obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run(iTrainAgent)  # pylint: disable=E0632
            if eval_env is not None:
                eval_obs, eval_returns, eval_masks, eval_actions, eval_values, eval_neglogpacs, eval_states, eval_epinfos = eval_runner.run()  # pylint: disable=E0632

            # if update % log_interval == 0 and is_mpi_root: logger.info('Done.')

            epinfobuf[iTrainAgent].extend(epinfos[iTrainAgent])
            if eval_env is not None:
                eval_epinfobuf.extend(eval_epinfos)

            # Here what we're going to do is for each minibatch calculate the loss and append it.
            if states[0] is None:  # nonrecurrent version
                # Index of each element of batch_size
                # Create the indices array
                inds = np.arange(nbatch)
                for _ in range(noptepochs):
                    # Randomize the indexes
                    np.random.shuffle(inds)
                    # 0 to batch_size with batch_train_size step
                    for start in range(0, nbatch, nbatch_train):
                        end = start + nbatch_train
                        mbinds = inds[start:end]
                        slices = (arr[mbinds] for arr in (obs[iTrainAgent], returns[iTrainAgent],
                                                          masks[iTrainAgent], actions[iTrainAgent],
                                                          values[iTrainAgent], neglogpacs[iTrainAgent]))
                        mblossvals[iTrainAgent].append(model_list[iTrainAgent].train(lrnow, cliprangenow, *slices))
            else:  # recurrent version TODO
                assert nenvs % nminibatches == 0
                envsperbatch = nenvs // nminibatches
                envinds = np.arange(nenvs)
                flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
                for _ in range(noptepochs):
                    np.random.shuffle(envinds)
                    for start in range(0, nenvs, envsperbatch):
                        end = start + envsperbatch
                        mbenvinds = envinds[start:end]
                        mbflatinds = flatinds[mbenvinds].ravel()
                        slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                        mbstates = states[mbenvinds]
                        mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates))

        # Feedforward --> get losses --> update
        lossvals = [np.mean(mblossvals[i], axis=0) for i in range(num_agent)]
        # End timer
        tnow = time.perf_counter()
        # Calculate the fps (frame per second)
        fps = int(nbatch / (tnow - tstart))

        if update_fn is not None:
            update_fn(update)

        if update % log_interval == 0 or update == 1:
            # Calculates if value function is a good predicator of the returns (ev > 1)
            # or if it's just worse than predicting nothing (ev =< 0)
            ev = explained_variance(values[0], returns[0])
            logger.logkv("misc/serial_timesteps", update * nsteps)
            logger.logkv("misc/nupdates", update)
            logger.logkv("misc/total_timesteps", update * nbatch)
            logger.logkv("fps", fps)
            logger.logkv("misc/explained_variance", float(ev))
            for i in range(num_agent):
                logger.logkv('eprewmean%s' % i, safemean([epinfo['r'] for epinfo in epinfobuf[i]]))
                logger.logkv('eplenmean%s' % i, safemean([epinfo['l'] for epinfo in epinfobuf[i]]))
            if eval_env is not None:
                logger.logkv('eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfobuf]))
                logger.logkv('eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfobuf]))
            logger.logkv('misc/time_elapsed', tnow - tfirststart)
            for i in range(num_agent):
                for (lossval, lossname) in zip(lossvals[i], model_list[i].loss_names):
                    logger.logkv('loss%s/' % i + lossname, lossval)
            logger.dumpkvs()

        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and is_mpi_root:
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            for iModel, model in enumerate(model_list):
                savepath = osp.join(checkdir, '%s-%.5i' % (iModel, update))
                print('Saving to', savepath)
                model.save(savepath)

    return model_list
def get_h5filename(self):
    filename = os.path.join(logger.get_dir(), 'videos_{}.hdf5'.format(MPI.COMM_WORLD.Get_rank()))
    return filename
def launch(env, logdir, n_epochs, num_cpu, seed, replay_strategy, policy_save_interval, clip_return,
           override_params={}, save_policies=True):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        try:
            whoami = mpi_fork(num_cpu, ['--bind-to', 'core'])
        except CalledProcessError:
            # fancy version of mpi call failed, try simple version
            whoami = mpi_fork(num_cpu)

        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env
    params['replay_strategy'] = replay_strategy
    if env in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    if num_cpu == 1:
        logger.warn()
        logger.warn('*** Warning ***')
        logger.warn(
            'You are running HER with just a single MPI worker. This will work, but the ' +
            'experiments that we report in Plappert et al. (2018, https://arxiv.org/abs/1802.09464) ' +
            'were obtained with --num_cpu 19. This makes a significant difference and if you ' +
            'are looking to reproduce those results, be aware of this. Please also refer to ' +
            'https://github.com/openai/baselines/issues/314 for further details.')
        logger.warn('****************')
        logger.warn()

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }

    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(rank_seed)

    train(logdir=logdir, policy=policy, rollout_worker=rollout_worker,
          evaluator=evaluator, n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'],
          n_cycles=params['n_cycles'], n_batches=params['n_batches'],
          policy_save_interval=policy_save_interval, save_policies=save_policies)
def _thunk():
    env = make_atari(env_id)
    env.seed(seed + rank)
    env = Monitor(env,
                  logger.get_dir() and os.path.join(logger.get_dir(), str(rank)),
                  allow_early_resets=True)
    return wrap_deepmind(env, **wrapper_kwargs)
parser.add_argument('--noise_type', type=str, default='ou_0.2')
boolean_flag(parser, 'evaluation', default=False)
args = parser.parse_args()

sess = U.single_threaded_session()
sess.__enter__()

# Configure things.
rank = MPI.COMM_WORLD.Get_rank()
if rank != 0:
    logger.set_level(logger.DISABLED)

# Create envs.
env = gym.make(str(args.environment))
logger.configure("/tmp/experiments/" + str(args.environment) + "/DDPG/")
env = bench.Monitor(env, logger.get_dir())

if args.evaluation and rank == 0:
    eval_env = gym.make(env_id)
    eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
    env = bench.Monitor(env, None)
else:
    eval_env = None

# gym.logger.setLevel(logging.WARN)
# if evaluation and rank==0:
#     eval_env = gym.make(env_id)
#     eval_env = bench.Monitor(eval_env, os.path.join(logger.get_dir(), 'gym_eval'))
#     env = bench.Monitor(env, None)

# Parse noise_type
action_noise = None
def main(args):
    U.make_session(num_cpu=1).__enter__()
    set_global_seeds(args.seed)
    env = gym.make(args.env_id)
    logger.configure()

    env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), "monitor.json"))
    env.seed(args.seed)
    gym.logger.setLevel(logging.WARN)
    task_name = get_task_name(args)
    args.checkpoint_dir = osp.join(args.checkpoint_dir, task_name)
    args.log_dir = osp.join(args.log_dir, task_name)

    if args.task == 'train':
        from baselines.gail import mlp_policy

        def policy_fn(name, ob_space, ac_space, reuse=False):
            return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                        reuse=reuse, hid_size=args.policy_hidden_size, num_hid_layers=2)

        dataset = Mujoco_Dset(expert_path=args.expert_path, traj_limitation=args.traj_limitation)
        if args.states_only:
            reward_giver = WeakTransitionClassifier(env, args.adversary_hidden_size, entcoeff=args.adversary_entcoeff)
        else:
            reward_giver = TransitionClassifier(env, args.adversary_hidden_size, entcoeff=args.adversary_entcoeff)
        train(env, args.seed, policy_fn, reward_giver, dataset, args.algo,
              args.g_step, args.d_step, args.policy_entcoeff, args.num_timesteps,
              args.save_per_iter, args.checkpoint_dir, args.log_dir,
              args.pretrained, args.BC_max_iter, task_name, args.states_only)
    elif args.task == 'evaluate':
        from baselines.gail import mlp_policy

        def policy_fn(name, ob_space, ac_space, reuse=False):
            return mlp_policy.MlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                        reuse=reuse, hid_size=args.policy_hidden_size, num_hid_layers=2)

        runner(env, policy_fn, args.load_model_path,
               timesteps_per_batch=1024,
               number_trajs=args.traj_limitation,
               stochastic_policy=args.stochastic_policy,
               save=args.save_sample)
    elif args.task == 'expert_train':
        from baselines.trpo_mpi import trpo_mpi as original_trpo
        from baselines.ppo1.mlp_policy import MlpPolicy as OriginalMlpPolicy

        def policy_fn(name, ob_space, ac_space, reuse=False):
            return OriginalMlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                     hid_size=args.policy_hidden_size, num_hid_layers=2)

        original_trpo.learn(env, policy_fn,
                            timesteps_per_batch=1024, max_kl=0.01, cg_iters=10, cg_damping=0.1,
                            max_timesteps=args.num_timesteps, gamma=0.99, lam=0.98,
                            vf_iters=5, vf_stepsize=1e-3)
        saver = tf.train.Saver()
        saver.save(tf.get_default_session(), args.save_model_path)
    elif args.task == 'expert_gen':
        from baselines.trpo_mpi import trpo_mpi as original_trpo
        from baselines.ppo1.mlp_policy import MlpPolicy as OriginalMlpPolicy

        def policy_fn(name, ob_space, ac_space, reuse=False):
            return OriginalMlpPolicy(name=name, ob_space=ob_space, ac_space=ac_space,
                                     hid_size=args.policy_hidden_size, num_hid_layers=2)

        runner(env, policy_fn, args.save_model_path,
               timesteps_per_batch=1024,
               number_trajs=args.traj_limitation,
               stochastic_policy=args.stochastic_policy,
               save=args.save_sample)
    else:
        raise NotImplementedError
    env.close()
def run_task(v):
    random.seed(v['seed'])
    np.random.seed(v['seed'])
    sampling_res = 2 if 'sampling_res' not in v.keys() else v['sampling_res']

    # Log performance of randomly initialized policy with FIXED goal [0.1, 0.1]
    logger.log("Initializing report and plot_policy_reward...")
    log_dir = logger.get_snapshot_dir()  # problem with logger module here!!
    report = HTMLReport(osp.join(log_dir, 'report.html'), images_per_row=4)

    report.add_header("{}".format(EXPERIMENT_TYPE))
    report.add_text(format_dict(v))

    if v['control_mode'] == 'linear':
        from sandbox.envs.maze.point_maze_env import PointMazeEnv
        inner_env = normalize(PointMazeEnv(maze_id=v['maze_id'], maze_size_scaling=v['maze_scaling'],
                                           control_mode=v['control_mode']))
        inner_env_test = normalize(PointMazeEnv(maze_id=v['maze_id'], maze_size_scaling=v['maze_scaling'],
                                                control_mode=v['control_mode']))
    elif v['control_mode'] == 'pos':
        from sandbox.envs.maze.point_maze_pos_env import PointMazeEnv
        inner_env = normalize(PointMazeEnv(maze_id=v['maze_id'], maze_size_scaling=v['maze_scaling'],
                                           control_mode=v['control_mode']))
        inner_env_test = normalize(PointMazeEnv(maze_id=v['maze_id'], maze_size_scaling=v['maze_scaling'],
                                                control_mode=v['control_mode']))

    uniform_goal_generator = UniformStateGenerator(state_size=v['goal_size'], bounds=v['goal_range'],
                                                   center=v['goal_center'])

    num_cpu = 1
    if num_cpu > 1:
        try:
            whoami = mpi_fork(num_cpu, ['--bind-to', 'core'])
            print("fancy call succeeded")
        except CalledProcessError:
            print("fancy version of mpi call failed, try simple version")
            whoami = mpi_fork(num_cpu)

        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()

    # Configure logging
    rank = MPI.COMM_WORLD.Get_rank()
    logdir = ''
    if rank == 0:
        if logdir or logger_b.get_dir() is None:
            logger_b.configure(dir=logdir)
    else:
        logger_b.configure()
    logdir = logger_b.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = v['seed'] + 1000000 * rank
    set_global_seeds(rank_seed)

    feasible_states = sample_unif_feas(inner_env, 10)
    if v['unif_starts']:
        starts = np.random.permutation(np.array(feasible_states))[:300]
    else:
        starts = np.array([[0, 0]])

    uniform_start_generator = UniformListStateGenerator(
        starts.tolist(), persistence=v['persistence'], with_replacement=v['with_replacement'],
    )

    # Prepare params.
    def make_env(inner_env=inner_env, terminal_eps=v['terminal_eps'], terminate_env=v['terminate_env']):
        return GoalStartExplorationEnv(
            env=inner_env, goal_generator=uniform_goal_generator,
            obs2goal_transform=lambda x: x[:v['goal_size']],
            start_generator=uniform_start_generator,
            obs2start_transform=lambda x: x[:v['goal_size']],
            terminal_eps=terminal_eps,
            distance_metric=v['distance_metric'],
            extend_dist_rew=v['extend_dist_rew'],
            only_feasible=v['only_feasible'],
            terminate_env=terminate_env,
            goal_weight=v['goal_weight'],
            inner_weight=0,
            append_goal_to_observation=False
        )

    env = make_env()
    test_env = make_env(inner_env=inner_env_test, terminal_eps=1., terminate_env=True)

    params = config.DEFAULT_PARAMS
    params['action_l2'] = v['action_l2']
    params['max_u'] = v['max_u']
    params['gamma'] = v['discount']
    params['env_name'] = 'FetchReach-v0'
    params['replay_strategy'] = v['replay_strategy']
    params['lr'] = v['lr']
    params['layers'] = v['layers']
    params['hidden'] = v['hidden']
    params['n_cycles'] = v['n_cycles']  # cycles per epoch
    params['n_batches'] = v['n_batches']  # training batches per cycle
    params['batch_size'] = v['batch_size']  # per mpi thread, measured in transitions and reduced to even multiple of chunk_length.
    params['n_test_rollouts'] = v['n_test_rollouts']  # changed from 10 to 3; number of test rollouts per epoch, each consists of rollout_batch_size rollouts
    # exploration
    params['random_eps'] = 0.3  # percentage of time a random action is taken
    params['noise_eps'] = v['action_noise']
    params['goal_weight'] = v['goal_weight']

    with open(os.path.join(logger_b.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)

    params['T'] = v['horizon']
    params['to_goal'] = v['to_goal']
    params['nearby_action_penalty'] = v['nearby_action_penalty']
    params['nearby_penalty_weight'] = v['nearby_penalty_weight']
    params['nearby_p'] = v['nearby_p']
    params['perturb_scale'] = v['perturb_scale']
    params['cells_apart'] = v['cells_apart']
    params['perturb_to_feasible'] = v['perturb_to_feasible']

    params['sample_expert'] = v['sample_expert']
    params['expert_batch_size'] = v['expert_batch_size']
    params['bc_loss'] = v['bc_loss']
    params['anneal_bc'] = v['anneal_bc']
    params['gail_weight'] = v['gail_weight']
    params['terminate_bootstrapping'] = v['terminate_bootstrapping']
    params['mask_q'] = int(v['mode'] == 'pure_bc')
    params['two_qs'] = v['two_qs']
    params['anneal_discriminator'] = v['anneal_discriminator']
    params['two_rs'] = v['two_qs'] or v['anneal_discriminator']
    params['with_termination'] = v['rollout_terminate']

    if 'clip_dis' in v and v['clip_dis']:
        params['dis_bound'] = v['clip_dis']

    params = config.prepare_params(params)
    params['make_env'] = make_env
    config.log_params(params, logger=logger_b)

    dims = config.configure_dims(params)

    # prepare GAIL
    if v['use_s_p']:
        discriminator = GAIL(dims['o'] + dims['o'] + dims['g'] if not v['only_s'] else dims['o'] + dims['g'],
                             dims['o'], dims['o'], dims['g'], 0., gail_loss=v['gail_reward'], use_s_p=True,
                             only_s=v['only_s'])
    else:
        discriminator = GAIL(dims['o'] + dims['u'] + dims['g'] if not v['only_s'] else dims['o'] + dims['g'],
                             dims['o'], dims['u'], dims['g'], 0., gail_loss=v['gail_reward'],
                             only_s=v['only_s'])

    params['discriminator'] = discriminator

    # configure replay buffer for expert buffer
    params_expert = {k: params[k] for k in ['make_env', 'replay_k', 'discriminator', 'gail_weight',
                                            'two_rs', 'with_termination']}
    params_expert['replay_strategy'] = 'future' if v['relabel_expert'] else 'none'
    params_expert['sample_g_first'] = v['relabel_expert'] and v['sample_g_first']
    params_expert['zero_action_p'] = v['zero_action_p']

    params_policy_buffer = {k: params[k] for k in ['make_env', 'replay_k', 'discriminator', 'gail_weight',
                                                   'two_rs', 'with_termination']}
    params_policy_buffer['replay_strategy'] = 'future'
    params_policy_buffer['sample_g_first'] = False

    policy = config.configure_ddpg(dims=dims, params=params, clip_return=v['clip_return'],
                                   reuse=tf.AUTO_REUSE, env=env)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': True,
        'T': params['T'],
        'weight': v['goal_weight'],
        'rollout_terminate': v['rollout_terminate']
    }

    expert_rollout_params = {
        'exploit': not v['noisy_expert'],
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
        'weight': v['goal_weight'],
        'rollout_terminate': v['rollout_terminate']
    }

    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        expert_rollout_params[name] = params[name]

    expert_rollout_params['noise_eps'] = v['expert_noise']
    expert_rollout_params['random_eps'] = v['expert_eps']

    rollout_worker = RolloutWorker([env], policy, dims, logger_b, **rollout_params)

    # prepare expert policy, rollout worker
    import joblib
    if v['expert_policy'] == 'planner':
        from sandbox.experiments.goals.maze.expert.maze_expert import MazeExpert
        expert_policy = MazeExpert(inner_env, step_size=0.2)
    else:
        expert_policy = joblib.load(os.path.join(os.path.dirname(os.path.abspath(__file__)), v['expert_policy']))['policy']

    expert_rollout_worker = RolloutWorker([env], expert_policy, dims, logger_b, **expert_rollout_params)
    input_shapes = dims_to_shapes(dims)
    expert_sample_transitions = config.configure_her(params_expert)
    buffer_shapes = {key: (v['horizon'] if key != 'o' else v['horizon'] + 1, *input_shapes[key])
                     for key, val in input_shapes.items()}
    buffer_shapes['g'] = (buffer_shapes['g'][0], 2)
    buffer_shapes['ag'] = (v['horizon'] + 1, 2)
    buffer_shapes['successes'] = (v['horizon'],)
    expert_buffer = ReplayBuffer(buffer_shapes, int(1e6), v['horizon'], expert_sample_transitions)
    policy.expert_buffer = expert_buffer

    sample_transitions_relabel = config.configure_her(params_policy_buffer)
    normal_sample_transitions = policy.sample_transitions
    empty_buffer = ReplayBuffer(buffer_shapes, int(1e6), v['horizon'], normal_sample_transitions)

    if not v['query_expert'] or not 'gail' in v['mode']:
        for i in range(v['num_demos']):
            # rollout is generated by expert policy
            episode = expert_rollout_worker.generate_rollouts(reset=not v['no_resets'])
            # and is stored into the expert buffer
            expert_buffer.store_episode(episode)
            if i <= 20:
                path_length = np.argmax(episode['info_goal_reached'][0])
                path_length = v['horizon'] - 1 if path_length == 0 else path_length
                plot_path(episode['o'][0][:path_length], report=report, obs=True,
                          goal=episode['g'][0][0], limit=v['goal_range'], center=v['goal_center'])
        report.new_row()

    # TODO: what is subsampling_rate
    uninitialized_vars = []
    for var in tf.global_variables():
        try:
            tf.get_default_session().run(var)
        except tf.errors.FailedPreconditionError:
            uninitialized_vars.append(var)

    init_new_vars_op = tf.initialize_variables(uninitialized_vars)
    tf.get_default_session().run(init_new_vars_op)

    outer_iter = 0
    logger.log('Generating the Initial Heatmap...')

    def evaluate_performance(env):
        four_rooms = np.array([[-2, -2], [-13, -13]])
        if v['unif_starts']:
            mean_rewards, successes = [], []
            for pos in four_rooms:
                env.update_start_generator(FixedStateGenerator(np.array(pos)))
                mr, scs = test_and_plot_policy(policy, env, horizon=v['horizon'],
                                               max_reward=v['max_reward'], sampling_res=sampling_res,
                                               n_traj=v['n_traj'], itr=outer_iter, report=report,
                                               limit=v['goal_range'], center=v['goal_center'],
                                               using_gym=True, noise=v['action_noise'],
                                               n_processes=8, log=False)
                mean_rewards.append(mr)
                successes.append(scs)

            with logger.tabular_prefix('Outer_'):
                logger.record_tabular('iter', outer_iter)
                logger.record_tabular('MeanRewards', np.mean(mean_rewards))
                logger.record_tabular('Success', np.mean(successes))
        else:
            env.update_start_generator(FixedStateGenerator(np.array([0, 0])))
            _, scs = test_and_plot_policy(policy, env, horizon=v['horizon'],
                                          max_reward=v['max_reward'], sampling_res=sampling_res,
                                          n_traj=v['n_traj'], itr=outer_iter, report=report,
                                          limit=v['goal_range'], center=v['goal_center'],
                                          using_gym=True, noise=v['action_noise'], n_processes=8)

        report.new_row()
        env.update_start_generator(uniform_start_generator)
        return scs

    logger.dump_tabular(with_prefix=False)

    import cloudpickle
    max_success = 0.

    if not v['query_expert'] and v['num_demos'] > 0:
        if not v['relabel_expert']:
            goals = goals_filtered = expert_buffer.buffers['g'][:v['num_demos'], 0, :]
        else:
            # collect all states visited by the expert
            goals = None
            for i in range(v['num_demos']):
                terminate_index = np.argmax(expert_buffer.buffers['successes'][i])
                if np.logical_not(np.any(expert_buffer.buffers['successes'][i])):
                    terminate_index = v['horizon']
                cur_goals = expert_buffer.buffers['o'][i, :terminate_index, :2]
                if goals is None:
                    goals = cur_goals
                else:
                    goals = np.concatenate([goals, cur_goals])
            goal_state_collection = StateCollection(distance_threshold=v['coll_eps'])
            goal_state_collection.append(goals)
            goals_filtered = goal_state_collection.states
    else:
        goals_filtered = goals = np.random.permutation(np.array(feasible_states))[:300]

    if v['agent_state_as_goal']:
        goals = goals
    else:
        feasible_states = sample_unif_feas(inner_env, 10)
        goals = np.random.permutation(np.array(feasible_states))[:300]

    evaluate_performance(test_env)
    logger.dump_tabular(with_prefix=False)

    for outer_iter in range(1, v['outer_iters']):
        logger.log("Outer itr # %i" % outer_iter)

        with ExperimentLogger(log_dir, 'last', snapshot_mode='last', hold_outter_log=True):
            logger.log("Updating the environment goal generator")
            if v['unif_goals']:
                env.update_goal_generator(
                    UniformListStateGenerator(
                        goals.tolist(), persistence=v['persistence'],
                        with_replacement=v['with_replacement'],
                    )
                )
            else:
                env.update_goal_generator(FixedStateGenerator(v['final_goal']))

            logger.log("Training the algorithm")
            train(policy, discriminator, rollout_worker, v['inner_iters'], v['n_cycles'], v['n_batches'],
                  v['n_batches_dis'], policy.buffer, expert_buffer,
                  empty_buffer=empty_buffer if v['on_policy_dis'] else None,
                  num_rollouts=v['num_rollouts'], reset=not v['no_resets'],
                  feasible_states=feasible_states if v['query_expert'] else None,
                  expert_policy=expert_policy if v['query_expert'] else None,
                  agent_policy=policy if v['query_agent'] else None,
                  train_dis_per_rollout=v['train_dis_per_rollout'],
                  noise_expert=v['noise_dis_agent'], noise_agent=v['noise_dis_expert'],
                  sample_transitions_relabel=sample_transitions_relabel if v['relabel_for_policy'] else None,
                  q_annealing=v['q_annealing'], outer_iter=outer_iter,
                  annealing_coeff=v['annealing_coeff'])

        # logger.record_tabular('NonZeroRewProp', nonzeros)
        logger.log('Generating the Heatmap...')
        success = evaluate_performance(test_env)

        if success > max_success:
            print("% f >= %f, saving policy to params_best" % (success, max_success))
            with open(osp.join(log_dir, 'params_best.pkl'), 'wb') as f:
                cloudpickle.dump({'env': env, 'policy': policy}, f)
            max_success = success

        report.new_row()
        logger.dump_tabular(with_prefix=False)
def make_env():
    env = gym.make(str(args.environment))
    logger.configure("/tmp/experiments/" + str(args.environment) + "/PPO2/")
    env = bench.Monitor(env, logger.get_dir())
    # print(env.action_space.sample())
    return env
def train(args, extra_args):
    env_type, env_id = get_env_type(args)
    print('env_type: {}'.format(env_type))

    total_timesteps = int(args.num_timesteps)
    seed = args.seed
    set_global_seeds(seed)

    learn = get_learn_function(args.alg)
    alg_kwargs = get_learn_function_defaults(args.alg, env_type)
    alg_kwargs.update(extra_args)

    env = build_env(args, normalize_ob=False)
    eval_env = build_env(args, normalize_ob=False, is_eval=True)
    if args.save_video_interval != 0:
        env = VecVideoRecorder(env, osp.join(logger.get_dir(), "videos"),
                               record_video_trigger=lambda x: x % args.save_video_interval == 0,
                               video_length=args.save_video_length)

    if args.network:
        alg_kwargs['network'] = args.network
    else:
        if alg_kwargs.get('network') is None:
            alg_kwargs['network'] = get_default_network(env_type)

    beta = -1
    if beta < 0:
        # print(alg_kwargs)
        nr_episodes = total_timesteps // alg_kwargs['timesteps_per_batch']
        # Automatically compute beta based on initial entropy and number of iterations
        policy = build_policy(env, alg_kwargs['network'], value_network='copy',
                              normalize_observations=alg_kwargs['normalize_observations'], copos=True)
        ob = observation_placeholder(env.observation_space)

        sess = U.single_threaded_session()
        sess.__enter__()
        with tf.variable_scope("tmp_pi"):
            tmp_pi = policy(observ_placeholder=ob)
        sess.run(tf.global_variables_initializer())

        tmp_ob = np.zeros((1,) + env.observation_space.shape)
        entropy = sess.run(tmp_pi.pd.entropy(), feed_dict={tmp_pi.X: tmp_ob})
        # beta = 2 * entropy / nr_episodes
        beta = 0
        print("Initial entropy: " + str(entropy) + ", episodes: " + str(nr_episodes))
        print("Constantly set beta: " + str(beta))

    print('Training {} on {}:{} with arguments \n{}'.format(args.alg, env_type, env_id, alg_kwargs))

    iters = 0
    for model in learn(env=env, env_id=env_id, eval_env=eval_env,
                       make_eval_env=lambda: build_env(args, normalize_ob=False, is_eval=True),
                       seed=seed, beta=beta, total_timesteps=total_timesteps, **alg_kwargs):
        if args.store_ckpt:
            save_path = osp.join(logger.get_dir(), 'model-{}'.format(iters))
            model.save(save_path)
            if isinstance(env, VecNormalize):
                rms_path = osp.join(logger.get_dir(), 'rms-{}'.format(iters))
                with open(rms_path, 'wb') as f:
                    rms = (env.ob_rms, env.ret_rms)
                    pickle.dump(rms, f)
            logger.log('Save {} model'.format(iters + 1))
        iters += 1

    return model, env
def learn(policy, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval=1, nprocs=24, nscripts=12,
          nsteps=20, nstack=4, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0,
          lr=0.25, max_grad_norm=0.01, kfac_clip=0.001, save_interval=None,
          lrschedule='linear', callback=None):
    tf.reset_default_graph()
    set_global_seeds(seed)

    nenvs = nprocs
    ob_space = (32, 32, 3)  # env.observation_space
    ac_space = (32, 32)
    make_model = lambda: Model(policy, ob_space, ac_space, nenvs, total_timesteps,
                               nprocs=nprocs, nscripts=nscripts, nsteps=nsteps, nstack=nstack,
                               ent_coef=ent_coef, vf_coef=vf_coef, vf_fisher_coef=vf_fisher_coef,
                               lr=lr, max_grad_norm=max_grad_norm, kfac_clip=kfac_clip,
                               lrschedule=lrschedule)

    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()
    print("make_model complete!")
    runner = Runner(env, model, nsteps=nsteps, nscripts=nscripts, nstack=nstack, gamma=gamma, callback=callback)
    nbatch = nenvs * nsteps
    tstart = time.time()
    # enqueue_threads = model.q_runner.create_threads(model.sess, coord=tf.train.Coordinator(), start=True)
    for update in range(1, total_timesteps // nbatch + 1):
        obs, states, rewards, masks, actions, xy0, xy1, values = runner.run()

        policy_loss, value_loss, policy_entropy, \
            policy_loss_xy0, policy_entropy_xy0, \
            policy_loss_xy1, policy_entropy_xy1 = model.train(
                obs, states, rewards, masks, actions, xy0, xy1, values)

        model.old_obs = obs
        nseconds = time.time() - tstart
        fps = int((update * nbatch) / nseconds)
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("policy_loss", float(policy_loss))
            logger.record_tabular("policy_loss_xy0", float(policy_loss_xy0))
            logger.record_tabular("policy_entropy_xy0", float(policy_entropy_xy0))
            logger.record_tabular("policy_loss_xy1", float(policy_loss_xy1))
            logger.record_tabular("policy_entropy_xy1", float(policy_entropy_xy1))
            # logger.record_tabular("policy_loss_y0", float(policy_loss_y0))
            # logger.record_tabular("policy_entropy_y0", float(policy_entropy_y0))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.dump_tabular()

        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
            savepath = osp.join(logger.get_dir(), 'checkpoint%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)

    env.close()
def train(env, policy, rollout_worker, evaluator, n_epochs, n_test_rollouts, n_cycles, n_batches,
          policy_save_interval, save_policies, policy_path=None, **kwargs):
    latest_policy_path = os.path.join(logger.get_dir(), 'policy_latest.pkl')
    best_policy_path = os.path.join(logger.get_dir(), 'policy_best.pkl')
    periodic_policy_path = os.path.join(logger.get_dir(), 'policy_{}.pkl')

    saver = tf.train.Saver()
    model_name = 'model.ckpt'
    if policy_path is not None:
        saver.restore(policy.sess, policy_path + model_name)
        logger.info("Successfully restored policy from {}".format(policy_path))
    if policy_path is None:
        policy_path = ''.join(['./trained/', env, '_', policy.scope, '/'])
        if not os.path.exists(policy_path):
            os.makedirs(policy_path)
    epoch_log_path = logger.get_dir() + '/epoch.txt'
    trained_epochs = get_counter(epoch_log_path)
    policy_path += model_name

    rank = MPI.COMM_WORLD.Get_rank()

    # latest_policy_path = os.path.join(logger.get_dir(), 'policy_latest.pkl')
    # best_policy_path = os.path.join(logger.get_dir(), 'policy_best.pkl')
    # periodic_policy_path = os.path.join(logger.get_dir(), 'policy_{}.pkl')

    logger.info("Training...")
    best_success_rate = -1
    for epoch in range(trained_epochs + 1, n_epochs):
        # train
        logger.info("Episode {}/{}".format(epoch, n_epochs))
        rollout_worker.clear_history()
        for _ in range(n_cycles):
            episode = rollout_worker.generate_rollouts()
            policy.store_episode(episode)
            for _ in range(n_batches):
                policy.train()
            policy.update_target_net()

        # test
        evaluator.clear_history()
        for _ in range(n_test_rollouts):
            evaluator.generate_rollouts()

        # record logs
        logger.record_tabular('epoch', epoch)
        for key, val in evaluator.logs('test'):
            logger.record_tabular(key, mpi_average(val))
        for key, val in rollout_worker.logs('train'):
            logger.record_tabular(key, mpi_average(val))
        for key, val in policy.logs():
            logger.record_tabular(key, mpi_average(val))

        if rank == 0:
            logger.dump_tabular()

        # save the policy if it's better than the previous ones
        success_rate = mpi_average(evaluator.current_success_rate())
        if rank == 0 and success_rate >= best_success_rate and save_policies:
            best_success_rate = success_rate
            logger.info('New best success rate: {}. Saving policy to {} ...'.format(best_success_rate, policy_path))
            pth = saver.save(policy.sess, policy_path)
            try:
                with open(epoch_log_path, 'a') as file:
                    file.write(str(epoch) + '\n')
            except Exception as e:
                logger.info(e)
            print("saved policy to {}".format(pth))
            evaluator.save_policy(best_policy_path)
            evaluator.save_policy(latest_policy_path)
        if rank == 0 and policy_save_interval > 0 and epoch % policy_save_interval == 0 and save_policies:
            policy_path = periodic_policy_path.format(epoch)
            logger.info('Saving periodic policy to {} ...'.format(policy_path))
            evaluator.save_policy(policy_path)

        # make sure that different threads have different seeds
        local_uniform = np.random.uniform(size=(1,))
        root_uniform = local_uniform.copy()
        MPI.COMM_WORLD.Bcast(root_uniform, root=0)
        if rank != 0:
            assert local_uniform[0] != root_uniform[0]
def make_env():
    monitor = bench.Monitor(env, baselines_logger.get_dir(), allow_early_resets=True)
    return monitor
def learn(network, env, seed=None, total_timesteps=None, nb_epochs=None, # with default settings, perform 1M steps total nb_epoch_cycles=20, nb_rollout_steps=100, reward_scale=1.0, render=False, render_eval=False, noise_type='adaptive-param_0.2', normalize_returns=False, normalize_observations=True, critic_l2_reg=1e-2, actor_lr=1e-4, critic_lr=1e-3, popart=False, gamma=0.99, clip_norm=None, nb_train_steps=50, # per epoch cycle and MPI worker, nb_eval_steps=100, batch_size=64, # per MPI worker tau=0.01, eval_env=None, param_noise_adaption_interval=50, **network_kwargs): set_global_seeds(seed) if total_timesteps is not None: assert nb_epochs is None nb_epochs = int(total_timesteps) // (nb_epoch_cycles * nb_rollout_steps) else: nb_epochs = 500 if MPI is not None: rank = MPI.COMM_WORLD.Get_rank() else: rank = 0 nb_actions = env.action_space.shape[-1] assert (np.abs(env.action_space.low) == env.action_space.high).all() # we assume symmetric actions. memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape) critic = Critic(network=network, **network_kwargs) actor = Actor(nb_actions, network=network, **network_kwargs) action_noise = None param_noise = None if noise_type is not None: for current_noise_type in noise_type.split(','): current_noise_type = current_noise_type.strip() if current_noise_type == 'none': pass elif 'adaptive-param' in current_noise_type: _, stddev = current_noise_type.split('_') param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev)) elif 'normal' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) elif 'ou' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) else: raise RuntimeError('unknown noise type "{}"'.format(current_noise_type)) max_action = env.action_space.high logger.info('scaling actions by {} before executing in env'.format(max_action)) agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape, gamma=gamma, tau=tau, normalize_returns=normalize_returns, normalize_observations=normalize_observations, batch_size=batch_size, action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg, actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart, clip_norm=clip_norm, reward_scale=reward_scale) logger.info('Using agent with the following configuration:') logger.info(str(agent.__dict__.items())) eval_episode_rewards_history = deque(maxlen=100) episode_rewards_history = deque(maxlen=100) sess = U.get_session() # Prepare everything. agent.initialize(sess) sess.graph.finalize() agent.reset() obs = env.reset() if eval_env is not None: eval_obs = eval_env.reset() nenvs = obs.shape[0] episode_reward = np.zeros(nenvs, dtype = np.float32) #vector episode_step = np.zeros(nenvs, dtype = int) # vector episodes = 0 #scalar t = 0 # scalar epoch = 0 start_time = time.time() epoch_episode_rewards = [] epoch_episode_steps = [] epoch_actions = [] epoch_qs = [] epoch_episodes = 0 for epoch in range(nb_epochs): for cycle in range(nb_epoch_cycles): # Perform rollouts. 
if nenvs > 1: # if simulating multiple envs in parallel, impossible to reset agent at the end of the episode in each # of the environments, so resetting here instead agent.reset() for t_rollout in range(nb_rollout_steps): # Predict next action. action, q, _, _ = agent.step(obs, apply_noise=True, compute_Q=True) # Execute next action. if rank == 0 and render: env.render() # max_action is of dimension A, whereas action is dimension (nenvs, A) - the multiplication gets broadcasted to the batch new_obs, r, done, info = env.step(max_action * action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) # note these outputs are batched from vecenv t += 1 if rank == 0 and render: env.render() episode_reward += r episode_step += 1 # Book-keeping. epoch_actions.append(action) epoch_qs.append(q) agent.store_transition(obs, action, r, new_obs, done) #the batched data will be unrolled in memory.py's append. obs = new_obs for d in range(len(done)): if done[d]: # Episode done. epoch_episode_rewards.append(episode_reward[d]) episode_rewards_history.append(episode_reward[d]) epoch_episode_steps.append(episode_step[d]) episode_reward[d] = 0. episode_step[d] = 0 epoch_episodes += 1 episodes += 1 if nenvs == 1: agent.reset() # Train. epoch_actor_losses = [] epoch_critic_losses = [] epoch_adaptive_distances = [] for t_train in range(nb_train_steps): # Adapt param noise, if necessary. if memory.nb_entries >= batch_size and t_train % param_noise_adaption_interval == 0: distance = agent.adapt_param_noise() epoch_adaptive_distances.append(distance) cl, al = agent.train() epoch_critic_losses.append(cl) epoch_actor_losses.append(al) agent.update_target_net() # Evaluate. eval_episode_rewards = [] eval_qs = [] if eval_env is not None: nenvs_eval = eval_obs.shape[0] eval_episode_reward = np.zeros(nenvs_eval, dtype = np.float32) for t_rollout in range(nb_eval_steps): eval_action, eval_q, _, _ = agent.step(eval_obs, apply_noise=False, compute_Q=True) eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action) # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1]) if render_eval: eval_env.render() eval_episode_reward += eval_r eval_qs.append(eval_q) for d in range(len(eval_done)): if eval_done[d]: eval_episode_rewards.append(eval_episode_reward[d]) eval_episode_rewards_history.append(eval_episode_reward[d]) eval_episode_reward[d] = 0.0 if MPI is not None: mpi_size = MPI.COMM_WORLD.Get_size() else: mpi_size = 1 # Log stats. # XXX shouldn't call np.mean on variable length lists duration = time.time() - start_time stats = agent.get_stats() combined_stats = stats.copy() combined_stats['rollout/return'] = np.mean(epoch_episode_rewards) combined_stats['rollout/return_history'] = np.mean(episode_rewards_history) combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps) combined_stats['rollout/actions_mean'] = np.mean(epoch_actions) combined_stats['rollout/Q_mean'] = np.mean(epoch_qs) combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses) combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses) combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances) combined_stats['total/duration'] = duration combined_stats['total/steps_per_second'] = float(t) / float(duration) combined_stats['total/episodes'] = episodes combined_stats['rollout/episodes'] = epoch_episodes combined_stats['rollout/actions_std'] = np.std(epoch_actions) # Evaluation statistics. 
if eval_env is not None: combined_stats['eval/return'] = eval_episode_rewards combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history) combined_stats['eval/Q'] = eval_qs combined_stats['eval/episodes'] = len(eval_episode_rewards) def as_scalar(x): if isinstance(x, np.ndarray): assert x.size == 1 return x[0] elif np.isscalar(x): return x else: raise ValueError('expected scalar, got %s'%x) combined_stats_sums = np.array([ np.array(x).flatten()[0] for x in combined_stats.values()]) if MPI is not None: combined_stats_sums = MPI.COMM_WORLD.allreduce(combined_stats_sums) combined_stats = {k : v / mpi_size for (k,v) in zip(combined_stats.keys(), combined_stats_sums)} # Total statistics. combined_stats['total/epochs'] = epoch + 1 combined_stats['total/steps'] = t for key in sorted(combined_stats.keys()): logger.record_tabular(key, combined_stats[key]) if rank == 0: logger.dump_tabular() logger.info('') logdir = logger.get_dir() if rank == 0 and logdir: if hasattr(env, 'get_state'): with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f: pickle.dump(env.get_state(), f) if eval_env and hasattr(eval_env, 'get_state'): with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f: pickle.dump(eval_env.get_state(), f) return agent
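# Hedged usage sketch (added for illustration; not part of the original source): invoking
# the DDPG learn() above. The env factory is an assumption; learn() expects a vectorized
# env whose reset()/step() return batched arrays and whose continuous action space is
# symmetric (it asserts |low| == high).
def _example_run_ddpg(make_vec_env, seed=0):
    env = make_vec_env()
    agent = learn(network='mlp', env=env, seed=seed,
                  total_timesteps=int(1e6),
                  noise_type='adaptive-param_0.2',   # also accepts e.g. 'normal_0.1' or 'ou_0.2'
                  actor_lr=1e-4, critic_lr=1e-3,
                  gamma=0.99, batch_size=64)
    return agent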
def learn( *, network, env, eval_env, make_eval_env, env_id, seed, beta, total_timesteps, timesteps_per_batch, # what to train on #num_samples=(1500,), num_samples=(1, ), #horizon=(5,), horizon=(2, ), #num_elites=(10,), num_elites=(1, ), max_kl=0.001, cg_iters=10, gamma=0.99, lam=1.0, # advantage estimation ent_coef=0.0, cg_damping=1e-2, vf_stepsize=3e-4, vf_iters=3, max_episodes=0, max_iters=0, # time constraint callback=None, load_path=None, TRPO=False, # MBL # For train mbl mbl_train_freq=5, # For eval num_eval_episodes=5, eval_freq=5, vis_eval=False, #eval_targs=('mbmf',), eval_targs=('mf', ), quant=2, # For mbl.step mbl_lamb=(1.0, ), mbl_gamma=0.99, #mbl_sh=1, # Number of step for stochastic sampling mbl_sh=10000, #vf_lookahead=-1, #use_max_vf=False, reset_per_step=(0, ), # For get_model num_fc=2, num_fwd_hidden=500, use_layer_norm=False, # For MBL num_warm_start=int(1e4), init_epochs=10, update_epochs=5, batch_size=512, update_with_validation=False, use_mean_elites=1, use_ent_adjust=0, adj_std_scale=0.5, # For data loading validation_set_path=None, # For data collect collect_val_data=False, # For traj collect traj_collect='mf', # For profile measure_time=True, eval_val_err=False, measure_rew=True, **network_kwargs): ''' learn a policy function with TRPO algorithm Parameters: ---------- network neural network to learn. Can be either string ('mlp', 'cnn', 'lstm', 'lnlstm' for basic types) or function that takes input placeholder and returns tuple (output, None) for feedforward nets or (output, (state_placeholder, state_output, mask_placeholder)) for recurrent nets env environment (one of the gym environments or wrapped via baselines.common.vec_env.VecEnv-type class timesteps_per_batch timesteps per gradient estimation batch max_kl max KL divergence between old policy and new policy ( KL(pi_old || pi) ) ent_coef coefficient of policy entropy term in the optimization objective cg_iters number of iterations of conjugate gradient algorithm cg_damping conjugate gradient damping vf_stepsize learning rate for adam optimizer used to optimie value function loss vf_iters number of iterations of value function optimization iterations per each policy optimization step total_timesteps max number of timesteps max_episodes max number of episodes max_iters maximum number of policy optimization iterations callback function to be called with (locals(), globals()) each policy optimization step load_path str, path to load the model from (default: None, i.e. no model is loaded) **network_kwargs keyword arguments to the policy / network builder. 
See baselines.common/policies.py/build_policy and arguments to a particular type of network Returns: ------- learnt model ''' if not isinstance(num_samples, tuple): num_samples = (num_samples, ) if not isinstance(horizon, tuple): horizon = (horizon, ) if not isinstance(num_elites, tuple): num_elites = (num_elites, ) if not isinstance(mbl_lamb, tuple): mbl_lamb = (mbl_lamb, ) if not isinstance(reset_per_step, tuple): reset_per_step = (reset_per_step, ) if validation_set_path is None: if collect_val_data: validation_set_path = os.path.join(logger.get_dir(), 'val.pkl') else: validation_set_path = os.path.join('dataset', '{}-val.pkl'.format(env_id)) if eval_val_err: eval_val_err_path = os.path.join('dataset', '{}-combine-val.pkl'.format(env_id)) logger.log(locals()) logger.log('MBL_SH', mbl_sh) logger.log('Traj_collect', traj_collect) if MPI is not None: nworkers = MPI.COMM_WORLD.Get_size() rank = MPI.COMM_WORLD.Get_rank() else: nworkers = 1 rank = 0 cpus_per_worker = 1 U.get_session( config=tf.ConfigProto(allow_soft_placement=True, inter_op_parallelism_threads=cpus_per_worker, intra_op_parallelism_threads=cpus_per_worker)) policy = build_policy(env, network, value_network='copy', copos=True, **network_kwargs) set_global_seeds(seed) np.set_printoptions(precision=3) # Setup losses and stuff # ---------------------------------------- ob_space = env.observation_space ac_space = env.action_space discrete_ac_space = isinstance(ac_space, gym.spaces.Discrete) ob = observation_placeholder(ob_space) with tf.variable_scope("pi"): pi = policy(observ_placeholder=ob) with tf.variable_scope("oldpi"): oldpi = policy(observ_placeholder=ob) # MBL # --------------------------------------- #viz = Visdom(env=env_id) win = None eval_targs = list(eval_targs) logger.log(eval_targs) make_model = get_make_mlp_model(num_fc=num_fc, num_fwd_hidden=num_fwd_hidden, layer_norm=use_layer_norm) mbl = MBL(env=eval_env, env_id=env_id, make_model=make_model, num_warm_start=num_warm_start, init_epochs=init_epochs, update_epochs=update_epochs, batch_size=batch_size, **network_kwargs) val_dataset = {'ob': None, 'ac': None, 'ob_next': None} if update_with_validation: logger.log('Update with validation') val_dataset = load_val_data(validation_set_path) if eval_val_err: logger.log('Log val error') eval_val_dataset = load_val_data(eval_val_err_path) if collect_val_data: logger.log('Collect validation data') val_dataset_collect = [] def _mf_pi(ob, t=None): stochastic = True ac, vpred, _, _ = pi.step(ob, stochastic=stochastic) return ac, vpred def _mf_det_pi(ob, t=None): #ac, vpred, _, _ = pi.step(ob, stochastic=False) ac, vpred = pi._evaluate([pi.pd.mode(), pi.vf], ob) return ac, vpred def _mf_ent_pi(ob, t=None): mean, std, vpred = pi._evaluate([pi.pd.mode(), pi.pd.std, pi.vf], ob) ac = np.random.normal(mean, std * adj_std_scale, size=mean.shape) return ac, vpred ################### use_ent_adjust======> adj_std_scale????????pi action sample def _mbmf_inner_pi(ob, t=0): if use_ent_adjust: return _mf_ent_pi(ob) else: #return _mf_pi(ob) if t < mbl_sh: return _mf_pi(ob) else: return _mf_det_pi(ob) # --------------------------------------- # Run multiple configuration once all_eval_descs = [] def make_mbmf_pi(n, h, e, l): def _mbmf_pi(ob): ac, rew = mbl.step(ob=ob, pi=_mbmf_inner_pi, horizon=h, num_samples=n, num_elites=e, gamma=mbl_gamma, lamb=l, use_mean_elites=use_mean_elites) return ac[None], rew return Policy(step=_mbmf_pi, reset=None) for n in num_samples: for h in horizon: for l in mbl_lamb: for e in num_elites: if 'mbmf' in 
eval_targs: all_eval_descs.append( ('MeanRew', 'MBL_COPOS', make_mbmf_pi(n, h, e, l))) #if 'mbmf' in eval_targs: all_eval_descs.append(('MeanRew-n-{}-h-{}-e-{}-l-{}-sh-{}-me-{}'.format(n, h, e, l, mbl_sh, use_mean_elites), 'MBL_TRPO-n-{}-h-{}-e-{}-l-{}-sh-{}-me-{}'.format(n, h, e, l, mbl_sh, use_mean_elites), make_mbmf_pi(n, h, e, l))) if 'mf' in eval_targs: all_eval_descs.append( ('MeanRew', 'COPOS', Policy(step=_mf_pi, reset=None))) logger.log('List of evaluation targets') for it in all_eval_descs: logger.log(it[0]) pool = Pool(mp.cpu_count()) warm_start_done = False # ---------------------------------------- atarg = tf.placeholder( dtype=tf.float32, shape=[None]) # Target advantage function (if applicable) ret = tf.placeholder(dtype=tf.float32, shape=[None]) # Empirical return ac = pi.pdtype.sample_placeholder([None]) kloldnew = oldpi.pd.kl(pi.pd) ent = pi.pd.entropy() meankl = tf.reduce_mean(kloldnew) meanent = tf.reduce_mean(ent) entbonus = ent_coef * meanent vferr = tf.reduce_mean(tf.square(pi.vf - ret)) ratio = tf.exp(pi.pd.logp(ac) - oldpi.pd.logp(ac)) # advantage * pnew / pold surrgain = tf.reduce_mean(ratio * atarg) optimgain = surrgain + entbonus losses = [optimgain, meankl, entbonus, surrgain, meanent] loss_names = ["optimgain", "meankl", "entloss", "surrgain", "entropy"] dist = meankl all_var_list = get_trainable_variables("pi") # var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("pol")] # vf_var_list = [v for v in all_var_list if v.name.split("/")[1].startswith("vf")] var_list = get_pi_trainable_variables("pi") vf_var_list = get_vf_trainable_variables("pi") vfadam = MpiAdam(vf_var_list) get_flat = U.GetFlat(var_list) set_from_flat = U.SetFromFlat(var_list) klgrads = tf.gradients(dist, var_list) flat_tangent = tf.placeholder(dtype=tf.float32, shape=[None], name="flat_tan") shapes = [var.get_shape().as_list() for var in var_list] start = 0 tangents = [] for shape in shapes: sz = U.intprod(shape) tangents.append(tf.reshape(flat_tangent[start:start + sz], shape)) start += sz gvp = tf.add_n([ tf.reduce_sum(g * tangent) for (g, tangent) in zipsame(klgrads, tangents) ]) #pylint: disable=E1111 fvp = U.flatgrad(gvp, var_list) assign_old_eq_new = U.function( [], [], updates=[ tf.assign(oldv, newv) for (oldv, newv) in zipsame(get_variables("oldpi"), get_variables("pi")) ]) compute_losses = U.function([ob, ac, atarg], losses) compute_lossandgrad = U.function([ob, ac, atarg], losses + [U.flatgrad(optimgain, var_list)]) compute_fvp = U.function([flat_tangent, ob, ac, atarg], fvp) compute_vflossandgrad = U.function([ob, ret], U.flatgrad(vferr, vf_var_list)) @contextmanager def timed(msg): if rank == 0: print(colorize(msg, color='magenta')) tstart = time.time() yield print( colorize("done in %.3f seconds" % (time.time() - tstart), color='magenta')) else: yield def allmean(x): assert isinstance(x, np.ndarray) out = np.empty_like(x) MPI.COMM_WORLD.Allreduce(x, out, op=MPI.SUM) out /= nworkers return out U.initialize() if load_path is not None: pi.load(load_path) th_init = get_flat() MPI.COMM_WORLD.Bcast(th_init, root=0) set_from_flat(th_init) vfadam.sync() print("Init param sum", th_init.sum(), flush=True) # Initialize eta, omega optimizer if discrete_ac_space: init_eta = 1 init_omega = 0.5 eta_omega_optimizer = EtaOmegaOptimizerDiscrete( beta, max_kl, init_eta, init_omega) else: init_eta = 0.5 init_omega = 2.0 #????eta_omega_optimizer details????? 
eta_omega_optimizer = EtaOmegaOptimizer(beta, max_kl, init_eta, init_omega) # Prepare for rollouts # ---------------------------------------- if traj_collect == 'mf': seg_gen = traj_segment_generator(pi, env, timesteps_per_batch, stochastic=True) episodes_so_far = 0 timesteps_so_far = 0 iters_so_far = 0 tstart = time.time() lenbuffer = deque(maxlen=40) # rolling buffer for episode lengths rewbuffer = deque(maxlen=40) # rolling buffer for episode rewards if sum([max_iters > 0, total_timesteps > 0, max_episodes > 0]) == 0: # noththing to be done return pi assert sum([max_iters>0, total_timesteps>0, max_episodes>0]) < 2, \ 'out of max_iters, total_timesteps, and max_episodes only one should be specified' while True: if callback: callback(locals(), globals()) if total_timesteps and timesteps_so_far >= total_timesteps: break elif max_episodes and episodes_so_far >= max_episodes: break elif max_iters and iters_so_far >= max_iters: break logger.log("********** Iteration %i ************" % iters_so_far) with timed("sampling"): seg = seg_gen.__next__() if traj_collect == 'mf-random' or traj_collect == 'mf-mb': seg_mbl = seg_gen_mbl.__next__() else: seg_mbl = seg add_vtarg_and_adv(seg, gamma, lam) # ob, ac, atarg, ret, td1ret = map(np.concatenate, (obs, acs, atargs, rets, td1rets)) ob, ac, atarg, tdlamret = seg["ob"], seg["ac"], seg["adv"], seg[ "tdlamret"] # Val data collection if collect_val_data: for ob_, ac_, ob_next_ in zip(ob[:-1, 0, ...], ac[:-1, ...], ob[1:, 0, ...]): val_dataset_collect.append( (copy.copy(ob_), copy.copy(ac_), copy.copy(ob_next_))) # ----------------------------- # MBL update else: ob_mbl, ac_mbl = seg_mbl["ob"], seg_mbl["ac"] mbl.add_data_batch(ob_mbl[:-1, 0, ...], ac_mbl[:-1, ...], ob_mbl[1:, 0, ...]) mbl.update_forward_dynamic(require_update=iters_so_far % mbl_train_freq == 0, ob_val=val_dataset['ob'], ac_val=val_dataset['ac'], ob_next_val=val_dataset['ob_next']) # ----------------------------- if traj_collect == 'mf': #if traj_collect == 'mf' or traj_collect == 'mf-random' or traj_collect == 'mf-mb': vpredbefore = seg[ "vpred"] # predicted value function before udpate atarg = (atarg - atarg.mean()) / atarg.std( ) # standardized advantage function estimate if hasattr(pi, "ret_rms"): pi.ret_rms.update(tdlamret) if hasattr(pi, "rms"): pi.rms.update(ob) # update running mean/std for policy args = seg["ob"], seg["ac"], atarg fvpargs = [arr[::5] for arr in args] def fisher_vector_product(p): return allmean(compute_fvp(p, *fvpargs)) + cg_damping * p assign_old_eq_new( ) # set old parameter values to new parameter values with timed("computegrad"): *lossbefore, g = compute_lossandgrad(*args) lossbefore = allmean(np.array(lossbefore)) g = allmean(g) if np.allclose(g, 0): logger.log("Got zero gradient. 
not updating") else: with timed("cg"): stepdir = cg(fisher_vector_product, g, cg_iters=cg_iters, verbose=rank == 0) assert np.isfinite(stepdir).all() if TRPO: shs = .5 * stepdir.dot(fisher_vector_product(stepdir)) lm = np.sqrt(shs / max_kl) # logger.log("lagrange multiplier:", lm, "gnorm:", np.linalg.norm(g)) fullstep = stepdir / lm expectedimprove = g.dot(fullstep) surrbefore = lossbefore[0] stepsize = 1.0 thbefore = get_flat() for _ in range(10): thnew = thbefore + fullstep * stepsize set_from_flat(thnew) meanlosses = surr, kl, *_ = allmean( np.array(compute_losses(*args))) improve = surr - surrbefore logger.log("Expected: %.3f Actual: %.3f" % (expectedimprove, improve)) if not np.isfinite(meanlosses).all(): logger.log( "Got non-finite value of losses -- bad!") elif kl > max_kl * 1.5: logger.log( "violated KL constraint. shrinking step.") elif improve < 0: logger.log( "surrogate didn't improve. shrinking step.") else: logger.log("Stepsize OK!") break stepsize *= .5 else: logger.log("couldn't compute a good step") set_from_flat(thbefore) else: copos_update_dir = stepdir # Split direction into log-linear 'w_theta' and non-linear 'w_beta' parts w_theta, w_beta = pi.split_w(copos_update_dir) tmp_ob = np.zeros( (1, ) + env.observation_space.shape ) # We assume that entropy does not depend on the NN # Optimize eta and omega if discrete_ac_space: entropy = lossbefore[4] #entropy = - 1/timesteps_per_batch * np.sum(np.sum(pi.get_action_prob(ob) * pi.get_log_action_prob(ob), axis=1)) eta, omega = eta_omega_optimizer.optimize( pi.compute_F_w(ob, copos_update_dir), pi.get_log_action_prob(ob), timesteps_per_batch, entropy) else: Waa, Wsa = pi.w2W(w_theta) wa = pi.get_wa(ob, w_beta) varphis = pi.get_varphis(ob) #old_ent = old_entropy.eval({oldpi.ob: tmp_ob})[0] old_ent = lossbefore[4] eta, omega = eta_omega_optimizer.optimize( w_theta, Waa, Wsa, wa, varphis, pi.get_kt(), pi.get_prec_matrix(), pi.is_new_policy_valid, old_ent) logger.log("Initial eta: " + str(eta) + " and omega: " + str(omega)) current_theta_beta = get_flat() prev_theta, prev_beta = pi.all_to_theta_beta( current_theta_beta) if discrete_ac_space: # Do a line search for both theta and beta parameters by adjusting only eta eta = eta_search(w_theta, w_beta, eta, omega, allmean, compute_losses, get_flat, set_from_flat, pi, max_kl, args, discrete_ac_space) logger.log("Updated eta, eta: " + str(eta)) set_from_flat( pi.theta_beta_to_all(prev_theta, prev_beta)) # Find proper omega for new eta. Use old policy parameters first. eta, omega = eta_omega_optimizer.optimize( pi.compute_F_w(ob, copos_update_dir), pi.get_log_action_prob(ob), timesteps_per_batch, entropy, eta) logger.log("Updated omega, eta: " + str(eta) + " and omega: " + str(omega)) # do line search for ratio for non-linear "beta" parameter values #ratio = beta_ratio_line_search(w_theta, w_beta, eta, omega, allmean, compute_losses, get_flat, set_from_flat, pi, # max_kl, beta, args) # set ratio to 1 if we do not use beta ratio line search ratio = 1 #print("ratio from line search: " + str(ratio)) cur_theta = (eta * prev_theta + w_theta.reshape(-1, )) / (eta + omega) cur_beta = prev_beta + ratio * w_beta.reshape( -1, ) / eta else: for i in range(2): # Do a line search for both theta and beta parameters by adjusting only eta eta = eta_search(w_theta, w_beta, eta, omega, allmean, compute_losses, get_flat, set_from_flat, pi, max_kl, args) logger.log("Updated eta, eta: " + str(eta) + " and omega: " + str(omega)) # Find proper omega for new eta. Use old policy parameters first. 
set_from_flat( pi.theta_beta_to_all(prev_theta, prev_beta)) eta, omega = \ eta_omega_optimizer.optimize(w_theta, Waa, Wsa, wa, varphis, pi.get_kt(), pi.get_prec_matrix(), pi.is_new_policy_valid, old_ent, eta) logger.log("Updated omega, eta: " + str(eta) + " and omega: " + str(omega)) # Use final policy logger.log("Final eta: " + str(eta) + " and omega: " + str(omega)) cur_theta = (eta * prev_theta + w_theta.reshape(-1, )) / (eta + omega) cur_beta = prev_beta + w_beta.reshape(-1, ) / eta set_from_flat(pi.theta_beta_to_all(cur_theta, cur_beta)) meanlosses = surr, kl, *_ = allmean( np.array(compute_losses(*args))) ##copos specific over if nworkers > 1 and iters_so_far % 20 == 0: paramsums = MPI.COMM_WORLD.allgather( (thnew.sum(), vfadam.getflat().sum())) # list of tuples assert all( np.allclose(ps, paramsums[0]) for ps in paramsums[1:]) #cg over for (lossname, lossval) in zip(loss_names, meanlosses): logger.record_tabular(lossname, lossval) #policy update over with timed("vf"): for _ in range(vf_iters): for (mbob, mbret) in dataset.iterbatches( (seg["ob"], seg["tdlamret"]), include_final_partial_batch=False, batch_size=64): g = allmean(compute_vflossandgrad(mbob, mbret)) vfadam.update(g, vf_stepsize) logger.record_tabular("ev_tdlam_before", explained_variance(vpredbefore, tdlamret)) lrlocal = (seg["ep_lens"], seg["ep_rets"]) # local values if MPI is not None: listoflrpairs = MPI.COMM_WORLD.allgather(lrlocal) # list of tuples else: listoflrpairs = [lrlocal] lens, rews = map(flatten_lists, zip(*listoflrpairs)) lenbuffer.extend(lens) rewbuffer.extend(rews) logger.record_tabular("EpLenMean", np.mean(lenbuffer)) logger.record_tabular("EpRewMean", np.mean(rewbuffer)) logger.record_tabular("EpThisIter", len(lens)) episodes_so_far += len(lens) timesteps_so_far += sum(lens) iters_so_far += 1 logger.record_tabular("EpisodesSoFar", episodes_so_far) logger.record_tabular("TimestepsSoFar", timesteps_so_far) logger.record_tabular("TimeElapsed", time.time() - tstart) if rank == 0: # MBL evaluation if not collect_val_data: #set_global_seeds(seed) default_sess = tf.get_default_session() def multithread_eval_policy(env_, pi_, num_episodes_, vis_eval_, seed): with default_sess.as_default(): if hasattr(env, 'ob_rms') and hasattr(env_, 'ob_rms'): env_.ob_rms = env.ob_rms res = eval_policy(env_, pi_, num_episodes_, vis_eval_, seed, measure_time, measure_rew) try: env_.close() except: pass return res if mbl.is_warm_start_done() and iters_so_far % eval_freq == 0: warm_start_done = mbl.is_warm_start_done() if num_eval_episodes > 0: targs_names = {} with timed('eval'): num_descs = len(all_eval_descs) list_field_names = [e[0] for e in all_eval_descs] list_legend_names = [e[1] for e in all_eval_descs] list_pis = [e[2] for e in all_eval_descs] list_eval_envs = [ make_eval_env() for _ in range(num_descs) ] list_seed = [seed for _ in range(num_descs)] list_num_eval_episodes = [ num_eval_episodes for _ in range(num_descs) ] print(list_field_names) print(list_legend_names) list_vis_eval = [ vis_eval for _ in range(num_descs) ] for i in range(num_descs): field_name, legend_name = list_field_names[ i], list_legend_names[i], res = multithread_eval_policy( list_eval_envs[i], list_pis[i], list_num_eval_episodes[i], list_vis_eval[i], seed) #eval_results = pool.starmap(multithread_eval_policy, zip(list_eval_envs, list_pis, list_num_eval_episodes, list_vis_eval,list_seed)) #for field_name, legend_name, res in zip(list_field_names, list_legend_names, eval_results): perf, elapsed_time, eval_rew = res logger.record_tabular(field_name, 
perf) if measure_time: logger.record_tabular( 'Time-%s' % (field_name), elapsed_time) if measure_rew: logger.record_tabular( 'SimRew-%s' % (field_name), eval_rew) targs_names[field_name] = legend_name if eval_val_err: fwd_dynamics_err = mbl.eval_forward_dynamic( obs=eval_val_dataset['ob'], acs=eval_val_dataset['ac'], obs_next=eval_val_dataset['ob_next']) logger.record_tabular('FwdValError', fwd_dynamics_err) logger.dump_tabular() #if iters_so_far= #print(logger.get_dir()) #print(targs_names) # if num_eval_episodes > 0: # win = plot(viz, win, logger.get_dir(), targs_names=targs_names, quant=quant, opt='best') # else: # logger.dump_tabular() # if iters_so_far==21: # sys.exit() # ----------- #logger.dump_tabular() yield pi if collect_val_data: with open(validation_set_path, 'wb') as f: pickle.dump(val_dataset_collect, f) logger.log('Save {} validation data'.format(len(val_dataset_collect)))
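# Hedged usage sketch (added for illustration; not part of the original source): the
# learn() above is written as a generator (it yields the current policy pi as training
# progresses), so it is consumed by iterating. The env factories and the beta value are
# assumptions; substitute the project's real ones.
def _example_run_mbl_copos(make_train_env, make_eval_env, env_id, seed=0):
    env = make_train_env()
    eval_env = make_eval_env()
    pi = None
    for pi in learn(network='mlp', env=env, eval_env=eval_env,
                    make_eval_env=make_eval_env, env_id=env_id, seed=seed,
                    beta=1.0,                      # placeholder COPOS beta, not a recommended value
                    total_timesteps=int(1e6),
                    timesteps_per_batch=1024,
                    eval_freq=5, num_eval_episodes=5):
        pass                                       # the last yielded pi is the most recent policy
    return pi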
def run(env_id, seed, noise_type, num_cpu, layer_norm, logdir, gym_monitor, evaluation, bind_to_core, **kwargs): kwargs['logdir'] = logdir whoami = mpi_fork(num_cpu, bind_to_core=bind_to_core) if whoami == 'parent': sys.exit(0) # Configure things. rank = MPI.COMM_WORLD.Get_rank() if rank != 0: # Write to temp directory for all non-master workers. actual_dir = None Logger.CURRENT.close() Logger.CURRENT = Logger(dir=mkdtemp(), output_formats=[]) logger.set_level(logger.DISABLED) # Create envs. if rank == 0: env = gym.make(env_id) if gym_monitor and logdir: env = gym.wrappers.Monitor(env, os.path.join(logdir, 'gym_train'), force=True) env = SimpleMonitor(env) if evaluation: eval_env = gym.make(env_id) if gym_monitor and logdir: eval_env = gym.wrappers.Monitor(eval_env, os.path.join(logdir, 'gym_eval'), force=True) eval_env = SimpleMonitor(eval_env) else: eval_env = None else: env = gym.make(env_id) if evaluation: eval_env = gym.make(env_id) else: eval_env = None # Parse noise_type action_noise = None param_noise = None nb_actions = env.action_space.shape[-1] for current_noise_type in noise_type.split(','): current_noise_type = current_noise_type.strip() if current_noise_type == 'none': pass elif 'adaptive-param' in current_noise_type: _, stddev = current_noise_type.split('_') param_noise = AdaptiveParamNoiseSpec(initial_stddev=float(stddev), desired_action_stddev=float(stddev)) elif 'normal' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = NormalActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) elif 'ou' in current_noise_type: _, stddev = current_noise_type.split('_') action_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(nb_actions), sigma=float(stddev) * np.ones(nb_actions)) else: raise RuntimeError('unknown noise type "{}"'.format(current_noise_type)) # Configure components. memory = Memory(limit=int(1e6), action_shape=env.action_space.shape, observation_shape=env.observation_space.shape) critic = Critic(layer_norm=layer_norm) actor = Actor(nb_actions, layer_norm=layer_norm) # Seed everything to make things reproducible. seed = seed + 1000000 * rank logger.info('rank {}: seed={}, logdir={}'.format(rank, seed, logger.get_dir())) tf.reset_default_graph() set_global_seeds(seed) env.seed(seed) if eval_env is not None: eval_env.seed(seed) # Disable logging for rank != 0 to avoid noise. if rank == 0: start_time = time.time() training.train(env=env, eval_env=eval_env, param_noise=param_noise, action_noise=action_noise, actor=actor, critic=critic, memory=memory, **kwargs) env.close() if eval_env is not None: eval_env.close() Logger.CURRENT.close() if rank == 0: logger.info('total runtime: {}s'.format(time.time() - start_time))
def make_env():
    e = gym.make(env_id)
    e = bench.Monitor(e, logger.get_dir(), allow_early_resets=True)
    e.seed(seed)
    return e
def make_env():
    env = gym.make(env_id)
    env = bench.Monitor(env, logger.get_dir(), allow_early_resets=True)
    return env
def main(): # configure logger, disable logging in child MPI processes (with rank > 0) arg_parser = common_arg_parser() arg_parser.add_argument('--options_play', help='Agent play with options', default=False, action='store_true') arg_parser.add_argument( '--selective_option_play', default=False, action='store_true', help='Agent play with selective option') args, unknown_args = arg_parser.parse_known_args() extra_args = {k: parse(v) for k, v in parse_unknown_args(unknown_args).items()} if MPI is None or MPI.COMM_WORLD.Get_rank() == 0: rank = 0 logger.configure() else: logger.configure(format_strs=[]) rank = MPI.COMM_WORLD.Get_rank() model, _ = train(args, extra_args) if args.save_path is not None and rank == 0: save_path = osp.expanduser(args.save_path) model.save(save_path) if args.play: logger.log("Running trained model") env = build_env(args) obs = env.reset() w, h, _ = obs[0].shape video = MovieWriter(osp.join(logger.get_dir(), "play.mp4"), (w, h), 2) video.add_frame(np.array(obs[0][:, :, ::-1], dtype=np.uint8)) import cv2 cv2.imwrite(os.path.join(logger.get_dir(), "init_ob.png"), obs[0][:, :, ::-1]) i = 0 while i < 1000: i += 1 actions = model.step(obs, stochastic=True)[0] obs, _, done, _ = env.step(actions) # env.render() video.add_frame(np.array(obs[0][:, :, ::-1], dtype=np.uint8)) # done = done.any() if isinstance(done, np.ndarray) else done done = done[0] if done: obs = env.reset() env.close() video.close() break if args.selective_option_play: logger.log("Running selective option play with trained options policy") env = build_env(args) obs = env.reset() w, h, _ = obs[0].shape video = MovieWriter( osp.join(logger.get_dir(), "selective_option_play.mp4"), (w, h), 2) video.add_frame(np.array(obs[0][:, :, ::-1], dtype=np.uint8)) import cv2 cv2.imwrite(os.path.join(logger.get_dir(), "init_ob.png"), obs[0][:, :, ::-1]) i = 0 while i < 1000: i += 1 actions = model.selective_option_step(obs, stochastic=True)[0] obs, _, done, _ = env.step(actions) # env.render() video.add_frame(np.array(obs[0][:, :, ::-1], dtype=np.uint8)) # done = done.any() if isinstance(done, np.ndarray) else done done = done[0] if done: obs = env.reset() env.close() video.close() break if args.options_play: logger.log("Running trained options policy") video_path = osp.join(logger.get_dir(), "options_play") if not osp.exists(video_path): os.mkdir(video_path) # assume 64 options for i in range(64): env = build_env(args) obs = env.reset() w, h, _ = obs[0].shape logger.log("Create op_play_{}.mp4".format(i)) video = MovieWriter( osp.join(video_path, "op_play_{}.mp4".format(i)), (w, h), 2) video.add_frame(np.array(obs[0][:, :, ::-1], dtype=np.uint8)) option_z = np.zeros((env.num_envs, 64)) option_z[:, i] = 1.0 step = 1 while step < 1000: actions = model.option_step(option_z, obs, stochastic=True)[0] discri = model.option_select(obs)[0] logger.log("step: {} discriminator: {}".format(step, discri)) obs, _, done, _ = env.step(actions) video.add_frame(np.array(obs[0][:, :, ::-1], dtype=np.uint8)) done = done[0] step += 1 if done: obs = env.reset() env.close() video.close() break
def learn(*, network, env, total_timesteps, sess=None, fixstd=False, logstd_init=0, eval_env=None, seed=None, nsteps=2048, ent_coef=0.0, lr=3e-4, vf_coef=0.5, max_grad_norm=None, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, save_interval=0, load_path=None, **network_kwargs): ''' Learn policy using PPO algorithm (https://arxiv.org/abs/1707.06347) Parameters: ---------- network: policy network architecture. Either string (mlp, lstm, lnlstm, cnn_lstm, cnn, cnn_small, conv_only - see baselines.common/models.py for full list) specifying the standard network architecture, or a function that takes tensorflow tensor as input and returns tuple (output_tensor, extra_feed) where output tensor is the last network layer output, extra_feed is None for feed-forward neural nets, and extra_feed is a dictionary describing how to feed state into the network for recurrent neural nets. See common/models.py/lstm for more details on using recurrent nets in policies env: baselines.common.vec_env.VecEnv environment. Needs to be vectorized for parallel environment simulation. The environments produced by gym.make can be wrapped using baselines.common.vec_env.DummyVecEnv class. nsteps: int number of steps of the vectorized environment per update (i.e. batch size is nsteps * nenv where nenv is number of environment copies simulated in parallel) total_timesteps: int number of timesteps (i.e. number of actions taken in the environment) ent_coef: float policy entropy coefficient in the optimization objective lr: float or function learning rate, constant or a schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training. vf_coef: float value function loss coefficient in the optimization objective max_grad_norm: float or None gradient norm clipping coefficient gamma: float discounting factor lam: float advantage estimation discounting factor (lambda in the paper) log_interval: int number of timesteps between logging events nminibatches: int number of training minibatches per update. For recurrent policies, should be smaller or equal than number of environments run in parallel. noptepochs: int number of training epochs per update cliprange: float or function clipping range, constant or schedule function [0,1] -> R+ where 1 is beginning of the training and 0 is the end of the training save_interval: int number of timesteps between saving events load_path: str path to load the model from **network_kwargs: keyword arguments to the policy / network builder. See baselines.common/policies.py/build_policy and arguments to a particular type of network For instance, 'mlp' network architecture has arguments num_hidden and num_layers. 
''' set_global_seeds(seed) if isinstance(lr, float): lr = constfn(lr) else: assert callable(lr) if isinstance(cliprange, float): cliprange = constfn(cliprange) else: assert callable(cliprange) total_timesteps = int(total_timesteps) policy = build_policy(env, network, **network_kwargs) # Get the nb of env nenvs = env.num_envs # Get state_space and action_space ob_space = env.observation_space ac_space = env.action_space obdim = ob_space.shape[0] acdim = ac_space.shape[0] print("obdim : ", ob_space.shape[0]) print("acdim : ", ac_space.shape[0]) # Calculate the batch_size nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches # Instantiate the model object (that creates act_model and train_model) make_model = lambda: Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm) model = make_model() if load_path is not None: model.load(load_path) # Instantiate the runner object runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam) if eval_env is not None: eval_runner = EvalRunner(env=eval_env, model=model, nsteps=5000, gamma=gamma, lam=lam) eval_runner.obfilt = runner.obfilt eval_runner.rewfilt = runner.rewfilt epinfobuf = deque(maxlen=10) if eval_env is not None: eval_epinfobuf = deque(maxlen=10) # Start total timer tfirststart = time.time() nupdates = total_timesteps // nbatch logstd_var = None for var in tf.trainable_variables(): if 'logstd' in var.name: logstd_var = var sess.run(logstd_var.assign(logstd_init * np.ones(shape=(1, acdim)))) for update in range(1, nupdates + 1): assert nbatch % nminibatches == 0 # Start timer tstart = time.time() frac = 1.0 - (update - 1.0) / nupdates # Calculate the learning rate lrnow = lr(frac) # Calculate the cliprange cliprangenow = cliprange(frac) # Get minibatch if fixstd: frac_ = np.maximum(1.0 - (update - 1.0) / nupdates, 0) sess.run( logstd_var.assign(((logstd_init + 1) * frac_ - 1) * np.ones(shape=(1, acdim)))) obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run( ) #pylint: disable=E0632 edge_s = np.transpose(obs)[:15].reshape(-1, 3, len(obs)) cloud_s = np.vstack([path['cloud_cpu_used'] for path in epinfos]) edge_queue = edge_s[2] - edge_s[1] # shape (episode length, 8) edge_cpu = edge_s[3] workload = edge_s[4] cloud_cpu = cloud_s[0] edge_queue_avg = edge_queue.mean() # shape (,) # cloud_queue_avg = cloud_queue.mean() # float edge_power = 10 * (40 * edge_cpu.sum(axis=0) * (10**9) / 10)**3 # shape (5000,) cloud_power = 54 * (216 * cloud_cpu * (10**9) / 54)**3 # shape (5000,) edge_power_avg = edge_power.mean() cloud_power_avg = cloud_power.mean() power = edge_power_avg + cloud_power_avg if eval_env is not None: eval_obs, eval_epinfos = eval_runner.run() #pylint: disable=E0632 eval_edge_s = np.transpose(eval_obs)[:15].reshape( -1, 3, len(eval_obs)) eval_cloud_s = np.vstack( [path['cloud_cpu_used'] for path in eval_epinfos]) eval_edge_queue = eval_edge_s[2] - eval_edge_s[ 1] # shape (episode length, 8) eval_edge_cpu = eval_edge_s[3] eval_workload = eval_edge_s[4] # eval_cloud_queue = eval_cloud_s[2] eval_cloud_cpu = eval_cloud_s[0] eval_edge_queue_avg = eval_edge_queue.mean() # shape (,) # eval_cloud_queue_avg = eval_cloud_queue.mean() # float eval_edge_power = 10 * (40 * eval_edge_cpu.sum(axis=0) * (10**9) / 10)**3 # shape (5000,) eval_cloud_power = 54 * (216 * eval_cloud_cpu * (10**9) / 54)**3 # shape (5000,) eval_edge_power_avg = eval_edge_power.mean() eval_cloud_power_avg = 
eval_cloud_power.mean() eval_power = eval_edge_power_avg + eval_cloud_power_avg epinfobuf.extend(epinfos) if eval_env is not None: eval_epinfobuf.extend(eval_epinfos) obs = runner.obfilt(obs) # Here what we're going to do is for each minibatch calculate the loss and append it. mblossvals = [] if states is None: # nonrecurrent version # Index of each element of batch_size # Create the indices array inds = np.arange(nbatch) for _ in range(noptepochs): # Randomize the indexes np.random.shuffle(inds) # 0 to batch_size with batch_train_size step for start in range(0, nbatch, nbatch_train): end = start + nbatch_train mbinds = inds[start:end] slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mblossvals.append(model.train(lrnow, cliprangenow, *slices)) else: # recurrent version assert nenvs % nminibatches == 0 envsperbatch = nenvs // nminibatches envinds = np.arange(nenvs) flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps) envsperbatch = nbatch_train // nsteps for _ in range(noptepochs): np.random.shuffle(envinds) for start in range(0, nenvs, envsperbatch): end = start + envsperbatch mbenvinds = envinds[start:end] mbflatinds = flatinds[mbenvinds].ravel() slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs)) mbstates = states[mbenvinds] mblossvals.append( model.train(lrnow, cliprangenow, *slices, mbstates)) # Feedforward --> get losses --> update lossvals = np.mean(mblossvals, axis=0) # End timer tnow = time.time() # Calculate the fps (frame per second) fps = int(nbatch / (tnow - tstart)) if update % log_interval == 0 or update == 1: # Calculates if value function is a good predicator of the returns (ev > 1) # or if it's just worse than predicting nothing (ev =< 0) ev = explained_variance(values, returns) logger.logkv("serial_timesteps", update * nsteps) logger.logkv("nupdates", update) logger.logkv("total_timesteps", update * nbatch) logger.logkv("fps", fps) logger.logkv("explained_variance", float(ev)) logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf])) logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf])) logger.logkv("edge_queue_avg", edge_queue_avg) logger.logkv("power", power) for i in range(200): start_ = i * 100 end_ = (i + 1) * 100 logger.logkv("eval_q_1_%02d%02d" % (i, i + 1), np.mean(eval_edge_queue[0, start_:end_])) logger.logkv("eval_q_2_%02d%02d" % (i, i + 1), np.mean(eval_edge_queue[1, start_:end_])) logger.logkv("eval_q_3_%02d%02d" % (i, i + 1), np.mean(eval_edge_queue[2, start_:end_])) logger.logkv( "eval_q_var_%02d%02d" % (i, i + 1), np.mean(np.var(eval_edge_queue[:3, start_:end_], axis=0))) logger.logkv("eval_q_1_all", np.mean(eval_edge_queue[0, :])) logger.logkv("eval_q_2_all", np.mean(eval_edge_queue[1, :])) logger.logkv("eval_q_3_all", np.mean(eval_edge_queue[2, :])) logger.logkv("eval_q_var_all", np.mean(np.var(eval_edge_queue[:3, :], axis=0))) logger.logkv("eval_edge_queue_avg", eval_edge_queue_avg) logger.logkv("eval_power", eval_power) # logger.record_tabular('return-average', np.mean(total_returns)) # logger.record_tabular('return-average', np.mean(total_returns)) # logger.record_tabular('return-average', np.mean(total_returns)) # logger.record_tabular('return-min', np.min(total_returns)) # logger.record_tabular('return-max', np.max(total_returns)) # logger.record_tabular('return-std', np.std(total_returns)) # logger.record_tabular('episode-length-avg', np.mean(episode_lengths)) # logger.record_tabular('episode-length-min', np.min(episode_lengths)) # 
logger.record_tabular('episode-length-max', np.max(episode_lengths)) # logger.record_tabular('episode-length-std', np.std(episode_lengths)) # if eval_env is not None: # logger.logkv('eval_eprewmean', safemean([epinfo['r'] for epinfo in eval_epinfobuf]) ) # logger.logkv('eval_eplenmean', safemean([epinfo['l'] for epinfo in eval_epinfobuf]) ) # logger.logkv("eval_edge_queue_avg", eval_edge_queue_avg) # logger.logkv("eval_power", eval_power) # logger.logkv('time_elapsed', tnow - tfirststart) for (lossval, lossname) in zip(lossvals, model.loss_names): logger.logkv(lossname, lossval) if MPI is None or MPI.COMM_WORLD.Get_rank() == 0: logger.dumpkvs() if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir() and ( MPI is None or MPI.COMM_WORLD.Get_rank() == 0): checkdir = osp.join(logger.get_dir(), 'checkpoints') os.makedirs(checkdir, exist_ok=True) savepath = osp.join(checkdir, '%.5i' % update) print('Saving to', savepath) model.save(savepath) return model
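# Hedged usage sketch (added for illustration; not part of the original source): calling
# the PPO learn() above. It needs an explicit TF session (the action log-std variable is
# assigned through sess) and a vectorized continuous-action env; the env factory is an
# assumption.
def _example_run_ppo_fixstd(make_vec_env, seed=0):
    import baselines.common.tf_util as U
    sess = U.get_session()
    env = make_vec_env()
    model = learn(network='mlp', env=env, sess=sess, seed=seed,
                  total_timesteps=int(1e6),
                  fixstd=True, logstd_init=0,   # with fixstd, the log-std is annealed from logstd_init towards -1
                  nsteps=2048, nminibatches=4, noptepochs=4,
                  lr=3e-4, cliprange=0.2)
    return model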
def build_env(args): ncpu = multiprocessing.cpu_count() if sys.platform == 'darwin': ncpu //= 2 nenv = args.num_env or ncpu alg = args.alg rank = MPI.COMM_WORLD.Get_rank() if MPI else 0 seed = args.seed env_type, env_id = get_env_type(args.env) if env_type == 'mujoco': get_session(tf.ConfigProto(allow_soft_placement=True, intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)) if args.num_env: env = SubprocVecEnv([lambda: make_mujoco_env(env_id, seed + i if seed is not None else None, args.reward_scale) for i in range(args.num_env)]) else: env = DummyVecEnv([lambda: make_mujoco_env(env_id, seed, args.reward_scale)]) env = VecNormalize(env) elif env_type == 'atari': if alg == 'acer': env = make_atari_env(env_id, nenv, seed) elif alg == 'deepq': env = atari_wrappers.make_atari(env_id) env.seed(seed) env = bench.Monitor(env, logger.get_dir()) env = atari_wrappers.wrap_deepmind(env, frame_stack=True, scale=True) elif alg == 'trpo_mpi': env = atari_wrappers.make_atari(env_id) env.seed(seed) env = bench.Monitor(env, logger.get_dir() and osp.join(logger.get_dir(), str(rank))) env = atari_wrappers.wrap_deepmind(env) # TODO check if the second seeding is necessary, and eventually remove env.seed(seed) else: frame_stack_size = 4 env = VecFrameStack(make_atari_env(env_id, nenv, seed), frame_stack_size) elif env_type == 'retro': import retro gamestate = args.gamestate or 'Level1-1' env = retro_wrappers.make_retro(game=args.env, state=gamestate, max_episode_steps=10000, use_restricted_actions=retro.Actions.DISCRETE) env.seed(args.seed) env = bench.Monitor(env, logger.get_dir()) env = retro_wrappers.wrap_deepmind_retro(env) elif env_type == 'classic_control': def make_env(): e = gym.make(env_id) e = bench.Monitor(e, logger.get_dir(), allow_early_resets=True) e.seed(seed) return e env = DummyVecEnv([make_env]) elif env_type == 'maze': frame_stack_size = 1 env = VecFrameStack(make_maze_env(env_id, nenv, seed), frame_stack_size) else: raise ValueError('Unknown env_type {}'.format(env_type)) return env
def osi_train_callback(model, name, iter):
    # Dump the current OSI model parameters into the run's log directory.
    params = model.get_variable_dict()
    joblib.dump(params, logger.get_dir() + '/osi_params.pkl', compress=True)
def main(env_name, seed, run_num, data_saving_path, batch_size_per_process, num_iterations, autoencoder_base="./novelty_data/local/autoencoders/"): num_processes = MPI.COMM_WORLD.Get_size() num_timesteps_per_process = batch_size_per_process num_iterations_enforce = num_iterations import baselines.common.tf_util as U comm = MPI.COMM_WORLD mpi_rank = comm.Get_rank() tf.reset_default_graph() with U.single_threaded_session() as sess: autoencoder_list = [] for i in range(run_num): autoencoder_model = load_model(autoencoder_base + env_name + '_autoencoder_seed_' + str(seed) + '_run_' + str(i) + '.h5') autoencoder_list.append(autoencoder_model) U.ALREADY_INITIALIZED.update(set(tf.global_variables())) logger.reset() # logger.configure( # '../data/ppo_' + enforce_env_name + '_autoencoder_' + str(len(autoencoder_list)) + '_seed=' + str( # seed) + '/' + str(st)) logger.configure(data_saving_path) model = train(sess, env_name, num_timesteps=num_iterations_enforce * num_processes * num_timesteps_per_process, timesteps_per_actor=num_timesteps_per_process, autoencoders=autoencoder_list, seed=seed) if mpi_rank == 0: env = gym.make(env_name) env.env.novel_autoencoders = autoencoder_list if hasattr(env.env, 'disableViewer'): env.env.disableViewer = False env = wrappers.Monitor(env, logger.get_dir() + '/../results', force=True) obs = env.reset() step = 0 while (True): env.render() actions = model._act(False, obs) obs, _, done, _ = env.step(actions[0][0]) env.render() if done: obs = env.reset() if done: print("Visualization is Done") break step += 1
def get_filename(self):
    # KC modif
    # filename = os.path.join(logger.get_dir(), 'videos_{}.pk'.format(MPI.COMM_WORLD.Get_rank()))
    filename = os.path.join(logger.get_dir(), 'videos_{}.pk'.format(0))
    return filename
def train(sess, env_id, num_timesteps, timesteps_per_actor, autoencoders, seed): from baselines.ppo1 import pposgd_novelty, mlp_policy_novelty, pposgd_novelty_projection rank = MPI.COMM_WORLD.Get_rank() workerseed = seed * 50 + 10000 * MPI.COMM_WORLD.Get_rank( ) if seed is not None else None set_global_seeds(workerseed) env = gym.make(env_id) def policy_fn(name, ob_space, ac_space): # pylint: disable=W0613 return mlp_policy_novelty.MlpPolicyNovelty( name=name, ob_space=ob_space, ac_space=ac_space, hid_size=64, num_hid_layers=3, ) env.env.novel_autoencoders = autoencoders env = bench.Monitor( env, logger.get_dir() and osp.join(logger.get_dir(), str(rank))) env.seed(seed + rank) if len(autoencoders) == 0: print("NO AUTOENCODER!!") model = pposgd_novelty.learn( env, policy_fn, max_timesteps=num_timesteps, # max_iters=30, timesteps_per_actorbatch=timesteps_per_actor, clip_param=0.2, entcoeff=0, optim_epochs=3, optim_stepsize=1e-3, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear', callback=callback, # Following params are kwargs session=sess, gap=5, ) else: print("AUTOENCODER LENGTH: ", len(autoencoders)) model = pposgd_novelty_projection.learn( env, policy_fn, max_timesteps=num_timesteps, # max_iters=30, timesteps_per_actorbatch=timesteps_per_actor, clip_param=0.2, entcoeff=0, optim_epochs=3, optim_stepsize=1e-3, optim_batchsize=64, gamma=0.99, lam=0.95, schedule='linear', callback=callback, # Following params are kwargs session=sess, gap=5, ) env.close() return model
def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr_alpha, vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95, log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2, save_interval=0, r_ex_coef=0, r_in_coef=1, lr_beta=1E-4, reward_freq=1): if isinstance(lr_alpha, float): lr_alpha = constfn(lr_alpha) else: assert callable(lr_alpha) if isinstance(lr_beta, float): lr_beta = constfn(lr_beta) else: assert callable(lr_beta) if isinstance(cliprange, float): cliprange = constfn(cliprange) else: assert callable(cliprange) total_timesteps = int(total_timesteps) nenvs = env.num_envs ob_space = env.observation_space ac_space = env.action_space nbatch = nenvs * nsteps nbatch_train = nbatch // nminibatches make_model = lambda: Model(policy=policy, ob_space=ob_space, ac_space=ac_space, nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps, ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm, r_ex_coef=r_ex_coef, r_in_coef=r_in_coef) if save_interval and logger.get_dir(): import cloudpickle with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh: fh.write(cloudpickle.dumps(make_model)) model = make_model() runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam, r_ex_coef=r_ex_coef, r_in_coef=r_in_coef, reward_freq=reward_freq) epinfobuf = deque(maxlen=100) eprinbuf = deque(maxlen=100) tfirststart = time.time() nupdates = total_timesteps // nbatch for update in range(1, nupdates + 1): assert nbatch % nminibatches == 0 nbatch_train = nbatch // nminibatches tstart = time.time() frac = 1.0 - (update - 1.0) / nupdates cur_lr_alpha = lr_alpha(frac) cur_lr_beta = lr_beta(frac) cur_cliprange = cliprange(frac) obs, masks, actions, neglogpacs, r_ex, r_in, ret_ex, ret_mix, v_ex, v_mix, td_mix, states,\ epinfos, ep_r_ex, ep_r_in, ep_len = runner.run() #pylint: disable=E0632 epinfobuf.extend(epinfos) eprinbuf.extend(ep_r_in) mblossvals = [] if states is None: # nonrecurrent version inds = np.arange(nbatch) for _ in range(noptepochs): np.random.shuffle(inds) for start in range(0, nbatch, nbatch_train): end = start + nbatch_train mbinds = inds[start:end] coef_mat = np.zeros([nbatch_train, nbatch], "float32") for i in range(nbatch_train): coef = 1.0 for j in range(mbinds[i], nbatch): if j > mbinds[i] and (masks[j] or j % nsteps == 0): break coef_mat[i][j] = coef coef *= gamma * lam entropy, approxkl, clipfrac = model.train( obs[mbinds], obs, actions[mbinds], actions, neglogpacs[mbinds], None, masks[mbinds], r_ex, ret_ex[mbinds], v_ex[mbinds], td_mix, v_mix[mbinds], coef_mat, cur_lr_alpha, cur_lr_beta, cur_cliprange) else: # recurrent version raise NotImplementedError lossvals = np.mean(mblossvals, axis=0) tnow = time.time() fps = int(nbatch / (tnow - tstart)) if update % log_interval == 0 or update == 1: logger.logkv("serial_timesteps", update * nsteps) logger.logkv("nupdates", update) logger.logkv("total_timesteps", update * nbatch) logger.logkv("fps", fps) logger.logkv('mean_reward', safemean([epinfo['r'] for epinfo in epinfobuf])) logger.logkv('mean_length', safemean([epinfo['l'] for epinfo in epinfobuf])) logger.logkv('time_elapsed', tnow - tfirststart) v_ex_ev = explained_variance(v_ex, ret_ex) logger.logkv("v_ex_ev", float(v_ex_ev)) v_mix_ev = explained_variance(v_mix, ret_mix) logger.logkv("v_mix_ev", float(v_mix_ev)) logger.logkv("entropy", float(entropy)) logger.logkv("approxkl", float(approxkl)) logger.logkv("clipfrac", float(clipfrac)) logger.dumpkvs() if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir(): 
checkdir = osp.join(logger.get_dir(), 'checkpoints') os.makedirs(checkdir, exist_ok=True) savepath = osp.join(checkdir, '%.5i' % update) print('Saving to', savepath) model.save(savepath) env.close()
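# Illustrative sketch (added; not part of the original source): a standalone look at the
# coef_mat built inside the minibatch loop above. Row i holds (gamma * lam)^k discount
# coefficients for the timesteps that follow sampled index mbinds[i], cut off at a done
# mask or at an environment boundary (j % nsteps == 0).
import numpy as np

def build_coef_mat(mbinds, masks, nbatch, nsteps, gamma=0.99, lam=0.95):
    coef_mat = np.zeros((len(mbinds), nbatch), dtype="float32")
    for i, start in enumerate(mbinds):
        coef = 1.0
        for j in range(start, nbatch):
            if j > start and (masks[j] or j % nsteps == 0):
                break
            coef_mat[i][j] = coef
            coef *= gamma * lam
    return coef_mat

# Tiny example: a single env, nsteps=6, with a done mask set at t=3.
# Row 0 (start index 0) covers t=0..2 with 1, gamma*lam, (gamma*lam)^2;
# row 1 (start index 4) covers t=4..5.
_masks = np.array([0, 0, 0, 1, 0, 0], dtype=bool)
print(build_coef_mat(mbinds=[0, 4], masks=_masks, nbatch=6, nsteps=6))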
def __init__(self, make_env, hps, num_timesteps, envs_per_process): self.make_env = make_env self.hps = hps self.envs_per_process = envs_per_process self.num_timesteps = num_timesteps self._set_env_vars() self.policy = CnnPolicy(scope='pol', ob_space=self.ob_space, ac_space=self.ac_space, hidsize=512, feat_dim=hps['feat_dim'], ob_mean=self.ob_mean, ob_std=self.ob_std, layernormalize=False, nl=tf.nn.leaky_relu) self.feature_extractor = { "none": FeatureExtractor, "idf": InverseDynamics, "vaesph": partial(VAE, spherical_obs=True), "vaenonsph": partial(VAE, spherical_obs=False), "pix2pix": JustPixels }[hps['feat_learning']] self.feature_extractor = self.feature_extractor( policy=self.policy, features_shared_with_policy=False, feat_dim=hps['feat_dim'], layernormalize=hps['layernorm']) self.intrinsic_model = IntrinsicModel if hps[ 'feat_learning'] != 'pix2pix' else UNet self.intrinsic_model = self.intrinsic_model( auxiliary_task=self.feature_extractor, predict_from_pixels=hps['dyn_from_pixels'], feature_space=hps['feature_space'], nsteps_per_seg=hps['nsteps_per_seg'], feat_dim=hps['feat_dim'], naudio_samples=hps['naudio_samples'], train_discriminator=hps['train_discriminator'], discriminator_weighted=hps['discriminator_weighted'], noise_multiplier=hps['noise_multiplier'], concat=hps['concat'], log_dir=logger.get_dir(), make_video=hps['checkpoint_path'] != '') self.agent = PpoOptimizer(scope='ppo', ob_space=self.ob_space, ac_space=self.ac_space, stochpol=self.policy, use_news=hps['use_news'], gamma=hps['gamma'], lam=hps["lambda"], nepochs=hps['nepochs'], nminibatches=hps['nminibatches'], lr=hps['lr'], cliprange=0.1, nsteps_per_seg=hps['nsteps_per_seg'], nsegs_per_env=hps['nsegs_per_env'], ent_coef=hps['ent_coeff'], normrew=hps['norm_rew'], normadv=hps['norm_adv'], ext_coeff=hps['ext_coeff'], int_coeff=hps['int_coeff'], feature_space=hps['feature_space'], intrinsic_model=self.intrinsic_model, log_dir=logger.get_dir(), checkpoint_path=hps['checkpoint_path']) self.agent.to_report['aux'] = tf.reduce_mean( self.feature_extractor.loss) self.agent.total_loss += self.agent.to_report['aux'] if hps['feature_space'] == 'joint': self.agent.to_report['dyn_visual_loss'] = tf.reduce_mean( self.intrinsic_model.visual_loss) self.agent.to_report['dyn_audio_loss'] = tf.reduce_mean( self.intrinsic_model.audio_loss) self.agent.to_report['discrim_train_loss'] = tf.reduce_mean( self.intrinsic_model.discrim_train_loss) self.agent.to_report['intrinsic_model_loss'] = tf.reduce_mean( self.intrinsic_model.loss) elif hps['train_discriminator']: self.agent.to_report['intrinsic_model_loss'] = tf.reduce_mean( self.intrinsic_model.discrim_train_loss) else: self.agent.to_report['intrinsic_model_loss'] = tf.reduce_mean( self.intrinsic_model.loss) self.agent.total_loss += self.agent.to_report['intrinsic_model_loss'] self.agent.to_report['feat_var'] = tf.reduce_mean( tf.nn.moments(self.feature_extractor.features, [0, 1])[1])
def launch(env_name, logdir, n_epochs, num_cpu, seed, replay_strategy,
           policy_save_interval, clip_return, override_params={}, save_policies=True):
    # Fork for multi-CPU MPI implementation.
    if num_cpu > 1:
        whoami = mpi_fork(num_cpu)
        if whoami == 'parent':
            sys.exit(0)
        import baselines.common.tf_util as U
        U.single_threaded_session().__enter__()
    rank = MPI.COMM_WORLD.Get_rank()

    # Configure logging
    if rank == 0:
        if logdir or logger.get_dir() is None:
            logger.configure(dir=logdir)
    else:
        logger.configure()
    logdir = logger.get_dir()
    assert logdir is not None
    os.makedirs(logdir, exist_ok=True)

    # Seed everything.
    rank_seed = seed + 1000000 * rank
    set_global_seeds(rank_seed)

    # Prepare params.
    params = config.DEFAULT_PARAMS
    params['env_name'] = env_name
    params['replay_strategy'] = replay_strategy
    if env_name in config.DEFAULT_ENV_PARAMS:
        params.update(config.DEFAULT_ENV_PARAMS[env_name])  # merge env-specific parameters in
    params.update(**override_params)  # makes it possible to override any parameter
    with open(os.path.join(logger.get_dir(), 'params.json'), 'w') as f:
        json.dump(params, f)
    params = config.prepare_params(params)
    config.log_params(params, logger=logger)

    dims = config.configure_dims(params)
    policy = config.configure_ddpg(dims=dims, params=params, clip_return=clip_return)

    rollout_params = {
        'exploit': False,
        'use_target_net': False,
        'use_demo_states': True,
        'compute_Q': False,
        'T': params['T'],
    }
    eval_params = {
        'exploit': True,
        'use_target_net': params['test_with_polyak'],
        'use_demo_states': False,
        'compute_Q': True,
        'T': params['T'],
    }
    for name in ['T', 'rollout_batch_size', 'gamma', 'noise_eps', 'random_eps']:
        rollout_params[name] = params[name]
        eval_params[name] = params[name]

    rollout_worker = RolloutWorker(params['make_env'], policy, dims, logger, **rollout_params)
    rollout_worker.seed(rank_seed)

    evaluator = RolloutWorker(params['make_env'], policy, dims, logger, **eval_params)
    evaluator.seed(rank_seed)

    train(logdir=logdir, policy=policy, rollout_worker=rollout_worker, evaluator=evaluator,
          n_epochs=n_epochs, n_test_rollouts=params['n_test_rollouts'],
          n_cycles=params['n_cycles'], n_batches=params['n_batches'],
          policy_save_interval=policy_save_interval, save_policies=save_policies)
def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr,
          vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95,
          log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2,
          save_interval=0):
    if isinstance(lr, float):
        lr = constfn(lr)
    else:
        assert callable(lr)
    if isinstance(cliprange, float):
        cliprange = constfn(cliprange)
    else:
        assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    make_model = lambda: Model(policy=policy, ob_space=ob_space, ac_space=ac_space,
                               nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps,
                               ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm)
    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()
    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

    epinfobuf = deque(maxlen=100)
    tfirststart = time.time()
    nupdates = total_timesteps // nbatch
    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0
        nbatch_train = nbatch // nminibatches
        tstart = time.time()
        frac = 1.0 - (update - 1.0) / nupdates
        lrnow = lr(frac)
        cliprangenow = cliprange(frac)
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run()  #pylint: disable=E0632
        epinfobuf.extend(epinfos)
        mblossvals = []
        if states is None:  # nonrecurrent version
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                np.random.shuffle(inds)
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices))
        else:  # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            envsperbatch = nbatch_train // nsteps
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates))

        lossvals = np.mean(mblossvals, axis=0)
        tnow = time.time()
        fps = int(nbatch / (tnow - tstart))
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, returns)
            logger.logkv("serial_timesteps", update * nsteps)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update * nbatch)
            logger.logkv("fps", fps)
            logger.logkv("explained_variance", float(ev))
            logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.logkv('time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv(lossname, lossval)
            logger.dumpkvs()
        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)
    env.close()
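For reference, the recurrent branch above keeps whole environments together in each minibatch so that recurrent states stay aligned with their trajectories. A self-contained NumPy sketch of just that index bookkeeping (sizes below are made up for illustration) is:

import numpy as np

nenvs, nsteps, nminibatches = 8, 4, 4
envsperbatch = nenvs // nminibatches
flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)  # row e = timesteps of env e

envinds = np.arange(nenvs)
np.random.shuffle(envinds)
for start in range(0, nenvs, envsperbatch):
    mbenvinds = envinds[start:start + envsperbatch]
    mbflatinds = flatinds[mbenvinds].ravel()  # every timestep of the chosen envs, in order
    print(mbenvinds, mbflatinds)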
def learn(network, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval=1,
          nprocs=32, nsteps=20, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0,
          lr=0.25, max_grad_norm=0.5, kfac_clip=0.001, save_interval=None,
          lrschedule='linear', load_path=None, is_async=True, **network_kwargs):
    set_global_seeds(seed)

    if network == 'cnn':
        network_kwargs['one_dim_bias'] = True
    policy = build_policy(env, network, **network_kwargs)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    make_model = lambda: Model(policy, ob_space, ac_space, nenvs, total_timesteps,
                               nprocs=nprocs, nsteps=nsteps, ent_coef=ent_coef,
                               vf_coef=vf_coef, vf_fisher_coef=vf_fisher_coef, lr=lr,
                               max_grad_norm=max_grad_norm, kfac_clip=kfac_clip,
                               lrschedule=lrschedule, is_async=is_async)
    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()

    if load_path is not None:
        model.load(load_path)

    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
    epinfobuf = deque(maxlen=100)
    nbatch = nenvs * nsteps
    tstart = time.time()
    coord = tf.train.Coordinator()
    if is_async:
        enqueue_threads = model.q_runner.create_threads(model.sess, coord=coord, start=True)
    else:
        enqueue_threads = []

    for update in range(1, total_timesteps // nbatch + 1):
        obs, states, rewards, masks, actions, values, epinfos = runner.run()
        epinfobuf.extend(epinfos)
        policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
        model.old_obs = obs
        nseconds = time.time() - tstart
        fps = int((update * nbatch) / nseconds)
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("policy_loss", float(policy_loss))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular("eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.record_tabular("eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.dump_tabular()
        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
            savepath = osp.join(logger.get_dir(), 'checkpoint%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)

    coord.request_stop()
    coord.join(enqueue_threads)
    return model
def _thunk():
    env = make_atari(env_id)
    env.seed(seed + rank)
    env = bench.Monitor(env, logger.get_dir() and os.path.join(logger.get_dir(), str(rank)))
    gym.logger.setLevel(logging.WARN)
    return wrap_deepmind(env)
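Thunks like the one above are normally handed to a vectorized-environment wrapper that runs one copy per subprocess. A rough sketch of that usage, assuming the baselines SubprocVecEnv and the older gym seeding API (the make_thunk helper is hypothetical), is:

import gym
from baselines.common.vec_env.subproc_vec_env import SubprocVecEnv

def make_thunk(env_id, seed, rank):
    def _thunk():
        env = gym.make(env_id)
        env.seed(seed + rank)  # per-worker seed, as in the snippet above
        return env
    return _thunk

venv = SubprocVecEnv([make_thunk('PongNoFrameskip-v4', 0, i) for i in range(8)])
obs = venv.reset()  # batched observations, one per worker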
def train(env, nb_epochs, nb_epoch_cycles, render_eval, reward_scale, render, param_noise,
          actor, critic, normalize_returns, normalize_observations, critic_l2_reg, actor_lr,
          critic_lr, action_noise, popart, gamma, clip_norm, nb_train_steps, nb_rollout_steps,
          nb_eval_steps, batch_size, memory, tau=0.01, eval_env=None,
          param_noise_adaption_interval=50):
    rank = MPI.COMM_WORLD.Get_rank()

    assert (np.abs(env.action_space.low) == env.action_space.high).all()  # we assume symmetric actions.
    max_action = env.action_space.high
    logger.info('scaling actions by {} before executing in env'.format(max_action))
    agent = DDPG(actor, critic, memory, env.observation_space.shape, env.action_space.shape,
                 gamma=gamma, tau=tau, normalize_returns=normalize_returns,
                 normalize_observations=normalize_observations, batch_size=batch_size,
                 action_noise=action_noise, param_noise=param_noise, critic_l2_reg=critic_l2_reg,
                 actor_lr=actor_lr, critic_lr=critic_lr, enable_popart=popart,
                 clip_norm=clip_norm, reward_scale=reward_scale)
    logger.info('Using agent with the following configuration:')
    logger.info(str(agent.__dict__.items()))

    # Set up logging stuff only for a single worker.
    if rank == 0:
        saver = tf.train.Saver()
    else:
        saver = None

    step = 0
    episode = 0
    eval_episode_rewards_history = deque(maxlen=100)
    episode_rewards_history = deque(maxlen=100)
    with U.single_threaded_session() as sess:
        # Prepare everything.
        agent.initialize(sess)
        sess.graph.finalize()

        agent.reset()
        obs = env.reset()
        if eval_env is not None:
            eval_obs = eval_env.reset()
        done = False
        episode_reward = 0.
        episode_step = 0
        episodes = 0
        t = 0

        epoch = 0
        start_time = time.time()

        epoch_episode_rewards = []
        epoch_episode_steps = []
        epoch_episode_eval_rewards = []
        epoch_episode_eval_steps = []
        epoch_start_time = time.time()
        epoch_actions = []
        epoch_qs = []
        epoch_episodes = 0
        for epoch in range(nb_epochs):
            for cycle in range(nb_epoch_cycles):
                # Perform rollouts.
                for t_rollout in range(nb_rollout_steps):
                    # Predict next action.
                    action, q = agent.pi(obs, apply_noise=True, compute_Q=True)
                    assert action.shape == env.action_space.shape

                    # Execute next action.
                    if rank == 0 and render:
                        env.render()
                    assert max_action.shape == action.shape
                    new_obs, r, done, info = env.step(max_action * action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                    t += 1
                    if rank == 0 and render:
                        env.render()
                    episode_reward += r
                    episode_step += 1

                    # Book-keeping.
                    epoch_actions.append(action)
                    epoch_qs.append(q)
                    agent.store_transition(obs, action, r, new_obs, done)
                    obs = new_obs

                    if done:
                        # Episode done.
                        epoch_episode_rewards.append(episode_reward)
                        episode_rewards_history.append(episode_reward)
                        epoch_episode_steps.append(episode_step)
                        episode_reward = 0.
                        episode_step = 0
                        epoch_episodes += 1
                        episodes += 1

                        agent.reset()
                        obs = env.reset()

                # Train.
                epoch_actor_losses = []
                epoch_critic_losses = []
                epoch_adaptive_distances = []
                for t_train in range(nb_train_steps):
                    # Adapt param noise, if necessary.
                    if memory.nb_entries >= batch_size and t % param_noise_adaption_interval == 0:
                        distance = agent.adapt_param_noise()
                        epoch_adaptive_distances.append(distance)

                    cl, al = agent.train()
                    epoch_critic_losses.append(cl)
                    epoch_actor_losses.append(al)
                    agent.update_target_net()

                # Evaluate.
                eval_episode_rewards = []
                eval_qs = []
                if eval_env is not None:
                    eval_episode_reward = 0.
                    for t_rollout in range(nb_eval_steps):
                        eval_action, eval_q = agent.pi(eval_obs, apply_noise=False, compute_Q=True)
                        eval_obs, eval_r, eval_done, eval_info = eval_env.step(max_action * eval_action)  # scale for execution in env (as far as DDPG is concerned, every action is in [-1, 1])
                        if render_eval:
                            eval_env.render()
                        eval_episode_reward += eval_r

                        eval_qs.append(eval_q)
                        if eval_done:
                            eval_obs = eval_env.reset()
                            eval_episode_rewards.append(eval_episode_reward)
                            eval_episode_rewards_history.append(eval_episode_reward)
                            eval_episode_reward = 0.

            mpi_size = MPI.COMM_WORLD.Get_size()
            # Log stats.
            # XXX shouldn't call np.mean on variable length lists
            duration = time.time() - start_time
            stats = agent.get_stats()
            combined_stats = stats.copy()
            combined_stats['rollout/return'] = np.mean(epoch_episode_rewards)
            combined_stats['rollout/return_history'] = np.mean(episode_rewards_history)
            combined_stats['rollout/episode_steps'] = np.mean(epoch_episode_steps)
            combined_stats['rollout/actions_mean'] = np.mean(epoch_actions)
            combined_stats['rollout/Q_mean'] = np.mean(epoch_qs)
            combined_stats['train/loss_actor'] = np.mean(epoch_actor_losses)
            combined_stats['train/loss_critic'] = np.mean(epoch_critic_losses)
            combined_stats['train/param_noise_distance'] = np.mean(epoch_adaptive_distances)
            combined_stats['total/duration'] = duration
            combined_stats['total/steps_per_second'] = float(t) / float(duration)
            combined_stats['total/episodes'] = episodes
            combined_stats['rollout/episodes'] = epoch_episodes
            combined_stats['rollout/actions_std'] = np.std(epoch_actions)
            # Evaluation statistics.
            if eval_env is not None:
                combined_stats['eval/return'] = eval_episode_rewards
                combined_stats['eval/return_history'] = np.mean(eval_episode_rewards_history)
                combined_stats['eval/Q'] = eval_qs
                combined_stats['eval/episodes'] = len(eval_episode_rewards)

            def as_scalar(x):
                if isinstance(x, np.ndarray):
                    assert x.size == 1
                    return x[0]
                elif np.isscalar(x):
                    return x
                else:
                    raise ValueError('expected scalar, got %s' % x)
            combined_stats_sums = MPI.COMM_WORLD.allreduce(np.array([as_scalar(x) for x in combined_stats.values()]))
            combined_stats = {k: v / mpi_size for (k, v) in zip(combined_stats.keys(), combined_stats_sums)}

            # Total statistics.
            combined_stats['total/epochs'] = epoch + 1
            combined_stats['total/steps'] = t

            for key in sorted(combined_stats.keys()):
                logger.record_tabular(key, combined_stats[key])
            logger.dump_tabular()
            logger.info('')
            logdir = logger.get_dir()
            if rank == 0 and logdir:
                if hasattr(env, 'get_state'):
                    with open(os.path.join(logdir, 'env_state.pkl'), 'wb') as f:
                        pickle.dump(env.get_state(), f)
                if eval_env and hasattr(eval_env, 'get_state'):
                    with open(os.path.join(logdir, 'eval_env_state.pkl'), 'wb') as f:
                        pickle.dump(eval_env.get_state(), f)
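The stat-reduction step above sums every worker's scalar statistics over MPI and divides by the number of ranks. A minimal standalone sketch of that pattern (hypothetical stat names, mpi4py assumed available; run under mpirun for more than one rank) is:

import numpy as np
from mpi4py import MPI

comm = MPI.COMM_WORLD
# Each rank contributes its local scalars; keys must agree across ranks.
local_stats = {'rollout/return': float(comm.Get_rank()), 'train/loss_actor': 1.0}
sums = comm.allreduce(np.array(list(local_stats.values())))  # default op is SUM
averaged = {k: v / comm.Get_size() for k, v in zip(local_stats.keys(), sums)}
if comm.Get_rank() == 0:
    print(averaged)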
def learn(network, env, seed, total_timesteps=int(40e6), gamma=0.99, log_interval=100,
          nprocs=32, nsteps=20, ent_coef=0.01, vf_coef=0.5, vf_fisher_coef=1.0,
          lr=0.25, max_grad_norm=0.5, kfac_clip=0.001, save_interval=None,
          lrschedule='linear', load_path=None, is_async=True, **network_kwargs):
    set_global_seeds(seed)

    if network == 'cnn':
        network_kwargs['one_dim_bias'] = True
    policy = build_policy(env, network, **network_kwargs)

    nenvs = env.num_envs
    ob_space = env.observation_space
    ac_space = env.action_space
    make_model = lambda: Model(policy, ob_space, ac_space, nenvs, total_timesteps,
                               nprocs=nprocs, nsteps=nsteps, ent_coef=ent_coef,
                               vf_coef=vf_coef, vf_fisher_coef=vf_fisher_coef, lr=lr,
                               max_grad_norm=max_grad_norm, kfac_clip=kfac_clip,
                               lrschedule=lrschedule, is_async=is_async)
    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()

    if load_path is not None:
        model.load(load_path)

    runner = Runner(env, model, nsteps=nsteps, gamma=gamma)
    epinfobuf = deque(maxlen=100)
    nbatch = nenvs * nsteps
    tstart = time.time()
    coord = tf.train.Coordinator()
    if is_async:
        enqueue_threads = model.q_runner.create_threads(model.sess, coord=coord, start=True)
    else:
        enqueue_threads = []

    for update in range(1, total_timesteps // nbatch + 1):
        obs, states, rewards, masks, actions, values, epinfos = runner.run()
        epinfobuf.extend(epinfos)
        policy_loss, value_loss, policy_entropy = model.train(obs, states, rewards, masks, actions, values)
        model.old_obs = obs
        nseconds = time.time() - tstart
        fps = int((update * nbatch) / nseconds)
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, rewards)
            logger.record_tabular("nupdates", update)
            logger.record_tabular("total_timesteps", update * nbatch)
            logger.record_tabular("fps", fps)
            logger.record_tabular("policy_entropy", float(policy_entropy))
            logger.record_tabular("policy_loss", float(policy_loss))
            logger.record_tabular("value_loss", float(value_loss))
            logger.record_tabular("explained_variance", float(ev))
            logger.record_tabular("eprewmean", safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.record_tabular("eplenmean", safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.dump_tabular()
        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
            savepath = osp.join(logger.get_dir(), 'checkpoint%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)

    coord.request_stop()
    coord.join(enqueue_threads)
    return model
def learn(*, policy, env, nsteps, total_timesteps, ent_coef, lr,
          vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95,
          log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2,
          save_interval=0, load_path=None, num_casks=0):
    if isinstance(lr, float):
        lr = constfn(lr)
    else:
        assert callable(lr)
    if isinstance(cliprange, float):
        cliprange = constfn(cliprange)
    else:
        assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    nenvs = env.num_envs - num_casks
    ob_space = env.observation_space
    ac_space = env.action_space
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    make_model = lambda: Model(policy=policy, ob_space=ob_space, ac_space=ac_space,
                               nbatch_act=env.num_envs, nbatch_train=nbatch_train, nsteps=nsteps,
                               ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm)
    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()
    if load_path is not None:
        model.load(load_path)
        # load running mean std
        checkdir = load_path[0:-5]
        checkpoint = int(load_path.split('/')[-1])
        if osp.exists(osp.join(checkdir, '%.5i_ob_rms.pkl' % checkpoint)):
            with open(osp.join(checkdir, '%.5i_ob_rms.pkl' % checkpoint), 'rb') as ob_rms_fp:
                env.ob_rms = pickle.load(ob_rms_fp)
        # if osp.exists(osp.join(checkdir, '%.5i_ret_rms.pkl' % checkpoint)):
        #     with open(osp.join(checkdir, '%.5i_ret_rms.pkl' % checkpoint), 'rb') as ret_rms_fp:
        #         env.ret_rms = pickle.load(ret_rms_fp)

    # tensorboard
    writer = tf.summary.FileWriter(logger.get_dir(), tf.get_default_session().graph)

    runner = Runner(env=env, model=model, nsteps=nsteps, gamma=gamma, lam=lam,
                    writer=writer, num_casks=num_casks)

    epinfobuf = deque(maxlen=100)
    tfirststart = time.time()
    nupdates = total_timesteps // nbatch
    for update in range(1, nupdates + 1):
        assert nbatch % nminibatches == 0
        nbatch_train = nbatch // nminibatches
        tstart = time.time()
        frac = 1.0 - (update - 1.0) / nupdates
        lrnow = lr(frac)
        cliprangenow = cliprange(frac)
        obs, returns, masks, actions, values, neglogpacs, states, epinfos = runner.run()  #pylint: disable=E0632
        epinfobuf.extend(epinfos)
        mblossvals = []
        if states is None:  # nonrecurrent version
            inds = np.arange(nbatch)
            for _ in range(noptepochs):
                np.random.shuffle(inds)
                for start in range(0, nbatch, nbatch_train):
                    end = start + nbatch_train
                    mbinds = inds[start:end]
                    slices = (arr[mbinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices))
        else:  # recurrent version
            assert nenvs % nminibatches == 0
            envsperbatch = nenvs // nminibatches
            envinds = np.arange(nenvs)
            flatinds = np.arange(nenvs * nsteps).reshape(nenvs, nsteps)
            envsperbatch = nbatch_train // nsteps
            for _ in range(noptepochs):
                np.random.shuffle(envinds)
                for start in range(0, nenvs, envsperbatch):
                    end = start + envsperbatch
                    mbenvinds = envinds[start:end]
                    mbflatinds = flatinds[mbenvinds].ravel()
                    slices = (arr[mbflatinds] for arr in (obs, returns, masks, actions, values, neglogpacs))
                    mbstates = states[mbenvinds]
                    mblossvals.append(model.train(lrnow, cliprangenow, *slices, mbstates))

        lossvals = np.mean(mblossvals, axis=0)
        tnow = time.time()
        fps = int(nbatch / (tnow - tstart))
        if update % log_interval == 0 or update == 1:
            ev = explained_variance(values, returns)
            logger.logkv("serial_timesteps", update * nsteps)
            logger.logkv("nupdates", update)
            logger.logkv("total_timesteps", update * nbatch)
            logger.logkv("fps", fps)
            logger.logkv("explained_variance", float(ev))
            logger.logkv('eprewmean', safemean([epinfo['r'] for epinfo in epinfobuf]))
            logger.logkv('eplenmean', safemean([epinfo['l'] for epinfo in epinfobuf]))
            logger.logkv('epsrewmean', safemean([epinfo['sr'] for epinfo in epinfobuf]))
            logger.logkv('time_elapsed', tnow - tfirststart)
            for (lossval, lossname) in zip(lossvals, model.loss_names):
                logger.logkv(lossname, lossval)
            logger.dumpkvs()

            # tensorboard
            summary = tf.Summary()
            summary.value.add(tag='iteration/reward_mean',
                              simple_value=safemean([epinfo['r'] for epinfo in epinfobuf]))
            summary.value.add(tag='iteration/length_mean',
                              simple_value=safemean([epinfo['l'] for epinfo in epinfobuf]))
            summary.value.add(tag='iteration/shaped_reward_mean',
                              simple_value=safemean([epinfo['sr'] for epinfo in epinfobuf]))
            summary.value.add(tag='iteration/fps', simple_value=fps)
            writer.add_summary(summary, update)
        if save_interval and (update % save_interval == 0 or update == 1) and logger.get_dir():
            checkdir = osp.join(logger.get_dir(), 'checkpoints')
            os.makedirs(checkdir, exist_ok=True)
            savepath = osp.join(checkdir, '%.5i' % update)
            print('Saving to', savepath)
            model.save(savepath)
            # save running mean std
            with open(osp.join(checkdir, '%.5i_ob_rms.pkl' % update), 'wb') as ob_rms_fp:
                pickle.dump(env.ob_rms, ob_rms_fp)
            with open(osp.join(checkdir, '%.5i_ret_rms.pkl' % update), 'wb') as ret_rms_fp:
                pickle.dump(env.ret_rms, ret_rms_fp)
    env.close()
def make_env():
    env = gym.make(env_id)
    env = bench.Monitor(env, logger.get_dir())
    return env
def enjoy(*, policy, imgs, nsteps, total_timesteps, ent_coef, lr,
          vf_coef=0.5, max_grad_norm=0.5, gamma=0.99, lam=0.95,
          log_interval=10, nminibatches=4, noptepochs=4, cliprange=0.2,
          save_interval=0, reload_name=None):
    if isinstance(lr, float):
        lr = constfn(lr)
    else:
        assert callable(lr)
    if isinstance(cliprange, float):
        cliprange = constfn(cliprange)
    else:
        assert callable(cliprange)
    total_timesteps = int(total_timesteps)

    #nenv = env.num_envs
    nenvs = 1
    ac_space = gym.spaces.Discrete(5)
    observation_space_shape = (256, 256, 3)
    sensor_space_shape = (20,)
    obs_high = np.inf * np.ones(observation_space_shape)
    ob_space = gym.spaces.Box(-obs_high, obs_high)
    nbatch = nenvs * nsteps
    nbatch_train = nbatch // nminibatches

    make_model = lambda: Model(policy=policy, ob_space=ob_space, ac_space=ac_space,
                               nbatch_act=nenvs, nbatch_train=nbatch_train, nsteps=nsteps,
                               ent_coef=ent_coef, vf_coef=vf_coef, max_grad_norm=max_grad_norm)
    if save_interval and logger.get_dir():
        import cloudpickle
        with open(osp.join(logger.get_dir(), 'make_model.pkl'), 'wb') as fh:
            fh.write(cloudpickle.dumps(make_model))
    model = make_model()
    if reload_name:
        model.load(reload_name)
    runner = Runner(imgs=imgs, model=model, nsteps=nsteps, gamma=gamma, lam=lam)

    epinfobuf = deque(maxlen=100)
    tfirststart = time.time()
    actions = runner.run()  #pylint: disable=E0632
    return actions
def __init__(self, joint_increment_value=0.02, sim_time_factor=0.001, running_step=0.001,
             random_object=False, random_position=False, use_object_type=False,
             populate_object=False, env_object_type='free_shapes'):
    """
    Initializes all the relevant variables and connections.

    :param joint_increment_value: increment applied to the joints per action
    :param sim_time_factor: gazebo simulation time factor
    :param running_step: time the simulation runs for each step
    :param random_object: spawn a random object in the simulation
    :param random_position: change the object position in each reset
    :param use_object_type: assign IDs to objects and use them in the observation space
    :param populate_object: populate object(s) in the simulation using an sdf file
    :param env_object_type: object type for the environment; 'free_shapes' for boxes,
                            while others relate to a use case ('door_handle', 'combox', ...)
    """
    print("Environment parameters: joint_increment_value={}, sim_time_factor={}, running_step={}, "
          "random_object={}, random_position={}, use_object_type={}, populate_object={}, "
          "env_object_type={}".format(joint_increment_value, sim_time_factor, running_step,
                                      random_object, random_position, use_object_type,
                                      populate_object, env_object_type))

    # Assign Parameters
    self._joint_increment_value = joint_increment_value
    self.running_step = running_step
    self._random_object = random_object
    self._random_position = random_position
    self._use_object_type = use_object_type
    self._populate_object = populate_object

    # Assign MsgTypes
    self.joints_state = JointState()
    self.contact_1_state = ContactsState()
    self.contact_2_state = ContactsState()
    self.collision = Bool()
    self.camera_rgb_state = Image()
    self.camera_depth_state = Image()
    self.contact_1_force = Vector3()
    self.contact_2_force = Vector3()
    self.gripper_state = VacuumGripperState()

    self._list_of_observations = ["distance_gripper_to_object",
                                  "elbow_joint_state",
                                  "shoulder_lift_joint_state",
                                  "shoulder_pan_joint_state",
                                  "wrist_1_joint_state",
                                  "wrist_2_joint_state",
                                  "wrist_3_joint_state",
                                  "contact_1_force",
                                  "contact_2_force",
                                  "object_pos_x",
                                  "object_pos_y",
                                  "object_pos_z",
                                  "min_distance_gripper_to_object"]
    if self._use_object_type:
        self._list_of_observations.append("object_type")

    # Establishes connection with simulator
    """
    1) Gazebo Connection
    2) Controller Connection
    3) Joint Publisher
    """
    self.gazebo = GazeboConnection(sim_time_factor=sim_time_factor)
    self.controllers_object = ControllersConnection()
    self.pickbot_joint_publisher_object = JointPub()

    # Define Subscribers as Sensordata
    """
    1) /pickbot/joint_states
    2) /gripper_contactsensor_1_state
    3) /gripper_contactsensor_2_state
    4) /gz_collisions
    not used so far but available in the environment:
    5) /pickbot/gripper/state
    6) /camera_rgb/image_raw
    7) /camera_depth/depth/image_raw
    """
    rospy.Subscriber("/pickbot/joint_states", JointState, self.joints_state_callback)
    rospy.Subscriber("/gripper_contactsensor_1_state", ContactsState, self.contact_1_callback)
    rospy.Subscriber("/gripper_contactsensor_2_state", ContactsState, self.contact_2_callback)
    rospy.Subscriber("/gz_collisions", Bool, self.collision_callback)
    # rospy.Subscriber("/pickbot/gripper/state", VacuumGripperState, self.gripper_state_callback)
    # rospy.Subscriber("/camera_rgb/image_raw", Image, self.camera_rgb_callback)
    # rospy.Subscriber("/camera_depth/depth/image_raw", Image, self.camera_depth_callback)

    # Define Action and state Space and Reward Range
    """
    Action Space: Discrete with 12 actions
        1-2)   Increment/Decrement joint1_position_controller
        3-4)   Increment/Decrement joint2_position_controller
        5-6)   Increment/Decrement joint3_position_controller
        7-8)   Increment/Decrement joint4_position_controller
        9-10)  Increment/Decrement joint5_position_controller
        11-12) Increment/Decrement joint6_position_controller
    State Space: Box Space with 12 values. It is a numpy array with shape (12,)
    Reward Range: -infinity to infinity
    """
    self.action_space = spaces.Discrete(12)
    high = np.array([999, math.pi, math.pi, math.pi, math.pi, math.pi, math.pi,
                     np.finfo(np.float32).max, np.finfo(np.float32).max, 1, 1.4, 1.5, 999])
    low = np.array([0, -math.pi, -math.pi, -math.pi, -math.pi, -math.pi, -math.pi,
                    0, 0, -1, 0, 0, 0])
    if self._use_object_type:
        high = np.append(high, 9)
        low = np.append(low, 0)
    self.observation_space = spaces.Box(low, high)
    self.reward_range = (-np.inf, np.inf)

    self._seed()
    self.done_reward = 0

    # set up everything to publish the Episode Number and Episode Reward on a rostopic
    self.episode_num = 0
    self.accumulated_episode_reward = 0
    self.episode_steps = 0
    self.reward_pub = rospy.Publisher('/openai/reward', RLExperimentInfo, queue_size=1)
    self.reward_list = []
    self.episode_list = []
    self.step_list = []
    self.csv_name = logger.get_dir() + '/result_log'
    print("CSV NAME")
    print(self.csv_name)
    self.csv_success_exp = "success_exp" + datetime.datetime.now().strftime('%Y-%m-%d_%Hh%Mmin') + ".csv"
    self.success_2_contacts = 0
    self.success_1_contact = 0

    # object name: name of the target object
    # object type: index of the object name in the object list
    # object list: pool of the available objects, has at least one entry
    self.object_name = ''
    self.object_type_str = ''
    self.object_type = 0
    self.object_height = 0
    self.object_list = U.get_target_object(env_object_type)
    print("object list {}".format(self.object_list))

    self.object_initial_position = Pose(position=Point(x=-0.13, y=0.848, z=1.06),  # x=0.0, y=0.9, z=1.05
                                        orientation=quaternion_from_euler(0.002567, 0.102, 1.563))

    if self._populate_object:
        # populate objects from object list
        self.populate_objects()

    # select first object, set object name and object type
    # if object is random, spawn random object
    # else get the first entry of object_list
    self.set_target_object(random_object=self._random_object, random_position=self._random_position)

    # The distance between gripper and object, when the robot is in initial pose
    self.max_distance, _ = U.get_distance_gripper_to_object(self.object_height)

    # The closest distance during training
    self.min_distance = 999
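The 12-way discrete action space documented above maps pairs of actions to increment/decrement commands for the six joints. A hypothetical helper (not the environment's actual step logic) illustrating that decoding is:

import numpy as np

def action_to_joint_delta(action, increment=0.02, n_joints=6):
    joint = action // 2                      # actions (0, 1) -> joint 0, (2, 3) -> joint 1, ...
    sign = 1.0 if action % 2 == 0 else -1.0  # even = increment, odd = decrement
    delta = np.zeros(n_joints)
    delta[joint] = sign * increment
    return delta

print(action_to_joint_delta(3))  # [ 0.   -0.02  0.    0.    0.    0.  ]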