def learn_cartpole():
    """Train an agent."""
    env = gym.make('CartPole-v0')
    try:
        agent = ActorCritic(gym_space_distribution(env.action_space),
                            gym_space_vectorizer(env.observation_space))
        with tf.Session() as sess:
            a2c = A2C(sess, agent, target_kl=0.03)
            roller = BasicRoller(env, agent, min_episodes=8, min_steps=1024)
            while True:
                # Gather a batch of episodes while the agent's parameters are frozen.
                with agent.frozen():
                    rollouts = roller.rollouts()
                print('mean=%f' % (mean_total_reward(rollouts),))
                # Extend the actor and critic with the updates computed from this batch.
                agent.actor.extend(a2c.policy_update(rollouts, STEP_SIZE, NUM_STEPS,
                                                     min_leaf=30))
                agent.critic.extend(a2c.value_update(rollouts, VAL_STEP, NUM_STEPS,
                                                     min_leaf=30))
    finally:
        env.close()
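# learn_cartpole() refers to the module-level constants STEP_SIZE, NUM_STEPS, and VAL_STEP,
# which are defined elsewhere and not shown here. A minimal sketch with placeholder values,
# purely for illustration; the actual settings may differ:
STEP_SIZE = 0.1   # assumed step size for the policy (actor) update
VAL_STEP = 0.1    # assumed step size for the value (critic) update
NUM_STEPS = 12    # assumed number of update iterations per batch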
def test_truncation(stateful, state_tuple):
    """
    Test sequence truncation for TruncatedRoller with a
    batch of one environment.
    """
    def env_fn():
        return SimpleEnv(7, (5, 3), 'uint8')
    env = env_fn()
    model = SimpleModel(env.action_space.low.shape,
                        stateful=stateful,
                        state_tuple=state_tuple)
    basic_roller = BasicRoller(env, model, min_episodes=5)
    expected = basic_roller.rollouts()
    total_timesteps = sum([x.num_steps for x in expected])
    batched_env = batched_gym_env([env_fn], sync=True)
    trunc_roller = TruncatedRoller(batched_env, model, total_timesteps // 2 + 1)
    actual1 = trunc_roller.rollouts()
    assert actual1[-1].trunc_end
    actual2 = trunc_roller.rollouts()
    expected1, expected2 = _artificial_truncation(expected,
                                                  len(actual1) - 1,
                                                  actual1[-1].num_steps)
    assert len(actual2) == len(expected2) + 1
    actual2 = actual2[:-1]
    _compare_rollout_batch(actual1, expected1)
    _compare_rollout_batch(actual2, expected2)
def run_ppo():
    """
    Run a training worker.
    """
    env = gym.make('CartPole-v0')
    action_dist = gym_space_distribution(env.action_space)
    obs_vectorizer = gym_space_vectorizer(env.observation_space)
    with tf.Session() as sess:
        model = MLP(sess, action_dist, obs_vectorizer, layer_sizes=[32])

        # Deal with CartPole-v0 reward scale.
        model.scale_outputs(20)

        roller = BasicRoller(env, model, min_episodes=30)
        ppo = PPO(model)
        optimizer = MPIOptimizer(tf.train.AdamOptimizer(learning_rate=1e-3),
                                 -ppo.objective)
        sess.run(tf.global_variables_initializer())
        optimizer.sync_from_root(sess)
        for i in range(50):
            rollouts = roller.rollouts()
            # pylint: disable=E1101
            print('batch %d: rank=%d mean=%f' % (i, MPI.COMM_WORLD.Get_rank(),
                                                 mean_total_reward(rollouts)))
            mpi_ppo(ppo, optimizer, rollouts, log_fn=print)
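# run_ppo() is written to run once per MPI rank. A minimal sketch of an entry point, assuming
# the script is launched with something like `mpirun -n 4 python <this_file>.py`; the launch
# command and process count are illustrative assumptions, not prescribed by the code above:
if __name__ == '__main__':
    run_ppo()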
def test_ep_batches(stateful, state_tuple, limits):
    """
    Test that EpisodeRoller is equivalent to a BasicRoller
    when run on a batch of envs.
    """
    def env_fn():
        return SimpleEnv(3, (4, 5), 'uint8')
    model = SimpleModel((4, 5), stateful=stateful, state_tuple=state_tuple)
    batched_env = batched_gym_env([env_fn] * 21, num_sub_batches=7, sync=True)
    ep_roller = EpisodeRoller(batched_env, model, **limits)
    actual = ep_roller.rollouts()

    total_steps = sum([r.num_steps for r in actual])
    assert len(actual) >= ep_roller.min_episodes
    assert total_steps >= ep_roller.min_steps

    if 'min_steps' not in limits:
        num_eps = ep_roller.min_episodes + batched_env.num_envs - 1
        assert len(actual) == num_eps

    basic_roller = BasicRoller(env_fn(), model, min_episodes=len(actual))
    expected = basic_roller.rollouts()
    _compare_rollout_batch(actual, expected)
def _test_truncation_case(self, stateful, state_tuple):
    """
    Test rollout truncation and continuation for a
    specific set of model parameters.
    """
    env_fn = lambda: SimpleEnv(7, (5, 3), 'uint8')
    env = env_fn()
    model = SimpleModel(env.action_space.low.shape,
                        stateful=stateful,
                        state_tuple=state_tuple)
    basic_roller = BasicRoller(env, model, min_episodes=5)
    expected = basic_roller.rollouts()
    total_timesteps = sum([x.num_steps for x in expected])
    batched_env = batched_gym_env([env_fn], sync=True)
    trunc_roller = TruncatedRoller(batched_env, model, total_timesteps // 2 + 1)
    actual1 = trunc_roller.rollouts()
    self.assertTrue(actual1[-1].trunc_end)
    actual2 = trunc_roller.rollouts()
    expected1, expected2 = _artificial_truncation(expected,
                                                  len(actual1) - 1,
                                                  actual1[-1].num_steps)
    self.assertEqual(len(actual2), len(expected2) + 1)
    actual2 = actual2[:-1]
    _compare_rollout_batch(self, actual1, expected1)
    _compare_rollout_batch(self, actual2, expected2)
def _test_batch_equivalence_case(self, stateful, state_tuple, **roller_kwargs):
    """
    Test BasicRoller equivalence when using a batch of
    environments.
    """
    env_fn = lambda: SimpleEnv(3, (4, 5), 'uint8')
    model = SimpleModel((4, 5), stateful=stateful, state_tuple=state_tuple)
    batched_env = batched_gym_env([env_fn] * 21, num_sub_batches=7, sync=True)
    ep_roller = EpisodeRoller(batched_env, model, **roller_kwargs)
    actual = ep_roller.rollouts()

    total_steps = sum([r.num_steps for r in actual])
    self.assertTrue(len(actual) >= ep_roller.min_episodes)
    self.assertTrue(total_steps >= ep_roller.min_steps)

    if 'min_steps' not in roller_kwargs:
        num_eps = ep_roller.min_episodes + batched_env.num_envs - 1
        self.assertTrue(len(actual) == num_eps)

    basic_roller = BasicRoller(env_fn(), model, min_episodes=len(actual))
    expected = basic_roller.rollouts()
    _compare_rollout_batch(self, actual, expected)
def learn_setup(env_id=None, timesteps=int(5e6), env_name=None, param_scale=1,
                name="test", expnum=0, env=None, n_episodes=None,
                n_steps_per_episode=None, reward_threshold=0, CMA_mu=None,
                CMA_cmean=None, CMA_rankmu=None, CMA_rankone=None, log_file=None):
    """
    Build the session, environment, model, roller, and CMA trainer, and
    return them in a dict of local variables.
    """
    # Let TensorFlow grow GPU memory as needed instead of pre-allocating it.
    config = tf.ConfigProto()
    config.gpu_options.allow_growth = True
    sess = tf.Session(config=config)

    if env_id is None:
        env_id = env_name
    if env is None:
        env = make_vec_env(env_id, "mujoco", 1, None,
                           reward_scale=1.0, flatten_dict_observations=True)
    if log_file is None:
        log_file = os.path.join(
            'results', "recent" + name + "_" + str(expnum) + ".monitor.csv")
    log_npy = os.path.join('results', "recent" + name + '_' + str(expnum) + '.npy')
    #env = LoggedEnv(env, log_file, log_npy)

    model = ContinuousMLP(sess, env.action_space,
                          gym_space_vectorizer(env.observation_space))
    roller = BasicRoller(env, model, min_episodes=1, min_steps=n_steps_per_episode)
    sess.run(tf.global_variables_initializer())
    trainer = CMATrainer(sess, scale=param_scale, CMA_mu=CMA_mu, CMA_cmean=CMA_cmean,
                         CMA_rankmu=CMA_rankmu, CMA_rankone=CMA_rankone)  #, popsize=n_episodes)

    rewards = []
    local_variables = {
        'roller': roller,
        'trainer': trainer,
        'env_id': env_name,
        'reward_threshold': reward_threshold,
        'rewards': rewards,
    }
    return local_variables
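# A hedged sketch of how the dict returned by learn_setup() might be consumed. The environment
# name and step budget below are illustrative assumptions, not values taken from this code:
local_vars = learn_setup(env_name='Hopper-v2', n_steps_per_episode=1000, expnum=0)
roller, trainer = local_vars['roller'], local_vars['trainer']
# CMATrainer.train(roller) returns (steps, rewards), as used in training_loop() below.
steps, rewards = trainer.train(roller)
local_vars['rewards'].extend(rewards)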
def test_ep_basic_equivalence(stateful, state_tuple, limits):
    """
    Test that EpisodeRoller is equivalent to a BasicRoller
    when run on a single environment.
    """
    env_fn = lambda: SimpleEnv(3, (4, 5), 'uint8')
    env = env_fn()
    model = SimpleModel(env.action_space.low.shape,
                        stateful=stateful,
                        state_tuple=state_tuple)
    basic_roller = BasicRoller(env, model, **limits)
    expected = basic_roller.rollouts()
    batched_env = batched_gym_env([env_fn], sync=True)
    ep_roller = EpisodeRoller(batched_env, model, **limits)
    actual = ep_roller.rollouts()
    _compare_rollout_batch(actual, expected)
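# The test functions above take `stateful`, `state_tuple`, and `limits` as arguments, which
# suggests they are driven by a parametrization defined elsewhere. A hedged sketch of the kind
# of pytest parametrization that could exercise test_ep_basic_equivalence; the value sets below
# are assumptions for illustration, not this repo's actual test configuration:
import pytest

@pytest.mark.parametrize('stateful,state_tuple', [(False, False), (True, False), (True, True)])
@pytest.mark.parametrize('limits', [{'min_episodes': 5}, {'min_episodes': 2, 'min_steps': 40}])
def test_ep_basic_equivalence_example(stateful, state_tuple, limits):
    test_ep_basic_equivalence(stateful, state_tuple, limits)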
def _test_basic_equivalence_case(self, stateful, state_tuple):
    """
    Test BasicRoller equivalence for a specific set of
    model settings.
    """
    env_fn = lambda: SimpleEnv(3, (4, 5), 'uint8')
    env = env_fn()
    model = SimpleModel(env.action_space.low.shape,
                        stateful=stateful,
                        state_tuple=state_tuple)
    basic_roller = BasicRoller(env, model, min_episodes=5)
    expected = basic_roller.rollouts()
    total_timesteps = sum([x.num_steps for x in expected])
    batched_env = batched_gym_env([env_fn], sync=True)
    trunc_roller = TruncatedRoller(batched_env, model, total_timesteps)
    actual = trunc_roller.rollouts()
    _compare_rollout_batch(self, actual, expected)
def _test_basic_equivalence_case(self, stateful, state_tuple, **roller_kwargs):
    """
    Test BasicRoller equivalence for a single env in a
    specific case.
    """
    env_fn = lambda: SimpleEnv(3, (4, 5), 'uint8')
    env = env_fn()
    model = SimpleModel(env.action_space.low.shape,
                        stateful=stateful,
                        state_tuple=state_tuple)
    basic_roller = BasicRoller(env, model, **roller_kwargs)
    expected = basic_roller.rollouts()
    batched_env = batched_gym_env([env_fn], sync=True)
    ep_roller = EpisodeRoller(batched_env, model, **roller_kwargs)
    actual = ep_roller.rollouts()
    _compare_rollout_batch(self, actual, expected)
def _test_batches_consistency(self, batch_size, trunc_start):
    """
    Make sure that batches() produces the same outputs
    that we got with step().
    """
    env = TupleCartPole()
    try:
        roller = BasicRoller(env, self.model, min_episodes=7)
        rollouts = roller.rollouts()
        if trunc_start:
            rollouts = self._truncate_first(rollouts)
        num_batches = 10
        for batch in self.model.batches(rollouts, batch_size=batch_size):
            num_batches -= 1
            if num_batches == 0:
                break
            self._test_batch(rollouts, batch)
    finally:
        env.close()
def test_trunc_basic_equivalence(stateful, state_tuple):
    """
    Test that TruncatedRoller is equivalent to BasicRoller
    for batches of one environment when the episodes end
    cleanly.
    """
    env_fn = lambda: SimpleEnv(3, (4, 5), 'uint8')
    env = env_fn()
    model = SimpleModel(env.action_space.low.shape,
                        stateful=stateful,
                        state_tuple=state_tuple)
    basic_roller = BasicRoller(env, model, min_episodes=5)
    expected = basic_roller.rollouts()
    total_timesteps = sum([x.num_steps for x in expected])
    batched_env = batched_gym_env([env_fn], sync=True)
    trunc_roller = TruncatedRoller(batched_env, model, total_timesteps)
    actual = trunc_roller.rollouts()
    _compare_rollout_batch(actual, expected)
def run_algorithm(algo_name):
    """
    Run the specified training algorithm.
    """
    env = gym.make('CartPole-v0')
    action_dist = gym_space_distribution(env.action_space)
    obs_vectorizer = gym_space_vectorizer(env.observation_space)
    with tf.Session() as sess:
        model = MLP(sess, action_dist, obs_vectorizer, layer_sizes=[32])

        # Deal with CartPole-v0 reward scale.
        model.scale_outputs(20)

        roller = BasicRoller(env, model, min_episodes=30)
        inner_loop = algorithm_inner_loop(algo_name, model)
        sess.run(tf.global_variables_initializer())
        print('running algorithm:', algo_name)
        for i in range(50):
            rollouts = roller.rollouts()
            print('batch %d: mean=%f' % (i, mean_total_reward(rollouts)))
            inner_loop(rollouts)
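# A hypothetical invocation of run_algorithm(). The algorithm name below is an assumption for
# illustration; the names actually accepted are determined by algorithm_inner_loop(), which is
# not shown here:
if __name__ == '__main__':
    run_algorithm('ppo')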
def training_loop(env_id=None, timesteps=int(5e6), param_scale=1, log_file=None):
    """
    Run CMA on the environment.
    """
    if log_file is None:
        log_file = os.path.join('results', env_id + '.monitor.csv')
    env = LoggedEnv(gym.make(env_id), log_file)
    with tf.Session() as sess:
        model = ContinuousMLP(sess, env.action_space,
                              gym_space_vectorizer(env.observation_space))
        roller = BasicRoller(env, model, min_episodes=4, min_steps=500)
        sess.run(tf.global_variables_initializer())
        trainer = CMATrainer(sess, scale=param_scale)
        steps = 0
        rewards = []
        while steps < timesteps:
            sub_steps, sub_rewards = trainer.train(roller)
            steps += sub_steps
            rewards.extend(sub_rewards)
            print('%s: steps=%d mean=%f batch_mean=%f' %
                  (env_id, steps, np.mean(rewards), np.mean(sub_rewards)))
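# A minimal sketch of invoking training_loop(). The environment id and step budget are
# illustrative assumptions, not defaults taken from the code above:
if __name__ == '__main__':
    training_loop(env_id='Hopper-v2', timesteps=int(1e6), param_scale=1)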
if __name__ == '__main__':
    monitor = "results/unbiased_random/1"
    action_repeat = True
    single_life = True
    render = None

    # Build the Super Mario Bros. environment with the desired wrappers.
    env = retro.make("SuperMarioBros-Nes")
    env = MarioDiscretizer(env)
    if single_life:
        env = SingleLifeEnv(env)
    if monitor is not None:
        env = Monitor(env, monitor, video_callable=lambda i: False)
    if render is not None:
        env = AutoRenderer(env, auto_render_period=render)
    if action_repeat:
        env = FrameStack(env, 4)

    # model = WeightedRandomAgent()
    model = RandomAgent(lambda: env.action_space.sample())
    player = BasicRoller(env, model, min_episodes=1)

    # total_rollouts = [player.rollouts() for rollout_i in trange(40)]
    # flat_rollouts = reduce(list.__add__, total_rollouts)
    # total_rewards = map(lambda r: r.total_reward, flat_rollouts)
    # [filename for path in dirs for filename in os.listdir(path)]

    # Collect 150 random-agent episodes and record their total rewards.
    total_rewards = []
    for i in tqdm(range(150)):
        rollouts = player.rollouts()
        total_rewards += [roll.total_reward for roll in rollouts]
    print(total_rewards)

    # Print the five highest-scoring episodes along with their indices.
    rewards_numbers = list(zip(count(), total_rewards))
    sorted_reward_numbers = sorted(rewards_numbers, key=lambda t: t[1])
    print(sorted_reward_numbers[-5:])