def run(args):
    log_dir = args.dir_path
    env = gym.make(args.env)
    state_dim = env.observation_space.shape[0]
    action_dim = env.action_space.shape[0]
    max_action = float(env.action_space.high[0])
    env.seed(args.seed)
    np.random.seed(args.seed)
    torch.manual_seed(args.seed)

    ddpg = DDPG(state_dim, action_dim, max_action, args)
    ounoise = OUNoise(action_dim)

    def get_action(state, noise=None):
        action = ddpg.actor(FloatTensor(state))
        action = (action.data.numpy() + noise.add()) if noise else action.data.numpy()
        return np.clip(action, -max_action, max_action)

    def rollout(eval=False):
        state, done, ep_reward, ep_len = env.reset(), False, 0.0, 0
        while not done and ep_len < args.max_ep_len:
            if not eval:
                action = get_action(state, noise=ounoise)
            else:
                action = get_action(state)
            next_state, reward, done, _ = env.step(action)
            if not eval:
                # Don't treat hitting the time limit as a true terminal state.
                done = False if ep_len + 1 == args.max_ep_len else done
                ddpg.replay_buffer.store((state, next_state, action, reward, done))
            ep_reward += reward
            ep_len += 1
            state = next_state
        return ep_reward, ep_len

    for epoch in range(args.epochs):
        ep_reward, ep_len = rollout(eval=False)

        if epoch > args.start_epoch:
            for _ in range(ep_len):
                ddpg.train()
                ddpg.update_nets()

        if epoch % args.save_freq == 0:
            # Evaluation rollouts: no exploration noise, nothing stored.
            test_rewards = []
            for i in range(10):
                reward, _ = rollout(eval=True)
                test_rewards.append(reward)
            test_rewards = np.array(test_rewards)
            np.savez(log_dir + '/policy_weights', ddpg.actor.get_params())
            logz.log_tabular("Epoch", epoch)
            logz.log_tabular("AverageTestReward", np.mean(test_rewards))
            logz.log_tabular("StdTestRewards", np.std(test_rewards))
            logz.log_tabular("MaxTestRewardRollout", np.max(test_rewards))
            logz.log_tabular("MinTestRewardRollout", np.min(test_rewards))
            logz.dump_tabular()
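# The OUNoise helper above is only referenced, not defined here. Below is a
# minimal sketch of an Ornstein-Uhlenbeck exploration-noise class exposing the
# add() method used in get_action(); the mu/theta/sigma defaults and the exact
# body are assumptions, not the original implementation.
import numpy as np

class OUNoise:
    """Temporally correlated exploration noise (Ornstein-Uhlenbeck process)."""

    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu, self.theta, self.sigma = mu, theta, sigma
        self.state = np.ones(action_dim) * mu

    def reset(self):
        self.state = np.ones(self.action_dim) * self.mu

    def add(self):
        # dx = theta * (mu - x) + sigma * N(0, I); return the updated state,
        # which is added to the deterministic action during training rollouts.
        dx = self.theta * (self.mu - self.state) + \
            self.sigma * np.random.randn(self.action_dim)
        self.state = self.state + dx
        return self.state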
def train(self): """ Runs training. See the in-line comments. """ args = self.args t_start = time.time() num_mbs = len(self.data_mb_list['X_train']) for ii in range(args.train_iters): # Sample minibatch and form feed dictionary. real_BO = self.data_mb_list['X_train'][ii % num_mbs] prior_BP = self._sample_prior() feed = {self.D_data_BO: real_BO, self.G_data_BP: prior_BP} # Update Generator and Discriminator, I think they should be separate. _, loss_D = self.sess.run([self.train_D_op, self.loss_D], feed) _, loss_G = self.sess.run([self.train_G_op, self.loss_G], feed) if (ii % args.log_every_t_iter == 0): print("\n ************ Iteration %i ************" % ii) # -------------------------------------------------------------- # Logging. Also record time and get a fresh set of real vs fake # images to evaluate (but NOT train) the Discriminator. # -------------------------------------------------------------- new_feed = { self.D_data_BO: self.data_mb_list['X_train'][(ii + 1) % num_mbs], self.G_data_BP: self._sample_prior() } dout_real, dout_fake = \ self.sess.run([self.D_real_B, self.D_fake_B], new_feed) num_correct = np.sum(dout_real > 0.0) + np.sum(dout_fake < 0.0) elapsed_time_hours = (time.time() - t_start) / (60.0**2) logz.log_tabular("AvgRealScore", np.mean(dout_real)) logz.log_tabular("AvgFakeScore", np.mean(dout_fake)) logz.log_tabular("LossDis", loss_D) logz.log_tabular("LossGen", loss_G) logz.log_tabular("DisNumCorrect", num_correct) logz.log_tabular("TimeHours", elapsed_time_hours) logz.log_tabular("Iterations", ii) logz.dump_tabular() if (ii % args.snapshot_every_t_iter == 0) and (self.log_dir is not None): # -------------------------------------------------------------- # See if we're making cool images and also save weights. # Unfortunately some of this is highly specific to MNIST... # Don't worry about the reshaping order because all that the # computer sees is just the 784-dimensional vector (for now). # -------------------------------------------------------------- bs = args.test_cols * args.test_rows dims = int(np.sqrt(self.odim)) prior = np.random.standard_normal(size=(bs, self.prior_dim)) gen_out_BO = self.sess.run(self.G_out_BO, {self.G_data_BP: prior}) gen_out_BDD = np.reshape(gen_out_BO, (bs, dims, dims)) weights_v = self.sess.run(self.weights_v) self._save_snapshot(ii, weights_v, gen_out_BDD)
def train(self): """ Runs training. See the in-line comments. """ args = self.args t_start = time.time() num_mbs = len(self.data_mb_list['X_train']) for ii in range(args.train_iters): # Sample minibatch + standard Gaussian noise and form feed. real_BO = self.data_mb_list['X_train'][ii % num_mbs] std_norm_BZ = np.random.standard_normal((self.bsize,args.latent_dim)) feed = {self.data_BO: real_BO, self.std_norm_BZ: std_norm_BZ} _, neg_lb_loss, kldiv, log_p, first, second, logstd_BO = self.sess.run( [self.train_op, self.neg_lb_llhd, self.kldiv, self.log_p, self.first_B, self.second_B, self.d_logstd_BO], feed ) if (ii % args.log_every_t_iter == 0): print("\n ************ Iteration %i ************" % ii) #print("first {}".format(first)) #print("second {}".format(second)) elapsed_time_hours = (time.time() - t_start) / (60.0 ** 2) logz.log_tabular("LogProb", log_p) logz.log_tabular("KlDiv", kldiv) logz.log_tabular("NegLbLhd", neg_lb_loss) logz.log_tabular("TimeHours", elapsed_time_hours) logz.log_tabular("Iterations", ii) logz.dump_tabular() if (ii % args.snapshot_every_t_iter == 0) and (self.log_dir is not None): # -------------------------------------------------------------- # See if we're making cool images and also save weights. # Unfortunately some of this is highly specific to MNIST... # Don't worry about the reshaping order because all that the # computer sees is just the 784-dimensional vector (for now). # We use a different batch size here, `bs`. # -------------------------------------------------------------- bs = args.test_cols * args.test_rows dims = int(np.sqrt(self.odim)) latent_BZ = np.random.standard_normal((bs,args.latent_dim)) feed = {self.latent_BZ: latent_BZ} dec_out_BO, dec_logstd_BO = \ self.sess.run([self.d_mean_BO, self.d_logstd_BO], feed) # With the mean and (log) std, we can sample. eps_BO = np.random.standard_normal(size=dec_out_BO.shape) sampled_BO = dec_out_BO + (np.exp(dec_logstd_BO) * eps_BO) dec_out_BDD = np.reshape(sampled_BO, (bs,dims,dims)) weights_v = self.sess.run(self.weights_v) self._save_snapshot(ii, weights_v, dec_out_BDD)
def log_diagnostics(self, paths, infodict, vfdict):
    """ Just logging using the `logz` functionality. """
    ob_no = np.concatenate([path["observation"] for path in paths])
    vpred_n = np.concatenate([path["baseline"] for path in paths])
    vtarg_n = np.concatenate([path["reward"] for path in paths])
    elapsed_time = (time.time() - self.start_time)  # In seconds.
    episode_rewards = np.array([path["reward"].sum() for path in paths])
    episode_lengths = np.array([utils.pathlength(path) for path in paths])

    # These are *not* logged in John Schulman's code.
    #logz.log_tabular("Success", infodict["Success"])
    #logz.log_tabular("LagrangeM", infodict["LagrangeM"])
    #logz.log_tabular("gNorm", infodict["gNorm"])

    # These *are* logged in John Schulman's code. First, rewards:
    logz.log_tabular("NumEpBatch", len(paths))
    logz.log_tabular("EpRewMean", episode_rewards.mean())
    logz.log_tabular("EpRewMax", episode_rewards.max())
    logz.log_tabular("EpRewSEM", episode_rewards.std() / np.sqrt(len(paths)))
    logz.log_tabular("EpLenMean", episode_lengths.mean())
    logz.log_tabular("EpLenMax", episode_lengths.max())
    logz.log_tabular("RewPerStep", episode_rewards.sum() / episode_lengths.sum())
    logz.log_tabular("vf_mse_before", vfdict["MSEBefore"])
    logz.log_tabular("vf_mse_after", vfdict["MSEAfter"])
    logz.log_tabular("vf_PredStdevBefore", vfdict["PredStdevBefore"])
    logz.log_tabular("vf_PredStdevAfter", vfdict["PredStdevAfter"])
    logz.log_tabular("vf_TargStdev", vfdict["TargStdev"])
    logz.log_tabular("vf_EV_before", utils.explained_variance_1d(vpred_n, vtarg_n))
    logz.log_tabular("vf_EV_after", utils.explained_variance_1d(self.vf.predict(ob_no), vtarg_n))
    # If overfitting, EVAfter >> EVBefore. Also, we fit the value function
    # _after_ using it to compute the baseline to avoid introducing bias.
    logz.log_tabular("pol_surr_before", infodict["pol_surr_before"])
    logz.log_tabular("pol_surr_after", infodict["pol_surr_after"])
    logz.log_tabular("pol_kl_before", infodict["pol_kl_before"])
    logz.log_tabular("pol_kl_after", infodict["pol_kl_after"])
    logz.log_tabular("pol_ent_before", infodict["pol_ent_before"])
    logz.log_tabular("pol_ent_after", infodict["pol_ent_after"])
    logz.log_tabular("TimeElapsed", elapsed_time)
    logz.dump_tabular()
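# utils.explained_variance_1d is only referenced above. For reference, a
# minimal sketch of what it is assumed to compute: the fraction of the target
# variance explained by the predictions, 1 - Var[y - ypred] / Var[y], with NaN
# when Var[y] is zero. This mirrors the common implementation in John
# Schulman's code, but the exact body here is an assumption.
import numpy as np

def explained_variance_1d(ypred, y):
    """Explained variance of 1-D predictions against 1-D targets."""
    assert y.ndim == 1 and ypred.ndim == 1
    vary = np.var(y)
    return np.nan if vary == 0 else 1.0 - np.var(y - ypred) / vary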
def AC_train(exp_name, env_name, n_iter, gamma, min_timesteps_per_batch,
             max_path_length, learning_rate, num_target_updates,
             num_grad_steps_per_target_update, animate, logdir,
             normalize_advantages, seed, n_layers, size):

    start = time.time()

    # Configure output directory for logging.
    logz.configure_output_dir(logdir)

    # Log experimental parameters.
    # args = inspect.getargspec(AC_train)[0]
    # params = {k: locals()[k] if k in locals() else None for k in args}
    params = locals()
    print(params)
    logz.save_params(params)

    # Make the gym environment.
    env = gym.make(env_name)

    # Set random seeds.
    tf.set_random_seed(seed)
    np.random.seed(seed)
    env.seed(seed)

    # Maximum length for episodes.
    max_path_length = max_path_length or env.spec.max_episode_steps

    # Is this env continuous, or discrete?
    discrete = isinstance(env.action_space, gym.spaces.Discrete)

    # Observation and action sizes.
    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.n if discrete else env.action_space.shape[0]

    # Initialize the Actor-Critic agent.
    network_args = {
        'n_layers': n_layers,
        'size': size,
        'learning_rate': learning_rate,
        'num_target_updates': num_target_updates,
        'num_grad_steps_per_target_update': num_grad_steps_per_target_update,
    }
    env_args = {
        'ob_dim': ob_dim,
        'ac_dim': ac_dim,
        'discrete': discrete,
    }
    sample_traj_args = {
        'animate': animate,
        'max_path_length': max_path_length,
        'min_timesteps_per_batch': min_timesteps_per_batch,
    }
    estimate_return_args = {
        'gamma': gamma,
        'normalize_advantages': normalize_advantages,
    }

    agent = ACAgent(network_args, env_args, sample_traj_args, estimate_return_args)
    agent.build_computation_graph()
    agent.init_tf_sess()

    # Start training.
    total_timesteps = 0
    for itr in range(n_iter):
        print("********** Iteration %i ************" % itr)
        paths, timesteps_this_batch = agent.sample_trajs(itr, env)
        total_timesteps += timesteps_this_batch

        # Build arrays for observations and actions for the update by
        # concatenating across paths.
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_na = np.concatenate([path["action"] for path in paths])
        re_n = np.concatenate([path["reward"] for path in paths])
        next_ob_no = np.concatenate([path["next_observation"] for path in paths])
        terminal_n = np.concatenate([path["terminal"] for path in paths])

        agent.update_critic(ob_no, next_ob_no, re_n, terminal_n)
        adv_n = agent.estimate_advantage(ob_no, next_ob_no, re_n, terminal_n)
        agent.update_actor(ob_no, ac_na, adv_n)

        # Log diagnostics.
        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [len(path["reward"]) for path in paths]
        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", itr)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
        logz.pickle_tf_vars()
def train(self):
    for epoch in range(self.epochs):
        surr_grads = []
        ddpg_grads = 0

        # Warm-start DDPG from the current policy weights and accumulate its
        # gradients; the last `k` gradients define the surrogate subspace
        # used by the guided noise generator.
        if epoch >= self.start_epoch:
            self.ddpg.actor.set_params(self.policy.w_policy)
            self.ddpg.actor_t.set_params(self.policy.w_policy)
            for step in range(self.rl_train_steps):
                grad = self.ddpg.train()
                ddpg_grads += grad
                if step >= self.rl_train_steps - self.k:
                    surr_grads.append(grad.flatten())
            self.policy.update_by_ddpg(ddpg_grads / self.rl_train_steps)
            # if epoch % 50 == 0:
            #     self.ddpg.replay_buffer.buffer_flush()
            # self.policy.w_policy = self.ddpg.actor.get_params()
            self.noise.update(np.array(surr_grads).T)

        epsilons = self.noise.sample(self.pop_size)  # policy_size x pop_size
        pos_rewards, neg_rewards = [], []
        policy_weights = self.policy.w_policy  # action_dim x state_dim

        # Antithetic sampling: evaluate +epsilon and -epsilon perturbations.
        for epsilon in epsilons:
            self.policy.w_policy = policy_weights + epsilon.reshape(self.policy.w_policy.shape)
            pos_reward, pos_len = self.evaluate()
            pos_rewards.append(pos_reward)

            self.policy.w_policy = policy_weights - epsilon.reshape(self.policy.w_policy.shape)
            neg_reward, neg_len = self.evaluate()
            neg_rewards.append(neg_reward)

        self.policy.w_policy = policy_weights
        std_rewards = np.array(pos_rewards + neg_rewards).std()

        # GES update, optionally restricted to the elite directions.
        if self.elite_size != 0:
            scores = {
                k: max(pos_reward, neg_reward)
                for k, (pos_reward, neg_reward) in enumerate(zip(pos_rewards, neg_rewards))
            }
            sorted_scores = sorted(scores.keys(), key=lambda x: scores[x],
                                   reverse=True)[:self.elite_size]
            elite_pos_rewards = [pos_rewards[k] for k in sorted_scores]
            elite_neg_rewards = [neg_rewards[k] for k in sorted_scores]
            elite_epsilons = [epsilons[k] for k in sorted_scores]
            self.policy.update_by_ges(elite_pos_rewards, elite_neg_rewards,
                                      elite_epsilons, std_rewards)
        else:
            self.policy.update_by_ges(pos_rewards, neg_rewards, epsilons, std_rewards)

        # Save policy weights and log statistics.
        if epoch % self.save_freq == 0:
            train_rewards = np.array(pos_rewards + neg_rewards)
            test_rewards = []
            for _ in range(10):
                reward, _ = self.evaluate()
                test_rewards.append(reward)
            test_rewards = np.array(test_rewards)
            np.savez(self.log_dir + '/policy_weights', self.policy.w_policy)
            logz.log_tabular("Epoch", epoch)
            logz.log_tabular("AverageTrainReward", np.mean(train_rewards))
            logz.log_tabular("StdTrainRewards", np.std(train_rewards))
            logz.log_tabular("MaxTrainRewardRollout", np.max(train_rewards))
            logz.log_tabular("MinTrainRewardRollout", np.min(train_rewards))
            logz.log_tabular("AverageTestReward", np.mean(test_rewards))
            logz.log_tabular("StdTestRewards", np.std(test_rewards))
            logz.log_tabular("MaxTestRewardRollout", np.max(test_rewards))
            logz.log_tabular("MinTestRewardRollout", np.min(test_rewards))
            logz.dump_tabular()
def TD3_train(env, logdir='.', actor_critic=actor_critic, iterations=600000,
              replay_size=int(1e6), gamma=0.99, polyak=0.995, actor_lr=1e-3,
              critic_lr=1e-3, batch_size=100, start_steps=10000, act_noise=0.1,
              target_noise=0.2, noise_clip=0.5, policy_delay=4):

    # Configure output directory for logging.
    logz.configure_output_dir(logdir)

    # Log experimental parameters.
    # args = inspect.getargspec(PG_train)[0]
    # params = {k: locals()[k] if k in locals() else None for k in args}
    params = locals()
    print(params)
    logz.save_params(params)

    td3 = TD3Agent(env, actor_critic, gamma, polyak, actor_lr, critic_lr, act_noise)
    td3.build_computation_graph()
    td3.init_tf_sess()
    td3.graph_initialization()

    ob_dim = env.observation_space.shape[0]
    ac_dim = env.action_space.shape[0]
    replay_buffer = ReplayBuffer(ob_dim, ac_dim, replay_size)

    start_time = time.time()
    ob = env.reset()
    actor_loss = []
    critic_loss = []

    for ii in range(iterations):
        # Uniform random exploration for the first `start_steps` steps.
        if ii < start_steps:
            ac = env.action_space.sample()
        else:
            ac = td3.sample_action(ob)

        ob_next, rew, done, _ = env.step(ac)
        replay_buffer.store(ob, ac, rew, ob_next, done)
        # Advance to the next observation, resetting at episode boundaries.
        ob = env.reset() if done else ob_next

        # If iteration < start_steps, only put steps into the buffer.
        if ii < start_steps:
            continue

        batch = replay_buffer.sample_batch(batch_size=batch_size)

        # Update the critic at every step.
        c_loss = td3.update_critic(batch['obs1'], batch['obs2'], batch['acts'],
                                   batch['rews'], batch['done'])
        critic_loss.append(c_loss)

        # Delayed actor and target-network updates.
        if ii % policy_delay == 0:
            a_loss = td3.update_actor_and_target(batch['obs1'], batch['obs2'],
                                                 batch['acts'], batch['rews'],
                                                 batch['done'])
            actor_loss.append(a_loss)

        if ii % 10000 == 0:
            logz.log_tabular("Time", time.time() - start_time)
            logz.log_tabular("Iteration", ii)
            logz.log_tabular("AverageActorLoss", np.mean(np.array(actor_loss)))
            logz.log_tabular("AverageCriticLoss", np.mean(np.array(critic_loss)))
            logz.log_tabular("AverageActorStd", np.std(np.array(actor_loss)))
            logz.log_tabular("AverageCriticStd", np.std(np.array(critic_loss)))
            logz.dump_tabular()
            logz.pickle_tf_vars()
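# The ReplayBuffer used by TD3_train is only referenced, not defined here.
# Below is a minimal sketch of a buffer with the same store()/sample_batch()
# interface and the 'obs1'/'obs2'/'acts'/'rews'/'done' keys used above (in the
# style of the Spinning Up TD3 buffer); the exact implementation is an
# assumption.
import numpy as np

class ReplayBuffer:
    """Fixed-size FIFO experience replay buffer for off-policy agents."""

    def __init__(self, obs_dim, act_dim, size):
        self.obs1_buf = np.zeros((size, obs_dim), dtype=np.float32)
        self.obs2_buf = np.zeros((size, obs_dim), dtype=np.float32)
        self.acts_buf = np.zeros((size, act_dim), dtype=np.float32)
        self.rews_buf = np.zeros(size, dtype=np.float32)
        self.done_buf = np.zeros(size, dtype=np.float32)
        self.ptr, self.size, self.max_size = 0, 0, size

    def store(self, obs, act, rew, next_obs, done):
        # Overwrite the oldest entry once the buffer is full.
        self.obs1_buf[self.ptr] = obs
        self.acts_buf[self.ptr] = act
        self.rews_buf[self.ptr] = rew
        self.obs2_buf[self.ptr] = next_obs
        self.done_buf[self.ptr] = done
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample_batch(self, batch_size=32):
        idxs = np.random.randint(0, self.size, size=batch_size)
        return dict(obs1=self.obs1_buf[idxs],
                    obs2=self.obs2_buf[idxs],
                    acts=self.acts_buf[idxs],
                    rews=self.rews_buf[idxs],
                    done=self.done_buf[idxs])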
def train_PG(exp_name, env_name, n_iters, gamma, min_timesteps_per_batch,
             max_path_length, lr, normalize_advantages, nn_baseline, seed,
             n_layers, hidden_size, discrete, logdir):

    start = time.time()

    # Environment.
    env = gym.make(env_name)
    # TODO:
    # env = ChallengeSeqDecEnvironment(experimentCount=3005, userID="jingw2",
    #                                  timeout=5, realworkercount=8)
    # env.state_size = 1
    # env.action_size = 2

    # Set up the logger.
    setup_logger(logdir, locals())

    # Random seeds.
    torch.manual_seed(seed)
    np.random.seed(seed)
    if hasattr(env, 'seed'):
        env.seed(seed)

    # Set environment attributes.
    if isinstance(env, gym.Env):
        max_path_length = max_path_length or env.spec.max_episode_steps
        discrete = isinstance(env.action_space, gym.spaces.Discrete)
        state_size = env.observation_space.shape[0]
        action_size = env.action_space.n if discrete else env.action_space.shape[0]
    else:
        if hasattr(env, 'state_size'):
            state_size = env.state_size
        else:
            raise Exception("Environment must have attribute state_size or be a gym.Env!")
        if hasattr(env, 'action_size'):
            action_size = env.action_size
        else:
            raise Exception("Environment must have attribute action_size or be a gym.Env!")

    net_args = {
        "n_layers": n_layers,
        "state_size": state_size,
        "action_size": action_size,
        "discrete": discrete,
        "hidden_size": hidden_size,
        "learing_rate": lr,
        "output_activation": nn.Sigmoid()
    }
    trajectory_args = {
        "max_path_length": max_path_length,
        "min_timesteps_per_batch": min_timesteps_per_batch
    }
    reward_args = {
        "gamma": gamma,
        "nn_baseline": nn_baseline,
        "normalize_advantage": normalize_advantages
    }
    agent = Agent(net_args, trajectory_args, reward_args)

    # Create networks.
    agent.build_net()

    total_timesteps = 0
    for it in range(n_iters):
        print("=============Iteration {}==============".format(it))
        paths, timesteps_this_batch = agent.sample_trajectories(it, env)
        # TODO:
        # env = ChallengeSeqDecEnvironment(experimentCount=3005, userID="jingw2",
        #                                  timeout=5, realworkercount=8)
        total_timesteps += timesteps_this_batch

        states = np.concatenate([path["state"] for path in paths])
        actions = np.concatenate([path["action"] for path in paths])
        rewards = [path["reward"] for path in paths]
        states_input = torch.Tensor(states).float()
        actions_input = torch.Tensor(actions).float()
        # q_n, adv = agent.estimate_return(states_input, rewards)
        # agent.train_op(states_input, actions_input, q_n, adv)
        agent.train_op()

        returns = [path["reward"].sum() for path in paths]
        ep_lengths = [pathlength(path) for path in paths]

        # best_idx = np.argmax(returns)
        # best_path = paths[best_idx]
        # best_policy = {}
        # for i in range(5):
        #     best_policy[str(i+1)] = best_path["action"][i].tolist()
        # data = {"best_policy": [best_policy], "best_reward": returns[best_idx]}
        # data = pd.DataFrame(data)
        # if os.path.exists("best_policy_pg.csv"):
        #     policy_df = pd.read_csv("best_policy_pg.csv")
        #     policy_df.loc[len(policy_df)] = [best_policy, returns[best_idx]]
        # else:
        #     policy_df = data
        # policy_df.to_csv("best_policy_pg.csv", index=False)

        logz.log_tabular("Time", time.time() - start)
        logz.log_tabular("Iteration", it)
        logz.log_tabular("AverageReturn", np.mean(returns))
        logz.log_tabular("StdReturn", np.std(returns))
        logz.log_tabular("MaxReturn", np.max(returns))
        logz.log_tabular("MinReturn", np.min(returns))
        logz.log_tabular("EpLenMean", np.mean(ep_lengths))
        logz.log_tabular("EpLenStd", np.std(ep_lengths))
        logz.log_tabular("TimestepsThisBatch", timesteps_this_batch)
        logz.log_tabular("TimestepsSoFar", total_timesteps)
        logz.dump_tabular()
def train(self): """ Algorithm 1 in the DDPG paper. """ num_episodes = 0 t_start = time.time() obs = self.env.reset() for t in range(self.args.n_iter): if (t % self.args.log_every_t_iter == 0) and (t > self.args.wait_until_rbuffer): print("\n*** DDPG Iteration {} ***".format(t)) # Sample actions with noise injection and manage buffer. act = self.actor.sample_action(obs, train=True) new_obs, rew, done, info = self.env.step(act) self.rbuffer.add_sample(s=obs, a=act, r=rew, done=done) if done: obs = self.env.reset() num_episodes += 1 else: obs = new_obs if (t > self.args.wait_until_rbuffer) and ( t % self.args.learning_freq == 0): # Sample from the replay buffer. states_t_BO, actions_t_BA, rewards_t_B, states_tp1_BO, done_mask_B = \ self.rbuffer.sample(num=self.args.batch_size) feed = { 'obs_t_BO': states_t_BO, 'act_t_BA': actions_t_BA, 'rew_t_B': rewards_t_B, 'obs_tp1_BO': states_tp1_BO, 'done_mask_B': done_mask_B } # Update the critic, get sampled policy gradients, update actor. a_grads_BA, l2_error = self.critic.update_weights(feed) actor_gradients = self.actor.update_weights(feed, a_grads_BA) # Update both target networks. self.critic.update_target_net() self.actor.update_target_net() if (t % self.args.log_every_t_iter == 0) and (t > self.args.wait_until_rbuffer): # Do some rollouts here and then record statistics. Note that # some of these stats rely on stuff computed from sampling the # replay buffer, so be careful interpreting these. The code # probably needs to guard against this case as well. stats = self._do_rollouts() hours = (time.time() - t_start) / (60 * 60.) logz.log_tabular("MeanReward", np.mean(stats['reward'])) logz.log_tabular("MaxReward", np.max(stats['reward'])) logz.log_tabular("MinReward", np.min(stats['reward'])) logz.log_tabular("StdReward", np.std(stats['reward'])) logz.log_tabular("MeanLength", np.mean(stats['length'])) logz.log_tabular("NumTrainingEps", num_episodes) logz.log_tabular("L2ErrorCritic", l2_error) logz.log_tabular("QaGradL2Norm", np.linalg.norm(a_grads_BA)) logz.log_tabular("TimeHours", hours) logz.log_tabular("Iterations", t) logz.dump_tabular()
def run_vpg(args, vf_params, logdir, env, sess, continuous_control):
    """ General purpose method to run vanilla policy gradients, for both
    continuous and discrete action environments.

    Parameters
    ----------
    args: [Namespace]
        Contains user-provided (or default) arguments for VPGs.
    vf_params: [dict]
        Dictionary of parameters for the value function.
    logdir: [string]
        Where we store the outputs; can be None to avoid saving.
    env: [OpenAI gym env]
        The environment the agent is in, from OpenAI gym.
    sess: [tf Session]
        Current TensorFlow session, to be passed to (at least) the policy
        function, and the value function as well if it's a neural network.
    continuous_control: [boolean]
        True if continuous control (i.e. actions), False otherwise.
    """
    ob_dim = env.observation_space.shape[0]

    if args.vf_type == 'linear':
        vf = vfuncs.LinearValueFunction(**vf_params)
    elif args.vf_type == 'nn':
        vf = vfuncs.NnValueFunction(session=sess, ob_dim=ob_dim, **vf_params)

    if continuous_control:
        ac_dim = env.action_space.shape[0]
        policyfn = policies.GaussianPolicy(sess, ob_dim, ac_dim)
    else:
        ac_dim = env.action_space.n
        policyfn = policies.GibbsPolicy(sess, ob_dim, ac_dim)

    sess.__enter__()  # equivalent to `with sess:`
    tf.global_variables_initializer().run()  # pylint: disable=E1101

    total_timesteps = 0
    stepsize = args.initial_stepsize

    for i in range(args.n_iter):
        print("\n********** Iteration %i ************" % i)

        # Collect paths until we have enough timesteps for this batch.
        timesteps_this_batch = 0
        paths = []
        while True:
            ob = env.reset()
            terminated = False
            obs, acs, rewards = [], [], []
            animate_this_episode = (len(paths) == 0 and (i % 100 == 0) and args.render)
            while True:
                if animate_this_episode:
                    env.render()
                obs.append(ob)
                ac = policyfn.sample_action(ob)
                acs.append(ac)
                ob, rew, done, _ = env.step(ac)
                rewards.append(rew)
                if done:
                    break
            path = {"observation": np.array(obs),
                    "terminated": terminated,
                    "reward": np.array(rewards),
                    "action": np.array(acs)}
            paths.append(path)
            timesteps_this_batch += utils.pathlength(path)
            if timesteps_this_batch > args.min_timesteps_per_batch:
                break
        total_timesteps += timesteps_this_batch

        # Estimate the advantage function using the baseline vf (these are lists!).
        # return_t: list of sums of discounted rewards (to the end of the episode), one per time step.
        # vpred_t: list of the value function's predictions of the components of return_t.
        vtargs, vpreds, advs = [], [], []
        for path in paths:
            rew_t = path["reward"]
            return_t = utils.discount(rew_t, args.gamma)
            vpred_t = vf.predict(path["observation"])
            adv_t = return_t - vpred_t
            advs.append(adv_t)
            vtargs.append(return_t)
            vpreds.append(vpred_t)

        # Build arrays for the policy update and **re-fit the baseline**.
        ob_no = np.concatenate([path["observation"] for path in paths])
        ac_n = np.concatenate([path["action"] for path in paths])
        adv_n = np.concatenate(advs)
        std_adv_n = (adv_n - adv_n.mean()) / (adv_n.std() + 1e-8)
        vtarg_n = np.concatenate(vtargs)
        vpred_n = np.concatenate(vpreds)
        vf.fit(ob_no, vtarg_n)

        # Policy update, plus diagnostics. Is there a better way to handle
        # the continuous vs. discrete control cases?
        if continuous_control:
            surr_loss, oldmean_na, oldlogstd_a = policyfn.update_policy(
                ob_no, ac_n, std_adv_n, stepsize)
            kl, ent = policyfn.kldiv_and_entropy(ob_no, oldmean_na, oldlogstd_a)
        else:
            surr_loss, oldlogits_na = policyfn.update_policy(
                ob_no, ac_n, std_adv_n, stepsize)
            kl, ent = policyfn.kldiv_and_entropy(ob_no, oldlogits_na)

        # A step size heuristic to ensure that we don't take steps that are too large.
        if args.use_kl_heuristic:
            if kl > args.desired_kl * 2:
                stepsize /= 1.5
                print('PG stepsize -> %s' % stepsize)
            elif kl < args.desired_kl / 2:
                stepsize *= 1.5
                print('PG stepsize -> %s' % stepsize)
            else:
                print('PG stepsize OK')

        # Log diagnostics.
        if i % args.log_every_t_iter == 0:
            logz.log_tabular("EpRewMean", np.mean([path["reward"].sum() for path in paths]))
            logz.log_tabular("EpLenMean", np.mean([utils.pathlength(path) for path in paths]))
            logz.log_tabular("KLOldNew", kl)
            logz.log_tabular("Entropy", ent)
            logz.log_tabular("EVBefore", utils.explained_variance_1d(vpred_n, vtarg_n))
            logz.log_tabular("EVAfter", utils.explained_variance_1d(vf.predict(ob_no), vtarg_n))
            logz.log_tabular("SurrogateLoss", surr_loss)
            logz.log_tabular("TimestepsSoFar", total_timesteps)
            # If you're overfitting, EVAfter will be way larger than EVBefore.
            # Note that we fit the value function AFTER using it to compute the
            # advantage function, to avoid introducing bias.
            logz.dump_tabular()
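# utils.discount is only referenced in run_vpg above. A minimal sketch of the
# assumed behavior: discounted reward-to-go, y[t] = sum_{k >= t} gamma^(k-t) * x[k].
# The common implementation uses scipy.signal.lfilter; this loop version is an
# equivalent, assumption-level stand-in.
import numpy as np

def discount(x, gamma):
    """Discounted cumulative sums, computed backward over a reward sequence."""
    y = np.zeros(len(x), dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(x))):
        running = x[t] + gamma * running
        y[t] = running
    return y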
def train(self):
    for epoch in range(self.epochs):
        # Sample noise from the noise generator.
        epsilons = self.noise.sample(self.pop_size)
        pos_rewards, neg_rewards = [], []
        policy_weights = self.policy.w_policy

        # Generate 2 * pop_size perturbed policies and roll them out.
        for epsilon in epsilons:
            self.policy.w_policy = policy_weights + self.noise_std * epsilon
            pos_reward, pos_len = self.evaluate()
            pos_rewards.append(pos_reward)

            self.policy.w_policy = policy_weights - self.noise_std * epsilon
            neg_reward, neg_len = self.evaluate()
            neg_rewards.append(neg_reward)

        self.policy.w_policy = policy_weights
        std_rewards = np.array(pos_rewards + neg_rewards).std()

        # ARS update, optionally restricted to the elite directions.
        if self.elite_size != 0:
            scores = {
                k: max(pos_reward, neg_reward)
                for k, (pos_reward, neg_reward) in enumerate(zip(pos_rewards, neg_rewards))
            }
            sorted_scores = sorted(scores.keys(), key=lambda x: scores[x],
                                   reverse=True)[:self.elite_size]
            elite_pos_rewards = [pos_rewards[k] for k in sorted_scores]
            elite_neg_rewards = [neg_rewards[k] for k in sorted_scores]
            elite_epsilons = [epsilons[k] for k in sorted_scores]
            self.policy.update(elite_pos_rewards, elite_neg_rewards,
                               elite_epsilons, std_rewards)
        else:
            self.policy.update(pos_rewards, neg_rewards, epsilons, std_rewards)

        # Save the policy and log statistics.
        if epoch % self.save_freq == 0:
            train_rewards = np.array(pos_rewards + neg_rewards)
            test_rewards = []
            for _ in range(10):
                reward, _ = self.evaluate()
                test_rewards.append(reward)
            test_rewards = np.array(test_rewards)
            np.savez(self.log_dir + '/policy_weights', self.policy.w_policy)
            logz.log_tabular("Epoch", epoch)
            logz.log_tabular("AverageTrainReward", np.mean(train_rewards))
            logz.log_tabular("StdTrainRewards", np.std(train_rewards))
            logz.log_tabular("MaxTrainRewardRollout", np.max(train_rewards))
            logz.log_tabular("MinTrainRewardRollout", np.min(train_rewards))
            logz.log_tabular("AverageTestReward", np.mean(test_rewards))
            logz.log_tabular("StdTestRewards", np.std(test_rewards))
            logz.log_tabular("MaxTestRewardRollout", np.max(test_rewards))
            logz.log_tabular("MinTestRewardRollout", np.min(test_rewards))
            logz.dump_tabular()
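# self.policy.update is only referenced above. A minimal sketch of the kind of
# update it presumably performs, the ARS-style step from Mania et al. (2018):
#     w <- w + (alpha / (b * sigma_R)) * sum_k (r_pos_k - r_neg_k) * eps_k,
# where b is the number of directions used and sigma_R is the reward std.
# The function name and the step_size parameter are assumptions for
# illustration, not part of the original code.
import numpy as np

def ars_update(w_policy, pos_rewards, neg_rewards, epsilons, std_rewards,
               step_size=0.02):
    """Return updated policy weights under the assumed ARS update rule."""
    update = np.zeros_like(w_policy)
    for r_pos, r_neg, eps in zip(pos_rewards, neg_rewards, epsilons):
        update += (r_pos - r_neg) * np.asarray(eps).reshape(w_policy.shape)
    scale = step_size / (len(pos_rewards) * (std_rewards + 1e-8))
    return w_policy + scale * update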