def __init__(self):
    """ init """
    self.parse_args = self._init_parser()
    self.bl_decay = self.parse_args.bl_decay
    self.log_dir = self.parse_args.log_dir
    self.early_stop = self.parse_args.early_stop
    self.data_path = self.parse_args.data_path
    self.num_models = self.parse_args.num_models
    self.batch_size = self.parse_args.batch_size
    self.chunk_size = self.parse_args.chunk_size
    self._init_dir_path()

    self.model = PolicyModel(self.parse_args)
    algo_hyperparas = {'lr': self.parse_args.learning_rate}
    self.algorithm = ReinforcePolicyGradient(self.model, hyperparas=algo_hyperparas)
    self.autodl_agent = AutoDLAgent(self.algorithm, self.parse_args)
    self.total_reward = 0

#Create the environment
#----------------------------
env = gym.make(env_id)
if args.unwrap:
    env = env.unwrapped

a_dim = env.action_space.shape[0]
a_low = env.action_space.low[0]
a_high = env.action_space.high[0]
s_dim = env.observation_space.shape[0]

#Create the model
#----------------------------
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
policy = PolicyModel(sess, s_dim, a_dim, a_low, a_high)

#Start playing
#----------------------------
sess.run(tf.global_variables_initializer())

#Load the model
saver = tf.train.Saver(max_to_keep=2)
ckpt = tf.train.get_checkpoint_state(save_dir)

if ckpt:
    print("Loading the model ... ", end="")
    saver.restore(sess, ckpt.model_checkpoint_path)
    print("Done.")

logstd = np.zeros((1, a_dim), dtype=np.float32)
logstd.fill(-6.0)

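#Hedged sketch of the play loop this script builds up to. The sampling call
#policy.choose_action and its use of logstd are hypothetical names for whatever
#interface PolicyModel exposes; they are not taken from the source.
for it in range(10):
    ob = env.reset()
    total_reward = 0.0
    while True:
        env.render()
        action = policy.choose_action(ob[None, :], logstd)  #hypothetical helper, returns a batch of actions
        ob, reward, done, _ = env.step(action[0])
        total_reward += reward
        if done:
            print("Episode {}: return = {:.2f}".format(it, total_reward))
            break
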
adv_ph = tf.placeholder(tf.float32, [None], name="advantage")
return_ph = tf.placeholder(tf.float32, [None], name="return")
lr_ph = tf.placeholder(tf.float32, [])
clip_ph = tf.placeholder(tf.float32, [])

#Create the model
#----------------------------
config = tf.ConfigProto(
    allow_soft_placement=True,
    intra_op_parallelism_threads=n_env,
    inter_op_parallelism_threads=n_env
)
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
policy = PolicyModel(sess, s_dim, a_dim, a_low, a_high, "policy")

#Loss
#----------------------------
neg_logprob = policy.distrib.neg_logp(action_ph)
v_pred = policy.value
v_pred_clip = old_v_pred_ph + tf.clip_by_value(v_pred - old_v_pred_ph, -clip_ph, clip_ph)
v_loss1 = tf.square(v_pred - return_ph)
v_loss2 = tf.square(v_pred_clip - return_ph)
v_loss = 0.5 * tf.reduce_mean(tf.maximum(v_loss1, v_loss2))
ratio = tf.exp(old_neg_logprob_ph - neg_logprob)
pg_loss1 = -adv_ph * ratio
pg_loss2 = -adv_ph * tf.clip_by_value(ratio, 1.0 - clip_ph, 1.0 + clip_ph)

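#Hedged completion of the clipped PPO objective above. The entropy() call on
#policy.distrib, the loss coefficients, and the choice of Adam are assumptions,
#not taken from the source.
pg_loss = tf.reduce_mean(tf.maximum(pg_loss1, pg_loss2))
entropy = tf.reduce_mean(policy.distrib.entropy())  #assumed to exist on the distribution object
total_loss = pg_loss + 0.5 * v_loss - 0.01 * entropy
train_op = tf.train.AdamOptimizer(lr_ph).minimize(total_loss)
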
if "FIRE" in env.unwrapped.get_action_meanings(): env = env_wrapper.FireResetEnv(env) env = env_wrapper.WarpFrame(env) env = env_wrapper.FrameStack(env, n_stack) a_dim = env.action_space.n img_height, img_width, c_dim = env.observation_space.shape #Create the model #---------------------------- config = tf.ConfigProto() config.gpu_options.allow_growth = True sess = tf.Session(config=config) policy = PolicyModel(sess, img_height, img_width, c_dim, a_dim) #Start training #---------------------------- sess.run(tf.global_variables_initializer()) #Load the model if not os.path.exists(save_dir): os.mkdir(save_dir) saver = tf.train.Saver(max_to_keep=2) ckpt = tf.train.get_checkpoint_state(save_dir) if ckpt: print("Loading the model ... ", end="") saver.restore(sess, ckpt.model_checkpoint_path) print("Done.")
#Create multiple environments
#----------------------------
env = MultiEnv([make_env(i, env_id=env_id) for i in range(n_env)])
a_dim = env.ac_space.n
img_height, img_width, c_dim = env.ob_space.shape
runner = MultiEnvRunner(env, img_height, img_width, c_dim, n_step, n_stack, gamma)

#Create the model
#----------------------------
config = tf.ConfigProto(intra_op_parallelism_threads=n_env, inter_op_parallelism_threads=n_env)
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
policy = PolicyModel(sess, img_height, img_width, c_dim * n_stack, a_dim)

#Placeholders
#----------------------------
action_ph = tf.placeholder(tf.int32, [None], name="action")
adv_ph = tf.placeholder(tf.float32, [None], name="advantage")
discount_return_ph = tf.placeholder(tf.float32, [None], name="discounted_return")
lr_ph = tf.placeholder(tf.float32, [])

#Loss
#----------------------------
nll_loss = -policy.cat_dist.log_prob(action_ph)
pg_loss = tf.reduce_mean(adv_ph * nll_loss)
value_loss = tf.reduce_mean(
    tf.squared_difference(tf.squeeze(policy.value), discount_return_ph) / 2.0)

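#Hedged completion of the A2C objective above. The entropy() call on
#policy.cat_dist, the loss coefficients, the gradient-norm clip, and the
#RMSProp settings are assumptions, not taken from the source.
entropy = tf.reduce_mean(policy.cat_dist.entropy())  #assumed distribution interface
total_loss = pg_loss + 0.5 * value_loss - 0.01 * entropy
params = tf.trainable_variables()
grads = tf.gradients(total_loss, params)
grads, _ = tf.clip_by_global_norm(grads, 0.5)
train_op = tf.train.RMSPropOptimizer(lr_ph, decay=0.99, epsilon=1e-5).apply_gradients(
    list(zip(grads, params)))
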
env_id = args.env
save_dir = "./save_" + env_id

#Create the environment
#----------------------------
env = gym.make(env_id)
if args.unwrap:
    env = env.unwrapped

s_dim = env.observation_space.shape[0]
a_dim = env.action_space.n

#Create the model
#----------------------------
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
policy = PolicyModel(sess, s_dim, a_dim)

#Start playing
#----------------------------
sess.run(tf.global_variables_initializer())

#Load the model
saver = tf.train.Saver(max_to_keep=2)
ckpt = tf.train.get_checkpoint_state(save_dir)

if ckpt:
    print("Loading the model ... ", end="")
    saver.restore(sess, ckpt.model_checkpoint_path)
    print("Done.")

for it in range(100):
    ob = env.reset()
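    #Hedged continuation of the play loop above: policy.choose_action is a
    #hypothetical name for PolicyModel's sampling interface, not taken from the source.
    total_reward = 0.0
    while True:
        env.render()
        action = policy.choose_action(ob[None, :])  #hypothetical helper, returns a batch of actions
        ob, reward, done, _ = env.step(action[0])
        total_reward += reward
        if done:
            print("Episode {}: reward = {:.2f}".format(it, total_reward))
            break
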
env = MultiEnv([make_env(i, env_id=env_id, unwrap=args.unwrap) for i in range(n_env)])
s_dim = env.ob_space.shape[0]
a_dim = env.ac_space.n
runner = MultiEnvRunner(env, s_dim, a_dim, n_step, gamma, lamb)

#Create the model
#----------------------------
config = tf.ConfigProto(
    allow_soft_placement=True,
    intra_op_parallelism_threads=n_env,
    inter_op_parallelism_threads=n_env
)
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
policy = PolicyModel(sess, s_dim, a_dim, "policy")
dis = DiscriminatorModel(sess, s_dim, a_dim, name="discriminator")

#Placeholders
#----------------------------
#action_ph:          (mb_size)
#old_neg_logprob_ph: (mb_size)
#old_v_pred_ph:      (mb_size)
#adv_ph:             (mb_size)
#return_ph:          (mb_size)
action_ph = tf.placeholder(tf.int32, [None], name="action")
old_neg_logprob_ph = tf.placeholder(tf.float32, [None], name="old_negative_log_prob")
old_v_pred_ph = tf.placeholder(tf.float32, [None], name="old_value_pred")
adv_ph = tf.placeholder(tf.float32, [None], name="advantage")
return_ph = tf.placeholder(tf.float32, [None], name="return")

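#Hedged sketch of a typical GAIL-style discriminator objective for the model
#created above. The attributes dis.agent_logit and dis.expert_logit are
#hypothetical names for the two heads of DiscriminatorModel, and the label
#convention (expert = 1) is an assumption, not taken from the source.
d_loss = tf.reduce_mean(
    tf.nn.sigmoid_cross_entropy_with_logits(
        logits=dis.agent_logit, labels=tf.zeros_like(dis.agent_logit))
    + tf.nn.sigmoid_cross_entropy_with_logits(
        logits=dis.expert_logit, labels=tf.ones_like(dis.expert_logit)))
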
import random

import numpy as np
import torch
import torch.optim as optim
from torch.distributions import Categorical
import matplotlib.pyplot as plt


class ReinforceCartPole:
    """REINFORCE (vanilla policy gradient) agent for CartPole."""

    def __init__(self, env, gamma):
        self.env = env
        self.env.seed(543)
        torch.manual_seed(543)
        self.policy_model = PolicyModel()
        self.optimizer = optim.Adam(self.policy_model.parameters(), lr=0.009)
        self.gamma = gamma
        self.eps = np.finfo(np.float32).eps.item()
        self.loss_list = []
        self.ep_no_list = []

    def get_action(self, state):
        state_torch = torch.from_numpy(state).float().unsqueeze(0)
        probs = self.policy_model(state_torch)
        first_action_probability = probs[0][0]
        random_no = random.random()
        if random_no <= first_action_probability:
            action = torch.tensor(0, dtype=torch.long)
        else:
            action = torch.tensor(1, dtype=torch.long)
        m = Categorical(probs)
        # print(action)
        log_prob = m.log_prob(action)
        return action, log_prob

    def get_returns(self, episode_rewards):
        # Discounted returns, computed backwards and normalized for variance reduction.
        return_sum = 0.0
        returns = []
        for r in reversed(episode_rewards):
            return_sum = r + (self.gamma * return_sum)
            returns.append(return_sum)
        returns = torch.tensor(list(reversed(returns)))
        returns = (returns - returns.mean()) / (returns.std() + self.eps)
        return returns

    def optimize(self, episode_log_probs, episode_rewards):
        # REINFORCE loss: sum over the episode of -log_prob * normalized return.
        returns = self.get_returns(episode_rewards)
        policy_loss = []
        for logp, ret in zip(episode_log_probs, returns):
            policy_loss.append(-logp * ret)
        self.optimizer.zero_grad()
        loss = torch.cat(policy_loss).sum()
        self.loss_list.append(loss.item())
        loss.backward()
        self.optimizer.step()

    def train(self, no_episodes, limit=4000, rendering=False, max_steps=500000):
        running_reward = 10.0
        plot_rewards = []
        plot_episode_nos = []
        plot_mean_rewards = []
        plot_mr_epno = []
        for ep in range(1, no_episodes):
            state = self.env.reset()
            episode_rewards = []
            episode_log_probs = []
            ep_reward = 0.0
            for s in range(max_steps):
                action, log_prob = self.get_action(state)
                episode_log_probs.append(log_prob)
                state, r, done, _ = self.env.step(action.item())
                if rendering:
                    self.env.render()
                episode_rewards.append(r)
                ep_reward += r
                # print(next_state)
                # print(r)
                # print('\n')
                if done:
                    # time.sleep(0.5)
                    break
            running_reward = 0.05 * ep_reward + (1 - 0.05) * running_reward
            plot_rewards.append(ep_reward)
            plot_episode_nos.append(ep)
            if ep > 100:
                plot_mean_rewards.append(
                    sum(plot_rewards[-100:]) / len(plot_rewards[-100:]))
                plot_mr_epno.append(ep)
            self.optimize(episode_log_probs, episode_rewards)
            if ep % 25 == 0:
                print(
                    'Episode Number: {}\t Latest Reward: {:.2f}\tAverage Running reward: {:.2f}'
                    .format(ep, ep_reward, running_reward))
            #if running_reward > self.env.spec.reward_threshold:
            if running_reward > limit:
                print("Solved! Running reward is now {} and "
                      "the last episode runs to {} time steps!".format(
                          running_reward, s))
                break
        self.plot(plot_rewards, plot_mean_rewards, plot_mr_epno, plot_episode_nos)
        self.env.close()

    def plot(self, plot_rewards, plot_mean_rewards, plot_mr_epno, plot_episode_nos):
        plt.plot(plot_episode_nos, plot_rewards)
        plt.plot(plot_mr_epno, plot_mean_rewards)
        plt.ylabel('Running Rewards')
        plt.xlabel('Episode No')
        plt.legend(['Reward', '100 Episode Mean Reward'], loc='upper left')
        plt.savefig('RewardsVsEpisodeNo.png')
        plt.clf()
        plt.plot(plot_episode_nos, self.loss_list)
        plt.ylabel('Training Loss')
        plt.xlabel('Episode No')
        # plt.legend(['AgentScore'], loc='upper left')
        plt.savefig('LossVsEpisodeNo.png')

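#Hedged usage sketch for the class above. It assumes gym's CartPole-v1 and the
#PolicyModel referenced in __init__ are importable here; the episode budget and
#the reward limit of 475 are illustrative choices, not taken from the source.
if __name__ == '__main__':
    import gym
    cartpole_env = gym.make('CartPole-v1')
    agent = ReinforceCartPole(cartpole_env, gamma=0.99)
    agent.train(no_episodes=1500, limit=475)
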
#Create the environment
#----------------------------
env = gym.make(env_id)
s_dim = env.observation_space.shape[0]
a_dim = env.action_space.shape[0]
a_low = env.action_space.low[0]
a_high = env.action_space.high[0]

#Create the model
#----------------------------
config = tf.ConfigProto(allow_soft_placement=True)
config.gpu_options.allow_growth = True
sess = tf.Session(config=config)
policy = PolicyModel(sess, s_dim, a_dim, a_low, a_high, name="policy")
dis = DiscriminatorModel(sess, s_dim, a_dim, name="discriminator")

#Placeholders
#----------------------------
#action_ph: (mb_size, a_dim)
action_ph = tf.placeholder(tf.float32, [None, a_dim], name="action")
lr_ph = tf.placeholder(tf.float32, [])

#Loss
#----------------------------
loss = tf.reduce_mean(policy.distrib.neg_logp(action_ph))
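#Hedged sketch of the corresponding update op for the negative-log-likelihood
#(behavior-cloning-style) loss above; using Adam with the lr_ph placeholder is an
#assumption, not taken from the source. In practice the update would typically be
#restricted to the policy's own variables.
train_op = tf.train.AdamOptimizer(lr_ph).minimize(loss)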