# Discrete-action SAC (TensorFlow 1.x, work in progress).
# Assumes `import time`, `import numpy as np`, `import tensorflow as tf`, and the
# project's EpochLogger / ReplayBuffer at module level.
def sac(env_fn, seed=0, gamma=.99, lam=.97, hidden_sizes=(200, 100), alpha=.0,
        v_lr=1e-3, q_lr=1e-3, pi_lr=1e-3, polyak=1e-2, epochs=50,
        steps_per_epoch=1000, batch_size=100, start_steps=1000,
        logger_kwargs=dict(), replay_size=int(1e6), max_ep_len=1000,
        save_freq=1):
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()

    # Dimensions (discrete action space)
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.n

    # Placeholders
    x_ph = tf.placeholder(shape=[None, obs_dim], dtype=tf.float32)
    a_ph = tf.placeholder(shape=[None, 1], dtype=tf.float32)
    x2_ph = tf.placeholder(shape=[None, obs_dim], dtype=tf.float32)
    r_ph = tf.placeholder(shape=[None], dtype=tf.float32)
    d_ph = tf.placeholder(shape=[None], dtype=tf.float32)

    # Networks
    def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None):
        for h in hidden_sizes[:-1]:
            x = tf.layers.dense(x, units=h, activation=activation)
        return tf.layers.dense(x, units=hidden_sizes[-1],
                               activation=output_activation)

    def mlp_categorical_policy(x, a, hidden_sizes, activation,
                               output_activation, action_space):
        act_dim = action_space.n
        logits = mlp(x, list(hidden_sizes) + [act_dim], activation, None)
        pi_all = tf.nn.softmax(logits)
        logpi_all = tf.nn.log_softmax(logits)
        # Sampled action index, shape [batch, 1]
        pi = tf.random.categorical(logits, 1)
        # Log-probability of the sampled action, needed for the SAC targets below
        logp_pi = tf.reduce_sum(
            tf.one_hot(tf.squeeze(pi, axis=1), depth=act_dim) * logpi_all, axis=1)
        return pi, pi_all, logpi_all, logp_pi

    with tf.variable_scope("main"):
        activation = tf.tanh
        with tf.variable_scope("pi"):
            pi, pi_all, logpi_all, logp_pi = mlp_categorical_policy(
                x_ph, a_ph, hidden_sizes, activation, None, env.action_space)

        # Q(s, a) networks output a single value per (state, action) pair
        with tf.variable_scope("q1"):
            q1 = tf.squeeze(mlp(tf.concat([x_ph, a_ph], -1),
                                hidden_sizes + (1,), activation, None), axis=-1)
        with tf.variable_scope("q1", reuse=True):
            q1_pi = tf.squeeze(mlp(tf.concat([x_ph, tf.cast(pi, tf.float32)], axis=-1),
                                   hidden_sizes + (1,), activation, None), axis=-1)
        with tf.variable_scope("q2"):
            q2 = tf.squeeze(mlp(tf.concat([x_ph, a_ph], -1),
                                hidden_sizes + (1,), activation, None), axis=-1)
        with tf.variable_scope("q2", reuse=True):
            q2_pi = tf.squeeze(mlp(tf.concat([x_ph, tf.cast(pi, tf.float32)], -1),
                                   hidden_sizes + (1,), activation, None), axis=-1)
        with tf.variable_scope("v"):
            v = tf.squeeze(mlp(x_ph, hidden_sizes + (1,), activation, None), axis=-1)

    with tf.variable_scope("target"):
        with tf.variable_scope("v"):
            v_targ = tf.squeeze(mlp(x2_ph, hidden_sizes + (1,), activation, None),
                                axis=-1)

    # Helpers for variable counts
    def get_vars(scope=''):
        return [x for x in tf.trainable_variables() if scope in x.name]

    def count_vars(scope=''):
        v = get_vars(scope)
        return sum([np.prod(var.shape.as_list()) for var in v])

    # Count variables
    var_counts = tuple(count_vars(scope) for scope in
                       ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main'])
    print('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t v: %d, '
          '\t total: %d\n' % var_counts)

    # Targets
    q_backup_prestop = r_ph + gamma * (1 - d_ph) * v_targ
    v_backup_prestop = tf.minimum(q1_pi, q2_pi) - alpha * logp_pi
    q_backup, v_backup = tf.stop_gradient(q_backup_prestop), tf.stop_gradient(v_backup_prestop)

    # Q loss
    q1_loss = tf.reduce_mean((q1 - q_backup)**2)
    q2_loss = tf.reduce_mean((q2 - q_backup)**2)
    q_loss = q1_loss + q2_loss

    # V loss
    v_loss = tf.reduce_mean((v - v_backup)**2)

    # Policy loss
    pi_loss = tf.reduce_mean(-q1_pi + alpha * logp_pi)

    # Training ops
    v_trainop = tf.train.AdamOptimizer(v_lr).minimize(
        v_loss,
        var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="main/v"))
    q_trainop = tf.train.AdamOptimizer(q_lr).minimize(
        q_loss,
        var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="main/q"))
    pi_trainop = tf.train.AdamOptimizer(pi_lr).minimize(
        pi_loss,
        var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="main/pi"))

    assert polyak <= .5

    # Target update ops
    init_v_target = tf.group([
        tf.assign(v_target, v_main)
        for v_main, v_target in zip(
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="main/v"),
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="target/v"))
    ])
    update_v_target = tf.group([
        tf.assign(v_target, (1 - polyak) * v_target + polyak * v_main)
        for v_main, v_target in zip(
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="main/v"),
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="target/v"))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(init_v_target)

    # Set up model saving
    logger.setup_tf_saver(sess,
                          inputs={'x': x_ph, 'a': a_ph},
                          outputs={'pi': pi, 'q1': q1, 'q2': q2, 'v': v})

    def test_agent(n=10):
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                o, r, d, _ = test_env.step(
                    sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0][0])
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    # Buffer init (actions are stored as a single scalar index)
    buffer = ReplayBuffer(obs_dim, 1, replay_size)

    # Main loop
    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    for t in range(total_steps):
        if t > start_steps:
            a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0][0]
        else:
            a = env.action_space.sample()

        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the done signal when it only comes from hitting the time horizon
        d = False if ep_len == max_ep_len else d

        o2 = np.squeeze(o2)
        buffer.store(o, a, r, o2, d)
        o = o2

        if d or (ep_len == max_ep_len):
            for j in range(ep_len):
                batch = buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done'],
                }

                # Value gradient step
                v_step_ops = [v_loss, v, v_trainop]
                outs = sess.run(v_step_ops, feed_dict)
                logger.store(LossV=outs[0], VVals=outs[1])

                # Q gradient step
                q_step_ops = [q_loss, q1, q2, q_trainop]
                outs = sess.run(q_step_ops, feed_dict)
                logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2])

                # Policy gradient step
                # TODO: add entropy logging
                pi_step_ops = [pi_loss, pi_trainop, update_v_target]
                outs = sess.run(pi_step_ops, feed_dict=feed_dict)
                logger.store(LossPi=outs[0])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0., 0

        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Save the model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            test_agent()

            # Log info about the epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
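# ---------------------------------------------------------------------------
# The SAC/DDPG/TD3 snippets in this file rely on a project-level ReplayBuffer
# that is not shown. The class below is a minimal, assumed sketch matching the
# interface they use: store(obs, act, rew, next_obs, done) and
# sample_batch(batch_size) returning a dict of numpy arrays. Note that the
# DDPG/TD3 snippets read the key 'obs1' where this sketch uses 'obs', and the
# PyTorch agents below use their own, different buffer interfaces.
# ---------------------------------------------------------------------------
import numpy as np


class ReplayBuffer:
    """Minimal FIFO experience replay buffer (sketch, not the project's class)."""

    def __init__(self, obs_dim, act_dim, size):
        self.obs_buf = np.zeros((size, obs_dim), dtype=np.float32)
        self.obs2_buf = np.zeros((size, obs_dim), dtype=np.float32)
        self.acts_buf = np.zeros((size, act_dim), dtype=np.float32)
        self.rews_buf = np.zeros(size, dtype=np.float32)
        self.done_buf = np.zeros(size, dtype=np.float32)
        self.ptr, self.size, self.max_size = 0, 0, size

    def store(self, obs, act, rew, next_obs, done):
        # Overwrite the oldest entry once the buffer is full
        self.obs_buf[self.ptr] = obs
        self.obs2_buf[self.ptr] = next_obs
        self.acts_buf[self.ptr] = act
        self.rews_buf[self.ptr] = rew
        self.done_buf[self.ptr] = done
        self.ptr = (self.ptr + 1) % self.max_size
        self.size = min(self.size + 1, self.max_size)

    def sample_batch(self, batch_size=32):
        # Uniformly sample past transitions
        idxs = np.random.randint(0, self.size, size=batch_size)
        return dict(obs=self.obs_buf[idxs],
                    obs2=self.obs2_buf[idxs],
                    acts=self.acts_buf[idxs],
                    rews=self.rews_buf[idxs],
                    done=self.done_buf[idxs])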
import random

import numpy as np
import torch
import torch.nn.functional as F

# LinearModel and ReplayBuffer are assumed to be provided elsewhere in the project.


class DQNAgent:
    def __init__(self, env, state_size, action_size, batch_size, gamma, lr,
                 update_every, tau, eps_start, eps_end, eps_decay, seed):
        # Store every constructor argument as an attribute of the same name
        for key, value in locals().items():
            if key != 'self':
                setattr(self, key, value)

        random.seed(seed)
        torch.manual_seed(seed)
        np.random.seed(seed)

        self.Q_target = LinearModel(state_size, action_size)
        self.Q_local = LinearModel(state_size, action_size)
        self.memory = ReplayBuffer(batch_size=batch_size)
        self.optim = torch.optim.Adam(self.Q_local.parameters(), lr=lr)
        self.update_counter = 0

    def env_reset(self, train_mode=True):
        return self.env.reset()

    def env_step(self, action):
        return self.env.step(action)

    def env_render(self, train_mode=False):
        return self.env.render()

    def env_close(self, train_mode=True):
        if not train_mode:
            return self.env.close()

    def get_action(self, state, epsilon=0.):
        # Epsilon-greedy action selection
        if random.random() < epsilon:
            return np.random.choice(np.arange(self.action_size))
        state = torch.tensor(state, dtype=torch.float32).unsqueeze(0)
        self.Q_local.eval()
        with torch.no_grad():
            action = np.argmax(self.Q_local(state).data.numpy())
        return action

    def step(self, state, action, reward, next_state, done):
        self.memory.store((state, action, reward, next_state, 1 if done else 0))
        self.update_counter = (self.update_counter + 1) % self.update_every
        if self.update_counter == 0:
            self.update_Q()

    def update_Q(self):
        states, actions, rewards, next_states, dones = self.memory.sample()

        # Bootstrapped target from the slow-moving target network
        Q_target_next = self.Q_target(next_states).detach().max(dim=1, keepdim=True)[0]
        Q_target_pred = rewards + self.gamma * Q_target_next * (1.0 - dones)

        self.Q_local.train()
        Q = self.Q_local(states).gather(1, actions)
        loss = F.mse_loss(Q, Q_target_pred)

        self.Q_local.zero_grad()
        loss.backward()
        self.optim.step()

        # Soft (Polyak) update of the target network
        for t_param, l_param in zip(self.Q_target.parameters(),
                                    self.Q_local.parameters()):
            t_param.data.copy_(self.tau * l_param.data + (1.0 - self.tau) * t_param.data)

    def train(self, num_episodes, max_t=1000, is_finished=None, render=False):
        scores = []
        eps = self.eps_start
        for i in range(num_episodes):
            state = self.env_reset(train_mode=True)
            score = 0
            for _ in range(max_t):
                action = self.get_action(state, eps)
                if render:
                    self.env_render(train_mode=True)
                next_state, reward, done, _ = self.env_step(action)
                self.step(state, action, reward, next_state, done)
                score += reward
                state = next_state
                if done:
                    break
            eps = max(self.eps_end, eps * self.eps_decay)
            scores.append(score)
            if is_finished and is_finished(scores, num_episodes):
                break
        if render:
            self.env_close(train_mode=False)
        return scores

    def run(self, num_episodes=1, max_t=1000, render=None):
        if render is None:
            render = num_episodes == 1
        scores = []
        for i in range(num_episodes):
            state = self.env_reset(train_mode=False)
            score = 0
            for _ in range(max_t):
                action = self.get_action(state)
                if render:
                    self.env_render(train_mode=False)
                next_state, reward, done, _ = self.env_step(action)
                score += reward
                state = next_state
                if done:
                    break
            scores.append(score)
        if render:
            self.env_close(train_mode=False)
        return scores
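# ---------------------------------------------------------------------------
# DQNAgent above expects a LinearModel Q-network that is not defined in this
# excerpt. Below is an assumed, minimal stand-in: as the name suggests, a single
# linear layer mapping a state vector to one Q-value per action. A deeper MLP
# could be substituted without changing the agent.
# ---------------------------------------------------------------------------
import torch.nn as nn


class LinearModel(nn.Module):
    """Minimal Q-network sketch: state -> one Q-value per action."""

    def __init__(self, state_size, action_size):
        super().__init__()
        self.fc = nn.Linear(state_size, action_size)

    def forward(self, state):
        return self.fc(state)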
class BaseStudent(SerializableAgent):
    def __init__(self,
                 env: gym.Env,
                 trajs_path: str,
                 model_path: str,
                 run_seed: int,
                 batch_size: int,
                 buffer_size_in_trajs: int,
                 teacher: BaseAgent,
                 ):
        super(BaseStudent, self).__init__(
            env=env,
            trajs_path=trajs_path,
            model_path=model_path,
        )

        self.run_seed = run_seed
        self.batch_size = batch_size
        self.buffer_size_in_trajs = buffer_size_in_trajs
        self.teacher = teacher

        self._fill_buffer()

    def matchup(self) -> np.ndarray:
        # Element-wise agreement with the teacher on the buffered states
        samples = self.buffer.sample_all()
        state = samples['state']
        action = samples['action']
        action_hat = np.array([self.select_action(s) for s in state])
        match_samp = np.equal(action, action_hat)
        return match_samp

    def rollout(self) -> Tuple[List[Tuple[np.ndarray, np.ndarray]], List[float], float]:
        state = self.env.reset()
        traj = []
        match = []
        retvrn = 0
        done = False
        while not done:
            action = self.select_action(state)
            reward, next_state, done = self.perform_action(action)
            traj += [(state, action)]
            match += [action == self.teacher.select_action(state)]
            retvrn += reward
            state = next_state
        return traj, match, retvrn

    def test(self, num_episodes: int) -> Tuple[float, float, float]:
        self.test_mode = True
        trajs = []
        matches = []
        returns = []
        for episode_index in range(num_episodes):
            traj, match, retvrn = self.rollout()
            trajs += [traj]
            matches += match
            returns += [retvrn]
        np.save(self.trajs_path, {'trajs': trajs, 'returns': returns})
        return np.sum(matches) / len(matches), np.mean(returns), np.std(returns)

    def serialize(self):
        raise NotImplementedError

    def deserialize(self):
        raise NotImplementedError

    def _fill_buffer(self):
        # Load the teacher's saved trajectories and keep every 20th (state, action) pair
        data = np.load(self.teacher.trajs_path, allow_pickle=True)[()]
        trajs = data['trajs'][self.run_seed:self.run_seed + self.buffer_size_in_trajs]
        pairs = [pair for traj in trajs for i, pair in enumerate(traj) if i % 20 == 0]
        if len(pairs) < self.batch_size:
            self.batch_size = len(pairs)
        self.buffer = ReplayBuffer(
            state_dim=self.env.observation_space.shape[0],
            total_size=len(pairs),
            batch_size=self.batch_size,
        )
        for pair in pairs:
            self.buffer.store(
                state=pair[0],
                action=pair[1],
                reward=None,
                next_state=None,
                done=None,
            )
# DDPG (TensorFlow 1.x).
# Assumes `import time`, `import numpy as np`, `import tensorflow as tf`, plus the
# project's a2c actor-critic builder, EpochLogger and ReplayBuffer at module level.
def ddpg(env_fn, actor_critic=a2c, ac_kwargs=dict(), seed=0,
         steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=.99,
         polyak=.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000,
         act_noise=.1, max_ep_len=1000, logger_kwargs=dict(), save_freq=1):
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping (assumes all dimensions share the same bound)
    act_limit = env.action_space.high[0]

    # Share action space info with the actor-critic builder
    ac_kwargs['action_space'] = env.action_space

    x_ph = tf.placeholder(name='x_ph', shape=[None, obs_dim], dtype=tf.float32)
    a_ph = tf.placeholder(name='a_ph', shape=[None, act_dim], dtype=tf.float32)
    x2_ph = tf.placeholder(name='x2_ph', shape=[None, obs_dim], dtype=tf.float32)
    r_ph = tf.placeholder(name='r_ph', shape=[None], dtype=tf.float32)
    d_ph = tf.placeholder(name='d_ph', shape=[None], dtype=tf.float32)

    # Main networks
    with tf.variable_scope('main'):
        pi, q, q_pi = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Target networks
    with tf.variable_scope('target'):
        pi_targ, _, q_pi_targ = actor_critic(x2_ph, a_ph, **ac_kwargs)

    replaybuffer = ReplayBuffer(obs_dim, act_dim, replay_size)

    # Helpers for variable counts
    def get_vars(scope=''):
        return [x for x in tf.trainable_variables() if scope in x.name]

    def count_vars(scope=''):
        v = get_vars(scope)
        return sum([np.prod(var.shape.as_list()) for var in v])

    var_counts = tuple(count_vars(scope) for scope in ['main/pi', 'main/q', 'main'])
    print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n' % var_counts)

    # Bellman backup for the Q function
    backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * q_pi_targ)

    # Losses
    pi_loss = -tf.reduce_mean(q_pi)
    q_loss = tf.reduce_mean((q - backup)**2)

    # Optimizers and train ops
    train_pi_op = tf.train.AdamOptimizer(pi_lr).minimize(
        pi_loss, var_list=get_vars('main/pi'))
    train_q_op = tf.train.AdamOptimizer(q_lr).minimize(
        q_loss, var_list=get_vars('main/q'))

    # Polyak-averaged target network update
    target_update = tf.group([
        tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    # Initialize targets to match the main networks
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Set up model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph},
                          outputs={'pi': pi, 'q': q})

    def get_actions(o, noise_scale):
        a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0]
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def test_agent(n=10):
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                o, r, d, _ = test_env.step(get_actions(o, 0))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop
    for t in range(total_steps):
        if t > start_steps:
            a = get_actions(o, act_noise)
        else:
            a = env.action_space.sample()

        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the done signal when it only comes from hitting the time horizon
        d = False if ep_len == max_ep_len else d

        # Store experience
        replaybuffer.store(o, a, r, o2, d)
        o = o2

        if d or (ep_len == max_ep_len):
            for _ in range(ep_len):
                batch = replaybuffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done'],
                }

                # Q-learning update
                outs = sess.run([q_loss, q, train_q_op], feed_dict)
                logger.store(LossQ=outs[0], QVals=outs[1])

                # Policy update
                outs = sess.run([pi_loss, train_pi_op, target_update], feed_dict)
                logger.store(LossPi=outs[0])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            test_agent()

            # Log info about the epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('QVals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
# TD3-style training loop (TF2/Keras agent method).
# Assumes `import numpy as np`, `import matplotlib.pyplot as plt`, and the
# project's ReplayBuffer are available.
def train(self):
    replay_buffer = ReplayBuffer(self.state_dim, self.act_dim, self.replay_size)
    total_steps = self.steps_per_epoch * self.epochs

    state = self.env.reset()
    state = state.astype(np.float32)
    ep_len, ep_rew, ep_count = 0, 0, 0
    all_ep_rew = []

    for t in range(total_steps):
        # Randomly sample actions until start_steps have elapsed
        if t > self.start_steps:
            act = self.sample_action(state)
            act = act.numpy()
        else:
            act = self.env.action_space.sample()

        state_, r, d, _ = self.env.step(act)
        state_ = state_.astype(np.float32)
        d = False if ep_len == self.max_ep_len else d
        ep_len += 1
        ep_rew += r

        # Store transition
        replay_buffer.store(state, act, r, state_, d)
        state = state_

        # End of trajectory
        if d or (ep_len == self.max_ep_len):
            state = self.env.reset()
            state = state.astype(np.float32)
            if len(all_ep_rew) < 5:
                all_ep_rew.append(ep_rew)
            else:
                all_ep_rew.append(ep_rew)
                all_ep_rew[-1] = np.mean(all_ep_rew[-5:])  # smoothing
            epoch = (t + 1) // self.steps_per_epoch
            print("Training | Epoch:{} | Episode:{} | Steps: {}/{} | "
                  "Episode Reward: {:.4f}".format(epoch, ep_count, t,
                                                  total_steps, ep_rew))
            ep_len, ep_rew = 0, 0
            ep_count += 1

        # Update
        if t > self.update_after and t % self.update_every == 0:
            for j in range(self.update_every):
                batch = replay_buffer.sample_batch(self.batch_size)
                self.update_critic(batch)
                # Delayed policy and target updates
                if j % self.policy_delay == 0:
                    self.update_actor(batch)
                    self.update_targets()

        # End of epoch
        if (t + 1) % self.steps_per_epoch == 0:
            epoch = (t + 1) // self.steps_per_epoch
            # Save model
            if (epoch % self.save_freq == 0) or (epoch == self.epochs):
                self.actor.save_weights(self.save_path + 'actor_checkpoint' + str(epoch))
                self.critic_net1.save_weights(self.save_path + 'critic_net1_checkpoint' + str(epoch))
                self.critic_net2.save_weights(self.save_path + 'critic_net2_checkpoint' + str(epoch))

    plt.figure()
    plt.plot(all_ep_rew)
    plt.xlabel('episodes')
    plt.ylabel('total reward per episode')
    plt.show()
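# ---------------------------------------------------------------------------
# The loop above calls self.update_critic, self.update_actor and
# self.update_targets, which are defined elsewhere in the class. As an
# illustration of the Polyak-averaged target update it relies on, here is a
# hypothetical sketch for TF2/Keras networks; the attribute names
# (actor_target, critic_net1_target, critic_net2_target, polyak) are
# assumptions, not taken from the original class.
def update_targets(self):
    pairs = [(self.actor_target, self.actor),
             (self.critic_net1_target, self.critic_net1),
             (self.critic_net2_target, self.critic_net2)]
    for target_net, main_net in pairs:
        for t_var, m_var in zip(target_net.weights, main_net.weights):
            # target <- polyak * target + (1 - polyak) * main
            t_var.assign(self.polyak * t_var + (1.0 - self.polyak) * m_var)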
def train(env_fn, env_name, ac_kwargs=dict(), seed=0, steps_per_epoch=1000,
          epochs=3000, replay_size=int(1e6), gamma=0.99, polyak=0.995, lr=3e-4,
          batch_size=64, start_steps=10000, update_after=10000, update_every=1,
          num_test_episodes=10, value_coef=0.5, entropy_coef=0.02,
          max_ep_len=1000, logger_kwargs=dict(), save_freq=10,
          device=torch.device('cpu')):
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    torch.manual_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    env.seed(seed)
    test_env.seed(seed)

    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share the same bound!
    act_limit = env.action_space.high[0]

    actor_critic = MLPActorCritic(env.observation_space, env.action_space,
                                  **ac_kwargs).to(device)
    sql = SQL(actor_critic, lr, batch_size, update_every, gamma, polyak,
              value_coef, entropy_coef)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim,
                                 size=replay_size, device=device)

    rewards_log = []
    episode_rewards = deque(maxlen=10)

    # Set up model saving
    logger.setup_pytorch_saver(sql.actor_critic)

    def test_agent():
        for j in range(num_test_episodes):
            o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                action = sql.actor_critic.act(
                    torch.as_tensor(o, dtype=torch.float32).to(device))
                o, r, d, _ = test_env.step(action)
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)
            episode_rewards.append(ep_ret)

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):

        # Until start_steps have elapsed, randomly sample actions from a uniform
        # distribution for better exploration. Afterwards, use the learned policy.
        if t > start_steps:
            a = sql.actor_critic.act(
                torch.as_tensor(o, dtype=torch.float32).to(device))
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time horizon
        # (that is, when it's an artificial terminal signal that isn't based
        # on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update the most
        # recent observation!
        o = o2

        # End of trajectory handling
        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, ep_ret, ep_len = env.reset(), 0, 0

        # Update handling
        if t >= update_after:
            for _ in range(update_every):
                batch = replay_buffer.sample_batch(batch_size)
                loss = sql.update(data=batch)
                logger.store(Loss=loss)
        else:
            logger.store(Loss=0.)

        # End of epoch handling
        if (t + 1) % steps_per_epoch == 0:
            epoch = (t + 1) // steps_per_epoch

            # Save model
            if save_freq != 0 and ((epoch % save_freq == 0) or (epoch == epochs)):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()
            rewards_log.append(np.mean(episode_rewards))

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Time', time.time() - start_time)
            logger.log_tabular('Loss', average_only=True)
            logger.dump_tabular()

    rewards_log = np.array(rewards_log)
    save_path = '../../log/modified_sql/' + env_name + '/' + str(seed) + '.npy'
    np.save(save_path, rewards_log)
# TD3 (TensorFlow 1.x).
# Assumes `import time`, `import numpy as np`, `import tensorflow as tf`, plus the
# project's a2c actor-critic builder, EpochLogger and ReplayBuffer at module level.
def td3(env_fn, actor_critic=a2c, ac_kwargs=dict(), seed=0,
        steps_per_epoch=5000, epochs=100, replay_size=int(1e6), gamma=.99,
        polyak=.995, pi_lr=1e-3, q_lr=1e-3, batch_size=100, start_steps=10000,
        act_noise=.1, target_noise=.2, noise_clip=.5, policy_delay=2,
        max_ep_len=1000, logger_kwargs=dict(), save_freq=1):
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping
    act_limit = env.action_space.high[0]

    # Share action space info with the actor-critic builder
    ac_kwargs['action_space'] = env.action_space

    x_ph = tf.placeholder(name='x_ph', shape=(None, obs_dim), dtype=tf.float32)
    a_ph = tf.placeholder(name='a_ph', shape=(None, act_dim), dtype=tf.float32)
    x2_ph = tf.placeholder(name='x2_ph', shape=(None, obs_dim), dtype=tf.float32)
    r_ph = tf.placeholder(name='r_ph', shape=(None,), dtype=tf.float32)
    d_ph = tf.placeholder(name='d_ph', shape=(None,), dtype=tf.float32)

    # Actor policy and value functions
    with tf.variable_scope('main'):
        pi, q1, q2, q1_pi = actor_critic(x_ph, a_ph, **ac_kwargs)

    # This is a bit memory-inefficient: the Q values built alongside the target
    # policy (and the policy built alongside the target Qs) are never referenced,
    # but they are still declared, at the cost of GPU memory.

    # Target policy
    with tf.variable_scope('target'):
        pi_targ, _, _, _ = actor_critic(x2_ph, a_ph, **ac_kwargs)

    # Target Q networks
    with tf.variable_scope('target', reuse=True):
        # Target policy smoothing: add clipped noise to the target action
        epsilon = tf.random_normal(tf.shape(pi_targ), stddev=target_noise)
        epsilon = tf.clip_by_value(epsilon, -noise_clip, noise_clip)
        a2 = pi_targ + epsilon
        a2 = tf.clip_by_value(a2, -act_limit, act_limit)

        # Target Q-values using actions from the target policy
        _, q1_targ, q2_targ, _ = actor_critic(x2_ph, a2, **ac_kwargs)

    replaybuffer = ReplayBuffer(obs_dim, act_dim, size=replay_size)

    # Helpers for variable counts
    def get_vars(scope=''):
        return [x for x in tf.trainable_variables() if scope in x.name]

    def count_vars(scope=''):
        v = get_vars(scope)
        return sum([np.prod(var.shape.as_list()) for var in v])

    # Count variables
    var_counts = tuple(count_vars(scope) for scope in
                       ['main/pi', 'main/q1', 'main/q2', 'main'])
    print('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t total: %d\n'
          % var_counts)

    # Clipped double Q-learning with Bellman backup
    min_q_targ = tf.minimum(q1_targ, q2_targ)
    backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * min_q_targ)

    # TD3 losses
    pi_loss = -tf.reduce_mean(q1_pi)
    q1_loss = tf.reduce_mean((q1 - backup)**2)
    q2_loss = tf.reduce_mean((q2 - backup)**2)
    q_loss = q1_loss + q2_loss

    # Training ops (restrict each optimizer to its own variables)
    pi_train = tf.train.AdamOptimizer(pi_lr).minimize(
        pi_loss, var_list=get_vars('main/pi'))
    q_train = tf.train.AdamOptimizer(q_lr).minimize(
        q_loss, var_list=get_vars('main/q'))

    # Polyak-averaged target update
    target_update = tf.group([
        tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_targ, v_main in zip(get_vars('target'), get_vars('main'))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Set up model saving
    logger.setup_tf_saver(sess, inputs={'x': x_ph, 'a': a_ph},
                          outputs={'pi': pi, 'q1': q1, 'q2': q2})

    def get_action(o, noise_scale):
        a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0]
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def test_agent(n=10):
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                o, r, d, _ = test_env.step(get_action(o, 0))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop
    for t in range(total_steps):
        if t > start_steps:
            a = get_action(o, act_noise)
        else:
            a = env.action_space.sample()

        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the done signal when it only comes from hitting the time horizon
        d = False if ep_len == max_ep_len else d

        o2 = np.squeeze(o2)
        replaybuffer.store(o, a, r, o2, d)
        o = o2

        if d or (ep_len == max_ep_len):
            for j in range(ep_len):
                batch = replaybuffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done'],
                }

                q_step_ops = [q_loss, q1, q2, q_train]
                outs = sess.run(q_step_ops, feed_dict)
                logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2])

                # Delayed policy and target updates
                if j % policy_delay == 0:
                    outs = sess.run([pi_loss, pi_train, target_update], feed_dict)
                    logger.store(LossPi=outs[0])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Saving the model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            test_agent()

            # Log info about the epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
# Continuation of a constructor call whose beginning is not shown in this excerpt.
                  target_update_freq, gamma, explor_period, seed, env)
env.reset()

#######
# Prefill the buffer with random experience
prefill_buffer_size = 50000
buffer.reset()
for _ in range(prefill_buffer_size):
    action = np.random.randint(0, len(env.action_space))
    current_state = np.copy(env.state)
    next_state, reward, done = env.step(action)
    buffer.store(current_state, action, reward, done, prefill=True)
    if done:
        env.reset()

# Reset once prefilling is done
env.reset()

###########
# Train the network
dqn.training = True
training_steps = 10000
total_reward = 0
# Continuous-action SAC (TensorFlow 1.x).
# Assumes `import time`, `import numpy as np`, `import tensorflow as tf`, and the
# project's EpochLogger / ReplayBuffer at module level.
def sac(env_fn, seed=0, gamma=.99, lam=.97, hidden_sizes=(200, 100), alpha=.5,
        v_lr=1e-3, q_lr=1e-3, pi_lr=1e-3, polyak=1e-2, epochs=50,
        steps_per_epoch=1000, batch_size=100, start_steps=10000,
        logger_kwargs=dict(), replay_size=int(1e6), max_ep_len=1000,
        save_freq=1):
    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()

    # Dimensions
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    act_limit = env.action_space.high[0]

    # Placeholders
    x_ph = tf.placeholder(shape=[None, obs_dim], dtype=tf.float32)
    a_ph = tf.placeholder(shape=[None, act_dim], dtype=tf.float32)
    x2_ph = tf.placeholder(shape=[None, obs_dim], dtype=tf.float32)
    r_ph = tf.placeholder(shape=[None], dtype=tf.float32)
    d_ph = tf.placeholder(shape=[None], dtype=tf.float32)

    # Networks
    def mlp(x, hidden_sizes=(32,), activation=tf.tanh, output_activation=None):
        for h in hidden_sizes[:-1]:
            x = tf.layers.dense(x, units=h, activation=activation)
        return tf.layers.dense(x, units=hidden_sizes[-1],
                               activation=output_activation)

    def gaussian_likelihood(x, mu, log_std):
        EPS = 1e-8
        pre_sum = -0.5 * (((x - mu) / (tf.exp(log_std) + EPS))**2
                          + 2 * log_std + np.log(2 * np.pi))
        return tf.reduce_sum(pre_sum, axis=1)

    def clip_but_pass_gradient(x, l=-1., u=1.):
        clip_up = tf.cast(x > u, tf.float32)
        clip_low = tf.cast(x < l, tf.float32)
        return x + tf.stop_gradient((u - x) * clip_up + (l - x) * clip_low)

    LOG_STD_MIN = -20
    LOG_STD_MAX = 2

    def mlp_gaussian_policy(x, a, hidden_sizes, activation, output_activation):
        act_dim = a.shape.as_list()[-1]
        net = mlp(x, list(hidden_sizes), activation, activation)
        mu = tf.layers.dense(net, act_dim, activation=output_activation)
        """
        Because the algorithm maximizes a trade-off of reward and entropy, the
        entropy must be unique to the state---so the log_stds need to be a
        neural network output instead of a shared-across-states learnable
        parameter vector. But for deep ReLU and similar nets, simply sticking an
        activationless dense layer at the end would be quite bad---at the
        beginning of training, a randomly initialized net could produce
        extremely large values for the log_stds, which would result in some
        actions being either entirely deterministic or too random to come back
        to earth. Either of these introduces numerical instability which could
        break the algorithm. To protect against that, we constrain the output
        range of the log_stds to lie within [LOG_STD_MIN, LOG_STD_MAX]. This is
        slightly different from the trick used by the original authors of
        SAC---they used tf.clip_by_value instead of squashing and rescaling.
        The approach here allows gradient propagation through log_std where
        clipping would not, though it is unclear whether it makes much of a
        difference.
        """
        log_std = tf.layers.dense(net, act_dim, activation=tf.tanh)
        log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (log_std + 1)

        std = tf.exp(log_std)
        pi = mu + tf.random_normal(tf.shape(mu)) * std
        logp_pi = gaussian_likelihood(pi, mu, log_std)
        return mu, pi, logp_pi

    def apply_squashing_func(mu, pi, logp_pi):
        mu = tf.tanh(mu)
        pi = tf.tanh(pi)
        # To avoid machine-precision error, strictly clip 1 - pi**2 to the [0, 1] range.
        logp_pi -= tf.reduce_sum(
            tf.log(clip_but_pass_gradient(1 - pi**2, l=0, u=1) + 1e-6), axis=1)
        return mu, pi, logp_pi

    with tf.variable_scope("main"):
        activation = tf.tanh

        with tf.variable_scope("pi"):
            mu, pi, logp_pi = mlp_gaussian_policy(x_ph, a_ph, hidden_sizes,
                                                  tf.tanh, None)
            mu, pi, logp_pi = apply_squashing_func(mu, pi, logp_pi)

        with tf.variable_scope("q1"):
            q1 = tf.squeeze(mlp(tf.concat([x_ph, a_ph], -1),
                                hidden_sizes + (1,), activation, None), axis=-1)
        with tf.variable_scope("q1", reuse=True):
            q1_pi = tf.squeeze(mlp(tf.concat([x_ph, pi], -1),
                                   hidden_sizes + (1,), activation, None), axis=-1)
        with tf.variable_scope("q2"):
            q2 = tf.squeeze(mlp(tf.concat([x_ph, a_ph], -1),
                                hidden_sizes + (1,), activation, None), axis=-1)
        with tf.variable_scope("q2", reuse=True):
            q2_pi = tf.squeeze(mlp(tf.concat([x_ph, pi], -1),
                                   hidden_sizes + (1,), activation, None), axis=-1)
        with tf.variable_scope("v"):
            v = tf.squeeze(mlp(x_ph, hidden_sizes + (1,), activation, None), axis=-1)

    with tf.variable_scope("target"):
        with tf.variable_scope("v"):
            v_targ = tf.squeeze(mlp(x2_ph, hidden_sizes + (1,), activation, None),
                                axis=-1)

    # Helpers for variable counts
    def get_vars(scope=''):
        return [x for x in tf.trainable_variables() if scope in x.name]

    def count_vars(scope=''):
        v = get_vars(scope)
        return sum([np.prod(var.shape.as_list()) for var in v])

    # Count variables
    var_counts = tuple(count_vars(scope) for scope in
                       ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main'])
    print('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, \t v: %d, '
          '\t total: %d\n' % var_counts)

    # Targets
    q_backup_prestop = r_ph + gamma * (1 - d_ph) * v_targ
    v_backup_prestop = tf.minimum(q1_pi, q2_pi) - alpha * logp_pi
    q_backup, v_backup = tf.stop_gradient(q_backup_prestop), tf.stop_gradient(v_backup_prestop)

    # Q loss
    q1_loss = tf.reduce_mean((q1 - q_backup)**2)
    q2_loss = tf.reduce_mean((q2 - q_backup)**2)
    q_loss = q1_loss + q2_loss

    # V loss
    v_loss = tf.reduce_mean((v - v_backup)**2)

    # Policy loss
    pi_loss = tf.reduce_mean(-q1_pi + alpha * logp_pi)

    # Training ops
    v_trainop = tf.train.AdamOptimizer(v_lr).minimize(
        v_loss,
        var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="main/v"))
    q_trainop = tf.train.AdamOptimizer(q_lr).minimize(
        q_loss,
        var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="main/q"))
    pi_trainop = tf.train.AdamOptimizer(pi_lr).minimize(
        pi_loss,
        var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="main/pi"))

    assert polyak <= .5

    # Target update ops
    init_v_target = tf.group([
        tf.assign(v_target, v_main)
        for v_main, v_target in zip(
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="main/v"),
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="target/v"))
    ])
    update_v_target = tf.group([
        tf.assign(v_target, (1 - polyak) * v_target + polyak * v_main)
        for v_main, v_target in zip(
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="main/v"),
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="target/v"))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(init_v_target)

    # Set up model saving
    logger.setup_tf_saver(sess,
                          inputs={'x': x_ph, 'a': a_ph},
                          outputs={'pi': pi, 'q1': q1, 'q2': q2, 'v': v})

    def test_agent(n=10):
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                o, r, d, _ = test_env.step(
                    sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0])
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    # Buffer init
    buffer = ReplayBuffer(obs_dim, act_dim, replay_size)

    # Main loop
    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    for t in range(total_steps):
        if t > start_steps:
            a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0]
        else:
            a = env.action_space.sample()

        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the done signal when it only comes from hitting the time horizon
        d = False if ep_len == max_ep_len else d

        o2 = np.squeeze(o2)
        buffer.store(o, a, r, o2, d)
        o = o2

        if d or (ep_len == max_ep_len):
            for j in range(ep_len):
                batch = buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done'],
                }

                # Value gradient step
                v_step_ops = [v_loss, v, v_trainop]
                outs = sess.run(v_step_ops, feed_dict)
                logger.store(LossV=outs[0], VVals=outs[1])

                # Q gradient step
                q_step_ops = [q_loss, q1, q2, q_trainop]
                outs = sess.run(q_step_ops, feed_dict)
                logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2])

                # Policy gradient step
                # TODO: add entropy logging
                pi_step_ops = [pi_loss, pi_trainop, update_v_target]
                outs = sess.run(pi_step_ops, feed_dict=feed_dict)
                logger.store(LossPi=outs[0])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0., 0

        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Save the model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            test_agent()

            # Log info about the epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
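# ---------------------------------------------------------------------------
# Minimal usage sketch for the SAC function above, assuming a Gym environment
# with a continuous (Box) action space; Pendulum-v0 is only an illustrative
# choice, and any small continuous-control task would work the same way.
# ---------------------------------------------------------------------------
if __name__ == '__main__':
    import gym

    sac(lambda: gym.make('Pendulum-v0'), seed=0, epochs=50)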