def train_model(context, data, training_batch):
    # Create the synthetic-data helper
    context.synthetic_data = SyntheticData(context=context, data=data,
                                           window=10000, frequency=30)

    # Build the model configuration together with the neural networks
    create_model(context)

    # Set up the operation summaries
    summary_ops, summary_vars = build_summaries()
    writer = tf.summary.FileWriter(
        "/home/enzo/PycharmProjects/DDPGPorfolioOptimization/summaries",
        context.sess.graph)

    if os.path.exists(context.model_path):
        context.saver.restore(context.sess, context.model_path)

    # Initialize the replay buffer
    replay_buffer = ReplayBuffer(context.buffer_size)

    for episode in range(context.max_episodes):
        data, close_prices = context.synthetic_data.get_trayectory(
            t_intervals=context.max_ep_steps + context.n)

        # Reset the portfolio values at the start of each episode
        context.portfolio_value_memory = [context.init_train_portfolio]
        context.train_invested_quantity = 0.0
        context.assets_quantity_invested = [0.0] * len(context.assets)
        context.init_portfolio_w = [0.0] * (len(context.assets) + 1)
        context.portfolio_w_memory = [context.init_portfolio_w]
        context.train_cash = context.init_train_portfolio
        context.last_train_operation = 2
        context.open_trade = False

        ep_reward = 0
        ep_ave_max_q = 0
        ep_loss = 0

        # One is subtracted to account for fetching the next state
        for i in range(context.max_ep_steps - 1):
            # Get the state
            s = data[:, i:i + context.n, :]

            # Perturb the action to balance the
            # exploration/exploitation trade-off
            random = np.random.rand()
            if random > context.epsilon:
                if s.shape == (len(context.assets), context.n,
                               len(context.features)):
                    a = context.actor.predict([s])[0]
                else:
                    print("Episode:", episode, "Step:", i,
                          "The current state has the wrong shape")
                    continue
            else:
                rand_array = np.random.rand(len(context.assets) + 1)
                a = np.exp(rand_array) / np.sum(np.exp(rand_array))
            context.epsilon = context.epsilon * context.epsilon_decay

            # Next state
            s2 = data[:, i + 1:i + 1 + context.n, :]
            if not s2.shape == (len(context.assets), context.n,
                                len(context.features)):
                print("Episode:", episode, "Step:", i,
                      "The next state has the wrong shape")
                continue

            # Reward
            this_closes = close_prices[:, i + context.n]
            previous_closes = close_prices[:, i + context.n - 1]
            r = get_reward(context, this_closes, previous_closes, a)

            # Terminal point
            t = i == (context.max_ep_steps - context.n - 2)

            replay_buffer.add(s, a, r, t, s2)

            if replay_buffer.size() > context.minibatch_size:
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(context.minibatch_size)

                # Compute the targets
                target_q = context.critic.predict_target(
                    s2_batch, context.actor.predict_target(s2_batch))
                y_i = []
                for k in range(context.minibatch_size):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + context.gamma * target_q[k])

                # Update the critic given the targets
                predicted_q_value_batch = np.reshape(
                    y_i, (context.minibatch_size, 1))
                predicted_q_value, losses, _ = context.critic.train(
                    s_batch, a_batch, predicted_q_value_batch)
                ep_loss += np.mean(losses)
                ep_ave_max_q += np.amax(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = context.actor.predict(s_batch)
                grads = context.critic.action_gradients(s_batch, a_outs)
                context.actor.train(s_batch, grads[0])

                # Update the target networks
                context.actor.update_target_network()
                context.critic.update_target_network()

            ep_reward += r

            if i == (context.max_ep_steps - 2):
                summary_str = context.sess.run(summary_ops, feed_dict={
                    summary_vars[0]: ep_reward,
                    summary_vars[1]: ep_ave_max_q / float(i),
                    summary_vars[2]: ep_loss / float(i)
                })
                writer.add_summary(summary_str, episode)
                writer.flush()
                print('| Reward: {:.5f} | Episode: {:d} | Qmax: {:.4f} | '
                      'Portfolio value: {:.4f} | Epsilon: {:.5f}'
                      .format(ep_reward, episode, ep_ave_max_q / float(i),
                              context.portfolio_value_memory[-1],
                              context.epsilon))
                _ = context.saver.save(context.sess, context.model_path)
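
# NOTE: ReplayBuffer is imported elsewhere and not shown in this listing. A
# minimal sketch of the interface train_model relies on (add, size,
# sample_batch returning five stacked arrays) might look like the following;
# the class name and internals here are assumptions, not the original
# implementation.
import random
from collections import deque

import numpy as np


class ReplayBufferSketch:
    """Fixed-size FIFO experience buffer with uniform sampling."""

    def __init__(self, buffer_size):
        self.buffer = deque(maxlen=buffer_size)

    def add(self, s, a, r, t, s2):
        # Oldest transitions are evicted automatically by the deque
        self.buffer.append((s, a, r, t, s2))

    def size(self):
        return len(self.buffer)

    def sample_batch(self, batch_size):
        batch = random.sample(self.buffer, batch_size)
        s, a, r, t, s2 = map(np.array, zip(*batch))
        return s, a, r, t, s2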
def sac(env_fn,
        seed=0,
        gamma=.99,
        lam=.97,
        hidden_sizes=(200, 100),
        alpha=.0,
        v_lr=1e-3,
        q_lr=1e-3,
        pi_lr=1e-3,
        polyak=1e-2,
        epochs=50,
        steps_per_epoch=1000,
        batch_size=100,
        start_steps=1000,
        logger_kwargs=dict(),
        replay_size=int(1e6),
        max_ep_len=1000,
        save_freq=1):

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()

    # Dimensions
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.n

    # Placeholders
    x_ph = tf.placeholder(shape=[None, obs_dim], dtype=tf.float32)
    a_ph = tf.placeholder(shape=[None, 1], dtype=tf.float32)
    x2_ph = tf.placeholder(shape=[None, obs_dim], dtype=tf.float32)
    r_ph = tf.placeholder(shape=[None], dtype=tf.float32)
    d_ph = tf.placeholder(shape=[None], dtype=tf.float32)

    # Networks
    def mlp(x, hidden_sizes=(32,), activation=tf.tanh,
            output_activation=None):
        for h in hidden_sizes[:-1]:
            x = tf.layers.dense(x, units=h, activation=activation)
        return tf.layers.dense(x, units=hidden_sizes[-1],
                               activation=output_activation)

    def mlp_categorical_policy(x, a, hidden_sizes, activation,
                               output_activation, action_space):
        act_dim = action_space.n
        logits = mlp(x, list(hidden_sizes) + [act_dim], activation, None)
        pi_all = tf.nn.softmax(logits)
        logpi_all = tf.nn.log_softmax(logits)
        # Sampled action, shape (batch, 1)
        pi = tf.random.categorical(logits, 1)
        # Log-probability of the sampled action under the current policy
        logp_pi = tf.reduce_sum(
            tf.one_hot(tf.squeeze(pi, axis=1), depth=act_dim) * logpi_all,
            axis=1)
        return pi, pi_all, logpi_all, logp_pi

    with tf.variable_scope("main"):
        activation = tf.tanh

        with tf.variable_scope("pi"):
            pi, pi_all, logpi_all, logp_pi = mlp_categorical_policy(
                x_ph, a_ph, hidden_sizes, activation, None, env.action_space)

        # The critics take (state, action) and output a single scalar value,
        # matching the continuous-action variant of this implementation
        with tf.variable_scope("q1"):
            q1 = tf.squeeze(mlp(tf.concat([x_ph, a_ph], -1),
                                hidden_sizes + (1,), activation, None),
                            axis=-1)
        with tf.variable_scope("q1", reuse=True):
            q1_pi = tf.squeeze(mlp(
                tf.concat([x_ph, tf.cast(pi, tf.float32)], axis=-1),
                hidden_sizes + (1,), activation, None), axis=-1)

        with tf.variable_scope("q2"):
            q2 = tf.squeeze(mlp(tf.concat([x_ph, a_ph], -1),
                                hidden_sizes + (1,), activation, None),
                            axis=-1)
        with tf.variable_scope("q2", reuse=True):
            q2_pi = tf.squeeze(mlp(
                tf.concat([x_ph, tf.cast(pi, tf.float32)], -1),
                hidden_sizes + (1,), activation, None), axis=-1)

        with tf.variable_scope("v"):
            v = tf.squeeze(mlp(x_ph, hidden_sizes + (1,), activation, None),
                           axis=-1)

    with tf.variable_scope("target"):
        with tf.variable_scope("v"):
            v_targ = tf.squeeze(mlp(x2_ph, hidden_sizes + (1,), activation,
                                    None), axis=-1)

    # helpers for var count
    def get_vars(scope=''):
        return [x for x in tf.trainable_variables() if scope in x.name]

    def count_vars(scope=''):
        v = get_vars(scope)
        return sum([np.prod(var.shape.as_list()) for var in v])

    # Count variables
    var_counts = tuple(count_vars(scope) for scope in
                       ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main'])
    print('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, '
          '\t v: %d, \t total: %d\n' % var_counts)

    # Targets
    q_backup_prestop = r_ph + gamma * (1 - d_ph) * v_targ
    v_backup_prestop = tf.minimum(q1_pi, q2_pi) - alpha * logp_pi
    q_backup, v_backup = tf.stop_gradient(q_backup_prestop), \
        tf.stop_gradient(v_backup_prestop)

    # Q loss
    q1_loss = tf.reduce_mean((q1 - q_backup)**2)
    q2_loss = tf.reduce_mean((q2 - q_backup)**2)
    q_loss = q1_loss + q2_loss

    # V loss
    v_loss = tf.reduce_mean((v - v_backup)**2)

    # Policy loss
    pi_loss = tf.reduce_mean(-q1_pi + alpha * logp_pi)

    # Training ops
    v_trainop = tf.train.AdamOptimizer(v_lr).minimize(
        v_loss,
        var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                   scope="main/v"))
    q_trainop = tf.train.AdamOptimizer(q_lr).minimize(
        q_loss,
        var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                   scope="main/q"))
    pi_trainop = tf.train.AdamOptimizer(pi_lr).minimize(
        pi_loss,
        var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                   scope="main/pi"))

    assert polyak <= .5

    # Target update ops
    init_v_target = tf.group([
        tf.assign(v_target, v_main)
        for v_main, v_target in zip(
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="main/v"),
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                              scope="target/v"))
    ])
    update_v_target = tf.group([
        tf.assign(v_target, (1 - polyak) * v_target + polyak * v_main)
        for v_main, v_target in zip(
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="main/v"),
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                              scope="target/v"))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(init_v_target)

    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={'x': x_ph, 'a': a_ph},
                          outputs={'pi': pi, 'q1': q1, 'q2': q2, 'v': v})

    def test_agent(n=10):
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                o, r, d, _ = test_env.step(
                    sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0][0])
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    # Buffer init
    buffer = ReplayBuffer(obs_dim, 1, replay_size)

    # Main loop
    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    for t in range(total_steps):
        if t > start_steps:
            a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0][0]
        else:
            a = env.action_space.sample()

        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the done signal when it comes from hitting the time horizon
        d = False if ep_len == max_ep_len else d

        o2 = np.squeeze(o2)
        buffer.store(o, a, r, o2, d)
        o = o2

        if d or (ep_len == max_ep_len):
            for j in range(ep_len):
                batch = buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done']
                }

                # Value gradient step
                v_step_ops = [v_loss, v, v_trainop]
                outs = sess.run(v_step_ops, feed_dict)
                logger.store(LossV=outs[0], VVals=outs[1])

                # Q gradient step
                q_step_ops = [q_loss, q1, q2, q_trainop]
                outs = sess.run(q_step_ops, feed_dict)
                logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2])

                # Policy gradient step
                # TODO Add entropy logging
                pi_step_ops = [pi_loss, pi_trainop, update_v_target]
                outs = sess.run(pi_step_ops, feed_dict=feed_dict)
                logger.store(LossPi=outs[0])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0., 0

        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Saving the model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
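
# A minimal, hypothetical launcher for the discrete-action sac() above;
# CartPole-v1 is assumed purely for illustration, and EpochLogger/ReplayBuffer
# must be importable in this module for the call to work.
if __name__ == '__main__':
    import gym
    sac(lambda: gym.make('CartPole-v1'), seed=0, epochs=10)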
def ddpg(env_fn,
         actor_critic=a2c,
         ac_kwargs=dict(),
         seed=0,
         steps_per_epoch=5000,
         epochs=100,
         replay_size=int(1e6),
         gamma=.99,
         polyak=.995,
         pi_lr=1e-3,
         q_lr=1e-3,
         batch_size=100,
         start_steps=10000,
         act_noise=.1,
         max_ep_len=1000,
         logger_kwargs=dict(),
         save_freq=1):

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    act_limit = env.action_space.high[0]

    # Share action space info with the actor-critic builder
    ac_kwargs['action_space'] = env.action_space

    x_ph, a_ph, x2_ph, r_ph, d_ph = \
        tf.placeholder(name='x_ph', shape=[None, obs_dim], dtype=tf.float32), \
        tf.placeholder(name='a_ph', shape=[None, act_dim], dtype=tf.float32), \
        tf.placeholder(name='x2_ph', shape=[None, obs_dim], dtype=tf.float32), \
        tf.placeholder(name='r_ph', shape=[None], dtype=tf.float32), \
        tf.placeholder(name='d_ph', shape=[None], dtype=tf.float32)

    # Main networks
    with tf.variable_scope('main'):
        pi, q, q_pi = actor_critic(x_ph, a_ph, **ac_kwargs)

    # Target networks
    with tf.variable_scope('target'):
        pi_targ, _, q_pi_targ = actor_critic(x2_ph, a_ph, **ac_kwargs)

    replaybuffer = ReplayBuffer(obs_dim, act_dim, replay_size)

    # helpers for var count
    def get_vars(scope=''):
        return [x for x in tf.trainable_variables() if scope in x.name]

    def count_vars(scope=''):
        v = get_vars(scope)
        return sum([np.prod(var.shape.as_list()) for var in v])

    var_counts = tuple(count_vars(scope)
                       for scope in ['main/pi', 'main/q', 'main'])
    print('\nNumber of parameters: \t pi: %d, \t q: %d, \t total: %d\n'
          % var_counts)

    # Bellman backup for the Q function
    backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * q_pi_targ)

    # Losses
    pi_loss = -tf.reduce_mean(q_pi)
    q_loss = tf.reduce_mean((q - backup)**2)

    # Optimizers and train ops; the optimizer takes the learning rate q_lr,
    # not the loss tensor
    train_pi_op = tf.train.AdamOptimizer(pi_lr).minimize(
        pi_loss, var_list=get_vars('main/pi'))
    train_q_op = tf.train.AdamOptimizer(q_lr).minimize(
        q_loss, var_list=get_vars('main/q'))

    # Update target networks
    target_update = tf.group([
        tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    # Init targets
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={'x': x_ph, 'a': a_ph},
                          outputs={'pi': pi, 'q': q})

    def get_actions(o, noise_scale):
        a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0]
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def test_agent(n=10):
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                # Take deterministic actions at test time (noise_scale=0)
                o, r, d, _ = test_env.step(get_actions(o, 0))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop:
    for t in range(total_steps):
        if t > start_steps:
            a = get_actions(o, act_noise)
        else:
            a = env.action_space.sample()

        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the done signal when it comes from hitting the time horizon
        d = False if ep_len == max_ep_len else d

        # Storing experience
        replaybuffer.store(o, a, r, o2, d)
        o = o2

        if d or (ep_len == max_ep_len):
            for _ in range(ep_len):
                batch = replaybuffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done']
                }

                # Q-learning update
                outs = sess.run([q_loss, q, train_q_op], feed_dict)
                logger.store(LossQ=outs[0], QVals=outs[1])

                # Policy update
                outs = sess.run([pi_loss, train_pi_op, target_update],
                                feed_dict)
                logger.store(LossPi=outs[0])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('QVals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
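
# ddpg() above delegates network construction to an actor_critic callable
# (a2c by default, imported elsewhere) that must return (pi, q, q_pi) for the
# given placeholders; td3() below expects a four-tuple variant
# (pi, q1, q2, q1_pi). A plausible sketch of the three-output builder,
# with layer sizes and activations assumed rather than taken from the
# original a2c:
def a2c_sketch(x, a, hidden_sizes=(400, 300), activation=tf.nn.relu,
               action_space=None):
    def mlp(h, sizes, act, out_act):
        for s in sizes[:-1]:
            h = tf.layers.dense(h, units=s, activation=act)
        return tf.layers.dense(h, units=sizes[-1], activation=out_act)

    act_dim = a.shape.as_list()[-1]
    act_limit = action_space.high[0]
    with tf.variable_scope('pi'):
        # Deterministic policy squashed to the action bounds
        pi = act_limit * mlp(x, list(hidden_sizes) + [act_dim],
                             activation, tf.tanh)
    with tf.variable_scope('q'):
        q = tf.squeeze(mlp(tf.concat([x, a], axis=-1),
                           list(hidden_sizes) + [1], activation, None),
                       axis=1)
    with tf.variable_scope('q', reuse=True):
        # Q evaluated at the policy's own action, sharing the critic weights
        q_pi = tf.squeeze(mlp(tf.concat([x, pi], axis=-1),
                              list(hidden_sizes) + [1], activation, None),
                          axis=1)
    return pi, q, q_pi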
class DDPG:
    def __init__(self, args):
        """Initialize the agent.

        Args:
            args: configuration object carrying the hyperparameters below.
        """
        self.state_size = args.state_size
        self.action_size = args.action_size
        self.bs = args.bs
        self.gamma = args.gamma
        self.epsilon = args.epsilon
        self.tau = args.tau
        self.discrete = args.discrete

        self.randomer = OUNoise(args.action_size)
        self.buffer = ReplayBuffer(args.max_buff)

        self.actor = Actor(self.state_size, self.action_size)
        self.actor_target = Actor(self.state_size, self.action_size)
        self.actor_opt = AdamW(self.actor.parameters(), args.lr_actor)

        self.critic = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)
        self.critic_opt = AdamW(self.critic.parameters(), args.lr_critic)

        hard_update(self.actor_target, self.actor)
        hard_update(self.critic_target, self.critic)

    def reset(self):
        """Reset the exploration noise."""
        self.randomer.reset()

    def get_action(self, state):
        """Return a noisy action for the given state.

        Args:
            state: list, shape == [state_size]
        """
        state = torch.tensor(state, dtype=torch.float).unsqueeze(0)
        action = self.actor(state).detach()
        action = action.squeeze(0).numpy()
        action += self.epsilon * self.randomer.noise()
        action = np.clip(action, -1.0, 1.0)
        return action

    def learning(self):
        """Run one gradient update for the actor and the critic."""
        s1, a1, r1, t1, s2 = self.buffer.sample_batch(self.bs)

        # Turn terminal flags into continuation masks: 1 if not done, else 0
        t1 = 1 - t1

        s1 = torch.tensor(s1, dtype=torch.float)
        a1 = torch.tensor(a1, dtype=torch.float)
        r1 = torch.tensor(r1, dtype=torch.float)
        t1 = torch.tensor(t1, dtype=torch.float)
        s2 = torch.tensor(s2, dtype=torch.float)

        a2 = self.actor_target(s2).detach()
        q2 = self.critic_target(s2, a2).detach()
        q2_plus_r = r1[:, None] + t1[:, None] * self.gamma * q2

        q1 = self.critic.forward(s1, a1)

        # critic gradient
        critic_loss = nn.MSELoss()
        loss_critic = critic_loss(q1, q2_plus_r)
        self.critic_opt.zero_grad()
        loss_critic.backward()
        self.critic_opt.step()

        # actor gradient
        pred_a = self.actor.forward(s1)
        loss_actor = (-self.critic.forward(s1, pred_a)).mean()
        self.actor_opt.zero_grad()
        loss_actor.backward()
        self.actor_opt.step()

        # Only the actor and critic receive gradient updates; the target
        # networks track them through Polyak averaging.
        soft_update(self.actor_target, self.actor, self.tau)
        soft_update(self.critic_target, self.critic, self.tau)

        return loss_actor.item(), loss_critic.item()
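
# hard_update and soft_update are imported elsewhere; minimal sketches of the
# behaviour the DDPG class above depends on (the names are taken from the
# calls, the bodies are assumptions):
def hard_update(target, source):
    """Copy source parameters into target verbatim."""
    for t_p, s_p in zip(target.parameters(), source.parameters()):
        t_p.data.copy_(s_p.data)


def soft_update(target, source, tau):
    """Polyak-average source into target: theta' <- tau*theta + (1-tau)*theta'."""
    for t_p, s_p in zip(target.parameters(), source.parameters()):
        t_p.data.copy_(tau * s_p.data + (1.0 - tau) * t_p.data)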
def train(env_fn,
          env_name,
          ac_kwargs=dict(),
          seed=0,
          steps_per_epoch=1000,
          epochs=3000,
          replay_size=int(1e6),
          gamma=0.99,
          polyak=0.995,
          lr=3e-4,
          batch_size=64,
          start_steps=10000,
          update_after=10000,
          update_every=1,
          num_test_episodes=10,
          value_coef=0.5,
          entropy_coef=0.02,
          max_ep_len=1000,
          logger_kwargs=dict(),
          save_freq=10,
          device=torch.device('cpu')):

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    torch.manual_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    env.seed(seed)
    test_env.seed(seed)

    obs_dim = env.observation_space.shape
    act_dim = env.action_space.shape[0]

    # Action limit for clamping: critically, assumes all dimensions share
    # the same bound!
    act_limit = env.action_space.high[0]

    actor_critic = MLPActorCritic(env.observation_space, env.action_space,
                                  **ac_kwargs).to(device)
    sql = SQL(actor_critic, lr, batch_size, update_every, gamma, polyak,
              value_coef, entropy_coef)

    # Experience buffer
    replay_buffer = ReplayBuffer(obs_dim=obs_dim, act_dim=act_dim,
                                 size=replay_size, device=device)

    rewards_log = []
    episode_rewards = deque(maxlen=10)

    # Set up model saving
    logger.setup_pytorch_saver(sql.actor_critic)

    def test_agent():
        for j in range(num_test_episodes):
            o, d, ep_ret, ep_len = test_env.reset(), False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                action = sql.actor_critic.act(
                    torch.as_tensor(o, dtype=torch.float32).to(device))
                o, r, d, _ = test_env.step(action)
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)
            episode_rewards.append(ep_ret)

    # Prepare for interaction with environment
    total_steps = steps_per_epoch * epochs
    start_time = time.time()
    o, ep_ret, ep_len = env.reset(), 0, 0

    # Main loop: collect experience in env and update/log each epoch
    for t in range(total_steps):

        # Until start_steps have elapsed, randomly sample actions from a
        # uniform distribution for better exploration. Afterwards, use the
        # learned policy.
        if t > start_steps:
            a = sql.actor_critic.act(
                torch.as_tensor(o, dtype=torch.float32).to(device))
        else:
            a = env.action_space.sample()

        # Step the env
        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the "done" signal if it comes from hitting the time
        # horizon (that is, when it's an artificial terminal signal
        # that isn't based on the agent's state)
        d = False if ep_len == max_ep_len else d

        # Store experience to replay buffer
        replay_buffer.store(o, a, r, o2, d)

        # Super critical, easy to overlook step: make sure to update
        # most recent observation!
        o = o2

        # End of trajectory handling
        if d or (ep_len == max_ep_len):
            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, ep_ret, ep_len = env.reset(), 0, 0

        # Update handling
        if t >= update_after:
            for _ in range(update_every):
                batch = replay_buffer.sample_batch(batch_size)
                loss = sql.update(data=batch)
                logger.store(Loss=loss)
        else:
            logger.store(Loss=0.)

        # End of epoch handling
        if (t + 1) % steps_per_epoch == 0:
            epoch = (t + 1) // steps_per_epoch

            # Save model
            if save_freq != 0 and ((epoch % save_freq == 0)
                                   or (epoch == epochs)):
                logger.save_state({'env': env}, None)

            # Test the performance of the deterministic version of the agent.
            test_agent()
            rewards_log.append(np.mean(episode_rewards))

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Time', time.time() - start_time)
            logger.log_tabular('Loss', average_only=True)
            logger.dump_tabular()

    rewards_log = np.array(rewards_log)
    save_path = '../../log/modified_sql/' + env_name + '/' + str(seed) + '.npy'
    np.save(save_path, rewards_log)
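
# Hypothetical entry point for train() above; the environment name is purely
# illustrative, and the directory under ../../log/modified_sql/ must already
# exist for the final np.save to succeed.
if __name__ == '__main__':
    import gym
    train(lambda: gym.make('HalfCheetah-v2'), 'HalfCheetah-v2', seed=0)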
def train(self):
    replay_buffer = ReplayBuffer(self.state_dim, self.act_dim,
                                 self.replay_size)
    total_steps = self.steps_per_epoch * self.epochs

    state = self.env.reset()
    state = state.astype(np.float32)
    ep_len, ep_rew, ep_count = 0, 0, 0
    all_ep_rew = []

    for t in range(total_steps):
        # Randomly sample actions until start_steps have elapsed
        if t > self.start_steps:
            act = self.sample_action(state)
            act = act.numpy()
        else:
            act = self.env.action_space.sample()

        state_, r, d, _ = self.env.step(act)
        state_ = state_.astype(np.float32)

        # Ignore the done signal when it comes from hitting the time horizon
        d = False if ep_len == self.max_ep_len else d
        ep_len += 1
        ep_rew += r

        # Store transitions
        replay_buffer.store(state, act, r, state_, d)
        state = state_

        # End of trajectory
        if d or (ep_len == self.max_ep_len):
            state = self.env.reset()
            state = state.astype(np.float32)
            if len(all_ep_rew) < 5:
                all_ep_rew.append(ep_rew)
            else:
                # Smooth the curve with a trailing five-episode mean
                all_ep_rew.append(ep_rew)
                all_ep_rew[-1] = np.mean(all_ep_rew[-5:])
            epoch = (t + 1) // self.steps_per_epoch
            print("Training | Epoch:{} | Episode:{} | Steps: {}/{} | "
                  "Episode Reward: {:.4f}".format(
                      epoch, ep_count, t, total_steps, ep_rew))
            ep_len, ep_rew = 0, 0
            ep_count += 1

        # Update
        if t > self.update_after and t % self.update_every == 0:
            for j in range(self.update_every):
                batch = replay_buffer.sample_batch(self.batch_size)
                self.update_critic(batch)
                # Delayed policy and target updates (TD3)
                if j % self.policy_delay == 0:
                    self.update_actor(batch)
                    self.update_targets()

        # End of epoch
        if (t + 1) % self.steps_per_epoch == 0:
            epoch = (t + 1) // self.steps_per_epoch
            # Save model
            if (epoch % self.save_freq == 0) or (epoch == self.epochs):
                self.actor.save_weights(
                    self.save_path + 'actor_checkpoint' + str(epoch))
                self.critic_net1.save_weights(
                    self.save_path + 'critic_net1_checkpoint' + str(epoch))
                self.critic_net2.save_weights(
                    self.save_path + 'critic_net2_checkpoint' + str(epoch))

    plt.figure()
    plt.plot(all_ep_rew)
    plt.xlabel('episodes')
    plt.ylabel('total reward per episode')
    plt.show()
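
# update_targets() is defined elsewhere on this class; a sketch of the usual
# TD3 Polyak update it presumably performs. The target-network attribute
# names and the self.polyak coefficient are assumptions based on the
# Keras-style save_weights calls above.
def update_targets(self):
    pairs = [(self.target_actor, self.actor),
             (self.target_critic_net1, self.critic_net1),
             (self.target_critic_net2, self.critic_net2)]
    for target, source in pairs:
        for t_w, w in zip(target.weights, source.weights):
            # theta_targ <- polyak * theta_targ + (1 - polyak) * theta
            t_w.assign(self.polyak * t_w + (1.0 - self.polyak) * w)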
def td3(env_fn,
        actor_critic=a2c,
        ac_kwargs=dict(),
        seed=0,
        steps_per_epoch=5000,
        epochs=100,
        replay_size=int(1e6),
        gamma=.99,
        polyak=.995,
        pi_lr=1e-3,
        q_lr=1e-3,
        batch_size=100,
        start_steps=10000,
        act_noise=.1,
        target_noise=.2,
        noise_clip=.5,
        policy_delay=2,
        max_ep_len=1000,
        logger_kwargs=dict(),
        save_freq=1):

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Action limit for clamping
    act_limit = env.action_space.high[0]

    # Share action space info with the actor-critic builder
    ac_kwargs['action_space'] = env.action_space

    x_ph, a_ph, x2_ph, r_ph, d_ph = \
        tf.placeholder(name='x_ph', shape=(None, obs_dim), dtype=tf.float32), \
        tf.placeholder(name='a_ph', shape=(None, act_dim), dtype=tf.float32), \
        tf.placeholder(name='x2_ph', shape=(None, obs_dim), dtype=tf.float32), \
        tf.placeholder(name='r_ph', shape=(None), dtype=tf.float32), \
        tf.placeholder(name='d_ph', shape=(None), dtype=tf.float32)

    # Actor policy and values
    with tf.variable_scope('main'):
        pi, q1, q2, q1_pi = actor_critic(x_ph, a_ph, **ac_kwargs)

    # This seems a bit memory inefficient: what happens to the Q values
    # created along with the target policy, or the policy created along
    # with the target Q networks? They are never referenced, but are still
    # declared, at the cost of GPU memory.

    # Target policy
    with tf.variable_scope('target'):
        pi_targ, _, _, _ = actor_critic(x2_ph, a_ph, **ac_kwargs)

    # Target Q networks
    with tf.variable_scope('target', reuse=True):
        # Target policy smoothing: add clipped noise to the target action
        epsilon = tf.random_normal(tf.shape(pi_targ), stddev=target_noise)
        epsilon = tf.clip_by_value(epsilon, -noise_clip, noise_clip)
        a2 = pi_targ + epsilon
        a2 = tf.clip_by_value(a2, -act_limit, act_limit)

        # Target Q-values using actions from the target policy
        _, q1_targ, q2_targ, _ = actor_critic(x2_ph, a2, **ac_kwargs)

    replaybuffer = ReplayBuffer(obs_dim, act_dim, size=replay_size)

    # helpers for var count
    def get_vars(scope=''):
        return [x for x in tf.trainable_variables() if scope in x.name]

    def count_vars(scope=''):
        v = get_vars(scope)
        return sum([np.prod(var.shape.as_list()) for var in v])

    # Count variables
    var_counts = tuple(count_vars(scope) for scope in
                       ['main/pi', 'main/q1', 'main/q2', 'main'])
    print('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, '
          '\t total: %d\n' % var_counts)

    # Clipped double Q-learning with Bellman backup
    min_q_targ = tf.minimum(q1_targ, q2_targ)
    backup = tf.stop_gradient(r_ph + gamma * (1 - d_ph) * min_q_targ)

    # TD3 losses
    pi_loss = -tf.reduce_mean(q1_pi)
    q1_loss = tf.reduce_mean((q1 - backup)**2)
    q2_loss = tf.reduce_mean((q2 - backup)**2)
    q_loss = q1_loss + q2_loss

    # Training ops; restrict each optimizer to its own variables so the
    # policy update does not touch the critics and vice versa
    pi_train = tf.train.AdamOptimizer(pi_lr).minimize(
        pi_loss, var_list=get_vars('main/pi'))
    q_train = tf.train.AdamOptimizer(q_lr).minimize(
        q_loss, var_list=get_vars('main/q'))

    # Polyak-averaged target update
    target_update = tf.group([
        tf.assign(v_targ, polyak * v_targ + (1 - polyak) * v_main)
        for v_main, v_targ in zip(get_vars('main'), get_vars('target'))
    ])
    target_init = tf.group([
        tf.assign(v_targ, v_main)
        for v_targ, v_main in zip(get_vars('target'), get_vars('main'))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(target_init)

    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={'x': x_ph, 'a': a_ph},
                          outputs={'pi': pi, 'q1': q1, 'q2': q2})

    def get_action(o, noise_scale):
        a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0]
        a += noise_scale * np.random.randn(act_dim)
        return np.clip(a, -act_limit, act_limit)

    def test_agent(n=10):
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                o, r, d, _ = test_env.step(get_action(o, 0))
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    # Main loop
    for t in range(total_steps):
        if t > start_steps:
            a = get_action(o, act_noise)
        else:
            a = env.action_space.sample()

        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the done signal when it comes from hitting the time horizon
        d = False if ep_len == max_ep_len else d

        o2 = np.squeeze(o2)
        replaybuffer.store(o, a, r, o2, d)
        o = o2

        if d or (ep_len == max_ep_len):
            for j in range(ep_len):
                batch = replaybuffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs1'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done']
                }

                q_step_ops = [q_loss, q1, q2, q_train]
                outs = sess.run(q_step_ops, feed_dict)
                logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2])

                # Delayed policy and target updates
                if j % policy_delay == 0:
                    outs = sess.run([pi_loss, pi_train, target_update],
                                    feed_dict)
                    logger.store(LossPi=outs[0])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0

        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Saving the model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
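
# Illustrative launcher for td3() above; the environment choice and epoch
# count are assumptions, and a2c must be importable as the default
# actor_critic builder.
if __name__ == '__main__':
    import gym
    td3(lambda: gym.make('Pendulum-v0'), epochs=10)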
class DQNAgent:
    def __init__(self,
                 dimS,
                 nA,
                 gamma=0.99,
                 hidden1=64,
                 hidden2=64,
                 lr=1e-3,
                 tau=1e-3,
                 buffer_size=100000,
                 batch_size=64,
                 render=False):

        args = locals()
        print('agent spec')
        print('-' * 80)
        print(args)
        print('-' * 80)

        self.dimS = dimS
        self.nA = nA

        # set networks
        self.Q = Critic(dimS, nA, hidden_size1=hidden1, hidden_size2=hidden2)
        self.target_Q = copy.deepcopy(self.Q)

        # freeze the target network
        for p in self.target_Q.parameters():
            p.requires_grad_(False)

        self.optimizer = Adam(self.Q.parameters(), lr=lr)

        self.gamma = gamma
        self.tau = tau
        self.buffer = ReplayBuffer(dimS, buffer_size)
        self.batch_size = batch_size
        self.render = render

    def hard_target_update(self):
        # hard target update; not used in this implementation
        self.target_Q.load_state_dict(self.Q.state_dict())

    def target_update(self):
        # soft target update; when tau = 1 this is equivalent to a hard update
        for p, target_p in zip(self.Q.parameters(),
                               self.target_Q.parameters()):
            target_p.data.copy_(self.tau * p.data
                                + (1.0 - self.tau) * target_p.data)

    def get_action(self, state, eps):
        self.Q.eval()
        s = torch.tensor(state, dtype=torch.float).view(1, self.dimS)
        q = self.Q(s)

        # simple epsilon-greedy action selection
        if np.random.rand() < eps:
            a = np.random.randint(self.nA)
        else:
            # greedy selection
            a = np.argmax(q.cpu().data.numpy())
        return a

    def train(self):
        self.Q.train()
        gamma = self.gamma
        batch = self.buffer.sample_batch(self.batch_size)

        # unroll batch and build the TD target without tracking gradients
        with torch.no_grad():
            observations = torch.tensor(batch['state'], dtype=torch.float)
            actions = torch.tensor(batch['action'], dtype=torch.long)
            rewards = torch.tensor(batch['reward'], dtype=torch.float)
            next_observations = torch.tensor(batch['next_state'],
                                             dtype=torch.float)
            terminals = torch.tensor(batch['done'], dtype=torch.float)
            mask = 1.0 - terminals
            next_q = torch.unsqueeze(
                self.target_Q(next_observations).max(1)[0], 1)
            target = rewards + gamma * mask * next_q

        out = self.Q(observations).gather(1, actions)

        loss_ftn = MSELoss()
        loss = loss_ftn(out, target)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        self.target_update()

    def save_model(self, path):
        checkpoint_path = path + 'model.pth.tar'
        torch.save(
            {
                'critic': self.Q.state_dict(),
                'target_critic': self.target_Q.state_dict(),
                'optimizer': self.optimizer.state_dict()
            }, checkpoint_path)

    def load_model(self, path):
        checkpoint = torch.load(path)
        self.Q.load_state_dict(checkpoint['critic'])
        self.target_Q.load_state_dict(checkpoint['target_critic'])
        self.optimizer.load_state_dict(checkpoint['optimizer'])
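
# DQNAgent above defines the update rule but no interaction loop; a minimal
# driver sketch with a multiplicatively decaying epsilon. The buffer's
# append() method and size attribute are assumptions about the ReplayBuffer
# implementation, and the schedule constants are illustrative.
def run_dqn(env, agent, num_episodes=500,
            eps_start=1.0, eps_end=0.05, eps_decay=0.995):
    eps = eps_start
    for ep in range(num_episodes):
        state, done, ep_ret = env.reset(), False, 0.0
        while not done:
            action = agent.get_action(state, eps)
            next_state, reward, done, _ = env.step(action)
            agent.buffer.append(state, action, reward,
                                next_state, done)  # method name assumed
            state = next_state
            ep_ret += reward
            if agent.buffer.size > agent.batch_size:  # attribute assumed
                agent.train()
        # decay exploration after each episode
        eps = max(eps_end, eps * eps_decay)
        print('episode {:d} | return {:.1f} | eps {:.3f}'.format(
            ep, ep_ret, eps))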
class SACAgent:
    def __init__(self,
                 dimS,
                 dimA,
                 ctrl_range,
                 gamma=0.99,
                 pi_lr=1e-4,
                 q_lr=1e-3,
                 polyak=1e-3,
                 alpha=0.2,
                 hidden1=400,
                 hidden2=300,
                 buffer_size=1000000,
                 batch_size=128,
                 device='cpu',
                 render=False):

        self.dimS = dimS
        self.dimA = dimA
        self.ctrl_range = ctrl_range

        self.gamma = gamma
        self.pi_lr = pi_lr
        self.q_lr = q_lr
        self.polyak = polyak
        self.alpha = alpha
        self.batch_size = batch_size

        # network definitions
        # pi : actor network, Q : double critic network
        self.pi = SACActor(dimS, dimA, hidden1, hidden2,
                           ctrl_range).to(device)
        self.Q = DoubleCritic(dimS, dimA, hidden1, hidden2).to(device)

        # target network
        self.target_Q = copy.deepcopy(self.Q).to(device)
        freeze(self.target_Q)

        self.buffer = ReplayBuffer(dimS, dimA, limit=buffer_size)

        self.Q_optimizer = Adam(self.Q.parameters(), lr=self.q_lr)
        self.pi_optimizer = Adam(self.pi.parameters(), lr=self.pi_lr)

        self.device = device
        self.render = render

    def act(self, state, eval=False):
        state = torch.tensor(state, dtype=torch.float).to(self.device)
        with torch.no_grad():
            action, _ = self.pi(state, eval=eval, with_log_prob=False)
        action = action.cpu().detach().numpy()
        return action

    def target_update(self):
        for params, target_params in zip(self.Q.parameters(),
                                         self.target_Q.parameters()):
            target_params.data.copy_(
                self.polyak * params.data
                + (1.0 - self.polyak) * target_params.data)

    def train(self):
        device = self.device
        batch = self.buffer.sample_batch(batch_size=self.batch_size)

        # unroll batch
        obs_batch = torch.tensor(batch.obs, dtype=torch.float).to(device)
        act_batch = torch.tensor(batch.act, dtype=torch.float).to(device)
        next_obs_batch = torch.tensor(batch.next_obs,
                                      dtype=torch.float).to(device)
        rew_batch = torch.tensor(batch.rew, dtype=torch.float).to(device)
        done_batch = torch.tensor(batch.done, dtype=torch.float).to(device)

        # continuation masks, computed on the same device as the batch
        masks = 1.0 - done_batch

        with torch.no_grad():
            next_actions, log_probs = self.pi(next_obs_batch,
                                              with_log_prob=True)
            target_q1, target_q2 = self.target_Q(next_obs_batch,
                                                 next_actions)
            target_q = torch.min(target_q1, target_q2)
            # soft Bellman backup with the entropy bonus
            target = rew_batch + self.gamma * masks * (
                target_q - self.alpha * log_probs)

        out1, out2 = self.Q(obs_batch, act_batch)
        Q_loss1 = torch.mean((out1 - target)**2)
        Q_loss2 = torch.mean((out2 - target)**2)
        Q_loss = Q_loss1 + Q_loss2

        self.Q_optimizer.zero_grad()
        Q_loss.backward()
        self.Q_optimizer.step()

        actions, log_probs = self.pi(obs_batch, with_log_prob=True)

        # freeze the critics so the policy update does not alter them
        freeze(self.Q)
        q1, q2 = self.Q(obs_batch, actions)
        q = torch.min(q1, q2)
        pi_loss = torch.mean(self.alpha * log_probs - q)

        self.pi_optimizer.zero_grad()
        pi_loss.backward()
        self.pi_optimizer.step()
        unfreeze(self.Q)

        self.target_update()

    def single_eval(self, env_id, render=False):
        """Evaluate the agent on a single episode."""
        env = gym.make(env_id)
        state = env.reset()
        ep_reward = 0
        done = False
        while not done:
            if render:
                env.render()
            action = self.act(state, eval=True)
            state, reward, done, _ = env.step(action)
            ep_reward += reward
        if render:
            env.close()
        return ep_reward

    def eval(self, env_id, t, eval_num=10):
        scores = np.zeros(eval_num)
        for i in range(eval_num):
            # render only the first episode, and only when requested
            render = self.render and i == 0
            scores[i] = self.single_eval(env_id, render)
        avg = np.mean(scores)
        print('step {} : {:.4f}'.format(t, avg))
        return [t, avg]

    def save_model(self, path):
        print('adding checkpoints...')
        checkpoint_path = path + 'model.pth.tar'
        torch.save(
            {
                'actor': self.pi.state_dict(),
                'critic': self.Q.state_dict(),
                'target_critic': self.target_Q.state_dict(),
                'actor_optimizer': self.pi_optimizer.state_dict(),
                'critic_optimizer': self.Q_optimizer.state_dict()
            }, checkpoint_path)

    def load_model(self, path):
        print('networks loading...')
        checkpoint = torch.load(path)
        self.pi.load_state_dict(checkpoint['actor'])
        self.Q.load_state_dict(checkpoint['critic'])
        self.target_Q.load_state_dict(checkpoint['target_critic'])
        self.pi_optimizer.load_state_dict(checkpoint['actor_optimizer'])
        self.Q_optimizer.load_state_dict(checkpoint['critic_optimizer'])
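
# freeze/unfreeze are imported elsewhere; minimal sketches of the behaviour
# SACAgent relies on when excluding the critics from the policy update (the
# names are taken from the calls, the bodies are assumptions):
def freeze(net):
    """Stop gradients from being computed for net's parameters."""
    for p in net.parameters():
        p.requires_grad_(False)


def unfreeze(net):
    """Re-enable gradient computation for net's parameters."""
    for p in net.parameters():
        p.requires_grad_(True)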
class DDPGAgent:
    def __init__(self,
                 dimS,
                 dimA,
                 gamma=0.99,
                 actor_lr=1e-4,
                 critic_lr=1e-3,
                 tau=1e-3,
                 sigma=0.1,
                 hidden_size1=400,
                 hidden_size2=300,
                 buffer_size=int(1e6),
                 batch_size=128,
                 render=False):

        self.dimS = dimS
        self.dimA = dimA

        self.gamma = gamma
        self.pi_lr = actor_lr
        self.q_lr = critic_lr
        self.tau = tau
        self.sigma = sigma
        self.batch_size = batch_size

        # network definitions
        # pi : actor network, Q : critic network
        self.pi = Actor(dimS, dimA, hidden_size1, hidden_size2)
        self.Q = Critic(dimS, dimA, hidden_size1, hidden_size2)

        # target networks
        self.targ_pi = copy.deepcopy(self.pi)
        self.targ_Q = copy.deepcopy(self.Q)

        self.buffer = ReplayBuffer(dimS, dimA, limit=buffer_size)

        self.Q_optimizer = torch.optim.Adam(self.Q.parameters(),
                                            lr=self.q_lr)
        self.pi_optimizer = torch.optim.Adam(self.pi.parameters(),
                                             lr=self.pi_lr)

        self.render = render

    def target_update(self):
        # soft update for both the actor and the critic:
        # theta' = tau * theta + (1 - tau) * theta'
        for th, targ_th in zip(self.pi.parameters(),
                               self.targ_pi.parameters()):
            targ_th.data.copy_(self.tau * th.data
                               + (1.0 - self.tau) * targ_th.data)
        for th, targ_th in zip(self.Q.parameters(),
                               self.targ_Q.parameters()):
            targ_th.data.copy_(self.tau * th.data
                               + (1.0 - self.tau) * targ_th.data)

    def get_action(self, state, eval=False):
        state = torch.tensor(state, dtype=torch.float)
        with torch.no_grad():
            action = self.pi(state)
        action = action.numpy()

        if not eval:
            # for exploration, use a behavioral policy of the form
            # beta(s) = pi(s) + N(0, sigma^2)
            noise = self.sigma * np.random.randn(self.dimA)
            return action + noise
        else:
            return action

    def train(self):
        """Run one DDPG update of the actor-critic networks."""
        batch = self.buffer.sample_batch(batch_size=self.batch_size)

        # unroll batch
        observations = torch.tensor(batch['state'], dtype=torch.float)
        actions = torch.tensor(batch['action'], dtype=torch.float)
        rewards = torch.tensor(batch['reward'], dtype=torch.float)
        next_observations = torch.tensor(batch['next_state'],
                                         dtype=torch.float)
        terminal_flags = torch.tensor(batch['done'], dtype=torch.float)

        mask = 1.0 - terminal_flags

        # compute TD targets from the target networks, without tracking
        # gradients; if done, the target reduces to the reward
        with torch.no_grad():
            target = rewards + self.gamma * mask * self.targ_Q(
                next_observations, self.targ_pi(next_observations))

        out = self.Q(observations, actions)

        loss_ftn = MSELoss()
        loss = loss_ftn(out, target)
        self.Q_optimizer.zero_grad()
        loss.backward()
        self.Q_optimizer.step()

        pi_loss = -torch.mean(self.Q(observations, self.pi(observations)))
        self.pi_optimizer.zero_grad()
        pi_loss.backward()
        self.pi_optimizer.step()

        self.target_update()

    def save_model(self, path):
        checkpoint_path = path + 'model.pth.tar'
        torch.save(
            {
                'actor': self.pi.state_dict(),
                'critic': self.Q.state_dict(),
                'target_actor': self.targ_pi.state_dict(),
                'target_critic': self.targ_Q.state_dict(),
                'actor_optimizer': self.pi_optimizer.state_dict(),
                'critic_optimizer': self.Q_optimizer.state_dict()
            }, checkpoint_path)

    def load_model(self, path):
        checkpoint = torch.load(path)
        self.pi.load_state_dict(checkpoint['actor'])
        self.Q.load_state_dict(checkpoint['critic'])
        self.targ_pi.load_state_dict(checkpoint['target_actor'])
        self.targ_Q.load_state_dict(checkpoint['target_critic'])
        self.pi_optimizer.load_state_dict(checkpoint['actor_optimizer'])
        self.Q_optimizer.load_state_dict(checkpoint['critic_optimizer'])
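
# The ReplayBuffer used by DDPGAgent returns a dict of arrays keyed 'state',
# 'action', 'reward', 'next_state', 'done'. A minimal sketch of such a
# buffer; the class name and the append() method name are assumptions, only
# the sample_batch contract is taken from the code above.
import numpy as np


class DictReplayBufferSketch:
    def __init__(self, dimS, dimA, limit):
        self.limit = limit
        self.ptr, self.size = 0, 0
        self.state = np.zeros((limit, dimS), dtype=np.float32)
        self.action = np.zeros((limit, dimA), dtype=np.float32)
        self.reward = np.zeros((limit, 1), dtype=np.float32)
        self.next_state = np.zeros((limit, dimS), dtype=np.float32)
        self.done = np.zeros((limit, 1), dtype=np.float32)

    def append(self, s, a, r, s2, d):  # method name assumed
        i = self.ptr
        self.state[i], self.action[i], self.reward[i] = s, a, r
        self.next_state[i], self.done[i] = s2, float(d)
        # ring-buffer bookkeeping: overwrite oldest entries once full
        self.ptr = (self.ptr + 1) % self.limit
        self.size = min(self.size + 1, self.limit)

    def sample_batch(self, batch_size):
        idx = np.random.randint(0, self.size, size=batch_size)
        return {'state': self.state[idx], 'action': self.action[idx],
                'reward': self.reward[idx],
                'next_state': self.next_state[idx], 'done': self.done[idx]}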
def sac(env_fn,
        seed=0,
        gamma=.99,
        lam=.97,
        hidden_sizes=(200, 100),
        alpha=.5,
        v_lr=1e-3,
        q_lr=1e-3,
        pi_lr=1e-3,
        polyak=1e-2,
        epochs=50,
        steps_per_epoch=1000,
        batch_size=100,
        start_steps=10000,
        logger_kwargs=dict(),
        replay_size=int(1e6),
        max_ep_len=1000,
        save_freq=1):

    logger = EpochLogger(**logger_kwargs)
    logger.save_config(locals())

    tf.set_random_seed(seed)
    np.random.seed(seed)

    env, test_env = env_fn(), env_fn()

    # Dimensions
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]
    act_limit = env.action_space.high[0]

    # Placeholders
    x_ph = tf.placeholder(shape=[None, obs_dim], dtype=tf.float32)
    a_ph = tf.placeholder(shape=[None, act_dim], dtype=tf.float32)
    x2_ph = tf.placeholder(shape=[None, obs_dim], dtype=tf.float32)
    r_ph = tf.placeholder(shape=[None], dtype=tf.float32)
    d_ph = tf.placeholder(shape=[None], dtype=tf.float32)

    # Networks
    def mlp(x, hidden_sizes=(32,), activation=tf.tanh,
            output_activation=None):
        for h in hidden_sizes[:-1]:
            x = tf.layers.dense(x, units=h, activation=activation)
        return tf.layers.dense(x, units=hidden_sizes[-1],
                               activation=output_activation)

    def gaussian_likelihood(x, mu, log_std):
        EPS = 1e-8
        pre_sum = -0.5 * (((x - mu) / (tf.exp(log_std) + EPS))**2
                          + 2 * log_std + np.log(2 * np.pi))
        return tf.reduce_sum(pre_sum, axis=1)

    def clip_but_pass_gradient(x, l=-1., u=1.):
        clip_up = tf.cast(x > u, tf.float32)
        clip_low = tf.cast(x < l, tf.float32)
        return x + tf.stop_gradient((u - x) * clip_up + (l - x) * clip_low)

    LOG_STD_MIN = -20
    LOG_STD_MAX = 2

    def mlp_gaussian_policy(x, a, hidden_sizes, activation,
                            output_activation):
        act_dim = a.shape.as_list()[-1]
        net = mlp(x, list(hidden_sizes), activation, activation)
        mu = tf.layers.dense(net, act_dim, activation=output_activation)

        """
        Because the algorithm maximizes a trade-off of reward and entropy,
        entropy must be unique to the state---and therefore the log_stds
        need to be a neural network output instead of a shared-across-states
        learnable parameter vector. But for deep ReLU and other nets, simply
        sticking an activationless dense layer at the end would be quite
        bad: at the beginning of training, a randomly initialized net could
        produce extremely large values for the log_stds, which would result
        in some actions being either entirely deterministic or too random
        to come back to earth. Either of these introduces numerical
        instability which could break the algorithm. To protect against
        that, we constrain the output range of the log_stds to lie within
        [LOG_STD_MIN, LOG_STD_MAX]. This is slightly different from the
        trick used by the original authors of SAC---they used
        tf.clip_by_value instead of squashing and rescaling. The squashing
        approach is preferred here because it allows gradient propagation
        through log_std where clipping would not, though it is unclear
        whether it makes much of a difference.
        """
        log_std = tf.layers.dense(net, act_dim, activation=tf.tanh)
        log_std = LOG_STD_MIN + 0.5 * (LOG_STD_MAX - LOG_STD_MIN) * (
            log_std + 1)

        std = tf.exp(log_std)
        pi = mu + tf.random_normal(tf.shape(mu)) * std
        logp_pi = gaussian_likelihood(pi, mu, log_std)
        return mu, pi, logp_pi

    def apply_squashing_func(mu, pi, logp_pi):
        mu = tf.tanh(mu)
        pi = tf.tanh(pi)
        # To avoid machine-precision error, strictly clip 1 - pi**2 to the
        # [0, 1] range.
        logp_pi -= tf.reduce_sum(
            tf.log(clip_but_pass_gradient(1 - pi**2, l=0, u=1) + 1e-6),
            axis=1)
        return mu, pi, logp_pi

    with tf.variable_scope("main"):
        activation = tf.tanh

        with tf.variable_scope("pi"):
            mu, pi, logp_pi = mlp_gaussian_policy(x_ph, a_ph, hidden_sizes,
                                                  tf.tanh, None)
            mu, pi, logp_pi = apply_squashing_func(mu, pi, logp_pi)

        with tf.variable_scope("q1"):
            q1 = tf.squeeze(mlp(tf.concat([x_ph, a_ph], -1),
                                hidden_sizes + (1,), activation, None),
                            axis=-1)
        with tf.variable_scope("q1", reuse=True):
            q1_pi = tf.squeeze(mlp(tf.concat([x_ph, pi], -1),
                                   hidden_sizes + (1,), activation, None),
                               axis=-1)

        with tf.variable_scope("q2"):
            q2 = tf.squeeze(mlp(tf.concat([x_ph, a_ph], -1),
                                hidden_sizes + (1,), activation, None),
                            axis=-1)
        with tf.variable_scope("q2", reuse=True):
            q2_pi = tf.squeeze(mlp(tf.concat([x_ph, pi], -1),
                                   hidden_sizes + (1,), activation, None),
                               axis=-1)

        with tf.variable_scope("v"):
            v = tf.squeeze(mlp(x_ph, hidden_sizes + (1,), activation, None),
                           axis=-1)

    with tf.variable_scope("target"):
        with tf.variable_scope("v"):
            v_targ = tf.squeeze(mlp(x2_ph, hidden_sizes + (1,), activation,
                                    None), axis=-1)

    # helpers for var count
    def get_vars(scope=''):
        return [x for x in tf.trainable_variables() if scope in x.name]

    def count_vars(scope=''):
        v = get_vars(scope)
        return sum([np.prod(var.shape.as_list()) for var in v])

    # Count variables
    var_counts = tuple(count_vars(scope) for scope in
                       ['main/pi', 'main/q1', 'main/q2', 'main/v', 'main'])
    print('\nNumber of parameters: \t pi: %d, \t q1: %d, \t q2: %d, '
          '\t v: %d, \t total: %d\n' % var_counts)

    # Targets
    q_backup_prestop = r_ph + gamma * (1 - d_ph) * v_targ
    v_backup_prestop = tf.minimum(q1_pi, q2_pi) - alpha * logp_pi
    q_backup, v_backup = tf.stop_gradient(q_backup_prestop), \
        tf.stop_gradient(v_backup_prestop)

    # Q loss
    q1_loss = tf.reduce_mean((q1 - q_backup)**2)
    q2_loss = tf.reduce_mean((q2 - q_backup)**2)
    q_loss = q1_loss + q2_loss

    # V loss
    v_loss = tf.reduce_mean((v - v_backup)**2)

    # Policy loss
    pi_loss = tf.reduce_mean(-q1_pi + alpha * logp_pi)

    # Training ops
    v_trainop = tf.train.AdamOptimizer(v_lr).minimize(
        v_loss,
        var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                   scope="main/v"))
    q_trainop = tf.train.AdamOptimizer(q_lr).minimize(
        q_loss,
        var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                   scope="main/q"))
    pi_trainop = tf.train.AdamOptimizer(pi_lr).minimize(
        pi_loss,
        var_list=tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                                   scope="main/pi"))

    assert polyak <= .5

    # Target update ops
    init_v_target = tf.group([
        tf.assign(v_target, v_main)
        for v_main, v_target in zip(
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="main/v"),
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                              scope="target/v"))
    ])
    update_v_target = tf.group([
        tf.assign(v_target, (1 - polyak) * v_target + polyak * v_main)
        for v_main, v_target in zip(
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES, scope="main/v"),
            tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES,
                              scope="target/v"))
    ])

    sess = tf.Session()
    sess.run(tf.global_variables_initializer())
    sess.run(init_v_target)

    # Setup model saving
    logger.setup_tf_saver(sess,
                          inputs={'x': x_ph, 'a': a_ph},
                          outputs={'pi': pi, 'q1': q1, 'q2': q2, 'v': v})

    def test_agent(n=10):
        for j in range(n):
            o, r, d, ep_ret, ep_len = test_env.reset(), 0, False, 0, 0
            while not (d or (ep_len == max_ep_len)):
                o, r, d, _ = test_env.step(
                    sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0])
                ep_ret += r
                ep_len += 1
            logger.store(TestEpRet=ep_ret, TestEpLen=ep_len)

    # Buffer init
    buffer = ReplayBuffer(obs_dim, act_dim, replay_size)

    # Main loop
    start_time = time.time()
    o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0, 0
    total_steps = steps_per_epoch * epochs

    for t in range(total_steps):
        if t > start_steps:
            a = sess.run(pi, feed_dict={x_ph: o.reshape(1, -1)})[0]
        else:
            a = env.action_space.sample()

        o2, r, d, _ = env.step(a)
        ep_ret += r
        ep_len += 1

        # Ignore the done signal when it comes from hitting the time horizon
        d = False if ep_len == max_ep_len else d

        o2 = np.squeeze(o2)
        buffer.store(o, a, r, o2, d)
        o = o2

        if d or (ep_len == max_ep_len):
            for j in range(ep_len):
                batch = buffer.sample_batch(batch_size)
                feed_dict = {
                    x_ph: batch['obs'],
                    x2_ph: batch['obs2'],
                    a_ph: batch['acts'],
                    r_ph: batch['rews'],
                    d_ph: batch['done']
                }

                # Value gradient step
                v_step_ops = [v_loss, v, v_trainop]
                outs = sess.run(v_step_ops, feed_dict)
                logger.store(LossV=outs[0], VVals=outs[1])

                # Q gradient step
                q_step_ops = [q_loss, q1, q2, q_trainop]
                outs = sess.run(q_step_ops, feed_dict)
                logger.store(LossQ=outs[0], Q1Vals=outs[1], Q2Vals=outs[2])

                # Policy gradient step
                # TODO Add entropy logging
                pi_step_ops = [pi_loss, pi_trainop, update_v_target]
                outs = sess.run(pi_step_ops, feed_dict=feed_dict)
                logger.store(LossPi=outs[0])

            logger.store(EpRet=ep_ret, EpLen=ep_len)
            o, r, d, ep_ret, ep_len = env.reset(), 0, False, 0., 0

        if t > 0 and t % steps_per_epoch == 0:
            epoch = t // steps_per_epoch

            # Saving the model
            if (epoch % save_freq == 0) or (epoch == epochs - 1):
                logger.save_state({'env': env}, None)

            test_agent()

            # Log info about epoch
            logger.log_tabular('Epoch', epoch)
            logger.log_tabular('EpRet', with_min_and_max=True)
            logger.log_tabular('TestEpRet', with_min_and_max=True)
            logger.log_tabular('EpLen', average_only=True)
            logger.log_tabular('TestEpLen', average_only=True)
            logger.log_tabular('TotalEnvInteracts', t)
            logger.log_tabular('Q1Vals', with_min_and_max=True)
            logger.log_tabular('Q2Vals', with_min_and_max=True)
            logger.log_tabular('VVals', with_min_and_max=True)
            logger.log_tabular('LossPi', average_only=True)
            logger.log_tabular('LossQ', average_only=True)
            logger.log_tabular('LossV', average_only=True)
            logger.log_tabular('Time', time.time() - start_time)
            logger.dump_tabular()
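
# Illustrative launcher for the continuous-action sac() above; the
# environment choice and epoch count are assumptions.
if __name__ == '__main__':
    import gym
    sac(lambda: gym.make('Pendulum-v0'), epochs=10)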