class ActorCritic:
    def __init__(self, sess, training_steps=5000000, learning_rate=0.0001,
                 momentum=0.95, memory_size=100000, discount_rate=0.95,
                 eps_min=0.05):
        self.activation = tf.nn.relu
        self.optimizer = tf.train.MomentumOptimizer
        self.learning_rate = learning_rate
        self.momentum = momentum
        self._build_graph()
        self.memory_size = memory_size
        self.memory = ReplayMemory(self.memory_size)
        '''
        The discount rate controls how far into the future rewards are taken
        into account when evaluating an action. A value of 0 means the agent
        only considers the immediate reward, and a value close to 1 means the
        agent gives almost as much weight to rewards far in the future.
        '''
        self.discount_rate = discount_rate
        self.eps_min = eps_min
        self.eps_decay_steps = int(training_steps / 2)
        self.sess = sess
        self.init = tf.global_variables_initializer()

    def cnn_model(self, X_state, name):
        """
        Creates a CNN network with two convolutional layers followed by two
        fully connected layers.
        :param X_state: Placeholder for the state of the game
        :param name: Name of the network (actor or critic)
        :return: The output (logits) layer and the trainable variables
        """
        initializer = tf.contrib.layers.variance_scaling_initializer()
        conv1_fmaps = 32
        conv1_ksize = 8
        conv1_stride = 2
        conv1_pad = 'SAME'
        conv2_fmaps = 64
        conv2_ksize = 4
        conv2_stride = 2
        conv2_pad = 'SAME'
        n_fc1 = 256

        with tf.variable_scope(name) as scope:
            conv1 = tf.layers.conv2d(X_state,
                                     filters=conv1_fmaps,
                                     kernel_size=conv1_ksize,
                                     activation=self.activation,
                                     strides=conv1_stride,
                                     padding=conv1_pad,
                                     name='conv1')
            conv2 = tf.layers.conv2d(conv1,
                                     filters=conv2_fmaps,
                                     kernel_size=conv2_ksize,
                                     activation=self.activation,
                                     strides=conv2_stride,
                                     padding=conv2_pad,
                                     name='conv2')
            conv2_flat = tf.reshape(conv2, shape=[-1, conv2_fmaps * 5 * 5])
            fc1 = tf.layers.dense(conv2_flat,
                                  n_fc1,
                                  activation=self.activation,
                                  name='fc1',
                                  kernel_initializer=initializer)
            logits = tf.layers.dense(fc1, N_OUTPUTS,
                                     kernel_initializer=initializer)

        trainable_vars = tf.get_collection(tf.GraphKeys.TRAINABLE_VARIABLES,
                                           scope=scope.name)
        trainable_vars_by_name = {
            var.name[len(scope.name):]: var
            for var in trainable_vars
        }
        return logits, trainable_vars_by_name

    def _build_graph(self):
        """
        Creates the Tensorflow graph of the CNN network. Two networks are
        used, one for the actor and one for the critic.
        """
        X_state = tf.placeholder(tf.float32, shape=[None, 20, 20, CHANNELS])
        actor_q_values, actor_vars = self.cnn_model(X_state, name="actor")
        critic_q_values, critic_vars = self.cnn_model(X_state, name="critic")

        with tf.variable_scope("train"):
            X_action = tf.placeholder(tf.int32, shape=[None])
            y = tf.placeholder(tf.float32, shape=[None, 1])
            '''
            A one-hot vector (tf.one_hot) is used to keep only the Q-value
            corresponding to the action stored in the memory. Multiplying the
            one-hot vector with actor_q_values zeroes out all of the Q-values
            except for the one corresponding to the memorized action. Then,
            summing along the first axis (axis=1) gives the desired Q-value
            prediction for each memory.
            '''
            q_value = tf.reduce_sum(actor_q_values *
                                    tf.one_hot(X_action, N_OUTPUTS),
                                    axis=1, keep_dims=True)
            error = tf.abs(y - q_value)
            # clipped_error is defined elsewhere in the project; see the sketch after this class
            loss = tf.reduce_mean(clipped_error(error))
            # Iteration step
            global_step = tf.Variable(0, trainable=False, name='global_step')
            optimizer = self.optimizer(self.learning_rate, self.momentum,
                                       use_nesterov=True)
            training_op = optimizer.minimize(loss, global_step=global_step)

        self.saver = tf.train.Saver()
        self.X_state = X_state
        self.X_action = X_action
        self.y = y
        self.training_op = training_op
        self.loss = loss
        self.actor_q_values, self.actor_vars = actor_q_values, actor_vars
        self.critic_q_values, self.critic_vars = critic_q_values, critic_vars
        self.global_step = global_step

        with tf.variable_scope('summary'):
            self.loss_summary = tf.summary.scalar('loss', loss)
            self.mean_score = tf.placeholder(tf.float32, None)
            self.score_summary = tf.summary.scalar('mean score',
                                                   self.mean_score)
            self.summary_merged = tf.summary.merge(
                [self.loss_summary, self.score_summary])

    def start(self, checkpoint_path):
        """
        Initializes the model, or restores it if it already exists.
        :return: Iteration at which training should start
        """
        if os.path.isfile(checkpoint_path + '.index'):
            self.saver.restore(self.sess, checkpoint_path)
            training_start = 1
            print('Restoring model...')
        else:
            # Make the model warm up before training
            training_start = 10000
            self.init.run()
            self.make_copy().run()
            print('New model...')
        return training_start

    def train(self, checkpoint_path, file_writer, mean_score):
        """
        Trains the agent and regularly writes a training summary.
        :param checkpoint_path: The path where the model will be saved
        :param file_writer: The file where the training summary will be
                            written for Tensorboard visualization
        :param mean_score: The mean game score
        """
        copy_steps = 5000
        save_steps = 2000
        summary_steps = 500

        cur_states, actions, rewards, next_states, dones = self.sample_memories()
        next_q_values = self.critic_q_values.eval(
            feed_dict={self.X_state: next_states})
        max_next_q_values = np.max(next_q_values, axis=1, keepdims=True)
        y_vals = rewards + (1 - dones) * self.discount_rate * max_next_q_values
        _, loss_val = self.sess.run([self.training_op, self.loss],
                                    feed_dict={
                                        self.X_state: cur_states,
                                        self.X_action: actions,
                                        self.y: y_vals
                                    })
        step = self.global_step.eval()

        # Regularly copy the online DQN to the target DQN
        if step % copy_steps == 0:
            self.make_copy().run()

        # Save the model regularly
        if step % save_steps == 0:
            self.saver.save(self.sess, checkpoint_path)

        # Write the training summary regularly
        if step % summary_steps == 0:
            summary = self.sess.run(self.summary_merged,
                                    feed_dict={
                                        self.X_state: cur_states,
                                        self.X_action: actions,
                                        self.y: y_vals,
                                        self.mean_score: mean_score
                                    })
            file_writer.add_summary(summary, step)

    def predict(self, cur_state):
        """
        Makes the actor predict Q-values based on the current state of the
        game.
        :param cur_state: Current state of the game
        :return: The Q-values predicted by the actor
        """
        q_values = self.actor_q_values.eval(
            feed_dict={self.X_state: [cur_state]})
        return q_values

    def remember(self, cur_state, action, reward, new_state, done):
        self.memory.append([cur_state, action, reward, new_state, done])

    def act(self, cur_state, step):
        """
        :param cur_state: Current state of the game
        :param step: Training step
        :return: Action selected by the agent
        """
        eps_max = 1.0
        epsilon = max(
            self.eps_min, eps_max -
            (eps_max - self.eps_min) * 2 * step / self.eps_decay_steps)
        if np.random.rand() < epsilon:
            return np.random.randint(N_OUTPUTS), epsilon  # Random action
        else:
            q_values = self.predict(cur_state)
            return np.argmax(q_values), epsilon  # Optimal action

    def make_copy(self):
        """
        Makes regular copies of the training variables from the actor (online
        network) to the critic (target network). Credit goes to
        https://github.com/ageron/handson-ml/blob/master/16_reinforcement_learning.ipynb.
        :return: A copy operation for the training variables
        """
        copy_ops = [
            target_var.assign(self.actor_vars[var_name])
            for var_name, target_var in self.critic_vars.items()
        ]
        copy_online_to_target = tf.group(*copy_ops)
        return copy_online_to_target

    def sample_memories(self, batch_size=32):
        """
        Extracts memories from the agent's memory. Credit goes to
        https://github.com/ageron/handson-ml/blob/master/16_reinforcement_learning.ipynb.
        :param batch_size: Size of the batch that we extract from the memory
        :return: State, action, reward, next_state, and done values as
                 np.arrays
        """
        cols = [[], [], [], [], []]  # state, action, reward, next_state, done
        for memory in self.memory.sample(batch_size):
            for col, value in zip(cols, memory):
                col.append(value)
        cols = [np.array(col) for col in cols]
        return (cols[0], cols[1], cols[2].reshape(-1, 1), cols[3],
                cols[4].reshape(-1, 1))
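# The loss in ActorCritic relies on a clipped_error helper (along with
# ReplayMemory, N_OUTPUTS and CHANNELS) that is defined elsewhere in the
# project. A common choice for such a helper in DQN-style code is a
# Huber-like error that is quadratic for small errors and linear for large
# ones, which keeps gradients bounded. The definition below is a minimal
# sketch under that assumption, not necessarily the one used in this project.
def clipped_error(x):
    # Quadratic inside [-1, 1], linear outside.
    return tf.where(tf.abs(x) < 1.0, 0.5 * tf.square(x), tf.abs(x) - 0.5)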
class TD3Agent:
    def __init__(self,
                 env,
                 n_episodes=3000,
                 time_steps=500,
                 gamma=0.99,
                 batch_size=64,
                 memory_capacity=100000,
                 tau=1e-2,
                 lr=0.00001,
                 pi_update_steps=2,
                 render=False):
        self.env = env
        self.gamma = gamma
        self.time_steps = time_steps
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.batch_size = batch_size
        self.memory_capacity = memory_capacity
        self.tau = tau
        self.lr = lr
        self.pi_update_steps = pi_update_steps
        self.render = render
        self.device = torch.device(
            "cuda" if torch.cuda.is_available() else "cpu")

        # Create actor and critic networks
        self.actor = Actor(state_dim=self.state_dim,
                           action_dim=self.action_dim).to(self.device)
        self.actor_target = Actor(state_dim=self.state_dim,
                                  action_dim=self.action_dim).to(self.device)
        self.critic = Critic(state_dim=self.state_dim,
                             action_dim=self.action_dim).to(self.device)
        self.critic_target = Critic(state_dim=self.state_dim,
                                    action_dim=self.action_dim).to(self.device)

        # Same weights for the target networks as for the original networks
        for target_param, param in zip(self.actor_target.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data)

        self.critic_loss_fct = torch.nn.MSELoss()
        self.actor_optim = torch.optim.Adam(self.actor.parameters(),
                                            lr=self.lr)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(),
                                             lr=self.lr * 10)
        self.n_episodes = n_episodes
        self.replay_memory = ReplayMemory(capacity=self.memory_capacity,
                                          batch_size=batch_size)
        self.res = pd.DataFrame({
            'episodes': [],
            'states': [],
            'rewards': [],
            'steps': [],
            'actor_losses': [],
            'critic_losses': [],
        })

    def train(self):
        for i in range(self.n_episodes):
            state = self.env.reset()
            for step in range(self.time_steps):
                if self.render:
                    self.env.render()

                state = tt(state)
                action = self.actor(state).cpu().detach().numpy()
                # Add exploration noise to the deterministic policy output
                noise = np.random.normal(0, 0.1,
                                         size=self.env.action_space.shape[0])
                action = np.clip(action + noise,
                                 self.env.action_space.low[0],
                                 self.env.action_space.high[0])

                next_state, reward, done, _ = self.env.step(action)

                # Save step in memory
                self.replay_memory.append(state=state,
                                          action=action,
                                          reward=reward,
                                          next_state=next_state,
                                          done=done)
                res = {
                    'episodes': i + 1,
                    'states': state.tolist(),
                    'rewards': reward,
                    'steps': step + 1
                }

                # Start training once the batch size is reached
                if len(self.replay_memory) < self.batch_size:
                    self.res = self.res.append([res])
                    state = next_state  # keep advancing the state while the memory fills up
                    continue

                # Sample batch from memory
                states, actions, rewards, next_states, dones = \
                    self.replay_memory.sample_batch()

                # Critic loss
                q1, q2 = self.critic(states, actions)
                next_actions = self.actor_target(next_states)
                # Target policy smoothing: add clipped noise to the target actions
                noise = tt(torch.Tensor(actions.cpu()).data.normal_(0, 0.2))
                noise = noise.clamp(-0.5, 0.5)
                next_actions = (next_actions + noise).clamp(
                    self.env.action_space.low[0],
                    self.env.action_space.high[0])

                # Get next-state Q-values by Clipped Double Q-Learning
                q1_ns, q2_ns = self.critic_target(next_states,
                                                  next_actions.detach())
                q_ns = torch.min(q1_ns, q2_ns)
                td_target = rewards + self.gamma * q_ns
                loss_critic = self.critic_loss_fct(
                    q1, td_target) + self.critic_loss_fct(q2, td_target)
                res['critic_losses'] = float(loss_critic)

                # Optimize critic
                self.critic_optim.zero_grad()
                loss_critic.backward()
                self.critic_optim.step()

                # Delayed Policy Updates
                if step % self.pi_update_steps == 0:
                    q1, _ = self.critic(states, self.actor(states))
                    # Actor loss
                    loss_actor = -q1.mean()
                    res['actor_losses'] = float(loss_actor)

                    # Optimize actor
                    self.actor_optim.zero_grad()
                    loss_actor.backward()
                    self.actor_optim.step()

                    # update target networks
                    for param, target_param in zip(
                            self.critic.parameters(),
                            self.critic_target.parameters()):
                        target_param.data.copy_(self.tau * param.data +
                                                (1 - self.tau) *
                                                target_param.data)
                    for param, target_param in zip(
                            self.actor.parameters(),
                            self.actor_target.parameters()):
                        target_param.data.copy_(self.tau * param.data +
                                                (1 - self.tau) *
                                                target_param.data)

                self.res = self.res.append([res])
                state = next_state
                if done:
                    break

            logging.info(f'Episode {i + 1}:')
            logging.info(
                f'\t Steps: {self.res.loc[self.res["episodes"] == i + 1]["steps"].max()}'
            )
            logging.info(
                f'\t Reward: {self.res.loc[self.res["episodes"] == i + 1]["rewards"].sum()}'
            )

        self.env.close()
        return self.res
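# A minimal usage sketch for TD3Agent, assuming the Actor, Critic,
# ReplayMemory and tt helpers referenced above are importable from this
# project. The environment name below is only an example; any Gym
# environment with a continuous action space should work.
def run_td3_example():
    import gym

    env = gym.make('Pendulum-v0')
    agent = TD3Agent(env, n_episodes=100, time_steps=200, render=False)
    results = agent.train()  # pandas DataFrame with one row per step
    # Total reward collected in each of the last few episodes
    print(results.groupby('episodes')['rewards'].sum().tail())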
class DDPGAgent:
    def __init__(self,
                 env,
                 n_episodes=3000,
                 time_steps=500,
                 gamma=0.99,
                 batch_size=32,
                 memory_capacity=100000,
                 tau=1e-2,
                 eps=0.1,
                 lr=0.00001,
                 render=False):
        self.env = env
        self.gamma = gamma
        self.time_steps = time_steps
        self.state_dim = env.observation_space.shape[0]
        self.action_dim = env.action_space.shape[0]
        self.batch_size = batch_size
        self.memory_capacity = memory_capacity
        self.tau = tau
        self.eps = eps
        self.lr = lr
        self.render = render

        # Create actor and critic networks
        self.actor = Actor(state_dim=self.state_dim,
                           action_dim=self.action_dim)
        self.actor_target = Actor(state_dim=self.state_dim,
                                  action_dim=self.action_dim)
        self.critic = Critic(state_dim=self.state_dim,
                             action_dim=self.action_dim)
        self.critic_target = Critic(state_dim=self.state_dim,
                                    action_dim=self.action_dim)

        # Same weights for the target networks as for the original networks
        for target_param, param in zip(self.actor_target.parameters(),
                                       self.actor.parameters()):
            target_param.data.copy_(param.data)
        for target_param, param in zip(self.critic_target.parameters(),
                                       self.critic.parameters()):
            target_param.data.copy_(param.data)

        self.critic_loss_fct = torch.nn.MSELoss()
        self.actor_optim = torch.optim.Adam(self.actor.parameters(),
                                            lr=self.lr)
        self.critic_optim = torch.optim.Adam(self.critic.parameters(),
                                             lr=self.lr * 10)
        self.n_episodes = n_episodes
        self.replay_memory = ReplayMemory(capacity=self.memory_capacity,
                                          batch_size=batch_size)
        self.res = pd.DataFrame({
            'episodes': [],
            'states': [],
            'rewards': [],
            'steps': []
        })

    def train(self):
        for i in range(self.n_episodes):
            steps = 0
            state = self.env.reset()
            for step in range(self.time_steps):
                if self.render:
                    self.env.render()

                state = tt(state)
                action = self.actor(state).detach().numpy()

                # Exploration: take a uniform random action with probability eps
                p = np.random.random()
                if p < self.eps:
                    action = np.random.uniform(low=-1, high=1, size=(1, ))

                # Do one step in the env
                next_state, reward, done, _ = self.env.step(action)
                res = {
                    'episodes': i + 1,
                    'states': state.tolist(),
                    'rewards': reward,
                    'steps': step + 1
                }

                # Save step in memory
                self.replay_memory.append(state=state,
                                          action=action,
                                          reward=reward,
                                          next_state=next_state,
                                          done=done)

                # Start training once the batch size is reached
                if len(self.replay_memory) < self.batch_size:
                    state = next_state  # keep advancing the state while the memory fills up
                    continue

                # Sample batch from memory
                states, actions, rewards, next_states, dones = \
                    self.replay_memory.sample_batch()

                # Critic loss
                q_values = self.critic(states, actions)
                next_actions = self.actor_target(next_states)
                q_values_ns = self.critic_target(next_states,
                                                 next_actions.detach())
                td_target = rewards + self.gamma * q_values_ns
                loss_critic = self.critic_loss_fct(q_values, td_target)

                # Actor loss
                loss_actor = -(self.critic(states, self.actor(states)).mean())

                # Optimize actor
                self.actor_optim.zero_grad()
                loss_actor.backward()
                self.actor_optim.step()

                # Optimize critic
                self.critic_optim.zero_grad()
                loss_critic.backward()
                self.critic_optim.step()

                # update target networks
                for target_param, param in zip(self.actor_target.parameters(),
                                               self.actor.parameters()):
                    target_param.data.copy_(param.data * self.tau +
                                            target_param.data *
                                            (1.0 - self.tau))
                for target_param, param in zip(self.critic_target.parameters(),
                                               self.critic.parameters()):
                    target_param.data.copy_(param.data * self.tau +
                                            target_param.data *
                                            (1.0 - self.tau))

                self.res = self.res.append([res])
                state = next_state
                steps += 1
                if done:
                    break

            logging.info(f'Episode {i + 1}:')
            logging.info(
                f'\t Steps: {self.res.loc[self.res["episodes"] == i + 1]["steps"].max()}'
            )
            logging.info(
                f'\t Reward: {self.res.loc[self.res["episodes"] == i + 1]["rewards"].sum()}'
            )

        self.env.close()
        return self.res
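# Both TD3Agent and DDPGAgent repeat the same Polyak averaging loops when
# refreshing their target networks. As an illustrative refactoring only (a
# hypothetical helper, not part of the original code), the update could be
# factored out as below; tau has the same meaning as the constructor argument
# above, e.g. soft_update(self.critic, self.critic_target, self.tau).
import torch


def soft_update(source: torch.nn.Module, target: torch.nn.Module,
                tau: float) -> None:
    """Blend source parameters into target: theta_t <- tau * theta + (1 - tau) * theta_t."""
    for param, target_param in zip(source.parameters(), target.parameters()):
        target_param.data.copy_(tau * param.data +
                                (1.0 - tau) * target_param.data)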