def train(sess, env, actor, critic, RESTORE):

    sess.run(tf.global_variables_initializer())

    # Initialize random noise generator
    exploration_noise = OUNoise(env.action_space.n)

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay buffer
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    # Store Q-values and rewards for illustration purposes
    q_max_array = []
    reward_array = []

    for i in range(MAX_EPISODES):

        s = env.reset()
        ep_reward = 0
        ep_ave_max_q = 0

        for j in range(MAX_EP_STEPS):
            # if i % 40 == 0 and i > 1:
            #     env.render()

            # Begin "Experimentation and Evaluation Phase"
            # Select next experimental action by adding noise to the action prescribed by the policy
            a = actor.predict(np.reshape(s, (1, actor.s_dim)))

            # If in a testing episode, do not add noise
            # if i % 100 not in (49, 99):
            noise = exploration_noise.noise()
            a = a + noise

            # Take step with experimental action
            action = np.argmax(a)
            s2, r, terminal, info = env.step(action)
            # s2, r, terminal, info = env.step(np.reshape(a.T, newshape=(env.action_space.n,)))

            # Add transition to replay buffer (if not a testing episode)
            # if i % 100 not in (49, 99):
            replay_buffer.add(np.reshape(s, (actor.s_dim,)),
                              np.reshape(a, (actor.a_dim,)), r, terminal,
                              np.reshape(s2, (actor.s_dim,)))

            # Keep adding experience to the memory until
            # there are at least minibatch-size samples
            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Find target estimate to use for updating the Q-function
                # predict_target determines the Q-value of the next state
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                # Complete target estimate (r(t+1) + gamma * Q(s(t+1), a(t+1)))
                y_i = []
                for k in range(MINIBATCH_SIZE):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Perform gradient descent to update the critic
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))

                ep_ave_max_q += np.amax(predicted_q_value, axis=0)

                # Perform "Learning" phase by moving policy parameters
                # in the direction of the deterministic policy gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            ep_reward += r

            # If the episode is finished, print results
            if terminal:
                print('| Reward: %.2i' % int(ep_reward), " | Episode", i,
                      '| Qmax: %.4f' % (ep_ave_max_q / float(j)))
                q_max_array.append(ep_ave_max_q / float(j))
                # reward_array.append(ep_reward)
                break

        # Run one greedy evaluation episode (no exploration noise)
        ep_reward = 0
        s = env.reset()
        for j in range(MAX_EP_STEPS):
            a = actor.predict(np.reshape(s, (1, actor.s_dim)))

            # Take step with greedy action
            action = np.argmax(a)
            s2, r, terminal, info = env.step(action)
            ep_reward += r
            s = s2
            if terminal:
                print('Normal | Reward: %.2i' % int(ep_reward), " | Episode", i)
                reward_array.append(ep_reward)
                break

    # Max Q plot
    plt.plot(range(1, MAX_EPISODES + 1), q_max_array, 'b-')
    plt.xlabel('Episode Number')
    plt.ylabel('Max Q-Value')
    plt.savefig('Q.png')
    plt.show()

    # Reward plot
    plt.plot(range(1, MAX_EPISODES + 1), reward_array, 'g-')
    plt.xlabel('Episode Number')
    plt.ylabel('Reward')
    plt.savefig('Reward.png')
    plt.show()

    save_result([[str(i[0]) for i in q_max_array],
                 [str(i) for i in reward_array]])
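
The OUNoise class used above is not shown in this listing; a minimal sketch of a standard Ornstein-Uhlenbeck noise process with the conventional mu/theta/sigma parameterization is given below. The parameter values and the exact interface are assumptions; the project's own OUNoise may differ.

# Minimal sketch of an Ornstein-Uhlenbeck exploration-noise generator.
# mu/theta/sigma defaults are assumptions, not the project's actual values.
import numpy as np

class OUNoise:
    def __init__(self, action_dim, mu=0.0, theta=0.15, sigma=0.2):
        self.action_dim = action_dim
        self.mu = mu
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        # Restart the process at its mean
        self.state = np.ones(self.action_dim) * self.mu

    def noise(self):
        # dx = theta * (mu - x) + sigma * N(0, 1); temporally correlated noise
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.randn(self.action_dim)
        self.state = x + dx
        return self.state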
class DDPGAgent(object):
    """ Class of the DDPG Agent """

    def __init__(self, config):
        """Initialize an Agent object.

        Args:
            param1: (config)
        """
        self.state_size = config.state_dim
        self.action_size = config.action_dim
        self.seed = np.random.seed(config.seed)
        self.n_agents = config.n_agents
        self.batch_size = config.batch_size
        self.tau = config.tau
        self.gamma = config.gamma
        self.device = config.device

        # Actor network (w/ target network)
        self.actor_local = Actor(config).to(config.device)
        self.actor_target = Actor(config).to(config.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=config.lr_actor)

        # Critic network (w/ target network)
        self.critic_local = Critic(config).to(config.device)
        self.critic_target = Critic(config).to(config.device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=config.lr_critic)

        # Noise process
        self.noise = OUNoise(config)

        # Replay memory
        self.memory = ReplayBuffer(config)
        # self.timesteps = 0

    def act(self, states, epsilon, add_noise=True):
        """ Given a list of states for each agent, return the actions to be
        taken by each agent based on the current policy.

        Returns a numpy array of shape [n_agents, n_actions].
        NOTE: clips actions to be between -1 and 1.

        Args:
            states: (torch) states
            epsilon: (float)
            add_noise: (bool) add noise to the actions
        """
        states = torch.from_numpy(states).float().to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()
        if add_noise and epsilon > np.random.random():
            actions += [self.noise.sample() for _ in range(self.n_agents)]
        return np.clip(actions, -1, 1)

    def reset_noise(self):
        """ Reset noise. """
        self.noise.reset()

    def learn(self):
        """Update policy and value parameters using a batch of experience tuples.

        actor_target(state) -> action
        critic_target(state, action) -> Q-value
        """
        if self.batch_size > self.memory.size():
            return
        states, actions, rewards, next_states, dones = self.memory.sample()

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q-values from the target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ----------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ------------------------ #
        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)

    def soft_update(self, local_model, target_model):
        """Soft-update model parameters:
        theta_target = tau * theta_local + (1 - tau) * theta_target

        Args:
            param1: (torch network) local_model
            param2: (torch network) target_model
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)
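
The script that drives DDPGAgent is not shown here. The sketch below illustrates one plausible episode loop; the gym-style env API, the ReplayBuffer.add signature, and the epsilon schedule are assumptions made purely for illustration.

# Hypothetical driver loop for DDPGAgent. The env interface and the
# memory.add() signature are assumptions; only act/learn/reset_noise
# come from the class above.
import numpy as np

def run_episode(env, agent, epsilon=1.0):
    states = env.reset()                      # assumed shape: [n_agents, state_size]
    agent.reset_noise()
    total_reward = 0.0
    while True:
        actions = agent.act(states, epsilon)  # clipped to [-1, 1]
        next_states, rewards, dones, _ = env.step(actions)
        # Assumed add() signature: store one multi-agent transition per call
        agent.memory.add(states, actions, rewards, next_states, dones)
        agent.learn()                         # no-op until the buffer holds a full batch
        total_reward += np.mean(rewards)
        states = next_states
        if np.any(dones):
            return total_reward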
class NECAgent:
    """ NEC agent """

    def __init__(self, config):
        self.nec_net = NEC(config).to(config['device'])
        self.train_eps = config['train_eps']
        self.eval_eps = config['eval_eps']
        self.num_actions = config['num_actions']
        self.replay_buffer = ReplayBuffer(config['observation_shape'],
                                          config['replay_buffer_size'])
        self.batch_size = config['batch_size']
        self.discount = config['discount']
        self.n_step_horizon = config['horizon']
        self.episode = 0
        self.logger = ScoreLogger(config['env_name'], config['exp_name'])
        self.env_name = config['env_name']
        self.exp_name = config['exp_name']
        self.device = config['device']
        # Make sure the model is on the appropriate device before constructing the optimizer
        self.train()
        self.optimizer = RMSprop(self.nec_net.parameters(),
                                 lr=config['learning_rate'],
                                 alpha=config['rmsprop_alpha'],
                                 eps=config['rmsprop_epsilon'])
        self.loss_fn = MSELoss()

    def train(self):
        self.training = True
        self.nec_net.train()

    def eval(self):
        self.training = False
        self.nec_net.eval()

    def new_episode(self):
        # Trackers for computing N-step returns and updating the replay and DND
        # memories at the end of the episode
        self.observations, self.keys, self.actions, self.values, self.rewards = [], [], [], [], []
        self.episode += 1

    def set_epsilon(self, eps):
        self.train_eps = eps

    def step(self, obs):
        q_values, key = self.nec_net.lookup(obs)
        eps = self.train_eps if self.training else self.eval_eps

        # Epsilon-greedy action selection
        action = np.random.choice(np.arange(self.num_actions)) \
            if np.random.rand() < eps else _argmax(q_values)

        # Update trackers
        if self.training:
            self.actions.append(action)
            self.observations.append(obs)
            self.keys.append(key)
            self.values.append(np.max(q_values))

        return action

    def update(self, consequence):
        """
        Called from the main training loop to inform the agent of the consequence
        of the last action, including the reward and whether the episode terminated.
        """
        reward, done = consequence

        if self.env_name.startswith("CartPole"):
            reward = reward if not done else -reward

        # Update reward tracker
        self.rewards.append(reward)

        if done:
            episode_length = len(self.actions)

            # Compute N-step returns in reverse order
            returns, n_step_returns = [None] * (episode_length + 1), [None] * episode_length
            returns[episode_length] = 0

            for t in range(episode_length - 1, -1, -1):
                returns[t] = self.rewards[t] + self.discount * returns[t + 1]
                if episode_length - t > self.n_step_horizon:
                    n_step_returns[t] = returns[t] + self.discount**self.n_step_horizon * (
                        self.values[t + self.n_step_horizon] - returns[t + self.n_step_horizon])
                else:
                    # Use on-policy Monte Carlo returns when below the horizon
                    n_step_returns[t] = returns[t]

            # Stack for fancy indexing
            self.keys, n_step_returns = torch.stack(self.keys), np.array(
                n_step_returns, dtype=np.float32)

            # Batch update of replay memory
            self.replay_buffer.append_batch(
                np.stack(self.observations),
                np.asarray(self.actions, dtype=np.int64),
                n_step_returns)

            # Batch update of episodic memories
            unique_actions = np.unique(self.actions)
            for action in unique_actions:
                action_idxs = np.nonzero(np.asarray(self.actions) == action)[0]
                self.nec_net.update_memory(action, self.keys[action_idxs],
                                           n_step_returns[action_idxs])

            # Save/log metrics for plotting
            solved = self.logger.add_score(sum(self.rewards), self.episode)
            if solved:
                path = f'{os.getcwd()}/cartpole/trained_agents/nec_{self.exp_name}.pth'
                torch.save(self.nec_net.state_dict(), path)
                return True

        return False

    def optimize(self):
        """
        Sample from the replay buffer and train the NEC model end-to-end with backprop.
        """
        if self.replay_buffer.size() < self.batch_size:
            return

        observations, actions, returns = self.replay_buffer.sample(self.batch_size)
        self.optimizer.zero_grad()
        # Pick Q-values for the chosen actions
        q_values = self.nec_net(observations.to(self.device))[range(self.batch_size), actions]
        loss = self.loss_fn(q_values, returns.to(self.device))
        loss.backward()
        self.optimizer.step()

    def get_q_values(self, observations, actions):
        """
        Compute Q-values for the observation/action pairs passed in. Used for testing.
        """
        with torch.no_grad():
            self.eval()
            observations = torch.from_numpy(observations)
            q_values = self.nec_net(observations)[range(len(actions)), actions]
            return q_values.numpy()
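
The main training loop that calls into NECAgent is not included in this listing. The following is a minimal sketch of how such a loop might drive the agent using only the methods defined above (new_episode, step, update, optimize); the gym-style env interface is an assumption.

# Minimal sketch of a training loop around NECAgent; the env interface is
# assumed, the agent methods are the ones defined above.
def run_training(env, agent, num_episodes):
    for _ in range(num_episodes):
        obs = env.reset()
        agent.new_episode()
        done = False
        while not done:
            action = agent.step(obs)
            obs, reward, done, _ = env.step(action)
            solved = agent.update((reward, done))   # returns True once the task is solved
            agent.optimize()                        # no-op until the buffer holds a full batch
            if solved:
                return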
def train(sess, env, actor, critic, RESTORE):

    sess.run(tf.global_variables_initializer())

    # Initialize random noise generator
    exploration_noise = OUNoise(env.action_space.shape[0])

    # Initialize target network weights
    actor.update_target_network()
    critic.update_target_network()

    # Initialize replay buffer
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    totSteps = 0

    # Store Q-values for illustration purposes
    q_max_array = []

    actor.learning_rate = MAX_ACTOR_LEARNING_RATE
    critic.learning_rate = MAX_CRITIC_LEARNING_RATE

    for i in xrange(MAX_EPISODES):

        s = env.reset()
        s = normalize(s)
        ep_reward = 0
        ep_ave_max_q = 0

        # Update learning rates using cosine annealing
        T_cur = i % LR_CYCLE
        actor.learning_rate = MIN_ACTOR_LEARNING_RATE + \
            0.5 * (MAX_ACTOR_LEARNING_RATE - MIN_ACTOR_LEARNING_RATE) * \
            (1 + np.cos(np.pi * T_cur / LR_CYCLE))
        critic.learning_rate = MIN_CRITIC_LEARNING_RATE + \
            0.5 * (MAX_CRITIC_LEARNING_RATE - MIN_CRITIC_LEARNING_RATE) * \
            (1 + np.cos(np.pi * T_cur / LR_CYCLE))

        for j in xrange(MAX_EP_STEPS):
            totSteps += 1

            # Begin "Experimentation and Evaluation Phase"
            # Select next experimental action by adding noise to the action prescribed by the policy
            a = actor.predict(np.reshape(s, (1, actor.s_dim, 1)))

            # If in a testing episode, do not add noise
            if i < EXPLORATION_SIZE and i % 100 not in (49, 99):
                noise = exploration_noise.noise()
                a = a + noise

            # Constrain action
            a = np.clip(a, -15, 15)

            # Take step with experimental action
            s2, r, terminal, info = env.step(
                np.reshape(a.T, newshape=(env.action_space.shape[0],)),
                CONST_THROTTLE)
            # print("car pos: " + str(env.car_dist_s))
            # print("action: " + str(a))
            # print("reward: " + str(r))
            s2 = normalize(s2)

            # Add transition to replay buffer if not a testing episode
            if i % 100 not in (49, 99):
                replay_buffer.add(np.reshape(s, (actor.s_dim, 1)),
                                  np.reshape(a, (actor.a_dim,)), r, terminal,
                                  np.reshape(s2, (actor.s_dim, 1)))

            # Keep adding experience to the memory until
            # there are at least minibatch-size samples
            if replay_buffer.size() > MEMORY_WARMUP:
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Find target estimate to use for updating the Q-function
                # predict_target determines the Q-value of the next state
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                # Complete target estimate (r(t+1) + gamma * Q(s(t+1), a(t+1)))
                y_i = []
                for k in xrange(MINIBATCH_SIZE):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Perform gradient descent to update the critic
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))

                ep_ave_max_q += np.amax(predicted_q_value, axis=0)

                # Perform "Learning" phase by moving policy parameters
                # in the direction of the deterministic policy gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            s = s2
            ep_reward += r

            # If the episode is finished, print results
            if terminal:
                if i % 100 in (49, 99):
                    print("Testing")
                    # Export the current (non-target) actor network as a Keras model
                    kmodel = Sequential()
                    actVars = []
                    for var in tf.trainable_variables():
                        if 'non-target' in str(var):
                            actVars.append(var)
                    kmodel.add(Dense(units=l1size, activation='tanh',
                                     weights=[sess.run(actVars[0]), sess.run(actVars[1])],
                                     input_dim=actor.s_dim))
                    kmodel.add(Dense(units=l2size, activation='tanh',
                                     weights=[sess.run(actVars[2]), sess.run(actVars[3])]))
                    kmodel.add(Dense(units=1, activation='tanh',
                                     weights=[sess.run(actVars[4]), sess.run(actVars[5])]))
                    optimizer = optimizers.RMSprop(lr=0.00025, rho=0.9, epsilon=1e-06)
                    kmodel.compile(loss="mse", optimizer=optimizer)
                    kmodel.save(modelfile)
                else:
                    print("Training")

                print('| Reward: %.2i' % int(ep_reward), " | Episode", i,
                      '| Qmax: %.4f' % (ep_ave_max_q / float(j)))
                q_max_array.append(ep_ave_max_q / float(j))
                print('Finished in ' + str(j) + ' steps')
                break

    plt.plot(q_max_array)
    plt.xlabel('Episode Number')
    plt.ylabel('Max Q-Value')
    plt.show()

    # Export the final actor network as a Keras model
    kmodel = Sequential()
    actVars = []
    for var in tf.trainable_variables():
        if 'non-target' in str(var):
            actVars.append(var)
    kmodel.add(Dense(units=l1size, activation='tanh',
                     weights=[sess.run(actVars[0]), sess.run(actVars[1])],
                     input_dim=actor.s_dim))
    kmodel.add(Dense(units=l2size, activation='tanh',
                     weights=[sess.run(actVars[2]), sess.run(actVars[3])]))
    kmodel.add(Dense(units=1, activation='tanh',
                     weights=[sess.run(actVars[4]), sess.run(actVars[5])]))
    optimizer = optimizers.RMSprop(lr=0.00025, rho=0.9, epsilon=1e-06)
    kmodel.compile(loss="mse", optimizer=optimizer)
    kmodel.summary()
    kmodel.save(modelfile)
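
Because the actor weights are exported with kmodel.save(modelfile), the trained policy can later be reloaded with the standard Keras API. A sketch follows; it assumes the same script context (modelfile, normalize, env, actor defined as above), and any rescaling of the tanh output back to the environment's steering range is not shown since it depends on the actor's output layer.

# Sketch of reloading the exported actor and querying it for an action.
# keras.models.load_model is standard Keras API; everything else reuses
# names from the training script above.
from keras.models import load_model

policy = load_model(modelfile)
state = normalize(env.reset())
action = policy.predict(np.reshape(state, (1, actor.s_dim)))  # tanh output in [-1, 1]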