class DDPG():
    def __init__(self, task, sess):
        self.sess = sess
        self.env = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high
        self.actor_lr = 0.0001
        self.tau = 0.001
        self.minibatch_size = 64
        self.critic_lr = 0.001
        self.gamma = 0.99
        self.buffer_size = 1000000
        self.random_seed = 1234
        self.summary_dir = "/"
        #self.max_episode = 100
        #self.max_episode_len = 100
        self.mu = 0

        self.actor = ActorNetwork(self.sess, self.state_size, self.action_size,
                                  self.action_low, self.action_high,
                                  self.actor_lr, self.tau, self.minibatch_size)
        self.critic = CriticNetwork(self.sess, self.state_size, self.action_size,
                                    self.critic_lr, self.tau, self.gamma,
                                    self.actor.get_num_trainable_vars())

        # Initialize replay memory
        self.replay_buffer = ReplayBuffer(self.buffer_size, self.random_seed)

        # Noise process
        self.noise = OUNoise(self.action_size, self.mu)

        self.sess.run(tf.global_variables_initializer())
        self.actor.update_target_network()
        self.critic.update_target_network()

    def reset_episode(self):
        #self.actor_noise.reset()
        state = self.env.reset()
        self.last_state = state
        self.ep_ave_max_q = 0
        self.ep_reward = 0
        return state

    def step(self, s, a, r, terminal, s2):
        # Save experience / reward
        #summary_ops, summary_vars = self.build_summaries()
        self.replay_buffer.add(np.reshape(s, (self.actor.s_dim,)),
                               np.reshape(a, (self.actor.a_dim,)), r, terminal,
                               np.reshape(s2, (self.actor.s_dim,)))

        # Learn, if enough samples are available in memory
        if self.replay_buffer.size() > self.minibatch_size:
            s_batch, a_batch, r_batch, t_batch, s2_batch = self.replay_buffer.sample_batch(
                self.minibatch_size)
            self.train(s_batch, a_batch, r_batch, t_batch, s2_batch)

        # Roll over last state and action
        self.last_state = s2

        '''
        self.ep_reward += r
        if terminal:
            summary_str = self.sess.run(
                summary_ops,
                feed_dict={summary_vars[0]: self.ep_reward,
                           summary_vars[1]: self.ep_ave_max_q / float(j)})
            writer.add_summary(summary_str, i)
            #writer.flush()
            print('| Reward: {:d} | Qmax: {:.4f}'.format(
                int(self.ep_reward), (self.ep_ave_max_q / float(j))))
        '''

    def act(self, states):
        """Returns actions for given state(s) as per current policy."""
        states = np.reshape(states, [-1, self.state_size])
        actions = self.actor.predict(states)[0]
        #actor_noise = OrnsteinUhlenbeckActionNoise(mu=np.zeros(self.action_size))
        return actions + self.noise.sample()  # add some noise for exploration

    def train(self, s_batch, a_batch, r_batch, t_batch, s2_batch):
        target_q = self.critic.predict_target(
            s2_batch, self.actor.predict_target(s2_batch))

        y_i = []
        for k in range(self.minibatch_size):
            if t_batch[k]:
                y_i.append(r_batch[k])
            else:
                y_i.append(r_batch[k] + self.critic.gamma * target_q[k])

        # Update the critic given the targets
        predicted_q_value, _ = self.critic.train(
            s_batch, a_batch, np.reshape(y_i, (self.minibatch_size, 1)))
        #self.ep_ave_max_q += np.amax(predicted_q_value)

        # Update the actor policy using the sampled gradient
        a_outs = self.actor.predict(s_batch)
        grads = self.critic.action_gradients(s_batch, a_outs)
        self.actor.train(s_batch, grads[0])

        # Update target networks
        self.actor.update_target_network()
        self.critic.update_target_network()

    def build_summaries(self):
        episode_reward = tf.Variable(0.)
        tf.summary.scalar("Reward", episode_reward)
        episode_ave_max_q = tf.Variable(0.)
        tf.summary.scalar("Qmax Value", episode_ave_max_q)
        summary_vars = [episode_reward, episode_ave_max_q]
        summary_ops = tf.summary.merge_all()
        return summary_ops, summary_vars
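# Note: the OUNoise class used by every agent in this collection is never
# defined here. Below is a minimal sketch of an Ornstein-Uhlenbeck noise
# process matching the (size, mu, theta, sigma) constructor used by the
# Keras/TF agents; the PyTorch snippets pass a random seed as the second
# argument, so their version differs. Treat this as an illustration only,
# not the original implementation.
import copy
import numpy as np


class OUNoise:
    """Ornstein-Uhlenbeck process: dx = theta * (mu - x) + sigma * N(0, 1)."""

    def __init__(self, size, mu=0.0, theta=0.15, sigma=0.2):
        self.mu = mu * np.ones(size)
        self.theta = theta
        self.sigma = sigma
        self.reset()

    def reset(self):
        """Reset the internal state to the mean."""
        self.state = copy.copy(self.mu)

    def sample(self):
        """Update the internal state and return it as a noise sample."""
        x = self.state
        dx = self.theta * (self.mu - x) + self.sigma * np.random.standard_normal(self.mu.shape)
        self.state = x + dx
        return self.state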
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.3
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01    # for soft update of target parameters

        self.score = 0
        self.best_score = -np.inf
        self.noise_scale = 0.1

    def reset_episode(self):
        self.noise.reset()
        self.total_reward = 0.0
        self.count = 0
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)
        self.total_reward += reward
        self.count += 1

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

        self.score = self.total_reward / float(self.count) if self.count else 0.0
        if self.score > self.best_score:
            self.best_score = self.score
            self.noise_scale = max(0.5 * self.noise_scale, 0.01)
        else:
            self.noise_scale = min(2.0 * self.noise_scale, 3.2)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
action_range = action_high - action_low

# Start with a random action
action = np.array([np.random.uniform() for _ in action_low])
noise = OUNoise(action.shape[0], exploration_mu, exploration_theta, exploration_sigma)

time_limit = 10
for i in range(10):
    start_time = time.time()
    env.reset()
    done = False
    j = 0
    while not done:
        j += 1
        ns = noise.sample()
        action = action + ns
        v_size, angle, speed = np.array(
            transform_action(action, action_range, action_low), dtype='uint8')
        ns, rw, done = env.step((v_size, angle, speed))

        if time.time() - start_time > time_limit:
            print("Time limit, will reset")
            done = True
            #env.reset()

        #cv2.imshow('frame', frame[28:112, :])
        if cv2.waitKey(25) & 0xFF == ord('q'):
            cv2.destroyAllWindows()
            #asb.stop()
#h, w = buff.get_device_screen_shape()
exploration_mu = 0
exploration_theta = 0.15
exploration_sigma = 0.2

action_size = 3
action_low = np.array([1, 0, 1])
action_high = np.array([10, 359, 2000])
action_range = action_high - action_low

action = np.array([np.random.uniform() for _ in action_low])
noise = OUNoise(action.shape[0], exploration_mu, exploration_theta, exploration_sigma)

values = np.zeros((100, 3))
iis = [i for i in range(100)]
for i in iis:
    action = action + noise.sample()
    action = np.array(transform_action(action, action_range, action_low), dtype='uint8')
    values[i] = action

print(values.shape)

fig, ax = plt.subplots(3, sharex='col', sharey='row')
labels = ['Size', 'Angle', 'Duration']
for i in range(3):
    ax[i].plot(iis, values[:, i])
    ax[i].set_xlabel(labels[i])
plt.grid()
plt.show()
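# Note: transform_action is used in the two scripts above but never defined
# in these snippets. A plausible minimal sketch is given below, assuming each
# raw action component is a fraction (roughly in [0, 1]) that should be scaled
# into [action_low, action_high]; the real helper may behave differently.
import numpy as np


def transform_action(action, action_range, action_low):
    """Scale raw action fractions into the environment's action bounds."""
    return action_low + np.clip(action, 0.0, 1.0) * action_range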
class Agent:
    """Interacts with and learns from the environment."""

    def __init__(self, device, state_size, action_size, random_seed,
                 fc1=128, fc2=128, lr_actor=1e-04, lr_critic=1e-04,
                 weight_decay=0, buffer_size=100000, batch_size=64,
                 gamma=0.99, tau=1e-3):
        """
        Parameters
        ----------
        device: CPU/GPU
        state_size (int): dimension of each state
        action_size (int): dimension of each action
        random_seed (int): random seed
        fc1 (int): 1st fully connected layer size for model (actor & critic)
        fc2 (int): 2nd fully connected layer size for model (actor & critic)
        lr_actor (float): learning rate for Actor
        lr_critic (float): learning rate for Critic
        weight_decay (float): weight decay used in model optimizer
        buffer_size (int): replay buffer size
        batch_size (int): batch size to sample from buffer
        gamma (float): discount factor used to calculate the Q target
        tau (float): soft update interpolation parameter
        """
        self.device = device
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau

        # Actor network (with target network)
        self.actor_local = Actor(self.state_size, self.action_size, random_seed,
                                 fc1_units=fc1, fc2_units=fc2).to(device)
        self.actor_target = Actor(self.state_size, self.action_size, random_seed,
                                  fc1_units=fc1, fc2_units=fc2).to(device)
        self.actor_optimizer = optim.Adam(params=self.actor_local.parameters(), lr=lr_actor)

        # Critic network (with target network)
        self.critic_local = Critic(self.state_size, self.action_size, random_seed,
                                   fc1_units=fc1, fc2_units=fc2).to(device)
        self.critic_target = Critic(self.state_size, self.action_size, random_seed,
                                    fc1_units=fc1, fc2_units=fc2).to(device)
        self.critic_optimizer = optim.Adam(params=self.critic_local.parameters(),
                                           lr=lr_critic, weight_decay=weight_decay)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, buffer_size, batch_size,
                                   self.device, random_seed)

        self.make_copy(self.critic_local, self.critic_target)
        self.make_copy(self.actor_local, self.actor_target)

        print("Initialized agent with state size = {} and action size = {}".format(
            self.state_size, self.action_size))

    def make_copy(self, local_model, target_model):
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(local_param.data)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use a random sample from the buffer to learn."""
        self.memory.add(state, action, reward, next_state, done)
        if len(self.memory) > self.batch_size:
            batch = self.memory.sample()
            self.learn(batch)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, batch):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Parameters
        ----------
        batch (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
        """
        states, actions, rewards, next_states, dones = batch

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_target_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for next states (y_i)
        Q_targets = rewards + (self.gamma * Q_target_next * (1.0 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimise loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimise loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target)
        self.soft_update(self.actor_local, self.actor_target)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Parameters
        ----------
        local_model: PyTorch model (weights will be copied from)
        target_model: PyTorch model (weights will be copied to)
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(self.tau * local_param.data +
                                    (1.0 - self.tau) * target_param.data)
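# Note: ReplayBuffer is assumed but not shown anywhere in this collection.
# Below is a minimal sketch compatible with the
# (action_size, buffer_size, batch_size, device, seed) constructor used by the
# PyTorch agent above; the Keras/TF agents use different constructors, so this
# is an illustration only.
import random
from collections import deque, namedtuple

import numpy as np
import torch


class ReplayBuffer:
    """Fixed-size buffer holding experience tuples."""

    def __init__(self, action_size, buffer_size, batch_size, device, seed):
        self.memory = deque(maxlen=buffer_size)
        self.batch_size = batch_size
        self.device = device
        self.experience = namedtuple(
            "Experience", ["state", "action", "reward", "next_state", "done"])
        random.seed(seed)

    def add(self, state, action, reward, next_state, done):
        self.memory.append(self.experience(state, action, reward, next_state, done))

    def sample(self):
        experiences = random.sample(self.memory, k=self.batch_size)

        def to_tensor(values, dtype=np.float32):
            return torch.from_numpy(np.vstack(values).astype(dtype)).float().to(self.device)

        states = to_tensor([e.state for e in experiences])
        actions = to_tensor([e.action for e in experiences])
        rewards = to_tensor([e.reward for e in experiences])
        next_states = to_tensor([e.next_state for e in experiences])
        dones = to_tensor([e.done for e in experiences], dtype=np.uint8)
        return states, actions, rewards, next_states, dones

    def __len__(self):
        return len(self.memory)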
class DDPG():
    """DDPG agent with its own actor and critic."""

    def __init__(self, agent_id, model, action_size=2, seed=0):
        """Initialize an Agent object."""
        self.seed = random.seed(seed)
        self.id = agent_id
        self.action_size = action_size

        # Actor Network
        self.actor_local = model.actor_local
        self.actor_target = model.actor_target
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Critic Network
        self.critic_local = model.critic_local
        self.critic_target = model.critic_target
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC, weight_decay=WEIGHT_DECAY)

        # Initialize the target actor and critic with the same weights as the local networks
        self.hard_copy_weights(self.actor_target, self.actor_local)
        self.hard_copy_weights(self.critic_target, self.critic_local)

        # Noise process
        self.noise = OUNoise(action_size, seed)

    def hard_copy_weights(self, target, source):
        """Copy weights from source to target network (part of initialization)."""
        for target_param, param in zip(target.parameters(), source.parameters()):
            target_param.data.copy_(param.data)

    def act(self, state, noise_weight=1.0, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            self.noise_val = self.noise.sample() * noise_weight
            action += self.noise_val
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, agent_id, experiences, gamma, all_next_actions, all_actions):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        self.critic_optimizer.zero_grad()
        agent_id = torch.tensor([agent_id]).to(device)
        actions_next = torch.cat(all_next_actions, dim=1).to(device)
        with torch.no_grad():
            q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i):
        # q_targets = reward of this timestep + discount * Q(s_t+1, a_t+1) from target network
        q_expected = self.critic_local(states, actions)
        q_targets = rewards.index_select(1, agent_id) + \
            (gamma * q_targets_next * (1 - dones.index_select(1, agent_id)))
        # Compute critic loss
        critic_loss = F.mse_loss(q_expected, q_targets.detach())
        # Minimize the loss
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        self.actor_optimizer.zero_grad()
        # Detach the actions of the other agents
        actions_pred = [actions if i == self.id else actions.detach()
                        for i, actions in enumerate(all_actions)]
        actions_pred = torch.cat(actions_pred, dim=1).to(device)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent():
    def __init__(self, state_size, action_size, num_agents, device,
                 gamma=GAMMA, tau=TAU, lr_actor=LR_ACTOR, lr_critic=LR_CRITIC,
                 random_seed=0):
        """Initialize an Agent object.

        :param state_size: size of state
        :param action_size: size of action
        :param num_agents: number of agents
        :param gamma: discount factor
        :param tau: factor for soft update of target parameters
        :param lr_actor: learning rate of actor
        :param lr_critic: learning rate of critic
        :param random_seed: random seed
        :param device: cuda or cpu
        """
        self.device = device
        self.gamma = gamma
        self.tau = tau
        self.num_agents = num_agents

        self.state_size = state_size
        self.action_size = action_size
        self.full_state_size = state_size * num_agents
        self.full_action_size = action_size * num_agents
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, device, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, device, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(self.full_state_size, self.full_action_size,
                                   device=device, random_seed=random_seed).to(device)
        self.critic_target = Critic(self.full_state_size, self.full_action_size,
                                    device=device, random_seed=random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic, weight_decay=0)

        self.noise = OUNoise(action_size, random_seed)

    def save_model(self, agent_number):
        torch.save(self.actor_local.state_dict(),
                   f'models/checkpoint_actor_{agent_number}.pth')
        torch.save(self.critic_local.state_dict(),
                   f'models/checkpoint_critic_{agent_number}.pth')

    def load_model(self, agent_number):
        checkpoint = torch.load(f'models/checkpoint_actor_{agent_number}.pth',
                                map_location=torch.device('cpu'))
        self.actor_local.load_state_dict(checkpoint)
        checkpoint = torch.load(f'models/checkpoint_critic_{agent_number}.pth',
                                map_location=torch.device('cpu'))
        self.critic_local.load_state_dict(checkpoint)

    def act(self, state, noise=0., train=False):
        """Returns actions for given state as per current policy.

        :param state: state as seen from a single agent
        """
        if train is True:
            self.actor_local.train()
        else:
            self.actor_local.eval()

        action = self.actor_local(state)
        if noise > 0:
            noise = torch.tensor(noise * self.noise.sample(),
                                 dtype=state.dtype, device=state.device)
        return action + noise

    def target_act(self, state, noise=0.):
        #self.actor_target.eval()
        # convert to cpu() since noise is in cpu()
        self.actor_target.eval()
        action = self.actor_target(state).cpu()
        if noise > 0.:
            noise = torch.tensor(noise * self.noise.sample(),
                                 dtype=state.dtype, device=state.device)
        return action + noise

    def update_critic(self, rewards, dones, all_states, all_actions,
                      all_next_states, all_next_actions):
        with torch.no_grad():
            Q_targets_next = self.critic_target(all_next_states, all_next_actions)
        # Compute Q targets for current states (y_i)
        q_targets = rewards + (self.gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        q_expected = self.critic_local(all_states, all_actions)
        # critic_loss = F.mse_loss(q_expected, q_targets)
        critic_loss = ((q_expected - q_targets.detach()) ** 2).mean()
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

    def update_actor(self, all_states, all_predicted_actions):
        """Update actor network.

        :param all_states: all states
        :param all_predicted_actions: all predicted actions
        """
        actor_loss = -self.critic_local(all_states, all_predicted_actions).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward(retain_graph=True)
        self.actor_optimizer.step()

    def update_targets(self):
        self.soft_update(self.actor_local, self.actor_target, self.tau)
        self.soft_update(self.critic_local, self.critic_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)

    def reset(self):
        self.noise.reset()
class Agent():
    """DDPG Agent, interacts with and learns from the environment."""

    def __init__(self, device, state_size, n_agents, action_size, random_seed,
                 buffer_size, batch_size, gamma, TAU, lr_actor, lr_critic,
                 weight_decay, learn_interval, learn_num, ou_sigma, ou_theta,
                 checkpoint_folder='./'):
        # Set computational device
        self.DEVICE = device

        # Init state, action and agent dimensions
        self.state_size = state_size
        self.n_agents = n_agents
        self.action_size = action_size
        self.seed = random.seed(random_seed)
        self.l_step = 0
        self.log_interval = 200

        # Init hyperparameters
        self.BUFFER_SIZE = buffer_size
        self.BATCH_SIZE = batch_size
        self.GAMMA = gamma
        self.TAU = TAU
        self.LR_ACTOR = lr_actor
        self.LR_CRITIC = lr_critic
        self.WEIGHT_DECAY = weight_decay
        self.LEARN_INTERVAL = learn_interval
        self.LEARN_NUM = learn_num

        # Init Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

        # Init Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic, weight_decay=weight_decay)

        # Init Noise Process
        self.noise = OUNoise((n_agents, action_size), random_seed,
                             mu=0., theta=ou_theta, sigma=ou_sigma)

        # Init Replay Memory
        self.memory = ReplayBuffer(device, action_size, buffer_size, batch_size, random_seed)

    # think
    def act(self, states, add_noise=True):
        """Decide what action to take next."""
        # Evaluate the states through actor_local
        states = torch.from_numpy(states).float().to(self.DEVICE)
        actions = np.zeros((self.n_agents, self.action_size))
        self.actor_local.eval()  # put actor_local network in "evaluation" mode
        with torch.no_grad():
            for n, state in enumerate(states):
                actions[n, :] = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()  # put actor_local back into "training" mode

        # Add noise for exploration
        if add_noise:
            actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    # embody
    def step(self, t, s, a, r, s_, done):
        """Commit a step into the brain."""
        # Save SARS' (state-action-reward-next_state) tuples to the replay buffer
        for n in range(self.n_agents):
            self.memory.add(s[n], a[n], r[n], s_[n], done[n])

        if t % self.LEARN_INTERVAL != 0:
            return

        # Learn (if enough samples are available in memory)
        if len(self.memory) > self.BATCH_SIZE:
            for _ in range(self.LEARN_NUM):
                experiences = self.memory.sample()  # get a memory sample
                self.learn(experiences, self.GAMMA)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Learn from experiences, with discount factor gamma.

        Update policy and value parameters using given batch of experience tuples.
        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params:
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ------ Update Critic ------ #
        # Get predicted next-state actions and Q values from target networks
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        # torch.nn.utils.clip_grad_norm(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ------ Update Actor ------ #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ------ Update Target Networks ------ #
        self.soft_update(self.critic_local, self.critic_target, self.TAU)
        self.soft_update(self.actor_local, self.actor_target, self.TAU)

        # self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, num_agents, num_all_agents,
                 seed, batch_size, buffer_size=int(1e6), gamma=0.99, tau=1e-3,
                 lr_actor=4e-4, lr_critic=4e-4, weight_decay=0, discrete_actions=False):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_all_agents (int): number of agents
            seed (int): random seed
            batch_size (int): minibatch size
            buffer_size (int): replay buffer size
            gamma (float): discount factor
            tau (float): for soft update of target parameters
            lr_actor (float): learning rate of the actor
            lr_critic (float): learning rate of the critic
            weight_decay (float): L2 weight decay
        """
        random.seed(seed)
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.num_all_agents = num_all_agents
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.noise = OUNoise(action_size, seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, seed,
                                 use_batch_norm_layers=False).to(device)
        self.actor_target = Actor(state_size, action_size, seed,
                                  use_batch_norm_layers=False).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

        # Critic Network (w/ Target Network)
        if discrete_actions:
            action_size = 1
        self.critic_local = Critic(state_size * num_all_agents,
                                   action_size * num_all_agents, seed).to(device)
        self.critic_target = Critic(state_size * num_all_agents,
                                    action_size * num_all_agents, seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic, weight_decay=weight_decay)

    def act(self, states, add_noise=True):
        """Returns actions for given state as per current policy."""
        states = torch.from_numpy(states).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma, tau, agent_index):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        (states_all, actions_all, rewards_all, next_states_all, dones,
         actions_next_target_all, actions_next_local_all) = experiences
        rewards_self = rewards_all[:, agent_index]
        states_self = states_all.view(-1, self.num_all_agents, self.state_size)[:, agent_index, :]
        del rewards_all

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        Q_targets_next = self.critic_target(next_states_all, actions_next_target_all)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards_self + (gamma * Q_targets_next) * (1 - dones)
        # Compute critic loss
        Q_expected = self.critic_local(states_all, actions_all)
        critic_loss = F.mse_loss(Q_expected.view(-1, self.batch_size), Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actor_loss = -self.critic_local(states_all, actions_next_local_all).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, tau)
        self.soft_update(self.actor_local, self.actor_target, tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent:
    """Interacts with and learns from the environment."""

    def __init__(self, num_agents, state_size, action_size,
                 buffer_size=int(1e5), batch_size=128, gamma=0.99, tau=1e-3,
                 lr_actor=1e-4, lr_critic=1e-3, weight_decay=0, random_seed=2):
        """Initialize an Agent object.

        Params
        ======
            num_agents (int): number of agents
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            random_seed (int): random seed
        """
        self.num_agents = num_agents
        self.state_size = state_size
        self.action_size = action_size
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau
        self.seed = random.seed(random_seed)

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(device)
        self.actor_target = Actor(state_size, action_size, random_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(device)
        self.critic_target = Critic(state_size, action_size, random_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=lr_critic, weight_decay=weight_decay)

        # Noise process
        self.noise = OUNoise((num_agents, action_size), random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size=action_size, buffer_size=buffer_size,
                                   batch_size=batch_size, seed=random_seed)

    def step(self, state, action, reward, next_state, done):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences, self.gamma)

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)  # clip gradients at 1
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class DDPGAgent:
    '''DDPG Agent implementation.'''

    def __init__(self, agent_id, state_size, action_size, rand_seed, meta_agent):
        """Creates a new DDPG Agent."""
        self.agent_id = agent_id
        self.action_size = action_size

        # Defines the Actor Networks
        self.actor_local = Actor(state_size, action_size, rand_seed).to(device)
        self.actor_target = Actor(state_size, action_size, rand_seed).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=LR_ACTOR)

        # Defines the Critic Networks
        self.critic_local = Critic(state_size, action_size, meta_agent.agents_qty, rand_seed).to(device)
        self.critic_target = Critic(state_size, action_size, meta_agent.agents_qty, rand_seed).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(), lr=LR_CRITIC)  #, weight_decay=WEIGHT_DECAY)

        self.noise = OUNoise(action_size, rand_seed)

        # Refers to the multi-agent (meta agent) shared memory
        self.memory = meta_agent.memory
        self.t_step = 0

    def step(self):
        # Takes a step
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def learn(self, experiences, gamma):
        (states_list, actions_list, rewards, next_states_list, dones) = experiences

        # Get the target actions for all the next states
        l_all_next_actions = []
        for next_states in next_states_list:
            l_all_next_actions.append(self.actor_target(next_states))

        # Concatenate the experiences into full-observation tensors
        all_next_actions = torch.cat(l_all_next_actions, dim=1).to(device)
        all_next_states = torch.cat(next_states_list, dim=1).to(device)
        all_states = torch.cat(states_list, dim=1).to(device)
        all_actions = torch.cat(actions_list, dim=1).to(device)

        # --------------------------- update critic ---------------------------
        # Calculates the Q targets using all the next states and next actions
        Q_targets_next = self.critic_target(all_next_states, all_next_actions)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        Q_expected = self.critic_local(all_states, all_actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # --------------------------- update actor ---------------------------
        actions_pred = []
        for states in states_list:
            actions_pred.append(self.actor_local(states))
        actions_pred = torch.cat(actions_pred, dim=1).to(device)
        actor_loss = -self.critic_local(all_states, actions_pred).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ---------------------- update target networks ----------------------
        self.soft_update(self.critic_local, self.critic_target, TAU)
        self.soft_update(self.actor_local, self.actor_target, TAU)

    def act(self, states, add_noise=True):
        """Returns the actions to be taken by the agent."""
        states = torch.from_numpy(states).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            actions = self.actor_local(states).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def soft_update(self, local_model, target_model, tau):
        """Performs the soft update."""
        iter_params = zip(target_model.parameters(), local_model.parameters())
        for target_param, local_param in iter_params:
            tensor_aux = tau * local_param.data + (1.0 - tau) * target_param.data
            target_param.data.copy_(tensor_aux)
class DDPG:
    def __init__(self, task):
        # Hyperparameters
        self.learning_rate_actor = 1e-4
        self.learning_rate_critic = 1e-3
        self.gamma = 0.99
        self.tau = 0.001

        # Define networks
        self.sess = tf.Session()
        self.task = task
        self.actor = ActorNet(self.sess, self.task.state_size, self.task.action_size,
                              self.learning_rate_actor, self.task.action_low,
                              self.task.action_high, self.tau)
        self.critic = CriticNet(self.sess, self.task.state_size, self.task.action_size,
                                self.learning_rate_critic, self.tau)

        # Define noise
        self.mu = 0
        self.theta = 0.15
        self.sigma = 0.20
        self.noise = OUNoise(self.task.action_size, self.mu, self.theta, self.sigma)

        # Define memory replay
        self.buffer_size = 1000000
        self.batch_size = 64
        self.memory = Replay(self.buffer_size, self.batch_size)

        # Score
        self.best_score = -np.inf
        self.best_reward = -np.inf

    def reset(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        self.total_reward = 0.0
        self.count = 0
        return state

    def learn(self, experience):
        # Turn the experience tuples into separate np arrays
        state_batch = np.vstack([e[0] for e in experience])
        action_batch = np.vstack([e[1] for e in experience])
        reward_batch = np.vstack([e[2] for e in experience])
        next_state_batch = np.vstack([e[3] for e in experience])
        done_batch = np.vstack([e[4] for e in experience])

        # Calculate next-state Q values
        next_action_batch = self.actor.target_actions(next_state_batch)
        next_q_targets = self.critic.targetQ(next_state_batch, next_action_batch)

        # Train critic net
        q_targets = reward_batch + self.gamma * next_q_targets * (1 - done_batch)
        self.critic.train(state_batch, action_batch, q_targets)

        # Train actor net
        action_gradients = self.critic.gradients(state_batch, action_batch)
        self.actor.train(action_gradients, state_batch)

        # Update target networks
        self.actor.update_target(False)
        self.critic.update_target(False)

    def step(self, action, reward, next_state, done):
        self.memory.add([self.last_state, action, reward, next_state, done])
        self.total_reward += reward
        self.count += 1
        if done:
            self.score = self.total_reward / float(self.count) if self.count else 0.0
            self.best_score = max(self.best_score, self.score)
            self.best_reward = max(self.total_reward, self.best_reward)
        if len(self.memory.buffer) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)
        self.last_state = next_state

    def act(self, states):
        states = np.reshape(states, [-1, self.task.state_size])
        action = self.actor.actions(states)[0]
        return list(action + self.noise.sample())
class Agent(object):
    """The Agent interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, num_agents, random_seed=0, params=params):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)
        self.params = params

        # Actor (Policy) Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size, random_seed).to(self.params['DEVICE'])
        self.actor_target = Actor(state_size, action_size, random_seed).to(self.params['DEVICE'])
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(), lr=self.params['LR_ACTOR'])

        # Critic (Value) Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size, random_seed).to(self.params['DEVICE'])
        self.critic_target = Critic(state_size, action_size, random_seed).to(self.params['DEVICE'])
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=self.params['LR_CRITIC'],
                                           weight_decay=self.params['WEIGHT_DECAY'])

        # Initialize target and local networks to the same weights
        self.hard_update(self.actor_local, self.actor_target)
        self.hard_update(self.critic_local, self.critic_target)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.params['BUFFER_SIZE'],
                                   self.params['BATCH_SIZE'], random_seed)

    def hard_update(self, local_model, target_model):
        """Hard update model parameters."""
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(local_param.data)

    def step(self, states, actions, rewards, next_states, dones):
        """Save experiences in replay memory and use random samples from the buffer to learn."""
        # Save experience / reward, catering for multiple agents
        for state, action, reward, next_state, done in zip(states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        # Learn if enough samples are available in memory
        if len(self.memory) > self.params['BATCH_SIZE']:
            experiences = self.memory.sample()
            self.learn(experiences, self.params['GAMMA'])

    def act(self, states, add_noise=True):
        """Returns actions for a given state as per current policy."""
        states = torch.from_numpy(states).float().to(self.params['DEVICE'])
        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            for i, state in enumerate(states):
                actions[i, :] = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            actions += self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma=params['GAMMA']):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Update Critic (Value)
        # Get predicted next-state actions and Q-values from the target networks
        actions_next = self.actor_target(next_states)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimise the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1)  # stabilize learning per benchmark guidelines
        self.critic_optimizer.step()

        # Update Actor (Policy)
        # Compute actor loss
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update target networks
        self.soft_update(self.critic_local, self.critic_target, tau=self.params['TAU'])
        self.soft_update(self.actor_local, self.actor_target, tau=self.params['TAU'])

    def soft_update(self, local_model, target_model, tau=params['TAU']):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task):
        self.task = task
        self.session = K.get_session()
        init = tf.global_variables_initializer()
        self.session.run(init)

        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        self.score = -math.inf
        self.best_score = -math.inf
        self.last_loss = math.inf

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)
        self.noise_scale = (self.exploration_mu, self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 16
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.001   # for soft update of target parameters

    def reset_episode(self):
        self.noise.reset()
        self.total_reward = 0
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)
        self.total_reward += reward

        # Learn, if enough samples are available in memory
        print("Memory Size: {}, Batch Size: {}".format(len(self.memory), self.batch_size))
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        #state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(np.array([state]))[0]
        return list(action + self.noise.sample())  # add some noise for exploration

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        print("Fitting model iteration ...")

        # Convert experience tuples to separate arrays for each element (states, actions, rewards, etc.)
        states = np.array([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.array([e.next_state for e in experiences if e is not None])
        print("Next states shape: {}".format(next_states.shape))

        self.score = rewards.mean()
        self.best_score = max(self.score, self.best_score)

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local) with the custom training function
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        r = self.actor_local.train_fn([states, action_gradients, 1])
        self.last_loss = np.mean(-action_gradients * actions)

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)


if __name__ == "__main__":
    state_size = (84, 296, 9)
    action_low = np.array([1, 0, 1])
    action_high = np.array([10, 359, 2000])
    net = Actor(state_size, 3, action_low, action_high)
    #net = Critic(state_size, 3)
    net.model.summary()
class Agent():
    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_high = task.action_high
        self.action_low = task.action_low

        # Actor (policy) model
        self.actor_local = Actor(self.state_size, self.action_size, self.action_high, self.action_low)
        self.actor_target = Actor(self.state_size, self.action_size, self.action_high, self.action_low)

        # Critic (value) model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.actor_target.model.set_weights(self.actor_local.model.get_weights())
        self.critic_target.model.set_weights(self.critic_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.25
        self.exploration_sigma = 0.3
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay buffer
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.9  # discount rate
        self.tau = 0.1    # soft update parameter

        self.total_reward = 0
        self.count = 0
        self.score = 0
        self.best_score = -np.inf

        self.reset_episode()

    def reset_episode(self):
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Keep track of rewards
        self.total_reward += reward
        self.count += 1

        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # If there are enough experiences, learn from them
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        self.last_state = next_state

    def act(self, states):
        # Returns the action for a given state(s) as per the current policy
        state = np.reshape(states, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        return list(action + self.noise.sample())

    def learn(self, experiences):
        self.score = self.total_reward / float(self.count) if self.count else 0.0

        # Update the policy and value parameters given a batch of experience tuples
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences if e is not None]).astype(np.float32).reshape(-1, self.action_size)
        rewards = np.array([e.reward for e in experiences if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack([e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        next_actions = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch([next_states, next_actions])

        # Compute Q targets for current states and train the local critic model
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train the local actor model (custom training function)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients, 1])

        # Soft-update target models
        self.soft_update(self.actor_local.model, self.actor_target.model)
        self.soft_update(self.critic_local.model, self.critic_target.model)

    def soft_update(self, local_model, target_model):
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
class Agent:
    def __init__(self, state_size, action_size, random_seed):
        """Creates a new DDPG agent, initializing the networks."""
        self.state_size = state_size
        self.action_size = action_size
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, random_seed)

        self.critic = Critic(state_size, action_size, 17).to(device)
        self.critic_target = Critic(state_size, action_size, 17).to(device)
        self.critic_optimizer = optim.Adam(self.critic.parameters(), lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        self.actor = Actor(state_size, action_size, 17).to(device)
        self.actor_target = Actor(state_size, action_size, 17).to(device)
        self.actor_optimizer = optim.Adam(self.actor.parameters(), lr=LR_ACTOR)

        self.seed = random.seed(random_seed)

        # Noise process
        self.noise = OUNoise(action_size, random_seed)

    def next_action(self, states, add_noise=True):
        """Returns the next action to take."""
        states = torch.from_numpy(states).float().to(device)
        self.actor.eval()
        with torch.no_grad():
            action_values = self.actor(states).cpu().data.numpy()
        self.actor.train()
        actions = action_values
        if add_noise:
            actions += self.noise.sample()
        actions = np.clip(actions, -1, 1)
        return actions

    def step(self, state, action, reward, next_state, done):
        """Takes the next step, saving the data in the replay buffer and learning from new experiences."""
        # Save in the experience replay buffer
        for s, a, r, ns, d in zip(state, action, reward, next_state, done):
            self.memory.add(s, a, r, ns, d)

        # If there are sufficient memories in the replay buffer, learn
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            self._learn(experiences, GAMMA)

    def _learn(self, experiences, gamma):
        """Learns from new experiences."""
        state, action, reward, next_state, dones = experiences

        ## Update Critic
        # Get the next action from the target actor
        next_action = self.actor_target(next_state)
        # Train the critic using the experience
        Q_target_next = self.critic_target(next_state, next_action)
        Q_target = reward + (gamma * Q_target_next * (1 - dones))
        Q_expected = self.critic(state, action)
        critic_loss = F.mse_loss(Q_expected, Q_target)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic.parameters(), 1)
        self.critic_optimizer.step()

        ## Update Actor
        new_action = self.actor(state)
        actor_loss = -self.critic(state, new_action).mean()
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.actor.parameters(), 1)
        self.actor_optimizer.step()

        self.soft_update(self.critic, self.critic_target, TAU)
        self.soft_update(self.actor, self.actor_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(), local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
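# A hedged usage sketch for the Agent class directly above, assuming a
# gym-style environment with continuous actions and that the hyperparameter
# constants (BUFFER_SIZE, BATCH_SIZE, GAMMA, TAU, LR_ACTOR, LR_CRITIC,
# WEIGHT_DECAY) are defined elsewhere in the project. Names such as `env`,
# `run_training`, `n_episodes` and `max_t` are illustrative only; the single
# state/action/reward values are wrapped in length-1 arrays because
# Agent.step() iterates over its arguments.
import numpy as np


def run_training(env, agent, n_episodes=200, max_t=1000):
    """Run a simple episode loop and return the per-episode scores."""
    scores = []
    for episode in range(1, n_episodes + 1):
        state = env.reset()
        agent.noise.reset()  # assuming the OUNoise class exposes reset()
        score = 0.0
        for _ in range(max_t):
            action = agent.next_action(np.array([state]))[0]
            next_state, reward, done, _ = env.step(action)
            agent.step(np.array([state]), np.array([action]),
                       [reward], np.array([next_state]), [done])
            state = next_state
            score += reward
            if done:
                break
        scores.append(score)
    return scores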