def main():
    env_spec = registry[env_name]
    env = gym.make(env_spec["id"])
    ep_max_steps = env_spec["max_episode_steps"]
    agent = DDPG(env.observation_space.shape, env.action_space.shape,
                 env.action_space.low[0], env.action_space.high[0])
    replay_buffer = ReplayBuffer()

    state = env.reset()
    done = False
    ep_timesteps = 0
    ep_reward = 0
    ep_num = 0
    reward_history = []

    for t in range(TOTAL_TIMESTEPS):
        ep_timesteps += 1

        # Select action: random during the warm-up phase, policy afterwards
        if t < START_TIMESTEP:
            action = env.action_space.sample()
        else:
            action = agent.select_action(np.array(state))

        # Perform action
        next_state, reward, done, _ = env.step(action)

        # Only mark the transition as terminal if the episode ended naturally,
        # not because the time limit was reached
        train_done = done and ep_timesteps < ep_max_steps
        replay_buffer.add(
            TransitionTuple(state, action, next_state, reward, int(train_done)))

        state = next_state
        ep_reward += reward

        if t >= START_TIMESTEP:
            agent.train(replay_buffer, BATCH_SIZE)

        if done:
            reward_history.append(ep_reward)
            print(
                f"[Episode {ep_num+1}, Timestep {t+1}] "
                f"Total reward: {ep_reward} Total timesteps: {ep_timesteps}")
            state = env.reset()
            done = False
            ep_timesteps = 0
            ep_reward = 0
            ep_num += 1

        if RENDER:
            env.render()

    # Visualize results
    if OUTPUT_PLOT:
        sns.lineplot(x=np.arange(len(reward_history)) + 1, y=reward_history)
        plt.ylabel("Episode Reward")
        plt.xlabel("Episode Number")
        plt.savefig(OUTPUT_PLOT)
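# The loop above stores transitions as `TransitionTuple` objects, which are not
# defined in this excerpt. A minimal sketch of a compatible definition (an
# assumption, matching the fields used in main()):
from collections import namedtuple

TransitionTuple = namedtuple(
    "TransitionTuple", ["state", "action", "next_state", "reward", "done"])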
class Agent:
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, num_agents, random_seed,
                 device, lr_actor, lr_critic, weight_decay_critic, batch_size,
                 buffer_size, gamma, tau, update_every, n_updates, eps_start,
                 eps_end, eps_decay):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            num_agents (int): number of agents
            random_seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.num_agents = num_agents
        self.seed = random.seed(random_seed)
        self.t_step = 0
        self.device = device
        self.lr_actor = lr_actor
        self.lr_critic = lr_critic
        self.weight_decay_critic = weight_decay_critic
        self.batch_size = batch_size
        self.buffer_size = buffer_size
        self.gamma = gamma
        self.tau = tau
        self.update_every = update_every
        self.n_updates = n_updates
        self.eps = eps_start
        self.eps_end = eps_end
        self.eps_decay = eps_decay

        # Actor Network (w/ Target Network)
        self.actor_local = Actor(state_size, action_size,
                                 random_seed).to(self.device)
        self.actor_target = Actor(state_size, action_size,
                                  random_seed).to(self.device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=self.lr_actor)

        # Critic Network (w/ Target Network)
        self.critic_local = Critic(state_size, action_size,
                                   random_seed).to(self.device)
        self.critic_target = Critic(state_size, action_size,
                                    random_seed).to(self.device)
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=self.lr_critic,
            weight_decay=self.weight_decay_critic)

        # Noise process
        self.noise = OUNoise((num_agents, action_size), random_seed)

        # Replay memory
        self.memory = ReplayBuffer(action_size, self.buffer_size,
                                   self.batch_size, random_seed, self.device)

    def step(self, state, action, reward, next_state, done, agent_number):
        """Save experience in replay memory, and use random sample from buffer to learn."""
        self.t_step += 1
        # Save experience / reward
        self.memory.add(state, action, reward, next_state, done)

        # Learn, if enough samples are available in memory and at interval settings
        if len(self.memory) > self.batch_size:
            if self.t_step % self.update_every == 0:
                for _ in range(self.n_updates):
                    experiences = self.memory.sample()
                    self.learn(experiences, self.gamma, agent_number)

    def act(self, states, add_noise):
        """Returns actions for given state as per current policy."""
        states = torch.from_numpy(states).float().to(self.device)
        actions = np.zeros((self.num_agents, self.action_size))
        self.actor_local.eval()
        with torch.no_grad():
            for agent_num, state in enumerate(states):
                action = self.actor_local(state).cpu().data.numpy()
                actions[agent_num, :] = action
        self.actor_local.train()
        if add_noise:
            actions += self.eps * self.noise.sample()
        return np.clip(actions, -1, 1)

    def reset(self):
        self.noise.reset()

    def learn(self, experiences, gamma, agent_number):
        """Update policy and value parameters using given batch of experience tuples.

        Q_targets = r + γ * critic_target(next_state, actor_target(next_state))
        where:
            actor_target(state) -> action
            critic_target(state, action) -> Q-value

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # ---------------------------- update critic ---------------------------- #
        # Get predicted next-state actions and Q values from target models
        actions_next = self.actor_target(next_states)
        if agent_number == 0:
            actions_next = torch.cat((actions_next, actions[:, 2:]), dim=1)
        else:
            actions_next = torch.cat((actions[:, :2], actions_next), dim=1)
        Q_targets_next = self.critic_target(next_states, actions_next)
        # Compute Q targets for current states (y_i)
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))
        # Compute critic loss
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        # ---------------------------- update actor ---------------------------- #
        # Compute actor loss
        actions_pred = self.actor_local(states)
        if agent_number == 0:
            actions_pred = torch.cat((actions_pred, actions[:, 2:]), dim=1)
        else:
            actions_pred = torch.cat((actions[:, :2], actions_pred), dim=1)
        actor_loss = -self.critic_local(states, actions_pred).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.tau)
        self.soft_update(self.actor_local, self.actor_target, self.tau)

        # Update epsilon noise value
        self.eps = max(self.eps_end, self.eps_decay * self.eps)
        # self.eps = self.eps - (1 / self.eps_decay)
        # if self.eps < self.eps_end:
        #     self.eps = self.eps_end

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.

        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model: PyTorch model (weights will be copied from)
            target_model: PyTorch model (weights will be copied to)
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
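# Quick self-contained sanity check of the soft-update rule
# θ_target = τ*θ_local + (1 - τ)*θ_target used above; the two nn.Linear layers
# stand in for the actor/critic networks (illustration only, not part of the agent).
import torch
import torch.nn as nn

local_net = nn.Linear(4, 2)
target_net = nn.Linear(4, 2)
tau = 0.01

# Polyak averaging: nudge each target parameter slightly toward the local one
for target_param, local_param in zip(target_net.parameters(),
                                     local_net.parameters()):
    target_param.data.copy_(tau * local_param.data +
                            (1.0 - tau) * target_param.data)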
def train(sess, env, actor, critic, noise, reward, discrete):
    # set up summary writer
    summary_writer = tf.summary.FileWriter("ddpg_summary")

    sess.run(tf.global_variables_initializer())

    # initialize target actor and critic networks
    actor.update_target_network()
    critic.update_target_network()

    # initialize replay memory
    replay_buffer = ReplayBuffer(BUFFER_SIZE, RANDOM_SEED)

    # initialize noise
    ou_level = 0.

    for i in range(MAX_EPISODES):
        s = env.reset()
        ep_reward = 0
        ep_ave_max_q = 0
        episode_buffer = np.empty((0, 5), float)

        for j in range(MAX_EP_STEPS):
            if RENDER_ENV:
                env.render()

            a = actor.predict(np.reshape(s, (1, actor.s_dim)))

            # Add exploration noise
            if i < NOISE_MAX_EP:
                ou_level = noise.ornstein_uhlenbeck_level(ou_level)
                a = a + ou_level

            # Set action for discrete and continuous action spaces
            if discrete:
                action = np.argmax(a)
            else:
                action = a[0]

            s2, r, terminal, info = env.step(action)

            # Choose reward type
            ep_reward += r

            # Add experience to the episode buffer
            episode_buffer = np.append(episode_buffer,
                                       [[s, a, r, terminal, s2]], axis=0)

            if replay_buffer.size() > MINIBATCH_SIZE:
                s_batch, a_batch, r_batch, t_batch, s2_batch = \
                    replay_buffer.sample_batch(MINIBATCH_SIZE)

                # Calculate targets
                target_q = critic.predict_target(
                    s2_batch, actor.predict_target(s2_batch))

                y_i = []
                for k in range(MINIBATCH_SIZE):
                    if t_batch[k]:
                        y_i.append(r_batch[k])
                    else:
                        y_i.append(r_batch[k] + GAMMA * target_q[k])

                # Update the critic given the targets
                predicted_q_value, _ = critic.train(
                    s_batch, a_batch, np.reshape(y_i, (MINIBATCH_SIZE, 1)))
                ep_ave_max_q += np.max(predicted_q_value)

                # Update the actor policy using the sampled gradient
                a_outs = actor.predict(s_batch)
                grads = critic.action_gradients(s_batch, a_outs)
                actor.train(s_batch, grads[0])

                # Update target networks
                actor.update_target_network()
                critic.update_target_network()

            # Set previous state for next step
            s = s2

            if terminal:
                # Reward system for episode
                # episode_buffer = reward.discount(episode_buffer)

                # Add episode to replay memory
                for step in episode_buffer:
                    replay_buffer.add(np.reshape(step[0], (actor.s_dim,)),
                                      np.reshape(step[1], (actor.a_dim,)),
                                      step[2], step[3],
                                      np.reshape(step[4], (actor.s_dim,)))

                # summary = tf.Summary()
                # summary.value.add(tag="Perf/Reward", simple_value=float(ep_reward))
                # summary.value.add(tag="Perf/Qmax", simple_value=float(ep_ave_max_q / float(j)))
                # summary_writer.add_summary(summary, i)
                # summary_writer.flush()

                if i != 0:
                    print("|Reward: %.2i | Episode: %d | Qmax: %.4f" %
                          (int(ep_reward), i, (ep_ave_max_q / float(i))))

                break
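# The per-sample loop that builds y_i above is equivalent to one vectorized
# expression. A minimal NumPy sketch with toy arrays standing in for the
# sampled batch (illustration only):
import numpy as np

GAMMA = 0.99
r_batch = np.array([1.0, 0.5, -1.0])        # toy rewards
t_batch = np.array([False, False, True])    # toy terminal flags
target_q = np.array([[2.0], [1.5], [3.0]])  # toy critic target outputs

# y_i = r_i for terminal transitions, r_i + GAMMA * Q'(s2_i, mu'(s2_i)) otherwise
y = r_batch.reshape(-1, 1) + GAMMA * target_q * (1 - t_batch.reshape(-1, 1))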
class DuelingDQN(BaseAgent):
    def __init__(self, env):
        self.buffer_size = 20000
        self.batch_size = 64
        self.tau = 1
        self.gamma = 0.95
        self.learning_rate = 0.001

        # Exploration Parameters
        self.E_start = 1
        self.E_end = 0.1
        self.E_decay = 0.002
        self.episode = 0

        self.env = env
        self.os = self.env.observation_space
        # self.acs = self.env.action_space
        self.edim = len(self.os.high)
        self.adim = self.env.action_space.n

        self.buffer = ReplayBuffer(self.buffer_size, self.edim, 1)
        self.local = DuelingDQN_Model(self.edim, self.adim, self.learning_rate)
        self.target = DuelingDQN_Model(self.edim, self.adim, self.learning_rate)

        self.initial_weights = self.local.model.get_weights()
        self.target.model.set_weights(self.initial_weights)

    def act(self, state, testing):
        state = state.reshape([1, -1])
        actionQs = self.local.model.predict(state)
        action = np.argmax(actionQs)
        epsilon = self.E_end + (self.E_start - self.E_end) * np.exp(
            -self.E_decay * self.episode)
        if not testing:
            if np.random.rand() < epsilon:
                action = np.random.choice(self.adim)
        # action = np.array([action])
        return action

    def learn(self, state, action, reward, next_state, done, testing):
        # Skip all learning during testing
        if testing:
            return

        act_index = action
        self.buffer.add(state, act_index, reward, next_state, done)
        if done:
            self.episode += 1

        # TODO When upgrading to RDPG this should be per-episode based
        states, actions, rewards, next_states, dones = self.buffer.batch(
            self.batch_size)
        actions = actions.astype(int).reshape([-1])
        rewards = rewards.reshape([-1])
        dones = dones.reshape([-1])

        # Bellman equation
        target_Q = rewards + self.gamma * np.amax(
            self.target.model.predict_on_batch(next_states),
            axis=1) * (1 - dones)
        self.local.train([states, actions, target_Q])
        self.soft_update(self.target, self.local)

    def soft_update(self, target, local):
        local_weights = np.array(local.model.get_weights())
        target_weights = np.array(target.model.get_weights())
        new_target_weights = (1 - self.tau) * target_weights \
            + self.tau * local_weights
        target.model.set_weights(new_target_weights)

    def reset(self):
        shuffle_weights(self.local.model, self.initial_weights)
        self.target.model.set_weights(self.local.model.get_weights())
        self.episode = 0
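# The exponential exploration schedule used in act() can be inspected in
# isolation; a small self-contained sketch of the same formula:
import numpy as np

E_start, E_end, E_decay = 1.0, 0.1, 0.002
for episode in (0, 100, 1000, 5000):
    epsilon = E_end + (E_start - E_end) * np.exp(-E_decay * episode)
    print(f"episode {episode}: epsilon = {epsilon:.3f}")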
s = env.reset()
s_t = np.hstack((s[0], s[1], s[2]))

while True:
    # epsilon *= 0.995
    loss = 0.0
    # epsilon -= 1.0/10000.0

    a = actor.model.predict(s_t.reshape(1, s_t.shape[0]))
    noise = Ornstein_Uhlenbeck(a[0])
    # noise = max(epsilon, 0) * noise.function(a[0], 0.0, 0.15, 0.3)
    # a = a[0] + noise
    a = a[0] + noise()[0]

    s2, r, done, info = env.step(np.array(a))
    s2_t = np.hstack((s2[0], s2[1], s2[2]))

    buff.add(s_t, a, r, s2_t, done)

    batch = buff.get_batch(batch_size)
    if len(batch) >= batch_size:
        states = np.asarray([e[0] for e in batch])
        actions = np.asarray([e[1] for e in batch])
        reward = np.asarray([e[2] for e in batch])
        new_states = np.asarray([e[3] for e in batch])
        dones = np.asarray([e[4] for e in batch])
        y_t = np.asarray([e[1] for e in batch])

        target_q = critic.target_model.predict(
            [new_states, actor.target_model.predict(new_states)])

        for k in range(len(batch)):
            if dones[k]:
                y_t[k] = reward[k]
            else:
                # Standard DDPG bootstrap target; the discount factor `gamma`
                # is assumed to be defined alongside `batch_size` above.
                y_t[k] = reward[k] + gamma * target_q[k]
def challenger_round():
    challengers = []
    leaders = []
    leader_checkpoints = os.listdir(LEADER_DIR)
    # Need to share the same schedule with all challengers, so they all anneal
    # at the same rate
    epsilon_schedule = LinearSchedule(EPS_START, EPS_END, TRAIN_FRAMES)
    for i in range(NUM_LEADERS):
        challenger = try_gpu(
            DQNAgent(6, epsilon_schedule, OBSERVATION_MODE,
                     lr=LR, max_grad_norm=GRAD_CLIP_NORM))
        if i < len(leader_checkpoints):
            leader = try_gpu(
                DQNAgent(6, LinearSchedule(0.1, 0.1, 500000), OBSERVATION_MODE))
            leader_path = os.path.join(LEADER_DIR, leader_checkpoints[i])
            print("LOADING CHECKPOINT: {}".format(leader_path))
            challenger.load_state_dict(
                torch.load(leader_path,
                           map_location=lambda storage, loc: storage))
            leader.load_state_dict(
                torch.load(leader_path,
                           map_location=lambda storage, loc: storage))
        else:
            leader = RandomAgent(6)
            print("INITIALIZING NEW CHALLENGER AND LEADER")
        challengers.append(challenger)
        leaders.append(leader)

    if CHALLENGER_DIR is not None:
        challengers = []
        # Load in all of the challengers
        for checkpoint in os.listdir(CHALLENGER_DIR):
            path = os.path.join(CHALLENGER_DIR, checkpoint)
            print("LOADING FROM CHALLENGER_DIR: {}".format(path))
            challenger = try_gpu(
                DQNAgent(6, LinearSchedule(0.05, 0.05, 1),
                         CHALLENGER_OBSERVATION_MODE,
                         lr=LR, max_grad_norm=GRAD_CLIP_NORM,
                         name=checkpoint))
            challenger.load_state_dict(
                torch.load(path, map_location=lambda storage, loc: storage))
            challengers.append(challenger)

    challenger = EnsembleDQNAgent(challengers)
    leader = EnsembleDQNAgent(leaders)
    if OPPONENT is not None or HUMAN:
        leader = NoOpAgent()
    replay_buffer = ReplayBuffer(1000000)
    rewards = collections.deque(maxlen=1000)
    frames = 0  # number of training frames seen
    episodes = 0  # number of training episodes that have been played
    with tqdm(total=TRAIN_FRAMES) as progress:
        # Each loop completes a single episode
        while frames < TRAIN_FRAMES:
            states = env.reset()
            challenger.reset()
            leader.reset()
            episode_reward = 0.
            episode_frames = 0
            # Each loop completes a single step, duplicates _evaluate() to
            # update at the appropriate frame #s
            for _ in range(MAX_EPISODE_LENGTH):
                frames += 1
                episode_frames += 1
                action1 = challenger.act(states[0])
                action2 = leader.act(states[1])
                next_states, reward, done = env.step(action1, action2)
                episode_reward += reward

                # NOTE: state and next_state are LazyFrames and must be
                # converted to np.arrays
                replay_buffer.add(
                    Experience(states[0], action1._action_index, reward,
                               next_states[0], done))
                states = next_states

                if len(replay_buffer) > 50000 and frames % 4 == 0:
                    experiences = replay_buffer.sample(32)
                    challenger.update_from_experiences(experiences)

                if frames % 10000 == 0:
                    challenger.sync_target()

                if frames % SAVE_FREQ == 0:
                    # TODO: Don't access internals
                    for agent in challenger._agents:
                        path = os.path.join(LEADER_DIR,
                                            agent.name + "-{}".format(frames))
                        print("SAVING CHECKPOINT TO: {}".format(path))
                        torch.save(agent.state_dict(), path)
                    # path = os.path.join(
                    #     LEADER_DIR, challenger.name + "-{}".format(frames))
                    # torch.save(challenger.state_dict(), path)

                if frames >= TRAIN_FRAMES:
                    break

                if done:
                    break

            if episodes % 300 == 0:
                print("Evaluation: {}".format(
                    evaluate(challenger, leader, EPISODES_EVALUATE_TRAIN)))
                print("Episode reward: {}".format(episode_reward))

            episodes += 1
            rewards.append(episode_reward)
            stats = challenger.stats
            stats["Avg Episode Reward"] = float(sum(rewards)) / len(rewards)
            stats["Num Episodes"] = episodes
            stats["Replay Buffer Size"] = len(replay_buffer)
            progress.set_postfix(stats, refresh=False)
            progress.update(episode_frames)
            episode_frames = 0
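# LinearSchedule is not defined in this excerpt. A hypothetical minimal
# implementation consistent with how it is constructed here
# (start value, end value, number of frames to anneal over); the value()
# method name is an assumption, not taken from the source:
class LinearSchedule(object):
    """Linearly anneals a value from start to end over num_frames steps."""

    def __init__(self, start, end, num_frames):
        self.start = start
        self.end = end
        self.num_frames = max(1, num_frames)

    def value(self, frame):
        # Fraction of the schedule completed, clamped to [0, 1]
        fraction = min(float(frame) / self.num_frames, 1.0)
        return self.start + fraction * (self.end - self.start)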
class DDPG:
    def __init__(self, config):
        self.config = config
        self.state_size = config.state_size
        self.action_size = config.action_size

        self.actor_local = Actor(self.state_size, self.action_size, 2).to(device)
        self.actor_target = Actor(self.state_size, self.action_size, 2).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=config.LR_ACTOR)

        self.critic_local = Critic(self.state_size, self.action_size, 2).to(device)
        self.critic_target = Critic(self.state_size, self.action_size, 2).to(device)
        self.critic_optimizer = optim.Adam(
            self.critic_local.parameters(),
            lr=config.LR_CRITIC,
        )

        self.memory = ReplayBuffer(config.random_seed, config.BUFFER_SIZE)
        self.noise = OUNoise(self.action_size, config.random_seed)
        self.t_step = 0

        # Hard copy (tau=1): initialize the target networks to match the local networks
        self.soft_update(self.critic_local, self.critic_target, 1)
        self.soft_update(self.actor_local, self.actor_target, 1)

    def step(self, states, actions, rewards, next_states, dones):
        for state, action, reward, next_state, done in zip(
                states, actions, rewards, next_states, dones):
            self.memory.add(state, action, reward, next_state, done)

        self.t_step = (self.t_step + 1) % self.config.UPDATE_EVERY
        if len(self.memory) > self.config.BATCH_SIZE and self.t_step == 0:
            for i in range(self.config.EPOCH):
                experiences = self.memory.sample(self.config.BATCH_SIZE)
                self.learn(experiences)

    def reset(self):
        self.noise.reset()

    def act(self, state, add_noise=True):
        """Returns actions for given state as per current policy."""
        state = torch.from_numpy(state).float().to(device)
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().data.numpy()
        self.actor_local.train()
        if add_noise:
            action += self.noise.sample()
        return np.clip(action, -1, 1)

    def learn(self, experiences):
        states, actions, rewards, next_states, dones = experiences

        Q_targets_next = self.critic_target(next_states,
                                            self.actor_target(next_states))
        Q_targets = rewards + (self.config.GAMMA * Q_targets_next * (1 - dones))
        Q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        actor_loss = -self.critic_local(states, self.actor_local(states)).mean()
        # Minimize the loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # ----------------------- update target networks ----------------------- #
        self.soft_update(self.critic_local, self.critic_target, self.config.TAU)
        self.soft_update(self.actor_local, self.actor_target, self.config.TAU)

    def soft_update(self, local_model, target_model, tau):
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1.0 - tau) * target_param.data)
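# The config object is only read for attributes here. A minimal sketch of
# building one with types.SimpleNamespace; the concrete values are placeholders,
# not taken from the source, and the Actor/Critic/OUNoise/ReplayBuffer
# dependencies (and `device`) are assumed to be importable/defined:
from types import SimpleNamespace

config = SimpleNamespace(
    state_size=33, action_size=4, random_seed=2,  # placeholder sizes/seed
    LR_ACTOR=1e-4, LR_CRITIC=1e-3,
    BUFFER_SIZE=int(1e6), BATCH_SIZE=128,
    UPDATE_EVERY=20, EPOCH=10,
    GAMMA=0.99, TAU=1e-3,
)
agent = DDPG(config)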
class Agent(object):
    """DDPG Agent that interacts and learns from the environment."""

    def __init__(self, state_size, action_size, device,
                 actor_args={}, critic_args={}):
        """Initializes the DDPG agent.

        Args:
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            device (torch.device): Device to use for calculations
            actor_args (dict): Arguments describing the actor network
            critic_args (dict): Arguments describing the critic network
        """
        self.state_size = state_size
        """Dimension of each state"""

        self.action_size = action_size
        """Dimension of each action"""

        self.device = device
        """Device to use for calculations"""

        self.t_step = 0
        """Timestep between training updates"""

        # Actor network
        self.actor_local = Actor(state_size, action_size, **actor_args).to(device)
        self.actor_target = Actor(state_size, action_size, **actor_args).to(device)
        self.actor_optimizer = optim.Adam(self.actor_local.parameters(),
                                          lr=LR_ACTOR)

        # Critic network
        self.critic_local = Critic(state_size, action_size, **critic_args).to(device)
        self.critic_target = Critic(state_size, action_size, **critic_args).to(device)
        self.critic_optimizer = optim.Adam(self.critic_local.parameters(),
                                           lr=LR_CRITIC,
                                           weight_decay=WEIGHT_DECAY)

        # Noise process for exploration
        self.noise = OUNoise(action_size, sigma=NOISE_SD)

        # Replay memory
        self.memory = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, self.device)

    def reset(self):
        """Reset state of agent."""
        self.noise.reset()

    def save_weights(self, path):
        """Save local and target network weights.

        Args:
            path (string): File to save to"""
        torch.save(
            {
                'actor_local': self.actor_local.state_dict(),
                'actor_target': self.actor_target.state_dict(),
                'critic_local': self.critic_local.state_dict(),
                'critic_target': self.critic_target.state_dict()
            }, path)

    def load_weights(self, path):
        """Load local and target network weights.

        Args:
            path (string): File to load weights from"""
        checkpoint = torch.load(path)
        self.actor_local.load_state_dict(checkpoint['actor_local'])
        self.actor_target.load_state_dict(checkpoint['actor_target'])
        self.critic_local.load_state_dict(checkpoint['critic_local'])
        self.critic_target.load_state_dict(checkpoint['critic_target'])

    def act(self, state, add_noise=True):
        """Returns action for given state according to the current policy.

        Args:
            state (np.ndarray): Current state

        Returns:
            action (np.ndarray): Action tuple
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)

        # Temporarily set evaluation mode (no dropout &c) & turn off autograd
        self.actor_local.eval()
        with torch.no_grad():
            action = self.actor_local(state).cpu().detach().numpy()
        # Resume training mode
        self.actor_local.train()

        # Add noise if exploring
        if add_noise:
            action += self.noise.sample()
            # The noise might take us out of range
            action = np.clip(action, -1, 1)

        return action

    def step(self, state, action, reward, next_state, done):
        """Save experience and learn if due.

        Args:
            state (Tensor): Current state
            action (int): Chosen action
            reward (float): Resulting reward
            next_state (Tensor): State after action
            done (bool): True if terminal state
        """
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY steps, as soon as we have enough stored experiences
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0 and len(self.memory) > BATCH_SIZE:
            for _ in range(NUM_UPDATES):
                experiences = self.memory.sample()
                self.learn(experiences)

    def learn(self, experiences):
        """Learn from batch of experiences."""
        states, actions, rewards, next_states, dones = experiences

        # region Update Critic
        actions_next = self.actor_target(next_states)
        q_targets_next = self.critic_target(next_states, actions_next)
        q_targets = rewards + (GAMMA * q_targets_next * (1 - dones))
        q_expected = self.critic_local(states, actions)
        critic_loss = F.mse_loss(q_expected, q_targets)

        # Minimize loss
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        torch.nn.utils.clip_grad_norm_(self.critic_local.parameters(), 1.0)
        self.critic_optimizer.step()
        # endregion

        # region Update Actor
        actions_pred = self.actor_local(states)
        actor_loss = -self.critic_local(states, actions_pred).mean()

        # Minimize loss
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()
        # endregion

        # Update target networks
        soft_update(self.critic_local, self.critic_target, TAU)
        soft_update(self.actor_local, self.actor_target, TAU)
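# The critic update above clips the global gradient norm to 1.0 before the
# optimizer step. A self-contained sketch of the same call on a toy module
# (illustration only):
import torch
import torch.nn as nn

net = nn.Linear(8, 1)
loss = net(torch.randn(4, 8)).pow(2).mean()
loss.backward()

# Rescales all gradients in place so their combined L2 norm does not exceed 1.0;
# returns the norm measured before clipping.
total_norm = torch.nn.utils.clip_grad_norm_(net.parameters(), 1.0)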
class Agent(object):
    """DQN Agent that interacts and learns from the environment."""

    def __init__(self,
                 state_size,
                 action_size,
                 device,
                 replay_buffer_size=int(1e5),
                 batch_size=64,
                 discount_factor=0.99,
                 soft_update=1e-3,
                 learning_rate=5e-4,
                 update_every=4,
                 **kwargs):
        """Initializes the DQN agent.

        Args:
            state_size (int): Dimension of each state
            action_size (int): Dimension of each action
            device (torch.device): Device to use for calculations
            replay_buffer_size (int): Size of replay buffer
            batch_size (int): Size of experience batches during training
            discount_factor (float): Discount factor (gamma)
            soft_update (float): Soft update coefficient (tau)
            learning_rate (float): Learning rate (alpha)
            update_every (int): Steps between updating the network
            **kwargs: Arguments describing the QNetwork
        """
        self.state_size = state_size
        """Dimension of each state"""

        self.action_size = action_size
        """Dimension of each action"""

        self.device = device
        """Device to use for calculations"""

        # Parameters
        self.batch_size = batch_size
        """Size of experience batches during training"""

        self.discount_factor = discount_factor
        """Discount factor (gamma)"""

        self.soft_update = soft_update
        """Soft update coefficient (tau)"""

        self.update_every = update_every
        """Steps between updating the network"""

        # Q-Networks
        self.target_network = QNetwork(state_size, action_size, **kwargs) \
            .to(device)
        """Target Q-Network"""

        self.local_network = QNetwork(state_size, action_size, **kwargs) \
            .to(device)
        """Local Q-Network"""

        self.optimizer = optim.Adam(self.local_network.parameters(),
                                    lr=learning_rate)
        """Optimizer used when training the Q-network."""

        # Memory
        self.memory = ReplayBuffer(replay_buffer_size, batch_size, device)

        # Time step
        self.t_step = 0
        """Current time step"""

    def save_weights(self, path):
        """Save local network weights.

        Args:
            path (string): File to save to"""
        self.local_network.save_weights(path)

    def load_weights(self, path):
        """Load local network weights.

        Args:
            path (string): File to load weights from"""
        self.local_network.load_weights(path)

    def act(self, state, eps=0.):
        """Returns action for given state according to the current policy.

        Args:
            state (np.ndarray): Current state
            eps (float): Probability of selecting random action (epsilon)

        Returns:
            int: Epsilon-greedily selected action
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(self.device)

        # Temporarily set evaluation mode (no dropout &c) & turn off autograd
        self.local_network.eval()
        with torch.no_grad():
            action_values = self.local_network(state)
        self.local_network.train()

        # Select action epsilon-greedily
        if random.random() > eps:
            return np.argmax(action_values.cpu().detach().numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def step(self, state, action, reward, next_state, done):
        """Save experience and learn if due.

        Args:
            state (Tensor): Current state
            action (int): Chosen action
            reward (float): Resulting reward
            next_state (Tensor): State after action
            done (bool): True if terminal state
        """
        self.memory.add(state, action, reward, next_state, done)

        # Learn if at update_every steps
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            # Check that we have enough stored experiences
            if len(self.memory) > self.batch_size:
                experiences = self.memory.sample()
                self.learn(experiences)

    def learn(self, experiences):
        """Update Q-network using given experiences.

        Args:
            experiences (Tuple[Tensor, Tensor, Tensor, Tensor, Tensor]):
                SARS'+done tuple
        """
        states, actions, rewards, next_states, dones = experiences

        # Predicted Q values from target model for next states
        # (NB. torch.max returns the tuple (max, argmax))
        q_target_next = self.target_network(next_states).max(dim=1,
                                                             keepdim=True)[0]

        # Computed target Q values for current states
        q_target = rewards + self.discount_factor * q_target_next * (1 - dones)

        # Predicted Q values from local model for current states
        q_local = self.local_network(states).gather(dim=1, index=actions)

        loss = F.mse_loss(q_local, q_target)

        # Update local network weights
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # Update target network
        soft_update(self.local_network, self.target_network, self.soft_update)
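# The target computation in learn() can be checked in isolation. A minimal
# PyTorch sketch with toy tensors standing in for a sampled batch
# (illustration only):
import torch

gamma = 0.99
rewards = torch.tensor([[1.0], [0.0]])
dones = torch.tensor([[0.0], [1.0]])
q_next = torch.tensor([[0.5, 2.0], [1.0, 3.0]])  # toy target-network outputs

# Max over actions, keepdim=True so shapes stay (batch, 1)
q_target_next = q_next.max(dim=1, keepdim=True)[0]
q_target = rewards + gamma * q_target_next * (1 - dones)
# -> tensor([[2.9800], [0.0000]])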
class DDPG():
    """Reinforcement Learning agent that learns using DDPG."""

    def __init__(self, task):
        self.task = task
        self.state_size = task.state_size
        self.action_size = task.action_size
        self.action_low = task.action_low
        self.action_high = task.action_high

        # Actor (Policy) Model
        self.actor_local = Actor(self.state_size, self.action_size,
                                 self.action_low, self.action_high)
        self.actor_target = Actor(self.state_size, self.action_size,
                                  self.action_low, self.action_high)

        # Critic (Value) Model
        self.critic_local = Critic(self.state_size, self.action_size)
        self.critic_target = Critic(self.state_size, self.action_size)

        # Initialize target model parameters with local model parameters
        self.critic_target.model.set_weights(
            self.critic_local.model.get_weights())
        self.actor_target.model.set_weights(
            self.actor_local.model.get_weights())

        # Noise process
        self.exploration_mu = 0
        self.exploration_theta = 0.15
        self.exploration_sigma = 0.2
        self.noise = OUNoise(self.action_size, self.exploration_mu,
                             self.exploration_theta, self.exploration_sigma)

        # Replay memory
        self.buffer_size = 100000
        self.batch_size = 64
        self.memory = ReplayBuffer(self.buffer_size, self.batch_size)

        # Algorithm parameters
        self.gamma = 0.99  # discount factor
        self.tau = 0.01  # for soft update of target parameters

        # Score tracker and learning parameters
        self.best_score = -np.inf

    def reset_episode(self):
        self.total_reward = 0.0
        self.count = 0
        self.noise.reset()
        state = self.task.reset()
        self.last_state = state
        return state

    def step(self, action, reward, next_state, done):
        # Save experience / reward
        self.memory.add(self.last_state, action, reward, next_state, done)

        # Save reward
        self.total_reward += reward
        self.count += 1

        # Learn, if enough samples are available in memory
        if len(self.memory) > self.batch_size:
            experiences = self.memory.sample()
            self.learn(experiences)

        # Roll over last state and action
        self.last_state = next_state

        if done:
            # Keep track of the score
            self.score = self.total_reward / float(
                self.count) if self.count else 0.0
            if self.score > self.best_score:
                self.best_score = self.score

    def act(self, state):
        """Returns actions for given state(s) as per current policy."""
        state = np.reshape(state, [-1, self.state_size])
        action = self.actor_local.model.predict(state)[0]
        # Add some noise for exploration
        return list(action + self.noise.sample())

    def learn(self, experiences):
        """Update policy and value parameters using given batch of experience tuples."""
        # Convert experience tuples to separate arrays for each element
        # (states, actions, rewards, etc.)
        states = np.vstack([e.state for e in experiences if e is not None])
        actions = np.array([e.action for e in experiences
                            if e is not None]).astype(np.float32).reshape(
                                -1, self.action_size)
        rewards = np.array([e.reward for e in experiences
                            if e is not None]).astype(np.float32).reshape(-1, 1)
        dones = np.array([e.done for e in experiences
                          if e is not None]).astype(np.uint8).reshape(-1, 1)
        next_states = np.vstack(
            [e.next_state for e in experiences if e is not None])

        # Get predicted next-state actions and Q values from target models
        #     Q_targets_next = critic_target(next_state, actor_target(next_state))
        actions_next = self.actor_target.model.predict_on_batch(next_states)
        Q_targets_next = self.critic_target.model.predict_on_batch(
            [next_states, actions_next])

        # Compute Q targets for current states and train critic model (local)
        Q_targets = rewards + self.gamma * Q_targets_next * (1 - dones)
        self.critic_local.model.train_on_batch(x=[states, actions], y=Q_targets)

        # Train actor model (local)
        action_gradients = np.reshape(
            self.critic_local.get_action_gradients([states, actions, 0]),
            (-1, self.action_size))
        self.actor_local.train_fn([states, action_gradients,
                                   1])  # custom training function

        # Soft-update target models
        self.soft_update(self.critic_local.model, self.critic_target.model)
        self.soft_update(self.actor_local.model, self.actor_target.model)

    def soft_update(self, local_model, target_model):
        """Soft update model parameters."""
        local_weights = np.array(local_model.get_weights())
        target_weights = np.array(target_model.get_weights())

        assert len(local_weights) == len(target_weights), \
            "Local and target model parameters must have the same size"

        new_weights = self.tau * local_weights + (1 - self.tau) * target_weights
        target_model.set_weights(new_weights)
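# A minimal sketch of driving this agent against a task with the interface it
# assumes (reset() and step(action) returning (next_state, reward, done)).
# The loop structure, num_episodes value, and the task object are assumptions,
# not taken from the source:
num_episodes = 500  # placeholder
agent = DDPG(task)

for episode in range(1, num_episodes + 1):
    state = agent.reset_episode()
    done = False
    while not done:
        action = agent.act(state)
        next_state, reward, done = task.step(action)
        agent.step(action, reward, next_state, done)
        state = next_state
    print(f"Episode {episode:4d}  score: {agent.score:.3f}  "
          f"best: {agent.best_score:.3f}")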