class TestReplayMemory(unittest.TestCase):
    """Unit tests for ``ReplayMemory``: capacity, sample shapes and n-step rewards."""

    def setUp(self):
        # Default fixture; the multi-step tests replace it with their own buffer.
        self.memory = ReplayMemory(capacity=10)

    def test_append(self):
        """Pushing 20 transitions into a capacity-10 buffer leaves 10 stored."""
        for _ in range(20):
            a = Transition([0, 1, 2, 3], 0, [4, 5, 6, 7], 0, True)
            self.memory.push(a)
        self.assertEqual(len(self.memory.memory), 10)

    def test_sample(self):
        """A sample of size 2 yields five arrays with a leading batch dimension of 2."""
        for i in range(10):
            a = Transition([0, 1, 2, i], 0, [4, 5, 6, i * i], 0, True)
            self.memory.push(a)
        s, a, s1, r, done = self.memory.sample(2)
        self.assertEqual(s.shape, (2, 4))
        self.assertEqual(a.shape, (2, 1))
        self.assertEqual(s1.shape, (2, 4))
        self.assertEqual(r.shape, (2, 1))
        self.assertEqual(done.shape, (2, 1))

    def test_multi_step(self):
        """With ``multi_step_n=2`` stored rewards are accumulated over later steps.

        The expected numbers equal 1 + 0.99 + 0.99**2, 1 + 0.99 + 0.99**2 * 10,
        1 + 0.99 * 10 and 10 (presumably a 0.99 discount inside ReplayMemory --
        confirm against its implementation).
        """
        self.memory = ReplayMemory(capacity=10, multi_step_n=2)
        for i in range(5):
            a = Transition([0, 1, 2, i], 0, [4, 5, 6, i * i], 1, False)
            self.memory.push(a)
        final = Transition([0, 1, 2, 10], 0, [4, 5, 6, 100], 10, True)
        self.memory.push(final)
        # The rewards are products/sums of floats, so compare with a tolerance
        # instead of exact equality to avoid spurious rounding failures.
        self.assertAlmostEqual(self.memory.memory[0].r, 2.9701)
        self.assertAlmostEqual(self.memory.memory[3].r, 11.791)
        self.assertAlmostEqual(self.memory.memory[4].r, 10.9)
        self.assertAlmostEqual(self.memory.memory[5].r, 10)

    def test_zero_step(self):
        """With ``multi_step_n=0`` every stored reward is the raw pushed reward."""
        self.memory = ReplayMemory(capacity=10, multi_step_n=0)
        for i in range(5):
            a = Transition([0, 1, 2, i], 0, [4, 5, 6, i * i], 1, False)
            self.memory.push(a)
        final = Transition([0, 1, 2, 10], 0, [4, 5, 6, 100], 10, True)
        self.memory.push(final)
        self.assertEqual(self.memory.memory[0].r, 1)
        self.assertEqual(self.memory.memory[3].r, 1)
        self.assertEqual(self.memory.memory[4].r, 1)
        self.assertEqual(self.memory.memory[5].r, 10)
class hDQN(): """ The Hierarchical-DQN Agent Parameters ---------- optimizer_spec: OptimizerSpec Specifying the constructor and kwargs, as well as learning rate schedule for the optimizer num_goal: int The number of goal that agent can choose from num_action: int The number of action that agent can choose from replay_memory_size: int How many memories to store in the replay memory. batch_size: int How many transitions to sample each time experience is replayed. """ def __init__(self, optimizer_spec, num_goal=6, num_action=2, replay_memory_size=10000, batch_size=128): ############### # BUILD MODEL # ############### self.num_goal = num_goal self.num_action = num_action self.batch_size = batch_size # Construct meta-controller and controller self.meta_controller = MetaController().type(dtype) self.target_meta_controller = MetaController().type(dtype) self.controller = Controller().type(dtype) self.target_controller = Controller().type(dtype) # Construct the optimizers for meta-controller and controller self.meta_optimizer = optimizer_spec.constructor( self.meta_controller.parameters(), **optimizer_spec.kwargs) self.ctrl_optimizer = optimizer_spec.constructor( self.controller.parameters(), **optimizer_spec.kwargs) # Construct the replay memory for meta-controller and controller self.meta_replay_memory = ReplayMemory(replay_memory_size) self.ctrl_replay_memory = ReplayMemory(replay_memory_size) def get_intrinsic_reward(self, goal, state): return 1.0 if goal == state else 0.0 def select_goal(self, state, epilson): sample = random.random() if sample > epilson: state = torch.from_numpy(state).type(dtype) # Use volatile = True if variable is only used in inference mode, i.e. 
don’t save the history with torch.no_grad(): return self.meta_controller(Variable( state, volatile=True)).data.max(1)[1].cpu() else: return torch.IntTensor([random.randrange(self.num_goal)]) def select_action(self, joint_state_goal, epilson): sample = random.random() if sample > epilson: joint_state_goal = torch.from_numpy(joint_state_goal).type(dtype) # Use volatile = True if variable is only used in inference mode, i.e. don’t save the history with torch.no_grad(): return self.controller( Variable(joint_state_goal, volatile=True)).data.max(1)[1].cpu() else: return torch.IntTensor([random.randrange(self.num_action)]) def update_meta_controller(self, gamma=1.0): if len(self.meta_replay_memory) < self.batch_size: return state_batch, goal_batch, next_state_batch, ex_reward_batch, done_mask = \ self.meta_replay_memory.sample(self.batch_size) state_batch = Variable(torch.from_numpy(state_batch).type(dtype)) goal_batch = Variable(torch.from_numpy(goal_batch).long()) next_state_batch = Variable( torch.from_numpy(next_state_batch).type(dtype)) ex_reward_batch = Variable( torch.from_numpy(ex_reward_batch).type(dtype)) not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype) if USE_CUDA: goal_batch = goal_batch.cuda() # Compute current Q value, meta_controller takes only state and output value for every state-goal pair # We choose Q based on goal chosen. 
current_Q_values = self.meta_controller(state_batch).gather( 1, goal_batch.unsqueeze(1)) # Compute next Q value based on which goal gives max Q values # Detach variable from the current graph since we don't want gradients for next Q to propagated next_max_q = self.target_meta_controller( next_state_batch).detach().max(1)[0] next_Q_values = not_done_mask * next_max_q # Compute the target of the current Q values target_Q_values = ex_reward_batch + (gamma * next_Q_values) # Compute Bellman error (using Huber loss) loss = F.smooth_l1_loss(current_Q_values.view(-1), target_Q_values) # Copy Q to target Q before updating parameters of Q self.target_meta_controller.load_state_dict( self.meta_controller.state_dict()) # Optimize the model self.meta_optimizer.zero_grad() loss.backward() for param in self.meta_controller.parameters(): param.grad.data.clamp_(-1, 1) self.meta_optimizer.step() def update_controller(self, gamma=1.0): if len(self.ctrl_replay_memory) < self.batch_size: return state_goal_batch, action_batch, next_state_goal_batch, in_reward_batch, done_mask = \ self.ctrl_replay_memory.sample(self.batch_size) state_goal_batch = Variable( torch.from_numpy(state_goal_batch).type(dtype)) action_batch = Variable(torch.from_numpy(action_batch).long()) next_state_goal_batch = Variable( torch.from_numpy(next_state_goal_batch).type(dtype)) in_reward_batch = Variable( torch.from_numpy(in_reward_batch).type(dtype)) not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype) if USE_CUDA: action_batch = action_batch.cuda() # Compute current Q value, controller takes only (state, goal) and output value for every (state, goal)-action pair # We choose Q based on action taken. 
current_Q_values = self.controller(state_goal_batch).gather( 1, action_batch.unsqueeze(1)) # Compute next Q value based on which goal gives max Q values # Detach variable from the current graph since we don't want gradients for next Q to propagated next_max_q = self.target_controller( next_state_goal_batch).detach().max(1)[0] next_Q_values = not_done_mask * next_max_q # Compute the target of the current Q values target_Q_values = in_reward_batch + (gamma * next_Q_values) # Compute Bellman error (using Huber loss) loss = F.smooth_l1_loss(current_Q_values.view(-1), target_Q_values) # Copy Q to target Q before updating parameters of Q self.target_controller.load_state_dict(self.controller.state_dict()) # Optimize the model self.ctrl_optimizer.zero_grad() loss.backward() for param in self.controller.parameters(): param.grad.data.clamp_(-1, 1) self.ctrl_optimizer.step()
timestep += 1 epoch_return += reward mask = torch.Tensor([done]).to(device) reward = torch.Tensor([reward]).to(device) next_state = torch.Tensor([next_state]).to(device) memory.push(state, action, mask, next_state, reward) state = next_state epoch_value_loss = 0 epoch_policy_loss = 0 if len(memory) > args.batch_size: transitions = memory.sample(args.batch_size) # Transpose the batch # (see http://stackoverflow.com/a/19343/3343043 for detailed explanation). batch = Transition(*zip(*transitions)) # Update actor and critic according to the batch value_loss, policy_loss = agent.update_params(batch) epoch_value_loss += value_loss epoch_policy_loss += policy_loss if done: break rewards.append(epoch_return) value_losses.append(epoch_value_loss)
class DQNDoubleQAgent(BaseAgent):
    """Double-DQN agent that picks a screen coordinate to move to.

    The network output is flattened into ``_screen_size * _screen_size``
    target positions; unit selection is done once per game by
    ``select_friendly_action`` and is not part of the learned action space.
    """

    def __init__(self):
        super(DQNDoubleQAgent, self).__init__()
        self.training = False
        self.max_frames = 2000000
        self._epsilon = Epsilon(start=1.0, end=0.1, update_increment=0.0001)
        self.gamma = 0.99
        self.train_q_per_step = 4
        self.train_q_batch_size = 256
        self.steps_before_training = 10000
        self.target_q_update_frequency = 50000

        self._Q_weights_path = "./data/SC2DoubleQAgent"
        self._Q = DQNCNN()
        if os.path.isfile(self._Q_weights_path):
            self._Q.load_state_dict(torch.load(self._Q_weights_path))
            print("Loading weights:", self._Q_weights_path)
        # Target network starts as an exact copy of the online network.
        self._Qt = copy.deepcopy(self._Q)
        self._Q.cuda()
        self._Qt.cuda()
        self._optimizer = optim.Adam(self._Q.parameters(), lr=1e-8)
        self._criterion = nn.MSELoss()
        self._memory = ReplayMemory(100000)

        # Rolling windows shown by show_chart().
        self._loss = deque(maxlen=1000)
        self._max_q = deque(maxlen=1000)
        self._action = None  # last greedy Q output, set by get_action()
        self._screen = None  # last observed screen layer, set by run_loop()
        self._fig = plt.figure()
        self._plot = [plt.subplot(2, 2, i + 1) for i in range(4)]

        self._screen_size = 28

    def get_env_action(self, action, obs):
        """Translate a flat action index into a pysc2 ``FunctionCall``.

        The index is unravelled over ``[1, size, size]``; the last two
        components become the (x, y) target of a move command. Falls back to
        a no-op when the move command is not currently available.
        """
        action = np.unravel_index(action,
                                  [1, self._screen_size, self._screen_size])
        target = [action[2], action[1]]
        # Unit selection was removed from the action space, so the command is
        # always a screen move (action[0] is ignored).
        command = _MOVE_SCREEN

        if command in obs.observation["available_actions"]:
            return actions.FunctionCall(command, [[0], target])
        else:
            return actions.FunctionCall(_NO_OP, [])

    def get_action(self, s):
        """Epsilon-greedy choice of a flat action index.

        :param s: obs.observation["screen"] layer with a leading channel axis
        :returns: argmax of the Q output when acting greedily, otherwise a
            random screen position encoded as ``x * size + y``
        """
        # greedy
        if np.random.rand() > self._epsilon.value():
            s = Variable(torch.from_numpy(s).cuda())
            s = s.unsqueeze(0).float()
            self._action = self._Q(s).squeeze().cpu().data.numpy()
            return self._action.argmax()
        # explore
        else:
            action = 0
            target = np.random.randint(0, self._screen_size, size=2)
            return (action * self._screen_size * self._screen_size +
                    target[0] * self._screen_size + target[1])

    def select_friendly_action(self, obs):
        """Return a select-point call on the mean position of friendly pixels.

        Returns a no-op when no friendly pixel is on screen, because the mean
        of an empty array is NaN and cannot be converted to a coordinate.
        """
        player_relative = obs.observation["screen"][_PLAYER_RELATIVE]
        friendly_y, friendly_x = (
            player_relative == _PLAYER_FRIENDLY).nonzero()
        if len(friendly_y) == 0:
            return actions.FunctionCall(_NO_OP, [])
        target = [int(friendly_x.mean()), int(friendly_y.mean())]
        return actions.FunctionCall(_SELECT_POINT, [[0], target])

    def train(self, env, training=True):
        """Run the interaction loop and, when training, save the Q weights."""
        self._epsilon.isTraining = training
        self.run_loop(env, self.max_frames)
        if self._epsilon.isTraining:
            # Make sure the target directory exists so that a long training
            # run is not lost on the final save.
            weights_dir = os.path.dirname(self._Q_weights_path)
            if weights_dir:
                os.makedirs(weights_dir, exist_ok=True)
            torch.save(self._Q.state_dict(), self._Q_weights_path)

    def run_loop(self, env, max_frames=0):
        """A run loop to have agents and an environment interact."""
        total_frames = 0
        start_time = time.time()

        action_spec = env.action_spec()
        observation_spec = env.observation_spec()

        self.setup(observation_spec, action_spec)

        try:
            while True:
                obs = env.reset()[0]
                # remove unit selection from the equation by selecting the
                # friendly on every new game.
                select_friendly = self.select_friendly_action(obs)
                obs = env.step([select_friendly])[0]

                self.reset()

                while True:
                    total_frames += 1

                    self._screen = obs.observation["screen"][5]
                    s = np.expand_dims(obs.observation["screen"][5], 0)

                    if max_frames and total_frames >= max_frames:
                        print("max frames reached")
                        return
                    if obs.last():
                        print("total frames:", total_frames, "Epsilon:",
                              self._epsilon.value())
                        self._epsilon.increment()
                        break

                    action = self.get_action(s)
                    env_actions = self.get_env_action(action, obs)
                    obs = env.step([env_actions])[0]

                    r = obs.reward
                    s1 = np.expand_dims(obs.observation["screen"][5], 0)
                    # Any positive reward is stored as a terminal transition.
                    done = r > 0
                    if self._epsilon.isTraining:
                        transition = Transition(s, action, s1, r, done)
                        self._memory.push(transition)

                    learning = (total_frames > self.steps_before_training
                                and self._epsilon.isTraining)

                    if total_frames % self.train_q_per_step == 0 and learning:
                        self.train_q()

                    if (total_frames % self.target_q_update_frequency == 0
                            and learning):
                        self._Qt = copy.deepcopy(self._Q)
                        self.show_chart()

                    if total_frames % 1000 == 0 and learning:
                        self.show_chart()

                    if not self._epsilon.isTraining and total_frames % 3 == 0:
                        self.show_chart()

        except KeyboardInterrupt:
            pass
        finally:
            print("finished")
            elapsed_time = time.time() - start_time
            # Guard against a zero interval on coarse clocks so the summary
            # never masks the real exit reason with a ZeroDivisionError.
            fps = total_frames / elapsed_time if elapsed_time > 0 else 0.0
            print("Took %.3f seconds for %s steps: %.3f fps" %
                  (elapsed_time, total_frames, fps))

    def get_reward(self, s):
        """Return the negative distance between friendly and neutral centroids.

        Returns 0 when either group has no pixels on screen (the centroid
        would be NaN and ``int()`` would raise).
        """
        player_relative = s[_PLAYER_RELATIVE]
        neutral_y, neutral_x = (player_relative == _PLAYER_NEUTRAL).nonzero()
        friendly_y, friendly_x = (
            player_relative == _PLAYER_FRIENDLY).nonzero()
        if len(friendly_y) == 0 or len(neutral_y) == 0:
            # Nothing to measure against; treat as a neutral reward.
            return 0
        neutral_target = [int(neutral_x.mean()), int(neutral_y.mean())]
        friendly_target = [int(friendly_x.mean()), int(friendly_y.mean())]

        distance_2 = (neutral_target[0] - friendly_target[0])**2 + (
            neutral_target[1] - friendly_target[1])**2
        distance = math.sqrt(distance_2)
        return -distance

    def show_chart(self):
        """Redraw the loss, max-Q, screen and action panels."""
        self._plot[0].clear()
        self._plot[0].set_xlabel('Last 1000 Training Cycles')
        self._plot[0].set_ylabel('Loss')
        self._plot[0].plot(list(self._loss))

        self._plot[1].clear()
        self._plot[1].set_xlabel('Last 1000 Training Cycles')
        self._plot[1].set_ylabel('Max Q')
        self._plot[1].plot(list(self._max_q))

        self._plot[2].clear()
        self._plot[2].set_title("screen")
        if self._screen is not None:
            self._plot[2].imshow(self._screen)

        self._plot[3].clear()
        self._plot[3].set_title("action")
        # No greedy action has been taken yet while _action is still None.
        if self._action is not None:
            self._plot[3].imshow(self._action)

        plt.pause(0.00001)

    def train_q(self):
        """Run one double-DQN update on a sampled mini-batch.

        Skipped until the replay memory holds more than ``train_q_batch_size``
        transitions.
        """
        if self.train_q_batch_size >= len(self._memory):
            return

        s, a, s_1, r, done = self._memory.sample(self.train_q_batch_size)
        s = Variable(torch.from_numpy(s).cuda()).float()
        a = Variable(torch.from_numpy(a).cuda()).long()
        # NOTE(review): ``volatile`` only has an effect on pre-0.4 PyTorch;
        # on newer versions the target would need torch.no_grad()/detach().
        s_1 = Variable(torch.from_numpy(s_1).cuda(), volatile=True).float()
        r = Variable(torch.from_numpy(r).cuda()).float()
        # From here on ``done`` holds the NOT-done mask (1 - done).
        done = Variable(torch.from_numpy(1 - done).cuda()).float()

        # Q_sa = r + gamma * max(Q_s'a')
        Q = self._Q(s)
        Q = Q.view(self.train_q_batch_size, -1)
        Q = Q.gather(1, a)

        Qt = self._Qt(s_1).view(self.train_q_batch_size, -1)

        # double Q: the online network picks the action, the target network
        # evaluates it.
        best_action = self._Q(s_1).view(self.train_q_batch_size,
                                        -1).max(dim=1, keepdim=True)[1]
        y = r + done * self.gamma * Qt.gather(1, best_action)

        y.volatile = False

        loss = self._criterion(Q, y)
        self._loss.append(loss.sum().cpu().data.numpy())
        self._max_q.append(Q.max().cpu().data.numpy()[0])
        self._optimizer.zero_grad()  # zero the gradient buffers
        loss.backward()
        self._optimizer.step()
class NEC:
    """Neural Episodic Control agent: a CNN embedding plus one DND per action."""

    def __init__(self, env, args, device='cpu'):
        """
        Instantiate an NEC Agent

        Parameters
        ----------
        env: gym.Env
            gym environment to train on
        args: args class from argparser
            args are from from train.py: see train.py for help with each arg
        device: string
            'cpu' or 'cuda:0' depending on use_cuda flag from train.py
        """
        self.environment_type = args.environment_type
        self.env = env
        self.device = device
        # Hyperparameters
        self.epsilon = args.initial_epsilon
        self.final_epsilon = args.final_epsilon
        self.epsilon_decay = args.epsilon_decay
        self.gamma = args.gamma
        self.N = args.N
        # Transition queue and replay memory
        self.transition_queue = []
        self.replay_every = args.replay_every
        self.replay_buffer_size = args.replay_buffer_size
        self.replay_memory = ReplayMemory(self.replay_buffer_size)
        # CNN for state embedding network
        self.frames_to_stack = args.frames_to_stack
        self.embedding_size = args.embedding_size
        self.in_height = args.in_height
        self.in_width = args.in_width
        self.cnn = CNN(self.frames_to_stack, self.embedding_size,
                       self.in_height, self.in_width).to(self.device)
        # Differentiable Neural Dictionary (DND): one for each action
        self.kernel = inverse_distance
        self.num_neighbors = args.num_neighbors
        self.max_memory = args.max_memory
        self.lr = args.lr
        self.dnd_list = [
            DND(self.kernel, self.num_neighbors, self.max_memory,
                args.optimizer, self.lr)
            for _ in range(env.action_space.n)
        ]
        # Optimizer for state embedding CNN
        self.q_lr = args.q_lr
        self.batch_size = args.batch_size
        self.optimizer = get_optimizer(args.optimizer, self.cnn.parameters(),
                                       self.lr)

    def _embed(self, state):
        """Return the CNN embedding of one observation.

        The observation is permuted with (2, 0, 1) to channel-first order and
        given a leading batch axis before being moved to ``self.device``.
        """
        frame = torch.tensor(state).permute(2, 0, 1)  # (C,H,W)
        frame = frame.unsqueeze(0).to(self.device)  # (N,C,H,W)
        return self.cnn(frame)

    def choose_action(self, state_embedding):
        """
        Choose epsilon-greedy policy according to Q-estimates from DNDs

        Always returns a plain ``int`` so both branches hand the same type to
        the environment and to ``dnd_list`` indexing.
        """
        if random.uniform(0, 1) < self.epsilon:
            return random.randint(0, self.env.action_space.n - 1)
        qs = [dnd.lookup(state_embedding) for dnd in self.dnd_list]
        # argmax yields a 0-dim tensor; convert it to a Python int.
        return int(torch.argmax(torch.cat(qs)).item())

    def Q_lookahead(self, t, warmup=False):
        """
        Return the N-step Q-value lookahead from time t in the transition queue

        During warmup, or when fewer than N further transitions exist, only
        the discounted rewards to the end of the queue are used; otherwise the
        discounted N rewards are bootstrapped with the max DND estimate of the
        state N steps ahead.
        """
        if warmup or len(self.transition_queue) <= t + self.N:
            lookahead = [tr.reward for tr in self.transition_queue[t:]]
            discounted = discount(lookahead, self.gamma)
            return torch.tensor([discounted], requires_grad=True)

        lookahead = [tr.reward for tr in self.transition_queue[t:t + self.N]]
        discounted = discount(lookahead, self.gamma)
        state_embedding = self._embed(self.transition_queue[t + self.N].state)
        Q_a = [dnd.lookup(state_embedding) for dnd in self.dnd_list]
        maxQ = torch.cat(Q_a).max()
        Q_N = discounted + (self.gamma**self.N) * maxQ
        # Re-wrapping creates a fresh leaf tensor holding the same value.
        return torch.tensor([Q_N], requires_grad=True)

    def Q_update(self, Q, Q_N):
        """
        Return the Q-update for DND updates
        """
        return Q + self.q_lr * (Q_N - Q)

    def update(self):
        """
        Iterate through the transition queue and make NEC updates
        """
        # Insert transitions into DNDs
        for t, tr in enumerate(self.transition_queue):
            action = tr.action
            state_embedding = self._embed(tr.state)
            dnd = self.dnd_list[action]

            Q_N = self.Q_lookahead(t).to(self.device)
            embedding_index = dnd.get_index(state_embedding)
            if embedding_index is None:
                dnd.insert(state_embedding.detach(),
                           Q_N.detach().unsqueeze(0))
            else:
                Q = self.Q_update(dnd.values[embedding_index], Q_N)
                dnd.update(Q.detach(), embedding_index)
            Q_N = Q_N.detach().to(self.device)
            self.replay_memory.push(tr.state, action, Q_N)

        # Commit inserts
        for dnd in self.dnd_list:
            dnd.commit_insert()

        # Train CNN on minibatch: once every ``replay_every`` transitions and
        # once more for the last transition of the episode.
        queue_length = len(self.transition_queue)
        for t in range(queue_length):
            if t % self.replay_every == 0 or t == queue_length - 1:
                # Train on random mini-batch from self.replay_memory
                batch = self.replay_memory.sample(self.batch_size)
                actual_Qs = torch.cat([sample.Q_N for sample in batch])
                predicted_Qs = []
                for sample in batch:
                    state_embedding = self._embed(sample.state)
                    dnd = self.dnd_list[sample.action]
                    predicted_Qs.append(
                        dnd.lookup(state_embedding, update_flag=True))
                predicted_Qs = torch.cat(predicted_Qs).to(self.device)
                loss = torch.dist(actual_Qs, predicted_Qs)
                self.optimizer.zero_grad()
                loss.backward()
                self.optimizer.step()
                for dnd in self.dnd_list:
                    dnd.update_params()

        # Clear out transition queue
        self.transition_queue = []

    def run_episode(self):
        """
        Train an NEC agent for a single episode:
            Interact with environment
            Append (state, action, reward) transitions to transition queue
            Call update at the end of the episode

        Returns ``(n_extra_steps, total_frames, total_reward)`` for the
        'fourrooms' environment type, else ``(total_frames, total_reward)``.
        """
        if self.epsilon > self.final_epsilon:
            self.epsilon = self.epsilon * self.epsilon_decay
        state = self.env.reset()
        if self.environment_type == 'fourrooms':
            fewest_steps = self.env.shortest_path_length(self.env.state)
        total_steps = 0
        total_reward = 0
        total_frames = 0
        done = False
        while not done:
            state_embedding = self._embed(state)
            action = self.choose_action(state_embedding)
            next_state, reward, done, _ = self.env.step(action)
            self.transition_queue.append(Transition(state, action, reward))
            total_reward += reward
            total_frames += self.env.skip
            total_steps += 1
            state = next_state
        self.update()
        if self.environment_type == 'fourrooms':
            n_extra_steps = total_steps - fewest_steps
            return n_extra_steps, total_frames, total_reward
        else:
            return total_frames, total_reward

    def warmup(self):
        """
        Warmup the DND with values from an episode with a random policy

        Returns ``(total_frames, total_reward)`` of the random episode.
        """
        state = self.env.reset()
        total_reward = 0
        total_frames = 0
        done = False
        while not done:
            action = random.randint(0, self.env.action_space.n - 1)
            next_state, reward, done, _ = self.env.step(action)
            total_reward += reward
            total_frames += self.env.skip
            self.transition_queue.append(Transition(state, action, reward))
            state = next_state

        for t, tr in enumerate(self.transition_queue):
            state_embedding = self._embed(tr.state)
            action = tr.action
            dnd = self.dnd_list[action]

            Q_N = self.Q_lookahead(t, True).to(self.device)
            if dnd.keys_to_be_inserted is None and dnd.keys is None:
                # Completely empty DND: nothing to look up, insert directly.
                dnd.insert(state_embedding, Q_N.detach().unsqueeze(0))
            else:
                embedding_index = dnd.get_index(state_embedding)
                if embedding_index is None:
                    state_embedding = state_embedding.detach()
                    dnd.insert(state_embedding, Q_N.detach().unsqueeze(0))
                else:
                    Q = self.Q_update(dnd.values[embedding_index], Q_N)
                    dnd.update(Q.detach(), embedding_index)
            self.replay_memory.push(tr.state, action, Q_N.detach())
        for dnd in self.dnd_list:
            dnd.commit_insert()
        # Clear out transition queue
        self.transition_queue = []
        return total_frames, total_reward
class DDPG():
    """
    The Deep Deterministic Policy Gradient (DDPG) Agent

    Parameters
    ----------
    actor_optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as learning rate and
        other parameters for the optimizer
    critic_optimizer_spec: OptimizerSpec
    num_feature: int
        The number of features of the environmental state
    num_action: int
        The number of available actions that agent can choose from
    net_type: int
        0: plain MLP on the state; 1: MLP on the state concatenated with the
        phase; 2: PMLPA/PMLPC networks that receive the phase as a separate
        argument.
    replay_memory_size: int
        How many memories to store in the replay memory.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    tau: float
        The update rate that target networks slowly track the learned networks.
    """

    def __init__(self,
                 actor_optimizer_spec,
                 critic_optimizer_spec,
                 num_feature,
                 num_action,
                 net_type,
                 replay_memory_size=1000000,
                 batch_size=64,
                 tau=0.001):
        # ---- Build model ----
        self.num_feature = num_feature
        self.num_action = num_action
        self.batch_size = batch_size
        self.tau = tau

        hidden_size = (400, 300)
        if net_type == 0 or net_type == 1:
            # net_type 1 appends the scalar phase to the state input.
            state_size = num_feature + 1 if net_type == 1 else num_feature

            def build_actor():
                return MLPA(input_size=state_size,
                            output_size=num_action,
                            hidden_size=hidden_size,
                            n_layers=2,
                            tanh_flag=1).type(dtype)

            def build_critic():
                return MLPC(input_size_state=state_size,
                            input_size_action=num_action,
                            output_size=1,
                            hidden_size=hidden_size,
                            n_layers=2).type(dtype)
        elif net_type == 2:

            def build_actor():
                return PMLPA(input_size=num_feature,
                             output_size=num_action,
                             hidden_size=hidden_size,
                             dtype=dtype,
                             n_layers=2,
                             tanh_flag=1).type(dtype)

            def build_critic():
                return PMLPC(input_size_state=num_feature,
                             input_size_action=num_action,
                             output_size=1,
                             hidden_size=hidden_size,
                             dtype=dtype,
                             n_layers=2).type(dtype)
        else:
            # Fail here with a clear message instead of a later
            # AttributeError on the missing networks.
            raise ValueError(
                "net_type must be 0, 1 or 2, got {!r}".format(net_type))

        # Construct actor and critic (same creation order as before so that
        # random initialisation is unchanged).
        self.actor = build_actor()
        self.target_actor = build_actor()
        self.critic = build_critic()
        self.target_critic = build_critic()

        # Construct the optimizers for actor and critic
        self.actor_optimizer = actor_optimizer_spec.constructor(
            self.actor.parameters(), **actor_optimizer_spec.kwargs)
        self.critic_optimizer = critic_optimizer_spec.constructor(
            self.critic.parameters(), **critic_optimizer_spec.kwargs)
        # Construct the replay memory
        self.replay_memory = ReplayMemory(replay_memory_size)

    def copy_weights_for_finetune(self, weight_files):
        """Initialise the per-controller layers from previously saved agents.

        Hard coded for finetuning: the i-th entry of each ``control_*`` layer
        list receives the ``l1`` / ``l2`` / ``h2o`` weights of the actor or
        critic of the agent stored in ``weight_files[i]``.
        """
        # Load every checkpoint once instead of once per layer group.
        # NOTE(review): torch.load unpickles whole agent objects here; only
        # use it on trusted files.
        agents = [torch.load(weight_file) for weight_file in weight_files]
        for own_net, source_attr in ((self.actor, 'actor'),
                                     (self.critic, 'critic')):
            layer_groups = (
                (own_net.control_hidden_list[0], 'l1'),
                (own_net.control_hidden_list[1], 'l2'),
                (own_net.control_h2o_list, 'h2o'),
            )
            for layers, layer_name in layer_groups:
                # Each layer is paired with its own agent. The critic's h2o
                # loop previously reused the last file for every layer.
                for lin_layer, agent in zip(layers, agents):
                    source_net = getattr(agent, source_attr)
                    lin_layer.load_state_dict(
                        getattr(source_net, layer_name).state_dict())

    def select_action(self, state, phase, net_type):
        """Return the actor output for one state (and phase) as a CPU tensor."""
        state = torch.from_numpy(state).type(dtype).unsqueeze(0)
        phase = torch.from_numpy(np.array([phase])).type(dtype).unsqueeze(0)
        # NOTE(review): ``volatile`` only has an effect on pre-0.4 PyTorch.
        if net_type == 0:
            action = self.actor(Variable(state, volatile=True)).data.cpu()
        elif net_type == 1:
            action = self.actor(
                Variable(torch.cat((state, phase), 1),
                         volatile=True)).data.cpu()
        elif net_type == 2:
            action = self.actor(Variable(state, volatile=True),
                                Variable(phase, volatile=True)).data.cpu()
        else:
            raise ValueError(
                "net_type must be 0, 1 or 2, got {!r}".format(net_type))
        return action

    def update(self, net_type, gamma=1.0):
        """Run one critic step, one actor step and a soft target update.

        Does nothing while the replay memory holds fewer than ``batch_size``
        transitions.
        """
        if len(self.replay_memory) < self.batch_size:
            return
        if net_type not in (0, 1, 2):
            raise ValueError(
                "net_type must be 0, 1 or 2, got {!r}".format(net_type))

        (state_batch, action_batch, reward_batch, next_state_batch,
         phase_batch, next_phase_batch, done_mask) = \
            self.replay_memory.sample(self.batch_size)
        state_batch = Variable(torch.from_numpy(state_batch).type(dtype))
        action_batch = Variable(torch.from_numpy(action_batch).type(dtype))
        reward_batch = Variable(torch.from_numpy(reward_batch).type(dtype))
        next_state_batch = Variable(
            torch.from_numpy(next_state_batch).type(dtype))
        phase_batch = Variable(
            torch.from_numpy(phase_batch).type(dtype)).unsqueeze(1)
        next_phase_batch = Variable(
            torch.from_numpy(next_phase_batch).type(dtype)).unsqueeze(1)
        not_done_mask = Variable(torch.from_numpy(1 - done_mask)).type(dtype)

        ### Critic ###
        self.critic_optimizer.zero_grad()
        # Current Q of the taken action, and next Q of the action the target
        # actor would choose in the NEXT state.
        if net_type == 0:
            current_Q_values = self.critic(
                torch.cat((state_batch, action_batch), 1))
            target_actions = self.target_actor(next_state_batch)
            next_q = self.target_critic(
                torch.cat((next_state_batch, target_actions), 1))
        elif net_type == 1:
            current_Q_values = self.critic(
                torch.cat((state_batch, phase_batch, action_batch), 1))
            target_actions = self.target_actor(
                torch.cat((next_state_batch, next_phase_batch), 1))
            next_q = self.target_critic(
                torch.cat((next_state_batch, next_phase_batch,
                           target_actions), 1))
        else:
            current_Q_values = self.critic(
                torch.cat((state_batch, action_batch), 1), phase_batch)
            target_actions = self.target_actor(next_state_batch,
                                               next_phase_batch)
            next_q = self.target_critic(
                torch.cat((next_state_batch, target_actions), 1),
                next_phase_batch)
        # Detach from the graph: no gradient should reach the target networks.
        next_max_q = next_q.detach().max(1)[0]
        next_Q_values = not_done_mask * next_max_q
        # Compute the target of the current Q values
        target_Q_values = reward_batch + (gamma * next_Q_values)
        # Compute Bellman error (mean squared error).
        # NOTE(review): if the critic output is (batch, 1) while the target
        # is (batch,), mse_loss broadcasts to (batch, batch) -- confirm shapes.
        critic_loss = F.mse_loss(current_Q_values, target_Q_values)
        # Optimize the critic
        critic_loss.backward()
        self.critic_optimizer.step()

        ### Actor ###
        self.actor_optimizer.zero_grad()
        if net_type == 0:
            actor_loss = -self.critic(
                torch.cat((state_batch, self.actor(state_batch)), 1)).mean()
        elif net_type == 1:
            policy_actions = self.actor(
                torch.cat((state_batch, phase_batch), 1))
            actor_loss = -self.critic(
                torch.cat((state_batch, phase_batch, policy_actions),
                          1)).mean()
        else:
            actor_loss = -self.critic(
                torch.cat((state_batch,
                           self.actor(state_batch, phase_batch)), 1),
                phase_batch).mean()
        # Optimize the actor
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update the target networks
        self.update_target(self.target_critic, self.critic)
        self.update_target(self.target_actor, self.actor)

    def update_target(self, target_model, model):
        """Soft update: ``target = tau * model + (1 - tau) * target``."""
        for target_param, param in zip(target_model.parameters(),
                                       model.parameters()):
            target_param.data.copy_(self.tau * param.data +
                                    (1 - self.tau) * target_param.data)
def train_model(env,
                conv_layers,
                learning_rate=5e-4,
                total_timesteps=100000,
                buffer_size=50000,
                exploration_fraction=0.1,
                exploration_final_eps=0.02,
                train_freq=1,
                batch_size=32,
                print_freq=1,
                checkpoint_freq=100000,
                checkpoint_path=None,
                learning_starts=1000,
                gamma=1.0,
                target_network_update_freq=500,
                double_dqn=False,
                **network_kwargs) -> tf.keras.Model:
    """Train a DQN model.

    Parameters
    -------
    env: gym.Env
        openai gym
    conv_layers: list
        a list of triples that defines the conv network
    learning_rate: float
        learning rate for adam optimizer
    total_timesteps: int
        number of env steps to run the environment
    buffer_size: int
        size of the replay buffer
    exploration_fraction: float
        fraction of entire training period over which the exploration rate is
        annealed
    exploration_final_eps: float
        final value of random action probability
    train_freq: int
        update the model every train_freq steps.
    batch_size: int
        size of a batched sampled from replay buffer for training
    print_freq: int
        how often to print out training progress
        set to None (or 0) to disable printing
    checkpoint_freq: int
        how often to store a checkpoint during training
    checkpoint_path: str
        the fs path for storing the checkpoints
    learning_starts: int
        how many steps of the model to collect transitions for before
        learning starts
    gamma: float
        discount factor
    target_network_update_freq: int
        update the target network every `target_network_update_freq` steps.
    double_dqn: bool
        specifies if double q-learning is used during training

    Returns
    -------
    dqn: an instance of tf.Module that contains the trained model
    """
    q_func = build_dueling_q_func(conv_layers, **network_kwargs)

    dqn = DeepQ(model_builder=q_func,
                observation_shape=env.observation_space.shape,
                num_actions=env.action_space.n,
                learning_rate=learning_rate,
                gamma=gamma,
                double_dqn=double_dqn)

    manager = None
    if checkpoint_path is not None:
        load_path = osp.expanduser(checkpoint_path)
        ckpt = tf.train.Checkpoint(model=dqn.q_network)
        manager = tf.train.CheckpointManager(ckpt, load_path, max_to_keep=5)
        ckpt.restore(manager.latest_checkpoint)
        # Only announce a restore when a checkpoint was actually found.
        if manager.latest_checkpoint:
            print("Restoring from {}".format(manager.latest_checkpoint))

    current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
    train_log_dir = 'logs/gradient_tape/' + current_time + '/train'
    train_summary_writer = tf.summary.create_file_writer(train_log_dir)

    # Create the replay buffer
    replay_buffer = ReplayMemory(buffer_size)
    # Create the schedule for exploration starting from 1.
    exploration = LinearSchedule(
        total_timesteps=int(exploration_fraction * total_timesteps),
        initial_prob=1.0,
        final_prob=exploration_final_eps)

    dqn.update_target()

    episode_rewards = [0.0]
    obs = env.reset()
    obs = np.expand_dims(np.array(obs), axis=0)

    for t in range(total_timesteps):
        update_eps = exploration.step_to(t)

        action, _, _, _ = dqn.step(tf.constant(obs), update_eps=update_eps)
        action = action[0].numpy()

        new_obs, reward, done, _ = env.step(action)
        # Store transition in the replay buffer.
        new_obs = np.expand_dims(np.array(new_obs), axis=0)
        replay_buffer.add(obs[0], action, reward, new_obs[0], float(done))
        obs = new_obs

        episode_rewards[-1] += reward
        if done:
            obs = env.reset()
            obs = np.expand_dims(np.array(obs), axis=0)
            # Open a new (still empty) episode entry.
            episode_rewards.append(0.0)

        if t > learning_starts and t % train_freq == 0:
            # Minimize the error in Bellman's equation on a batch sampled
            # from replay buffer; every sample gets a weight of one.
            obses_t, actions, rewards, obses_tp1, dones = \
                replay_buffer.sample(batch_size)
            weights = tf.ones_like(rewards)
            dqn.train(obses_t, actions, rewards, obses_tp1, dones, weights)

        if t > learning_starts and t % target_network_update_freq == 0:
            # Update target network every target_network_update_freq steps
            dqn.update_target()

        number_episodes = len(episode_rewards) - 1
        if done and print_freq and number_episodes % print_freq == 0:
            # Computed only when it is reported; at this point at least one
            # episode is complete, so the slice is never empty.
            reward_100_mean = np.round(np.mean(episode_rewards[-101:-1]), 1)
            format_str = "Steps: {}, Episodes: {}, 100 ep reward average: {}, Reward: {}, Epsilon-greedy %explore: {}"
            print(
                format_str.format(t, number_episodes, reward_100_mean,
                                  episode_rewards[-2],
                                  int(100 * exploration.value(t))))

            with train_summary_writer.as_default():
                tf.summary.scalar('loss',
                                  dqn.train_loss_metrics.result(),
                                  step=t)
                # episode_rewards[-2] is the episode that just finished.
                tf.summary.scalar('reward', episode_rewards[-2], step=t)

        if checkpoint_path is not None and t % checkpoint_freq == 0:
            manager.save()

        # Every training step, reset the loss metric
        dqn.train_loss_metrics.reset_states()

    return dqn.q_network
class DDPG():
    """
    The Deep Deterministic Policy Gradient (DDPG) Agent

    Parameters
    ----------
    actor_optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as learning rate and other
        parameters for the actor's optimizer
    critic_optimizer_spec: OptimizerSpec
        Same as above, for the critic's optimizer
    num_feature: int
        The number of features of the environmental state
    num_action: int
        The number of available actions that agent can choose from
    replay_memory_size: int
        How many memories to store in the replay memory.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    tau: float
        The update rate that target networks slowly track the learned networks.
    """

    def __init__(self,
                 actor_optimizer_spec,
                 critic_optimizer_spec,
                 num_feature,
                 num_action,
                 replay_memory_size=1000000,
                 batch_size=64,
                 tau=0.001):
        ###############
        # BUILD MODEL #
        ###############
        self.num_feature = num_feature
        self.num_action = num_action
        self.batch_size = batch_size
        self.tau = tau
        # Construct actor and critic
        self.actor = Actor(num_feature, num_action).type(dtype)
        self.target_actor = Actor(num_feature, num_action).type(dtype)
        self.critic = Critic(num_feature, num_action).type(dtype)
        self.target_critic = Critic(num_feature, num_action).type(dtype)
        # Construct the optimizers for actor and critic
        self.actor_optimizer = actor_optimizer_spec.constructor(
            self.actor.parameters(), **actor_optimizer_spec.kwargs)
        self.critic_optimizer = critic_optimizer_spec.constructor(
            self.critic.parameters(), **critic_optimizer_spec.kwargs)
        # Construct the replay memory
        self.replay_memory = ReplayMemory(replay_memory_size)

    def select_action(self, state):
        """
        Run the actor on a single state and return element [0, 0] of its output.

        Parameters
        ----------
        state: numpy array
            A single (un-batched) state; a batch dimension is added here.
        """
        state = torch.from_numpy(state).type(dtype).unsqueeze(0)
        # Inference only: no autograd graph is needed. This replaces the
        # deprecated `Variable(..., volatile=True)` idiom.
        with torch.no_grad():
            action = self.actor(state).cpu()[0, 0]
        return action

    def update(self, gamma=1.0):
        """
        Perform one critic step, one actor step and one target-network update.

        Returns immediately (None) while the replay memory holds fewer than
        `batch_size` transitions.

        Parameters
        ----------
        gamma: float
            Discount factor applied to the bootstrapped next-state value.
        """
        if len(self.replay_memory) < self.batch_size:
            return
        state_batch, action_batch, reward_batch, next_state_batch, done_mask = \
            self.replay_memory.sample(self.batch_size)
        state_batch = torch.from_numpy(state_batch).type(dtype)
        action_batch = torch.from_numpy(action_batch).type(dtype).unsqueeze(1)
        reward_batch = torch.from_numpy(reward_batch).type(dtype)
        next_state_batch = torch.from_numpy(next_state_batch).type(dtype)
        not_done_mask = torch.from_numpy(1 - done_mask).type(dtype)

        ### Critic ###
        # Compute current Q value, critic takes state and action chosen
        current_Q_values = self.critic(state_batch, action_batch)
        # Compute next Q value based on which action target actor would choose.
        # NOTE(review): presumably get_target_value_critic returns a value that
        # is detached from the graph -- confirm against its definition.
        target_Q_values = get_target_value_critic(
            self.target_critic, self.target_actor, next_state_batch)
        target_Q_values = torch.squeeze(target_Q_values)
        # The target is built in place on `.data` so that no gradient history
        # is recorded for it: target = reward + gamma * not_done * Q'(s', a').
        target_Q_values.data.mul_(gamma)
        # if done, only the reward remains as the target
        target_Q_values.data.mul_(not_done_mask.data)
        target_Q_values.data.add_(reward_batch.data)
        # Compute Bellman error (using Huber loss)
        # NOTE(review): the target was squeezed above; confirm that
        # current_Q_values has a matching shape, since smooth_l1_loss
        # broadcasts mismatched shapes instead of failing.
        critic_loss = F.smooth_l1_loss(current_Q_values, target_Q_values)

        # Optimize the critic with the optimizer built from
        # critic_optimizer_spec (previously the spec-built optimizer was
        # constructed but never stepped).
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        ### Actor ###
        # Maximise the critic's value of the actor's own actions.
        actor_loss = -self.critic(state_batch, self.actor(state_batch)).mean()

        # Optimize the actor with the optimizer built from actor_optimizer_spec
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update the target networks
        self.target_actor.moving_average_update(self.actor.state_dict(),
                                                decay=1 - self.tau)
        self.target_critic.moving_average_update(self.critic.state_dict(),
                                                 decay=1 - self.tau)
class DQN_Agent():
    '''
    Regular Q-Learning Agent
    One deep network.
    DQN - to predict Q of a given action, value a state. i.e. Q(s,a) and Q(s', a') for loss calculation.

    `target_net` is None in this class, so bootstrapped values come from
    `policy_net` and `_update_target` is a no-op unless `target_net` is set.
    '''

    def __init__(
            self,
            state_size,
            n_actions,
            args,
            device=torch.device("cuda" if torch.cuda.is_available() else "cpu")):
        '''
        :param state_size: size of a state vector, forwarded to the DQN.
        :param n_actions: number of discrete actions.
        :param args: object providing eps_start, eps_end, eps_decay,
            target_update, discount, layers, batch_size, grad_clip,
            optimizer ('adam' or 'rmsprop') and replay_size.
        :param device: torch device the networks and batches live on.
        :raises NotImplementedError: for any other optimizer name.
        '''
        self.device = device
        # Exploration / Exploitation params.
        self.steps_done = 0
        self.eps_threshold = 1
        self.eps_start = args.eps_start
        self.eps_end = args.eps_end
        self.eps_decay = args.eps_decay

        # RL params
        self.target_update = args.target_update
        self.discount = args.discount

        # Env params
        self.n_actions = n_actions
        self.state_size = state_size

        # Deep q networks params
        self.layers = args.layers
        self.batch_size = args.batch_size
        self.policy_net = DQN(state_size, n_actions,
                              layers=self.layers).to(self.device).float()
        self.target_net = None
        self.grad_clip = args.grad_clip

        # The branches must be mutually exclusive: with two independent `if`
        # statements, 'adam' fell through to the `else` of the second one.
        optimizer_name = str(args.optimizer).lower()
        if optimizer_name == 'adam':
            self.optimizer = optim.Adam(self.policy_net.parameters())
        elif optimizer_name == 'rmsprop':
            self.optimizer = optim.RMSprop(self.policy_net.parameters())
        else:
            raise NotImplementedError(
                'Unsupported optimizer: {}'.format(args.optimizer))

        self.memory = ReplayMemory(args.replay_size)

        # Performance buffers.
        self.rewards_list = []

    def add_to_memory(self, state, action, next_state, reward):
        '''
        Record the reward and push the transition, converted to tensors,
        into the replay memory.
        :param state: numpy array.
        :param action: scalar action.
        :param next_state: numpy array.
        :param reward: scalar reward.
        '''
        self.rewards_list.append(reward)
        state = torch.from_numpy(state).float()
        action = torch.tensor([action])
        next_state = torch.from_numpy(next_state).float()
        reward = torch.tensor([reward])
        self.memory.push(state, action, next_state, reward)

    def select_action(self, state):
        '''
        Epsilon-greedy action selection with exponentially decaying epsilon.
        Updates `eps_threshold` and increments `steps_done` on every call.
        :param state: numpy array holding a single state.
        :return: the chosen action index as a python int (both branches).
        '''
        sample = random.random()
        self.eps_threshold = self.eps_end + (self.eps_start - self.eps_end) * \
            math.exp(-1. * self.steps_done / self.eps_decay)
        self.steps_done += 1
        if sample > self.eps_threshold:
            with torch.no_grad():
                # Convert to tensor and add the batch dimension.
                state = torch.from_numpy(state).float().to(self.device)
                state = state.unsqueeze(0)
                # max(1) returns (values, indices) per row; the index of the
                # largest Q-value is the greedy action.
                return self.policy_net(state).max(1)[1].item()
        return random.randrange(self.n_actions)

    def optimize_model(self):
        '''
        Run one optimisation step on a batch sampled from the replay memory.
        Does nothing while the memory holds fewer than `batch_size` items.
        '''
        if len(self.memory) < self.batch_size:
            return
        transitions = self.memory.sample(self.batch_size)
        # This converts batch-array of Transitions
        # to Transition of batch-arrays.
        batch = Transition(*zip(*transitions))

        next_states_batch = torch.cat(batch.next_state).view(
            self.batch_size, -1).to(self.device)
        state_batch = torch.cat(batch.state).view(
            self.batch_size, -1).to(self.device)
        action_batch = torch.cat(batch.action).view(
            self.batch_size, -1).to(self.device)
        reward_batch = torch.cat(batch.reward).view(
            self.batch_size, -1).to(self.device)

        # Compute loss
        loss = self._compute_loss(state_batch, action_batch,
                                  next_states_batch, reward_batch)

        # Optimize the model
        self.optimizer.zero_grad()
        loss.backward()

        # Clip gradients element-wise; parameters that did not take part in
        # the loss have no gradient and are skipped.
        if self.grad_clip is not None:
            for param in self.policy_net.parameters():
                if param.grad is not None:
                    param.grad.data.clamp_(-self.grad_clip, self.grad_clip)

        # update Policy net weights
        self.optimizer.step()

        # update Target net weights
        self._update_target()

    def _compute_loss(self, state_batch, action_batch, next_states_batch,
                      reward_batch):
        '''
        Huber loss between Q(s_t, a_t) and reward + discount * max_a Q(s_{t+1}, a).
        Both terms use `policy_net`; the bootstrap term is detached.
        NOTE: stored transitions carry no done flag, so every next state is
        bootstrapped.
        :return: scalar loss tensor.
        '''
        # Compute Q(s_t, a) - the model computes Q(s_t), then we select the
        # columns of the actions that were taken.
        state_action_values = self.policy_net(state_batch).gather(
            1, action_batch)

        # Compute V(s_{t+1}) for all next states using the same policy net.
        next_state_values = self.policy_net(next_states_batch).max(
            1)[0].detach()

        # Compute the expected Q values
        expected_state_action_values = (next_state_values.unsqueeze(1) *
                                        self.discount) + reward_batch

        # Compute Huber loss
        return F.smooth_l1_loss(state_action_values,
                                expected_state_action_values)

    def _update_target(self):
        '''
        Synchronise `target_net` with `policy_net`.
        target_update >= 1: hard copy whenever steps_done % target_update == 0.
        0 < target_update < 1: polyak averaging with tau = target_update.
        :raises NotImplementedError: for target_update <= 0.
        '''
        if self.target_net is None:
            # There is nothing to update.
            return

        if self.target_update >= 1:
            # Hard copy of weights (target_update == 1 copies on every step).
            if self.steps_done % self.target_update == 0:
                self.target_net.load_state_dict(self.policy_net.state_dict())
            return

        if self.target_update > 0:
            # polyak averaging:
            tau = self.target_update
            with torch.no_grad():
                for target_param, param in zip(self.target_net.parameters(),
                                               self.policy_net.parameters()):
                    target_param.data.copy_(tau * param +
                                            (1 - tau) * target_param)
            return

        raise NotImplementedError(
            'target_update must be positive, got {}'.format(self.target_update))

    def save_ckpt(self, ckpt_folder):
        '''
        saves checkpoint of policy net in ckpt_folder
        :param ckpt_folder: path to a folder.
        '''
        ckpt_path = os.path.join(ckpt_folder, 'policy_net_state_dict.pth')
        torch.save(self.policy_net.state_dict(), ckpt_path)
class DDPG():
    """
    The Deep Deterministic Policy Gradient (DDPG) Agent

    Parameters
    ----------
    actor_optimizer_spec: OptimizerSpec
        Specifying the constructor and kwargs, as well as learning rate and other
        parameters for the actor's optimizer
    critic_optimizer_spec: OptimizerSpec
        Same as above, for the critic's optimizer
    num_feature: int
        The number of features of the environmental state
    num_action: int
        The number of available actions that agent can choose from
    replay_memory_size: int
        How many memories to store in the replay memory.
    batch_size: int
        How many transitions to sample each time experience is replayed.
    tau: float
        The update rate that target networks slowly track the learned networks.
    """

    def __init__(self,
                 actor_optimizer_spec,
                 critic_optimizer_spec,
                 num_feature,
                 num_action,
                 replay_memory_size=1000000,
                 batch_size=64,
                 tau=0.001):
        ###############
        # BUILD MODEL #
        ###############
        self.num_feature = num_feature
        self.num_action = num_action
        self.batch_size = batch_size
        self.tau = tau
        # Construct actor and critic
        self.actor = Actor(num_feature, num_action).type(dtype)
        self.target_actor = Actor(num_feature, num_action).type(dtype)
        self.critic = Critic(num_feature, num_action).type(dtype)
        self.target_critic = Critic(num_feature, num_action).type(dtype)
        # Construct the optimizers for actor and critic
        self.actor_optimizer = actor_optimizer_spec.constructor(
            self.actor.parameters(), **actor_optimizer_spec.kwargs)
        self.critic_optimizer = critic_optimizer_spec.constructor(
            self.critic.parameters(), **critic_optimizer_spec.kwargs)
        # Construct the replay memory
        self.replay_memory = ReplayMemory(replay_memory_size)

    def select_action(self, state):
        """
        Run the actor on a single state and return its output as a numpy array.

        Parameters
        ----------
        state: numpy array
            A single (un-batched) state; a batch dimension is added here and
            stripped again from the result.
        """
        state = torch.from_numpy(state).type(dtype).unsqueeze(0)
        # Inference only: no autograd graph is needed. This replaces the
        # deprecated `Variable(..., volatile=True)` idiom.
        with torch.no_grad():
            action = self.actor(state)[0].cpu().numpy()
        return action

    def update(self, gamma=1.0):
        """
        Perform one critic step, one actor step and a soft target update.

        Returns immediately (None) while the replay memory holds fewer than
        `batch_size` transitions.

        Parameters
        ----------
        gamma: float
            Discount factor applied to the bootstrapped next-state value.
        """
        if len(self.replay_memory) < self.batch_size:
            return
        state_batch, action_batch, reward_batch, next_state_batch, done_mask = \
            self.replay_memory.sample(self.batch_size)
        state_batch = torch.from_numpy(state_batch).type(dtype)
        action_batch = torch.from_numpy(action_batch).type(dtype)
        reward_batch = torch.from_numpy(reward_batch).type(dtype)
        next_state_batch = torch.from_numpy(next_state_batch).type(dtype)
        not_done_mask = torch.from_numpy(1 - done_mask).type(dtype)

        ### Critic ###
        # Compute current Q value, critic takes state and action chosen
        current_Q_values = self.critic(state_batch, action_batch)
        # Compute next Q value based on which action the target actor would
        # choose in the NEXT state: Q'(s', mu'(s')). No gradients should flow
        # into the target networks, so the whole target is built under no_grad.
        with torch.no_grad():
            target_actions = self.target_actor(next_state_batch)
            next_max_q = self.target_critic(next_state_batch,
                                            target_actions).max(1)[0]
            # Terminal transitions contribute only their reward.
            next_Q_values = not_done_mask * next_max_q
            # Compute the target of the current Q values
            target_Q_values = reward_batch + (gamma * next_Q_values)
        # Compute Bellman error (using Huber loss)
        # NOTE(review): max(1)[0] drops dimension 1 of the critic output;
        # confirm current_Q_values and target_Q_values end up with the same
        # shape, since smooth_l1_loss broadcasts mismatched shapes.
        critic_loss = F.smooth_l1_loss(current_Q_values, target_Q_values)

        # Optimize the critic
        self.critic_optimizer.zero_grad()
        critic_loss.backward()
        self.critic_optimizer.step()

        ### Actor ###
        # Maximise the critic's value of the actor's own actions.
        actor_loss = -self.critic(state_batch, self.actor(state_batch)).mean()

        # Optimize the actor
        self.actor_optimizer.zero_grad()
        actor_loss.backward()
        self.actor_optimizer.step()

        # Update the target networks
        self.update_target(self.target_critic, self.critic)
        self.update_target(self.target_actor, self.actor)

    def update_target(self, target_model, model):
        """
        Soft-update `target_model` in place:
        target = tau * model + (1 - tau) * target, parameter by parameter.
        """
        for target_param, param in zip(target_model.parameters(),
                                       model.parameters()):
            target_param.data.copy_(self.tau * param.data +
                                    (1 - self.tau) * target_param.data)