def __init__(self,
             main_model: QNetwork,
             target_network: QNetwork,
             lr=1e-3,
             gamma=0.9,
             update_every_steps=10,
             memory: RootReplayBuffer = ReplayBuffer(buffer_size=int(1e5), batch_size=64),
             seed=0,
             device=torch.device("cuda:0" if torch.cuda.is_available() else "cpu")):
    """
    Initialization
    :param main_model: Q-Network used as the online (local) network.
    :param target_network: Q-Network used as the target network; must match main_model's sizes.
    :param lr: learning rate for the optimizer.
    :param gamma: discount factor.
    :param update_every_steps: how often (in steps) to run a learning update.
    :param memory: replay buffer holding experience tuples.
    :param seed: to reproduce results.
    :param device: do I have a GPU, or not?
    """
    assert main_model.state_size == target_network.state_size, \
        "Main model accepts a state size %d, but target accepts a state size %d" % \
        (main_model.state_size, target_network.state_size)
    assert main_model.action_size == target_network.action_size, \
        "Main model generates %d possible actions, but target generates %d" % \
        (main_model.action_size, target_network.action_size)
    self.version = "v_debug: use of 'target' when calculating targets"
    self.state_size = main_model.state_size
    self.action_size = main_model.action_size
    self.seed = random.seed(seed)
    self.device = device
    self.update_every_steps = update_every_steps
    self.qnetwork_local = main_model.to(self.device)
    self.qnetwork_target = target_network.to(self.device)
    self.optimizer = optim.Adam(params=self.qnetwork_local.parameters(), lr=lr)
    self.memory = memory  # ReplayBuffer(buffer_size=memory_size, batch_size=batch_size)
    # let's keep track of the steps so that we can run the algorithms properly
    self.t_step = 0
    self.gamma = gamma
def load_qnet(self, model_name):
    """Load Q-Network parameters from file.

    Params
    ======
        model_name (str): name of the Q-Network
    """
    # The saved QNetwork is always the CPU version.
    qnetwork_loaded = QNetwork(self.aug_state_size, self.action_size,
                               self.hsize1, self.hsize2, seed=None)
    qnetwork_loaded.load_state_dict(torch.load(model_name + '.pth'))
    self.qnetwork_local.update_weights(qnetwork_loaded.to(device))  # copy loaded network weights to local network
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed)
        self.qnetwork_local = self.qnetwork_local.to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed)
        self.qnetwork_target = self.qnetwork_target.to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get a random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Compute and minimize the loss
        self.optimizer.zero_grad()
        output_local = self.qnetwork_local(states)
        # detach() keeps gradients from flowing into the target network; its weights are only
        # updated through the soft update below, where local parameters are blended into it.
        output_target = self.qnetwork_target(next_states).detach()
        q_of_states_actions = output_local.gather(1, actions)
        max_q_of_next_states, _ = torch.max(output_target, dim=1)
        max_q_of_next_states = max_q_of_next_states.unsqueeze(1)
        loss = torch.mean(0.5 * (rewards + gamma * max_q_of_next_states * (1 - dones)
                                 - q_of_states_actions) ** 2)
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
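# A minimal usage sketch (not part of the agent above) showing how act() and step() are typically
# driven by an epsilon-greedy training loop. The environment `env`, its classic gym-style
# reset()/step() API, and the epsilon-schedule values are hypothetical placeholders; the sketch
# assumes the same module-level imports (torch, numpy, random) as the snippets in this file.
def train_sketch(agent, env, n_episodes=500, max_t=1000,
                 eps_start=1.0, eps_end=0.01, eps_decay=0.995):
    eps = eps_start
    scores = []
    for _ in range(n_episodes):
        state = env.reset()
        score = 0
        for _ in range(max_t):
            action = agent.act(state, eps)                        # epsilon-greedy action from the local network
            next_state, reward, done, _ = env.step(action)        # gym-style step API (assumed)
            agent.step(state, action, reward, next_state, done)   # store the experience and possibly learn
            state, score = next_state, score + reward
            if done:
                break
        scores.append(score)
        eps = max(eps_end, eps_decay * eps)                        # decay exploration between episodes
    return scores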
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed, filepath):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
            filepath (str): path of a saved checkpoint to resume from (may be empty)
        """
        self.state_size = state_size
        self.action_size = action_size
        self.avarage_score = 0
        self.start_epoch = 0
        self.seed = random.randint(0, seed)
        random.seed(seed)
        print("seed ", seed, " self.seed ", self.seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, self.seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, self.seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        if filepath:
            self.load_model(filepath)

        # Replay memory
        print("buffer size ", BUFFER_SIZE)
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, self.seed)
        print("memory ", self.memory)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)
        if len(self.memory) > BATCH_SIZE:
            experiences = self.memory.sample()
            #print("experiences ", experiences)
            self.learn_DDQN(experiences, GAMMA)

        # Hard-copy the local network into the target network every UPDATE_EVERY steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            self.update_network(self.qnetwork_local, self.qnetwork_target)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn_DDQN(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Double DQN: the local network selects the best next action ...
        Q_targets_next_argmax = self.qnetwork_local(next_states).squeeze(0).detach().max(1)[1].unsqueeze(1)
        #Q_targets_next0 = self.qnetwork_target(next_states).squeeze(0).detach()
        #Q_targets_next = Q_targets_next0.max(1)[0].unsqueeze(1)
        # ... and the target network evaluates it.
        Q_targets_next = self.qnetwork_target(next_states).squeeze(0).gather(1, Q_targets_next_argmax)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).squeeze(0).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        #self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Get max predicted Q values (for next states) from target model
        Q_targets_next0 = self.qnetwork_target(next_states).squeeze(0).detach()
        Q_targets_next = Q_targets_next0.max(1)[0].unsqueeze(1)
        # Compute Q targets for current states
        Q_targets = rewards + (gamma * Q_targets_next * (1 - dones))

        # Get expected Q values from local model
        Q_expected = self.qnetwork_local(states).squeeze(0).gather(1, actions)

        # Compute loss
        loss = F.mse_loss(Q_expected, Q_targets)
        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        #self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def save_model(self, filepath, epoch, score, last=False):
        checkpoint = {
            'input_size': self.state_size,
            'output_size': self.action_size,
            'hidden_layers': [each.in_features for each in self.qnetwork_local.hidden_layers],
            'state_dict': self.qnetwork_local.state_dict(),
            'optimizer_state_dict': self.optimizer.state_dict(),
            'epoch': epoch,
            'avarage_score': score
        }
        checkpoint['hidden_layers'].append(self.qnetwork_local.hidden_layers[-1].out_features)
        torch.save(checkpoint, filepath)
        if last:
            torch.save(self.qnetwork_local.state_dict(),
                       '{}_state_dict_{}.pt'.format(last, epoch))
        #print("checkpoint['hidden_layers'] ", checkpoint['hidden_layers'])

    def load_model(self, filepath):
        print("seed ", self.seed)
        if os.path.isfile(filepath):
            print("=> loading checkpoint '{}'".format(filepath))
            checkpoint = torch.load(filepath)
            print("checkpoint['hidden_layers'] ", checkpoint['hidden_layers'])
            self.qnetwork_local = QNetwork(checkpoint['input_size'],
                                           checkpoint['output_size'],
                                           self.seed,
                                           checkpoint['hidden_layers']).to(device)
            self.qnetwork_local.load_state_dict(checkpoint['state_dict'])
            self.qnetwork_local.to(device)
            self.qnetwork_target = QNetwork(checkpoint['input_size'],
                                            checkpoint['output_size'],
                                            self.seed,
                                            checkpoint['hidden_layers']).to(device)
            self.qnetwork_target.load_state_dict(checkpoint['state_dict'])
            self.qnetwork_target.to(device)
            if 'optimizer_state_dict' in checkpoint:
                self.optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
                for state in self.optimizer.state.values():
                    for k, v in state.items():
                        if isinstance(v, torch.Tensor):
                            state[k] = v.to(device)
                print(self.optimizer)
            if 'epoch' in checkpoint:
                self.start_epoch = checkpoint['epoch']
            if 'avarage_score' in checkpoint:
                self.avarage_score = checkpoint['avarage_score']
            print(self.qnetwork_target)
            print(self.optimizer)
        else:
            print("=> no checkpoint found at '{}'".format(filepath))

    def update_network(self, local_model, target_model):
        for target, local in zip(target_model.parameters(), local_model.parameters()):
            target.data.copy_(local.data)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
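# A small standalone sketch (on dummy tensors, not part of the agent above) contrasting the two
# target computations used by learn() and learn_DDQN(): vanilla DQN bootstraps from the target
# network's own maximum, while Double DQN lets the local network pick the action and the target
# network evaluate it. All tensor names below are illustrative stand-ins.
def double_dqn_target_sketch():
    batch_size, n_actions, gamma = 4, 3, 0.99
    q_local_next = torch.rand(batch_size, n_actions)    # stand-in for qnetwork_local(next_states)
    q_target_next = torch.rand(batch_size, n_actions)   # stand-in for qnetwork_target(next_states)
    rewards = torch.rand(batch_size, 1)
    dones = torch.zeros(batch_size, 1)

    # Vanilla DQN target: max over the target network's own estimates.
    dqn_target = rewards + gamma * q_target_next.max(1)[0].unsqueeze(1) * (1 - dones)

    # Double DQN target: argmax from the local network, value from the target network.
    best_actions = q_local_next.max(1)[1].unsqueeze(1)
    ddqn_target = rewards + gamma * q_target_next.gather(1, best_actions) * (1 - dones)
    return dqn_target, ddqn_target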
def make_movie(env_name, checkpoint='*.tar', num_frames=20, first_frame=0, resolution=75,
               save_dir='./movies/', density=5, radius=5, prefix='default', overfit_mode=False):
    # set the name of the log dir
    load_dir = env_name.lower()

    # metadata
    meta = {}
    # if env_name == "Pong-v0":
    meta['critic_ff'] = 600
    meta['actor_ff'] = 500

    # create the environment
    # env = gym.make(env_name)
    env = make_pytorch_env(env_name)

    # load the actor-critic network
    n_state = env.observation_space.shape[0]
    n_act = env.action_space.n
    model = QNetwork(n_state, n_act)
    model.to(device)
    model.load()

    # get the movie dir (default-100-PongNoFrameSkip)
    movie_title = "{}-{}-{}.mp4".format(prefix, num_frames, env_name.lower())
    print('\tmaking movie "{}" using checkpoint at {}{}'.format(movie_title, load_dir, checkpoint))
    max_ep_len = first_frame + num_frames + 1
    torch.manual_seed(0)

    # play an episode and collect the log (the logits are the actor's values)
    history = rollout(model, env, max_ep_len=max_ep_len)
    print()

    # set up the writer for saving
    start = time.time()
    total_frames = len(history['ins'])
    FFMpegWriter = manimation.writers['ffmpeg']
    metadata = dict(title=movie_title, artist='greydanus', comment='atari-saliency-video')
    writer = FFMpegWriter(fps=8, metadata=metadata)
    f = plt.figure(figsize=[6, 6 * 1.3], dpi=resolution)

    # images
    seq_image = np.array(history['ins'][first_frame:first_frame + num_frames])

    # with writer.saving(f, save_dir + movie_title, resolution):
    #     for i in range(num_frames):
    #         print('i: ', i)
    #         frame = seq_image[i].copy()
    #         actor_saliency = score_frame(model, seq_image[i].copy(), density, num_frames, mode='actor')
    #         frame = saliency_on_atari_frame(actor_saliency, frame, num_frames, fudge_factor=meta['actor_ff'])
    #
    #         # draw the frame
    #         plt.imshow(frame)
    #         plt.gray()
    #         plt.title(env_name.lower(), fontsize=15)
    #         plt.show()
    #         writer.grab_frame()
    #         f.clear()
    #
    #         tstr = time.strftime("%Hh %Mm %Ss", time.gmtime(time.time() - start))
    #         print('\ttime: {} | progress: {:.1f}%'.format(tstr, 100 * i / min(num_frames, total_frames)), end='\r')

    print('\nfinished.')
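# The rollout() helper called by make_movie() is not shown in this snippet. A minimal sketch of
# what it is assumed to do (play one greedy episode and log the observations under the 'ins' key
# read above); the gym-style reset()/step() API and the dictionary keys are assumptions.
def rollout_sketch(model, env, max_ep_len=200):
    history = {'ins': [], 'actions': [], 'rewards': []}
    state, done, step = env.reset(), False, 0
    while not done and step < max_ep_len:
        history['ins'].append(state)
        with torch.no_grad():
            q_values = model(torch.from_numpy(np.asarray(state)).float().unsqueeze(0).to(device))
        action = int(q_values.argmax(dim=1).item())        # greedy action
        state, reward, done, _ = env.step(action)
        history['actions'].append(action)
        history['rewards'].append(reward)
        step += 1
    return history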
class DDQNPERAgent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, tor_dstate, srpt_pens, lrn_rate, hsize1, hsize2, seed=0):
        """Initialize a DDQN Agent object with PER (Prioritized Experience Replay) support.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            tor_dstate (float): tolerance for deciding whether two states are the same
            srpt_pens (array_like): penalty (negative reward) values for undesirable actions
            lrn_rate (float): learning rate for Q-Network training
            hsize1 (int): size of the first hidden layer of the Q-Network
            hsize2 (int): size of the second hidden layer of the Q-Network
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.tor_dstate = tor_dstate
        self.srpt_pens = srpt_pens
        self.lrn_rate = lrn_rate
        self.hsize1 = hsize1
        self.hsize2 = hsize2
        self.seed = seed
        if seed is not None:
            random.seed(seed)

        # Each penalty value adds a vector of action_size to signal which action causes the penalty.
        self.aug_state_size = state_size + len(srpt_pens) * action_size

        # Set up Q-Networks.
        self.qnetwork_local = QNetwork(self.aug_state_size, action_size, hsize1, hsize2, seed).to(device)
        self.qnetwork_local.initialize_weights()  # initialize network with random weights
        self.qnetwork_target = QNetwork(self.aug_state_size, action_size, hsize1, hsize2, seed=None).to(device)
        self.qnetwork_target.update_weights(self.qnetwork_local)  # copy network weights to target network
        self.qnetwork_target.eval()
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=lrn_rate)

        # Store the trained Q-model when the environment is solved.
        self.qnetwork_solved = None

        # Set up experience replay memory.
        self.ebuffer = ReplayBuffer(BUFFER_SIZE, BATCH_SIZE, seed)

        # Initialize interval steps.
        self.l_step = 0  # for learning every LEARN_EVERY time steps
        self.t_step = 0  # for updating target network every UPDATE_EVERY learnings

    def reset_epsisode(self, state, srpt_det=0):
        """Re-initialize buffers after environment reset for a new episode.

        Params
        ======
            state (array_like): initial state after environment reset
            srpt_det (int): number of repeated state types to be checked for post-processing
        """
        self.srpt_det = 0
        if len(self.srpt_pens) == 0:
            # State repeat detection for post-processing is active only when the state repeat penalty option is off.
            self.srpt_det = srpt_det
        else:
            # This is used to signal that self.step() hasn't been run yet.
            self.next_aug_state = None

        if len(self.srpt_pens) > 0 or self.srpt_det > 0:
            self.state_buffer = deque(maxlen=2)
            buffer_size = 2 * (max(len(self.srpt_pens), self.srpt_det) - 1)
            self.smsta_buffer = deque(maxlen=max(2, buffer_size))
            # The initial state will be pushed to the buffer again and be compared to this state in the process of
            # selecting the first action. So add 1 to the initial state here to ensure the states are different
            # enough for the first comparison.
            self.state_buffer.append(np.array(state) + 1)
            # Any position and orientation can be the initial simulated state here. It is like putting in a
            # coordinate system (origin and x-direction) for a 2-D plane, and all the other simulated states
            # in the episode will be specified based on this reference coordinate system.
            self.smsta_buffer.append((np.array([0, 0]), 0))

    def step(self, state, action, reward, next_state, done):
        """Update replay memory and parameters of Q-Network by training.

        Params
        ======
            state (array_like): starting state of the step
            action (int): action performed in the step
            reward (float): reward from the action
            next_state (array_like): resulting state of the action in the step
            done (bool): indicator for whether next_state is terminal (i.e., end of episode) or not
        """
        if len(self.srpt_pens) > 0:
            # Augment state vector and modify reward using state repeat penalty values.
            self.state_buffer.append(np.array(next_state))
            self.next_aug_state = self.augment_state(next_state)
            state = self.aug_state
            next_state = self.next_aug_state
            reward = self.modify_reward(reward, state, action)

        # Save experience in replay memory.
        self.ebuffer.add(state, action, reward, next_state, done)

        # Learn every LEARN_EVERY steps after memory reaches batch_size.
        if len(self.ebuffer.memory) >= self.ebuffer.batch_size:
            self.l_step += 1
            self.l_step %= LEARN_EVERY
            if self.l_step == 0:
                experiences, weights = self.ebuffer.sample()
                self.learn(experiences, weights, GAMMA)

    def augment_state(self, state):
        """Augment state vector to penalize undesirable actions.

        Params
        ======
            state (array_like): original state vector to be augmented

        Returns
        ======
            aug_state (numpy.ndarray): augmented state vector
        """
        # Each penalty value adds a vector of action_size to signal which action causes the penalty.
        aug_state = np.concatenate((state, np.zeros((len(self.srpt_pens) * self.action_size, ))))

        # Detect the situation where the two preceding observed states (not augmented) are essentially the
        # same, which indicates the agent is either stuck at a wall or in some kind of undesirable
        # blind spot. The next action to avoid (i.e., to be penalized) is the one that will keep the
        # agent stuck or in the blind spot.
        avoid_action = self.get_avoid_action()
        if avoid_action != ACT_INVALID:
            aug_state[self.state_size + avoid_action] = 1
        if avoid_action != ACT_INVALID or len(self.srpt_pens) == 1:
            return aug_state

        # If the agent is not stuck or in a blind spot and there are more penalty values, continue to check
        # state repeats separated by more than two actions. Assuming NUM_ORIS is even, states separated
        # by an odd number of actions won't repeat. So only even numbers of actions need to be checked.
        for action in range(self.action_size):
            nxt_sta = self.sim_step(action)
            for act_cnt in range(2, 2 * len(self.srpt_pens), 2):
                if self.is_state_repeated(act_cnt, nxt_sta):
                    # Signal the undesirable action.
                    aug_state[self.state_size + (act_cnt // 2) * self.action_size + action] = 1
                    break
        return aug_state

    def modify_reward(self, reward, aug_state, action):
        """Modify reward to penalize an undesirable action.

        Params
        ======
            reward (float): original reward
            aug_state (numpy.ndarray): augmented state vector
            action (int): action performed

        Returns
        ======
            reward (float): modified reward
        """
        # Penalize an undesirable action when it doesn't earn a reward or cause a penalty. If it earns a positive
        # reward or causes a more negative reward, leave the reward unchanged.
        if reward <= 0:
            for i, penalty in enumerate(self.srpt_pens):
                if aug_state[self.state_size + i * self.action_size + action] > 0:  # action is undesirable
                    reward = min(reward, penalty)
                    break
        return reward

    def sim_step(self, action):
        """Advance simulated state (position and orientation) for one step by the action.

        Params
        ======
            action (int): action to advance the simulated state

        Returns
        ======
            pos, ori (numpy.ndarray, int): resulting simulated state
        """
        # An action can either be a move or a turn (but not both), with the type of action (including non-actions)
        # identified by the action code.
        pos, ori = self.smsta_buffer[-1]
        act_code = ACT_CODES[action]
        pos = pos + act_code[0] * ORIVEC_TABLE[ori]
        ori = (ori + act_code[1]) % NUM_ORIS
        return pos, ori

    def is_state_repeated(self, act_cnt, nxt_sta):
        """Check whether the next state repeats the past state separated by the specified number of actions.

        Params
        ======
            act_cnt (int): number of actions separating the past state to be checked and the next state
            nxt_sta (numpy.ndarray, int): next state resulting from an action

        Returns
        ======
            repeated (bool): indicator for repeated state
        """
        repeated = False
        if act_cnt <= len(self.smsta_buffer):
            chk_sta = self.smsta_buffer[-act_cnt]  # past state to be checked
            if chk_sta[1] == nxt_sta[1]:
                if np.linalg.norm(nxt_sta[0] - chk_sta[0]) <= self.tor_dstate:
                    repeated = True
        return repeated

    def act(self, state, eps=0.0):
        """Select an action for the given state per the current epsilon-greedy policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for adjusting epsilon-greedy action selection

        Returns
        ======
            action (int): the chosen action
        """
        # If the agent is in testing mode, self.step() won't be invoked and some of the operations done there
        # need to be done here.
        if (len(self.srpt_pens) > 0 and self.next_aug_state is None) or self.srpt_det > 0:
            # Push the current state into the state buffer for comparison with the previous state if it is not
            # already pushed by self.step() in the agent training process.
            self.state_buffer.append(np.array(state))
        if len(self.srpt_pens) > 0:
            if self.next_aug_state is None:
                self.aug_state = self.augment_state(state)
            else:
                self.aug_state = self.next_aug_state
            state = self.aug_state

        if self.srpt_det == 0:  # no checking for repeated states (observed or simulated)
            # Randomly select an action.
            action = random.choice(np.arange(self.action_size))
            # Epsilon-greedy action selection.
            if random.random() >= eps:
                state = torch.from_numpy(state).float().to(device)
                self.qnetwork_local.eval()
                with torch.no_grad():
                    action = self.qnetwork_local(state).squeeze().argmax().cpu().item()
            if len(self.srpt_pens) > 0:
                # Update the simulated state buffer with the result of the chosen action.
                nxt_sta = self.sim_step(action)
                self.smsta_buffer.append(nxt_sta)
            return action

        # This is the implementation of the post-processing of the epsilon-greedy policy to avoid repeated states
        # within a short series of actions. This option is set in self.reset_epsisode() for each episode and is
        # only active when the option of penalizing undesirable actions, which is set for the class object, is
        # disabled (len(self.srpt_pens) == 0). To accommodate the post-processing of the selected actions, the
        # random policy is modified to randomly assign rankings to all the available actions.

        # Randomly assign rankings to action candidates.
        ranked_actions = np.random.permutation(self.action_size)
        # Epsilon-greedy action selection.
        if random.random() >= eps:
            state = torch.from_numpy(state).float().to(device)
            self.qnetwork_local.eval()
            with torch.no_grad():
                neg_act_qvals = -self.qnetwork_local(state).squeeze()
                ranked_actions = neg_act_qvals.argsort().cpu().numpy().astype(int)

        # Post-process ranked action candidates to remove undesirable actions.
        avoid_action = self.get_avoid_action()
        action = self.select_nosrpt_action(avoid_action, ranked_actions)
        return action

    def get_avoid_action(self):
        """Find the action to avoid, i.e., the one that would keep the agent stuck or in a blind spot.

        Returns
        ======
            avoid_action (int): next action to avoid
        """
        avoid_action = ACT_INVALID  # used to signal the agent is not stuck or in a blind spot
        if np.linalg.norm(self.state_buffer[1] - self.state_buffer[0]) <= self.tor_dstate:
            sim_sta0 = self.smsta_buffer[-2]
            sim_sta1 = self.smsta_buffer[-1]
            if sim_sta0[1] == sim_sta1[1]:  # action is not a turn, must be a move
                # Agent is stuck at a wall.
                dpos = sim_sta1[0] - sim_sta0[0]
                # dot(mcode*(cos, sin), (cos, sin)) = mcode
                mcode = np.around(np.dot(dpos, ORIVEC_TABLE[sim_sta0[1]])).astype(int)
                avoid_action = AVOID_MOVE_TABLE[mcode + 1]
                self.smsta_buffer.clear()           # it is reasonable to backtrack to get unstuck, except the last state which
                self.smsta_buffer.append(sim_sta0)  # the agent is stuck in (as the new reference, it can be any state)
            else:  # action is a turn
                # Agent is in a blind spot (turned, but observed the same state).
                tcode = sim_sta1[1] - sim_sta0[1]
                avoid_action = AVOID_TURN_TABLE[(tcode + 1) % NUM_ORIS]
                self.smsta_buffer.clear()           # it is reasonable to backtrack to get out of the blind
                self.smsta_buffer.append(sim_sta0)  # spot, except the last two states, which represent
                self.smsta_buffer.append(sim_sta1)  # the blind spot
        return avoid_action

    def select_nosrpt_action(self, avoid_action, ranked_actions):
        """Select an action that avoids repeated states (i.e., loops) within a short series of actions.

        Params
        ======
            avoid_action (int): action to avoid if the agent is stuck or in a blind spot
            ranked_actions (array_like): action candidates ranked by decreasing Q-values

        Returns
        ======
            action (int): the selected action
        """
        action = ranked_actions[0]
        if action == avoid_action:
            action = ranked_actions[1]
        nxt_sta = self.sim_step(action)

        # If a repeated observed state is detected (signaled by avoid_action != ACT_INVALID), the action selected
        # for avoiding the repeated state will be used, since it is more important to free an agent that is
        # stuck or in a blind spot than to go back further to check for repeated simulated states. So the checking
        # for repeated simulated states by 2 or more actions will only occur when avoid_action == ACT_INVALID.
        if avoid_action == ACT_INVALID and self.srpt_det > 1:
            act_heapq = []
            for action in ranked_actions:
                nxt_sta = self.sim_step(action)
                # Assuming NUM_ORIS is even, only even numbers of actions need to be checked.
                for act_cnt in range(2, 2 * self.srpt_det, 2):
                    if self.is_state_repeated(act_cnt, nxt_sta):
                        # Simulated state repeated, go check the next action.
                        heapq.heappush(act_heapq, [-act_cnt, action, nxt_sta])
                        break
                else:
                    # No repeated state detected, action is found.
                    break
            else:
                # No action can satisfy all the no-repeated-state conditions; select the action that repeats the
                # state separated by the most actions (i.e., a long loop is more acceptable than a short loop).
                action, nxt_sta = heapq.heappop(act_heapq)[1:]

        self.smsta_buffer.append(nxt_sta)  # update simulated state buffer with the result of the chosen action
        return action

    def learn(self, experiences, is_weights, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple (s, a, r, s', done) of batched experience data
            is_weights (torch.Tensor): importance sampling weights for the batched experiences
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Double DQN method for obtaining target Q-values.
        self.qnetwork_local.eval()
        with torch.no_grad():
            maxq_actions = self.qnetwork_local(next_states).max(1)[1].unsqueeze(1)
            qouts_next_states = self.qnetwork_target(next_states).gather(1, maxq_actions).squeeze()
            qouts_target = rewards + gamma * qouts_next_states * (1 - dones)

        # Obtain current Q-values and their difference from the target Q-values.
        self.qnetwork_local.train()
        qouts_states = self.qnetwork_local(states).gather(1, actions).squeeze()
        delta_qouts = qouts_states - qouts_target

        # Calculate the weighted sum of squared losses.
        wsqr_loss = is_weights * delta_qouts**2  # weighted squared loss
        loss_sum = wsqr_loss.sum()

        # Update model parameters by minimizing the loss sum.
        self.optimizer.zero_grad()
        loss_sum.backward()
        self.optimizer.step()

        # Update priorities of the replay memory.
        neg_prios = -torch.abs(delta_qouts.detach())
        self.ebuffer.update_priorities(neg_prios.cpu().numpy())

        # Update target network.
        self.t_step += 1
        self.t_step %= UPDATE_EVERY
        if self.t_step == 0:
            self.qnetwork_target.update_weights(self.qnetwork_local, TAU)

    def update_beta(self, beta):
        """Update importance sampling weights for the memory buffer with a new beta.

        Params
        ======
            beta (float): new beta value
        """
        if beta != self.ebuffer.beta:
            self.ebuffer.beta = beta
            if len(self.ebuffer.memory) >= self.ebuffer.batch_size:
                self.ebuffer.update_is_weights()

    def copy_solved_qnet(self):
        """Copy the current local Q-Network to the solved Q-Network while the local Q-Network continues training."""
        if self.qnetwork_solved is None:
            self.qnetwork_solved = QNetwork(self.aug_state_size, self.action_size,
                                            self.hsize1, self.hsize2, seed=None).to(device)
        self.qnetwork_solved.update_weights(self.qnetwork_local)  # copy local network weights to solved network

    def save_qnet(self, model_name):
        """Save Q-Network parameters into file.

        Params
        ======
            model_name (str): name of the Q-Network
        """
        # Save the CPU version since it can be used with or without a GPU.
        if self.qnetwork_solved is not None:
            torch.save(self.qnetwork_solved.cpu().state_dict(), model_name + '.pth')
            self.qnetwork_solved = self.qnetwork_solved.to(device)
        else:
            torch.save(self.qnetwork_local.cpu().state_dict(), model_name + '.pth')
            self.qnetwork_local = self.qnetwork_local.to(device)

    def load_qnet(self, model_name):
        """Load Q-Network parameters from file.

        Params
        ======
            model_name (str): name of the Q-Network
        """
        # The saved QNetwork is always the CPU version.
        qnetwork_loaded = QNetwork(self.aug_state_size, self.action_size,
                                   self.hsize1, self.hsize2, seed=None)
        qnetwork_loaded.load_state_dict(torch.load(model_name + '.pth'))
        self.qnetwork_local.update_weights(qnetwork_loaded.to(device))  # copy loaded network weights to local network
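# The ReplayBuffer used by DDQNPERAgent (ebuffer) is not shown here. A minimal sketch, on plain
# numpy arrays, of the proportional prioritized-replay arithmetic it is assumed to implement
# (Schaul et al., 2015): priorities p_i = (|delta_i| + eps)^alpha define the sampling distribution
# P(i), and importance-sampling weights w_i = (N * P(i))^(-beta) are normalized by their maximum
# before scaling the squared TD errors, as in learn() above. The agent hands the buffer -|delta|;
# that sign convention is internal to the buffer and is not reproduced here.
def per_weights_sketch(td_errors, alpha=0.6, beta=0.4, eps=1e-5):
    abs_delta = np.abs(td_errors) + eps
    priorities = abs_delta ** alpha
    probs = priorities / priorities.sum()        # sampling probabilities P(i)
    n = len(td_errors)
    is_weights = (n * probs) ** (-beta)          # importance-sampling correction
    return probs, is_weights / is_weights.max()  # normalize so the largest weight is 1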
class Agent():
    """Interacts with and learns from the environment."""

    def __init__(self, state_size, action_size, seed):
        """Initialize an Agent object.

        Params
        ======
            state_size (int): dimension of each state
            action_size (int): dimension of each action
            seed (int): random seed
        """
        self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed).to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed).to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get a random subset and learn
            if len(self.memory) > BATCH_SIZE:
                experiences = self.memory.sample()
                self.learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Variable]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones = experiences

        # Compute and minimize the loss
        criterion = torch.nn.MSELoss()
        # Move input and label tensors to the correct device
        self.qnetwork_local.to(device)
        self.qnetwork_target.to(device)
        inputs = next_states.to(device)

        # Select max predicted Q value for the next state using the target model
        with torch.no_grad():
            next_target = self.qnetwork_target(inputs)
            next_q_target = next_target.max(1)[0].unsqueeze(1)

        # Calculate Q targets
        target_q = rewards + (gamma * next_q_target * (1 - dones))

        # Use the local model to get the expected Q values
        expected_q = self.qnetwork_local(states).gather(1, actions)

        # Compute and minimize the loss
        loss = criterion(expected_q, target_q)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
class Agent:
    """Interacts with and learns from the environment."""

    def __init__(self, action_size, seed, state_size, visual):
        """Initialize an Agent object.

        Params
        ======
            action_size (int): dimension of each action
            seed (int): random seed
            state_size (int): dimension of each state. Note this can be None if visual is true
            visual (bool): whether to train the agent on visual pixels or vector observations
        """
        if not visual:
            self.state_size = state_size
        self.action_size = action_size
        self.seed = random.seed(seed)

        # Q-Network
        self.qnetwork_local = QNetwork(state_size, action_size, seed) if not visual else VisualQNetwork(action_size, seed)
        self.qnetwork_local = self.qnetwork_local.to(device)
        self.qnetwork_target = QNetwork(state_size, action_size, seed) if not visual else VisualQNetwork(action_size, seed)
        self.qnetwork_target = self.qnetwork_target.to(device)
        self.optimizer = optim.Adam(self.qnetwork_local.parameters(), lr=LR)
        self.beta_start = 0.4

        # Replay memory
        self.memory = ReplayBuffer(action_size, BUFFER_SIZE, BATCH_SIZE, seed, GAMMA)
        # Initialize time step (for updating every UPDATE_EVERY steps)
        self.t_step = 0
        self.batch_no = 0
        self.beta_batch_nos = 50_000

    def step(self, state, action, reward, next_state, done):
        # Save experience in replay memory
        self.memory.add(state, action, reward, next_state, done)

        # Learn every UPDATE_EVERY time steps.
        self.t_step = (self.t_step + 1) % UPDATE_EVERY
        if self.t_step == 0:
            # If enough samples are available in memory, get a random subset and learn
            if len(self.memory) > BATCH_SIZE:
                beta = min(1.0, self.beta_start + (self.batch_no / self.beta_batch_nos) * (1 - self.beta_start))
                self.batch_no += 1
                experiences = self.memory.sample(beta)
                self._learn(experiences, GAMMA)

    def act(self, state, eps=0.):
        """Returns actions for given state as per current policy.

        Params
        ======
            state (array_like): current state
            eps (float): epsilon, for epsilon-greedy action selection
        """
        state = torch.from_numpy(state).float().unsqueeze(0).to(device)
        self.qnetwork_local.eval()
        with torch.no_grad():
            action_values = self.qnetwork_local(state)
        self.qnetwork_local.train()

        # Epsilon-greedy action selection
        if random.random() > eps:
            return np.argmax(action_values.cpu().data.numpy())
        else:
            return random.choice(np.arange(self.action_size))

    def _learn(self, experiences, gamma):
        """Update value parameters using given batch of experience tuples.

        Params
        ======
            experiences (Tuple[torch.Tensor]): tuple of (s, a, r, s', done) tuples
            gamma (float): discount factor
        """
        states, actions, rewards, next_states, dones, sample_indices, weight_update_weights = experiences

        # Get max predicted Q values (for next states) from target model
        q_targets_next = self.qnetwork_target(next_states).detach().max(1)[0]
        # Compute Q targets for current states
        q_targets = rewards.squeeze(1) + (gamma * q_targets_next * (1 - dones.squeeze(1)))

        # Get expected Q values from local model
        q_expected = self.qnetwork_local(states).gather(1, actions).squeeze(1)

        # Compute loss
        loss = (q_expected - q_targets.detach()).pow(2) * weight_update_weights
        prios = loss + 1e-5
        loss = loss.mean()

        # Minimize the loss
        self.optimizer.zero_grad()
        loss.backward()
        self.memory.update_priorities(prios.data.cpu().numpy(), sample_indices)
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.qnetwork_local, self.qnetwork_target, TAU)

    def soft_update(self, local_model, target_model, tau):
        """Soft update model parameters.
        θ_target = τ*θ_local + (1 - τ)*θ_target

        Params
        ======
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data + (1.0 - tau) * target_param.data)
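# A small standalone check (not part of any agent above) of the soft update rule used throughout:
# θ_target ← τ*θ_local + (1 - τ)*θ_target. With tau = 1.0 it reduces to the hard copy performed by
# update_network()-style methods; with a small tau the target network slowly trails the local one.
def soft_update_sketch(tau=0.001):
    local = torch.nn.Linear(4, 2)
    target = torch.nn.Linear(4, 2)
    before = target.weight.data.clone()
    for t_param, l_param in zip(target.parameters(), local.parameters()):
        t_param.data.copy_(tau * l_param.data + (1.0 - tau) * t_param.data)
    # Each target weight has moved a fraction tau of the way toward the corresponding local weight.
    assert torch.allclose(target.weight.data, tau * local.weight.data + (1 - tau) * before)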