test_time = False
n_steps = 8
n_actions = env.action_space.n
img_height = 64
img_width = 64

policy_net = None
network_path = "target_net.pt"
if os.path.exists(network_path):
    policy_net = torch.load(network_path).to(device)
    print("successfully loaded existing network from file: " + network_path)
else:
    policy_net = DQN(img_height, img_width, n_actions).to(device)

target_net = DQN(img_height, img_width, n_actions).to(device)
target_net.load_state_dict(policy_net.state_dict())
target_net.eval()

optimizer = optim.RMSprop(policy_net.parameters(), lr=LR)
memory = ReplayMemory(10000)

steps_done = 0

logfile = "train_log.txt"
with open(logfile, "w+") as f:
    f.write("CS4803 MineRL Project Logs:\n")


def append_log(s):
    with open(logfile, "a") as f:
        f.write(s + "\n")


def state_from_obs(obs):
    # Get the camera image from the observation dict, scale it to [0, 1],
    # and convert it from (H, W, C) to the (C, H, W) layout PyTorch expects.
    img = torch.tensor(obs["pov"] / 255.0, dtype=torch.float32)
    img = img.permute(2, 0, 1)
    return img
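
# --- Illustrative sketch (assumed helper, not defined elsewhere in this file) ---
# The `steps_done` counter above is conventionally paired with an annealed
# epsilon-greedy policy. A minimal selector in that style might look like
# this; EPS_START, EPS_END, and EPS_DECAY are assumed hyperparameters.
import math
import random

EPS_START, EPS_END, EPS_DECAY = 0.9, 0.05, 2000  # assumed values


def select_action(state):
    global steps_done
    eps_threshold = EPS_END + (EPS_START - EPS_END) * math.exp(-steps_done / EPS_DECAY)
    steps_done += 1
    if random.random() > eps_threshold:
        with torch.no_grad():
            # Greedy action: index of the largest predicted Q-value.
            return policy_net(state).max(1)[1].view(1, 1)
    # Otherwise explore with a uniformly random action.
    return torch.tensor([[random.randrange(n_actions)]],
                        device=device, dtype=torch.long)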
class Agent:
    def __init__(
        self,
        state_size,
        action_size,
        n_agents,
        buffer_size: int = int(1e5),
        batch_size: int = 256,
        gamma: float = 0.995,
        tau: float = 1e-3,
        learning_rate: float = 7e-4,
        update_every: int = 4,
    ):
        """
        Initialize a DQN agent that uses a shared multi-agent experience buffer.

        Args:
            state_size (int): Size of the state observation returned by the environment
            action_size (int): Action space size
            n_agents (int): Number of agents in the environment
            buffer_size (int): Desired total experience buffer size
            batch_size (int): Mini-batch size
            gamma (float): Discount factor
            tau (float): Interpolation factor for the soft update of target parameters
            learning_rate (float): Learning rate
            update_every (int): Number of environment steps between learning updates
        """
        self.state_size = state_size
        self.action_size = action_size
        self.n_agents = n_agents

        # Q-networks
        self.policy_net = DQN(state_size, action_size).to(device)
        self.target_net = DQN(state_size, action_size).to(device)
        self.optimizer = optim.Adam(self.policy_net.parameters(), lr=learning_rate)
        self.memory = AgentReplayMemory(buffer_size, n_agents, state_size, device)

        self.t_step = 0
        self.update_every = update_every
        self.batch_size = batch_size
        self.gamma = gamma
        self.tau = tau

    def step(self, states, actions, rewards, next_states, dones):
        self.memory.push_agent_actions(states, actions, rewards, next_states, dones)
        # Learn every `update_every` steps, once the buffer holds enough samples.
        self.t_step = (self.t_step + 1) % self.update_every
        if self.t_step == 0:
            if self.memory.at_capacity():
                experience = self.memory.sample(self.batch_size)
                self.learn(experience, self.gamma)

    def act(self, states, eps=0):
        states = torch.from_numpy(states).float().to(device)
        self.policy_net.eval()
        with torch.no_grad():
            action_values = self.policy_net(states)
        self.policy_net.train()

        # Epsilon-greedy: per agent, take the greedy action with probability
        # 1 - eps, otherwise a uniformly random action.
        r = np.random.random(size=self.n_agents)
        greedy_actions = np.argmax(action_values.cpu().data.numpy(), axis=1)
        random_actions = np.random.randint(0, self.action_size, size=self.n_agents)
        return np.where(r > eps, greedy_actions, random_actions)

    def learn(self, experiences, gamma):
        states, actions, rewards, next_states, dones = experiences
        criterion = torch.nn.MSELoss()
        self.policy_net.train()
        self.target_net.eval()

        # The network output has shape (batch_size, action_dim); gather
        # selects the Q-value of the action actually taken in each state.
        predicted_targets = self.policy_net(states).gather(1, actions)

        with torch.no_grad():
            # Bootstrap from the target network's best next-state action value.
            labels_next = self.target_net(next_states).max(1)[0].unsqueeze(1)

        # y = r + gamma * max_a' Q_target(s', a'); final states contribute r only.
        labels = rewards + (gamma * labels_next * (1 - dones))

        loss = criterion(predicted_targets, labels)
        self.optimizer.zero_grad()
        loss.backward()
        self.optimizer.step()

        # ------------------- update target network ------------------- #
        self.soft_update(self.policy_net, self.target_net, self.tau)

    def soft_update(self, local_model, target_model, tau):
        """
        Soft-update model parameters: θ_target = τ*θ_local + (1 - τ)*θ_target.

        Args:
            local_model (PyTorch model): weights will be copied from
            target_model (PyTorch model): weights will be copied to
            tau (float): interpolation parameter
        """
        for target_param, local_param in zip(target_model.parameters(),
                                             local_model.parameters()):
            target_param.data.copy_(tau * local_param.data +
                                    (1 - tau) * target_param.data)
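
# --- Illustrative usage sketch (assumed environment API) ---
# One interaction episode with the Agent above. The `env` object and its
# reset/step return signature are assumptions for illustration only, as are
# the sizes passed to the constructor.
if __name__ == "__main__":
    agent = Agent(state_size=37, action_size=4, n_agents=1)
    eps = 0.1
    states = env.reset()                                 # (n_agents, state_size) array
    for _ in range(1000):
        actions = agent.act(states, eps)                 # epsilon-greedy batch of actions
        next_states, rewards, dones = env.step(actions)  # assumed return signature
        agent.step(states, actions, rewards, next_states, dones)
        states = next_states
        if np.any(dones):
            break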
class TrainNQL:
    def __init__(self, epi, cfg=dcfg, validation=False):
        torch.cuda.empty_cache()
        # cpu or cuda
        self.device = cfg.device
        self.state_dim = cfg.proc_frame_size  # state dimensionality: 84x84
        self.state_size = cfg.state_size      # number of stacked frames per state
        self.t_eps = cfg.t_eps
        self.minibatch_size = cfg.minibatch_size

        # Q-learning parameters
        self.discount = cfg.discount  # discount factor
        self.replay_memory = cfg.replay_memory
        self.bufferSize = cfg.bufferSize
        self.target_q = cfg.target_q
        self.validation = validation

        if validation:
            self.episode = epi
        else:
            self.episode = int(epi) - 1
        self.cfg = cfg

        modelGray = 'results/ep' + str(self.episode) + '/modelGray.net'
        modelDepth = 'results/ep' + str(self.episode) + '/modelDepth.net'
        tModelGray = 'results/ep' + str(self.episode) + '/tModelGray.net'
        tModelDepth = 'results/ep' + str(self.episode) + '/tModelDepth.net'

        def make_dqn():
            return DQN(noutputs=cfg.noutputs,
                       nfeats=cfg.nfeats,
                       nstates=cfg.nstates,
                       kernels=cfg.kernels,
                       strides=cfg.strides,
                       poolsize=cfg.poolsize).to(self.device)

        if os.path.exists(modelGray) and os.path.exists(modelDepth):
            print("Loading model")
            self.gray_policy_net = torch.load(modelGray).to(self.device)
            self.gray_target_net = torch.load(tModelGray).to(self.device)
            self.depth_policy_net = torch.load(modelDepth).to(self.device)
            self.depth_target_net = torch.load(tModelDepth).to(self.device)
        else:
            print("New model")
            self.gray_policy_net = make_dqn()
            self.gray_target_net = make_dqn()
            self.depth_policy_net = make_dqn()
            self.depth_target_net = make_dqn()

        # Every `target_q` episodes, sync the target networks with the
        # policy networks.
        if not validation and self.target_q and self.episode % self.target_q == 0:
            print("cloning")
            self.gray_target_net.load_state_dict(self.gray_policy_net.state_dict())
            self.depth_target_net.load_state_dict(self.depth_policy_net.state_dict())

        self.gray_target_net.eval()
        self.depth_target_net.eval()

        self.gray_optimizer = optim.RMSprop(self.gray_policy_net.parameters())
        self.depth_optimizer = optim.RMSprop(self.depth_policy_net.parameters())
        self.memory = ReplayMemory(self.replay_memory)

    def get_tensor_from_image(self, file):
        convert = T.Compose([
            T.ToPILImage(),
            T.Resize((self.state_dim, self.state_dim),
                     interpolation=Image.BILINEAR),
            T.ToTensor()
        ])
        screen = Image.open(file)
        screen = np.ascontiguousarray(screen, dtype=np.float32) / 255
        screen = torch.from_numpy(screen)
        screen = convert(screen).unsqueeze(0).to(self.device)
        return screen

    def get_data(self, episode, tsteps):
        # Collect the frame file paths for each time step.
        images = []
        depths = []
        dirname_rgb = 'dataset/RGB/ep' + str(episode)
        dirname_dep = 'dataset/Depth/ep' + str(episode)
        for step in range(tsteps):
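            # Each state is a stack of `state_size` consecutive frames; only
            # the file names are recorded here, and decoding into tensors is
            # deferred to train() to keep the replay buffer small.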
            proc_image = []
            proc_depth = []
            for i in range(self.state_size):
                grayfile = dirname_rgb + '/image_' + str(step + 1) + \
                    '_' + str(i + 1) + '.png'
                depthfile = dirname_dep + '/depth_' + str(step + 1) + \
                    '_' + str(i + 1) + '.png'
                proc_image.append(grayfile)
                proc_depth.append(depthfile)
            images.append(proc_image)
            depths.append(proc_depth)
        return images, depths

    def load_data(self):
        rewards = torch.load('files/reward_history.dat')
        actions = torch.load('files/action_history.dat')
        ep_rewards = torch.load('files/ep_rewards.dat')
        print("Loading images")

        # By default episodes are loaded in order; 'success_handshake' orders
        # them by handshake success rate instead.
        best_scores = range(len(actions))
        buffer_selection_mode = 'default'
        if buffer_selection_mode == 'success_handshake':
            eps_values = []
            for i in range(len(actions)):
                hspos = 0
                hsneg = 0
                for step in range(len(actions[i])):
                    if actions[i][step] == 3:  # handshake action
                        if rewards[i][step] > 0:
                            hspos += 1
                        elif rewards[i][step] == -0.1:
                            hsneg += 1
                # Guard against division by zero for episodes without handshakes.
                accuracy = hspos / max(hspos + hsneg, 1)
                eps_values.append(accuracy)
            best_scores = np.argsort(eps_values)

        for i in best_scores:
            print('Ep: ', i + 1)
            dirname_gray = 'dataset/RGB/ep' + str(i + 1)
            dirname_dep = 'dataset/Depth/ep' + str(i + 1)
            files = []
            if os.path.exists(dirname_gray):
                files = os.listdir(dirname_gray)

            # Count the image files, then derive the number of usable steps:
            # 8 frames are stored per step, and k is truncated to a multiple
            # of 4 and capped at bufferSize.
            k = 0
            for file in files:
                if re.match(r"image.*\.png", file):
                    k += 1
            k = int(k / 8)
            while k % 4 != 0:
                k -= 1
            k = min(k, self.bufferSize)
            print(k)

            images, depths = self.get_data(i + 1, k)
            print("Loading done")
            for step in range(k - 1):
                # Map raw rewards onto the configured reward scheme.
                reward = self.cfg.neutral_reward
                if rewards[i][step] >= 1:
                    reward = self.cfg.hs_success_reward
                elif rewards[i][step] < 0:
                    reward = self.cfg.hs_fail_reward
                reward = torch.tensor([reward], device=self.device)
                action = torch.tensor([[actions[i][step]]],
                                      device=self.device,
                                      dtype=torch.long)

                # Transitions store lists of file paths, not tensors.
                image = images[step]
                depth = depths[step]
                next_image = images[step + 1]
                next_depth = depths[step + 1]
                self.memory.push(image, depth, action,
                                 next_image, next_depth, reward)

    def train(self):
        if len(self.memory) < self.minibatch_size:
            return
        for i in range(0, len(self.memory), self.minibatch_size):
            transitions = self.memory.pull(self.minibatch_size)
            print('Batch train: ' + str(int(i / self.minibatch_size) + 1) +
                  "/" + str(int(len(self.memory) / self.minibatch_size) + 1))

            # Decode each transition's frame paths into stacked tensors.
            aux_transitions = []
            for t in transitions:
                proc_sgray = torch.Tensor(self.state_size, self.state_dim,
                                          self.state_dim).to(self.device)
                proc_sdepth = torch.Tensor(self.state_size, self.state_dim,
                                           self.state_dim).to(self.device)
                proc_next_sgray = torch.Tensor(self.state_size, self.state_dim,
                                               self.state_dim).to(self.device)
                proc_next_sdepth = torch.Tensor(self.state_size, self.state_dim,
                                                self.state_dim).to(self.device)
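                # Note: the replay memory stores image *file paths* (pushed in
                # load_data), so frames are decoded here on the fly. This keeps
                # the buffer's memory footprint small at the cost of per-batch
                # image I/O.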
                count = 0
                for sgray, sdepth, next_sgray, next_sdepth in zip(
                        t.sgray, t.sdepth, t.next_sgray, t.next_sdepth):
                    proc_sgray[count] = self.get_tensor_from_image(sgray)
                    proc_sdepth[count] = self.get_tensor_from_image(sdepth)
                    proc_next_sgray[count] = self.get_tensor_from_image(next_sgray)
                    proc_next_sdepth[count] = self.get_tensor_from_image(next_sdepth)
                    count += 1

                proc_sgray = proc_sgray.unsqueeze(0).to(self.device)
                proc_sdepth = proc_sdepth.unsqueeze(0).to(self.device)
                proc_next_sgray = proc_next_sgray.unsqueeze(0).to(self.device)
                proc_next_sdepth = proc_next_sdepth.unsqueeze(0).to(self.device)

                # ('sgray', 'sdepth', 'action', 'next_sgray', 'next_sdepth', 'reward')
                one_transition = Transition(proc_sgray, proc_sdepth, t.action,
                                            proc_next_sgray, proc_next_sdepth,
                                            t.reward)
                aux_transitions.append(one_transition)
            transitions = aux_transitions

            # Transpose the batch (see https://stackoverflow.com/a/19343/3343043
            # for a detailed explanation). This converts a batch-array of
            # Transitions into a Transition of batch-arrays.
            batch = Transition(*zip(*transitions))

            # Compute a mask of non-final states and concatenate the batch
            # elements (a final state would have been the one after which the
            # simulation ended).
            gray_non_final_mask = torch.tensor(
                tuple(map(lambda s: s is not None, batch.next_sgray)),
                device=self.device, dtype=torch.bool)
            gray_non_final_next_states = torch.cat(
                [s for s in batch.next_sgray if s is not None])
            depth_non_final_mask = torch.tensor(
                tuple(map(lambda s: s is not None, batch.next_sdepth)),
                device=self.device, dtype=torch.bool)
            depth_non_final_next_states = torch.cat(
                [s for s in batch.next_sdepth if s is not None])

            sgray_batch = torch.cat(batch.sgray)
            sdepth_batch = torch.cat(batch.sdepth)
            action_batch = torch.cat(batch.action)
            reward_batch = torch.cat(batch.reward)

            # Compute Q(s_t, a): each model computes Q(s_t), and gather selects
            # the columns of the actions that were actually taken.
            sgray_action_values = self.gray_policy_net(sgray_batch).gather(
                1, action_batch)
            sdepth_action_values = self.depth_policy_net(sdepth_batch).gather(
                1, action_batch)

            # Compute V(s_{t+1}) for all next states: expected values for
            # non-final next states come from the "older" target net (best
            # value via max(1)[0]), merged by the mask so that final states
            # get a state value of 0.
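            # In equation form, the per-modality target computed below is
            #   y = r + discount * max_a' Q_target(s', a')  if s' is non-final
            #   y = r                                       otherwise,
            # the standard DQN bootstrap target.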
            batch_size = sgray_batch.size(0)  # may be < minibatch_size on the last pull
            next_sgray_values = torch.zeros(batch_size, device=self.device)
            next_sgray_values[gray_non_final_mask] = self.gray_target_net(
                gray_non_final_next_states).max(1)[0].detach()
            next_sdepth_values = torch.zeros(batch_size, device=self.device)
            next_sdepth_values[depth_non_final_mask] = self.depth_target_net(
                depth_non_final_next_states).max(1)[0].detach()

            # Compute the expected Q values
            expected_sgray_action_values = (next_sgray_values *
                                            self.discount) + reward_batch
            expected_sdepth_action_values = (next_sdepth_values *
                                             self.discount) + reward_batch

            # Compute Huber loss
            gray_loss = F.smooth_l1_loss(
                sgray_action_values, expected_sgray_action_values.unsqueeze(1))
            depth_loss = F.smooth_l1_loss(
                sdepth_action_values, expected_sdepth_action_values.unsqueeze(1))

            # Optimize the grayscale model, clipping gradients to [-1, 1]
            self.gray_optimizer.zero_grad()
            gray_loss.backward()
            for param in self.gray_policy_net.parameters():
                param.grad.data.clamp_(-1, 1)
            self.gray_optimizer.step()

            # Optimize the depth model, clipping gradients to [-1, 1]
            self.depth_optimizer.zero_grad()
            depth_loss.backward()
            for param in self.depth_policy_net.parameters():
                param.grad.data.clamp_(-1, 1)
            self.depth_optimizer.step()
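
# --- Illustrative sketch (assumed API, not the project's actual code) ---
# Neither Transition nor ReplayMemory is defined in this file. Based on the
# push(...) call in load_data and the pull(batch_size) call in train, a
# minimal compatible implementation might look like this; `pull` is assumed
# to return sequential mini-batches so that one pass over the loop in
# train() covers the whole buffer.
from collections import namedtuple

Transition = namedtuple(
    'Transition',
    ('sgray', 'sdepth', 'action', 'next_sgray', 'next_sdepth', 'reward'))


class ReplayMemory:
    def __init__(self, capacity):
        self.capacity = capacity
        self.memory = []
        self.position = 0  # ring-buffer write index
        self.cursor = 0    # sequential read index for pull()

    def push(self, *args):
        # Overwrite the oldest entry once capacity is reached.
        if len(self.memory) < self.capacity:
            self.memory.append(None)
        self.memory[self.position] = Transition(*args)
        self.position = (self.position + 1) % self.capacity

    def pull(self, batch_size):
        # Return the next sequential chunk, wrapping around at the end.
        if self.cursor >= len(self.memory):
            self.cursor = 0
        chunk = self.memory[self.cursor:self.cursor + batch_size]
        self.cursor += batch_size
        return chunk

    def __len__(self):
        return len(self.memory)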