class Testor:
    def __init__(self, model_dict, idx, num_channels=3, num_actions=19):
        import gym
        import minerl

        self.testor_idx = idx
        self.env = gym.make(ENV_NAME)
        self.port_number = 12340 + self.testor_idx
        print("testor environment %d initialized successfully" % self.testor_idx)
        self.env.make_interactive(port=self.port_number, realtime=False)

        # Evaluation copy of the learner network, loaded from the supplied state dict.
        self.testor_network = DQN(num_channels, num_actions).cuda()
        self.testor_network.load_state_dict(model_dict)
        print("testor network %d initialized successfully" % self.testor_idx)

        self.writer = SummaryWriter(f'runs/apex/test/testor{self.testor_idx}')
        self.max_epi = 100

    def explore(self):
        for num_epi in range(self.max_epi):
            obs = self.env.reset()
            state = converter(ENV_NAME, obs).cuda()
            state = state.float()
            done = False
            total_reward = 0
            steps = 0
            total_steps = 0
            while not done:
                steps += 1
                total_steps += 1
                # Greedy evaluation: always pick the argmax action.
                action_tensor = self.testor_network.forward(state)
                print(action_tensor)
                action_index = torch.argmax(action_tensor).item()
                print(action_index)
                action = make_19action(self.env, action_index)
                # print(action)
                obs_prime, reward, done, info = self.env.step(action)
                total_reward += reward
                state_prime = converter(ENV_NAME, obs_prime).cuda()
                state = state_prime
                if done:
                    print("%d episode is done" % num_epi)
                    print("total rewards : %d " % total_reward)
                    self.writer.add_scalar('Rewards/test', total_reward, num_epi)
                    break
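# Usage sketch for Testor (assumption-based): the checkpoint path and worker index
# below are placeholders, and ENV_NAME / DQN / converter / make_19action are expected
# to be defined at module level, as the class above assumes.
def run_evaluation(checkpoint_path="checkpoints/learner_latest.pt", idx=0):
    state_dict = torch.load(checkpoint_path)   # hypothetical checkpoint saved by the learner
    testor = Testor(model_dict=state_dict, idx=idx)
    testor.explore()                           # runs self.max_epi greedy episodes, logs to TensorBoard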
class DQNAgent:
    def __init__(self, config: Config):
        self.config = config
        self.is_training = True
        self.buffer = ReplayBuffer(self.config.max_buff)
        self.model = DQN(self.config.state_dim, self.config.action_dim).cuda()
        self.model_optim = Adam(self.model.parameters(), lr=self.config.learning_rate)
        if self.config.use_cuda:
            self.cuda()

    def act(self, state, epsilon=None):
        if epsilon is None:
            epsilon = self.config.epsilon_min
        # Epsilon-greedy: act greedily with probability 1 - epsilon (always greedy at test time).
        if random.random() > epsilon or not self.is_training:
            state = torch.tensor(state, dtype=torch.float).unsqueeze(0)
            if self.config.use_cuda:
                state = state.cuda()
            q_value = self.model.forward(state)
            action = q_value.max(1)[1].item()
        else:
            action = random.randrange(self.config.action_dim)
        return action

    def learning(self, fr):
        s0, a, r, s1, done = self.buffer.sample(self.config.batch_size)

        s0 = torch.tensor(s0, dtype=torch.float)
        s1 = torch.tensor(s1, dtype=torch.float)
        a = torch.tensor(a, dtype=torch.long)
        r = torch.tensor(r, dtype=torch.float)
        done = torch.tensor(done, dtype=torch.float)

        if self.config.use_cuda:
            s0 = s0.cuda()
            s1 = s1.cuda()
            a = a.cuda()
            r = r.cuda()
            done = done.cuda()

        q_values = self.model(s0).cuda()
        next_q_values = self.model(s1).cuda()
        next_q_value = next_q_values.max(1)[0]

        q_value = q_values.gather(1, a.unsqueeze(1)).squeeze(1)
        expected_q_value = r + self.config.gamma * next_q_value * (1 - done)
        # Detach the target so gradients do not flow through expected_q_value.
        loss = (q_value - expected_q_value.detach()).pow(2).mean()

        self.model_optim.zero_grad()
        loss.backward()
        self.model_optim.step()

        return loss.item()

    def cuda(self):
        self.model.cuda()

    def load_weights(self, model_path):
        if model_path is None:
            return
        self.model.load_state_dict(torch.load(model_path))

    def save_model(self, output, tag=''):
        torch.save(self.model.state_dict(), '%s/model_%s.pkl' % (output, tag))

    def save_config(self, output):
        with open(output + '/config.txt', 'w') as f:
            attr_val = get_class_attr_val(self.config)
            for k, v in attr_val.items():
                f.write(str(k) + " = " + str(v) + "\n")
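# Minimal training-loop sketch for DQNAgent (an assumption, not code from the agent
# itself): `env` is any Gym-style environment matching config.state_dim,
# `epsilon_by_frame` is a hypothetical exploration schedule, and the ReplayBuffer is
# assumed to expose add() and size(); adjust to the project's real APIs.
def train(agent: DQNAgent, env, config: Config, epsilon_by_frame):
    state = env.reset()
    for fr in range(config.frames):                      # config.frames assumed
        action = agent.act(state, epsilon_by_frame(fr))
        next_state, reward, done, _ = env.step(action)
        agent.buffer.add(state, action, reward, next_state, done)
        state = next_state if not done else env.reset()
        if agent.buffer.size() > config.batch_size:
            agent.learning(fr)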
# saving training variables
outliers = []
centroids = []
G = []
episode_rewards = []
mean_reward_episodes_list = []
best_reward_episodes_list = []
episode_rewards_list = []

for t in range(MAX_FRAMES):
    # Stack the current 4-frame observation with the goal channel: (1, 5, 84, 84).
    x = np.concatenate((s, g), axis=0).reshape((1, 5, 84, 84))
    if t < LEARNING_STARTS:
        a = env.action_space.sample()
    else:
        qt = Qt.forward(torch.Tensor(x).type(dtype) / 255)
        a = epsilon_greedy(qt.cpu().detach().numpy(), epsilon=epsilon)  # epsilon-greedy action

    SP, r, terminal, step_info = step(a)
    episode_rewards.append(r)
    sp = four_frames_to_4_84_84(SP)
    xp = np.concatenate((sp, g), axis=0).reshape((1, 5, 84, 84))
    man_mask = get_man_mask(SP)
    man_loc = get_man_xy_np_coordinate(man_mask)
    # intrinsic_done_task = are_masks_align(man_mask, subgoal_mask)
    intrinsic_done_task = is_man_inside_subgoal_mask(man_mask, subgoal_mask)

    # outlier
    if r > 0:
        print('Outlier detected at', man_loc)
        outliers.append(man_loc)
        R += r
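# The loop above calls an `epsilon_greedy` helper that is not shown here; a minimal
# sketch of what it could look like (an assumption, the project's real helper may
# differ): take a random action with probability epsilon, otherwise the greedy argmax.
def epsilon_greedy(q_values, epsilon=0.1):
    # q_values: numpy array of shape (1, num_actions) or (num_actions,)
    if np.random.rand() < epsilon:
        return np.random.randint(q_values.shape[-1])   # explore
    return int(np.argmax(q_values))                    # exploit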
class DDQNAgent:
    def __init__(self, config: Config, training=True):
        self.config = config
        self.is_training = training
        self.buffer = ReplayBuffer(self.config.max_buff)

        self.model = DQN(self.config.state_shape, self.config.action_dim)
        self.target_model = DQN(self.config.state_shape, self.config.action_dim)
        self.target_model.load_state_dict(self.model.state_dict())
        self.optim = Adam(self.model.parameters(), lr=self.config.learning_rate)

        self.model.cuda()
        self.target_model.cuda()

    def act(self, state, epsilon=None):
        if epsilon is None:
            epsilon = self.config.epsilon_min
        if random.random() > epsilon or not self.is_training:
            state = torch.tensor(state, dtype=torch.float).unsqueeze(0)
            state = state.cuda()
            q_value = self.model.forward(state)
            action = q_value.max(1)[1].item()
        else:
            action = random.randrange(self.config.action_dim)
        return action

    def learn(self, t):
        s, a, r, s2, done = self.buffer.sample(self.config.batch_size)

        s = torch.tensor(s, dtype=torch.float)
        a = torch.tensor(a, dtype=torch.long)
        r = torch.tensor(r, dtype=torch.float)
        s2 = torch.tensor(s2, dtype=torch.float)
        done = torch.tensor(done, dtype=torch.float)

        s = s.cuda()
        a = a.cuda()
        r = r.cuda()
        s2 = s2.cuda()
        done = done.cuda()

        q_values = self.model(s).cuda()
        next_q_values = self.model(s2).cuda()
        next_q_state_values = self.target_model(s2).cuda()

        q_value = q_values.gather(1, a.unsqueeze(1)).squeeze(1)
        # Double DQN: the online network selects the next action, the target network evaluates it.
        next_q_value = next_q_state_values.gather(
            1, next_q_values.max(1)[1].unsqueeze(1)).squeeze(1)
        expected_q_value = r + self.config.gamma * next_q_value * (1 - done)

        loss = (q_value - expected_q_value.detach()).pow(2).mean()

        self.optim.zero_grad()
        loss.backward()
        self.optim.step()

        if t % self.config.update_interval == 0:
            self.target_model.load_state_dict(self.model.state_dict())

        return loss.item()

    def load_weights(self, model_path):
        model = torch.load(model_path)
        if 'model' in model:
            self.model.load_state_dict(model['model'])
        else:
            self.model.load_state_dict(model)

    def save_checkpoint(self):
        os.makedirs('ckpt', exist_ok=True)
        torch.save(self.model.state_dict(), 'ckpt/model.pt')

    def load_checkpoint(self):
        # load_state_dict expects a state dict, not a path, so load the file first.
        state_dict = torch.load('ckpt/model.pt')
        self.model.load_state_dict(state_dict)
        self.target_model.load_state_dict(state_dict)
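# For reference, the two bootstrap targets side by side (illustrative sketch; the
# arguments mirror the tensors already built in DDQNAgent.learn, nothing new is assumed):
def dqn_vs_double_dqn_targets(model, target_model, s2, r, done, gamma):
    # Vanilla DQN: the target network both selects and evaluates the next action,
    # which tends to overestimate Q-values.
    dqn_target = r + gamma * target_model(s2).max(1)[0] * (1 - done)
    # Double DQN: the online network selects the action, the target network evaluates it.
    best_action = model(s2).max(1)[1]
    ddqn_target = r + gamma * target_model(s2).gather(1, best_action.unsqueeze(1)).squeeze(1) * (1 - done)
    return dqn_target, ddqn_target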
class Algorithm():
    def __init__(self, lr, gamma, act_dim, state_dim, memory_capacity,
                 epsilon, batch_size):
        self.model = DQN(state_dim, act_dim)
        self.state_dim = state_dim
        self.act_dim = act_dim
        self.lr = lr
        self.gamma = gamma
        self.epsilon = epsilon  # probability of acting greedily (not of exploring)
        self.target_model = copy.deepcopy(self.model)
        self.optimizer = optim.Adam(self.model.parameters(), lr=self.lr)
        self.loss = nn.MSELoss()
        self.memory_capacity = memory_capacity
        # Each row stores: state | action | reward | next_state | done.
        self.replay_buffer = np.zeros((memory_capacity, 2 * state_dim + 3))
        self.memory_counter = 0
        self.batch_size = batch_size

    def sync_target(self):
        self.target_model.load_state_dict(self.model.state_dict())

    def predict(self, obs):
        return self.model.forward(obs)

    def choose_action(self, state):
        state = torch.unsqueeze(torch.Tensor(state), 0)
        if np.random.rand() <= self.epsilon:
            # Greedy branch: epsilon is the greedy probability here.
            action_value = self.model.forward(state)
            action = torch.max(action_value, dim=1)[1].numpy()[0]
        else:
            action = np.random.randint(0, self.act_dim)
        return action

    def store_transition(self, state, action, reward, next_state, done):
        transition = np.hstack((state, [action, reward], next_state, done))
        index = self.memory_counter % self.memory_capacity
        self.replay_buffer[index, :] = transition
        self.memory_counter += 1

    def learn(self):
        sample_index = np.random.choice(self.memory_capacity, self.batch_size)
        batch_memory = self.replay_buffer[sample_index, :]
        batch_state = torch.FloatTensor(batch_memory[:, :self.state_dim])
        batch_action = torch.LongTensor(
            batch_memory[:, self.state_dim:self.state_dim + 1].astype(int))
        batch_reward = torch.FloatTensor(
            batch_memory[:, self.state_dim + 1:self.state_dim + 2])
        batch_next_state = torch.FloatTensor(
            batch_memory[:, self.state_dim + 2:2 * self.state_dim + 2])
        batch_done = torch.FloatTensor(batch_memory[:, -1:])

        next_value = self.target_model.forward(batch_next_state)
        # Detach the bootstrapped value so no gradient flows into the target network.
        max_value = torch.max(next_value, dim=1)[0].detach()
        target = batch_reward.squeeze() + self.gamma * (
            1 - batch_done).squeeze() * max_value
        q_value = self.model.forward(batch_state)
        behavior = torch.gather(q_value, dim=1, index=batch_action).squeeze()

        self.optimizer.zero_grad()
        output = self.loss(behavior, target)
        output.backward()
        self.optimizer.step()
        return output
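# Usage sketch for Algorithm (assumptions: a Gym-style `env` with a flat observation
# of length state_dim; the hyperparameters and the sync interval below are illustrative
# values, not ones taken from the class):
def run_training(env, episodes=200):
    algo = Algorithm(lr=1e-3, gamma=0.99, act_dim=env.action_space.n,
                     state_dim=env.observation_space.shape[0],
                     memory_capacity=10000, epsilon=0.9, batch_size=32)
    for ep in range(episodes):
        state = env.reset()
        done = False
        while not done:
            action = algo.choose_action(state)
            next_state, reward, done, _ = env.step(action)
            algo.store_transition(state, action, reward, next_state, float(done))
            state = next_state
            if algo.memory_counter > algo.memory_capacity:   # learn only once the buffer has filled
                algo.learn()
        if ep % 10 == 0:
            algo.sync_target()   # periodically copy online weights into the target network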