for global_step in range(args.total_timesteps): # ALGO LOGIC: put action logic here epsilon = linear_schedule(args.start_e, args.end_e, args.exploration_fraction * args.total_timesteps, global_step) obs = np.array(obs) env.render() # action, logits, _, = sampler.sample(q_network, obs, device, n, epsilon) logits = q_network.forward(obs.reshape((1, ) + obs.shape), device) if random.random() < epsilon: action = env.action_space.sample() else: action = torch.argmax(logits, dim=1).tolist()[0] # EXPERIMhENTAL PLEASE FIX SOON # TRY NOT TO MODIFY: execute the game and log data. next_obs, reward, done, info = env.step(action) episode_reward += reward # TRY NOT TO MODIFY: record rewards for plotting purposes # ALGO LOGIC: training. # when storing n, we want to keep its computational graph # other way of doing it, store: # init_obs, action, reward, (subsequent_obs), done # prob - levy sampling is stocastic # alternatively, keep the tensor n, keep graph when going back, # but do garbage collection rb.put((obs, action, reward, next_obs, done)) if global_step > args.learning_starts and global_step % args.train_frequency == 0: s_obs, s_actions, s_rewards, s_next_obses, s_dones = rb.sample(
def main(args):
    """Train a DQN agent on ``args.env`` and record a few evaluation episodes.

    ``args`` is expected to provide: env, base_path, lr, memory, max_steps,
    gamma, train_every, update_every, batch_size.
    """
    env = gym.make(args.env)
    if 'MiniGrid' in args.env:
        # MiniGrid observations are dicts; keep only the image array.
        env = ImgObsWrapper(env)
    path = args.base_path + args.env
    os.makedirs(path, exist_ok=True)
    # obs_shape = np.prod(env.observation_space.shape).astype(int)
    obs_shape = env.observation_space.shape
    act_shape = env.action_space.n

    q = QNetwork(obs_shape, act_shape)         # online network (trained)
    q_target = QNetwork(obs_shape, act_shape)  # target network (periodically synced)
    opt = optim.Adam(lr=args.lr, params=q.parameters())
    memory = Memory(capacity=args.memory)
    # Epsilon is annealed linearly over the first 10% of training steps.
    scheduler = LinearSchedule(schedule_timesteps=int(args.max_steps * 0.1),
                               final_p=0.01)
    avg_rw = deque(maxlen=40)   # recent episode returns, for logging
    avg_len = deque(maxlen=40)  # recent episode lengths, for logging

    def get_action(s, t):
        """Epsilon-greedy action for state ``s``; also advances the schedule."""
        s = torch.Tensor(s[None, :])
        _q = q(s)
        if np.random.sample() > scheduler.value:
            best_action = np.argmax(_q.detach(), axis=-1).item()
        else:
            best_action = np.random.randint(0, act_shape)
        scheduler.update(t)
        return best_action

    def train(batch):
        """One TD(0) update of the online network on a sampled minibatch."""
        batch = Transition(*zip(*batch))
        s = torch.Tensor(batch.state)
        a = torch.Tensor(one_hot(np.array(batch.action), num_classes=act_shape))
        r = torch.Tensor(batch.reward)
        d = torch.Tensor(batch.done)
        s1 = torch.Tensor(batch.next_state)
        # Q(s, a) for the actions actually taken (one-hot mask + sum).
        value = (q(s) * a).sum(dim=-1)
        # TD target. Computed under no_grad: the target must be a constant.
        # The original differentiated through q_target, building a useless
        # graph and accumulating stale gradients in q_target's parameters
        # (never zeroed, since `opt` only manages q's parameters).
        with torch.no_grad():
            bootstrap = torch.max(q_target(s1), dim=-1)[0]
        next_value = r + args.gamma * (1. - d) * bootstrap
        loss = (.5 * (next_value - value) ** 2).mean()
        opt.zero_grad()
        loss.backward()
        opt.step()

    state = env.reset()
    q_target.load_state_dict(q.state_dict())
    ep_rw = 0
    ep_len = 0
    ep = 0
    for t in range(args.max_steps):
        action = get_action(state, t)
        next_state, reward, done, _ = env.step(action)
        # NOTE(review): argument order must match the Transition namedtuple's
        # field order (state, action, next_state, reward, done) — confirm
        # against the Transition definition elsewhere in the project.
        memory.push(state, action, next_state, reward, done)
        ep_rw += reward
        ep_len += 1
        state = next_state.copy()
        if done:
            ep += 1
            avg_rw.append(ep_rw)
            avg_len.append(ep_len)
            ep_rw = 0
            ep_len = 0
            state = env.reset()
        if t % args.train_every == 0 and len(memory) > args.batch_size:
            batch = memory.sample(batch_size=args.batch_size)
            train(batch)
        if t % args.update_every == 0:
            # Sync the target network and log rolling statistics.
            q_target.load_state_dict(q.state_dict())
            print(f't:{t}\tep:{ep}\tavg_rw:{np.mean(avg_rw)}\tavg_len:{np.mean(avg_len)}\teps:{scheduler.value}')

    # Record a few evaluation episodes with the final (mostly greedy) policy.
    env = Monitor(env, directory=path)
    for ep in range(4):
        s = env.reset()
        while True:
            a = get_action(s, t=0)
            s1, r, d, _ = env.step(a)
            s = s1.copy()
            if d:
                break
class GridEnvironment(Environment):
    """Worker that runs a MiniGrid env in a child process, exchanging
    actions and results with the parent over a pipe (``child_conn``).

    Observations are rendered as RGB pixels, converted to grayscale,
    resized to (h, w), and kept in a ``history_size``-frame buffer.
    """

    def __init__(self,
                 env_id,
                 is_render,
                 env_idx,
                 child_conn,
                 history_size=1,
                 h=84,
                 w=84,
                 life_done=True,
                 sticky_action=False,
                 p=0.25):
        super(GridEnvironment, self).__init__()
        self.daemon = True  # don't block parent-process shutdown
        # Reseed -> deterministic layout; RGBImgObs -> pixel observations;
        # ImgObs -> drop the mission string, keep only the image.
        self.env = ImgObsWrapper(
            RGBImgObsWrapper(ReseedWrapper(gym.make(env_id))))
        self.env_id = env_id
        self.is_render = is_render
        self.env_idx = env_idx
        self.steps = 0    # steps taken in the current episode
        self.episode = 0  # episodes completed so far
        self.rall = 0     # cumulative reward of the current episode
        self.recent_rlist = deque(maxlen=100)  # rolling returns, for logging
        self.child_conn = child_conn
        self.sticky_action = sticky_action
        self.last_action = 0
        self.p = p  # probability of repeating the previous action
        self.history_size = history_size
        self.history = np.zeros([history_size, h, w])
        self.h = h
        self.w = w
        self.reset()

    def run(self):
        """Child-process loop: receive an action, step the env, send back
        ``[history, reward, force_done, done, log_reward, [return, steps]]``.
        """
        super(GridEnvironment, self).run()
        while True:
            action = self.child_conn.recv()

            # Sticky actions: with probability p repeat the previous action.
            if self.sticky_action:
                if np.random.rand() <= self.p:
                    action = self.last_action
                self.last_action = action

            s, reward, done, info = self.env.step(action)

            # NOTE(review): `max_step_per_episode` is a module-level global
            # defined elsewhere in this file — presumably the episode time
            # limit; confirm it is set before any worker starts.
            if max_step_per_episode < self.steps:
                done = True

            log_reward = reward
            force_done = done

            # Newest frame goes into slot 0 of the history stack.
            self.history[0, :, :] = self.pre_proc(s)
            self.rall += reward
            self.steps += 1

            if done:
                self.recent_rlist.append(self.rall)
                print(
                    "[Episode {}({})] Step: {} Reward: {} Recent Reward: {} Visited Room: [{}]"
                    .format(self.episode, self.env_idx, self.steps, self.rall,
                            np.mean(self.recent_rlist),
                            info.get('episode', {}).get('visited_rooms', {})))
                self.history = self.reset()

            self.child_conn.send([
                self.history[:, :, :], reward, force_done, done, log_reward,
                [self.rall, self.steps]
            ])

    def reset(self):
        """Reset the wrapped env, refill the frame history, and return it."""
        self.last_action = 0
        self.steps = 0
        self.episode += 1
        self.rall = 0
        s = self.env.reset()
        # Pass the raw frame: get_init_state() applies pre_proc() itself.
        # (Previously the frame was pre-processed twice, re-quantizing the
        # already-grayscale float frame through an extra uint8 round trip,
        # inconsistent with the single pre_proc used in run().)
        self.get_init_state(s)
        return self.history[:, :, :]

    def pre_proc(self, X):
        """Convert an RGB frame to a grayscale float32 (h, w) image."""
        X = np.array(Image.fromarray(X).convert('L')).astype('float32')
        # NOTE(review): cv2.resize takes dsize as (width, height); passing
        # (self.h, self.w) is only correct while h == w (both 84 here) —
        # confirm if they ever differ.
        x = cv2.resize(X, (self.h, self.w))
        return x

    def get_init_state(self, s):
        """Fill every history slot with the pre-processed frame ``s``."""
        frame = self.pre_proc(s)  # hoisted: same frame for every slot
        for i in range(self.history_size):
            self.history[i, :, :] = frame