def test_frame_stack(env_id, num_stack, lz4_compress):
    env = gym.make(env_id)
    shape = env.observation_space.shape
    env = FrameStack(env, num_stack, lz4_compress)
    assert env.observation_space.shape == (num_stack,) + shape

    obs = env.reset()
    obs = np.asarray(obs)
    assert obs.shape == (num_stack,) + shape
    for i in range(1, num_stack):
        assert np.allclose(obs[i - 1], obs[i])

    obs, _, _, _ = env.step(env.action_space.sample())
    obs = np.asarray(obs)
    assert obs.shape == (num_stack,) + shape
    for i in range(1, num_stack - 1):
        assert np.allclose(obs[i - 1], obs[i])
    assert not np.allclose(obs[-1], obs[-2])
def test_frame_stack(env_id, num_stack, lz4_compress):
    env = gym.make(env_id)
    shape = env.observation_space.shape
    env = FrameStack(env, num_stack, lz4_compress)
    assert env.observation_space.shape == (num_stack,) + shape
    assert env.observation_space.dtype == env.env.observation_space.dtype

    # A second, unwrapped copy of the env is stepped in lockstep: the newest
    # frame of the stack must always match the unwrapped observation.
    dup = gym.make(env_id)

    obs = env.reset(seed=0)
    dup_obs = dup.reset(seed=0)
    assert np.allclose(obs[-1], dup_obs)

    for _ in range(num_stack**2):
        action = env.action_space.sample()
        dup_obs, _, _, _ = dup.step(action)
        obs, _, _, _ = env.step(action)
        assert np.allclose(obs[-1], dup_obs)

    assert len(obs) == num_stack
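# Usage sketch (an assumption, not from the source): tests like the two above
# are normally driven by pytest parametrization over env ids, stack sizes and
# compression flags. A direct smoke run might look like this; "CartPole-v1"
# and the stack sizes are illustrative, and lz4_compress=True requires the
# optional lz4 package to be installed.
if __name__ == "__main__":
    for env_id in ("CartPole-v1",):
        for num_stack in (2, 4):
            for lz4_compress in (False, True):
                test_frame_stack(env_id, num_stack, lz4_compress)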
class Agent:
    def __init__(self,
                 game: str,
                 replay_buffer_capacity: int,
                 replay_start_size: int,
                 batch_size: int,
                 discount_factor: float,
                 lr: float,
                 device: str = 'cuda:0',
                 env_seed: int = 0,
                 frame_buffer_size: int = 4,
                 print_self=True):
        self.device = device
        self.discount_factor = discount_factor
        self.game = game
        self.batch_size = batch_size
        self.replay_buf = ReplayBuffer(capacity=replay_buffer_capacity)
        self.env = FrameStack(
            AtariPreprocessing(
                gym.make(self.game),
                # noop_max=0,
                # terminal_on_life_loss=True,
                scale_obs=False),
            num_stack=frame_buffer_size)
        self.env.seed(env_seed)
        self.reset()
        self.n_action = self.env.action_space.n
        self.policy_net = DQN(self.n_action).to(self.device)
        self.target_net = DQN(self.n_action).to(self.device).eval()
        self.optimizer = RMSprop(
            self.policy_net.parameters(),
            lr=lr,
            alpha=0.95,
            # momentum=0.95,
            eps=0.01)
        if print_self:
            print(self)
        self._fill_replay_buf(replay_start_size)

    def __repr__(self):
        return '\n'.join([
            'Agent:', f'Game: {self.game}', f'Device: {self.device}',
            f'Policy net: {self.policy_net}',
            f'Target net: {self.target_net}',
            f'Replay buf: {self.replay_buf}'
        ])

    def _fill_replay_buf(self, replay_start_size):
        for _ in trange(replay_start_size,
                        desc='Fill replay_buf randomly',
                        leave=True):
            self.step(1.0)

    def reset(self):
        """Reset the env, pre-populate self.frame_buf and self.state"""
        self.state = self.env.reset()

    @torch.no_grad()
    def step(self, epsilon, clip_reward=True):
        """Choose an action based on current state and epsilon-greedy policy"""
        # Choose action
        if random.random() <= epsilon:
            q_values = None
            action = self.env.action_space.sample()
        else:
            torch_state = torch.tensor(self.state,
                                       dtype=torch.float32,
                                       device=self.device).unsqueeze(0) / 255.0
            q_values = self.policy_net(torch_state)
            action = int(q_values.argmax(dim=1).item())

        # Apply action
        next_state, reward, done, _ = self.env.step(action)
        if clip_reward:
            reward = max(-1.0, min(reward, 1.0))

        # Store into replay buffer (frames normalized to [0, 1], kept on CPU)
        self.replay_buf.append(
            (torch.tensor(np.array(self.state),
                          dtype=torch.float32,
                          device="cpu") / 255., action, reward,
             torch.tensor(np.array(next_state),
                          dtype=torch.float32,
                          device="cpu") / 255., done))

        # Advance to next state
        self.state = next_state
        if done:
            self.reset()
        return reward, q_values, done

    def q_update(self):
        self.optimizer.zero_grad()
        states, actions, rewards, next_states, dones = [
            x.to(self.device) for x in self.replay_buf.sample(self.batch_size)
        ]
        with torch.no_grad():
            # Bellman target: reward alone on terminal transitions, otherwise
            # bootstrap from the target network.
            y = torch.where(
                dones, rewards,
                rewards + self.discount_factor *
                self.target_net(next_states).max(1)[0])
        predicted_values = self.policy_net(states).gather(
            1, actions.unsqueeze(-1)).squeeze(-1)
        loss = huber(y, predicted_values, 2.)
        loss.backward()
        self.optimizer.step()
        return (y - predicted_values).abs().mean()
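# Training-loop sketch for the Agent above. The hyperparameters, the linear
# epsilon schedule, and the update/sync periods are illustrative assumptions,
# not from the source; only the Agent API (step, q_update, policy_net,
# target_net) is taken from the class itself.
if __name__ == "__main__":
    agent = Agent(game="PongNoFrameskip-v4",
                  replay_buffer_capacity=100_000,
                  replay_start_size=10_000,
                  batch_size=32,
                  discount_factor=0.99,
                  lr=2.5e-4)
    for step in range(1_000_000):
        epsilon = max(0.1, 1.0 - 0.9 * step / 100_000)  # linear decay to 0.1
        agent.step(epsilon)
        if step % 4 == 0:
            agent.q_update()
        if step % 10_000 == 0:
            agent.target_net.load_state_dict(agent.policy_net.state_dict())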
class MarioBaseline(object):
    def __init__(self, episodes, checkpoint, current_episode, epsilon):
        self.current_episode = current_episode
        self.episodes = episodes
        self.episode_score = []
        self.episode_qs = []
        self.episode_distance = []
        self.episode_loss = []

        self.env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
        self.env = JoypadSpace(self.env, SIMPLE_MOVEMENT)

        # Apply Frame Wrappers
        self.env = SkipFrame(self.env, 4)
        self.env = GrayScaleObservation(self.env)
        self.env = ResizeObservation(self.env, 84)
        self.env = FrameStack(self.env, 4)

        self.agent = DQNAgent(stateShape=(4, 84, 84),
                              actionSpace=self.env.action_space,
                              numPicks=32,
                              memorySize=20000,
                              epsilon=epsilon,
                              checkpoint=checkpoint)

    def train(self):
        for _ in range(self.episodes):
            self.episode()
            self.current_episode += 1
        self.env.close()

    def episode(self):
        done = False
        rewardsSum = 0
        qSum = 0
        qActions = 1
        lossSum = 0

        state = np.array(self.env.reset())
        maxDistance = -1000000

        while not done:
            action, q = self.agent.selectAction(state)
            # if q != -100000:
            #     qSum += q
            #     qActions += 1

            obs, reward, done, info = self.env.step(action)

            if info['x_pos'] > maxDistance:
                maxDistance = info['x_pos']

            next_state = np.array(obs)
            rewardsSum = np.add(rewardsSum, reward)

            self.agent.addMemory(FloatTensor(state), LongTensor([action]),
                                 FloatTensor([reward]),
                                 FloatTensor(next_state), LongTensor([done]))
            loss = self.agent.trainDQN()
            state = next_state
            lossSum += loss

            # Periodically sync the target network with the online network.
            if self.agent.step % self.agent.sync == 0:
                self.agent.targetNetwork.load_state_dict(
                    self.agent.trainNetwork.state_dict())

            # Exponential epsilon decay toward epsilon_min.
            self.agent.epsilon = self.agent.epsilon_min + (
                self.agent.epsilon_start - self.agent.epsilon_min) * math.exp(
                    -1 * ((self.agent.step + 1) / self.agent.epsilon_decay))

        if self.current_episode % 200 == 0:
            self.agent.save(self.current_episode)

        print(
            "now epsilon is {}, the reward is {} with loss {} in episode {}, step {}, dist {}"
            .format(self.agent.epsilon, rewardsSum, lossSum,
                    self.current_episode, self.agent.step, maxDistance))

        self.episode_score.append(rewardsSum)
        self.episode_qs.append(qSum / qActions)
        self.episode_distance.append(maxDistance)
        self.episode_loss.append(lossSum)
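# SkipFrame is used above (and below) but not defined in this excerpt. A
# minimal sketch consistent with its call site SkipFrame(env, 4) -- an
# assumption, not the source's implementation: repeat each chosen action
# `skip` frames and return the accumulated reward, matching the old gym
# 4-tuple step API used throughout these snippets.
import gym


class SkipFrame(gym.Wrapper):
    def __init__(self, env, skip):
        super().__init__(env)
        self._skip = skip

    def step(self, action):
        total_reward = 0.0
        done = False
        for _ in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            total_reward += reward
            if done:
                break
        return obs, total_reward, done, info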
class MarioBaseline(object):
    def __init__(self, episodes):
        self.current_episode = 0
        self.episodes = episodes
        self.episode_score = []
        self.episode_qs = []
        self.episode_distance = []
        self.episode_loss = []

        self.fig, self.ax = plt.subplots(2, 2)
        self.fig.canvas.draw()
        plt.show(block=False)

        self.env = gym_super_mario_bros.make('SuperMarioBros-v0')

        # Apply Observation Wrappers
        self.env = GrayScaleObservation(self.env)
        self.env = ResizeObservation(self.env, 84)
        # Apply Control Wrappers
        self.env = JoypadSpace(self.env, SIMPLE_MOVEMENT)
        self.env = NoopResetEnv(self.env)
        # Apply Frame Wrappers
        self.env = SkipFrame(self.env, 4)
        self.env = FrameStack(self.env, 4)

        self.agent = DQNAgent(stateShape=(84, 84, 4),
                              actionSpace=self.env.action_space,
                              numPicks=32,
                              memorySize=100000)

    def train(self):
        for _ in range(self.episodes):
            self.episode()
            self.plot()
            self.current_episode += 1
        self.env.close()

    def episode(self):
        done = False
        rewardsSum = 0
        qSum = 0
        qActions = 1
        lossSum = 0

        state = np.array(self.env.reset()).transpose(3, 1, 2, 0)
        maxDistance = -1000000
        lastX = 0

        while not done:
            action, q = self.agent.selectAction(state)
            if q != -100000:
                qSum += q
                qActions += 1

            obs, reward, done, info = self.env.step(action)
            self.env.render()

            # Reward shaping: penalize moving backwards, bonus for the flag.
            if info['x_pos'] < lastX:
                reward -= 1
            lastX = info['x_pos']
            if info['flag_get']:
                reward += 10
            if info['x_pos'] > maxDistance:
                maxDistance = info['x_pos']

            nextState = np.array(obs).transpose(3, 1, 2, 0)
            rewardsSum = np.add(rewardsSum, reward)

            self.agent.addMemory((state, action, reward, nextState, done))
            loss = self.agent.trainDQN()
            state = nextState
            lossSum += loss

        if self.current_episode % 200 == 0:
            self.agent.save(self.current_episode)

        print("now epsilon is {}, the reward is {} with loss {} in episode {}".
              format(self.agent.epsilon, rewardsSum, lossSum,
                     self.current_episode))

        self.episode_score.append(rewardsSum)
        self.episode_qs.append(qSum / qActions)
        self.episode_distance.append(maxDistance)
        self.episode_loss.append(lossSum)

    def plot(self):
        self.ax[0][0].title.set_text('Training Score')
        self.ax[0][0].set_xlabel('Episode')
        self.ax[0][0].set_ylabel('Score')
        self.ax[0][0].plot(self.episode_score, 'b')

        self.ax[0][1].title.set_text('Training Distance')
        self.ax[0][1].set_xlabel('Episode')
        self.ax[0][1].set_ylabel('Distance')
        self.ax[0][1].plot(self.episode_distance, 'g')

        self.ax[1][0].title.set_text('Training Loss')
        self.ax[1][0].set_xlabel('Episode')
        self.ax[1][0].set_ylabel('Loss')
        self.ax[1][0].plot(self.episode_loss, 'r')

        self.ax[1][1].title.set_text('Training Q Vals')
        self.ax[1][1].set_xlabel('Episode')
        self.ax[1][1].set_ylabel('Qs')
        self.ax[1][1].plot(self.episode_qs, 'c')

        self.fig.canvas.draw()
        plt.show(block=False)
        plt.pause(.001)
class MarioBaseline(object):
    def __init__(self, episodes, checkpoint, current_episode, epsilon):
        self.current_episode = current_episode
        self.episodes = episodes
        self.episode_score = []
        self.episode_qs = []
        self.episode_distance = []
        self.episode_loss = []
        self.episode_policies = []

        self.fig, self.ax = plt.subplots(1, 2, figsize=(12, 4))
        self.fig.canvas.draw()

        self.env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')

        # Apply Observation Wrappers
        self.env = GrayScaleObservation(self.env)
        self.env = ResizeObservation(self.env, 84)
        # Apply Control Wrappers
        self.env = JoypadSpace(self.env, SIMPLE_MOVEMENT)
        self.env = NoopResetEnv(self.env)
        # Apply Frame Wrappers
        self.env = SkipFrame(self.env, 4)
        self.env = FrameStack(self.env, 4)

        self.agent = DQNAgent(stateShape=(4, 84, 84),
                              actionSpace=self.env.action_space,
                              numPicks=32,
                              memorySize=20000,
                              numRewards=4,
                              epsilon=epsilon,
                              checkpoint=checkpoint)

    def train(self):
        for _ in range(self.episodes):
            self.episode()
            self.current_episode += 1
        self.env.close()

    def episode(self):
        done = False
        rewardsSum = 0
        qSum = 0
        qActions = 1
        lossSum = 0

        # One counter per reward policy, plus one for random actions.
        policies = [0] * (4 + 1)
        lossSums = [0] * 4

        state = np.array(self.env.reset())
        maxDistance = -1000000
        lastX = 0
        lastT = 0
        lastC = 0

        while not done:
            action, policy, qs, ws, was_random = self.agent.selectAction(state)
            policies[policy] += 1

            obs, _, done, info = self.env.step(action)
            # self.env.render()

            if info['x_pos'] > maxDistance:
                maxDistance = info['x_pos']

            # Decompose the reward into four channels: x-progress, elapsed
            # time (clamped to be non-positive), coins, and death penalty.
            rewardX = info['x_pos'] - lastX
            lastX = info['x_pos']

            rewardT = info['time'] - lastT
            if rewardT > 0:
                rewardT = 0
            lastT = info['time']

            rewardC = info['coins'] - lastC
            lastC = info['coins']

            rewardD = self.env.unwrapped._death_penalty

            next_state = np.array(obs)
            rewardsSum = np.add(rewardsSum, rewardX)
            rewardsSum = np.add(rewardsSum, rewardT)
            rewardsSum = np.add(rewardsSum, rewardC)
            rewardsSum = np.add(rewardsSum, rewardD)

            self.agent.addMemory(state, action, policy,
                                 [rewardX, rewardT, rewardC, rewardD],
                                 next_state, done)
            loss = self.agent.trainDQN()
            state = next_state
            lossSums = [lossSums[i] + loss[i][0] for i in range(len(lossSums))]

            # Exponential epsilon decay toward epsilon_min.
            self.agent.epsilon = self.agent.epsilon_min + (
                1 - self.agent.epsilon_min) * math.exp(-1 * (
                    (self.agent.step + 1) / self.agent.epsilon_decay))

        print(
            "now epsilon is {}, the reward is {} with loss {} in episode {}, step {}, dist {}"
            .format(self.agent.epsilon, rewardsSum, lossSums,
                    self.current_episode, self.agent.step, maxDistance))

        self.episode_score.append(rewardsSum)
        self.episode_policies.append(policies)

        if self.current_episode % 200 == 0:
            self.agent.save(self.current_episode)
            self.plot()

    def plot(self):
        spline_x = np.linspace(0, self.current_episode,
                               num=self.current_episode)
        ep_scores = np.array(self.episode_score)
        ep_groups = [
            ep_scores[i * GROUP_NUM:(i + 1) * GROUP_NUM]
            for i in range((len(ep_scores) + GROUP_NUM - 1) // GROUP_NUM)
        ]
        # Pad the last (possibly short) group so np.mean/np.std see
        # equal-length rows.
        ep_groups[-1] = np.append(
            ep_groups[-1],
            [np.mean(ep_groups[-1])] * (GROUP_NUM - len(ep_groups[-1])))
        x_groups = [i * GROUP_NUM for i in range(len(ep_groups))]

        self.ax[0].clear()
        if len(x_groups) > 5:
            ep_avgs = np.mean(ep_groups, 1)
            avg_spl = interp1d(x_groups,
                               ep_avgs,
                               kind="cubic",
                               fill_value="extrapolate")
            ep_std = np.std(ep_groups, 1)
            std_spl = interp1d(x_groups,
                               ep_std,
                               kind="cubic",
                               fill_value="extrapolate")
            self.ax[0].plot(spline_x, avg_spl(spline_x), lw=0.7, c="blue")
            self.ax[0].fill_between(
                spline_x,
                avg_spl(spline_x) - std_spl(spline_x),
                avg_spl(spline_x) + std_spl(spline_x),
                alpha=0.5,
                facecolor="red",
                interpolate=True,
            )

        self.ax[0].title.set_text("Training Score")
        self.ax[0].set_xlabel("Episode")
        self.ax[0].set_ylabel("Score")

        policies = np.transpose(self.episode_policies)
        colors = pl.cm.jet(np.linspace(0, 1, len(policies) * 2))

        self.ax[1].clear()
        self.ax[1].title.set_text("Policy Choices")
        for i, policy in enumerate(policies):
            if len(x_groups) > 5:
                ep_groups = [
                    policy[j * GROUP_NUM:(j + 1) * GROUP_NUM]
                    for j in range((len(policy) + GROUP_NUM - 1) // GROUP_NUM)
                ]
                # Pad the last group as above.
                ep_groups[-1] = np.append(
                    ep_groups[-1],
                    [np.mean(ep_groups[-1])] * (GROUP_NUM - len(ep_groups[-1])),
                )
                x_groups = [j * GROUP_NUM for j in range(len(ep_groups))]

                ep_avgs = np.mean(ep_groups, 1)
                avg_spl = interp1d(x_groups,
                                   ep_avgs,
                                   kind="cubic",
                                   fill_value="extrapolate")
                ep_std = np.std(ep_groups, 1)
                std_spl = interp1d(x_groups,
                                   ep_std,
                                   kind="cubic",
                                   fill_value="extrapolate")
                self.ax[1].plot(
                    spline_x,
                    avg_spl(spline_x),
                    lw=0.7,
                    c=colors[i],
                    label="{} policy".format(PolEnum(i).name),
                )
                self.ax[1].fill_between(
                    spline_x,
                    avg_spl(spline_x) - std_spl(spline_x),
                    avg_spl(spline_x) + std_spl(spline_x),
                    alpha=0.5,
                    facecolor=colors[-1 - i],
                    interpolate=True,
                )

        self.ax[1].legend()
        self.fig.canvas.draw()
        plt.savefig("mario_w_pddqn_{}.png".format(self.current_episode))