class Env(object):

    def __init__(self, act_space, act_repeats, frames, state_size, game):
        self.act_space = act_space
        self.act_repeats = act_repeats
        self.act_repeat = random.choice(self.act_repeats)
        self.frames = frames
        self.state_size = state_size
        self.game = game
        self.max_pos = -10000
        self.count = 0

        env = gym_super_mario_bros.make(game)
        if self.act_space == 7:
            self.env = JoypadSpace(env, SIMPLE_MOVEMENT)
        elif self.act_space == 12:
            self.env = JoypadSpace(env, COMPLEX_MOVEMENT)

        s_t = self.resize_image(self.env.reset())
        self.s_t = np.tile(s_t, [1, 1, frames])
        self.s = [self.s_t]

        self.a_t = random.randint(0, act_space - 1)
        self.a = [self.a_t]
        self.a_logits = []
        self.r = [0]
        self.pos = []

        self.v_cur = []

        state_in = np.zeros(self.state_size, dtype=np.float32)
        self.state_in = [state_in]

        self.done = False

    def step(self, a, a_logits, state_in):
        self.count += 1
        # Only switch to the new action every `act_repeat` steps, then
        # re-sample the repeat length (randomized sticky actions).
        if self.count % self.act_repeat == 0:
            self.a_t = a
            self.count = 0
            self.act_repeat = random.choice(self.act_repeats)
        gs_t1, gr_t, gdone, ginfo = self.env.step(self.a_t)
        self.env.render()
        if not gdone:
            # Step a second time and average the two rewards (frame skip of 2).
            s_t1, r_t, done, info = self.env.step(self.a_t)
            r_t += gr_t
            r_t /= 2.
        else:
            s_t1 = gs_t1
            r_t = gr_t
            done = gdone
            info = ginfo
        r_t /= 15.
        s_t1 = self.resize_image(s_t1)

        # Push the new frame into the frame stack, dropping the oldest one.
        channels = s_t1.shape[-1]
        self.s_t = np.concatenate([s_t1, self.s_t[:, :, :-channels]], axis=-1)

        self.s.append(self.s_t)
        self.a.append(self.a_t)
        self.a_logits.append(a_logits)
        self.r.append(r_t)
        self.max_pos = max(self.max_pos, info["x_pos"])
        self.pos.append(info["x_pos"])
        # Terminate the episode if Mario has moved less than 5 pixels in
        # either direction over the last 500 steps.
        if (len(self.pos) > 500) and (
                info["x_pos"] - self.pos[-500] < 5) and (
                self.pos[-500] - info["x_pos"] < 5):
            done = True
        self.done = done

        self.state_in.append(state_in)

    def update_v(self, v_cur):
        self.v_cur.append(v_cur)

    def reset(self, force=False):
        if self.done or force:
            max_pos = self.max_pos
            self.max_pos = -10000
            logging.info(" Max Position %s : %d" % (self.game, max_pos))
            self.count = 0
            self.act_repeat = random.choice(self.act_repeats)

            s_t = self.resize_image(self.env.reset())
            self.s_t = np.tile(s_t, [1, 1, self.frames])
            self.s = [self.s_t]

            self.a_t = random.randint(0, self.act_space - 1)
            self.a = [self.a_t]
            self.a_logits = []
            self.r = [0]
            self.pos = []

            self.v_cur = []

            state_in = np.zeros(self.state_size, dtype=np.float32)
            self.state_in = [state_in]

            self.done = False

    def get_state(self):
        return self.s_t

    def get_act(self):
        return self.a_t

    def get_max_pos(self):
        return self.max_pos

    def reset_max_pos(self):
        self.max_pos = -10000

    def get_state_in(self):
        return self.state_in[-1]

    def get_history(self, force=False):
        if self.done or force:
            if self.done:
                gaes = get_gaes(None, self.r, self.v_cur,
                                self.v_cur[1:] + [0], 0.99, 0.95)[0]
                seg = Seg(self.s, self.a, self.a_logits, self.r,
                          gaes, self.v_cur, self.state_in)
                return seg
            if force and len(self.r) > 1:
                gaes = get_gaes(None, self.r[:-1], self.v_cur[:-1],
                                self.v_cur[1:], 0.99, 0.95)[0]
                seg = Seg(self.s[:-1], self.a[:-1], self.a_logits[:-1],
                          self.r[:-1], gaes, self.v_cur[:-1],
                          self.state_in[:-1])
                return seg
        return None

    @staticmethod
    def resize_image(image, size=84):
        # Grayscale, resize to `size` x `size`, and normalize to [0, 1].
        image = Image.fromarray(image)
        image = image.convert("L")
        image = image.resize((size, size))
        image = np.array(image, np.float32) / 255.
        return image[:, :, None]
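# --- Hedged usage sketch for the Env wrapper above (not from the source). ---
# A real training loop would supply `a`, `a_logits`, and the value estimate
# from policy/value networks; random placeholders are used here only to show
# the calling convention.
env = Env(act_space=7, act_repeats=[4], frames=4,
          state_size=256, game="SuperMarioBros-v0")
for _ in range(100):
    a = random.randint(0, 6)                 # stand-in for a sampled action
    logits = np.zeros(7, dtype=np.float32)   # stand-in for policy logits
    env.step(a, logits, env.get_state_in())
    env.update_v(0.0)                        # stand-in for a value estimate
    if env.done:
        break
print("max x position reached:", env.get_max_pos())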
from nes_py.wrappers import JoypadSpace
import gym
from Contra.actions import SIMPLE_MOVEMENT, COMPLEX_MOVEMENT, RIGHT_ONLY
import time

"""
pip install gym-contra
https://github.com/OuYanghaoyue/gym_contra
"""

env = gym.make('Contra-v0')
env = JoypadSpace(env, RIGHT_ONLY)

print("actions", env.action_space)
print("observation_space", env.observation_space.shape[0])

done = False
env.reset()
for step in range(5000):
    if done:
        print("Over")
        break
    time.sleep(0.01)
    action = env.action_space.sample()
    print("action", action)
    state, reward, done, info = env.step(action)
    env.render()

env.close()
def train():
    # Hyperparameters
    cfg = DictConfig({
        "epochs": 1,
        "lr": 1e-4,
        "use_extrinsic": True,
        "max_episode_len": 1000,
        "min_progress": 15,
        "frames_per_state": 3,
        "action_repeats": 6,
        "gamma_q": 0.85,
        "epsilon_random": 0.1,  # Sample random action with epsilon probability
        "epsilon_greedy_switch": 1000,
        "q_loss_weight": 0.01,
        "inverse_loss_weight": 0.5,
        "forward_loss_weight": 0.5,
        "intrinsic_weight": 1.0,
        "extrinsic_weight": 1.0,
    })

    # ----- setting up variables -----
    q_model = MarioModel(cfg.frames_per_state)
    icm_model = MarioICM(cfg.frames_per_state)
    optim = torch.optim.Adam(
        list(q_model.parameters()) + list(icm_model.parameters()), lr=cfg.lr)
    replay = ExperienceReplay(buffer_size=500, batch_size=100)

    env = gym_super_mario_bros.make("SuperMarioBros-v0")
    env = JoypadSpace(env, COMPLEX_MOVEMENT)

    # Counters and stats
    last_x_pos = 0
    current_episode = 0
    global_step = 0
    current_step = 0
    cumulative_reward = 0
    ep_rewards = []

    # ----- training loop -----
    for epoch in range(cfg.epochs):
        state = env.reset()
        done = False

        # Monte Carlo loop
        while not done:
            # ------------ Q-learning --------------
            if current_step == 0:
                state = prepare_initial_state(env.render("rgb_array"))
            else:
                state = prepare_multi_state(state, env.render("rgb_array"))
            q_values = q_model(state)
            action = sample_action(
                q_values,
                cfg.epsilon_random,
                apply_epsilon=global_step > cfg.epsilon_greedy_switch,
            )

            # Repeat the chosen action, accumulating the extrinsic reward
            # across the repeated frames.
            action_count = 0
            state2 = None
            total_reward = 0
            while True:
                state2_, reward, done, info = env.step(action)
                total_reward += reward
                if state2 is None:
                    # Keep the first frame of the repeat as the next state.
                    state2 = state2_
                env.render()
                if action_count >= cfg.action_repeats or done:
                    break
                action_count += 1
            reward = total_reward

            state2 = prepare_multi_state(state, state2)

            # Add intrinsic reward
            intrinsic_reward = get_intrinsic_reward(state, action, state2,
                                                    icm_model)
            print("in reward", intrinsic_reward.item())
            print("ex reward", reward)
            reward = (cfg.intrinsic_weight * intrinsic_reward) + \
                     (cfg.extrinsic_weight * reward)
            q_loss = get_q_loss(q_values[0][action], reward, q_model, state2,
                                cfg.gamma_q)
            replay.add(state, action, reward, state2)
            state = state2

            # ------------- ICM -------------------
            state1_batch, action_batch, reward_batch, state2_batch = \
                replay.get_batch()
            action_pred, state2_encoded, state2_pred = icm_model(
                state1_batch, action_batch, state2_batch)
            inverse_loss = F.cross_entropy(action_pred, action_batch)
            forward_loss = F.mse_loss(state2_pred, state2_encoded)

            # ------------ Learning ------------
            final_loss = ((cfg.q_loss_weight * q_loss) +
                          (cfg.inverse_loss_weight * inverse_loss) +
                          (cfg.forward_loss_weight * forward_loss))
            optim.zero_grad()
            final_loss.backward()
            optim.step()

            # ------------ updates --------------
            # TODO: add loss scalars
            print("--------loss: ", final_loss.item())

            max_episode_len_reached = current_step >= cfg.max_episode_len
            # TODO: detect lack of progress by comparing x_pos against
            # cfg.min_progress
            no_progress = False
            done = done or max_episode_len_reached or no_progress
            if done:
                if max_episode_len_reached:
                    # TODO: Add scalar: 'max episode len reached' current_episode, auto
                    pass
                elif no_progress:
                    # TODO: Add scalar: 'no progress' current_episode, auto
                    pass
                # TODO: add scalar: 'episode len' current_step, current_episode
                # TODO: Plot cumulative reward for each episode
                # TODO: Plot the x_pos after the episode
                # TODO: Plot total sum of rewards for each episode
                # TODO: Every n episodes save the video ->
                #   imageio.mimwrite('gameplay.mp4', renders, fps=30)
                current_step = -1
                current_episode += 1

            global_step += 1
            current_step += 1
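# --- Hypothetical sketch of the `get_intrinsic_reward` helper used in
# train() above (not from the source). It assumes MarioICM returns
# (action_pred, state2_encoded, state2_pred), matching the batched call in
# the ICM section; the curiosity signal is the forward-model prediction
# error in feature space, detached so it acts as a reward, not a gradient.
def get_intrinsic_reward(state1, action, state2, icm_model):
    action_t = torch.tensor([action])
    with torch.no_grad():
        _, state2_encoded, state2_pred = icm_model(state1, action_t, state2)
    # Mean squared error between predicted and actual next-state features.
    return F.mse_loss(state2_pred, state2_encoded)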
class MoMarioEnv(Process):
    def __init__(self, args, env_idx, child_conn, history_size=4, h=84, w=84):
        super(MoMarioEnv, self).__init__()
        self.daemon = True
        self.env = JoypadSpace(
            gym_super_mario_bros.make(args.env_id), SIMPLE_MOVEMENT)
        self.is_render = args.render
        self.env_idx = env_idx
        self.steps = 0
        self.episode = 0
        self.rall = 0
        self.coin = 0
        self.x_pos = 0
        self.time = 0
        self.score = 0
        self.n_mo = 5
        self.morall = np.zeros(self.n_mo)
        self.recent_rlist = deque(maxlen=100)
        self.recent_morlist = deque(maxlen=100)
        self.child_conn = child_conn

        self.life_done = args.life_done
        self.single_stage = args.single_stage
        self.stage_bonus = 0

        self.history_size = history_size
        self.history = np.zeros([history_size, h, w])
        self.h = h
        self.w = w

        self.reset()

    def run(self):
        super(MoMarioEnv, self).run()
        while True:
            action = self.child_conn.recv()
            if self.is_render:
                self.env.render()
            obs, reward, done, info = self.env.step(action)

            if self.single_stage and info["flag_get"]:
                self.stage_bonus = 10000
                done = True

            ''' Construct the multi-objective reward '''
            #####################################
            # [x_pos, time, death, coin, enemy]
            moreward = []

            # 1. x position
            xpos_r = info["x_pos"] - self.x_pos
            self.x_pos = info["x_pos"]
            # resolve an issue where the x position resets after death
            if xpos_r < -5:
                xpos_r = 0
            moreward.append(xpos_r)

            # 2. time penalty
            time_r = info["time"] - self.time
            self.time = info["time"]
            # time is always decreasing
            if time_r > 0:
                time_r = 0
            moreward.append(time_r)

            # 3. death
            if self.lives > info['life']:
                death_r = -25
            else:
                death_r = 0
            moreward.append(death_r)

            # 4. coin
            coin_r = (info['coins'] - self.coin) * 100
            self.coin = info['coins']
            moreward.append(coin_r)

            # 5. enemy
            enemy_r = info['score'] - self.score
            if coin_r > 0 or done:
                enemy_r = 0
            self.score = info['score']
            moreward.append(enemy_r)

            ############################################################################

            if self.life_done:
                # when Mario loses a life, treat it as a terminal state
                if self.lives > info['life'] and info['life'] > 0:
                    force_done = True
                    self.lives = info['life']
                else:
                    force_done = done
                    self.lives = info['life']
            else:
                # normal terminal state
                force_done = done

            # reward range -15 ~ 15
            r = reward / 15
            self.rall += reward
            self.morall += np.array(moreward)
            mor = np.array(moreward) * self.n_mo / 15

            self.history[:3, :, :] = self.history[1:, :, :]
            self.history[3, :, :] = self.pre_proc(obs)

            self.steps += 1
            score = info['score'] + self.stage_bonus

            if done:
                self.recent_rlist.append(self.rall)
                self.recent_morlist.append(self.morall)
                print(
                    "[Episode {}({})]\tStep: {}\tScore: {}\tMoReward: {}\tRecent MoReward: {}\tcoin: {}\tcurrent x:{}"
                    .format(self.episode, self.env_idx, self.steps, score,
                            self.morall, np.mean(self.recent_morlist, axis=0),
                            info['coins'], info['x_pos']))
                self.history = self.reset()

            self.child_conn.send(
                [self.history[:, :, :], r, force_done, done, mor, score])

    def reset(self):
        self.steps = 0
        self.episode += 1
        self.rall = 0
        self.lives = 3
        self.coin = 0
        self.x_pos = 0
        self.time = 0
        self.score = 0
        self.stage_bonus = 0
        self.morall = np.zeros(self.n_mo)
        self.get_init_state(self.env.reset())
        return self.history[:, :, :]

    def pre_proc(self, X):
        # grayscaling
        x = cv2.cvtColor(X, cv2.COLOR_RGB2GRAY)
        # resize
        x = cv2.resize(x, (self.h, self.w))
        x = np.float32(x) * (1.0 / 255.0)
        return x

    def get_init_state(self, s):
        for i in range(self.history_size):
            self.history[i, :, :] = self.pre_proc(s)
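# --- Hedged usage sketch: driving a single MoMarioEnv worker over a Pipe
# (not from the source; the args Namespace below is a placeholder for
# whatever argument parser the original project uses). ---
from argparse import Namespace
from multiprocessing import Pipe

args = Namespace(env_id='SuperMarioBros-v0', render=False,
                 life_done=True, single_stage=True)
parent_conn, child_conn = Pipe()
worker = MoMarioEnv(args, env_idx=0, child_conn=child_conn)
worker.start()
parent_conn.send(0)  # action index 0 is NOOP in SIMPLE_MOVEMENT
history, r, force_done, done, mor, score = parent_conn.recv()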
class ICMTrainer:
    """ Compose encoder, forward/inverse, and q_model into a single trainer entity """

    def __init__(self):
        self.env = gym_super_mario_bros.make('SuperMarioBros-v0')
        self.env = JoypadSpace(self.env, COMPLEX_MOVEMENT)
        self.replay = SMBExperienceReplay(buffer_size=BUFFER_SIZE,
                                          batch_size=BATCH_SIZE)
        self.q_model = Qnetwork()
        self.encoder = Phi()
        self.forward_model = Fnet()
        self.inverse_model = Gnet()
        all_model_params = list(self.q_model.parameters()) + \
            list(self.encoder.parameters())
        all_model_params += list(self.forward_model.parameters()) + \
            list(self.inverse_model.parameters())
        self.opt = optim.Adam(lr=0.001, params=all_model_params)

    @staticmethod
    def combined_loss(q_loss, inverse_loss, forward_loss):
        """ Overall loss fn:
        lambda * q_loss + (1 - beta) * inverse_loss + beta * forward_loss
        """
        loss_ = (1 - BETA) * inverse_loss
        loss_ += BETA * forward_loss
        loss_ = loss_.sum() / loss_.flatten().shape[0]
        loss = loss_ + LAMBDA * q_loss
        return loss

    def icm_loss(self, state1, action, state2,
                 forward_scale=1., inverse_scale=1e4):
        """ Calculate forward- and inverse-model losses for the ICM """
        fwd_loss_fn = nn.MSELoss(reduction='none')
        inverse_loss_fn = nn.CrossEntropyLoss(reduction='none')

        # encode input states
        state1_hat = self.encoder(state1)
        state2_hat = self.encoder(state2)

        # detach: the forward loss should not back-prop into the encoder
        state2_hat_pred = self.forward_model(state1_hat.detach(),
                                             action.detach())
        forward_pred_err = fwd_loss_fn(
            state2_hat_pred, state2_hat.detach()).sum(dim=1).unsqueeze(dim=1)
        forward_pred_err *= forward_scale

        pred_action = self.inverse_model(state1_hat, state2_hat)
        inverse_pred_err = inverse_loss_fn(
            pred_action, action.detach().flatten()).unsqueeze(dim=1)
        inverse_pred_err *= inverse_scale
        return forward_pred_err, inverse_pred_err

    def batch_forward_pass(self, use_extrinsic=True):
        """ Single forward pass that generates forward err, inverse err and q_loss """
        # pylint: disable=E1101
        state1_batch, action_batch, reward_batch, state2_batch = \
            self.replay.get_batch()
        # reshape action/reward batches to be compatible with the models
        action_batch = action_batch.view(action_batch.shape[0], 1)
        reward_batch = reward_batch.view(reward_batch.shape[0], 1)

        # run ICM
        forward_pred_err, inverse_pred_err = self.icm_loss(
            state1_batch, action_batch, state2_batch)
        # scale the forward prediction error by the eta parameter
        i_reward = (1. / ETA) * forward_pred_err
        reward = i_reward.detach()
        if use_extrinsic:
            # whether to include explicit rewards in training
            reward += reward_batch

        # discount expected values for the next state
        qvals = self.q_model(state2_batch)
        reward += GAMMA * torch.max(qvals)

        reward_pred = self.q_model(state1_batch)
        reward_target = reward_pred.clone()
        # convert the integer action batch to (row, column) indices
        indices = torch.stack(
            (torch.arange(action_batch.shape[0]), action_batch.squeeze()),
            dim=0)
        indices = indices.tolist()
        reward_target[indices] = reward.squeeze()
        q_loss = 1e5 * nn.MSELoss()(F.normalize(reward_pred),
                                    F.normalize(reward_target.detach()))
        return forward_pred_err, inverse_pred_err, q_loss

    def repeat_action(self, action):
        """ Repeat the given action a fixed number of times; return the
        collected frames, done flag, summed reward, and last info dict """
        state_deque = deque(maxlen=FRAMES_PER_STATE)
        sum_rewards = 0
        for _ in range(ACTION_REPEATS):
            state2, e_reward_, done, info = self.env.step(action)
            if done:
                break
            sum_rewards += e_reward_
            downscaled_state2 = downscale_img(state2, to_gray=True)
            # pylint: disable=E1101
            prepped_state2 = torch.from_numpy(
                downscaled_state2).float().unsqueeze(dim=0)
            state_deque.append(prepped_state2)
        return state_deque, done, sum_rewards, info

    def train(self):
        """ Full training loop """
        self.env.reset()
        state1 = prepare_initial_state(self.env.render('rgb_array'))
        losses = []
        ep_lengths = []
        episode_length = 0
        last_x_pos = self.env.env.env._x_position
        for i in range(TRAINING_STEPS):
            self.opt.zero_grad()
            episode_length += 1
            q_val_pred = self.q_model(state1)
            if i > SWITCH_TO_EPS_GREEDY:
                action = int(sample_action(q_val_pred, EPS))
            else:
                action = int(sample_action(q_val_pred))
            state_deque, done, extrinsic_reward, info = \
                self.repeat_action(action)
            # pylint: disable=E1101
            state2 = torch.stack(list(state_deque), dim=1)
            self.replay.add_memory(
                state1,
                action,
                extrinsic_reward,  # summed across repeated actions
                state2)

            if i % MAX_EPISODE_LEN == 0 and i != 0:
                if (info['x_pos'] - last_x_pos) < MIN_PROGRESS:
                    done = True
                else:
                    last_x_pos = info['x_pos']
            if done:
                print("Episode over.")
                ep_lengths.append(info['x_pos'])
                self.env.reset()
                state1 = prepare_initial_state(self.env.render('rgb_array'))
                last_x_pos = self.env.env.env._x_position
                episode_length = 0
            else:
                state1 = state2

            # Enter mini-batch training
            if len(self.replay.memory) < BATCH_SIZE:
                continue
            forward_pred_err, inverse_pred_err, q_loss = \
                self.batch_forward_pass(use_extrinsic=False)
            # argument order matches combined_loss(q_loss, inverse, forward)
            loss = self.combined_loss(q_loss, inverse_pred_err,
                                      forward_pred_err)
            loss_list = (q_loss.mean(), forward_pred_err.flatten().mean(),
                         inverse_pred_err.flatten().mean(), episode_length)
            if i % 250 == 0:
                print("Epoch {}, Loss: {}".format(i, loss))
                print("Forward loss: {} \n Inverse loss: {} \n Qloss: {}"
                      .format(forward_pred_err.mean(),
                              inverse_pred_err.mean(), q_loss.mean()))
                print(info)
            losses.append(loss_list)
            loss.backward()
            self.opt.step()
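# --- The ICMTrainer above assumes these module-level constants; the values
# below are representative placeholders (not from the source). ---
BUFFER_SIZE = 500            # replay buffer capacity
BATCH_SIZE = 100             # mini-batch size for ICM/Q updates
BETA = 0.2                   # inverse vs. forward loss trade-off
LAMBDA = 0.1                 # weight of the Q-loss in the combined loss
ETA = 1.0                    # scales forward error into intrinsic reward
GAMMA = 0.95                 # discount factor for the TD target
EPS = 0.1                    # epsilon for epsilon-greedy action sampling
FRAMES_PER_STATE = 3         # frames stacked into one state
ACTION_REPEATS = 6           # times each chosen action is repeated
MAX_EPISODE_LEN = 100        # steps between progress checks
MIN_PROGRESS = 15            # minimum x-pos gain to avoid a forced reset
SWITCH_TO_EPS_GREEDY = 1000  # step after which epsilon-greedy kicks in
TRAINING_STEPS = 5000        # total optimization steps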
class MarioEnvironment(Process):
    def __init__(
            self,
            env_id,
            is_render,
            env_idx,
            child_conn,
            history_size=4,
            life_done=True,
            h=84,
            w=84,
            movement=COMPLEX_MOVEMENT,
            sticky_action=True,
            p=0.25):
        super(MarioEnvironment, self).__init__()
        self.daemon = True
        self.env = JoypadSpace(gym_super_mario_bros.make(env_id), movement)

        self.is_render = is_render
        self.env_idx = env_idx
        self.steps = 0
        self.episode = 0
        self.rall = 0
        self.recent_rlist = deque(maxlen=100)
        self.child_conn = child_conn

        self.life_done = life_done
        self.sticky_action = sticky_action
        self.p = p

        self.history_size = history_size
        self.history = np.zeros([history_size, h, w])
        self.h = h
        self.w = w

        self.reset()

    def run(self):
        super(MarioEnvironment, self).run()
        while True:
            action = self.child_conn.recv()
            if self.is_render:
                self.env.render()

            # sticky actions: with probability p, repeat the previous action
            if self.sticky_action:
                if np.random.rand() <= self.p:
                    action = self.last_action
                self.last_action = action

            obs, reward, done, info = self.env.step(action)

            # when Mario loses a life, treat it as a terminal state
            if self.life_done:
                if self.lives > info['life'] and info['life'] > 0:
                    force_done = True
                    self.lives = info['life']
                else:
                    force_done = done
                    self.lives = info['life']
            else:
                force_done = done

            # reward range -15 ~ 15
            log_reward = reward / 15
            self.rall += log_reward
            r = log_reward

            self.max_pos = max(self.max_pos, info['x_pos'])
            self.history[:3, :, :] = self.history[1:, :, :]
            self.history[3, :, :] = self.pre_proc(obs)

            self.steps += 1

            if done:
                self.recent_rlist.append(self.rall)
                print(
                    "[Episode {}({})] Step: {} Reward: {} Recent Reward: {} Stage: {} current x:{} max x:{}".format(
                        self.episode, self.env_idx, self.steps, self.rall,
                        np.mean(self.recent_rlist), info['stage'],
                        info['x_pos'], self.max_pos))
                self.history = self.reset()

            self.child_conn.send(
                [self.history[:, :, :], r, force_done, done, log_reward])

    def reset(self):
        self.last_action = 0
        self.steps = 0
        self.episode += 1
        self.rall = 0
        self.lives = 3
        self.stage = 1
        self.max_pos = 0
        self.get_init_state(self.env.reset())
        return self.history[:, :, :]

    def pre_proc(self, X):
        # grayscaling
        x = cv2.cvtColor(X, cv2.COLOR_RGB2GRAY)
        # resize
        x = cv2.resize(x, (self.h, self.w))
        return x

    def get_init_state(self, s):
        for i in range(self.history_size):
            self.history[i, :, :] = self.pre_proc(s)
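# --- Hedged sketch: spawning several MarioEnvironment workers in parallel
# (not from the source). Each worker owns its own emulator instance and
# communicates transitions through its end of a Pipe. ---
from multiprocessing import Pipe

num_workers = 4
workers, parent_conns = [], []
for idx in range(num_workers):
    parent_conn, child_conn = Pipe()
    w = MarioEnvironment('SuperMarioBros-v0', False, idx, child_conn)
    w.start()
    workers.append(w)
    parent_conns.append(parent_conn)

# Broadcast one action to every worker and collect the transitions.
for conn in parent_conns:
    conn.send(0)  # action 0 is NOOP in COMPLEX_MOVEMENT
transitions = [conn.recv() for conn in parent_conns]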
class Env(object):

    def __init__(self, game, **kwargs):
        self.act_space = kwargs.get("act_space")
        self.state_size = kwargs.get("state_size")
        self.burn_in = kwargs.get("burn_in")
        self.seqlen = kwargs.get("seqlen")
        self.n_step = kwargs.get("n_step")
        self.use_soft = kwargs.get("use_soft")
        self.frames = kwargs.get("frames")
        self.sample_epsilon_per_step = kwargs.get("sample_epsilon_per_step")
        # Per-episode (or per-step) epsilon sampled from a small range;
        # see the note after this class.
        self.epsilon = np.power(0.4, random.uniform(4, 8))

        self.game = game
        self.count = 0
        self.count_maxpos = []

        env = gym_super_mario_bros.make(game)
        if self.act_space == 7:
            self.env = JoypadSpace(env, SIMPLE_MOVEMENT)
        elif self.act_space == 12:
            self.env = JoypadSpace(env, COMPLEX_MOVEMENT)

        self.max_pos = -10000
        self.done = True
        self.reset()

    def step(self, a, state_in):
        maxpos = self.reset()
        self.count += 1
        if not self.use_soft:
            if self.sample_epsilon_per_step:
                self.epsilon = np.power(0.4, random.uniform(4, 8))
            # epsilon-greedy exploration
            if random.random() < self.epsilon:
                a = random.randint(0, self.act_space - 1)
        self.a_t = a
        gs_t1, gr_t, gdone, ginfo = self.env.step(self.a_t)
        self.env.render()
        if not gdone:
            # Step a second time and average the two rewards (frame skip of 2).
            s_t1, r_t, done, info = self.env.step(self.a_t)
            r_t += gr_t
            r_t /= 2.
        else:
            s_t1 = gs_t1
            r_t = gr_t
            done = gdone
            info = ginfo
        s_t1 = self.resize_image(s_t1)

        channels = s_t1.shape[-1]
        self.s_t = np.concatenate([s_t1, self.s_t[:, :, :-channels]], axis=-1)

        self.s.append(self.s_t)
        self.a.append(self.a_t)
        self.r.append(r_t)
        self.max_pos = max(self.max_pos, info["x_pos"])
        self.pos.append(info["x_pos"])
        # Terminate if Mario has moved less than 5 pixels in either
        # direction over the last 100 steps.
        if (len(self.pos) > 100) and (
                info["x_pos"] - self.pos[-100] < 5) and (
                self.pos[-100] - info["x_pos"] < 5):
            done = True
        self.done = done
        if self.done:
            self.mask.append(0)
        else:
            self.mask.append(1)

        self.state_in.append(state_in)

        """
        get segs
        """
        # segs = self.get_history()
        #
        # return segs

        return maxpos

    def reset(self):
        if self.done:
            self.count_maxpos.append(self.max_pos)
            # the [1:] slice skips the dummy -10000 entry from __init__
            print(self.game, self.max_pos, len(self.count_maxpos[1:]),
                  np.mean(self.count_maxpos[1:]))
            self.epsilon = np.power(0.4, random.uniform(4, 8))
            self.count = 0

            s_t = self.resize_image(self.env.reset())
            self.s_t = np.tile(s_t, [1, 1, self.frames])
            self.s = [self.s_t]

            self.a_t = random.randint(0, self.act_space - 1)
            self.a = [self.a_t]
            self.r = [0]
            self.mask = [1]

            self.max_pos = -10000
            self.pos = []

            state_in = np.zeros(self.state_size, dtype=np.float32)
            self.state_in = [state_in]

            self.done = False
            return self.count_maxpos
        return None

    def get_state(self):
        return self.s_t

    def get_act(self):
        return self.a_t

    def get_reward(self):
        return self.r[-1]

    def get_max_pos(self):
        return self.max_pos

    def get_state_in(self):
        return self.state_in[-1]

    @staticmethod
    def resize_image(image, size=84):
        # Grayscale and resize; keep uint8 (no normalization here).
        image = Image.fromarray(image)
        image = image.convert("L")
        image = image.resize((size, size))
        image = np.array(image, np.uint8)
        return image[:, :, None]
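# --- Note on the exploration schedule above (worked check, not from the
# source): epsilon = 0.4 ** uniform(4, 8) lies in [0.4**8, 0.4**4],
# i.e. roughly [6.6e-4, 2.6e-2]. Each actor thus explores with a different,
# small epsilon, similar in spirit to per-actor epsilons in Ape-X.
eps_low, eps_high = 0.4 ** 8, 0.4 ** 4
assert abs(eps_low - 6.5536e-4) < 1e-8 and abs(eps_high - 0.0256) < 1e-8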
class MarioEnvironment(dm_env.Environment):
    def __init__(
        self,
        skip_frames: int = 3,
        img_rescale_pc: float = 0.4,
        stack_func: Optional[Callable[[List[np.ndarray]],
                                      np.ndarray]] = np.hstack,
        stack_mode: str = "all",
        grayscale: bool = True,
        black_background: bool = True,
        in_game_score_weight: float = 0.01,
        movement_type: str = "simple",
        world_and_level: Optional[Tuple[int, int]] = None,
        idle_frames_threshold: Optional[int] = 1250,
        colorful_rendering: bool = True,
    ) -> None:
        assert stack_mode in ("first_and_last", "all")
        self._stack_mode = stack_mode

        env_name = ("SuperMarioBros" if world_and_level is None
                    else "SuperMarioBros-%d-%d" % world_and_level)
        env_name += f"-v{int(black_background)}"

        self._smb_env = gym_super_mario_bros.make(env_name)
        self._smb_env = JoypadSpace(self._smb_env,
                                    MOVEMENTS_TYPES[movement_type])

        self._actions_queue = []
        self._colorful_env = None
        # Colorful rendering requires a fixed world and level, since the
        # second environment must replay exactly the same stage.
        if ((grayscale or black_background) and colorful_rendering
                and world_and_level is not None):
            self._colorful_env = gym_super_mario_bros.make(
                "SuperMarioBros-%d-%d-v0" % world_and_level)
            self._colorful_env = JoypadSpace(self._colorful_env,
                                             MOVEMENTS_TYPES[movement_type])

        self._stack_func = stack_func
        self._grayscale = grayscale
        self._score_weight = in_game_score_weight

        self._idle_frames_threshold = idle_frames_threshold
        self._last_score = 0
        self._last_x = 40
        self._idle_counter = 0

        self._rescale_pc = img_rescale_pc
        self._skip_frames = skip_frames
        self._obs_shape = self.reset().observation.shape
        self._num_actions = self._smb_env.action_space.n

    def reset(self):
        """ Returns the first `TimeStep` of a new episode. """
        self._smb_env.reset()
        self._last_score = 0
        self._last_x = 40
        self._idle_counter = 0
        self._actions_queue = []

        if self._colorful_env is not None:
            self._colorful_env.reset()

        return dm_env.restart(self.step(0).observation)

    def _is_idle(self, info):
        if self._idle_frames_threshold is None:
            return False

        x = info["x_pos"]
        delta_x = x - self._last_x
        self._last_x = x

        if abs(delta_x) < 1:
            self._idle_counter += 1
            return self._idle_counter > self._idle_frames_threshold

        self._idle_counter = 0
        return False

    def step(self, action) -> TimeStep:
        """ Updates the environment's state. """
        # NOTE:
        # The gym_super_mario_bros environment reuses the numpy array it
        # returns as observation. When stacking observations, this might be
        # a source of bugs (all observations in the stack might be
        # representing the same, final frame!), so always copy the arrays
        # when doing that. The observation arrays are already being copied
        # inside `self._process_img`, so no explicit copying is needed here.
        action = int(action)
        initial_img, total_reward, done, info = self._smb_env.step(action)
        self._actions_queue.append(action)
        done = done or self._is_idle(info)

        # Skipping frames:
        if self._skip_frames > 0:
            imgs = [self._process_img(initial_img)]
            skip_count = 0
            while skip_count < self._skip_frames:
                skip_count += 1
                if not done:
                    last_img, reward, done, info = self._smb_env.step(action)
                    self._actions_queue.append(action)
                    done = done or self._is_idle(info)
                    total_reward += reward
                else:
                    last_img = np.zeros_like(initial_img)

                if (self._stack_mode == "all"
                        or skip_count == self._skip_frames):
                    imgs.append(self._process_img(last_img))
            obs = self._stack_func(imgs)
        # Single frame:
        else:
            obs = self._process_img(initial_img)

        score_diff = info["score"] - self._last_score
        self._last_score = info["score"]
        total_reward = np.float64(total_reward +
                                  self._score_weight * score_diff)

        if done:
            return dm_env.termination(reward=total_reward, observation=obs)
        return dm_env.transition(reward=total_reward, observation=obs)

    def observation_spec(self):
        return dm_env.specs.BoundedArray(shape=self._obs_shape,
                                         dtype=np.float32,
                                         name="image",
                                         minimum=0,
                                         maximum=1)

    def action_spec(self):
        return dm_env.specs.DiscreteArray(dtype=np.int32,
                                          name="action",
                                          num_values=self._num_actions)

    def _process_img(self, img):
        img = np.divide(img, 255)
        img = img[50:, :, :]  # crop the HUD at the top of the screen
        if abs(self._rescale_pc - 1) > 1e-2:
            img = rescale(img, scale=self._rescale_pc, multichannel=True)
        if self._grayscale:
            img = img @ RGB2GRAY_COEFFICIENTS
        return img.astype(np.float32, copy=True)

    def render(self, mode="human", return_all_imgs=False):
        if return_all_imgs:
            assert self._colorful_env is not None and mode == "rgb_array", (
                "The option 'return_all_imgs' is valid only when using "
                "colorful rendering and rgb array mode!")

        # Regular rendering:
        if self._colorful_env is None:
            return self._smb_env.render(mode)

        # Colorful rendering: replay the queued actions in the colorful env.
        img_list = []
        for action in self._actions_queue:
            self._colorful_env.step(action)
            if return_all_imgs:
                # NOTE: make sure a copy of the returned rgb array is made!
                img_list.append(self._colorful_env.render(mode).copy())
        self._actions_queue = []
        return (img_list if return_all_imgs
                else self._colorful_env.render(mode))

    def plot_obs(self, obs):
        plt.imshow(obs, cmap="gray" if self._grayscale else None)
        plt.show()

    def close(self):
        self._smb_env.close()
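# --- Hedged usage sketch for the dm_env wrapper above (not from the
# source). A fixed world/level is passed so colorful rendering can replay
# the stage; a uniform-random policy stands in for an agent. ---
env = MarioEnvironment(world_and_level=(1, 1), movement_type="simple")
timestep = env.reset()
while not timestep.last():
    action = np.random.randint(env.action_spec().num_values)
    timestep = env.step(int(action))
env.close()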
class DQLMarioAgent(DQLAgent.DQLAgent):
    def __init__(self, action_type, batch_size, model_type, success_margin,
                 success_score, memory_size, record_video, target_model,
                 project, wrapper_type):
        super().__init__(action_type, batch_size, model_type, success_margin,
                         success_score, memory_size, record_video,
                         target_model, project)

        self.env = gym_super_mario_bros.make('SuperMarioBros-v0')
        if wrapper_type == 'COMPLEX':
            self.env = JoypadSpace(self.env, COMPLEX_MOVEMENT)  # -> 12 actions
        elif wrapper_type == 'SIMPLE':
            self.env = JoypadSpace(self.env, SIMPLE_MOVEMENT)  # -> 7 actions
        else:
            self.env = JoypadSpace(self.env, RIGHT_ONLY)  # -> 5 actions

        self.action_size = self.env.action_space.n
        self.num_states = 1
        self.state_single_size = 80
        self.state_size = (self.state_single_size, self.state_single_size)
        self.action = self.env.action_space.sample()
        self.first_last_x_pos = self.env.env.env._x_position
        self.max_distance = self.first_last_x_pos
        self.DLModel = NNModel.DLModel(env=self.env,
                                       action_size=self.action_size,
                                       state_size=self.state_single_size,
                                       states=self.num_states,
                                       model_type=model_type,
                                       output_dir=self.others_dir)

    def append_new_frame(self):
        """ Save the generated env frame """
        self.renders.append(
            img_as_ubyte(
                resize(self.env.render(mode='rgb_array'), (480, 480, 3))))

    def get_first_state(self):
        """
        :return: initial state
        """
        first_state = self.env.reset()
        return Utils.prepare_initial_state(first_state, self.state_size,
                                           channels=1)

    def get_first_x_pos(self):
        """
        :return: initial x position
        """
        return self.first_last_x_pos

    def prepare_state(self, next_state, channels=1):
        """
        Remove upper image info, reduce channels, and reduce image size
        :param channels: number of layers
        :param next_state: state to process
        :return: preprocessed image generated by env
        """
        return Utils.prepare_initial_state(next_state, self.state_size,
                                           channels=channels)

    def reset_max_distance(self):
        """ Reset episode max distance """
        self.max_distance = self.first_last_x_pos

    def update_max_distance(self, dist):
        """
        Update episode max distance
        :param dist: new distance
        :return: new max distance
        """
        if dist > self.max_distance:
            self.max_distance = dist
        return self.max_distance
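# --- Hedged construction sketch for DQLMarioAgent (argument values are
# placeholders, not from the source). ---
agent = DQLMarioAgent(action_type='discrete', batch_size=32,
                      model_type='CNN', success_margin=10,
                      success_score=3000, memory_size=10000,
                      record_video=False, target_model=True,
                      project='mario', wrapper_type='SIMPLE')  # 7 actions
state = agent.get_first_state()
print("start x position:", agent.get_first_x_pos())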