class Worker(threading.Thread):
    global_episode = 0
    global_moving_average_reward = 0
    best_score = 0
    save_lock = threading.Lock()

    def __init__(self, state_size, action_size, global_model, opt, result_queue,
                 idx, game_name='Tetris', save_dir='/tmp'):
        super(Worker, self).__init__()
        self.state_size = state_size
        self.action_size = action_size
        self.result_queue = result_queue
        self.global_model = global_model
        self.opt = opt
        self.local_model = ActorCriticModel(self.state_size, self.action_size)
        self.worker_idx = idx
        self.env = gym_tetris.make('TetrisA-v0')
        self.env = JoypadSpace(self.env, MOVEMENT)
        self.save_dir = save_dir
        self.ep_loss = 0.0
        self.game_name = 'Tetris'

    def run(self):
        total_step = 1
        mem = Memory()
        while Worker.global_episode < episodios:
            self.env.reset()
            estado = [0., 0., 0., 0.]
            mem.clear()
            ep_reward = 0.
            ep_steps = 0
            self.ep_loss = 0
            informacion = self.env.get_info()
            antiguo_statistics = informacion['statistics']
            time_count = 0
            done = False
            pieza_colocada = True
            while not done:
                # If the previous piece has landed, pick a target position and
                # rotation for the next piece from the policy.
                if pieza_colocada:
                    pieza_colocada = False
                    pos = 5
                    giro = 1
                    u = -1
                    ant_nom_piez = ''
                    estado = [estado]
                    logits, _ = self.local_model(
                        tf.convert_to_tensor(estado, dtype=tf.float32))
                    probs = tf.nn.softmax(logits)
                    # Renormalize so the probabilities sum exactly to 1
                    # before sampling.
                    prob = probs[0][39]
                    probs = np.delete(probs[0], 39)
                    suma = np.sum(probs)
                    probs = np.insert(probs, 39, abs(1 - suma))
                    action = np.random.choice(self.action_size, p=probs)
                    pos_objetivo = action % 10
                    giro_objetivo = (action // 10) + 1
                # Move the piece to the computed target, rotating and
                # shifting it one step at a time.
                if (giro % giro_objetivo) != 0 and not done:
                    state, reward, done, info = self.env.step(1)
                    accion = 0
                    giro = giro + 1
                elif pos > pos_objetivo and not done:
                    state, reward, done, info = self.env.step(6)
                    pos = pos - 1
                    accion = 0
                elif pos < pos_objetivo and not done:
                    state, reward, done, info = self.env.step(3)
                    pos = pos + 1
                    accion = 0
                elif not done and not pieza_colocada:
                    state, reward, done, info = self.env.step(9)
                    accion = 9
                else:
                    accion = 0
                if not done:
                    new_state, reward, done, info = self.env.step(accion)
                informacion = self.env.get_info()
                # If the piece has landed, compute the reward for the move.
                if antiguo_statistics != informacion['statistics']:
                    antiguo_statistics = informacion['statistics']
                    ep_reward_new = informacion['score']
                    reward = ep_reward_new - ep_reward
                    board = self.env.board()
                    nuevo_estado = board_prop(board)[:]
                    pieza_colocada = True
                    k = 1
                    if nuevo_estado[0] > 18:
                        done = True
                    ep_reward = ep_reward_new
                    mem.store(estado[0], action, reward)
                    # Compute local gradients from the loss of the current
                    # game and apply them to the global model.
                    if time_count == 10 or done:
                        with tf.GradientTape() as tape:
                            total_loss = self.compute_loss(
                                done, nuevo_estado, mem, 0.99)
                        self.ep_loss += total_loss
                        grads = tape.gradient(
                            total_loss, self.local_model.trainable_weights)
                        self.opt.apply_gradients(
                            zip(grads, self.global_model.trainable_weights))
                        self.local_model.set_weights(
                            self.global_model.get_weights())
                        mem.clear()
                        time_count = 0
                        if done:
                            Worker.global_moving_average_reward = record(
                                Worker.global_episode, ep_reward,
                                self.worker_idx,
                                Worker.global_moving_average_reward,
                                self.result_queue, self.ep_loss, ep_steps)
                            if ep_reward > Worker.best_score:
                                with Worker.save_lock:
                                    self.global_model.save_weights(
                                        os.path.join(
                                            self.save_dir,
                                            'model_{}.h5'.format(self.game_name)))
                                    Worker.best_score = ep_reward
                            Worker.global_episode += 1
                    ep_steps += 1
                    time_count += 1
                    estado = nuevo_estado
                total_step += 1
        self.result_queue.put(None)

    # Compute the loss.
    def compute_loss(self, done, nuevo_estado, memory, gamma=0.99):
        if done:
            reward_sum = 0.  # terminal
        else:
            nuevo_estado = [nuevo_estado]
            reward_sum = self.local_model(
                tf.convert_to_tensor(nuevo_estado,
                                     dtype=tf.float32))[-1].numpy()[0]
        # Discounted returns, bootstrapped from the critic's value estimate.
        discounted_rewards = []
        for reward in memory.rewards[::-1]:
            reward_sum = reward + gamma * reward_sum
            discounted_rewards.append(reward_sum)
        discounted_rewards.reverse()
        logits, values = self.local_model(
            tf.convert_to_tensor(np.vstack(memory.states), dtype=tf.float32))
        advantage = tf.convert_to_tensor(
            np.array(discounted_rewards)[:, None], dtype=tf.float32) - values
        value_loss = advantage ** 2
        policy = tf.nn.softmax(logits)
        entropy = tf.nn.softmax_cross_entropy_with_logits_v2(
            labels=policy, logits=logits)
        policy_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=memory.actions, logits=logits)
        policy_loss *= tf.stop_gradient(advantage)
        policy_loss -= 0.01 * entropy
        total_loss = tf.reduce_mean(0.5 * value_loss + policy_loss)
        return total_loss
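A minimal sketch of how these Worker threads are typically launched in an A3C-style setup; the `episodios` global, the optimizer choice, and the sizes below are illustrative assumptions, not part of the original code.

import multiprocessing
from queue import Queue

import numpy as np
import tensorflow as tf

def launch_workers(state_size=4, action_size=40):
    # Assumed episode budget read as a global by Worker.run(); hypothetical value.
    global episodios
    episodios = 1000
    global_model = ActorCriticModel(state_size, action_size)
    # Run one forward pass so the weights exist before workers copy them.
    global_model(tf.convert_to_tensor(np.zeros((1, state_size)), dtype=tf.float32))
    opt = tf.keras.optimizers.Adam(learning_rate=1e-4)
    result_queue = Queue()
    workers = [Worker(state_size, action_size, global_model, opt, result_queue, i)
               for i in range(multiprocessing.cpu_count())]
    for w in workers:
        w.start()
    rewards = []
    while True:
        r = result_queue.get()
        if r is None:  # a worker signals completion by putting None
            break
        rewards.append(r)
    for w in workers:
        w.join()
    return rewards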
class Env(object):
    def __init__(self, game, **kwargs):
        self.act_space = kwargs.get("act_space")
        self.state_size = kwargs.get("state_size")
        self.burn_in = kwargs.get("burn_in")
        self.seqlen = kwargs.get("seqlen")
        self.n_step = kwargs.get("n_step")
        self.frames = kwargs.get("frames")
        self.replay = kwargs.get("replay")
        self.use_epsilon_greedy = kwargs.get("use_epsilon_greedy")
        self.game = game
        self.count = 0
        env = gym_super_mario_bros.make(game)
        if self.act_space == 7:
            self.env = JoypadSpace(env, SIMPLE_MOVEMENT)
        elif self.act_space == 12:
            self.env = JoypadSpace(env, COMPLEX_MOVEMENT)
        self.max_pos = -10000
        self.done = True
        self.reset()

    def step(self, a, a_logits, v_cur, state_in):
        self.count += 1
        if self.use_epsilon_greedy:
            a = np.argmax(a_logits)
            a_logits = self.epsilon / self.act_space * np.ones(self.act_space)
            a_logits[a] += (1 - self.epsilon)
            a_logits = np.log(a_logits)
            if random.random() < self.epsilon:
                a = random.randint(0, self.act_space - 1)
        self.a_t = a
        gs_t1, gr_t, gdone, ginfo = self.env.step(self.a_t)
        if not gdone:
            s_t1, r_t, done, info = self.env.step(self.a_t)
            r_t += gr_t
            r_t /= 2.
        else:
            s_t1 = gs_t1
            r_t = gr_t
            done = gdone
            info = ginfo
        r_t /= 15.0
        s_t1 = self.resize_image(s_t1)
        channels = s_t1.shape[-1]
        self.s_t = np.concatenate([s_t1, self.s_t[:, :, :-channels]], axis=-1)
        self.s.append(self.s_t)
        self.a.append(self.a_t)
        self.a_logits.append(a_logits)
        self.r.append(r_t)
        self.v_cur.append(v_cur)
        self.max_pos = max(self.max_pos, info["x_pos"])
        self.pos.append(info["x_pos"])
        # End the episode if Mario has barely moved over the last 100 frames.
        if (len(self.pos) > 100) and (info["x_pos"] - self.pos[-100] < 5) and \
                (self.pos[-100] - info["x_pos"] < 5):
            done = True
        self.done = done
        if self.done:
            self.mask.append(0)
        else:
            self.mask.append(1)
        self.state_in.append(state_in)
        """ get segs """
        segs = self.get_history()
        self.reset()
        return segs

    def reset(self):
        if self.done:
            print(self.game, self.max_pos)
            self.count = 0
            self.epsilon = 0.4 ** random.uniform(1, 8)
            s_t = self.resize_image(self.env.reset())
            self.s_t = np.tile(s_t, [1, 1, self.frames])
            self.s = [np.zeros_like(self.s_t)
                      for i in range(self.burn_in)] + [self.s_t]
            self.a_t = random.randint(0, self.act_space - 1)
            self.a = [random.randint(0, self.act_space - 1)
                      for i in range(self.burn_in)] + [self.a_t]
            self.a_logits = [np.zeros(self.act_space)
                             for i in range(self.burn_in)]
            self.r = [0] * self.burn_in + [0]
            self.v_cur = [0] * self.burn_in
            self.mask = [1] * self.burn_in + [1]
            self.max_pos = -10000
            self.pos = []
            state_in = np.zeros(self.state_size, dtype=np.float32)
            self.state_in = [state_in] * self.burn_in + [state_in]
            self.done = False

    def get_state(self):
        return self.s_t

    def get_act(self):
        return self.a_t

    def get_reward(self):
        return self.r[-1]

    def get_max_pos(self):
        return self.max_pos

    def get_state_in(self):
        return self.state_in[-1]

    def get_history(self):
        segs = []
        t = self.burn_in // self.replay
        if self.done:
            for i in range(self.replay):
                seg = Seg(self.s[i * t:], self.a[i * t:],
                          self.a_logits[i * t:], self.r[i * t:],
                          self.v_cur[i * t:], self.state_in[i * t:],
                          self.mask[i * t:])
                segs += self.postprocess(seg)
        elif len(self.s) >= self.burn_in + self.seqlen + self.n_step:
            cut = self.burn_in + self.seqlen + self.n_step
            seg = Seg(self.s[:cut], self.a[:cut], self.a_logits[:cut],
                      self.r[:cut], self.v_cur[:cut], self.state_in[:cut],
                      self.mask[:cut])
            self.s = self.s[t:]
            self.a = self.a[t:]
            self.a_logits = self.a_logits[t:]
            self.r = self.r[t:]
            self.v_cur = self.v_cur[t:]
            self.state_in = self.state_in[t:]
            self.mask = self.mask[t:]
            return [self.postprocess_one_seg(seg)]
        return segs

    def postprocess_one_seg(self, seg):
        seqlen = self.seqlen + self.burn_in + self.n_step
        next_seg = dict()
        next_seg["s"] = padding(seg.s[:seqlen], seqlen, np.uint8)
        next_seg["a"] = padding(seg.a[:seqlen], seqlen, np.int32)
        next_seg["a_logits"] = padding(seg.a_logits[:seqlen], seqlen, np.float32)
        next_seg["r"] = padding(seg.r[:seqlen], seqlen, np.float32)
        next_seg["v_cur"] = padding(seg.v_cur[:seqlen], seqlen, np.float32)
        next_seg["state_in"] = np.array(seg.state_in[0], np.float32)
        next_seg["mask"] = padding(seg.mask[:seqlen], seqlen, np.int32)
        return next_seg

    def postprocess(self, seg):
        """
        Postprocess the seg for training.
        :author lhw
        """
        burn_in = self.burn_in
        seg_results = []
        if seg is not None:
            while len(seg[0]) > burn_in + self.n_step:
                next_seg = self.postprocess_one_seg(seg)
                seg_results.append(next_seg)
                seg = Seg(*[t[burn_in:] for t in seg])
        return seg_results

    @staticmethod
    def resize_image(image, size=84):
        image = Image.fromarray(image)
        image = image.convert("L")
        image = image.resize((size, size))
        image = np.array(image, np.uint8)
        return image[:, :, None]
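`padding` is called above but not defined in this excerpt; a plausible minimal reconstruction, assuming it stacks the per-step values and zero-pads them to `seqlen` entries:

import numpy as np

def padding(input_list, seqlen, dtype):
    """Hypothetical reconstruction of the padding helper: stack a list of
    per-step values and zero-pad the result along the time axis so every
    segment has exactly `seqlen` entries."""
    arr = np.asarray(input_list, dtype=dtype)
    if arr.shape[0] < seqlen:
        pad_width = [(0, seqlen - arr.shape[0])] + [(0, 0)] * (arr.ndim - 1)
        arr = np.pad(arr, pad_width, mode="constant")
    return arr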
class MarioEnvironment(dm_env.Environment):
    def __init__(
        self,
        skip_frames: int = 3,
        img_rescale_pc: float = 0.4,
        stack_func: Optional[Callable[[List[np.ndarray]], np.ndarray]] = np.hstack,
        stack_mode: str = "all",
        grayscale: bool = True,
        black_background: bool = True,
        in_game_score_weight: float = 0.01,
        movement_type: str = "simple",
        world_and_level: Optional[Tuple[int, int]] = None,
        idle_frames_threshold: Optional[int] = 1250,
        colorful_rendering: bool = True,
    ) -> None:
        assert stack_mode in ("first_and_last", "all")
        self._stack_mode = stack_mode
        env_name = ("SuperMarioBros" if world_and_level is None
                    else "SuperMarioBros-%d-%d" % world_and_level)
        env_name += f"-v{int(black_background)}"
        self._smb_env = gym_super_mario_bros.make(env_name)
        self._smb_env = JoypadSpace(self._smb_env,
                                    MOVEMENTS_TYPES[movement_type])
        self._actions_queue = []
        self._colorful_env = None
        # Guard against world_and_level=None, which has no per-level
        # colorful variant name.
        if (grayscale or black_background) and colorful_rendering \
                and world_and_level is not None:
            self._colorful_env = gym_super_mario_bros.make(
                "SuperMarioBros-%d-%d-v0" % world_and_level)
            self._colorful_env = JoypadSpace(self._colorful_env,
                                             MOVEMENTS_TYPES[movement_type])
        self._stack_func = stack_func
        self._grayscale = grayscale
        self._score_weight = in_game_score_weight
        self._idle_frames_threshold = idle_frames_threshold
        self._last_score = 0
        self._last_x = 40
        self._idle_counter = 0
        self._rescale_pc = img_rescale_pc
        self._skip_frames = skip_frames
        self._obs_shape = self.reset().observation.shape
        self._num_actions = self._smb_env.action_space.n

    def reset(self):
        """ Returns the first `TimeStep` of a new episode. """
        self._smb_env.reset()
        self._last_score = 0
        self._last_x = 40
        self._idle_counter = 0
        self._actions_queue = []
        if self._colorful_env is not None:
            self._colorful_env.reset()
        return dm_env.restart(self.step(0).observation)

    def _is_idle(self, info):
        if self._idle_frames_threshold is None:
            return False
        x = info["x_pos"]
        delta_x = x - self._last_x
        self._last_x = x
        if abs(delta_x) < 1:
            self._idle_counter += 1
            return self._idle_counter > self._idle_frames_threshold
        self._idle_counter = 0
        return False

    def step(self, action) -> TimeStep:
        """ Updates the environment's state. """
        # NOTE:
        # The gym_super_mario_bros environment reuses the numpy array it
        # returns as observation. When stacking observations, this might be
        # a source of bugs (all observations in the stack might be
        # representing the same, final frame!), so always copy the arrays
        # when doing that. The observation arrays are already being copied
        # inside `self._process_img`, so no explicit copying is needed here.
        action = int(action)
        initial_img, total_reward, done, info = self._smb_env.step(action)
        self._actions_queue.append(action)
        done = done or self._is_idle(info)
        # Skipping frames:
        if self._skip_frames > 0:
            imgs = [self._process_img(initial_img)]
            skip_count = 0
            while skip_count < self._skip_frames:
                skip_count += 1
                if not done:
                    last_img, reward, done, info = self._smb_env.step(action)
                    self._actions_queue.append(action)
                    done = done or self._is_idle(info)
                    total_reward += reward
                else:
                    last_img = np.zeros_like(initial_img)
                if self._stack_mode == "all" or skip_count == self._skip_frames:
                    imgs.append(self._process_img(last_img))
            obs = self._stack_func(imgs)
        # Single frame:
        else:
            obs = self._process_img(initial_img)
        score_diff = info["score"] - self._last_score
        self._last_score = info["score"]
        total_reward = np.float64(total_reward + self._score_weight * score_diff)
        if done:
            return dm_env.termination(reward=total_reward, observation=obs)
        return dm_env.transition(reward=total_reward, observation=obs)

    def observation_spec(self):
        return dm_env.specs.BoundedArray(shape=self._obs_shape,
                                         dtype=np.float32,
                                         name="image",
                                         minimum=0,
                                         maximum=1)

    def action_spec(self):
        return dm_env.specs.DiscreteArray(dtype=np.int32,
                                          name="action",
                                          num_values=self._num_actions)

    def _process_img(self, img):
        img = np.divide(img, 255)
        img = img[50:, :, :]
        if abs(self._rescale_pc - 1) > 1e-2:
            img = rescale(img, scale=self._rescale_pc, multichannel=True)
        if self._grayscale:
            img = img @ RGB2GRAY_COEFFICIENTS
        return img.astype(np.float32, copy=True)

    def render(self, mode="human", return_all_imgs=False):
        if return_all_imgs:
            assert self._colorful_env is not None and mode == "rgb_array", (
                "The option 'return_all_imgs' is valid only when using "
                "colorful rendering and rgb array mode!")
        # Regular rendering:
        if self._colorful_env is None:
            return self._smb_env.render(mode)
        # Colorful rendering:
        img_list = []
        for action in self._actions_queue:
            self._colorful_env.step(action)
            if return_all_imgs:
                # NOTE: make sure a copy of the returned rgb array is made!
                img_list.append(self._colorful_env.render(mode).copy())
        self._actions_queue = []
        return img_list if return_all_imgs else self._colorful_env.render(mode)

    def plot_obs(self, obs):
        plt.imshow(obs, cmap="gray" if self._grayscale else None)
        plt.show()

    def close(self):
        self._smb_env.close()
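For context, a minimal random-agent loop against this environment under the standard `dm_env` TimeStep protocol; the level choice and episode count are illustrative:

import numpy as np

def run_random_episodes(num_episodes=2):
    """Illustrative driver: steps MarioEnvironment with random actions."""
    env = MarioEnvironment(world_and_level=(1, 1))
    spec = env.action_spec()
    for _ in range(num_episodes):
        timestep = env.reset()
        episode_return = 0.0
        while not timestep.last():
            action = np.random.randint(spec.num_values)
            timestep = env.step(action)
            episode_return += timestep.reward
        print("episode return:", episode_return)
    env.close()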
class Env(object):
    def __init__(self, act_space, act_repeats, frames, epsilon, game):
        self.act_space = act_space
        self.act_repeats = act_repeats
        self.act_repeat = random.choice(self.act_repeats)
        self.epsilon = epsilon
        self.frames = frames
        self.max_pos = -10000
        self.count = 0
        env = gym_super_mario_bros.make(game)
        if act_space == 7:
            self.env = JoypadSpace(env, SIMPLE_MOVEMENT)
        elif act_space == 12:
            self.env = JoypadSpace(env, COMPLEX_MOVEMENT)
        s_t = self.resize_image(self.env.reset())
        self.s_t = np.tile(s_t, [1, 1, frames])
        self.s = [self.s_t]
        self.a_t = random.randint(0, act_space - 1)
        self.a = [self.a_t]
        self.r = []
        self.pos = []
        c_in = np.zeros(256, dtype=np.float32)
        h_in = np.zeros(256, dtype=np.float32)
        state_in = np.concatenate([c_in, h_in], axis=-1)
        self.state_in = [state_in]
        self.done = False

    def step(self, a, state_in):
        self.count += 1
        if random.random() < self.epsilon:
            a = random.randint(0, self.act_space - 1)
        if self.count % self.act_repeat == 0:
            self.a_t = a
            self.count = 0
            self.act_repeat = random.choice(self.act_repeats)
        gs_t1, gr_t, gdone, ginfo = self.env.step(self.a_t)
        if not gdone:
            s_t1, r_t, done, info = self.env.step(self.a_t)
            r_t += gr_t
            r_t /= 2.
        else:
            s_t1 = gs_t1
            r_t = gr_t
            done = gdone
            info = ginfo
        r_t /= 15.
        s_t1 = self.resize_image(s_t1)
        channels = s_t1.shape[-1]
        self.s_t = np.concatenate([s_t1, self.s_t[:, :, :-channels]], axis=-1)
        self.s.append(self.s_t)
        self.a.append(self.a_t)
        self.r.append(r_t)
        self.max_pos = max(self.max_pos, info["x_pos"])
        self.pos.append(info["x_pos"])
        # End the episode if Mario has barely moved over the last 500 frames.
        if (len(self.pos) > 500) and (info["x_pos"] - self.pos[-500] < 5) and \
                (self.pos[-500] - info["x_pos"] < 5):
            done = True
        self.done = done
        self.state_in.append(state_in)

    def reset(self, force=False):
        if self.done or force:
            self.count = 0
            self.act_repeat = random.choice(self.act_repeats)
            s_t = self.resize_image(self.env.reset())
            self.s_t = np.tile(s_t, [1, 1, self.frames])
            self.s = [self.s_t]
            self.a_t = random.randint(0, self.act_space - 1)
            self.a = [self.a_t]
            self.r = []
            self.pos = []
            c_in = np.zeros(256, dtype=np.float32)
            h_in = np.zeros(256, dtype=np.float32)
            state_in = np.concatenate([c_in, h_in], axis=-1)
            self.state_in = [state_in]
            self.done = False

    def get_state(self):
        return self.s_t

    def get_act(self):
        return self.a_t

    def get_max_pos(self):
        return self.max_pos

    def reset_max_pos(self):
        self.max_pos = -10000

    def get_state_in(self):
        return self.state_in[-1]

    def get_history(self, force=False):
        if self.done or force:
            seg = Seg(self.s, self.a, self.r, self.state_in)
            return seg
        return None

    @staticmethod
    def resize_image(image, size=84):
        image = Image.fromarray(image)
        image = image.convert("L")
        image = image.resize((size, size))
        image = np.array(image)
        image = image / 255.
        image = np.array(image, np.float32)
        return image[:, :, None]
In Mario, the environment consists of tubes, mushrooms and other
components. When Mario makes an action, the environment responds with the
changed (next) state, reward and other info.
"""

# Initialize Super Mario environment
env = gym_super_mario_bros.make("SuperMarioBros-1-1-v0")

# Limit the action-space to
#   0. walk right
#   1. jump right
env = JoypadSpace(env, [["right"], ["right", "A"]])

env.reset()
next_state, reward, done, info = env.step(action=0)
print(f"{next_state.shape},\n {reward},\n {done},\n {info}")

"""Preprocess Environment
------------------------

Environment data is returned to the agent in ``next_state``. As you saw
above, each state is represented by a ``[3, 240, 256]`` size array. Often
that is more information than our agent needs; for instance, Mario's
actions do not depend on the color of the pipes or the sky!

We use **Wrappers** to preprocess environment data before sending it to
the agent.

``GrayScaleObservation`` is a common wrapper to transform an RGB image to
grayscale; doing so reduces the size of the state representation
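As a concrete sketch of such a wrapper (the tutorial's exact class may differ; `gym.wrappers.GrayScaleObservation` is one readily available implementation):

from gym.wrappers import GrayScaleObservation

# Wrap the environment so observations become single-channel grayscale
# frames; keep_dim=False drops the RGB channel axis entirely, shrinking
# each state to a single 240x256 array.
env = GrayScaleObservation(env, keep_dim=False)

obs = env.reset()
print(obs.shape)  # (240, 256)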
    'ac_kwargs': dict(hidden_sizes=[64] * 2),
    'device': 'cpu'
}
policy = PPO(**kwargs)
policy.ac.load_state_dict(torch.load(model_name, map_location='cpu'))
obs_normal = joblib.load(save_name)['obs_normal']
if obs_normal is not None:
    obs_normal.cpu = 1

# This command slows a video down (setpts > 1.0) or speeds it up (setpts < 1.0):
#   ffmpeg -r 60 -i input.mp4 -filter:v "setpts=2.0*PTS" output.mp4
# This command adds an audio track to a video:
#   ffmpeg -i video.mp4 -i audio.mp3 -map 0:v -map 1:a -codec copy -shortest out.mp4
# Source: https://stackoverflow.com/questions/20254846/how-to-add-an-external-audio-track-to-a-video-file-using-vlc-or-ffmpeg-command-l
# This command is for envs that need a display to render (like CartPole-v1):
#   xvfb-run -s "-screen 0 640x480x24" python test_model.py

for ep in range(10):
    obs = env.reset()
    if obs_normal is not None:
        obs = obs_normal.normalize_all(obs, update=False)
    while True:
        act = policy.act(obs)
        nx_obs, rew, done, info = env.step(act)
        obs = nx_obs
        if obs_normal is not None:
            obs = obs_normal.normalize_all(obs, update=False)
        if done:
            break
class MoMarioEnv(Process):
    def __init__(self, args, env_idx, child_conn, history_size=4, h=84, w=84):
        super(MoMarioEnv, self).__init__()
        self.daemon = True
        self.env = JoypadSpace(gym_super_mario_bros.make(args.env_id),
                               SIMPLE_MOVEMENT)
        self.is_render = args.render
        self.env_idx = env_idx
        self.steps = 0
        self.episode = 0
        self.rall = 0
        self.coin = 0
        self.x_pos = 0
        self.time = 0
        self.score = 0
        self.n_mo = 5
        self.morall = np.zeros(self.n_mo)
        self.recent_rlist = deque(maxlen=100)
        self.recent_morlist = deque(maxlen=100)
        self.child_conn = child_conn
        self.life_done = args.life_done
        self.single_stage = args.single_stage
        self.stage_bonus = 0
        self.history_size = history_size
        self.history = np.zeros([history_size, h, w])
        self.h = h
        self.w = w
        self.reset()

    def run(self):
        super(MoMarioEnv, self).run()
        while True:
            action = self.child_conn.recv()
            if self.is_render:
                self.env.render()
            obs, reward, done, info = self.env.step(action)
            if self.single_stage and info["flag_get"]:
                self.stage_bonus = 10000
                done = True

            # Construct the multi-objective reward:
            # [x_pos, time, death, coin, enemy]
            moreward = []
            # 1. x position
            xpos_r = info["x_pos"] - self.x_pos
            self.x_pos = info["x_pos"]
            # Resolve an issue where the x position resets after death.
            if xpos_r < -5:
                xpos_r = 0
            moreward.append(xpos_r)
            # 2. time penalty
            time_r = info["time"] - self.time
            self.time = info["time"]
            # Time is always decreasing.
            if time_r > 0:
                time_r = 0
            moreward.append(time_r)
            # 3. death
            if self.lives > info['life']:
                death_r = -25
            else:
                death_r = 0
            moreward.append(death_r)
            # 4. coin
            coin_r = (info['coins'] - self.coin) * 100
            self.coin = info['coins']
            moreward.append(coin_r)
            # 5. enemy
            enemy_r = info['score'] - self.score
            if coin_r > 0 or done:
                enemy_r = 0
            self.score = info['score']
            moreward.append(enemy_r)

            if self.life_done:
                # When Mario loses a life, treat it as a terminal state.
                if self.lives > info['life'] and info['life'] > 0:
                    force_done = True
                    self.lives = info['life']
                else:
                    force_done = done
                    self.lives = info['life']
            else:
                # Normal terminal state.
                force_done = done

            # Reward range is -15 ~ 15.
            r = reward / 15
            self.rall += reward
            self.morall += np.array(moreward)
            mor = np.array(moreward) * self.n_mo / 15

            self.history[:3, :, :] = self.history[1:, :, :]
            self.history[3, :, :] = self.pre_proc(obs)
            self.steps += 1
            score = info['score'] + self.stage_bonus

            if done:
                self.recent_rlist.append(self.rall)
                self.recent_morlist.append(self.morall)
                print("[Episode {}({})]\tStep: {}\tScore: {}\tMoReward: {}\t"
                      "Recent MoReward: {}\tcoin: {}\tcurrent x:{}".format(
                          self.episode, self.env_idx, self.steps, score,
                          self.morall, np.mean(self.recent_morlist, axis=0),
                          info['coins'], info['x_pos']))
                self.history = self.reset()

            self.child_conn.send(
                [self.history[:, :, :], r, force_done, done, mor, score])

    def reset(self):
        self.steps = 0
        self.episode += 1
        self.rall = 0
        self.lives = 3
        self.coin = 0
        self.x_pos = 0
        self.time = 0
        self.score = 0
        self.stage_bonus = 0
        self.morall = np.zeros(self.n_mo)
        self.get_init_state(self.env.reset())
        return self.history[:, :, :]

    def pre_proc(self, X):
        # Grayscale, resize, and rescale to [0, 1].
        x = cv2.cvtColor(X, cv2.COLOR_RGB2GRAY)
        x = cv2.resize(x, (self.h, self.w))
        x = np.float32(x) * (1.0 / 255.0)
        return x

    def get_init_state(self, s):
        for i in range(self.history_size):
            self.history[i, :, :] = self.pre_proc(s)
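A sketch of how a `Process`-based environment like this is typically driven from the trainer over a `Pipe`; the `args` namespace and the fixed action are illustrative assumptions:

from multiprocessing import Pipe

# Hypothetical driver: `args` must provide env_id, render, life_done and
# single_stage, matching MoMarioEnv's constructor above.
parent_conn, child_conn = Pipe()
env_proc = MoMarioEnv(args, env_idx=0, child_conn=child_conn)
env_proc.start()
for _ in range(100):
    parent_conn.send(0)  # action index chosen by the agent
    history, r, force_done, done, mor, score = parent_conn.recv()
env_proc.terminate()  # run() loops forever, so stop the worker explicitly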
    def run(self):
        global episode
        env = gym_super_mario_bros.make('SuperMarioBros-v0')
        env = JoypadSpace(env, SIMPLE_MOVEMENT)
        # env = gym.make(env_name)
        # env.render()
        step = 0
        gc.collect()
        while episode < EPISODES:
            done = False
            dead = False
            score, start_life = 0, 5
            observe = env.reset()
            next_observe = observe
            # Stay idle for a random number (1-30) of initial frames.
            for _ in range(random.randint(1, 30)):
                observe = next_observe
                next_observe, _, _, _ = env.step(1)
            state = pre_processing(next_observe, observe)
            history = np.stack((state, state, state, state), axis=2)
            history = np.reshape([history], (1, 240, 256, 4))
            coinStatus = 0
            marioStatus = "small"
            flagStatus = False
            softReward = 0
            lifeStatus = 2
            while not done:
                step += 1
                self.t += 1
                observe = next_observe
                action, policy = self.get_action(history)
                # # 1: stay, 2: left, 3: right
                # if action == 0:
                #     real_action = 1
                # elif action == 1:
                #     real_action = 2
                # else:
                #     real_action = 3
                #
                # # Fire action to restart after dying.
                # if dead:
                #     action = 0
                #     real_action = 1
                #     dead = False

                # Take one step with the selected action.
                next_observe, reward, done, info = env.step(action)
                # Preprocess the state at every timestep.
                next_state = pre_processing(next_observe, observe)
                next_state = np.reshape([next_state], (1, 240, 256, 1))
                next_history = np.append(next_state, history[:, :, :, :3], axis=3)
                # Maximum of the policy.
                self.avg_p_max += np.amax(
                    self.actor.predict(np.float32(history / 255.)))
                real_reward = reward
                if start_life > info['life']:
                    dead = True
                    start_life = info['life']
                # Commented-out reward-shaping experiment:
                # if coinStatus != info["coins"]:
                #     coinStatus = info["coins"]
                #     reward = reward + 10
                # if marioStatus != info["status"]:
                #     marioStatus = info["status"]
                #     reward = reward + 200
                # if flagStatus != info["flag_get"]:
                #     flagStatus = info["flag_get"]
                #     reward = reward + 200
                # if lifeStatus != info["life"]:
                #     lifeStatus = info["life"]
                #     reward = reward - 200
                #
                # if info["x_pos"] < 10:
                #     info["x_pos"] = 10
                # if info["time"] < 10:
                #     info["time"] = 10
                #
                # reward = reward + ((info["x_pos"] / info["time"]) + info["x_pos"]) / 100
                score += real_reward
                # reward = np.clip(reward, -1., 1.)

                # Store the sample.
                self.append_sample(history, action, reward)
                gc.collect()
                if dead:
                    history = np.stack(
                        (next_state, next_state, next_state, next_state), axis=2)
                    history = np.reshape([history], (1, 240, 256, 4))
                else:
                    history = next_history
                # Train when the episode ends or the maximum number of
                # timesteps is reached.
                if self.t >= self.t_max or done:
                    self.train_model(done)
                    self.update_local_model()
                    self.t = 0
                if done:
                    # Log training info for each episode.
                    episode += 1
                    ep_res = "episode: {}, score: {}, step: {}".format(
                        episode, score, step)
                    print(ep_res)
                    if episode % 20 == 0:
                        slack_msg(ep_res)
                    # stats = [score, self.avg_p_max / float(step), step]
                    # for i in range(len(stats)):
                    #     self.sess.run(self.update_ops[i], feed_dict={
                    #         self.summary_placeholders[i]: float(stats[i])})
                    # summary_str = self.sess.run(self.summary_op)
                    # self.summary_writer.add_summary(summary_str, episode + 1)
                    self.avg_p_max = 0
                    self.avg_loss = 0
                    step = 0
class ICMTrainer:
    """
    Compose encoder, forward/inverse, and q_model into a single trainer entity.
    """

    def __init__(self):
        self.env = gym_super_mario_bros.make('SuperMarioBros-v0')
        self.env = JoypadSpace(self.env, COMPLEX_MOVEMENT)
        self.replay = SMBExperienceReplay(buffer_size=BUFFER_SIZE,
                                          batch_size=BATCH_SIZE)
        self.q_model = Qnetwork()
        self.encoder = Phi()
        self.forward_model = Fnet()
        self.inverse_model = Gnet()
        all_model_params = list(self.q_model.parameters()) + \
            list(self.encoder.parameters())
        all_model_params += list(self.forward_model.parameters()) + \
            list(self.inverse_model.parameters())
        self.opt = optim.Adam(lr=0.001, params=all_model_params)

    @staticmethod
    def combined_loss(q_loss, inverse_loss, forward_loss):
        """
        Overall loss fn: lambda*Qloss + (1-beta)*inverse_loss + beta*forward_loss.
        """
        loss_ = (1 - BETA) * inverse_loss
        loss_ += BETA * forward_loss
        loss_ = loss_.sum() / loss_.flatten().shape[0]
        loss = loss_ + LAMBDA * q_loss
        return loss

    def icm_loss(self, state1, action, state2, forward_scale=1.,
                 inverse_scale=1e4):
        """ Calculate forward- and inverse-model losses for ICM. """
        fwd_loss_fn = nn.MSELoss(reduction='none')
        inverse_loss_fn = nn.CrossEntropyLoss(reduction='none')
        # Encode input states.
        state1_hat = self.encoder(state1)
        state2_hat = self.encoder(state2)
        # Detach because we don't want to back-prop into the encoder here.
        state2_hat_pred = self.forward_model(state1_hat.detach(),
                                             action.detach())
        forward_pred_err = fwd_loss_fn(
            state2_hat_pred, state2_hat.detach()).sum(dim=1).unsqueeze(dim=1)
        forward_pred_err *= forward_scale
        pred_action = self.inverse_model(state1_hat, state2_hat)
        inverse_pred_err = inverse_loss_fn(
            pred_action, action.detach().flatten()).unsqueeze(dim=1)
        inverse_pred_err *= inverse_scale
        return forward_pred_err, inverse_pred_err

    def batch_forward_pass(self, use_extrinsic=True):
        """ Single forward pass that generates forward err, inverse err
        and q_loss. """
        # pylint: disable=E1101
        state1_batch, action_batch, reward_batch, state2_batch = \
            self.replay.get_batch()
        # Reshape action/reward batches to be compatible with the models.
        action_batch = action_batch.view(action_batch.shape[0], 1)
        reward_batch = reward_batch.view(reward_batch.shape[0], 1)
        # Run ICM.
        forward_pred_err, inverse_pred_err = self.icm_loss(
            state1_batch, action_batch, state2_batch)
        # Scale the forward prediction error by the Eta parameter.
        i_reward = (1. / ETA) * forward_pred_err
        reward = i_reward.detach()
        if use_extrinsic:
            # Whether to include explicit rewards in training.
            reward += reward_batch
        # Discount expected values for the next state.
        qvals = self.q_model(state2_batch)
        reward += GAMMA * torch.max(qvals)
        reward_pred = self.q_model(state1_batch)
        reward_target = reward_pred.clone()
        # Convert the action batch (integers) to gather indices.
        indices = torch.stack(
            (torch.arange(action_batch.shape[0]), action_batch.squeeze()),
            dim=0)
        indices = indices.tolist()
        reward_target[indices] = reward.squeeze()
        q_loss = 1e5 * nn.MSELoss()(F.normalize(reward_pred),
                                    F.normalize(reward_target.detach()))
        return forward_pred_err, inverse_pred_err, q_loss

    def repeat_action(self, action):
        """ Given an action, repeat it the specified number of times and
        return the combined state and rewards. """
        state_deque = deque(maxlen=FRAMES_PER_STATE)
        sum_rewards = 0
        for _ in range(ACTION_REPEATS):
            state2, e_reward_, done, info = self.env.step(action)
            if done:
                break
            sum_rewards += e_reward_
            downscaled_state2 = downscale_img(state2, to_gray=True)
            # pylint: disable=E1101
            prepped_state2 = torch.from_numpy(
                downscaled_state2).float().unsqueeze(dim=0)
            state_deque.append(prepped_state2)
        return state_deque, done, sum_rewards, info

    def train(self):
        """ Full training loop. """
        self.env.reset()
        state1 = prepare_initial_state(self.env.render('rgb_array'))
        losses = []
        ep_lengths = []
        episode_length = 0
        last_x_pos = self.env.env.env._x_position
        for i in range(TRAINING_STEPS):
            self.opt.zero_grad()
            episode_length += 1
            q_val_pred = self.q_model(state1)
            if i > SWITCH_TO_EPS_GREEDY:
                action = int(sample_action(q_val_pred, EPS))
            else:
                action = int(sample_action(q_val_pred))
            state_deque, done, extrinsic_reward, info = \
                self.repeat_action(action)
            # pylint: disable=E1101
            state2 = torch.stack(list(state_deque), dim=1)
            self.replay.add_memory(
                state1, action,
                extrinsic_reward,  # summed across repeated actions
                state2)
            if i % MAX_EPISODE_LEN == 0 and i != 0:
                if (info['x_pos'] - last_x_pos) < MIN_PROGRESS:
                    done = True
                else:
                    last_x_pos = info['x_pos']
            if done:
                print("Episode over.")
                ep_lengths.append(info['x_pos'])
                self.env.reset()
                state1 = prepare_initial_state(self.env.render('rgb_array'))
                last_x_pos = self.env.env.env._x_position
                episode_length = 0
            else:
                state1 = state2
            # Enter mini-batch training.
            if len(self.replay.memory) < BATCH_SIZE:
                continue
            forward_pred_err, inverse_pred_err, q_loss = \
                self.batch_forward_pass(use_extrinsic=False)
            loss = self.combined_loss(q_loss, forward_pred_err,
                                      inverse_pred_err)
            loss_list = (q_loss.mean(), forward_pred_err.flatten().mean(),
                         inverse_pred_err.flatten().mean(), episode_length)
            if i % 250 == 0:
                print("Epoch {}, Loss: {}".format(i, loss))
                print("Forward loss: {} \n Inverse loss: {} \n Qloss: {}".format(
                    forward_pred_err.mean(), inverse_pred_err.mean(),
                    q_loss.mean()))
                print(info)
            losses.append(loss_list)
            loss.backward()
            self.opt.step()
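A minimal entry point for the trainer above; note that `train()` calls `batch_forward_pass(use_extrinsic=False)`, so this run learns from the curiosity signal alone. The module-level hyperparameters (BUFFER_SIZE, BATCH_SIZE, ETA, BETA, LAMBDA, GAMMA, TRAINING_STEPS, and so on) are assumed to be defined elsewhere in the project.

if __name__ == "__main__":
    trainer = ICMTrainer()
    trainer.train()  # intrinsic-reward-only training, per use_extrinsic=False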
def train():
    # Hyperparameters
    cfg = DictConfig({
        "epochs": 1,
        "lr": 1e-4,
        "use_extrinsic": True,
        "max_episode_len": 1000,
        "min_progress": 15,
        "frames_per_state": 3,
        "action_repeats": 6,
        "gamma_q": 0.85,
        "epsilon_random": 0.1,  # Sample a random action with epsilon probability
        "epsilon_greedy_switch": 1000,
        "q_loss_weight": 0.01,
        "inverse_loss_weight": 0.5,
        "forward_loss_weight": 0.5,
        "intrinsic_weight": 1.0,
        "extrinsic_weight": 1.0,
    })

    # ----- setting up variables -----
    q_model = MarioModel(cfg.frames_per_state)
    icm_model = MarioICM(cfg.frames_per_state)
    optim = torch.optim.Adam(
        list(q_model.parameters()) + list(icm_model.parameters()), lr=cfg.lr)
    replay = ExperienceReplay(buffer_size=500, batch_size=100)
    env = gym_super_mario_bros.make("SuperMarioBros-v0")
    env = JoypadSpace(env, COMPLEX_MOVEMENT)

    # Counters and stats
    last_x_pos = 0
    current_episode = 0
    global_step = 0
    current_step = 0
    cumulative_reward = 0
    ep_rewards = []

    # ----- training loop -----
    for epoch in range(cfg.epochs):
        state = env.reset()
        done = False
        # Monte Carlo loop
        while not done:
            # ------------ Q Learning --------------
            if current_step == 0:
                state = prepare_initial_state(env.render("rgb_array"))
            else:
                state = prepare_multi_state(state, env.render("rgb_array"))
            q_values = q_model(state)
            action = sample_action(
                q_values,
                cfg.epsilon_random,
                apply_epsilon=global_step > cfg.epsilon_greedy_switch,
            )
            action_count = 0
            state2 = None
            while True:
                state2_, reward, done, info = env.step(action)
                if state2 is None:
                    state2 = state2_
                env.render()
                if action_count >= cfg.action_repeats or done:
                    break
                action_count += 1
            state2 = prepare_multi_state(state, state2)

            # Add the intrinsic reward.
            intrinsic_reward = get_intrinsic_reward(state, action, state2,
                                                    icm_model)
            print("in reward", intrinsic_reward.item())
            print("ex reward", reward)
            reward = (cfg.intrinsic_weight * intrinsic_reward) + \
                     (cfg.extrinsic_weight * reward)
            q_loss = get_q_loss(q_values[0][action], reward, q_model, state2,
                                cfg.gamma_q)
            replay.add(state, action, reward, state2)
            state = state2

            # ------------- ICM -------------------
            state1_batch, action_batch, reward_batch, state2_batch = \
                replay.get_batch()
            action_pred, state2_encoded, state2_pred = icm_model(
                state1_batch, action_batch, state2_batch)
            inverse_loss = F.cross_entropy(action_pred, action_batch)
            forward_loss = F.mse_loss(state2_pred, state2_encoded)

            # ------------ Learning ------------
            final_loss = ((cfg.q_loss_weight * q_loss) +
                          (cfg.inverse_loss_weight * inverse_loss) +
                          (cfg.forward_loss_weight * forward_loss))
            optim.zero_grad()
            final_loss.backward()
            optim.step()

            # ------------ updates --------------
            # TODO: add loss scalars
            print("--------loss: ", final_loss.item())
            max_episode_len_reached = current_step >= cfg.max_episode_len
            no_progress = False  # TODO: implement progress detection
            done = done or max_episode_len_reached or no_progress
            if done:
                if max_episode_len_reached:
                    # TODO: Add scalar: 'max episode len reached' current_episode, auto
                    pass
                elif no_progress:
                    # TODO: Add scalar: 'no progress' current_episode, auto
                    pass
                # TODO: add scalar: 'episode len' current_step, current_episode
                # TODO: Plot cumulative reward for each episode
                # TODO: Plot the x_pos after the episode
                # TODO: Plot total sum of rewards for each episode
                # TODO: Every n episodes save the video ->
                #   imageio.mimwrite('gameplay.mp4', renders, fps=30)
                current_step = -1
                current_episode += 1
            global_step += 1
            current_step += 1
def dqn():
    env = gym_tetris.make('TetrisA-v2')
    env = JoypadSpace(env, MOVEMENT)
    episodes = 2000
    max_steps = None
    epsilon_stop_episode = 1500
    mem_size = 20000
    discount = 0.95
    batch_size = 512
    epochs = 1
    render_every = 50
    log_every = 50
    replay_start_size = 2000
    train_every = 1
    n_neurons = [32, 32]
    render_delay = None
    activations = ['relu', 'relu', 'linear']

    agent = DQNAgent(env.get_state_size(),
                     n_neurons=n_neurons,
                     activations=activations,
                     epsilon_stop_episode=epsilon_stop_episode,
                     mem_size=mem_size,
                     discount=discount,
                     replay_start_size=replay_start_size)

    log_dir = (f'logs/tetris-nn={str(n_neurons)}-mem={mem_size}'
               f'-bs={batch_size}-e={epochs}'
               f'-{datetime.now().strftime("%Y%m%d-%H%M%S")}')
    log = CustomTensorBoard(log_dir=log_dir)

    scores = []
    for episode in tqdm(range(episodes)):
        current_state = env.reset()
        done = False
        steps = 0
        render = bool(render_every and episode % render_every == 0)

        # Game
        while not done and (not max_steps or steps < max_steps):
            next_states = env.get_next_states()
            best_state = agent.best_state(next_states.values())
            best_action = None
            for action, state in next_states.items():
                if state == best_state:
                    best_action = action
                    break
            reward, done = env.play(best_action[0], best_action[1],
                                    render=render,
                                    render_delay=render_delay)
            agent.add_to_memory(current_state, next_states[best_action],
                                reward, done)
            current_state = next_states[best_action]
            steps += 1

        scores.append(env.get_game_score())

        # Train
        if episode % train_every == 0:
            agent.train(batch_size=batch_size, epochs=epochs)

        # Logs
        if log_every and episode and episode % log_every == 0:
            avg_score = mean(scores[-log_every:])
            min_score = min(scores[-log_every:])
            max_score = max(scores[-log_every:])
            log.log(episode, avg_score=avg_score, min_score=min_score,
                    max_score=max_score)
class MarioEnvironment(Process):
    def __init__(
            self,
            env_id,
            is_render,
            env_idx,
            child_conn,
            history_size=4,
            life_done=True,
            h=84,
            w=84,
            movement=COMPLEX_MOVEMENT,
            sticky_action=True,
            p=0.25):
        super(MarioEnvironment, self).__init__()
        self.daemon = True
        self.env = JoypadSpace(gym_super_mario_bros.make(env_id),
                               COMPLEX_MOVEMENT)
        self.is_render = is_render
        self.env_idx = env_idx
        self.steps = 0
        self.episode = 0
        self.rall = 0
        self.recent_rlist = deque(maxlen=100)
        self.child_conn = child_conn
        self.life_done = life_done
        self.history_size = history_size
        self.history = np.zeros([history_size, h, w])
        self.h = h
        self.w = w
        self.reset()

    def run(self):
        super(MarioEnvironment, self).run()
        while True:
            action = self.child_conn.recv()
            if self.is_render:
                self.env.render()
            obs, reward, done, info = self.env.step(action)
            # When Mario loses a life, treat it as a terminal state.
            if self.life_done:
                if self.lives > info['life'] and info['life'] > 0:
                    force_done = True
                    self.lives = info['life']
                else:
                    force_done = done
                    self.lives = info['life']
            else:
                force_done = done
            # Reward range is -15 ~ 15.
            log_reward = reward / 15
            self.rall += log_reward
            r = log_reward
            self.history[:3, :, :] = self.history[1:, :, :]
            self.history[3, :, :] = self.pre_proc(obs)
            self.steps += 1
            if done:
                self.recent_rlist.append(self.rall)
                print("[Episode {}({})] Step: {} Reward: {} Recent Reward: {} "
                      "Stage: {} current x:{} max x:{}".format(
                          self.episode, self.env_idx, self.steps, self.rall,
                          np.mean(self.recent_rlist), info['stage'],
                          info['x_pos'], self.max_pos))
                self.history = self.reset()
            self.child_conn.send(
                [self.history[:, :, :], r, force_done, done, log_reward])

    def reset(self):
        self.last_action = 0
        self.steps = 0
        self.episode += 1
        self.rall = 0
        self.lives = 3
        self.stage = 1
        self.max_pos = 0
        self.get_init_state(self.env.reset())
        return self.history[:, :, :]

    def pre_proc(self, X):
        # Grayscale, then resize.
        x = cv2.cvtColor(X, cv2.COLOR_RGB2GRAY)
        x = cv2.resize(x, (self.h, self.w))
        return x

    def get_init_state(self, s):
        for i in range(self.history_size):
            self.history[i, :, :] = self.pre_proc(s)
class Env(object):
    def __init__(self, game, **kwargs):
        self.act_space = kwargs.get("act_space")
        self.state_size = kwargs.get("state_size")
        self.burn_in = kwargs.get("burn_in")
        self.seqlen = kwargs.get("seqlen")
        self.n_step = kwargs.get("n_step")
        self.use_soft = kwargs.get("use_soft")
        self.frames = kwargs.get("frames")
        self.sample_epsilon_per_step = kwargs.get("sample_epsilon_per_step")
        self.epsilon = np.power(0.4, random.uniform(4, 8))
        self.game = game
        self.count = 0
        self.count_maxpos = []
        env = gym_super_mario_bros.make(game)
        if self.act_space == 7:
            self.env = JoypadSpace(env, SIMPLE_MOVEMENT)
        elif self.act_space == 12:
            self.env = JoypadSpace(env, COMPLEX_MOVEMENT)
        self.max_pos = -10000
        self.done = True
        self.reset()

    def step(self, a, state_in):
        maxpos = self.reset()
        self.count += 1
        if not self.use_soft:
            if self.sample_epsilon_per_step:
                self.epsilon = np.power(0.4, random.uniform(4, 8))
            if random.random() < self.epsilon:
                a = random.randint(0, self.act_space - 1)
        self.a_t = a
        gs_t1, gr_t, gdone, ginfo = self.env.step(self.a_t)
        self.env.render()
        if not gdone:
            s_t1, r_t, done, info = self.env.step(self.a_t)
            r_t += gr_t
            r_t /= 2.
        else:
            s_t1 = gs_t1
            r_t = gr_t
            done = gdone
            info = ginfo
        s_t1 = self.resize_image(s_t1)
        channels = s_t1.shape[-1]
        self.s_t = np.concatenate([s_t1, self.s_t[:, :, :-channels]], axis=-1)
        self.s.append(self.s_t)
        self.a.append(self.a_t)
        self.r.append(r_t)
        self.max_pos = max(self.max_pos, info["x_pos"])
        self.pos.append(info["x_pos"])
        # End the episode if Mario has barely moved over the last 100 frames.
        if (len(self.pos) > 100) and (info["x_pos"] - self.pos[-100] < 5) and \
                (self.pos[-100] - info["x_pos"] < 5):
            done = True
        self.done = done
        if self.done:
            self.mask.append(0)
        else:
            self.mask.append(1)
        self.state_in.append(state_in)
        """ get segs """
        # segs = self.get_history()
        #
        # return segs
        return maxpos

    def reset(self):
        if self.done:
            self.count_maxpos.append(self.max_pos)
            print(self.game, self.max_pos, len(self.count_maxpos[1:]),
                  np.mean(self.count_maxpos[1:]))
            self.epsilon = np.power(0.4, random.uniform(4, 8))
            self.count = 0
            s_t = self.resize_image(self.env.reset())
            self.s_t = np.tile(s_t, [1, 1, self.frames])
            self.s = [self.s_t]
            self.a_t = random.randint(0, self.act_space - 1)
            self.a = [self.a_t]
            self.r = [0]
            self.mask = [1]
            self.max_pos = -10000
            self.pos = []
            state_in = np.zeros(self.state_size, dtype=np.float32)
            self.state_in = [state_in]
            self.done = False
            return self.count_maxpos
        return None

    def get_state(self):
        return self.s_t

    def get_act(self):
        return self.a_t

    def get_reward(self):
        return self.r[-1]

    def get_max_pos(self):
        return self.max_pos

    def get_state_in(self):
        return self.state_in[-1]

    @staticmethod
    def resize_image(image, size=84):
        image = Image.fromarray(image)
        image = image.convert("L")
        image = image.resize((size, size))
        image = np.array(image, np.uint8)
        return image[:, :, None]
def train_agent(args):
    # Use the GPU if one is available.
    device = torch.device(
        "cuda" if torch.cuda.is_available() and args.ngpu > 0 else "cpu")

    # Build the env (first level, right only).
    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = JoypadSpace(env, SIMPLE_MOVEMENT)

    # Set up the networks.
    init_screen = get_screen(env, device)
    _, _, screen_height, screen_width = init_screen.shape

    # Get the number of actions from the gym action space.
    args.n_actions = env.action_space.n
    policy_net = DQN(screen_height, screen_width, args.n_actions).to(device)
    target_net = DQN(screen_height, screen_width, args.n_actions).to(device)
    if args.targetNet:
        target_net.load_state_dict(
            torch.load(args.targetNet, map_location=device))
    if args.policyNet:
        # Load checkpointed weights into the policy network.
        policy_net.load_state_dict(
            torch.load(args.policyNet, map_location=device))
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    optimizer = optim.RMSprop(policy_net.parameters())
    memory = ReplayMemory(10000)
    args.steps_done = 0

    num_episodes = 1
    for i_episode in range(num_episodes):
        # Initialize the environment and state.
        env.reset()
        last_screen = get_screen(env, device)
        current_screen = get_screen(env, device)
        state = current_screen - last_screen
        for t in count():
            # Select and perform an action.
            action = select_action(state, policy_net, args, device)
            _, reward, done, _ = env.step(action.item())
            reward = torch.tensor([reward], device=device)

            # Observe the new state.
            last_screen = current_screen
            current_screen = get_screen(env, device)
            if not done:
                next_state = current_screen - last_screen
            else:
                next_state = None

            # Store the transition in memory.
            memory.push(state, action, next_state, reward)

            # Move to the next state.
            state = next_state

            # Perform one step of the optimization (on the policy network).
            optimize_model(optimizer, memory, policy_net, target_net, args,
                           device)
            if done:
                episode_durations.append(t + 1)
                break

        # Update the target network, copying all weights and biases in the DQN.
        if i_episode % args.target_update == 0:
            target_net.load_state_dict(policy_net.state_dict())
            torch.save(policy_net.state_dict(), args.output_policyNet)
            torch.save(target_net.state_dict(), args.output_targetNet)
        if i_episode % 10 == 0:
            print(f'{i_episode+1}/{num_episodes}: Completed Episode.')

    print('Complete')
    env.close()
    torch.save(policy_net.state_dict(), args.output_policyNet)
    torch.save(target_net.state_dict(), args.output_targetNet)
class Env(object):
    def __init__(self, act_space, act_repeats, frames, n_step, gamma, game):
        self.act_space = act_space
        self.act_repeats = act_repeats
        self.act_repeat = random.choice(self.act_repeats)
        self.frames = frames
        self.n_step = n_step
        self.gamma = gamma
        self.max_pos = -10000
        self.count = 0
        env = gym_super_mario_bros.make(game)
        self.env = JoypadSpace(env, SIMPLE_MOVEMENT)
        s_t = self.resize_image(self.env.reset())
        self.s_t = np.tile(s_t, [1, 1, frames])
        self.s = [self.s_t]
        self.a_t = random.randint(0, act_space - 1)
        self.a = [self.a_t]
        self.a_logits = []
        self.r = []
        self.v_cur = []
        self.pos = []
        c_in = np.zeros(256, dtype=np.float32)
        h_in = np.zeros(256, dtype=np.float32)
        state_in = np.concatenate([c_in, h_in], axis=-1)
        self.state_in = [state_in]
        self.done = False

    def step(self, a, a_logits, state_in, v_cur):
        self.count += 1
        if self.count % self.act_repeat == 0:
            self.a_t = a
            self.count = 0
            self.act_repeat = random.choice(self.act_repeats)
        gs_t1, gr_t, gdone, ginfo = self.env.step(self.a_t)
        if not gdone:
            s_t1, r_t, done, info = self.env.step(self.a_t)
            r_t += gr_t
            r_t /= 2.
        else:
            s_t1 = gs_t1
            r_t = gr_t
            done = gdone
            info = ginfo
        r_t /= 15.
        s_t1 = self.resize_image(s_t1)
        channels = s_t1.shape[-1]
        self.s_t = np.concatenate([s_t1, self.s_t[:, :, :-channels]], axis=-1)
        self.s.append(self.s_t)
        self.a.append(self.a_t)
        self.a_logits.append(a_logits)
        self.r.append(r_t)
        self.v_cur.append(v_cur)
        self.max_pos = max(self.max_pos, info["x_pos"])
        self.pos.append(info["x_pos"])
        # End the episode if Mario has barely moved over the last 500 frames.
        if (len(self.pos) > 500) and (info["x_pos"] - self.pos[-500] < 5) and \
                (self.pos[-500] - info["x_pos"] < 5):
            done = True
        self.done = done
        self.state_in.append(state_in)

    def reset(self, force=False):
        if self.done or force:
            self.count = 0
            self.act_repeat = random.choice(self.act_repeats)
            s_t = self.resize_image(self.env.reset())
            self.s_t = np.tile(s_t, [1, 1, self.frames])
            self.s = [self.s_t]
            self.a_t = random.randint(0, self.act_space - 1)
            self.a = [self.a_t]
            self.a_logits = []
            self.r = []
            self.v_cur = []
            self.pos = []
            c_in = np.zeros(256, dtype=np.float32)
            h_in = np.zeros(256, dtype=np.float32)
            state_in = np.concatenate([c_in, h_in], axis=-1)
            self.state_in = [state_in]
            self.done = False

    def get_state(self):
        return self.s_t

    def get_act(self):
        return self.a_t

    def get_max_pos(self):
        return self.max_pos

    def reset_max_pos(self):
        self.max_pos = -10000

    def get_state_in(self):
        return self.state_in[-1]

    def get_history(self, force=False):
        if self.done or force:
            v_cur = np.array(self.v_cur + [0])
            v_cur = h_inv(v_cur)
            gaes = get_gaes(None, self.r, v_cur[:-1], v_cur[1:],
                            self.gamma, 0.95)[0]
            v_tar = get_rescaled_target(gaes, 1.0, self.v_cur)
            n_step_r = get_n_step_rewards(self.r, self.n_step, self.gamma)
            seg = Seg(self.s, self.a, self.a_logits, self.r, n_step_r,
                      self.v_cur, v_tar, gaes, self.state_in)
            return seg
        return None

    @staticmethod
    def resize_image(image, size=84):
        image = Image.fromarray(image)
        image = image.convert("L")
        image = image.resize((size, size))
        image = np.array(image)
        image = image / 255.
        image = np.array(image, np.float32)
        return image[:, :, None]
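`get_gaes` is used above but not shown; a hypothetical reconstruction of a generalized-advantage-estimation helper matching the call sites (the unused first argument is kept only for signature compatibility):

import numpy as np

def get_gaes(unused, rewards, values, next_values, gamma, lam):
    """Compute GAE over one trajectory: A_t = sum_k (gamma*lam)^k delta_{t+k},
    where delta_t = r_t + gamma * V(s_{t+1}) - V(s_t). Returns (gaes, deltas)
    so that call sites can index [0] for the advantages."""
    deltas = [r + gamma * nv - v
              for r, v, nv in zip(rewards, values, next_values)]
    gaes = list(deltas)
    # Backward recursion over the trajectory.
    for t in reversed(range(len(gaes) - 1)):
        gaes[t] = gaes[t] + gamma * lam * gaes[t + 1]
    return np.array(gaes), np.array(deltas)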
class Env(object):
    def __init__(self, act_space, act_repeats, frames, state_size, game):
        self.act_space = act_space
        self.act_repeats = act_repeats
        self.act_repeat = random.choice(self.act_repeats)
        self.frames = frames
        self.state_size = state_size
        self.game = game
        self.max_pos = -10000
        self.count = 0
        env = gym_super_mario_bros.make(game)
        if self.act_space == 7:
            self.env = JoypadSpace(env, SIMPLE_MOVEMENT)
        elif self.act_space == 12:
            self.env = JoypadSpace(env, COMPLEX_MOVEMENT)
        s_t = self.resize_image(self.env.reset())
        self.s_t = np.tile(s_t, [1, 1, frames])
        self.s = [self.s_t]
        self.a_t = random.randint(0, act_space - 1)
        self.a = [self.a_t]
        self.a_logits = []
        self.r = [0]
        self.pos = []
        self.v_cur = []
        state_in = np.zeros(self.state_size, dtype=np.float32)
        self.state_in = [state_in]
        self.done = False

    def step(self, a, a_logits, state_in):
        self.count += 1
        if self.count % self.act_repeat == 0:
            self.a_t = a
            self.count = 0
            self.act_repeat = random.choice(self.act_repeats)
        gs_t1, gr_t, gdone, ginfo = self.env.step(self.a_t)
        self.env.render()
        if not gdone:
            s_t1, r_t, done, info = self.env.step(self.a_t)
            r_t += gr_t
            r_t /= 2.
        else:
            s_t1 = gs_t1
            r_t = gr_t
            done = gdone
            info = ginfo
        r_t /= 15.
        s_t1 = self.resize_image(s_t1)
        channels = s_t1.shape[-1]
        self.s_t = np.concatenate([s_t1, self.s_t[:, :, :-channels]], axis=-1)
        self.s.append(self.s_t)
        self.a.append(self.a_t)
        self.a_logits.append(a_logits)
        self.r.append(r_t)
        self.max_pos = max(self.max_pos, info["x_pos"])
        self.pos.append(info["x_pos"])
        # End the episode if Mario has barely moved over the last 500 frames.
        if (len(self.pos) > 500) and (info["x_pos"] - self.pos[-500] < 5) and \
                (self.pos[-500] - info["x_pos"] < 5):
            done = True
        self.done = done
        self.state_in.append(state_in)

    def update_v(self, v_cur):
        self.v_cur.append(v_cur)

    def reset(self, force=False):
        if self.done or force:
            max_pos = self.max_pos
            self.max_pos = -10000
            logging.info(" Max Position %s : %d" % (self.game, max_pos))
            self.count = 0
            self.act_repeat = random.choice(self.act_repeats)
            s_t = self.resize_image(self.env.reset())
            self.s_t = np.tile(s_t, [1, 1, self.frames])
            self.s = [self.s_t]
            self.a_t = random.randint(0, self.act_space - 1)
            self.a = [self.a_t]
            self.a_logits = []
            self.r = [0]
            self.pos = []
            self.v_cur = []
            state_in = np.zeros(self.state_size, dtype=np.float32)
            self.state_in = [state_in]
            self.done = False

    def get_state(self):
        return self.s_t

    def get_act(self):
        return self.a_t

    def get_max_pos(self):
        return self.max_pos

    def reset_max_pos(self):
        self.max_pos = -10000

    def get_state_in(self):
        return self.state_in[-1]

    def get_history(self, force=False):
        if self.done or force:
            if self.done:
                gaes = get_gaes(None, self.r, self.v_cur,
                                self.v_cur[1:] + [0], 0.99, 0.95)[0]
                seg = Seg(self.s, self.a, self.a_logits, self.r, gaes,
                          self.v_cur, self.state_in)
                return seg
            if force and len(self.r) > 1:
                gaes = get_gaes(None, self.r[:-1], self.v_cur[:-1],
                                self.v_cur[1:], 0.99, 0.95)[0]
                seg = Seg(self.s[:-1], self.a[:-1], self.a_logits[:-1],
                          self.r[:-1], gaes, self.v_cur[:-1],
                          self.state_in[:-1])
                return seg
        return None

    @staticmethod
    def resize_image(image, size=84):
        image = Image.fromarray(image)
        image = image.convert("L")
        image = image.resize((size, size))
        image = np.array(image)
        image = image / 255.
        image = np.array(image, np.float32)
        return image[:, :, None]
os.makedirs(params['path_logs_dir'])
shutil.copy('./params.json', params['path_logs_dir'] + '/params.json')
writer = SummaryWriter(params['path_logs_dir'])
dummy_input_to_policy_net = torch.randn(
    1, json_params['size_resized_image'],
    json_params['size_resized_image']).float().to(
        params['device']).unsqueeze(0)
dummy_input_to_target_net = torch.randn(
    1, json_params['size_resized_image'],
    json_params['size_resized_image']).float().to(
        params['device']).unsqueeze(0)
writer.add_graph(agent.brain.policy_net, dummy_input_to_policy_net)
writer.add_graph(agent.brain.target_net, dummy_input_to_target_net)

for episode in range(1, json_params['num_episodes'] + 1):
    observation = env.reset()
    state = preprocess(observation, json_params['size_resized_image'])
    t = done = total_rewards = total_loss = total_max_q_val = 0
    while True:
        if json_params['render']:
            env.render()
        t += 1
        action = agent.get_action(state)
        observation, reward, done, _ = env.step(action)
        if done:
            next_state = None
        else:
            next_state = preprocess(observation,
                                    json_params['size_resized_image'])
    qtargets = reward_batch.squeeze() + params['gamma'] * (
        (1 - done_batch.squeeze()) * torch.max(qtargets_, dim=1)[0])
    X = qvals.gather(dim=1, index=action_batch).squeeze()
    return loss_fn(X, qtargets.detach())


eps = 1
losses = []
ep_lengths = []
e_reward = 0.0
episode_length = 0
epochs = 500000
env.reset()
state1 = prepare_initial_state(env.render('rgb_array'))
state_deque = deque(maxlen=params['frames_per_state'])
last_x_pos = env.env.env._x_position
start_time = time.time()
for i in range(epochs):
    optimizer.zero_grad()
    episode_length += 1
    qval_pred = Qmodel(state1)
    action = int(policy(qval_pred, eps))
    for j in range(params['action_repeats']):
        state2, e_reward_, done, info = env.step(action)
        last_x_pos = info['x_pos']
        if done:
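`policy` is called above but not defined in this excerpt; a plausible reconstruction of an action selector consistent with the call `policy(qval_pred, eps)`: epsilon-greedy when `eps` is given, softmax sampling otherwise.

import torch
import torch.nn.functional as F

def policy(qvalues, eps=None):
    """Hypothetical reconstruction of the action-selection helper used above."""
    n_actions = qvalues.shape[-1]
    if eps is not None:
        # Epsilon-greedy: random action with probability eps, else greedy.
        if torch.rand(1).item() < eps:
            return torch.randint(low=0, high=n_actions, size=(1,))
        return torch.argmax(qvalues)
    # Otherwise sample an action from a softmax over the Q-values.
    probs = F.softmax(qvalues.flatten(), dim=0)
    return torch.multinomial(probs, num_samples=1)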
if __name__ == "__main__":
    tf.reset_default_graph()
    gpu_options = tf.GPUOptions(allow_growth=True)
    sess = tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))
    env = gym_super_mario_bros.make('SuperMarioBrosRandomStages-v0')
    env = JoypadSpace(env, SIMPLE_MOVEMENT)
    action_size = env.action_space.n
    # envs[0].set_render(True)
    train_model = A2CAgent("train_model", False, sess, input_shape,
                           action_size, lr, GAMMA, LAMBDA, max_grad_norm,
                           ent_coef, vf_coef, clip_range, True)
    while True:
        state_generator = StateGenerator(frame_size, stack_size)
        state = state_generator.get_stacked_frames(env.reset(), True)
        episodes_reward = 0
        while True:
            policy, value = train_model.get_actions_and_values(
                np.array([state]))
            action = np.random.choice(np.arange(action_size),
                                      p=np.squeeze(policy))
            for i in range(0, skip_frames):
                env.render()
                raw_state, frame_reward, done, info = env.step(action)
                if frame_reward == -15 or done:
                    raw_state = env.reset()
                    break
# Create the networks.
policy_net = DQNetwork(stacked_frame_dim=FRAME_DIM,
                       num_actions=env.action_space.n)
target_net = DQNetwork(stacked_frame_dim=FRAME_DIM,
                       num_actions=env.action_space.n)

# Create the replay memory.
replay_memory = ReplayMemory(REPLAY_MEMORY_CAPACITY)

# Play the episodes.
current_exploration = EXPLORATION_MAX
total_steps = 0
reward_history = []
mean_reward_history = []
for episode in range(NUM_EPISODES):
    state = env.reset()

    # Play one game.
    current_reward = 0
    for steps in count(MAX_STEPS_PER_GAME):
        # Render the environment.
        if RENDER_ENVIRONMENT:
            env.render()

        # Get the next action.
        action = get_next_action(state, env.action_space.n,
                                 current_exploration)

        # Perform the action.
        next_state, reward, done, info = env.step(action)
class EnvWrapper():
    def __init__(self, frame_size, skip_frames, stack_size):
        self.env = gym_super_mario_bros.make('SuperMarioBrosRandomStages-v0')
        self.env = JoypadSpace(self.env, SIMPLE_MOVEMENT)
        self.agent = None
        self.frame_size = frame_size
        self.stack_size = stack_size
        self.action_size = self.env.action_space.n
        self.skip_frames = skip_frames
        self.render = False
        self.state_generator = StateGenerator(self.frame_size, self.stack_size)
        self.env.reset()
        raw_state, _, _, self.info = self.env.step(0)
        self.state = self.state_generator.get_stacked_frames(raw_state, True)
        self.states = []
        self.policies = []
        self.actions = []
        self.rewards = []
        self.values = []
        self.dones = []
        self.episode = 0
        self.episodeReward = 0
        self.maxEpisodeReward = 0
        self.current_episode_reward = 0
        self.done = False

    def step(self, n):
        for _ in range(n):
            policy, value = self.agent.get_actions_and_values(
                np.array([self.state]))
            action = np.random.choice(self.action_size, p=np.squeeze(policy))
            reward = 0
            for i in range(0, self.skip_frames):
                raw_state, frame_reward, done, info = self.env.step(action)
                if frame_reward == -15 or done:
                    self.episode += 1
                    done = True
                    if frame_reward == -15:
                        reward = -15 * self.skip_frames
                    else:
                        reward = 15 * self.skip_frames
                    raw_state = self.env.reset()
                    break
                else:
                    reward += frame_reward
            reward += (5 if (info["score"] - self.info["score"]) > 0 else 0)
            reward /= (15 * self.skip_frames)
            self.current_episode_reward += reward
            next_state = self.state_generator.get_stacked_frames(
                raw_state, done,
                frame_reward == 15 or (done and self.episode % 100 == 0),
                self.current_episode_reward)
            self.states.append(self.state)
            self.policies.append(np.squeeze(policy))
            self.actions.append(action)
            self.rewards.append(reward)
            self.values.append(np.squeeze(value))
            self.dones.append(done)
            self.state = next_state
            self.done = done
            self.info = info
            if self.done:
                self.episodeReward = self.current_episode_reward
                if self.maxEpisodeReward < self.episodeReward:
                    self.maxEpisodeReward = self.episodeReward
                self.current_episode_reward = 0

    def get_experiences(self):
        if self.done:
            next_state_value = 0
        else:
            next_state_value = np.squeeze(
                self.agent.get_value(np.array([self.state])))
        states = self.states
        actions = self.actions
        policies = self.policies
        rewards = self.rewards
        values = self.values
        dones = [1 if done else 0 for done in self.dones]
        next_values = values[1:] + [next_state_value]
        self.states = []
        self.policies = []
        self.actions = []
        self.rewards = []
        self.values = []
        self.dones = []
        return states, policies, actions, rewards, values, next_values, dones

    def get_action_size(self):
        return self.action_size

    def set_agent(self, agent):
        self.agent = agent

    def set_render(self, render):
        self.render = render

    def get_max_and_current_episode_reward(self):
        return self.maxEpisodeReward, self.episodeReward
def main():
    env = gym_super_mario_bros.make('SuperMarioBros-v0')
    env = JoypadSpace(env, SIMPLE_MOVEMENT)
    agent = DQNAgent(action_size=7)
    scores, episodes, global_step = [], [], 0
    global_start = datetime.now()
    local_start = datetime.now()
    print()
    print("=" * 100)
    print("RL environment initialized")
    print("=" * 100)
    print()
    gc.collect()
    for e in range(1000):
        e = e + 1
        done = False
        dead = False
        step, score, start_life = 0, 0, 5
        observe = env.reset()
        for _ in range(random.randint(1, agent.no_op_steps)):
            observe, _, _, _ = env.step(1)
        state = agent.pre_processing(observe)
        history = np.stack((state, state, state, state), axis=2)
        history = np.reshape([history], (1, 240, 256, 4))
        count_epsilon = 0
        count_greedy = 0
        coinStatus = 0
        marioStatus = "small"
        flagStatus = False
        softReward = 0
        lifeStatus = 2
        while not done:
            # if agent.render:
            #     env.render()
            global_step += 1
            step += 1
            # Select an action based on the previous 4 states.
            action, res = agent.get_action(history)
            if res:
                count_epsilon += 1
            else:
                count_greedy += 1
            # Advance the environment one timestep with the selected action.
            observe, reward, done, info = env.step(action)
            # Preprocess the state at every timestep.
            next_state = agent.pre_processing(observe)
            next_state = np.reshape([next_state], (1, 240, 256, 1))
            next_history = np.append(next_state, history[:, :, :, :3], axis=3)
            agent.avg_q_max += np.amax(
                agent.model.predict(np.float32(history / 255.))[0])
            if start_life > info['life']:
                dead = True
                start_life = info['life']
            # reward = np.clip(reward, -1., 1.)
            real_reward = reward
            # Commented-out reward-shaping experiment:
            # if coinStatus != info["coins"]:
            #     coinStatus = info["coins"]
            #     reward = reward + 10
            # if marioStatus != info["status"]:
            #     marioStatus = info["status"]
            #     reward = reward + 200
            # if flagStatus != info["flag_get"]:
            #     flagStatus = info["flag_get"]
            #     reward = reward + 200
            # if lifeStatus != info["life"]:
            #     lifeStatus = info["life"]
            #     reward = reward - 20
            #
            # if info["x_pos"] < 10:
            #     info["x_pos"] = 10
            # if info["time"] < 10:
            #     info["time"] = 10
            #
            # reward = reward + math.log((info["x_pos"] / info["time"]) + info["x_pos"])

            # Store the sample <s, a, r, s'> in replay memory, then train.
            agent.append_sample(history, action, reward, next_history, dead)
            if len(agent.memory) >= agent.train_start:
                agent.train_model()
            # Periodically update the target model with the model's weights.
            if global_step % agent.update_target_rate == 0:
                agent.update_target_model()
            # score += reward
            score += real_reward
            if dead:
                dead = False
            else:
                history = next_history
            if global_step == 0:
                pass
            elif global_step % 1000 == 0:
                print("local step : {}, time : {} sec, epsilon : {}".format(
                    global_step, (datetime.now() - local_start).seconds,
                    agent.epsilon))
                local_start = datetime.now()
            if done:
                ep_result = ("episode : {}, score : {}, memory : {}, "
                             "step : {}".format(e, score, len(agent.memory),
                                                global_step))
                print(ep_result)
                print("epsilon : {}, greedy : {}".format(count_epsilon,
                                                         count_greedy))
                print()
                print("time elapsed : {} sec".format(
                    (datetime.now() - global_start).seconds))
                global_start = datetime.now()
                agent.epsilon = agent.epsilon - agent.epsilon_decay_step
                print("epsilon decay to {}!".format(agent.epsilon))
                print()
                slack_msg(ep_result)
                # if score > 2000 and score <= 3000:
                #     agent.epsilon = 0.075
                # elif score > 3000 and score <= 5000:
                #     agent.epsilon = 0.05
                # elif score > 5000 and score <= 10000:
                #     agent.epsilon = 0.005
                agent.avg_q_max, agent.avg_loss, global_step = 0, 0, 0
        # Save the model periodically.
        if e == 0:
            pass
        elif e % 2 == 0:
            agent.model.save_weights("./dqn.h5")
            # dump(agent.memory, "memory.joblib")
            print("model saved!")
            print()
        gc.collect()
class DQLMarioAgent(DQLAgent.DQLAgent):
    def __init__(self, action_type, batch_size, model_type, success_margin,
                 success_score, memory_size, record_video, target_model,
                 project, wrapper_type):
        super().__init__(action_type, batch_size, model_type, success_margin,
                         success_score, memory_size, record_video,
                         target_model, project)
        self.env = gym_super_mario_bros.make('SuperMarioBros-v0')
        if wrapper_type == 'COMPLEX':
            self.env = JoypadSpace(self.env, COMPLEX_MOVEMENT)  # -> 12 actions
        elif wrapper_type == 'SIMPLE':
            self.env = JoypadSpace(self.env, SIMPLE_MOVEMENT)  # -> 7 actions
        else:
            self.env = JoypadSpace(self.env, RIGHT_ONLY)  # -> 5 actions
        self.action_size = self.env.action_space.n
        self.num_states = 1
        self.state_single_size = 80
        self.state_size = (self.state_single_size, self.state_single_size)
        self.action = self.env.action_space.sample()
        self.first_last_x_pos = self.env.env.env._x_position
        self.max_distance = self.first_last_x_pos
        self.DLModel = NNModel.DLModel(env=self.env,
                                       action_size=self.action_size,
                                       state_size=self.state_single_size,
                                       states=self.num_states,
                                       model_type=model_type,
                                       output_dir=self.others_dir)

    def append_new_frame(self):
        """ Save the env's rendered frame. """
        self.renders.append(
            img_as_ubyte(
                resize(self.env.render(mode='rgb_array'), (480, 480, 3))))

    def get_first_state(self):
        """
        :return: initial state
        """
        first_state = self.env.reset()
        return Utils.prepare_initial_state(first_state, self.state_size,
                                           channels=1)

    def get_first_x_pos(self):
        """
        :return: initial x position
        """
        return self.first_last_x_pos

    def prepare_state(self, next_state, channels=1):
        """
        Remove the upper image info, reduce channels, and reduce the image size.
        :param channels: number of layers
        :param next_state: state to process
        :return: preprocessed image generated by the env
        """
        return Utils.prepare_initial_state(next_state, self.state_size,
                                           channels=channels)

    def reset_max_distance(self):
        """ Reset the episode max distance. """
        self.max_distance = self.first_last_x_pos

    def update_max_distance(self, dist):
        """
        Update the episode max distance.
        :param dist: new distance
        :return: new max distance
        """
        if dist > self.max_distance:
            self.max_distance = dist
        return self.max_distance