def main():
    # env = gym_super_mario_bros.make('SuperMarioBros-v0')
    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v1')
    env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)

    timestart = datetime.datetime.fromtimestamp(time.time()).strftime('%Y-%m-%d-%H:%M:%S')
    # env = VideoRecorderWrapper(env, PROJ_DIR + "/../video", str(timestart), 50)
    env = VideoRecorderWrapper(env, PROJ_DIR + "/../video/final", str(timestart), 1)
    env = DownsampleEnv(env, (84, 84))
    env = PenalizeDeathEnv(env, penalty=-25)
    env = FrameStackEnv(env, 4)

    # good
    # act = deepq.load(PROJ_DIR + "/../models/mario_model_2018-08-12-13:00:58.pkl")
    # better
    act = deepq.load(PROJ_DIR + "/../models/mario_model_2018-08-12-19:21:50.pkl")

    episode = 0
    while True:
        obs, done = env.reset(), False
        stepnr = 0
        episode_rew = 0
        while not done:
            env.render()
            obs, rew, done, _ = env.step(act(obs[None])[0])
            if stepnr % 20 == 0:
                plot_obs(obs)
            episode_rew += rew
            stepnr += 1
        print("Episode reward", episode_rew, episode)
        episode += 1
def run(self, solution, level, render, mode):
    env = gym_super_mario_bros.make(level)
    env = BinarySpaceToDiscreteSpaceEnv(env, COMPLEX_MOVEMENT)
    done = True
    reason_finish = "no_more_commands"
    pos = 0
    total_r = 0
    for step in range(len(solution)):
        if done:
            state = env.reset()
        state, reward, done, info = env.step(solution[pos])
        pos += 1
        if reward == -15:  # Mario died
            reason_finish = "death"
            break
        if mode == "level" and info['flag_get']:
            reason_finish = "win"
            break
        total_r = total_r + reward
        if render == "true":
            env.render()
    env.close()
    return total_r, pos, info, reason_finish
def main(path="./models/deepq/mario_reward_1736.7.pkl"): step_mul = 16 steps = 200 FLAGS = flags.FLAGS flags.DEFINE_string("env", "SuperMarioBros-v0", "RL environment to train.") flags.DEFINE_string("algorithm", "deepq", "RL algorithm to use.") FLAGS(sys.argv) # 1. Create gym environment env = gym_super_mario_bros.make('SuperMarioBros-v0') env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT) act = deepq.load(path) nstack = 4 nh, nw, nc = env.observation_space.shape history = np.zeros((1, nh, nw, nc * nstack), dtype=np.uint8) obs, done = env.reset(), False # history = update_history(history, obs) episode_rew = 0 while not done: env.render() action = act([obs])[0] obs, rew, done, _ = env.step(action) # history = update_history(history, obs) episode_rew += rew print("action : %s reward : %s" % (action, rew)) print("Episode reward", episode_rew)
class MarioBrosEnvironment(AbstractEnvironment):
    def __init__(self, config):
        self.config = config
        if config.env == 'mario':
            from gym_super_mario_bros.actions import RIGHT_ONLY
            from nes_py.wrappers import BinarySpaceToDiscreteSpaceEnv
            import gym_super_mario_bros
            env = gym_super_mario_bros.make('SuperMarioBros-v0')
            self.env = BinarySpaceToDiscreteSpaceEnv(env, RIGHT_ONLY)
        elif config.env == 'montezuma':
            self.env = gym.make('MontezumaRevengeDeterministic-v0')
        self.state_buffer = []
        self.reward_buffer = []
        self.counter = 0

    def process_image(self, s, subsample=4):
        # skimage.transform.resize expects an integer output shape,
        # hence the floor division
        if self.config.env == 'mario':
            s = skimage.color.rgb2gray(s)
            s = skimage.transform.resize(
                s, (s.shape[0] // subsample, s.shape[1] // subsample),
                anti_aliasing=True, mode='constant')
            s = torch.from_numpy(s)
        elif self.config.env == 'montezuma':
            s = s[34:34 + 160, :160]
            s = skimage.color.rgb2gray(s)
            s = skimage.transform.resize(
                s, (s.shape[0] // subsample, s.shape[1] // subsample),
                anti_aliasing=True, mode='constant')
            s = torch.from_numpy(s).float()
        return s

    def reset(self):
        self.counter = 0
        self.state_buffer = []
        for _ in range(self.config.n_input_frames):
            state = self.process_image(self.env.reset(), self.config.image_subsample)
            self.state_buffer.append(state)
        return torch.stack(self.state_buffer)

    def step(self, action):
        total_reward = 0
        for _ in range(self.config.n_action_repeat):
            state, reward, done, info = self.env.step(action)
            total_reward += reward
            if done:
                break
        state = self.process_image(state, self.config.image_subsample)
        self.state_buffer.append(state)
        self.state_buffer = self.state_buffer[-self.config.n_input_frames:]
        return torch.stack(self.state_buffer), total_reward, done, info
class Environment:
    actionMap = {
        0: 'NOOP',
        1: 'Right',
        2: 'Right-Jump',
        3: 'Right-Sprint',
        4: 'Right-Jump-Sprint',
        5: 'Jump',
        6: 'Left'
    }

    def __init__(self, rows=19, columns=16, verbose=True, raw=True, variant=1):
        self.verbose = verbose
        self.raw = raw
        self.variant = variant
        self.img2state = Img2State(rows=rows, columns=columns)
        self.game = BinarySpaceToDiscreteSpaceEnv(
            gym_super_mario_bros.make('SuperMarioBros-v3'), SIMPLE_MOVEMENT)
        self.state = self.img2state.transfrom(self.game.reset(),
                                              raw=self.raw, variant=self.variant)
        self.reward = 0
        # Actions
        self.A = list(Environment.actionMap.keys())

    def step(self, action: int):
        if action not in self.A:
            raise ValueError('Invalid action: {}'.format(action))
        state, self.reward, done, info = self.game.step(action)
        self.state = self.img2state.transfrom(state, raw=self.raw, variant=self.variant)
        # Override the reward when the flag-reached / level-end features fire
        if done and self.state[8]:
            self.reward = 100
        elif self.state[8]:
            self.reward = 30
        elif self.state[9]:
            self.reward = 15
        if self.verbose:
            self.game.render()
        return done

    def reset(self):
        self.state = self.img2state.transfrom(self.game.reset(),
                                              raw=self.raw, variant=self.variant)
        self.reward = 0
class Environment(threading.Thread):
    stop_signal = False

    def __init__(self, render=False, eps_start=EPS_START, eps_end=EPS_STOP, eps_steps=EPS_STEPS):
        threading.Thread.__init__(self)
        self.render = render
        # Make the Super Mario gym environment and apply wrappers
        self.env = gym.make(ENV)
        self.env = BinarySpaceToDiscreteSpaceEnv(self.env, SIMPLE_MOVEMENT)
        self.env = preprocess.GrayScaleImage(self.env, height=HIGHT, width=WIDTH, grayscale=True)
        # self.env = wrappers.Monitor(self.env, "./Super_Mario_AI/videos", force=True, write_upon_reset=True)
        self.agent = Agent(TEMPERATURE)

    def runEpisode(self):
        s = self.env.reset()
        R = 0
        while True:
            time.sleep(THREAD_DELAY)  # yield
            if self.render:
                self.env.render()
            a = self.agent.act(s)
            s_, r, done, info = self.env.step(a)
            if done:  # terminal state
                s_ = None
            self.agent.train(s, a, r, s_)
            s = s_
            R += r
            if done or self.stop_signal:
                break
        print("Total R:", R)

    def run(self):
        while not self.stop_signal:
            self.runEpisode()

    def stop(self):
        self.stop_signal = True
class MarioEnv:
    def __init__(self, os='mac', display=False):
        self.display = display
        if os == 'mac' or os == 'linux':
            env = gym_super_mario_bros.make('SuperMarioBros-v0')
            self.env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)
        else:
            raise Exception("bad os")
        self.act_dim = self.env.action_space.n
        self.obs_dim = (1, 128, 128)
        print("env created with act_dim", self.act_dim, "obs_dim", self.obs_dim)
        self.transform = transforms.Compose([
            # chain two transforms together using a list
            transforms.ToTensor(),
            transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5))
        ])

    def reset(self):
        state = self.env.reset()
        return self.__resize_image(state)

    def step(self, action):
        state, reward, done, info = self.env.step(action)
        if reward == 0:
            reward = -0.5
        state_t = self.__resize_image(state)
        return state_t, \
            np.reshape(reward, -1), \
            np.reshape(done, -1)

    def close(self):
        self.env.close()

    def __resize_image(self, state):
        state_new = cv2.resize(state, (128, 128))
        img = Image.fromarray(state_new)
        state_t = self.transform(img)[0, :, :].unsqueeze(0)
        state_t = state_t.float().to(DEVICE)
        return state_t.unsqueeze(0)

    def render(self):
        if self.display:
            self.env.render()
def main():
    env = gym_super_mario_bros.make('SuperMarioBros-v0')
    env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)
    done = True
    max_step = 5000
    print(env.observation_space.shape)
    # On Windows, ascii=True is needed so tqdm does not break lines;
    # also pass max_step as total=, since the first positional argument
    # of tqdm is an iterable, not a count
    qbar = tqdm(total=max_step, ascii=True)
    for step in range(max_step):
        qbar.update()
        if done:
            state = env.reset()
        action = get_action(state, env.action_space)
        state, reward, done, info = env.step(action)
        if done:
            print(str(step) + " Hero, try again! " + str(info))
        env.render()
    env.close()
    qbar.close()
        cv2.resize(input, (600, 320), interpolation=cv2.INTER_NEAREST))
    if wait:
        cv2.waitKey(4)


def build_model(input_shape, actions):
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Dense(256, input_dim=4, activation='relu'))
    model.add(tf.keras.layers.Dense(64, activation='relu'))
    model.add(tf.keras.layers.Dense(actions, activation="softmax"))
    # build() returns None, so return the model itself
    model.build()
    return model


env = gym_super_mario_bros.make('SuperMarioBros-1-1-v2')
env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)
state = env.reset()
print(f"State shape: {state.shape}")
input = state_to_tf_input(state)
print(f"Input shape: {input.shape}")
build_model(input.shape, env.action_space.n)
done = False
step = 0
while not done and step < 5000:
    step += 1
    action = env.action_space.sample()
    state, reward, done, info = env.step(action)
    input = state_to_tf_input(state)
    print(f"{step}: {action} -> {reward}")
    show_input(input, True)
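# `state_to_tf_input` is defined elsewhere in this script; a minimal sketch of
# what such a helper might do (an assumption, not the original): grayscale and
# downsample the raw 240x256 RGB frame, scaled to [0, 1], which matches the
# small image that show_input blows back up with INTER_NEAREST.
import cv2
import numpy as np

def state_to_tf_input(state):
    gray = cv2.cvtColor(state, cv2.COLOR_RGB2GRAY)           # (240, 256) uint8
    small = cv2.resize(gray, (64, 60), interpolation=cv2.INTER_AREA)
    return small.astype(np.float32) / 255.0                  # scale to [0, 1]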
myRL = DQNAgent(gamma=GAMMA,
                initial_epsilon=INITIAL_EPSILON,
                final_epsilon=FINAL_EPSILON,
                decay_epsilon=DECAY_EPSILON,
                lr=LEARNING_RATE)

for episode in range(EPISODES):
    env = gym_super_mario_bros.make('SuperMarioBros-v0')
    env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)
    done = True
    action = 0  # action 'NOOP'
    for itera in range(ITERATION):
        if done:
            env.reset()
            oldObse, _, _, _ = env.step(0)  # get the initial state
            oldObse = myRL.pre_process(oldObse)
            # oldObse = (-1, 'NOOP', 0)
        action = myRL.chooseAction(oldObse)
        newObse, reward, done, info = env.step(action)
        newObse = myRL.pre_process(newObse)
        myRL.remember(state=oldObse, action=action, reward=reward,
                      next_state=newObse, done=done)
        oldObse = newObse
        if len(myRL.memory) > BATCH:
            myRL.learn_from_replay(BATCH)
        env.render()
    env.close()
class MarioEnvironment(Process):
    def __init__(self, env_id, is_render, env_idx, child_conn,
                 history_size=4, h=84, w=84):
        super(MarioEnvironment, self).__init__()
        self.daemon = True
        self.env = BinarySpaceToDiscreteSpaceEnv(
            gym_super_mario_bros.make(env_id), movement)
        self.is_render = is_render
        self.env_idx = env_idx
        self.steps = 0
        self.episode = 0
        self.rall = 0
        self.recent_rlist = deque(maxlen=100)
        self.child_conn = child_conn
        self.history_size = history_size
        self.history = np.zeros([history_size, h, w])
        self.h = h
        self.w = w
        self.reset()

    def run(self):
        super(MarioEnvironment, self).run()
        while True:
            action = self.child_conn.recv()
            if self.is_render:
                self.env.render()
            obs, reward, done, info = self.env.step(action)

            if life_done:
                # when Mario loses a life, treat the transition as terminal
                if self.lives > info['life'] and info['life'] > 0:
                    force_done = True
                    self.lives = info['life']
                else:
                    force_done = done
                    self.lives = info['life']
            else:
                # normal terminal state
                force_done = done

            # reward range is -15 ~ 15
            log_reward = reward / 15
            self.rall += log_reward
            r = log_reward

            self.history[:3, :, :] = self.history[1:, :, :]
            self.history[3, :, :] = self.pre_proc(obs)

            self.steps += 1

            if done:
                self.recent_rlist.append(self.rall)
                print("[Episode {}({})] Step: {} Reward: {} Recent Reward: {} Stage: {} current x:{} max x:{}"
                      .format(self.episode, self.env_idx, self.steps, self.rall,
                              np.mean(self.recent_rlist), info['stage'],
                              info['x_pos'], self.max_pos))
                self.history = self.reset()
            else:
                self.child_conn.send(
                    [self.history[:, :, :], r, False, done, log_reward])

    def reset(self):
        self.steps = 0
        self.episode += 1
        self.rall = 0
        self.lives = 3
        self.stage = 1
        self.max_pos = 0
        self.get_init_state(self.env.reset())
        return self.history[:, :, :]

    def pre_proc(self, X):
        # grayscaling
        x = cv2.cvtColor(X, cv2.COLOR_RGB2GRAY)
        # resize
        x = cv2.resize(x, (self.h, self.w))
        x = np.float32(x) * (1.0 / 255.0)
        return x

    def get_init_state(self, s):
        for i in range(self.history_size):
            self.history[i, :, :] = self.pre_proc(s)
class MarioEnv(Process):
    def __init__(self, env_id, idx, child_conn, queue, n_step, is_render=False):
        super(MarioEnv, self).__init__()
        self.idx = idx
        self.env_id = env_id
        self.child_conn = child_conn
        self.queue = queue
        self.is_render = is_render
        self.n_step = n_step
        self.steps = 0
        self.episodes = 0
        self.accum_reward = 0
        self.transition = []

    def run(self):
        super(MarioEnv, self).run()
        self.env = gym_super_mario_bros.make(self.env_id)
        self.env = BinarySpaceToDiscreteSpaceEnv(self.env, SIMPLE_MOVEMENT)
        self.reset()
        print('[ Worker %2d ] ' % (self.idx), end='')
        print('Playing <', self.env_id, '>')
        self.request_action(0, False)
        while True:
            action = self.child_conn.recv()
            next_state, reward, done, info = self.env.step(action)
            self.steps += 1
            self.accum_reward += reward
            next_state = rgb2dataset(next_state)
            if self.is_render and self.idx == 0:
                self.env.render()
            # make a transition out of the four most recent frames
            self.transition.append(next_state)
            if len(self.transition) > 4:
                self.transition.pop(0)
            if done:
                self.send_result(info['x_pos'])
                self.reset()
                self.request_action(reward, True)
            else:
                self.request_action(reward, False)

    def reset(self):
        state = self.env.reset()
        state = rgb2dataset(state)
        self.transition.clear()
        self.transition.append(state)
        self.steps = 0
        self.episodes += 1
        self.accum_reward = 0

    def request_action(self, reward, done):
        self.queue.put([self.idx, "OnStep", [self.transition, reward, done]])

    def send_result(self, x_pos):
        self.queue.put([
            self.idx, "Result",
            [self.episodes, self.steps, self.accum_reward, x_pos]
        ])
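# `rgb2dataset` is imported from elsewhere in this project; a minimal sketch of
# such a frame-preprocessing helper, assuming an 84x84 grayscale output in
# [0, 1] (an illustration, not the original implementation):
import cv2
import numpy as np

def rgb2dataset(frame, h=84, w=84):
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)            # drop color
    resized = cv2.resize(gray, (w, h), interpolation=cv2.INTER_AREA)
    return resized.astype(np.float32) / 255.0                 # normalize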
def replay_genome(genome, movements, gen):
    env_expanded = gym_super_mario_bros.SuperMarioBrosEnv(frames_per_step=1, rom_mode='vanilla')
    env = BinarySpaceToDiscreteSpaceEnv(env_expanded, movements)
    print('Number of genes: ', len(genome.connection_genes))
    for gene in genome.connection_genes:
        print(gene.in_node, gene.out_node, gene.weight,
              gene.innovation_number, gene.type, gene.enabled)

    done = True
    unticked = 0
    tick_interval = 1 / 30
    last_tick_time = time.time()
    fps = 0
    frames = 0
    last_fps_time = time.time()

    for _ in range(500000):
        unticked += time.time() - last_tick_time
        last_tick_time = time.time()
        ticked = False

        # while unticked >= tick_interval:
        if done:
            state = env.reset()
        state_downscaled = get_sensor_map(env_expanded)
        action = genome.calculate_action(state_downscaled)
        # print('\rFPS: {:.3f}'.format(fps), end=' ')
        # print(vectofixedstr(action, 10), end=' ')
        action = np.argmax(action)
        print('\rtaking action', movements[action], end='', flush=True)
        state, reward, done, info = env.step(action)
        # filename = get_path_of('all_pictures/mario/')
        # imsave(filename + 'mario_' + str(_) + '.png', state)

        # np.int is deprecated; plain int is equivalent here
        save_state = np.full((13, 10, 3), 255, dtype=int)
        COLORS = [[250, 250, 250], [0, 0, 0], [196, 0, 0], [0, 0, 196]]
        for i in range(13):
            for j in range(10):
                if state_downscaled[(i, j)] == -1:
                    save_state[(i, j)] = COLORS[3]
                elif state_downscaled[(i, j)] == 0:
                    save_state[(i, j)] = COLORS[0]
                else:
                    save_state[(i, j)] = COLORS[1]
        save_state[(7, 2)] = COLORS[2]
        # filename = get_path_of('all_pictures/input_downscaled/')
        # imsave(filename + 'state_' + str(_) + '.png', save_state.astype(np.uint8))
        # make_controller(movements[action], _, gen)

        env.render()
        if info["life"] <= 2:
            died = True
            break
        ticked = True
        frames += 1
        unticked -= tick_interval
        # if ticked:
        #     now = time.time()
        #     if now - last_fps_time >= 1:
        #         fps = frames / (now - last_fps_time)
        #         last_fps_time = now
        #         frames = 0
        # else:
        #     time.sleep(0.001)
    env.close()
    screen = np.ascontiguousarray(screen, dtype=np.float32) / 255
    screen = torch.from_numpy(screen)
    # Resize, and add a batch dimension (BCHW)
    return resize(screen).unsqueeze(0).to(device)


# The reward needs to be designed by hand
num_episodes = 5000
for i_episode in range(num_episodes):
    # Initialize the environment and state
    if i_episode == 4999:
        print("last episode")
    env.reset()
    picture, _, _, last_info = env.step(0)
    last_screen = get_screen()
    current_screen = get_screen()
    state = current_screen
    for t in count():
        action = select_action(state, last_info['x_pos'])
        picture, reward, done, info = env.step(action.item())
        if info['flag_get']:
            reward = 99999
        elif reward < 0 and reward != -15:
            pass
        elif (reward == -15) or (info['time'] < 20):  # dead
            reward = -99999
        else:
            # reward = info['x_pos'] + info['coins'] * 10 + info['score'] + info['time'] * 2
save_path = save_dir / AGENT_FILENAME
if Path.is_file(save_path):
    print("Loading saved agent...")
    agent.load(save_path)

done = False
batch_size = 32

for e in range(1, EPISODES + 1):
    state = env.reset()
    state = np.reshape(state, [1, state_size])
    time = 0
    while True:
        env.render()
        action = agent.act(state)
        next_state, reward, done, _ = env.step(action)
        reward = reward if not done else -10
        next_state = np.reshape(next_state, [1, state_size])
        agent.remember(state, action, reward, next_state, done)
        state = next_state
        if done or time >= 500:
            print("episode: {}/{}, score: {}, e: {:.2}".format(
                e, EPISODES, time, agent.epsilon))
            break
        time += 1
        if len(agent.memory) > batch_size:
            agent.replay(batch_size)
    if e % 10 == 0:
        agent.save(save_path)
model.compile(loss='mse', optimizer=Adam(lr=learning_rate))

done = False
batch_size = 16
"""
state = env.reset()
_, _, _, info = env.step(0)
state = np.reshape(info["enemy"], [1, state_size])
"""
# Iterate over all 8 worlds and 4 stages
for i in range(8):
    for j in range(4):
        env1 = gym_super_mario_bros.make('SuperMarioBros-' + str(i + 1) + '-' + str(j + 1) + '-v0')
        env = BinarySpaceToDiscreteSpaceEnv(env1, COMPLEX_MOVEMENT)
        state = env.reset()
        _, _, _, info = env.step(0)
        total_reward = 0
        reward_checkpoint = 3000
        state = np.reshape(info["enemy"], [1, state_size])
        # state = np.append(state, info["time"])
        # state = np.reshape(state, [1, state_size + 1])
        checkpoint = info["x_pos"] + 50
        done = False
        x0 = info["x_pos"]
        y0 = info["y_pos"]
        t0 = info["time"]
        t = 0
        for k in range(10):
            quieto = 0
            total_reward = 0
            while not done:
class MarioEnvironment(Process):
    def __init__(
            self,
            env_id,
            is_render,
            env_idx,
            child_conn,
            history_size=4,
            life_done=False,
            h=84,
            w=84,
            movement=COMPLEX_MOVEMENT,
            sticky_action=True,
            p=0.25):
        super(MarioEnvironment, self).__init__()
        self.daemon = True
        # use the movement argument rather than hardcoding COMPLEX_MOVEMENT
        self.env = BinarySpaceToDiscreteSpaceEnv(
            gym_super_mario_bros.make(env_id), movement)
        self.is_render = is_render
        self.env_idx = env_idx
        self.steps = 0
        self.episode = 0
        self.rall = 0
        self.recent_rlist = deque(maxlen=100)
        self.child_conn = child_conn
        self.life_done = life_done
        self.sticky_action = sticky_action
        self.last_action = 0
        self.p = p
        self.history_size = history_size
        self.history = np.zeros([history_size, h, w])
        self.h = h
        self.w = w
        self.reset()

    def run(self):
        super(MarioEnvironment, self).run()
        while True:
            action = self.child_conn.recv()
            if self.is_render:
                self.env.render()

            # sticky action: repeat the previous action with probability p
            if self.sticky_action:
                if np.random.rand() <= self.p:
                    action = self.last_action
                self.last_action = action

            # 4-frame skip
            reward = 0.0
            done = None
            for i in range(4):
                obs, r, done, info = self.env.step(action)
                if self.is_render:
                    self.env.render()
                reward += r
                if done:
                    break

            # when Mario loses a life, treat the transition as terminal
            if self.life_done:
                if self.lives > info['life'] and info['life'] > 0:
                    force_done = True
                    self.lives = info['life']
                else:
                    force_done = done
                    self.lives = info['life']
            else:
                force_done = done

            # reward range is -15 ~ 15
            log_reward = reward / 15
            self.rall += log_reward

            r = int(info.get('flag_get', False))

            self.history[:3, :, :] = self.history[1:, :, :]
            self.history[3, :, :] = self.pre_proc(obs)

            self.steps += 1

            if done:
                self.recent_rlist.append(self.rall)
                print("[Episode {}({})] Step: {} Reward: {} Recent Reward: {} Stage: {} current x:{} max x:{}".format(
                    self.episode, self.env_idx, self.steps, self.rall,
                    np.mean(self.recent_rlist), info['stage'],
                    info['x_pos'], self.max_pos))
                self.history = self.reset()

            self.child_conn.send(
                [self.history[:, :, :], r, force_done, done, log_reward])

    def reset(self):
        self.last_action = 0
        self.steps = 0
        self.episode += 1
        self.rall = 0
        self.lives = 3
        self.stage = 1
        self.max_pos = 0
        self.get_init_state(self.env.reset())
        return self.history[:, :, :]

    def pre_proc(self, X):
        # grayscaling
        x = cv2.cvtColor(X, cv2.COLOR_RGB2GRAY)
        # resize
        x = cv2.resize(x, (self.h, self.w))
        return x

    def get_init_state(self, s):
        for i in range(self.history_size):
            self.history[i, :, :] = self.pre_proc(s)
def run(self):
    global episode
    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v3')
    env = BinarySpaceToDiscreteSpaceEnv(env, REALLY_COMPLEX_MOVEMENT)
    step = 0

    while episode < EPISODES:
        done = False
        max_x = 40
        no_progress = 0
        score = 0
        state = env.reset()

        # Build the initial history with no-op actions
        for _ in range(5):
            next_state = state
            state, _, _, _ = env.step(0)
        state = preprocess(state)
        history = np.stack((state, state, state, state), axis=2)
        history = np.reshape([history], (1, 88, 128, 4))

        while not done:
            # Rendering seems to cause an error on macOS
            # if self.thread_num == 1:
            #     env.render()
            step += 1
            self.t += 1
            step_reward = 0
            action, policy = self.get_action(history)

            # Take 6 steps with the selected action, mimicking frame skip
            for _ in range(6):
                next_state, reward, done, info = env.step(action)
                score += reward
                step_reward += reward
                if done:
                    break

            # Kill Mario if he makes no progress for 10 seconds
            x_now = info.get('x_pos')
            # Handle the buggy x_pos == 65535 reading
            if x_now == 65535:
                x_now = max_x
            if max_x < x_now:
                max_x = x_now
                no_progress = 0
            else:
                no_progress += 1
            if no_progress == 150:
                done = True
                # reward -= 1
                step_reward -= 1
                score -= 1
                print("#", self.thread_num, " STUCK")

            # Preprocess each state
            next_state = preprocess(next_state)
            next_state = np.reshape([next_state], (1, 88, 128, 1))
            next_history = np.append(next_state, history[:, :, :, :3], axis=3)

            # Average policy max value
            self.avg_p_max += np.amax(
                self.actor.predict(np.float32(history / 255.)))

            # Append the sample
            self.append_sample(history, action, step_reward)
            history = next_history

            if self.t >= self.t_max or done:
                # if done:
                self.train_model(done)
                self.update_local_model()
                self.t = 0

            if done:
                # Record training information
                episode += 1
                print("#", self.thread_num, " episode:", episode,
                      " score:", format(score, '.2f'), " step:", step,
                      "max_x :", max_x)
                stats = [score, self.avg_p_max / float(step), step]
                for i in range(len(stats)):
                    self.sess.run(self.update_ops[i], feed_dict={
                        self.summary_placeholders[i]: float(stats[i])
                    })
                summary_str = self.sess.run(self.summary_op)
                self.summary_writer.add_summary(summary_str, episode + 1)
                self.avg_p_max = 0
                self.avg_loss = 0
                step = 0
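# `preprocess` is defined elsewhere in this project; the reshapes above imply
# an 88x128 single-channel output, so a compatible sketch could look like this
# (an assumption, not the original implementation):
import cv2

def preprocess(frame):
    gray = cv2.cvtColor(frame, cv2.COLOR_RGB2GRAY)
    # cv2.resize takes (width, height), hence (128, 88) for an 88x128 output
    return cv2.resize(gray, (128, 88), interpolation=cv2.INTER_AREA)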
env.render()

# Get the Q values for the current state
Q = model.predict(state, batch_size=1)
print(Q)

# With probability epsilon, take a random action instead of the greedy one
if np.random.rand(1) < epsilon:
    # print("Taking random action")
    action = env.action_space.sample()
else:
    # print("Taking non-random action")
    action = np.argmax(Q)

# Take the action
new_state, reward, game_over, _ = env.step(action)
new_state = process_image(new_state)
batch.append([state, action, reward, Q, new_state])
batch_count += 1
if batch_count == batch_size:
    process_batch(batch)
    batch = []
    batch_count = 0

# Explore more as we get closer to our current best score.
# This way we stick with what works, and only explore when we approach
# "uncharted territory".
# epsilon = 0.4 * ((reward + 300) / (max_episode_reward + 300))
episode_reward += reward
state = new_state
def main():
    movement = SIMPLE_MOVEMENT
    movement.append(['left', 'A'])
    movement.append(['left', 'B'])
    movement.append(['left', 'A', 'B'])
    # movement.append(['B'])
    # movement.append(['down'])
    # movement.append(['up'])

    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = BinarySpaceToDiscreteSpaceEnv(env, movement)

    # channels acts as the number of frames in the history.
    # If resize_height and final_height differ, assert final_height < resize_height;
    # the image will be cropped.
    channels = 4
    # width = 84
    # resize_height = 110
    # final_height = 84
    width = 128
    resize_height = 168
    final_height = 128

    size = [channels, final_height, width]
    batch_size = 16
    replay_capacity = 100000
    replay_dir = '/home/hansencb/mario_replay/'

    gamma = 0.95
    start_epsilon = 0.3
    stop_epsilon = 0.01
    epsilon_decay = 0.00025

    use_cuda = torch.cuda.is_available()
    torch.manual_seed(1)
    device = torch.device("cuda" if use_cuda else "cpu")

    model = simple_net(channels, len(movement), device).to(device)
    target_model = simple_net(channels, len(movement), device).to(device)

    data_file = 'data_loader'
    model_file = 'mario_agent'
    continue_train = True
    model.load_state_dict(torch.load(model_file))
    if continue_train:
        target_model.load_state_dict(torch.load(model_file))

    lr = 0.00005
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    total_reward_file = 'total_reward.txt'
    if not continue_train:
        with open(total_reward_file, 'w') as f:
            f.write('Reward\tSteps\n')

    max_steps = 5000
    num_eps = 5000

    if continue_train:
        with open(data_file, 'rb') as f:
            data = pickle.load(f)
        data.batch_size = batch_size
    else:
        data = dataset(replay_capacity, batch_size, replay_dir, size)

    # Initialize memory with 100 experiences
    done = True
    for i in range(100):
        if done:
            state = env.reset()
            state = preprocess(state, [resize_height, width], final_height)
            state = torch.cat((state, state, state, state))
        action = random.randint(0, len(movement) - 1)
        next_state, reward, done, info = env.step(int(action))
        # if reward > 0:
        #     reward = 1
        # else:
        #     reward = -1
        reward /= 15
        if reward == 0:
            reward = -0.1
        next_state = preprocess(next_state, [resize_height, width], final_height)
        next_state = torch.cat((state[1:, :, :], next_state))
        trans = transition(state, action, reward, next_state, done)
        data.add(trans)
        state = next_state

    tau = 0
    max_tau = 2000
    decay_step = 0
    farthest = 3000
    cur_x = 1

    # Training loop
    for episode in range(num_eps):
        print('Episode {}'.format(episode + 1))
        state = env.reset()
        state = preprocess(state, [resize_height, width], final_height)
        state = torch.cat((state, state, state, state))
        action = 0
        episode_reward = 0
        for step in range(max_steps):
            tau += 1
            # epsilon = stop_epsilon + (start_epsilon - stop_epsilon) * np.exp(-epsilon_decay * decay_step)
            epsilon = start_epsilon * np.exp(1 - (1 / (cur_x / farthest)))
            if epsilon < stop_epsilon:
                epsilon = stop_epsilon
            if random.random() < epsilon:
                action = random.randint(0, len(movement) - 1)
            else:
                q_val, action, q_vals = maxQ(state, model, device)

            next_state, reward, done, info = env.step(int(action))
            cur_x = info['x_pos']
            if cur_x > farthest:
                farthest = cur_x
            # if reward > 0:
            #     reward = 1
            # else:
            #     reward = -1
            reward /= 15
            if reward == 0:
                reward = -0.1
            episode_reward += reward

            next_state = preprocess(next_state, [resize_height, width], final_height)
            next_state = torch.cat((state[1:, :, :], next_state))
            trans = transition(state, action, reward, next_state, done)
            data.add(trans)

            batch = data.get_batch(model, target_model, device, gamma)
            loss, abs_err = train(model, device, optimizer, batch)
            data.update_batch(batch['idx'],
                              np.squeeze(abs_err.numpy()))

            state = next_state
            env.render()
            # time.sleep(0.03)

            if tau > max_tau:
                target_model.load_state_dict(model.state_dict())
                tau = 0

            if done:
                break

        decay_step += step
        with open(total_reward_file, 'a') as f:
            f.write('{}\t{}\n'.format(episode_reward, step))

        if episode % 5 == 0:
            with open(model_file, 'wb') as f:
                torch.save(model.state_dict(), f)
            with open(data_file, 'wb') as f:
                pickle.dump(data, f)

    env.close()
oldi = {
    'coins': 0,
    'flag_get': False,
    'life': 2,
    'score': 0,
    'stage': 1,
    'status': 'small',
    'time': 400,
    'world': 1,
    'x_pos': 40
}
while oldi['life'] == 2:
    action_val, gradients_val = sess.run(
        [action, gradients],
        feed_dict={X: obs.reshape(-1, 240, 256, 3)})
    obs, rwd, done, info = env.step(action_val[0][0])
    creward = reward(info, oldi)
    oldi = info
    current_rewards.append(creward)
    current_gradients.append(gradients_val)
    # env.render()
all_rewards.append(current_rewards)
all_gradients.append(current_gradients)

all_rewards = discnormrewards(all_rewards)
feed_dict = {}
for var_index, gradient_placeholder in enumerate(gradient_placeholders):
    mean_gradients = np.mean([
        reward * all_gradients[game_index][step][var_index]
        for game_index, rewards in enumerate(all_rewards)
        for step, reward in enumerate(rewards)
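# `discnormrewards` is defined elsewhere; its name suggests the standard
# REINFORCE helper that discounts each episode's rewards and normalizes them
# across all episodes. A common sketch, assuming a discount factor gamma
# (the 0.95 default here is an assumption):
import numpy as np

def discount_rewards(rewards, gamma):
    # accumulate discounted returns from the end of the episode backwards
    discounted = np.zeros(len(rewards))
    running = 0.0
    for t in reversed(range(len(rewards))):
        running = rewards[t] + gamma * running
        discounted[t] = running
    return discounted

def discnormrewards(all_rewards, gamma=0.95):
    all_discounted = [discount_rewards(r, gamma) for r in all_rewards]
    flat = np.concatenate(all_discounted)
    mean, std = flat.mean(), flat.std()
    # center and scale so the policy gradient pushes above-average actions up
    return [(d - mean) / std for d in all_discounted]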
if __name__ == "__main__": env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0') env = BinarySpaceToDiscreteSpaceEnv(env, REALLY_COMPLEX_MOVEMENT) agent = DQNAgent(action_size) total_rewards, episodes = [], [] for e in range(EPISODES): state = env.reset() step, total_reward = 0, 0 done = False for _ in range(8): start, _, _, _ = env.step(0) start = preprocess2(start) start = np.reshape(start, (1, 88, 128, 1)) history = np.stack((start, start, start, start), axis=3) history = np.reshape([history], (1, 88, 128, 4)) while not done: if agent.render: env.render() global_step += 1 step += 1 step_reward = 0 epsilon = agent.epsilon_now(e) action = agent.act(history) for _ in range(8): next_state, reward, done, _ = env.step(action) step_reward += reward
    ['A'],
    ['B'],
    ['right'],
    ['right', 'A'],
    ['right', 'B'],
    ['right', 'A', 'B'],
    ['left'],
    ['left', 'A'],
    ['left', 'B'],
    ['left', 'A', 'B'],
    # ['down'],
    # ['up']
]

_env = gym_super_mario_bros.make('SuperMarioBros-v0')
# _env = gym_super_mario_bros.SuperMarioBrosEnv(frames_per_step=1, rom_mode='rectangle')
env = BinarySpaceToDiscreteSpaceEnv(_env, movements)
env = DummyVecEnv([lambda: env])

model = PPO2(policy=CnnPolicy, env=env, verbose=1)
model.learn(total_timesteps=10000)

obs = env.reset()
while True:
    action, _info = model.predict(obs)
    obs, rewards, dones, info = env.step(action)
    print("training finished")
    print(rewards)
    env.render()
from nes_py.wrappers import BinarySpaceToDiscreteSpaceEnv
import gym_super_mario_bros
from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
import torch
import cv2

env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = BinarySpaceToDiscreteSpaceEnv(env, SIMPLE_MOVEMENT)

done = True
for step in range(5000):
    if done:
        state = env.reset()
    state, reward, done, info = env.step(env.action_space.sample())
    env.render()

env.close()
def main():
    movement = SIMPLE_MOVEMENT
    movement.append(['left', 'A'])
    movement.append(['left', 'B'])
    movement.append(['left', 'A', 'B'])
    # movement.append(['B'])
    # movement.append(['down'])
    # movement.append(['up'])

    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = BinarySpaceToDiscreteSpaceEnv(env, movement)

    # channels acts as the number of frames in the history.
    # If resize_height and final_height differ, assert final_height < resize_height;
    # the image will be cropped.
    channels = 3
    frames = 4
    width = 128
    resize_height = 180
    final_height = 128
    bottom_chop = 15

    epsilon = 0.0

    use_cuda = torch.cuda.is_available()
    torch.manual_seed(1)
    device = torch.device("cuda" if use_cuda else "cpu")

    model = simple_net(channels, len(movement), device).to(device)
    model_file = 'mario_agent'
    model.load_state_dict(torch.load(model_file))

    max_steps = 5000
    num_eps = 1

    for episode in range(num_eps):
        print('Episode {}'.format(episode + 1))
        state = env.reset()
        state = preprocess(state, [resize_height, width, 3], final_height, bottom_chop)
        state = torch.cat((state, state, state, state))
        action = 0
        episode_reward = 0
        for step in range(max_steps):
            if step % 3 == 0:
                if random.random() < epsilon:
                    action = random.randint(0, len(movement) - 1)
                else:
                    q_val, action, q_vals = maxQ(state, model, device)

            next_state, reward, done, info = env.step(int(action))
            if reward > 0:
                reward = 1
            else:
                reward = -1
            episode_reward += reward

            next_state = preprocess(next_state, [resize_height, width, 3], final_height, bottom_chop)
            next_state = torch.cat((state[3:, :, :], next_state))
            state = next_state

            env.render()
            time.sleep(0.03)

            if done:
                break
    env.close()
# Instantiate memory
memory = Memory(max_size=memory_size)

for i in range(pretrain_length):
    # If it's the first step
    if i == 0:
        state = env.reset()
        state, stacked_frames = stack_frames(stacked_frames, state, True)

    # Get the next_state, the reward, and done by taking a random action
    choice = random.randint(1, env.action_space.n) - 1
    action = possible_actions[choice]
    next_state, reward, done, _ = env.step(choice)
    # env.render()

    # Stack the frames
    next_state, stacked_frames = stack_frames(stacked_frames, next_state, False)

    # If the episode is finished (we're dead 3x)
    if done:
        # We finished the episode
        next_state = np.zeros(state.shape)
        # Add experience to memory
        memory.add((state, action, reward, next_state, done))
        # TODO
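# `stack_frames` is defined elsewhere; a common implementation keeps a deque of
# the last 4 processed frames and returns them stacked along the channel axis.
# A sketch, assuming a hypothetical `preprocess_frame` grayscale/resize helper
# and stack_size=4 (an illustration, not the original):
from collections import deque
import numpy as np

def stack_frames(stacked_frames, state, is_new_episode, stack_size=4):
    frame = preprocess_frame(state)  # hypothetical grayscale/resize helper
    if is_new_episode:
        # start a fresh stack by repeating the first frame
        stacked_frames = deque([frame] * stack_size, maxlen=stack_size)
    else:
        stacked_frames.append(frame)
    stacked_state = np.stack(stacked_frames, axis=2)
    return stacked_state, stacked_frames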
## Base model to run the game, using random movements
from nes_py.wrappers import BinarySpaceToDiscreteSpaceEnv
from aux import *
import gym_super_mario_bros
from gym_super_mario_bros.actions import COMPLEX_MOVEMENT

env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = BinarySpaceToDiscreteSpaceEnv(env, COMPLEX_MOVEMENT)

done = True
oldi = {
    'coins': 0,
    'flag_get': False,
    'life': 2,
    'score': 0,
    'stage': 1,
    'status': 'small',
    'time': 400,
    'world': 1,
    'x_pos': 40
}

for step in range(100):
    if done:
        state = env.reset()
    state, rwd, done, info = env.step(1)  # env.action_space.sample()
    print(reward(info, oldi), "vs", rwd)
    print(env.observation_space.shape)
    oldi = info
    env.render()

env.close()
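# `reward` comes from `aux`; the print above compares it against the env's
# built-in reward, which gym-super-mario-bros defines as the sum of the change
# in x_pos, the clock penalty, and a death penalty. A sketch that reproduces
# that shape from consecutive info dicts (an assumption, not the aux module):
def reward(info, oldi):
    dx = info['x_pos'] - oldi['x_pos']               # progress to the right
    dt = info['time'] - oldi['time']                 # clock penalty (<= 0)
    death = -15 if info['life'] < oldi['life'] else 0  # penalty for dying
    return dx + dt + death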
class Game:
    def __init__(self, game_id, obs_size, skip_frame=4, mode='train'):
        self.game_id = game_id
        env = gym_super_mario_bros.make(game_id)
        temp_obs = env.reset()
        height, width, _ = temp_obs.shape
        self.env = BinarySpaceToDiscreteSpaceEnv(env, COMPLEX_MOVEMENT)

        self.obs_last2max = np.zeros((2, obs_size, obs_size, 1), np.uint8)
        self.obstack = np.zeros((obs_size, obs_size, 4))
        self.rewards = []
        self.lives = 3
        self.skip = skip_frame
        self.mode = mode
        if self.mode == 'play':
            self.monitor = Monitor(width=width, height=height)

    def step(self, action, monitor=False):
        reward = 0.0
        done = False
        for i in range(self.skip):
            obs, r, done, info = self.env.step(action)
            if self.mode == 'play':
                self.monitor.record(obs)
            # keep the two most recent frames for a max-pool over time
            if i >= 2:
                self.obs_last2max[i % 2] = self._process_obs(obs)

            # Super Mario's reward is clipped to [-15.0, 15.0]
            reward += r / 15.0

            lives = info['life']
            if lives < self.lives:
                done = True
            self.lives = lives

            if done:
                break

        self.rewards.append(reward)

        if done:
            episode_info = {
                "reward": sum(self.rewards),
                "length": len(self.rewards)
            }
            self.reset()
        else:
            episode_info = None

        obs = self.obs_last2max.max(axis=0)
        self.obstack = np.roll(self.obstack, shift=-1, axis=-1)
        self.obstack[..., -1:] = obs

        return self.obstack, reward, done, episode_info

    def reset(self):
        obs = self.env.reset()
        obs = self._process_obs(obs)
        # fill all four frames of the stack with the initial observation
        self.obstack[..., 0:] = obs
        self.obstack[..., 1:] = obs
        self.obstack[..., 2:] = obs
        self.obstack[..., 3:] = obs
        self.rewards = []
        self.lives = 3
        return self.obstack

    @staticmethod
    def _process_obs(obs):
        obs = cv2.cvtColor(obs, cv2.COLOR_RGB2GRAY)
        obs = cv2.resize(obs, (84, 84), interpolation=cv2.INTER_AREA)
        return obs[:, :, None]
    ['right'],
    ['right', 'A'],
    ['right', 'B'],
    ['right', 'A', 'B'],
    ['A'],
    ['left'],
    ['left', 'A'],
    ['left', 'B'],
    ['left', 'A', 'B'],
    ['down'],
]

env = gym_super_mario_bros.make('SuperMarioBros-v0')
env = BinarySpaceToDiscreteSpaceEnv(env, COMPLEX_MOVEMENT)

done = True
for i in range(1000):
    if done:
        state = env.reset()
    action = env.action_space.sample()
    observation, reward, done, info = env.step(action)
    print(state)
    print(reward)
    # env.render()

env.close()
def main():
    movement = SIMPLE_MOVEMENT
    movement.append(['left', 'A'])
    movement.append(['left', 'B'])
    movement.append(['left', 'A', 'B'])
    movement.append(['B'])
    movement.append(['down'])
    movement.append(['up'])

    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = BinarySpaceToDiscreteSpaceEnv(env, movement)

    # channels acts as the number of frames in the history.
    # If resize_height and final_height differ, assert final_height < resize_height;
    # the image will be cropped.
    channels = 4
    width = 84
    resize_height = 110
    final_height = 84

    size = [channels, final_height, width]
    batch_size = 32
    replay_capacity = 100000
    replay_dir = '/home/hansencb/mario_replay/'

    epsilon = 1
    gamma = 0.9

    use_cuda = torch.cuda.is_available()
    torch.manual_seed(1)
    device = torch.device("cuda" if use_cuda else "cpu")

    model = simple_net(channels, len(movement), device).to(device)
    target_model = simple_net(channels, len(movement), device).to(device)

    model_file = 'mario_agent'
    model.load_state_dict(torch.load(model_file))
    target_model.load_state_dict(torch.load(model_file))

    lr = 0.001
    optimizer = torch.optim.Adam(model.parameters(), lr=lr)

    total_reward_file = 'total_reward.txt'
    with open(total_reward_file, 'w') as f:
        f.write('Reward\tSteps\n')

    max_steps = 5000
    num_eps = 1000

    data = dataset(replay_capacity, batch_size, replay_dir, 1, size)

    for episode in range(num_eps):
        print('Episode {}'.format(episode + 1))
        state = env.reset()
        state = preprocess(state, [resize_height, width], final_height)
        state = torch.cat((state, state, state, state))
        action = 0
        episode_reward = 0
        for step in range(max_steps):
            if step % 3 == 0:
                if random.random() < epsilon:
                    action = random.randint(0, len(movement) - 1)
                else:
                    q_val, action = maxQ(state, model, device)

            next_state, reward, done, info = env.step(int(action))
            if reward > 0:
                reward = 1
            else:
                reward = -1
            episode_reward += reward

            next_state = preprocess(next_state, [resize_height, width], final_height)
            next_state = torch.cat((state[1:, :, :], next_state))
            trans = transition(state, action, reward, next_state, done)
            data.add(trans)

            train(model, device, optimizer, data.get_batch(model, device, gamma))

            state = next_state
            env.render()
            # time.sleep(0.03)

            if done:
                with open(total_reward_file, 'a') as f:
                    f.write('{}\t{}\n'.format(episode_reward, step))
                break

        epsilon -= (1 / num_eps)

        if episode % 10 == 0:
            target_model.load_state_dict(model.state_dict())
            with open(model_file, 'wb') as f:
                torch.save(model.state_dict(), f)

    env.close()