def play(self):
    env = gym_tetris.make('TetrisA-v0')
    env = JoypadSpace(env, MOVEMENT)
    state = env.reset()
    model = self.global_model
    model_path = os.path.join(self.save_dir, 'model_{}.h5'.format('Tetris'))
    print('Loading model from: {}'.format(model_path))
    model.load_weights(model_path)
    done = False
    step_counter = 0
    reward_sum = 0
    pieza_colocada = True  # "piece placed": a new piece needs a fresh target
    informacion = env.get_info()
    antiguo_statistics = informacion['statistics']
    state = [0, 0, 0, 0]
    while not done:
        env.render()
        if pieza_colocada:
            # a new piece spawned: ask the model for a target column and rotation
            pieza_colocada = False
            pos = 5
            giro = 0
            state = [state]
            policy, value = model(tf.convert_to_tensor(state, dtype=tf.float32))
            policy = tf.nn.softmax(policy)
            action = np.argmax(policy)
            pos_objetivo = action % 10    # target column
            giro_objetivo = action // 10  # target rotation
        if giro % 4 != giro_objetivo and not done:
            # rotate until the orientation (mod 4) matches the target; the
            # original condition took giro modulo giro_objetivo and crashed
            # whenever giro_objetivo was 0
            state, reward, done, info = env.step(1)
            accion = 0
            giro = giro + 1
        elif pos > pos_objetivo and not done:
            state, reward, done, info = env.step(6)  # move left
            pos = pos - 1
            accion = 0
        elif pos < pos_objetivo and not done:
            state, reward, done, info = env.step(3)  # move right
            pos = pos + 1
            accion = 0
        elif not done and not pieza_colocada:
            state, reward, done, info = env.step(9)  # soft drop
            accion = 9
        else:
            accion = 0
        if not done:
            state, reward, done, info = env.step(accion)
            env.render()
        informacion = env.get_info()
        if antiguo_statistics != informacion['statistics']:
            # the piece statistics changed, so the piece locked and a new one spawned
            antiguo_statistics = informacion['statistics']
            pieza_colocada = True
            step_counter += 1
    env.close()

class Worker(object):
    def __init__(self, genome, config):
        self.genome = genome
        self.config = config

    @property
    def work(self):
        self.env = gym_tetris.make('TetrisA-v0')
        self.env = JoypadSpace(self.env, SIMPLE_MOVEMENT)
        self.env.reset()
        action = np.argmax(self.env.action_space.sample())
        ob, _, _, _ = self.env.step(int(action))
        done = False
        # net = neat.nn.FeedForwardNetwork.create(self.genome, self.config)
        net = neat.nn.recurrent.RecurrentNetwork.create(self.genome, self.config)
        fitness = 0
        frames = 0
        while not done:
            scaledimg = cv2.cvtColor(ob, cv2.COLOR_BGR2RGB)
            ob = Minimize(ob)
            ob = cv2.resize(ob, (10, 20))  # shrink the board to 10x20 cells
            cv2.imshow('humanwin', scaledimg)
            cv2.waitKey(1)
            imgarray = np.ndarray.flatten(ob)
            actions = net.activate(imgarray)
            action = np.argmax(actions)
            ob, rew, done, info = self.env.step(int(action))
            frames += 1
            if frames == 1200:
                # award one fitness point for every 1200 frames survived
                fitness += 1
                frames = 0
        print(f"genome:{self.genome.key} Fitness: {fitness} "
              f"lines: {info['number_of_lines']}")
        return int(fitness)

class Worker(object):
    def __init__(self, genome, config):
        self.genome = genome
        self.config = config
        # crop window (in 8-pixel tiles) cut out of the 240x256 frame
        self.x = 13
        self.y = 15
        self.w = 18
        self.h = 17

    def work(self):
        self.env = gym_super_mario_bros.make('SuperMarioBros-v3')
        self.env = JoypadSpace(self.env, RIGHT_ONLY)
        self.env.reset()
        observation, _, _, _ = self.env.step(self.env.action_space.sample())
        done = False
        net = neat.nn.FeedForwardNetwork.create(self.genome, self.config)
        max_fitness = 0
        fitness = 0
        counter = 0
        while not done:
            # crop the frame to the play area, then downscale and grayscale it
            observation = observation[self.y * 8:self.y * 8 + self.h * 8,
                                      self.x * 8:self.x * 8 + self.w * 8]
            observation = cv2.resize(observation, (self.w, self.h))
            observation = cv2.cvtColor(observation, cv2.COLOR_BGR2GRAY)
            imgarray = np.ndarray.flatten(observation)
            imgarray = np.interp(imgarray, (0, 254), (-1, +1))  # scale to [-1, 1]
            nnOutput = net.activate(imgarray)
            observation, reward, done, info = self.env.step(
                nnOutput.index(max(nnOutput)))
            fitness += int(reward)
            if fitness > max_fitness:
                max_fitness = fitness
                counter = 0
            else:
                counter += 1
            if done or counter > 350 or info['life'] < 2:
                done = True
                fitness += info['score']
                if info['flag_get']:
                    fitness += 100000
        return fitness

class Mario():
    def __init__(self, img_size=32, stacks=4, skips=4, return_seq=False):
        from nes_py.wrappers import JoypadSpace
        import gym_super_mario_bros
        from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
        env = gym_super_mario_bros.make('SuperMarioBros-v2')
        self.env = JoypadSpace(env, SIMPLE_MOVEMENT)
        self.preprocess = Preprocess(img_size, stacks, return_seq)
        self.skips = skips
        self.action_space = self.env.action_space
        self.observation_space = (img_size, img_size, stacks)

    def reset(self):
        self.preprocess.reset()
        s = self.env.reset()
        s = self.preprocess(s)
        return s

    def step(self, a):
        # repeat the action for `skips` frames, accumulating the reward
        total_r = 0
        for i in range(self.skips):
            self.env.render()
            n_s, r, done, info = self.env.step(a)
            n_s = self.preprocess(n_s)
            total_r += r
            if done:
                break
        return n_s, total_r, done, info

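# Minimal usage sketch for the Mario wrapper above (not from the original
# source): one random-policy episode. Assumes the Preprocess helper used in
# __init__ is importable from the surrounding project.
if __name__ == '__main__':
    env = Mario(img_size=32, stacks=4, skips=4)
    s = env.reset()
    done = False
    total_r = 0
    while not done:
        s, r, done, info = env.step(env.action_space.sample())
        total_r += r
    print('episode reward:', total_r)
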
def play_mario():
    from nes_py.wrappers import JoypadSpace
    import gym_super_mario_bros
    from gym_super_mario_bros.actions import SIMPLE_MOVEMENT
    env = gym_super_mario_bros.make('SuperMarioBros-v3')
    env = JoypadSpace(env, SIMPLE_MOVEMENT)
    env.reset()
    done = False
    step = -1
    while not done:
        step += 1
        time.sleep(1 / 100)
        env.render()
        # print(step)
        action = env.action_space.sample()
        # action = 0
        # if keyboard.is_pressed('a'):
        #     action = 4
        obs, reward, done, info = env.step(action)
        print(obs.shape)

def test_env(env, model, device, deterministic=True):
    env = gym_super_mario_bros.make('SuperMarioBros-v0')
    env = JoypadSpace(env, SIMPLE_MOVEMENT)
    env = RewardScalar(env)
    env = WarpFrame(env)
    env = FrameStack(env, 4)
    env = StochasticFrameSkip(env, 4, 0.5)
    env = ScaledFloatFrame(env)
    # env = gym.wrappers.Monitor(env, 'recording/PPORB5/{}'.format(str(num)),
    #                            video_callable=lambda episode_id: True, force=True)
    state = env.reset()
    done = False
    total_reward = 0
    distance = []
    for i in range(2000):
        state = torch.FloatTensor(state).to(device)
        state = state.float()
        state = state.permute(3, 0, 1, 2)  # move stacked frames to the channel axis
        dist, _ = model(state)
        policy = Categorical(F.softmax(dist, dim=-1).data.cpu())
        actionLog = policy.sample()
        action = actionLog.numpy()
        next_state, reward, done, info = env.step(action[0])
        distance.append(info['x_pos'])
        state = next_state
        total_reward += reward
        env.render()
        if done:
            break
    print(total_reward)
    print(max(distance))

class MarioEnvWrapper(GymEnvWrapper, TensorStateMixin):
    max_steps = 10  # TODO: Fix this
    reward_range = (-100, 100)  # TODO: Fix this

    def __init__(self):
        super().__init__()
        self.env = gym_super_mario_bros.make("SuperMarioBros-v0")
        self.env = JoypadSpace(self.env, COMPLEX_MOVEMENT)
        self.history_size = 3
        self.action_repeats = 6

    @timeout_lost
    @step_incrementer
    def step(self, action: int, **kwargs) -> Tuple[Any, Any, bool, dict]:
        for _ in range(self.action_repeats):
            frame, self.reward, self.done, self.info = self.env.step(action)
            self.state = prepare_multi_state(self.state, frame)
            if self.done:
                break
        return self.state, self.reward, self.done, self.info

    @reset_incrementer
    def reset(self):
        frame = self.env.reset()
        self.state = prepare_initial_state(frame, self.history_size)
        self.done = False
        return self.state

    def get_legal_actions(self):
        return list(range(12))

def run(file):
    config = neat.Config(neat.DefaultGenome, neat.DefaultReproduction,
                         neat.DefaultSpeciesSet, neat.DefaultStagnation,
                         'config-feedforward')
    genome = pickle.load(open(file, 'rb'))
    # print(genome)
    # v2 (downsampled palette) drives the network; v0 is only rendered for viewing
    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v2')
    env = JoypadSpace(env, RIGHT_ONLY)
    env1 = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env1 = JoypadSpace(env1, RIGHT_ONLY)
    net = neat.nn.FeedForwardNetwork.create(genome, config)
    try:
        obs = env.reset()
        env1.reset()
        inx = int(obs.shape[0] / 8)
        iny = int(obs.shape[1] / 8)
        done = False
        while not done:
            # env.render()
            env1.render()
            obs = cv2.resize(obs, (inx, iny))
            obs = cv2.cvtColor(obs, cv2.COLOR_BGR2GRAY)
            obs = np.reshape(obs, (inx, iny))
            imgarray = np.ndarray.flatten(obs)
            actions = net.activate(imgarray)
            action = np.argmax(actions)
            _, _, _, info1 = env1.step(action)
            s, reward, done, info = env.step(action)
            xpos = info['x_pos']
            print(done, action, xpos)
            obs = s
        env1.close()
        env.close()
    except KeyboardInterrupt:
        env.close()
        env1.close()
        exit()

def fitness_func(self, genome, config, o):
    # create the environment
    game = gym_super_mario_bros.make('SuperMarioBros-v2')
    env = JoypadSpace(game, SIMPLE_MOVEMENT)
    try:
        # reset environment and create network from config file
        state = env.reset()
        neural_net = neat.nn.recurrent.RecurrentNetwork.create(genome, config)
        # frame count
        i = 0
        # starting mario position
        start_mario_distance = 40
        done = False
        # get shape of pixels
        inx, iny, inc = env.observation_space.shape
        inx, iny = int(inx / 8), int(iny / 8)
        while not done:
            # env.render()  # uncomment this to see mario play
            # resize image array and convert to grayscale
            state = cv2.resize(state, (inx, iny))
            state = cv2.cvtColor(state, cv2.COLOR_BGR2GRAY)
            state = np.reshape(state, (inx, iny))
            # flatten array so the network likes it
            state = state.flatten()
            # feed the state through the network and get max output
            output = neural_net.activate(state)
            action = output.index(max(output))
            # do the action from the net
            observation, reward, done, info = env.step(action)
            state = observation
            # increase frame count
            i += 1
            # every 50 frames, check whether mario has moved;
            # break from the loop to restart if he hasn't
            if i % 50 == 0:
                if start_mario_distance == info['x_pos']:
                    break
                else:
                    start_mario_distance = info['x_pos']
        # give a negative reward if mario didn't move, else reward the distance he moved
        fitness = -1 if info['x_pos'] <= 40 else info['x_pos']
        # if at the end of the level, dump the current genome to file
        if fitness >= 4000:
            pickle.dump(genome, open("winning_genome.pkl", "wb"))
        # put the current fitness into the queue
        o.put(fitness)
        env.close()
    except KeyboardInterrupt:
        env.close()
        sys.exit()

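# Hypothetical driver for the queue-based fitness function above (the
# `trainer` instance and the generation count are assumptions, not from the
# original source): each genome is evaluated in its own process, and its
# fitness is read back from the queue.
import multiprocessing as mp

def eval_genomes(genomes, config):
    for genome_id, genome in genomes:
        queue = mp.Queue()
        proc = mp.Process(target=trainer.fitness_func,
                          args=(genome, config, queue))
        proc.start()
        proc.join()
        genome.fitness = queue.get()

# population = neat.Population(config)
# winner = population.run(eval_genomes, 50)
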
def main(): """ Main entry point function for program. """ env = gym_super_mario_bros.make('SuperMarioBros-v0') env = JoypadSpace(env, RIGHT_ONLY) action_size = len(RIGHT_ONLY) cdqn = CDQN(action_size, memory_size=10000, image_shape=(45, 64, 1)) batch_size = 1024 games = 10000 skip = 100 beaten = False for game in range(games): print("Game: {}".format(game + 1), end=" ") done = True total_reward = 0 for step in range(8000): # Preprocess first image if done: state = env.reset() state = preprocess_image(state)[..., tf.newaxis] # Play move action = cdqn.act(state) next_state, reward, done, info = env.step(action) total_reward += reward # Remember move next_state = preprocess_image(next_state)[..., tf.newaxis] cdqn.remember(state, action, total_reward, next_state, done) state = next_state # Render game env.render() if done: break # Train when there are enough examples in memory #if len(cdqn.memory) >= batch_size and step % skip == 0: print("Reward: {}".format(total_reward)) for e in range(5): print('Epoch {}'.format(e + 1)) cdqn.experience_replay(batch_size) if game % 10 == 0: cdqn.update_target_model() print("Reward: {}".format(total_reward)) tf.saved_model.save(cdqn.network, "model.sav") env.close()
def rank(self):
    for model in self.models:
        env = gym_tetris.make('TetrisA-v0')
        env = JoypadSpace(env, SIMPLE_MOVEMENT)
        env.reset()
        done = False
        info = None
        while not done:
            # Generate all options
            options = [[Action.DROP]]
            for x in range(1, 5):
                left_option = [Action.LEFT] * x
                left_option.append(Action.DROP)
                options.append(left_option)
                right_option = [Action.RIGHT] * x
                right_option.append(Action.DROP)
                options.append(right_option)
            # Enumerate all choices
            boards = []
            for option in options:
                # Back up the environment first
                env.unwrapped._backup()
                # Run the sequence of actions
                state = None
                for action in option:
                    state, _, _, _ = env.step(action.value)
                # Now, parse the board from the state
                board = parse_blocks(state)
                boards.append(board)
                env.unwrapped._restore()
            # Choose the best option genetically
            choice = model.best(boards)
            for action in options[choice]:
                _, _, done, info = env.step(action.value)
        model.fitness = info['score']
    self.models = sorted(self.models, key=lambda model: model.fitness)

def run_player(self, member):
    env = gym_super_mario_bros.make(self.env)
    env = JoypadSpace(env, self.actions)
    env = WarpFrame(env)
    env = FrameStack(env, 4)
    player = MarioPlayer(self.num_of_actions, member.genes)
    if self.record:
        rec_output_path = os.path.join(
            self.current_gen_output_dir, "vid",
            "{name}.mp4".format(name=member.get_name()))
        rec = monitor.video_recorder.VideoRecorder(env, path=rec_output_path)
    state = env.reset()
    done = False
    last_x_pos = 0
    same_x_pos_count = 0
    for step in range(self.steps_scale):
        if done:
            break
        action = player.act(state)
        state, reward, done, info = env.step(action)
        if self.record:
            rec.capture_frame()
        if self.render:
            env.render()
        player.update_info(info)
        player.update_reward(reward)
        if last_x_pos == info['x_pos']:
            same_x_pos_count += 1
        else:
            same_x_pos_count = 0
            last_x_pos = info['x_pos']
        if same_x_pos_count > self.standing_steps_limit:
            # end the run if the player doesn't advance
            done = True
        if not self.allow_death and info['life'] < INITIAL_LIFE:
            # death would only repeat, so stop early
            done = True
        if info['flag_get']:
            # reaching the flag ends the run
            done = True
    if self.record:
        rec.close()
    env.close()
    member.set_fitness_score(player.calculate_fitness())
    outcome = player.get_run_info()
    outcome['generation'] = self.generation
    outcome['index'] = member.get_name()
    return outcome

class Worker(object):
    def __init__(self, genome, config):
        self.genome = genome
        self.config = config
        env = gym_super_mario_bros.make('SuperMarioBros-1-1-v2')
        self.env = JoypadSpace(env, RIGHT_ONLY)

    def work(self):
        ob = self.env.reset()
        inx = int(ob.shape[0] / 8)
        iny = int(ob.shape[1] / 8)
        done = False
        net = neat.nn.FeedForwardNetwork.create(self.genome, self.config)
        xpos = 0
        xpos_max = 0
        counter = 0
        while not done:
            # cv2.namedWindow("main", cv2.WINDOW_NORMAL)
            ob = cv2.resize(ob, (inx, iny))
            ob = cv2.cvtColor(ob, cv2.COLOR_BGR2GRAY)
            ob = np.reshape(ob, (inx, iny))
            imgarray = np.ndarray.flatten(ob)
            # print("Test", self.env.action_space)
            actions = net.activate(imgarray)
            action = np.argmax(actions)
            ob, rew, done, info = self.env.step(action)
            xpos = info['x_pos']
            if xpos > xpos_max:
                xpos_max = xpos
                counter = 0
            else:
                counter += 1
            if counter > 250:
                # no progress for 250 steps: give up
                done = True
            if info['flag_get']:
                print("Finished")
                done = True
        print("Worker Fitness: {}".format(xpos))
        return int(xpos)

class Player(object):
    def __init__(self, genome, config, record):
        self.genome = genome
        self.config = config
        self.x = 13
        self.y = 15
        self.w = 18
        self.h = 17
        self.record = record

    def play(self):
        self.env = gym_super_mario_bros.make('SuperMarioBros-v3')
        self.env = JoypadSpace(self.env, RIGHT_ONLY)
        self.env.reset()
        observation, _, _, _ = self.env.step(self.env.action_space.sample())
        done = False
        net = neat.nn.FeedForwardNetwork.create(self.genome, self.config)
        fourcc = cv2.VideoWriter_fourcc(*'XVID')
        fourcc2 = cv2.VideoWriter_fourcc(*'XVID')
        mainVideo = cv2.VideoWriter('./Videos/mainWindow.avi', fourcc, 60.0,
                                    (256, 240))
        smallWindow = cv2.VideoWriter('./Videos/smallWindow.avi', fourcc2, 60.0,
                                      (self.w * 8, self.h * 8))
        while not done:
            self.env.render()
            frame = observation
            observation = observation[self.y * 8:self.y * 8 + self.h * 8,
                                      self.x * 8:self.x * 8 + self.w * 8]
            if self.record:
                mainVideo.write(frame)
                smallWindow.write(observation)
            cv2.imshow('main', observation)
            observation = cv2.resize(observation, (self.w, self.h))
            observation = cv2.cvtColor(observation, cv2.COLOR_BGR2GRAY)
            imgarray = np.ndarray.flatten(observation)
            imgarray = np.interp(imgarray, (0, 254), (-1, +1))
            nnOutput = net.activate(imgarray)
            observation, reward, done, info = self.env.step(
                nnOutput.index(max(nnOutput)))
        mainVideo.release()
        smallWindow.release()

def mario(v, lock):
    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = JoypadSpace(env, COMPLEX_MOVEMENT)
    done = True
    while True:
        if done:
            env.reset()
            with lock:
                v.value = 0
        with lock:
            u = v.value  # read the shared action chosen by another process
        _, _, done, _ = env.step(u)
        env.render()
        sleep(0.01)

class Agent:
    def __init__(self, height, width, env_name='SuperMarioBros-v0'):
        # Create gym environment
        self.env = gym_super_mario_bros.make(env_name)
        # Adding actions to the environment
        self.env = JoypadSpace(self.env, SIMPLE_MOVEMENT)
        self.num_actions = self.env.action_space.n
        # Define state as a queue of the last four frames
        self.obs = deque(maxlen=4)
        self.height = height
        self.width = width
        # Initialize state with empty frames
        for _ in range(4):
            self.obs.append(np.zeros((height, width)))
        self.env.reset()

    def randomAction(self):
        return random.randint(0, self.num_actions - 1)

    def play(self, act, curr_time, skip_frame=4):
        current_state = self.obs.copy()
        current_state = np.array(current_state)
        current_state = current_state.transpose(1, 2, 0)
        r = 0
        for _ in range(0, skip_frame):
            state, reward, done, info = self.env.step(act)
            r = r + reward
            # penalize and stop when the episode ends or the level timer resets
            if done or info['time'] <= 1 or info['time'] > curr_time:
                r = r + (-100)
                done = True
                break
        curr_time = info['time']
        state = resize(Utils.pre_process(state), (self.height, self.width),
                       anti_aliasing=True)
        self.obs.append(state)
        next_state = self.obs.copy()
        next_state = np.array(next_state)
        next_state = next_state.transpose(1, 2, 0)
        return current_state, next_state, r, done, curr_time

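# Hypothetical smoke test for the Agent above (the 84x84 frame size and the
# random policy are assumptions, not from the original source): plays one
# episode with frame skipping and stacked observations.
def agent_smoke_test():
    agent = Agent(height=84, width=84)
    done = False
    curr_time = 400  # Mario's level timer starts at 400
    while not done:
        action = agent.randomAction()
        state, next_state, r, done, curr_time = agent.play(action, curr_time)
        # state and next_state are (84, 84, 4) stacks of the last four frames
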
def main():
    env = gym_super_mario_bros.make('SuperMarioBros-v0')
    env = JoypadSpace(env, SIMPLE_MOVEMENT)
    for e in range(100):
        state = env.reset()
        done = False  # reset per episode; otherwise only the first episode runs
        while not done:
            env.render()
            state, reward, done, info = env.step(env.action_space.sample())
    env.close()

def contra_game_render():
    env = gym.make('Contra-v0')
    env = JoypadSpace(env, SIMPLE_MOVEMENT)
    print("actions", env.action_space)
    print("observation_space", env.observation_space.shape)
    done = False
    env.reset()
    for step in range(5000):
        if done:
            print("Over")
            break
        state, reward, done, info = env.step(env.action_space.sample())
        env.render()
    env.close()

def main():
    env = gym.make('SuperMarioBros-v0')
    env = JoypadSpace(env, SIMPLE_MOVEMENT)
    obs_shape = env.observation_space.shape
    obs_size = reduce(operator.mul, obs_shape, 1)
    action_size = env.action_space.n
    q = MLP(obs_size, action_size)
    q_target = MLP(obs_size, action_size)
    q_target.load_state_dict(q.state_dict())
    if torch.cuda.is_available():
        q = q.cuda()
        q_target = q_target.cuda()
    memory = ReplayBuffer()
    print_interval = 20
    score = 0.0
    optimizer = optim.Adam(q.parameters(), lr=learning_rate)
    for n_epi in range(10000):
        # Linear annealing from 8% to 1%
        epsilon = max(0.01, 0.08 - 0.01 * (n_epi / 200))
        s = env.reset()
        done = False
        while not done:
            a = q.sample_action(torch.from_numpy(np.array(s)).float(), epsilon)
            s_prime, r, done, info = env.step(a)
            done_mask = 0.0 if done else 1.0
            memory.put((s, a, r / 100.0, s_prime, done_mask))
            s = s_prime
            score += r
            if done:
                break
        if memory.size() > 2000:
            train(q, q_target, memory, optimizer)
        if n_epi % print_interval == 0 and n_epi != 0:
            q_target.load_state_dict(q.state_dict())
            print("n_episode :{}, score : {:.1f}, n_buffer : {}, eps : {:.1f}%".format(
                n_epi, score / print_interval, memory.size(), epsilon * 100))
            score = 0.0
    env.close()

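# A minimal sketch of the external MLP used by the DQN loop above; the layer
# widths and the epsilon-greedy sample_action are assumptions, not the
# original implementation.
import random
import torch
import torch.nn as nn
import torch.nn.functional as F

class MLP(nn.Module):
    def __init__(self, obs_size, action_size):
        super().__init__()
        self.fc1 = nn.Linear(obs_size, 128)
        self.fc2 = nn.Linear(128, action_size)

    def forward(self, x):
        # flatten single frames or batches of frames into row vectors
        x = x.reshape(-1, self.fc1.in_features)
        x = F.relu(self.fc1(x))
        return self.fc2(x)

    def sample_action(self, obs, epsilon):
        # epsilon-greedy: explore with probability epsilon, else act greedily
        if random.random() < epsilon:
            return random.randint(0, self.fc2.out_features - 1)
        with torch.no_grad():
            return self.forward(obs).argmax(dim=1).item()
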
class agent:
    def __init__(self):
        self.env = gym_super_mario_bros.make('SuperMarioBros-v0')
        self.env = JoypadSpace(self.env, SIMPLE_MOVEMENT)
        self.size = self.env.observation_space.shape
        self.options = self.env.action_space.n
        self.baseline = 0

    def get_screen(self):
        self.env.render()

    def close(self):
        self.env.close()

    def doStep(self, a):
        sP, r, done, info = self.env.step(a)
        return r, done, sP

def gym_SuperMarioBros_env_test():
    """
    `pip install gym-super-mario-bros==7.3.0`
    """
    import gym_super_mario_bros
    from nes_py.wrappers import JoypadSpace
    # Initialize Super Mario environment
    env = gym_super_mario_bros.make("SuperMarioBros-1-1-v0")
    # Limit the action-space to
    #   0. walk right
    #   1. jump right
    env = JoypadSpace(env, [["right"], ["right", "A"]])
    env.reset()
    next_state, reward, done, info = env.step(action=0)
    print(f"{next_state.shape},\n {reward},\n {done},\n {info}")

def play_random_custom(env, steps):
    actions = [['start'], ['NOOP'], ['right', 'A'], ['left', 'A'], ['left', 'B'],
               ['right', 'B'], ['up'], ['down'], ['A'], ['B']]
    env = JoypadSpace(env, actions)
    env.reset()
    action = 0
    start = time.time()
    # play_human
    for t in range(0, steps):
        # get the mapping of keyboard keys to actions in the environment
        if hasattr(env, 'get_keys_to_action'):
            keys_to_action = env.get_keys_to_action()
        elif hasattr(env.unwrapped, 'get_keys_to_action'):
            keys_to_action = env.unwrapped.get_keys_to_action()
        else:
            raise ValueError('env has no get_keys_to_action method')
        # change action every 6 frames
        if t % 6 == 0:
            action = env.action_space.sample()
            # after 500 timesteps, stop pressing the start button (action 0)
            if t > 500:
                while action == 0:
                    action = env.action_space.sample()
        observation, reward, done, info = env.step(action)
        # print("---------------------------t: ", t)
        # print("action space: ", action, env.action_space)
        # print("obs: ", observation)
        # print("reward: ", reward)
        # print("info: ", info)
        # runs the game at about 60 fps
        time.sleep(0.016667)
        env.render()
    end = time.time()
    env.close()
    print("time: ", (end - start), " seconds for ", steps, "steps")

def main():
    env = gym_super_mario_bros.make('SuperMarioBros-v0')
    env = JoypadSpace(env, SIMPLE_MOVEMENT)
    gamma = 0.9
    epsilon = .95
    trials = 1000
    trial_len = 500
    dqn_agent = DQN(env=env)
    steps = []
    for trial in range(trials):
        cur_state = env.reset()
        cur_state = cur_state.reshape(1, 184320)  # flatten the 240x256x3 frame
        cur_reward = 0
        for step in range(trial_len):
            action = dqn_agent.act(cur_state)
            new_state, reward, done, _ = env.step(action)
            env.render()
            cur_reward += reward
            new_state = new_state.reshape(1, 184320)
            dqn_agent.remember(cur_state, action, reward, new_state, done)
            dqn_agent.replay()
            dqn_agent.target_train()
            cur_state = new_state
            if done:
                break
        if reward <= 199.0:
            print("Failed to complete in trial: " + str(trial) +
                  " reward: " + str(cur_reward))
        else:
            print("Completed in trial: " + str(trial) +
                  " reward: " + str(cur_reward))
            break

def record_one_episode(agent, episode):
    tmp_env = gym_super_mario_bros.make(LEVEL_NAME)
    tmp_env = JoypadSpace(tmp_env, ACTION_SPACE)
    tmp_env = Monitor(tmp_env, './videos/video-episode-{0:05d}'.format(episode),
                      force=True)
    tmp_env = wrapper(tmp_env, FRAME_DIM, FRAME_SKIP)
    state = lazy_frame_to_tensor(tmp_env.reset())
    total_reward = 0
    while True:
        action = agent.get_action(state)
        next_state, reward, done, info = tmp_env.step(action)
        next_state = lazy_frame_to_tensor(next_state)
        if done:
            break
        total_reward += reward
        state = next_state

def run_random_actions():
    """
    randomly take 1 of the 12 complex movement actions and print action, rewards
    """
    env = JoypadSpace(gym_super_mario_bros.make('SuperMarioBros-v0'),
                      COMPLEX_MOVEMENT)
    done = True
    for step in range(50):
        if done:
            env.reset()
        # randomly take an action from action_space
        random_action = env.action_space.sample()
        # info returns meta-data incl. coins, life, score etc.
        # state is an RGB image (240, 256, 3)
        state, reward, done, info = env.step(random_action)
        print('# {}: Action: {}, Reward: {}, Done: {}'.format(
            step, random_action, reward, done))
    env.close()

def main():
    env = gym_super_mario_bros.make('SuperMarioBros-v1')
    env = JoypadSpace(env, USE_MOVEMENT)
    interval = 20
    q = QNetWork()
    q_target = QNetWork()
    input_shape = (batch_size, 240, 256, 3)
    q.build(input_shape=input_shape)
    q_target.build(input_shape=input_shape)
    for src, dest in zip(q.variables, q_target.variables):
        dest.assign(src)
    memory = ReplayBuffer()
    score = 0.
    optimizer = optimizers.Adam(lr=learning_rate)
    for n_epi in range(10000):
        epsilon = max(0.01, 0.08 - 0.01 * (n_epi / 200))
        s = env.reset()
        for t in range(10000):
            a = q.sample_action(s, epsilon)
            s_prime, r, done, _ = env.step(a)
            env.render()
            done_mask = 0. if done else 1.
            memory.put((s, a, r, s_prime, done_mask))
            s = s_prime
            score += r
            if done:
                break
        print("episode: {}".format(n_epi))
        if memory.size() > 100:
            train(q, q_target, memory, optimizer)
        if n_epi % interval == 0 and n_epi != 0:
            # copy the Q-network weights into the target network
            for src, dest in zip(q.variables, q_target.variables):
                dest.assign(src)
            print("# of episode: {}, avg_score: {}, buffer size: {}".format(
                n_epi, score / interval, memory.size()))
            score = 0.
        if n_epi % 200 == 0 and n_epi != 0:
            q_target.network.save_weights(
                'dqn_weights{}.ckpt'.format(int(n_epi / 200)))
    env.close()

def record_one_episode(agent):
    tmp_env = gym_super_mario_bros.make(LEVEL_NAME)
    tmp_env = JoypadSpace(tmp_env, ACTION_SPACE)
    tmp_env = Monitor(tmp_env, './video', force=True)
    tmp_env = wrapper(tmp_env, FRAME_DIM)
    state = lazy_frame_to_tensor(tmp_env.reset())
    total_reward = 0
    while True:
        action, _ = agent.select_action_based_on_state(state)
        next_state, reward, done, info = tmp_env.step(action)
        next_state = lazy_frame_to_tensor(next_state)
        if done:
            break
        total_reward += reward
        state = next_state

def eval_genome(genome):
    env = gym_super_mario_bros.make("SuperMarioBros-1-1-v0")
    env = JoypadSpace(env, COMPLEX_MOVEMENT)
    done = False
    timeout = 100
    state = env.reset()
    rewards = 0
    while not done and timeout > 0:
        # downscale the frame by 8x, then pack each RGB pixel into one number
        state_resized = resize(state,
                               (state.shape[0] // 8, state.shape[1] // 8),
                               anti_aliasing=False)
        state_resized = np.apply_along_axis(
            rgb2dec, 1,
            (np.reshape(state_resized,
                        (state_resized.shape[0] * state_resized.shape[1], 3)) * 255),
        )
        state, reward, done, info = env.step(
            np.argmax(genome.evaluate(state_resized)))
        rewards += reward
        # count down the timeout while no reward is earned; earn it back on progress
        if reward <= 0:
            timeout -= 1
        else:
            timeout += 1
        env.render()
    env.close()
    return rewards

class Mario:
    def __init__(self, *args, **kwargs):
        self._env = gym_super_mario_bros.make(
            kwargs.get('env', 'SuperMarioBros-v0'))
        self._env = JoypadSpace(self._env, SIMPLE_MOVEMENT)
        self._env.reset()
        self._cur_state = None
        self._cur_reward = None

    def perform_move(self, move):
        state, reward, done, info = self._env.step(move)
        self._cur_state = state
        self._cur_reward = reward
        self._env.render()
        return self._cur_state

    def get_cur_reward(self):
        return self._cur_reward

    def get_cur_state(self):
        return self._cur_state

def play_model(args):
    # if gpu is to be used
    device = torch.device(
        "cuda" if torch.cuda.is_available() and args.ngpu > 0 else "cpu")
    # Build env (first level, right only)
    env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')
    env = JoypadSpace(env, SIMPLE_MOVEMENT)
    # setup networks
    init_screen = get_screen(env, device)
    _, _, screen_height, screen_width = init_screen.shape
    # Get number of actions from gym action space
    args.n_actions = env.action_space.n
    target_net = DQN(screen_height, screen_width, args.n_actions).to(device)
    if args.targetNet:
        target_net.load_state_dict(
            torch.load(args.targetNet, map_location=device))
    with torch.no_grad():
        i = 0
        observation = env.reset()
        while i < 5000:
            env.render()
            state = get_screen(env, device)
            action = int(target_net(state).max(1)[1].view(1, 1))
            observation, reward, done, info = env.step(action)
            if done:
                break
            i += 1
    env.close()