def eval_genomes(self, genomes, config):
    """Evaluate a population of NEAT genomes on the Pac-Man environment."""
    env = gym.make('pacman-v0', layout=self.layout)
    env = SkipFrame(env, 4)
    idx, genomes = zip(*genomes)
    # Reset fitness before evaluation, then score each genome with the fitness function.
    for genome in genomes:
        genome.fitness = 0
    for genome in genomes:
        fitness = self.fitness_func(genome, config, env)
        genome.fitness = fitness
    env.close()
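# For reference, a fitness function like the one called above typically builds a
# feed-forward network from the genome and plays one episode, using the episode's
# cumulative reward as the fitness. This is only an illustrative sketch, not the
# project's actual fitness_func: the observation encoding is an assumption.
import neat
import numpy as np


def fitness_func_sketch(genome, config, env):
    net = neat.nn.FeedForwardNetwork.create(genome, config)
    obs = env.reset()
    total_reward = 0.0
    done = False
    while not done:
        # Flatten the observation into the network's input vector (assumed encoding).
        outputs = net.activate(np.asarray(obs).flatten())
        action = int(np.argmax(outputs))
        obs, reward, done, info = env.step(action)
        total_reward += reward
    return total_reward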
def train(self, episodes, **kwargs):
    n_episodes = episodes
    discount = 0.99
    alpha = 0.6  # learning rate
    epsilon = 1.0
    epsilon_min = 0.1
    epsilon_decay_rate = 1e6

    env = gym.make('pacman-v0', layout=self.layout)
    env = SkipFrame(env, skip=10)

    # Tabular Q-values: one row of action values per discretised state.
    q_table = defaultdict(lambda: np.zeros(env.action_space.n))

    # Exponentially annealed exploration rate.
    epsilon_by_frame = lambda frame_idx: epsilon_min + (epsilon - epsilon_min) * math.exp(
        -1. * frame_idx / epsilon_decay_rate)

    for episode in range(n_episodes):
        env.reset()
        # (Re)initialise the state at the player's home at the start of each episode.
        state = QAgent.get_state(env.game.maze.get_player_home(), env.get_state_matrix())
        total_rewards = 0
        epsilon = epsilon_by_frame(episode)
        for i in count():
            env.render()
            # Epsilon-greedy action selection.
            if random.uniform(0, 1) > epsilon:
                action = int(np.argmax(q_table[state]))
            else:
                action = env.action_space.sample()
            obs, rewards, done, info = env.step(action)
            next_state = QAgent.get_state(info['player position'], info['state matrix'])
            if next_state != state:
                # Small bonus for positive-reward moves that actually change the state.
                rewards = rewards + 2 if rewards > 0 else rewards
                # One-step Q-learning update.
                q_table[state][action] += alpha * (
                    rewards + discount * np.max(q_table[next_state]) - q_table[state][action])
                state = next_state
            total_rewards += rewards
            if done:
                print(f'{episode} episode finished after {i} timesteps')
                print(f'Total rewards: {total_rewards}')
                print(f'win: {info["win"]}')
                break
    env.close()

    with open(self.filename, 'wb') as handle:
        pickle.dump(dict(q_table), handle, protocol=pickle.HIGHEST_PROTOCOL)
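# The tabular agent above keys its Q-table on whatever QAgent.get_state returns, so
# that value must be hashable and reasonably coarse. A minimal sketch of such an
# encoding (the real method lives in QAgent and its exact features are an assumption)
# is the player's cell plus the contents of the four neighbouring cells:
def get_state_sketch(player_position, state_matrix):
    x, y = player_position
    # The indexing convention (row = y, column = x) is assumed here.
    neighbours = tuple(state_matrix[y + dy][x + dx]
                       for dx, dy in ((0, -1), (0, 1), (-1, 0), (1, 0)))
    # A tuple is hashable, so it can index the defaultdict-based Q-table.
    return (x, y, *neighbours)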
def train(self, **kwargs):
    n_episodes = 10000
    discount = 0.99
    epsilon = 1.0
    epsilon_min = 0.1
    epsilon_decay = 1e7

    env = gym.make('pacman-v0', layout=self.layout)
    env = SkipFrame(env, skip=5)

    # Linear Q-value approximator over a 6-dimensional feature vector.
    approximator = LinearApproximator(6, env.action_space.n)

    # Exponentially annealed exploration rate.
    epsilon_by_frame = lambda frame_idx: epsilon_min + (
        epsilon - epsilon_min) * math.exp(-1. * frame_idx / epsilon_decay)

    for episode in range(n_episodes):
        info = env.reset(mode='info')
        state = LinQAgent.get_state(info['player position'], info['state matrix'],
                                    info['player action'])
        total_rewards = 0
        epsilon = epsilon_by_frame(episode)
        for i in count():
            env.render()
            # Epsilon-greedy action selection.
            if np.random.rand() < epsilon:
                action = env.action_space.sample()
            else:
                action = int(np.argmax(approximator.predict(state)))
            obs, rewards, done, info = env.step(action)
            next_state = LinQAgent.get_state(info['player position'], info['state matrix'],
                                             info['player action'])
            if not np.array_equal(next_state, state):
                # Update the approximator only when the feature vector actually changes.
                approximator.update(state, next_state, rewards, discount, action)
                state = next_state
            total_rewards += rewards
            if done:
                print(f'{episode} episode finished after {i} timesteps')
                print(f'Total rewards: {total_rewards}')
                print(f'win: {info["win"]}')
                print(f'epsilon {epsilon}')
                break
        if episode % 1000 == 0:
            # Periodic checkpoint.
            approximator.save(self.filename)
    env.close()
    approximator.save(self.filename)
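# The approximator used above exposes predict/update/save. A minimal sketch of a
# compatible class, assuming one weight vector per action and a standard
# semi-gradient Q-learning update (the real class may use a different learning rate
# or feature handling):
import pickle

import numpy as np


class LinearApproximatorSketch:
    def __init__(self, n_features, n_actions, lr=0.01):
        # One row of weights per action: Q(s, a) = w[a] . s
        self.weights = np.zeros((n_actions, n_features))
        self.lr = lr

    def predict(self, state):
        # Returns a vector of Q-values, one per action.
        return self.weights @ np.asarray(state, dtype=float)

    def update(self, state, next_state, reward, discount, action):
        state = np.asarray(state, dtype=float)
        # TD error against a bootstrapped greedy target.
        target = reward + discount * np.max(self.predict(next_state))
        td_error = target - self.weights[action] @ state
        # Semi-gradient step: only the weight row of the taken action moves.
        self.weights[action] += self.lr * td_error * state

    def save(self, filename):
        with open(filename, 'wb') as handle:
            pickle.dump(self.weights, handle, protocol=pickle.HIGHEST_PROTOCOL)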
def run_agent(layout: str):
    """Run a trained DQN agent on the given layout for 10 evaluation episodes."""
    env = PacmanEnv(layout)
    env = SkipFrame(env, skip=4)
    env = GrayScaleObservation(env)
    env = ResizeObservation(env, shape=84)
    env = FrameStack(env, num_stack=4)

    screen = env.reset(mode='rgb_array')
    n_actions = env.action_space.n
    model = load_model(screen.shape, n_actions, 'pacman.pth')

    for i in range(10):
        env.render(mode='human')
        screen = env.reset(mode='rgb_array')
        for _ in count():
            env.render(mode='human')
            # Greedy policy: epsilon = 0 at evaluation time.
            action = select_action(screen, 0, model, n_actions)
            screen, reward, done, info = env.step(action)
            if done:
                break
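# select_action, used both here and in the training loop below, is assumed to be a
# plain epsilon-greedy helper over the network's Q-values. The tensor conversion
# below (stacked frames -> float tensor on the network's device) is an assumption
# about the real implementation, shown only as a sketch.
import random

import numpy as np
import torch


def select_action_sketch(state, epsilon, policy_net, n_actions):
    if random.random() < epsilon:
        # Explore: uniformly random action.
        return random.randrange(n_actions)
    with torch.no_grad():
        # Exploit: add a batch dimension and pick the action with the highest Q-value.
        device = next(policy_net.parameters()).device
        obs = torch.as_tensor(np.array(state), dtype=torch.float32,
                              device=device).unsqueeze(0)
        return int(policy_net(obs).argmax(dim=1).item())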
import datetime
from pathlib import Path

import gym_super_mario_bros
import torch
from gym.wrappers import FrameStack, GrayScaleObservation, TransformObservation
from nes_py.wrappers import JoypadSpace

from metrics import MetricLogger
from agent import Mario
from wrappers import ResizeObservation, SkipFrame

# Initialize Super Mario environment
env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')

# Limit the action-space to
#   0. walk right
#   1. jump right
env = JoypadSpace(env, [['right'], ['right', 'A']])

# Apply Wrappers to environment
env = SkipFrame(env, skip=4)
env = GrayScaleObservation(env, keep_dim=False)
env = ResizeObservation(env, shape=84)
env = TransformObservation(env, f=lambda x: x / 255.)
env = FrameStack(env, num_stack=4)

env.reset()

save_dir = Path('checkpoints') / datetime.datetime.now().strftime('%Y-%m-%dT%H-%M-%S')
save_dir.mkdir(parents=True)

checkpoint = None  # Path('checkpoints/2020-10-21T18-25-27/mario.chkpt')

# Check whether a GPU is available (BM)
if torch.cuda.is_available():
def train_agent(layout: str, episodes: int = 10000, frames_to_skip: int = 4):
    GAMMA = 0.99
    EPSILON = 1.0
    EPS_END = 0.1
    EPS_DECAY = 1e7
    TARGET_UPDATE = 10
    BATCH_SIZE = 64

    # Exponentially annealed exploration rate.
    epsilon_by_frame = lambda frame_idx: EPS_END + (
        EPSILON - EPS_END) * math.exp(-1. * frame_idx / EPS_DECAY)

    # Build the wrapped environment and grab an initial observation so the network
    # layers can be sized from its shape; after the wrappers the observation is a
    # stack of 4 grayscale 84x84 frames.
    env = PacmanEnv(layout=layout)
    env = SkipFrame(env, skip=frames_to_skip)
    env = GrayScaleObservation(env)
    env = ResizeObservation(env, shape=84)
    env = FrameStack(env, num_stack=4)
    screen = env.reset(mode='rgb_array')

    # Get number of actions from gym action space
    n_actions = env.action_space.n

    policy_net = DQN(screen.shape, n_actions).to(device)
    target_net = DQN(screen.shape, n_actions).to(device)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    optimizer = optim.RMSprop(policy_net.parameters())
    memory = ReplayBuffer(BATCH_SIZE)

    for i_episode in range(episodes):
        # Initialize the environment and state
        state = env.reset(mode='rgb_array')
        ep_reward = 0.
        EPSILON = epsilon_by_frame(i_episode)
        for t in count():
            # Select and perform an action
            env.render(mode='human')
            action = select_action(state, EPSILON, policy_net, n_actions)
            next_state, reward, done, info = env.step(action)
            # Clip rewards to [-1, 1] to stabilise training.
            reward = max(-1.0, min(reward, 1.0))
            ep_reward += reward
            memory.cache(state, next_state, action, reward, done)

            # Observe new state
            if done:
                next_state = None

            # Move to the next state
            state = next_state

            # Perform one step of the optimization on the policy network.
            optimize_model(memory, policy_net, optimizer, target_net, GAMMA)
            if done:
                print("Episode #{}, lasts for {} timestep, total reward: {}".format(
                    i_episode, t + 1, ep_reward))
                break

        # Update the target network, copying all weights and biases from the policy network.
        if i_episode % TARGET_UPDATE == 0:
            target_net.load_state_dict(policy_net.state_dict())
        if i_episode % 1000 == 0:
            save_model(target_net, 'pacman.pth')

    print('Complete')
    env.render()
    env.close()
    save_model(target_net, 'pacman.pth')
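# For completeness, a sketch of what the optimize_model step above typically does in
# a DQN setup: sample a batch of transitions, compute the TD target with the frozen
# target network, and take a Huber-loss gradient step on the policy network. The
# buffer's len()/sample() interface and the batch format are assumptions about the
# project's ReplayBuffer, not its documented API.
import torch
import torch.nn.functional as F


def optimize_model_sketch(memory, policy_net, optimizer, target_net, gamma,
                          batch_size=64):
    if len(memory) < batch_size:
        return
    device = next(policy_net.parameters()).device
    # Assumed to return stacked tensors: states, next_states, actions, rewards, dones.
    states, next_states, actions, rewards, dones = memory.sample(batch_size)
    states = states.to(device)
    next_states = next_states.to(device)
    actions = actions.long().to(device)
    rewards = rewards.float().to(device)
    dones = dones.float().to(device)

    # Q(s, a) for the actions that were actually taken.
    q_values = policy_net(states).gather(1, actions.unsqueeze(1)).squeeze(1)

    with torch.no_grad():
        # Bootstrapped target; terminal transitions contribute the reward only.
        next_q = target_net(next_states).max(dim=1).values
        targets = rewards + gamma * next_q * (1.0 - dones)

    loss = F.smooth_l1_loss(q_values, targets)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()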
def Environment():
    """Build the Super Mario environment with the custom reward and frame-skip wrappers."""
    env = gym_super_mario_bros.make(ENV_NAME)
    env = JoypadSpace(env, COMPLEX_MOVEMENT)
    env = Reward(env)
    env = SkipFrame(env)
    return env
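# Every environment factory above wraps the base env in SkipFrame. A minimal sketch
# of such a wrapper, assuming it simply repeats the chosen action for `skip` frames
# and sums the rewards (the project's version may differ, e.g. in its default skip):
import gym


class SkipFrameSketch(gym.Wrapper):
    def __init__(self, env, skip=4):
        super().__init__(env)
        self._skip = skip

    def step(self, action):
        total_reward = 0.0
        done = False
        obs, info = None, {}
        for _ in range(self._skip):
            # Repeat the same action and accumulate the reward.
            obs, reward, done, info = self.env.step(action)
            total_reward += reward
            if done:
                break
        return obs, total_reward, done, info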