Example #1
    def eval_genomes(self, genomes, config):
        env = gym.make('pacman-v0', layout=self.layout)
        env = SkipFrame(env, 4)
        # neat-python passes (genome_id, genome) pairs; keep only the genomes
        idx, genomes = zip(*genomes)

        for genome in genomes:
            genome.fitness = 0

        for genome in genomes:
            fitness = self.fintness_func(genome, config, env)
            genome.fitness = fitness

        env.close()
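The evaluator above matches the callback signature that neat-python's Population.run expects. A minimal sketch of how it would typically be wired up; the config file name, generation count and the `agent` instance holding eval_genomes are assumptions, not from the source:

import neat

config = neat.Config(neat.DefaultGenome, neat.DefaultReproduction,
                     neat.DefaultSpeciesSet, neat.DefaultStagnation,
                     'neat-config.ini')  # hypothetical config path
population = neat.Population(config)
population.add_reporter(neat.StdOutReporter(True))

# `agent` is assumed to be an instance of the class defining eval_genomes above
winner = population.run(agent.eval_genomes, 50)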
Example #2
    def train(self, episodes, **kwargs):
        n_episodes = episodes
        discount = 0.99
        alpha = 0.6  # learning rate
        epsilon = 1.0
        epsilon_min = 0.1
        epsilon_decay_rate = 1e6
        env = gym.make('pacman-v0', layout=self.layout)
        env = SkipFrame(env, skip=10)
        q_table = defaultdict(lambda: np.zeros(env.action_space.n))
        state = QAgent.get_state(env.game.maze.get_player_home(), env.get_state_matrix())

        epsilon_by_frame = lambda frame_idx: epsilon_min + (epsilon - epsilon_min) * math.exp(
            -1. * frame_idx / epsilon_decay_rate)

        for episode in range(n_episodes):
            env.reset()
            # Re-derive the starting state after every reset so the terminal
            # state of the previous episode does not leak into the new one
            state = QAgent.get_state(env.game.maze.get_player_home(),
                                     env.get_state_matrix())
            total_rewards = 0

            epsilon = epsilon_by_frame(episode)

            for i in count():
                env.render()
                if random.uniform(0, 1) > epsilon:
                    action = int(np.argmax(q_table[state]))
                else:
                    action = env.action_space.sample()

                obs, rewards, done, info = env.step(action)
                next_state = QAgent.get_state(info['player position'], info['state matrix'])

                if next_state != state:
                    rewards = rewards + 2 if rewards > 0 else rewards
                    q_table[state][action] += alpha * (
                            rewards + discount * np.max(q_table[next_state]) - q_table[state][action])

                state = next_state
                total_rewards += rewards

                if done:
                    print(f'Episode {episode} finished after {i} timesteps')
                    print(f'Total rewards: {total_rewards}')
                    print(f'win: {info["win"]}')
                    break

        env.close()

        with open(self.filename, 'wb') as handle:
            pickle.dump(dict(q_table), handle, protocol=pickle.HIGHEST_PROTOCOL)
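A minimal sketch (not from the source) of loading the pickled table back for greedy play. It mirrors the environment setup and state encoding used during training above; the filename is hypothetical and `layout` stands for the same layout string used for training:

import pickle
import numpy as np

with open('q_table.pickle', 'rb') as handle:  # hypothetical filename
    q_table = pickle.load(handle)

env = gym.make('pacman-v0', layout=layout)  # same layout as in training
env = SkipFrame(env, skip=10)
env.reset()
state = QAgent.get_state(env.game.maze.get_player_home(), env.get_state_matrix())

done = False
while not done:
    env.render()
    if state in q_table:
        action = int(np.argmax(q_table[state]))
    else:
        action = env.action_space.sample()  # state never visited during training
    obs, reward, done, info = env.step(action)
    state = QAgent.get_state(info['player position'], info['state matrix'])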
Example #3
    def train(self, **kwargs):
        n_episodes = 10000
        discount = 0.99
        epsilon = 1.0
        epsilon_min = 0.1
        epsilon_decay = 1e7
        env = gym.make('pacman-v0', layout=self.layout)
        env = SkipFrame(env, skip=5)
        approximator = LinearApproximator(6, env.action_space.n)

        epsilon_by_frame = lambda frame_idx: epsilon_min + (
            epsilon - epsilon_min) * math.exp(-1. * frame_idx / epsilon_decay)

        for episode in range(n_episodes):
            info = env.reset(mode='info')
            state = LinQAgent.get_state(info['player position'],
                                        info['state matrix'],
                                        info['player action'])
            total_rewards = 0

            epsilon = epsilon_by_frame(episode)

            for i in count():
                env.render()
                if np.random.rand() < epsilon:
                    action = env.action_space.sample()
                else:
                    action = int(np.argmax(approximator.predict(state)))

                obs, rewards, done, info = env.step(action)
                next_state = LinQAgent.get_state(info['player position'],
                                                 info['state matrix'],
                                                 info['player action'])

                if not np.array_equal(next_state, state):
                    approximator.update(state, next_state, rewards, discount,
                                        action)

                state = next_state
                total_rewards += rewards

                if done:
                    print(f'Episode {episode} finished after {i} timesteps')
                    print(f'Total rewards: {total_rewards}')
                    print(f'win: {info["win"]}')
                    print(f'epsilon: {epsilon}')
                    break

            if episode % 1000 == 0:
                approximator.save(self.filename)

        env.close()

        approximator.save(self.filename)
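The LinearApproximator class itself is not shown on this page. Below is a minimal sketch of a linear Q-function approximator with the predict/update/save interface used above, assuming one weight vector per action and a semi-gradient Q-learning update; the real class in the source may differ:

import pickle
import numpy as np

class LinearApproximator:
    """Sketch: Q(s, a) = w_a · s with a semi-gradient Q-learning update."""

    def __init__(self, n_features, n_actions, alpha=0.01):
        self.weights = np.zeros((n_actions, n_features))
        self.alpha = alpha

    def predict(self, state):
        # One Q-value per action for the given feature vector
        return self.weights @ np.asarray(state, dtype=float)

    def update(self, state, next_state, reward, discount, action):
        state = np.asarray(state, dtype=float)
        target = reward + discount * np.max(self.predict(next_state))
        td_error = target - self.predict(state)[action]
        self.weights[action] += self.alpha * td_error * state

    def save(self, filename):
        with open(filename, 'wb') as handle:
            pickle.dump(self.weights, handle, protocol=pickle.HIGHEST_PROTOCOL)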
Example #4
def run_agent(layout: str):
    env = PacmanEnv(layout)
    env = SkipFrame(env, skip=4)
    env = GrayScaleObservation(env)
    env = ResizeObservation(env, shape=84)
    env = FrameStack(env, num_stack=4)
    screen = env.reset(mode='rgb_array')
    n_actions = env.action_space.n

    model = load_model(screen.shape, n_actions, 'pacman.pth')

    for _ in range(10):
        env.render(mode='human')
        screen = env.reset(mode='rgb_array')

        for _ in count():
            env.render(mode='human')
            action = select_action(screen, 0, model, n_actions)
            screen, reward, done, info = env.step(action)

            if done:
                break
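select_action and load_model are defined elsewhere in the source. A sketch of an epsilon-greedy select_action consistent with the call above (epsilon is passed as 0, so the agent acts purely greedily); the real helper may differ:

import random
import numpy as np
import torch

def select_action(state, epsilon, model, n_actions):
    # Explore with probability epsilon, otherwise act greedily on the model's Q-values
    if random.random() < epsilon:
        return random.randrange(n_actions)
    device = next(model.parameters()).device
    with torch.no_grad():
        state_t = torch.tensor(np.asarray(state), dtype=torch.float32,
                               device=device).unsqueeze(0)
        return int(model(state_t).argmax(dim=1).item())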
Example #5
import datetime
from pathlib import Path

import torch
import gym_super_mario_bros
from gym.wrappers import FrameStack, GrayScaleObservation, TransformObservation  # assumed source for these wrappers
from nes_py.wrappers import JoypadSpace

from metrics import MetricLogger
from agent import Mario
from wrappers import ResizeObservation, SkipFrame

# Initialize Super Mario environment
env = gym_super_mario_bros.make('SuperMarioBros-1-1-v0')

# Limit the action-space to
#   0. walk right
#   1. jump right
env = JoypadSpace(env, [['right'], ['right', 'A']])

# Apply Wrappers to environment
env = SkipFrame(env, skip=4)
env = GrayScaleObservation(env, keep_dim=False)
env = ResizeObservation(env, shape=84)
env = TransformObservation(env, f=lambda x: x / 255.)
env = FrameStack(env, num_stack=4)

env.reset()

save_dir = Path('checkpoints') / datetime.datetime.now().strftime(
    '%Y-%m-%dT%H-%M-%S')
save_dir.mkdir(parents=True)

checkpoint = None  # Path('checkpoints/2020-10-21T18-25-27/mario.chkpt')

# Add in check to see if GPU is available (BM)
if torch.cuda.is_available():
Example #6
def train_agent(layout: str, episodes: int = 10000, frames_to_skip: int = 4):
    GAMMA = 0.99
    EPSILON = 1.0
    EPS_END = 0.1
    EPS_DECAY = 1e7
    TARGET_UPDATE = 10
    BATCH_SIZE = 64

    epsilon_by_frame = lambda frame_idx: EPS_END + (
        EPSILON - EPS_END) * math.exp(-1. * frame_idx / EPS_DECAY)

    # Build the observation pipeline first so the network layers can be sized
    # from the processed screen shape (4 stacked 84x84 grayscale frames)
    env = PacmanEnv(layout=layout)
    env = SkipFrame(env, skip=frames_to_skip)
    env = GrayScaleObservation(env)
    env = ResizeObservation(env, shape=84)
    env = FrameStack(env, num_stack=4)
    screen = env.reset(mode='rgb_array')

    # Get number of actions from gym action space
    n_actions = env.action_space.n

    policy_net = DQN(screen.shape, n_actions).to(device)
    target_net = DQN(screen.shape, n_actions).to(device)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    optimizer = optim.RMSprop(policy_net.parameters())
    memory = ReplayBuffer(BATCH_SIZE)

    for i_episode in range(episodes):
        # Initialize the environment and state
        state = env.reset(mode='rgb_array')
        ep_reward = 0.
        EPSILON = epsilon_by_frame(i_episode)

        for t in count():
            # Select and perform an action
            env.render(mode='human')
            action = select_action(state, EPSILON, policy_net, n_actions)
            next_state, reward, done, info = env.step(action)
            reward = max(-1.0, min(reward, 1.0))
            ep_reward += reward

            memory.cache(state, next_state, action, reward, done)

            # Observe new state
            if done:
                next_state = None

            # Move to the next state
            state = next_state

            # Perform one step of the optimization (on the policy network)
            optimize_model(memory, policy_net, optimizer, target_net, GAMMA)
            if done:
                print("Episode #{}, lasts for {} timestep, total reward: {}".
                      format(i_episode, t + 1, ep_reward))
                break
        # Update the target network, copying all weights and biases in DQN
        if i_episode % TARGET_UPDATE == 0:
            target_net.load_state_dict(policy_net.state_dict())

        if i_episode % 1000 == 0:
            save_model(target_net, 'pacman.pth')

    print('Complete')
    env.render()
    env.close()

    save_model(target_net, 'pacman.pth')
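The ReplayBuffer and optimize_model helpers are not shown here. Below is a sketch (not the original) of a standard DQN optimization step matching the call above, under the assumption that the buffer reports its size via len(), exposes a batch_size attribute, and has a recall() method returning batched tensors of states, next states, actions, rewards and done flags:

import torch
import torch.nn.functional as F

def optimize_model(memory, policy_net, optimizer, target_net, gamma):
    # Assumed buffer interface: len(), batch_size, recall()
    if len(memory) < memory.batch_size:
        return
    state, next_state, action, reward, done = memory.recall()

    # Q(s, a) for the actions that were actually taken
    q_values = policy_net(state).gather(1, action.unsqueeze(1)).squeeze(1)

    # Bootstrapped target from the frozen target network; no bootstrap on terminal states
    with torch.no_grad():
        next_q = target_net(next_state).max(1)[0]
        target = reward + gamma * next_q * (1 - done.float())

    loss = F.smooth_l1_loss(q_values, target)
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()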
Example #7
def Environment():
    env = gym_super_mario_bros.make(ENV_NAME)
    env = JoypadSpace(env, COMPLEX_MOVEMENT)
    env = Reward(env)
    env = SkipFrame(env)
    return env
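Every example above imports SkipFrame from a local wrappers module that is not shown on this page. A minimal sketch of such a frame-skipping wrapper, assuming the standard gym.Wrapper interface: the chosen action is repeated for `skip` frames and the rewards are summed, which speeds up training without losing much information between consecutive frames.

import gym

class SkipFrame(gym.Wrapper):
    """Repeat the chosen action for `skip` frames and accumulate the reward."""

    def __init__(self, env, skip=4):
        super().__init__(env)
        self._skip = skip

    def step(self, action):
        total_reward = 0.0
        obs, done, info = None, False, {}
        for _ in range(self._skip):
            obs, reward, done, info = self.env.step(action)
            total_reward += reward
            if done:
                break
        return obs, total_reward, done, info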