Example #1
def _thunk():
    # Factory closure: build a MiniPacman env in the chosen mode with a 1000-frame cap.
    env = MiniPacman(mode, 1000)
    return env
Example #2
import matplotlib.pyplot as plt

# MiniPacman is assumed to be importable from the surrounding project.


def displayImage(image, step, reward):
    s = "step: " + str(step) + " reward: " + str(reward)
    plt.figure(figsize=(10, 3))
    plt.title(s)
    plt.imshow(image)
    plt.show()

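# Map keyboard characters to MiniPacman's discrete action indices.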
keys = {
    'w': 2,
    'd': 1,
    'a': 3,
    's': 4,
    ' ': 0
}

MODES = ('regular', 'avoid', 'hunt', 'ambush', 'rush')
frame_cap = 1000

mode = 'rush'

env = MiniPacman(mode, frame_cap)

state = env.reset()
done = False

total_reward = 0
step = 1

displayImage(state.transpose(1, 2, 0), step, total_reward)

while not done:
    x = raw_input()  # use input() under Python 3
    #clear_output()
    try:
        action = keys[x]
    except KeyError:
        print("Use 'w', 'a', 's', 'd' or space.")
        continue

    state, reward, done, _ = env.step(action)
    total_reward += reward
    step += 1
    displayImage(state.transpose(1, 2, 0), step, total_reward)
Example #3
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.optim as optim
from torch import autograd
from tqdm import tqdm

# MiniPacman, SubprocVecEnv, ActorCritic and RolloutStorage are assumed to be
# importable from the surrounding project.


def main():
    mode = "regular"
    num_envs = 16

    def make_env():
        def _thunk():
            env = MiniPacman(mode, 1000)
            return env

        return _thunk

    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)

    state_shape = envs.observation_space.shape

    #a2c hyperparams:
    gamma = 0.99
    entropy_coef = 0.01
    value_loss_coef = 0.5
    max_grad_norm = 0.5
    num_steps = 5
    num_frames = int(10e3)

    #rmsprop hyperparams:
    lr = 7e-4
    eps = 1e-5
    alpha = 0.99

    #Init a2c and rmsprop
    actor_critic = ActorCritic(envs.observation_space.shape,
                               envs.action_space.n)
    optimizer = optim.RMSprop(actor_critic.parameters(),
                              lr,
                              eps=eps,
                              alpha=alpha)

    #if USE_CUDA:
    #    actor_critic = actor_critic.cuda()

    rollout = RolloutStorage(num_steps, num_envs, envs.observation_space.shape)
    #rollout.cuda()

    all_rewards = []
    all_losses = []

    state = envs.reset()
    state = torch.FloatTensor(np.float32(state))

    rollout.states[0].copy_(state)

    episode_rewards = torch.zeros(num_envs, 1)
    final_rewards = torch.zeros(num_envs, 1)

    for i_update in tqdm(range(num_frames)):

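        # Roll out num_steps transitions in each of the parallel envs before every update.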
        for step in range(num_steps):
            action = actor_critic.act(autograd.Variable(state))

            next_state, reward, done, _ = envs.step(
                action.squeeze(1).cpu().data.numpy())

            reward = torch.FloatTensor(reward).unsqueeze(1)
            episode_rewards += reward
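            # masks are 0 where an episode just ended; book that episode's accumulated
            # reward into final_rewards and reset its running total.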
            masks = torch.FloatTensor(1 - np.array(done)).unsqueeze(1)
            final_rewards *= masks
            final_rewards += (1 - masks) * episode_rewards
            episode_rewards *= masks

            #if USE_CUDA:
            #    masks = masks.cuda()

            state = torch.FloatTensor(np.float32(next_state))
            rollout.insert(step, state, action.data, reward, masks)

        _, next_value = actor_critic(
            autograd.Variable(rollout.states[-1], volatile=True))
        next_value = next_value.data

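        # Bootstrap from the value of the last state and compute discounted returns
        # for the collected rollout.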
        returns = rollout.compute_returns(next_value, gamma)

        logit, action_log_probs, values, entropy = actor_critic.evaluate_actions(
            autograd.Variable(rollout.states[:-1]).view(-1, *state_shape),
            autograd.Variable(rollout.actions).view(-1, 1))

        values = values.view(num_steps, num_envs, 1)
        action_log_probs = action_log_probs.view(num_steps, num_envs, 1)
        advantages = autograd.Variable(returns) - values

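        # A2C objective: squared advantages train the critic, the policy-gradient term
        # trains the actor, and the entropy bonus encourages exploration.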
        value_loss = advantages.pow(2).mean()
        action_loss = -(autograd.Variable(advantages.data) *
                        action_log_probs).mean()

        optimizer.zero_grad()
        loss = value_loss * value_loss_coef + action_loss - entropy * entropy_coef
        loss.backward()
        nn.utils.clip_grad_norm(actor_critic.parameters(), max_grad_norm)
        optimizer.step()

        if i_update % 100 == 0:  # log and plot training progress periodically
            all_rewards.append(final_rewards.mean())
            all_losses.append(loss.item())

            #clear_output(True)
            plt.figure(figsize=(20, 5))
            plt.subplot(131)
            plt.title('epoch %s. reward: %s' %
                      (i_update, np.mean(all_rewards[-10:])))
            plt.plot(all_rewards)
            plt.subplot(132)
            plt.title('loss %s' % all_losses[-1])
            plt.plot(all_losses)
            plt.show()

        rollout.after_update()

    torch.save(actor_critic.state_dict(), "actor_critic_" + mode)

    import time

    def displayImage(image, step, reward):
        #clear_output(True)
        s = "step: " + str(step) + " reward: " + str(reward)
        plt.figure(figsize=(10, 3))
        plt.title(s)
        plt.imshow(image)
        plt.show()
        time.sleep(0.1)

    env = MiniPacman(mode, 1000)

    done = False
    state = env.reset()
    total_reward = 0
    step = 1

    while not done:
        current_state = torch.FloatTensor(state).unsqueeze(0)
        #if USE_CUDA:
        #    current_state = current_state.cuda()

        action = actor_critic.act(autograd.Variable(current_state))

        next_state, reward, done, _ = env.step(action.data[0, 0])
        total_reward += reward
        state = next_state

        image = torch.FloatTensor(state).permute(1, 2, 0).cpu().numpy()
        displayImage(image, step, total_reward)
        step += 1
Example #4
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch import autograd
from tqdm import tqdm

# MiniPacman, SubprocVecEnv, ActorCritic, EnvModel and the helpers used below
# (num_pixels, pixels, mode_rewards, pix_to_target, rewards_to_target,
# target_to_pix, plot) are assumed to be importable from the surrounding project.


def main():
    mode = "regular"
    num_envs = 16

    def make_env():
        def _thunk():
            env = MiniPacman(mode, 1000)
            return env

        return _thunk

    envs = [make_env() for i in range(num_envs)]
    envs = SubprocVecEnv(envs)

    state_shape = envs.observation_space.shape
    num_actions = envs.action_space.n

    env_model = EnvModel(envs.observation_space.shape, num_pixels,
                         len(mode_rewards["regular"]))
    actor_critic = ActorCritic(envs.observation_space.shape,
                               envs.action_space.n)

    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(env_model.parameters())

    actor_critic.load_state_dict(torch.load("actor_critic_" + mode))
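    # The A2C policy pretrained in Example #3 is only used here to generate
    # trajectories for training the environment model.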

    def get_action(state):
        if state.ndim == 4:
            state = torch.FloatTensor(np.float32(state))
        else:
            state = torch.FloatTensor(np.float32(state)).unsqueeze(0)

        action = actor_critic.act(autograd.Variable(state, volatile=True))
        action = action.data.cpu().squeeze(1).numpy()
        return action

    def play_games(envs, frames):
        states = envs.reset()

        for frame_idx in range(frames):
            actions = get_action(states)
            next_states, rewards, dones, _ = envs.step(actions)

            yield frame_idx, states, actions, rewards, next_states, dones

            states = next_states

    reward_coef = 0.1
    num_updates = 5000

    losses = []
    all_rewards = []

    for frame_idx, states, actions, rewards, next_states, dones in tqdm(
            play_games(envs, num_updates), total=num_updates):
        states = torch.FloatTensor(states)
        actions = torch.LongTensor(actions)

        batch_size = states.size(0)

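        # Encode each action as a one-hot stack of planes with the same spatial size as
        # the observation, then concatenate it with the state frames as the model input.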
        onehot_actions = torch.zeros(batch_size, num_actions, *state_shape[1:])
        onehot_actions[range(batch_size), actions] = 1
        inputs = autograd.Variable(torch.cat([states, onehot_actions], 1))

        #if USE_CUDA:
        #    inputs = inputs.cuda()

        imagined_state, imagined_reward = env_model(inputs)

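        # The env model is trained with cross-entropy against the discretized next-frame
        # pixels and the reward class.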
        target_state = pix_to_target(next_states)
        target_state = autograd.Variable(torch.LongTensor(target_state))

        target_reward = rewards_to_target(mode, rewards)
        target_reward = autograd.Variable(torch.LongTensor(target_reward))

        optimizer.zero_grad()
        image_loss = criterion(imagined_state, target_state)
        reward_loss = criterion(imagined_reward, target_reward)
        loss = image_loss + reward_coef * reward_loss
        loss.backward()
        optimizer.step()

        losses.append(loss.item())
        all_rewards.append(np.mean(rewards))

        if frame_idx % 100 == 0:  # log and plot training progress periodically
            plot(frame_idx, all_rewards, losses)

    torch.save(env_model.state_dict(), "env_model_" + mode)

    import time

    env = MiniPacman(mode, 1000)
    batch_size = 1

    done = False
    state = env.reset()
    iss = []
    ss = []

    steps = 0

    while not done:
        steps += 1
        actions = get_action(state)
        onehot_actions = torch.zeros(batch_size, num_actions, *state_shape[1:])
        onehot_actions[range(batch_size), actions] = 1
        state = torch.FloatTensor(state).unsqueeze(0)

        inputs = autograd.Variable(torch.cat([state, onehot_actions], 1))
        #if USE_CUDA:
        #    inputs = inputs.cuda()

        imagined_state, imagined_reward = env_model(inputs)
        imagined_state = F.softmax(imagined_state, dim=1)
        iss.append(imagined_state)

        next_state, reward, done, _ = env.step(actions[0])
        ss.append(state)
        state = next_state

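        # Decode the model's per-pixel class predictions back into an RGB frame so the
        # imagined and actual observations can be compared side by side.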
        imagined_image = target_to_pix(
            imagined_state.view(batch_size, -1,
                                len(pixels))[0].max(1)[1].data.cpu().numpy())
        imagined_image = imagined_image.reshape(15, 19, 3)
        state_image = torch.FloatTensor(next_state).permute(1, 2,
                                                            0).cpu().numpy()

        #clear_output()
        plt.figure(figsize=(10, 3))
        plt.subplot(131)
        plt.title("Imagined")
        plt.imshow(imagined_image)
        plt.subplot(132)
        plt.title("Actual")
        plt.imshow(state_image)
        plt.show()
        time.sleep(0.3)

        if steps > 30:
            break