Example #1
def generate_episode(env, model, device, steps_done, episode_rewards):
    # Transpose the observation from HWC to CHW for PyTorch convolutional layers
    state = env.reset().transpose((2, 0, 1))
    i_rewards, i_states, i_actions = [], [], []
    total_reward = 0

    for t in count():
        # Select and perform an action
        action_idx = select_action(state, model, device, steps_done)
        action = index_to_action(action_idx)
        new_state, reward, done, _ = env.step(action)
        env.render()
        new_state = new_state.transpose((2, 0, 1))
        steps_done += 1

        # Save reward, action, state
        i_rewards.append(reward)
        i_actions.append(action_idx)
        i_states.append(state)
        total_reward += reward

        # Move state forward
        state = new_state

        # Break if the episode ended or a 5000-step cap was reached
        if done or t == 5000:
            print(total_reward)
            episode_rewards.append(total_reward)
            plot_rewards(episode_rewards)
            break

    return i_rewards, i_states, i_actions, steps_done
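
Example #1 relies on a `select_action` helper that is defined elsewhere in the project. The sketch below shows one common way such a helper is written for a DQN-style agent; it is only an illustration, and the constants `EPS_START`, `EPS_END`, `EPS_DECAY` and `N_ACTIONS` are hypothetical placeholders, not the project's actual values.

import math
import random

import torch

# Hypothetical exploration constants; the project's real values live elsewhere.
EPS_START, EPS_END, EPS_DECAY = 0.9, 0.05, 200
N_ACTIONS = 4  # placeholder for the size of the action space


def select_action(state, model, device, steps_done):
    # Exploration rate decays exponentially with the number of steps taken
    eps_threshold = EPS_END + (EPS_START - EPS_END) * math.exp(-steps_done / EPS_DECAY)
    if random.random() > eps_threshold:
        with torch.no_grad():
            state_t = torch.tensor(state, dtype=torch.float, device=device).unsqueeze(0)
            # Greedy action: index of the largest predicted Q-value
            return model(state_t).argmax(dim=1).item()
    # Otherwise explore with a uniformly random action index
    return random.randrange(N_ACTIONS)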
Example #2
def train():
    # The policy network is optimized directly; the target network provides more
    # stable Q-value targets and is periodically synced with the policy network.
    policy_net = DQN(n_inputs=2*LARGEST_CARD, n_outputs=HAND_SIZE).to(device)
    target_net = DQN(n_inputs=2*LARGEST_CARD, n_outputs=HAND_SIZE).to(device)
    target_net.load_state_dict(policy_net.state_dict())
    target_net.eval()

    optimizer = RMSprop(policy_net.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)
    memory = ReplayMemory(MEMORY_SIZE)

    env = Game(N_PLAYERS, LARGEST_CARD, HAND_SIZE, N_ROUNDS)
    select_action = generate_action_selector()

    rewards = []
    for episode in trange(N_EPISODES):
        total_reward = 0
        observation = env.reset()
        done = False

        while not done:
            # Encode the observation and pick an action from the current hand
            state = torch.tensor([create_state(observation)], dtype=torch.float, device=device)
            action = select_action(policy_net, state, observation.hand)

            observation, reward, done, info = env.step(action.item())
            total_reward += reward

            # Store the transition in replay memory; terminal states are recorded as None
            if not done:
                next_state = torch.tensor([create_state(observation)], dtype=torch.float, device=device)
            else:
                next_state = None
            reward = torch.tensor([reward], device=device)
            memory.push(state, action, next_state, reward)
            state = next_state

            # One optimization step on the policy network per environment step
            optimize_model(policy_net, target_net, optimizer, memory)
            if done:
                rewards.append(total_reward)
                break
        
        # Periodically sync the target network, checkpoint the model, and plot progress
        if episode % TARGET_UPDATE == 0:
            target_net.load_state_dict(policy_net.state_dict())
        if episode % SAVE_INTERVAL == 0:
            torch.save(target_net.state_dict(), f'models/model_{episode}.pth')
        if episode % 100 == 0:
            plot_rewards(np.cumsum(rewards), baseline=np.zeros(len(rewards)))

    return rewards
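
Example #2 depends on `optimize_model` and `ReplayMemory`, both defined elsewhere in the project. The following is a minimal sketch of a standard DQN optimization step in the style of the PyTorch DQN tutorial, assuming the replay memory exposes `sample()` and `__len__()`, stores (state, action, next_state, reward) tuples, and that actions were saved as shape (1, 1) long tensors as pushed in the loop above; `BATCH_SIZE` and `GAMMA` are placeholder hyperparameters, not the project's real ones.

import torch
import torch.nn.functional as F

# Placeholder hyperparameters; the project defines its own values elsewhere.
BATCH_SIZE, GAMMA = 128, 0.99


def optimize_model(policy_net, target_net, optimizer, memory):
    # Skip updates until the replay memory holds a full batch
    if len(memory) < BATCH_SIZE:
        return
    transitions = memory.sample(BATCH_SIZE)
    states, actions, next_states, rewards = zip(*transitions)

    state_batch = torch.cat(states)
    action_batch = torch.cat(actions)
    reward_batch = torch.cat(rewards)
    device = state_batch.device

    # Q(s, a) for the actions that were actually taken
    q_values = policy_net(state_batch).gather(1, action_batch)

    # Bootstrapped targets: r + GAMMA * max_a' Q_target(s', a'), zero for terminal states
    non_final_mask = torch.tensor([s is not None for s in next_states],
                                  dtype=torch.bool, device=device)
    next_q = torch.zeros(BATCH_SIZE, device=device)
    if non_final_mask.any():
        non_final_next = torch.cat([s for s in next_states if s is not None])
        with torch.no_grad():
            next_q[non_final_mask] = target_net(non_final_next).max(1)[0]
    targets = reward_batch + GAMMA * next_q

    # Huber loss between predicted and target Q-values, then one gradient step
    loss = F.smooth_l1_loss(q_values, targets.unsqueeze(1))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()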
Example #3
def train():
    env = Game(N_PLAYERS, LARGEST_CARD, HAND_SIZE, N_ROUNDS)

    net = DQN(n_inputs=2*LARGEST_CARD, n_outputs=HAND_SIZE).to(device)
    optimizer = RMSprop(net.parameters(), lr=LEARNING_RATE, weight_decay=WEIGHT_DECAY)

    episodic_rewards = []
    for episode in trange(N_EPISODES):
        # Roll out a full episode, then update the network on the collected trajectory
        states, rewards, actions = generate_episode(env, net)
        optimize_model(net, optimizer, states, rewards, actions)

        # Episode return: sum of the first column of the per-step reward array
        episodic_rewards.append(sum(rewards[:, 0]))

        if episode % SAVE_INTERVAL == 0:
            torch.save(net.state_dict(), f'models/model_{episode}.pth')
        if episode % 100 == 0:
            plot_rewards(np.cumsum(episodic_rewards),
                         baseline=np.zeros_like(episodic_rewards))

    return episodic_rewards
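
Example #3 trains a single network on whole episodes, which suggests a Monte Carlo policy-gradient update rather than the replay-based step of Example #2. Below is a minimal REINFORCE-style sketch of what its `optimize_model(net, optimizer, states, rewards, actions)` might look like; the discount factor `GAMMA`, the `device` definition, and the assumed shapes (states of shape (T, n_inputs), integer action indices, and a per-player reward array with the agent's reward in column 0) are all assumptions, not the project's actual implementation.

import numpy as np
import torch

GAMMA = 0.99  # placeholder discount factor
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


def optimize_model(net, optimizer, states, rewards, actions):
    # Assumed shapes: states (T, n_inputs), actions (T,), rewards (T, n_players)
    # with the learning agent's per-step reward in column 0.
    agent_rewards = np.asarray(rewards)[:, 0]

    # Discounted returns G_t, computed backwards over the episode
    returns = np.zeros(len(agent_rewards), dtype=np.float64)
    running = 0.0
    for t in reversed(range(len(agent_rewards))):
        running = agent_rewards[t] + GAMMA * running
        returns[t] = running

    states_t = torch.tensor(np.asarray(states), dtype=torch.float, device=device)
    actions_t = torch.tensor(np.asarray(actions), dtype=torch.long, device=device)
    returns_t = torch.tensor(returns, dtype=torch.float, device=device)

    # Log-probabilities of the taken actions, treating the net's outputs as logits
    log_probs = torch.log_softmax(net(states_t), dim=1)
    taken_log_probs = log_probs.gather(1, actions_t.unsqueeze(1)).squeeze(1)

    # REINFORCE loss: negative of the return-weighted log-likelihood of taken actions
    loss = -(taken_log_probs * returns_t).sum()
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()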