def test_single_game_term_t2():
    t = 2
    batch_size = 1
    torch.manual_seed(123)
    np.random.seed(123)
    s = ecn.State(**sampling.generate_batch(batch_size))
    s.pool = torch.LongTensor([[3, 7, 2]])
    s.utilities = torch.LongTensor([[[5, 4, 3], [3, 4, 5]]])
    s.last_proposal = torch.LongTensor([[3, 0, 0]])
    total_available = 3 * 5 + 7 * 4 + 2 * 5
    print('total_available', total_available)
    # so, the proposer is the second agent, ie agent 1
    # so, the proposer, agent 1, will take: 3 0 0
    # accepter, agent 0, will take 0 7 2
    actual = 0 * 5 + 7 * 4 + 2 * 3 + 3 * 3
    print('actual', actual)
    ratio = actual / total_available
    print('ratio', ratio)
    agent = 0 if t % 2 == 0 else 1
    term = torch.ByteTensor([1])
    rewards = rewards_lib.calc_rewards(s=s, t=t, term=term)
    assert rewards[0, 0] == approx(
        (0 * 5 + 7 * 4 + 2 * 3) / (3 * 5 + 7 * 4 + 2 * 3))
    assert rewards[0, 1] == approx(
        (3 * 3 + 0 * 4 + 0 * 5) / (3 * 3 + 7 * 4 + 2 * 5))
    assert rewards[0, 2] == approx(ratio)


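# A minimal sketch of the normalization the assertions above encode (a
# re-derivation from the test values, not necessarily how
# rewards_lib.calc_rewards is implemented): each agent is rewarded with the
# utility of its share divided by the utility it would get from the whole
# pool, and the third, prosocial reward is the joint utility divided by the
# best achievable joint utility. The helper name and signature are
# hypothetical.
def _expected_rewards_sketch(pool, utilities, proposer_share, proposer=1):
    # pool: LongTensor of shape (3,); utilities: LongTensor of shape (2, 3)
    accepter = 1 - proposer
    accepter_share = pool - proposer_share
    max_utility = torch.max(utilities, 0)[0]
    r = [0.0, 0.0, 0.0]
    r[accepter] = float((accepter_share * utilities[accepter]).sum()) / \
        float((pool * utilities[accepter]).sum())
    r[proposer] = float((proposer_share * utilities[proposer]).sum()) / \
        float((pool * utilities[proposer]).sum())
    joint = float((accepter_share * utilities[accepter]).sum()) + \
        float((proposer_share * utilities[proposer]).sum())
    r[2] = joint / float((pool * max_utility).sum())
    return r  # e.g. [34/49, 9/47, 43/53] for the hard-coded values above

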
def test_single_game_term_exceeds_withinpool2():
    t = 1
    batch_size = 1
    torch.manual_seed(123)
    np.random.seed(123)
    s = ecn.State(**sampling.generate_batch(batch_size))
    s.pool = torch.LongTensor([[3, 7, 2]])
    s.utilities = torch.LongTensor([[[5, 4, 3], [3, 4, 5]]])
    # last proposal is agent 0's, and we are now on agent 1, who is accepting it
    s.last_proposal = torch.LongTensor([[3, 7, 2]])
    total_available = 3 * 5 + 7 * 4 + 2 * 5
    print('total_available', total_available)
    actual = 3 * 5 + 7 * 4 + 2 * 3
    print('actual', actual)
    ratio = actual / total_available
    print('ratio', ratio)
    agent = 0 if t % 2 == 0 else 1
    term = torch.ByteTensor([1])
    rewards = rewards_lib.calc_rewards(s=s, t=t, term=term)
    assert rewards[0, 0] == approx(
        (3 * 5 + 7 * 4 + 2 * 3) / (3 * 5 + 7 * 4 + 2 * 3))
    assert rewards[0, 1] == approx((0) / (3 * 3 + 7 * 4 + 2 * 5))
    assert rewards[0, 2] == approx(ratio)


def test_single_game_noterm():
    t = 1
    batch_size = 1
    torch.manual_seed(123)
    np.random.seed(123)
    s = ecn.State(**sampling.generate_batch(batch_size))
    agent = 0 if t % 2 == 0 else 1
    term = torch.ByteTensor([0])
    rewards = rewards_lib.calc_rewards(s=s, t=t, term=term)
    assert rewards[0].tolist() == [0, 0, 0]


def test_rewards_t0():
    t = 0
    batch_size = 128
    torch.manual_seed(123)
    np.random.seed(123)
    s = ecn.State(**sampling.generate_batch(batch_size))
    agent = 0 if t % 2 == 0 else 1
    term = torch.from_numpy(np.random.choice(2, batch_size)).long()
    rewards = rewards_lib.calc_rewards(s=s, t=t, term=term)
    assert rewards.size() == (batch_size, 3)
    assert rewards.abs().sum() == 0


def test_rewards_t1():
    t = 1
    batch_size = 97
    torch.manual_seed(123)
    np.random.seed(123)
    s = ecn.State(**sampling.generate_batch(batch_size))
    agent = 0 if t % 2 == 0 else 1
    term = torch.from_numpy(np.random.choice(2, batch_size)).long()
    rewards = rewards_lib.calc_rewards(s=s, t=t, term=term)
    # print('alive_games', alive_games)
    for b in range(batch_size):
        # game = alive_games[b]
        assert rewards[b].tolist() == [0, 0, 0] or term[b] == 1


def test_single_game_term_t2_batch3():
    t = 2
    batch_size = 3
    torch.manual_seed(123)
    np.random.seed(123)
    s = ecn.State(**sampling.generate_batch(batch_size))
    s.pool = torch.from_numpy(np.random.choice(10, (batch_size, 3))).long()
    s.pool[1] = torch.LongTensor([3, 7, 2])
    s.utilities = torch.from_numpy(
        np.random.choice(10, (batch_size, 2, 3))).long()
    s.utilities[1] = torch.LongTensor([[5, 4, 3], [3, 4, 5]])
    s.last_proposal = torch.from_numpy(
        np.random.choice(10, (batch_size, 3))).long()
    s.last_proposal[1] = torch.LongTensor([3, 0, 0])
    term = torch.ByteTensor([0, 1, 0])
    # since only game 1 terminated, only the hard-coded values above should
    # produce a reward; all other games should get zero
    s.pool[0] = s.last_proposal[0]
    s.pool[2] = s.last_proposal[2]
    # make the rewards for games 0 and 2 equal to 1.0, if they did terminate
    s.utilities[0][1] = torch.max(s.utilities[0], 0)[0].view(1, 3)
    s.utilities[2][1] = torch.max(s.utilities[2], 0)[0].view(1, 3)
    total_available = 3 * 5 + 7 * 4 + 2 * 5
    print('total_available', total_available)
    # so, the proposer is the second agent, ie agent 1
    # so, the proposer, agent 1, will take: 3 0 0
    # accepter, agent 0, will take 0 7 2
    actual = 0 * 5 + 7 * 4 + 2 * 3 + 3 * 3
    print('actual', actual)
    ratio = actual / total_available
    print('ratio', ratio)
    agent = 0 if t % 2 == 0 else 1
    rewards = rewards_lib.calc_rewards(s=s, t=t, term=term)
    assert rewards[1, 0] == approx(
        (0 * 5 + 7 * 4 + 2 * 3) / (3 * 5 + 7 * 4 + 2 * 3))
    assert rewards[1, 1] == approx(
        (3 * 3 + 0 * 4 + 0 * 5) / (3 * 3 + 7 * 4 + 2 * 5))
    assert rewards[0].tolist() == [0.0, 0.0, 0]
    assert rewards[1, 2] == approx(ratio)
    assert rewards[2].tolist() == [0.0, 0.0, 0]


def test_single_game_term_exceeds_pool():
    t = 1
    batch_size = 1
    torch.manual_seed(123)
    np.random.seed(123)
    s = ecn.State(**sampling.generate_batch(batch_size))
    s.pool = torch.LongTensor([[3, 7, 2]])
    s.utilities = torch.LongTensor([[[5, 4, 3], [3, 4, 5]]])
    # last proposal is agent 0's, and we are now on agent 1, who is accepting it
    s.last_proposal = torch.LongTensor([[0, 2, 3]])
    agent = 0 if t % 2 == 0 else 1
    term = torch.ByteTensor([1])
    rewards = rewards_lib.calc_rewards(s=s, t=t, term=term)
    assert rewards[0].tolist() == [0, 0, 0]


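# A minimal sketch of the validity check the test above relies on (an
# assumption about calc_rewards' behaviour, not its actual code): if the
# accepted proposal asks for more of any item than the pool holds, the game
# is treated as invalid and both agents get zero reward. The helper name is
# hypothetical.
def _proposal_within_pool_sketch(pool, proposal):
    # pool, proposal: LongTensors of shape (3,)
    return bool((proposal <= pool).all())

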
def test_single_game_term_ideal():
    t = 1
    batch_size = 1
    torch.manual_seed(123)
    np.random.seed(123)
    s = ecn.State(**sampling.generate_batch(batch_size))
    s.pool = torch.LongTensor([[3, 7, 2]])
    s.utilities = torch.LongTensor([[[5, 4, 3], [3, 4, 5]]])
    # last proposal is agent 0's, and we are now on agent 1, who is accepting it
    s.last_proposal = torch.LongTensor([[3, 7, 0]])
    agent = 0 if t % 2 == 0 else 1
    term = torch.ByteTensor([1])
    rewards = rewards_lib.calc_rewards(s=s, t=t, term=term)
    assert rewards[0, 0] == approx((3 * 5 + 7 * 4) / (3 * 5 + 7 * 4 + 2 * 3))
    assert rewards[0, 1] == approx((2 * 5) / (3 * 3 + 7 * 4 + 2 * 5))
    assert rewards[0, 2] == 1.0


def test_single_game_term_t2_batch3_zero_term():
    t = 2
    batch_size = 3
    torch.manual_seed(123)
    np.random.seed(123)
    s = ecn.State(**sampling.generate_batch(batch_size))
    s.pool = torch.from_numpy(np.random.choice(10, (batch_size, 3))).long()
    s.pool[1] = torch.LongTensor([3, 7, 2])
    s.utilities = torch.from_numpy(
        np.random.choice(10, (batch_size, 2, 3))).long()
    s.utilities[1] = torch.LongTensor([[5, 4, 3], [3, 4, 5]])
    s.last_proposal = torch.from_numpy(
        np.random.choice(10, (batch_size, 3))).long()
    s.last_proposal[1] = torch.LongTensor([3, 0, 0])
    term = torch.ByteTensor([0, 0, 0])
    s.pool[0] = s.last_proposal[0]
    s.pool[2] = s.last_proposal[2]
    total_available = 3 * 5 + 7 * 4 + 2 * 5
    print('total_available', total_available)
    # so, the proposer is the second agent, ie agent 1
    # so, the proposer, agent 1, will take: 3 0 0
    # accepter, agent 0, will take 0 7 2
    actual = 0 * 5 + 7 * 4 + 2 * 3 + 3 * 3
    print('actual', actual)
    ratio = actual / total_available
    print('ratio', ratio)
    agent = 0 if t % 2 == 0 else 1
    rewards = rewards_lib.calc_rewards(s=s, t=t, term=term)
    assert rewards[0].tolist() == [0.0, 0.0, 0]
    assert rewards[1].tolist() == [0.0, 0.0, 0]
    assert rewards[2].tolist() == [0.0, 0.0, 0]


def run_episode(
        batch,
        enable_cuda,
        enable_comms,
        enable_proposal,
        prosocial,
        agent_models,
        # batch_size,
        testing,
        render=False):
    """
    Turning testing on disables stochasticity: we always pick the argmax.
    """
    type_constr = torch.cuda if enable_cuda else torch
    batch_size = batch['N'].size()[0]
    s = State(**batch)
    if enable_cuda:
        s.cuda()

    sieve = alive_sieve.AliveSieve(batch_size=batch_size, enable_cuda=enable_cuda)
    actions_by_timestep = []
    alive_masks = []

    # the next two tensors won't be sieved; they stay the same size for the
    # whole episode, and we update them using sieve.out_idxes[...]
    rewards = type_constr.FloatTensor(batch_size, SEQ_LEN).fill_(0)
    num_steps = type_constr.LongTensor(batch_size).fill_(10)

    term_matches_argmax_count = 0
    utt_matches_argmax_count = 0
    utt_stochastic_draws = 0
    num_policy_runs = 0
    prop_matches_argmax_count = 0
    prop_stochastic_draws = 0

    entropy_loss_by_agent = [
        Variable(type_constr.FloatTensor(1).fill_(0)),
        Variable(type_constr.FloatTensor(1).fill_(0))
    ]
    if render:
        print(' ')
    for t in range(10):
        agent = t % 2
        agent_model = agent_models[agent]
        if enable_comms:
            _prev_message = s.m_prev
        else:
            # we dont strictly need to blank them, since they'll be all zeros anyway,
            # but defense in depth and all that :)
            _prev_message = type_constr.LongTensor(sieve.batch_size, 6).fill_(0)
        if enable_proposal:
            _prev_proposal = s.last_proposal
        else:
            # we do need to blank this one though :)
            _prev_proposal = type_constr.LongTensor(sieve.batch_size, SEQ_LEN).fill_(0)
        nodes, term_a, s.m_prev, this_proposal, _entropy_loss, \
            _term_matches_argmax_count, _utt_matches_argmax_count, _utt_stochastic_draws, \
            _prop_matches_argmax_count, _prop_stochastic_draws = agent_model(
                pool=Variable(s.pool),
                utility=Variable(s.utilities[:, agent]),
                m_prev=Variable(_prev_message),
                prev_proposal=Variable(_prev_proposal),
                testing=testing
            )
        entropy_loss_by_agent[agent] += _entropy_loss
        actions_by_timestep.append(nodes)
        term_matches_argmax_count += _term_matches_argmax_count
        num_policy_runs += sieve.batch_size
        utt_matches_argmax_count += _utt_matches_argmax_count
        utt_stochastic_draws += _utt_stochastic_draws
        prop_matches_argmax_count += _prop_matches_argmax_count
        prop_stochastic_draws += _prop_stochastic_draws

        if render and sieve.out_idxes[0] == 0:
            render_action(t=t, s=s, term=term_a, prop=this_proposal)

        new_rewards = rewards_lib.calc_rewards(t=t, s=s, term=term_a)
        rewards[sieve.out_idxes] = new_rewards
        s.last_proposal = this_proposal

        sieve.mark_dead(term_a)
        sieve.mark_dead(t + 1 >= s.N)
        alive_masks.append(sieve.alive_mask.clone())
        sieve.set_dead_global(num_steps, t + 1)
        if sieve.all_dead():
            break

        s.sieve_(sieve.alive_idxes)
        sieve.self_sieve_()

    if render:
        print(' r: %.2f' % rewards[0].mean())
        print(' ')

    return actions_by_timestep, rewards, num_steps, alive_masks, entropy_loss_by_agent, \
        term_matches_argmax_count, num_policy_runs, utt_matches_argmax_count, utt_stochastic_draws, \
        prop_matches_argmax_count, prop_stochastic_draws
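

# A hypothetical usage sketch of run_episode, kept as a comment because it
# needs trained agent models; AgentModel and its constructor are assumptions,
# not this repo's actual API:
#
#   batch = sampling.generate_batch(128)
#   agent_models = [AgentModel(), AgentModel()]
#   (actions_by_timestep, rewards, num_steps, alive_masks,
#    entropy_loss_by_agent, term_matches_argmax_count, num_policy_runs,
#    utt_matches_argmax_count, utt_stochastic_draws,
#    prop_matches_argmax_count, prop_stochastic_draws) = run_episode(
#       batch=batch, enable_cuda=False, enable_comms=True,
#       enable_proposal=True, prosocial=True, agent_models=agent_models,
#       testing=True, render=False)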