def test_single_game_term_t2():
    t = 2
    batch_size = 1
    torch.manual_seed(123)
    np.random.seed(123)
    s = ecn.State(**sampling.generate_batch(batch_size))
    s.pool = torch.LongTensor([[3, 7, 2]])
    s.utilities = torch.LongTensor([[[5, 4, 3], [3, 4, 5]]])
    s.last_proposal = torch.LongTensor([[3, 0, 0]])
    total_available = 3 * 5 + 7 * 4 + 2 * 5
    print('total_available', total_available)
    # so, the proposer is the second agent, ie agent 1
    # so, the proposer, agent 1, will take: 3 0 0
    # accepter, agent 0, will take 0 7 2
    actual = 0 * 5 + 7 * 4 + 2 * 3 + 3 * 3
    print('actual', actual)
    ratio = actual / total_available
    print('ratio', ratio)
    agent = 0 if t % 2 == 0 else 1
    term = torch.ByteTensor([1])
    rewards = rewards_lib.calc_rewards(s=s, t=t, term=term)
    assert rewards[0, 0] == approx(
        (0 * 5 + 7 * 4 + 2 * 3) / (3 * 5 + 7 * 4 + 2 * 3))
    assert rewards[0, 1] == approx(
        (3 * 3 + 0 * 4 + 0 * 5) / (3 * 3 + 7 * 4 + 2 * 5))
    assert rewards[0, 2] == approx(ratio)


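# A minimal sketch of the normalization the assertions above encode (a
# re-derivation from the test values, not necessarily how
# rewards_lib.calc_rewards is implemented): each agent is rewarded with the
# utility of its share divided by the utility it would get from the whole
# pool, and the third, prosocial reward is the joint utility divided by the
# best achievable joint utility. The helper name and signature are
# hypothetical.
def _expected_rewards_sketch(pool, utilities, proposer_share, proposer=1):
    # pool: LongTensor of shape (3,); utilities: LongTensor of shape (2, 3)
    accepter = 1 - proposer
    accepter_share = pool - proposer_share
    max_utility = torch.max(utilities, 0)[0]
    r = [0.0, 0.0, 0.0]
    r[accepter] = float((accepter_share * utilities[accepter]).sum()) / \
        float((pool * utilities[accepter]).sum())
    r[proposer] = float((proposer_share * utilities[proposer]).sum()) / \
        float((pool * utilities[proposer]).sum())
    joint = float((accepter_share * utilities[accepter]).sum()) + \
        float((proposer_share * utilities[proposer]).sum())
    r[2] = joint / float((pool * max_utility).sum())
    return r  # e.g. [34/49, 9/47, 43/53] for the hard-coded values above

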
def test_single_game_term_exceeds_withinpool2():
    t = 1
    batch_size = 1
    torch.manual_seed(123)
    np.random.seed(123)
    s = ecn.State(**sampling.generate_batch(batch_size))
    s.pool = torch.LongTensor([[3, 7, 2]])
    s.utilities = torch.LongTensor([[[5, 4, 3], [3, 4, 5]]])
    # last proposal is agent 0's, and we are now on agent 1, who is accepting it
    s.last_proposal = torch.LongTensor([[3, 7, 2]])
    total_available = 3 * 5 + 7 * 4 + 2 * 5
    print('total_available', total_available)
    actual = 3 * 5 + 7 * 4 + 2 * 3
    print('actual', actual)
    ratio = actual / total_available
    print('ratio', ratio)
    agent = 0 if t % 2 == 0 else 1
    term = torch.ByteTensor([1])
    rewards = rewards_lib.calc_rewards(s=s, t=t, term=term)
    assert rewards[0, 0] == approx(
        (3 * 5 + 7 * 4 + 2 * 3) / (3 * 5 + 7 * 4 + 2 * 3))
    assert rewards[0, 1] == approx((0) / (3 * 3 + 7 * 4 + 2 * 5))
    assert rewards[0, 2] == approx(ratio)


def test_single_game_noterm():
    t = 1
    batch_size = 1
    torch.manual_seed(123)
    np.random.seed(123)
    s = ecn.State(**sampling.generate_batch(batch_size))
    agent = 0 if t % 2 == 0 else 1
    term = torch.ByteTensor([0])
    rewards = rewards_lib.calc_rewards(s=s, t=t, term=term)
    assert rewards[0].tolist() == [0, 0, 0]


def test_rewards_t0():
    t = 0
    batch_size = 128
    torch.manual_seed(123)
    np.random.seed(123)
    s = ecn.State(**sampling.generate_batch(batch_size))
    agent = 0 if t % 2 == 0 else 1
    term = torch.from_numpy(np.random.choice(2, batch_size)).long()
    rewards = rewards_lib.calc_rewards(s=s, t=t, term=term)
    assert rewards.size() == (batch_size, 3)
    assert rewards.abs().sum() == 0


def test_rewards_t1():
    t = 1
    batch_size = 97
    torch.manual_seed(123)
    np.random.seed(123)
    s = ecn.State(**sampling.generate_batch(batch_size))
    agent = 0 if t % 2 == 0 else 1
    term = torch.from_numpy(np.random.choice(2, batch_size)).long()
    rewards = rewards_lib.calc_rewards(s=s, t=t, term=term)
    # print('alive_games', alive_games)
    for b in range(batch_size):
        # game = alive_games[b]
        assert rewards[b].tolist() == [0, 0, 0] or term[b] == 1


def test_single_game_term_t2_batch3():
    t = 2
    batch_size = 3
    torch.manual_seed(123)
    np.random.seed(123)
    s = ecn.State(**sampling.generate_batch(batch_size))
    s.pool = torch.from_numpy(np.random.choice(10, (batch_size, 3))).long()
    s.pool[1] = torch.LongTensor([3, 7, 2])
    s.utilities = torch.from_numpy(
        np.random.choice(10, (batch_size, 2, 3))).long()
    s.utilities[1] = torch.LongTensor([[5, 4, 3], [3, 4, 5]])
    s.last_proposal = torch.from_numpy(
        np.random.choice(10, (batch_size, 3))).long()
    s.last_proposal[1] = torch.LongTensor([3, 0, 0])
    term = torch.ByteTensor([0, 1, 0])
    # since only game 1 terminated, only the hard-coded values above should
    # produce a reward; all other games should get zero
    s.pool[0] = s.last_proposal[0]
    s.pool[2] = s.last_proposal[2]
    # make the rewards for games 0 and 2 equal to 1.0, if they did terminate
    s.utilities[0][1] = torch.max(s.utilities[0], 0)[0].view(1, 3)
    s.utilities[2][1] = torch.max(s.utilities[2], 0)[0].view(1, 3)
    total_available = 3 * 5 + 7 * 4 + 2 * 5
    print('total_available', total_available)
    # so, the proposer is the second agent, ie agent 1
    # so, the proposer, agent 1, will take: 3 0 0
    # accepter, agent 0, will take 0 7 2
    actual = 0 * 5 + 7 * 4 + 2 * 3 + 3 * 3
    print('actual', actual)
    ratio = actual / total_available
    print('ratio', ratio)
    agent = 0 if t % 2 == 0 else 1
    rewards = rewards_lib.calc_rewards(s=s, t=t, term=term)
    assert rewards[1, 0] == approx(
        (0 * 5 + 7 * 4 + 2 * 3) / (3 * 5 + 7 * 4 + 2 * 3))
    assert rewards[1, 1] == approx(
        (3 * 3 + 0 * 4 + 0 * 5) / (3 * 3 + 7 * 4 + 2 * 5))
    assert rewards[0].tolist() == [0.0, 0.0, 0]
    assert rewards[1, 2] == approx(ratio)
    assert rewards[2].tolist() == [0.0, 0.0, 0]


def test_single_game_term_exceeds_pool():
    t = 1
    batch_size = 1
    torch.manual_seed(123)
    np.random.seed(123)
    s = ecn.State(**sampling.generate_batch(batch_size))
    s.pool = torch.LongTensor([[3, 7, 2]])
    s.utilities = torch.LongTensor([[[5, 4, 3], [3, 4, 5]]])
    # last proposal is agent 0's, and we are now on agent 1, who is accepting it
    s.last_proposal = torch.LongTensor([[0, 2, 3]])
    agent = 0 if t % 2 == 0 else 1
    term = torch.ByteTensor([1])
    rewards = rewards_lib.calc_rewards(s=s, t=t, term=term)
    assert rewards[0].tolist() == [0, 0, 0]


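# A minimal sketch of the validity check the test above relies on (an
# assumption about calc_rewards' behaviour, not its actual code): if the
# accepted proposal asks for more of any item than the pool holds, the game
# is treated as invalid and both agents get zero reward. The helper name is
# hypothetical.
def _proposal_within_pool_sketch(pool, proposal):
    # pool, proposal: LongTensors of shape (3,)
    return bool((proposal <= pool).all())

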
def test_single_game_term_ideal():
    t = 1
    batch_size = 1
    torch.manual_seed(123)
    np.random.seed(123)
    s = ecn.State(**sampling.generate_batch(batch_size))
    s.pool = torch.LongTensor([[3, 7, 2]])
    s.utilities = torch.LongTensor([[[5, 4, 3], [3, 4, 5]]])
    # last proposal is agent 0's, and we are now on agent 1, who is accepting it
    s.last_proposal = torch.LongTensor([[3, 7, 0]])
    agent = 0 if t % 2 == 0 else 1
    term = torch.ByteTensor([1])
    rewards = rewards_lib.calc_rewards(s=s, t=t, term=term)
    assert rewards[0, 0] == approx((3 * 5 + 7 * 4) / (3 * 5 + 7 * 4 + 2 * 3))
    assert rewards[0, 1] == approx((2 * 5) / (3 * 3 + 7 * 4 + 2 * 5))
    assert rewards[0, 2] == 1.0


def test_single_game_term_t2_batch3_zero_term():
    t = 2
    batch_size = 3
    torch.manual_seed(123)
    np.random.seed(123)
    s = ecn.State(**sampling.generate_batch(batch_size))
    s.pool = torch.from_numpy(np.random.choice(10, (batch_size, 3))).long()
    s.pool[1] = torch.LongTensor([3, 7, 2])
    s.utilities = torch.from_numpy(
        np.random.choice(10, (batch_size, 2, 3))).long()
    s.utilities[1] = torch.LongTensor([[5, 4, 3], [3, 4, 5]])
    s.last_proposal = torch.from_numpy(
        np.random.choice(10, (batch_size, 3))).long()
    s.last_proposal[1] = torch.LongTensor([3, 0, 0])
    term = torch.ByteTensor([0, 0, 0])
    s.pool[0] = s.last_proposal[0]
    s.pool[2] = s.last_proposal[2]
    total_available = 3 * 5 + 7 * 4 + 2 * 5
    print('total_available', total_available)
    # so, the proposer is the second agent, ie agent 1
    # so, the proposer, agent 1, will take: 3 0 0
    # accepter, agent 0, will take 0 7 2
    actual = 0 * 5 + 7 * 4 + 2 * 3 + 3 * 3
    print('actual', actual)
    ratio = actual / total_available
    print('ratio', ratio)
    agent = 0 if t % 2 == 0 else 1
    rewards = rewards_lib.calc_rewards(s=s, t=t, term=term)
    assert rewards[0].tolist() == [0.0, 0.0, 0]
    assert rewards[1].tolist() == [0.0, 0.0, 0]
    assert rewards[2].tolist() == [0.0, 0.0, 0]


def run_episode(
        batch,
        enable_cuda,
        enable_comms,
        enable_proposal,
        prosocial,
        agent_models,
        # batch_size,
        testing,
        render=False):
    """
    Turning testing on disables stochasticity: we always pick the argmax.
    """
    type_constr = torch.cuda if enable_cuda else torch
    batch_size = batch['N'].size()[0]
    s = State(**batch)
    if enable_cuda:
        s.cuda()

    sieve = alive_sieve.AliveSieve(batch_size=batch_size, enable_cuda=enable_cuda)
    actions_by_timestep = []
    alive_masks = []

    # the next two tensors won't be sieved; they stay the same size for the
    # whole episode, and we update them using sieve.out_idxes[...]
    rewards = type_constr.FloatTensor(batch_size, SEQ_LEN).fill_(0)
    num_steps = type_constr.LongTensor(batch_size).fill_(10)

    term_matches_argmax_count = 0
    utt_matches_argmax_count = 0
    utt_stochastic_draws = 0
    num_policy_runs = 0
    prop_matches_argmax_count = 0
    prop_stochastic_draws = 0

    entropy_loss_by_agent = [
        Variable(type_constr.FloatTensor(1).fill_(0)),
        Variable(type_constr.FloatTensor(1).fill_(0))
    ]
    if render:
        print(' ')
    for t in range(10):
        agent = t % 2
        agent_model = agent_models[agent]
        if enable_comms:
            _prev_message = s.m_prev
        else:
            # we dont strictly need to blank them, since they'll be all zeros anyway,
            # but defense in depth and all that :)
            _prev_message = type_constr.LongTensor(sieve.batch_size, 6).fill_(0)
        if enable_proposal:
            _prev_proposal = s.last_proposal
        else:
            # we do need to blank this one though :)
            _prev_proposal = type_constr.LongTensor(sieve.batch_size, SEQ_LEN).fill_(0)
        nodes, term_a, s.m_prev, this_proposal, _entropy_loss, \
            _term_matches_argmax_count, _utt_matches_argmax_count, _utt_stochastic_draws, \
            _prop_matches_argmax_count, _prop_stochastic_draws = agent_model(
                pool=Variable(s.pool),
                utility=Variable(s.utilities[:, agent]),
                m_prev=Variable(_prev_message),
                prev_proposal=Variable(_prev_proposal),
                testing=testing
            )
        entropy_loss_by_agent[agent] += _entropy_loss
        actions_by_timestep.append(nodes)
        term_matches_argmax_count += _term_matches_argmax_count
        num_policy_runs += sieve.batch_size
        utt_matches_argmax_count += _utt_matches_argmax_count
        utt_stochastic_draws += _utt_stochastic_draws
        prop_matches_argmax_count += _prop_matches_argmax_count
        prop_stochastic_draws += _prop_stochastic_draws

        if render and sieve.out_idxes[0] == 0:
            render_action(t=t, s=s, term=term_a, prop=this_proposal)

        new_rewards = rewards_lib.calc_rewards(t=t, s=s, term=term_a)
        rewards[sieve.out_idxes] = new_rewards
        s.last_proposal = this_proposal

        sieve.mark_dead(term_a)
        sieve.mark_dead(t + 1 >= s.N)
        alive_masks.append(sieve.alive_mask.clone())
        sieve.set_dead_global(num_steps, t + 1)
        if sieve.all_dead():
            break

        s.sieve_(sieve.alive_idxes)
        sieve.self_sieve_()

    if render:
        print(' r: %.2f' % rewards[0].mean())
        print(' ')

    return actions_by_timestep, rewards, num_steps, alive_masks, entropy_loss_by_agent, \
        term_matches_argmax_count, num_policy_runs, utt_matches_argmax_count, utt_stochastic_draws, \
        prop_matches_argmax_count, prop_stochastic_draws
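

# A hypothetical usage sketch of run_episode, kept as a comment because it
# needs trained agent models; AgentModel and its constructor are assumptions,
# not this repo's actual API:
#
#   batch = sampling.generate_batch(128)
#   agent_models = [AgentModel(), AgentModel()]
#   (actions_by_timestep, rewards, num_steps, alive_masks,
#    entropy_loss_by_agent, term_matches_argmax_count, num_policy_runs,
#    utt_matches_argmax_count, utt_stochastic_draws,
#    prop_matches_argmax_count, prop_stochastic_draws) = run_episode(
#       batch=batch, enable_cuda=False, enable_comms=True,
#       enable_proposal=True, prosocial=True, agent_models=agent_models,
#       testing=True, render=False)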