def test_hash_batches():
    r = np.random.RandomState(123)
    test_batches = sampling.generate_test_batches(batch_size=8, num_batches=3, random_state=r)
    hashes = sampling.hash_batches(test_batches)
    for v in hashes:
        print(v)
def test_generate_nonoverlapping_train_batch():
    """
    Hard to assert much here, so at least check that it runs.
    (We could test the non-overlap property properly by resetting the random
    state; see the sketch after this function.)
    """
    r = np.random.RandomState(123)
    test_batches = sampling.generate_test_batches(batch_size=32, num_batches=3, random_state=r)
    test_hashes = sampling.hash_batches(test_batches)
    train_batch = sampling.generate_training_batch(test_hashes=test_hashes, batch_size=32, random_state=r)
    print('train_batch', train_batch)
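# A sketch of the stronger test hinted at in the docstring above: re-seed the
# training RandomState identically to the test one, so that unfiltered sampling
# would tend to reproduce the test examples, then assert that the generated
# training batch still does not collide with the test hashes. This assumes
# sampling.overlaps(test_hashes=..., batch=...) behaves as used in
# test_checkoverlap below; the test name here is hypothetical.
def test_generate_nonoverlapping_train_batch_strict():
    r = np.random.RandomState(123)
    test_batches = sampling.generate_test_batches(batch_size=32, num_batches=3, random_state=r)
    test_hashes = sampling.hash_batches(test_batches)
    # same seed as above, so collisions would be likely without filtering
    train_r = np.random.RandomState(123)
    train_batch = sampling.generate_training_batch(test_hashes=test_hashes, batch_size=32, random_state=train_r)
    assert not sampling.overlaps(test_hashes=test_hashes, batch=train_batch)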
def test_checkoverlap():
    r = np.random.RandomState(123)
    batches = sampling.generate_test_batches(batch_size=32, num_batches=5, random_state=r)
    test_batches = batches[:2]
    train_batches = batches[2:]
    test_hashes = sampling.hash_batches(test_batches)
    assert sampling.overlaps(test_hashes=test_hashes, batch=test_batches[1])
    assert not sampling.overlaps(test_hashes=test_hashes, batch=train_batches[0])

    # copy one example from a test batch into a train batch, then check that
    # the train batch now overlaps the test set
    src = test_batches[1]
    dest = train_batches[0]
    dest['pool'][5] = src['pool'][12]
    dest['utilities'][0][5] = src['utilities'][0][12]
    dest['utilities'][1][5] = src['utilities'][1][12]
    dest['N'][5] = src['N'][12]
    assert sampling.overlaps(test_hashes=test_hashes, batch=train_batches[0])
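# For illustration only: a minimal sketch of how per-example hashing and the
# overlap check used above could work. This is an assumption, not necessarily
# how sampling.hash_batches / sampling.overlaps are implemented; it assumes
# each batch is a dict of array-likes keyed by 'pool', 'utilities' and 'N'.
def _example_hashes_sketch(batch):
    hashes = set()
    for b in range(len(batch['N'])):
        key = (
            tuple(batch['pool'][b].tolist()),
            tuple(batch['utilities'][0][b].tolist()),
            tuple(batch['utilities'][1][b].tolist()),
            int(batch['N'][b]),
        )
        hashes.add(hash(key))
    return hashes


def _overlaps_sketch(test_hashes, batch):
    # a batch overlaps the test set if any of its example hashes appears there
    return len(_example_hashes_sketch(batch) & set(test_hashes)) > 0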
def run(enable_proposal, enable_comms, seed, prosocial, logfile, model_file, batch_size,
        term_entropy_reg, utterance_entropy_reg, proposal_entropy_reg, enable_cuda, no_load,
        testing, test_seed, render_every_seconds):
    """
    The testing option will:
    - use argmax, i.e. disable stochastic draws
    - not run optimizers
    - not save the model
    """
    type_constr = torch.cuda if enable_cuda else torch
    if seed is not None:
        np.random.seed(seed)
        torch.manual_seed(seed)
        train_r = np.random.RandomState(seed)
    else:
        train_r = np.random
    test_r = np.random.RandomState(test_seed)
    test_batches = sampling.generate_test_batches(batch_size=batch_size, num_batches=5, random_state=test_r)
    test_hashes = sampling.hash_batches(test_batches)

    episode = 0
    start_time = time.time()
    agent_models = []
    agent_opts = []
    for i in range(2):
        model = nets.AgentModel(
            enable_comms=enable_comms,
            enable_proposal=enable_proposal,
            term_entropy_reg=term_entropy_reg,
            utterance_entropy_reg=utterance_entropy_reg,
            proposal_entropy_reg=proposal_entropy_reg)
        if enable_cuda:
            model = model.cuda()
        agent_models.append(model)
        agent_opts.append(optim.Adam(params=agent_models[i].parameters()))
    if path.isfile(model_file) and not no_load:
        episode, start_time = load_model(
            model_file=model_file, agent_models=agent_models, agent_opts=agent_opts)
        print('loaded model')
    elif testing:
        print('')
        print('ERROR: must have loadable model to use --testing option')
        print('')
        return
    last_print = time.time()
    rewards_sum = type_constr.FloatTensor(SEQ_LEN).fill_(0)
    steps_sum = 0
    count_sum = 0
    for d in ['logs', 'model_saves']:
        if not path.isdir(d):
            os.makedirs(d)
    f_log = open(logfile, 'w')
    f_log.write('meta: %s\n' % json.dumps({
        'enable_proposal': enable_proposal,
        'enable_comms': enable_comms,
        'prosocial': prosocial,
        'seed': seed
    }))
    last_save = time.time()
    baseline = type_constr.FloatTensor(SEQ_LEN).fill_(0)
    term_matches_argmax_count = 0
    num_policy_runs = 0
    utt_matches_argmax_count = 0
    utt_stochastic_draws = 0
    prop_matches_argmax_count = 0
    prop_stochastic_draws = 0
    while True:
        render = time.time() - last_print >= render_every_seconds
        # render = True
        batch = sampling.generate_training_batch(
            batch_size=batch_size, test_hashes=test_hashes, random_state=train_r)
        actions, rewards, steps, alive_masks, entropy_loss_by_agent, \
            _term_matches_argmax_count, _num_policy_runs, _utt_matches_argmax_count, _utt_stochastic_draws, \
            _prop_matches_argmax_count, _prop_stochastic_draws = run_episode(
                batch=batch,
                enable_cuda=enable_cuda,
                enable_comms=enable_comms,
                enable_proposal=enable_proposal,
                agent_models=agent_models,
                prosocial=prosocial,
                # batch_size=batch_size,
                render=render,
                testing=testing)
        term_matches_argmax_count += _term_matches_argmax_count
        utt_matches_argmax_count += _utt_matches_argmax_count
        utt_stochastic_draws += _utt_stochastic_draws
        num_policy_runs += _num_policy_runs
        prop_matches_argmax_count += _prop_matches_argmax_count
        prop_stochastic_draws += _prop_stochastic_draws

        if not testing:
            for i in range(2):
                agent_opts[i].zero_grad()
            reward_loss_by_agent = [0, 0]
            baselined_rewards = rewards - baseline
            rewards_by_agent = []
            for i in range(2):
                if prosocial:
                    # prosocial: both agents optimize the shared reward (column 2)
                    rewards_by_agent.append(baselined_rewards[:, 2])
                else:
                    rewards_by_agent.append(baselined_rewards[:, i])
            sieve_playback = alive_sieve.SievePlayback(alive_masks, enable_cuda=enable_cuda)
            for t, global_idxes in sieve_playback:
                agent = t % 2
                if len(actions[t]) > 0:
                    for action in actions[t]:
                        _rewards = rewards_by_agent[agent]
                        _reward = _rewards[global_idxes].float().contiguous().view(
                            sieve_playback.batch_size, 1)
                        # REINFORCE-style term: weight each stored action by its baselined reward
                        _reward_loss = - (action * Variable(_reward))
                        _reward_loss = _reward_loss.sum()
                        reward_loss_by_agent[agent] += _reward_loss
            for i in range(2):
                loss = entropy_loss_by_agent[i] + reward_loss_by_agent[i]
                loss.backward()
                agent_opts[i].step()

        rewards_sum += rewards.sum(0)
        steps_sum += steps.sum()
        baseline = 0.7 * baseline + 0.3 * rewards.mean(0)
        count_sum += batch_size

        if render:
            # run the test batches and print the results
            test_rewards_sum = 0
            for test_batch in test_batches:
                actions, test_rewards, steps, alive_masks, entropy_loss_by_agent, \
                    _term_matches_argmax_count, _num_policy_runs, _utt_matches_argmax_count, _utt_stochastic_draws, \
                    _prop_matches_argmax_count, _prop_stochastic_draws = run_episode(
                        batch=test_batch,
                        enable_cuda=enable_cuda,
                        enable_comms=enable_comms,
                        enable_proposal=enable_proposal,
                        agent_models=agent_models,
                        prosocial=prosocial,
                        render=True,
                        testing=True)
                test_rewards_sum += test_rewards[:, 2].mean()
            print('test reward=%.3f' % (test_rewards_sum / len(test_batches)))

            time_since_last = time.time() - last_print
            if prosocial:
                baseline_str = '%.2f' % baseline[2]
                # rewards_str = '%.2f' % (rewards_sum[2] / count_sum)
            else:
                baseline_str = '%.2f,%.2f' % (baseline[0], baseline[1])
            rewards_str = '%.2f,%.2f,%.2f' % (
                rewards_sum[0] / count_sum, rewards_sum[1] / count_sum, rewards_sum[2] / count_sum)
            print('e=%s train=%s b=%s games/sec %s avg steps %.4f argmaxp term=%.4f utt=%.4f prop=%.4f' % (
                episode,
                rewards_str,
                baseline_str,
                int(count_sum / time_since_last),
                steps_sum / count_sum,
                term_matches_argmax_count / num_policy_runs,
                safe_div(utt_matches_argmax_count, utt_stochastic_draws),
                prop_matches_argmax_count / prop_stochastic_draws))
            # f_log.write(json.dumps({
            #     'episode': episode,
            #     'avg_reward_0': (rewards_sum[2] / count_sum).tolist(),
            #     'test_reward': (test_rewards_sum / len(test_batches)).tolist(),
            #     'avg_steps': (steps_sum / count_sum).tolist(),
            #     'games_sec': count_sum / time_since_last,
            #     'elapsed': time.time() - start_time,
            #     'argmaxp_term': (term_matches_argmax_count / num_policy_runs).tolist(),
            #     'argmaxp_utt': safe_div(utt_matches_argmax_count, utt_stochastic_draws),
            #     # 'argmaxp_prop': (prop_matches_argmax_count / prop_stochastic_draws)
            #     'argmaxp_prop': (prop_matches_argmax_count.tolist() / prop_stochastic_draws)
            # }) + '\n')
            # f_log.flush()
            last_print = time.time()
            steps_sum = 0
            rewards_sum.fill_(0)
            term_matches_argmax_count = 0
            num_policy_runs = 0
            utt_matches_argmax_count = 0
            utt_stochastic_draws = 0
            prop_matches_argmax_count = 0
            prop_stochastic_draws = 0
            count_sum = 0
        if not testing and time.time() - last_save >= 30.0:
            save_model(
                model_file=model_file,
                agent_models=agent_models,
                agent_opts=agent_opts,
                start_time=start_time,
                episode=episode)
            print('saved model')
            last_save = time.time()

        episode += 1
    f_log.close()
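# `safe_div` is called above but not defined in this excerpt. If it is not
# already defined or imported elsewhere in the module, a minimal sketch of the
# assumed behavior (return 0 instead of raising ZeroDivisionError when no
# stochastic draws were made, e.g. in pure-argmax runs) could be:
def safe_div(a, b):
    return a / b if b != 0 else 0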