Example #1
import numpy as np

import sampling  # the project's own sampling module, used by all of these examples


def test_hash_batches():
    r = np.random.RandomState(123)
    test_batches = sampling.generate_test_batches(batch_size=8,
                                                  num_batches=3,
                                                  random_state=r)
    hashes = sampling.hash_batches(test_batches)
    for v in hashes:
        print(v)
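
For orientation: hash_batches fingerprints each test example so that training batches can later be screened for overlap (see Examples #2 and #3). The sketch below is only an illustration of that idea, not the project's actual implementation: it assumes each batch is a dict of numpy arrays keyed by 'pool', 'utilities', and 'N' (the fields Example #3 manipulates), and the helper name hash_batch_examples is made up here.

import hashlib

import numpy as np


def hash_batch_examples(batch):
    # fingerprint every example in the batch by hashing the bytes of all of its fields
    hashes = set()
    for b in range(len(batch['N'])):
        h = hashlib.sha256()
        h.update(np.ascontiguousarray(batch['pool'][b]).tobytes())
        h.update(np.ascontiguousarray(batch['utilities'][0][b]).tobytes())
        h.update(np.ascontiguousarray(batch['utilities'][1][b]).tobytes())
        h.update(np.asarray(batch['N'][b]).tobytes())
        hashes.add(h.hexdigest())
    return hashes
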
Example #2
def test_generate_nonoverlapping_train_batch():
    """
    Can't really verify the output here, but at least check that it runs.

    (Actually, we could test it by resetting the random state; see the sketch
    after this example.)
    """
    r = np.random.RandomState(123)
    test_batches = sampling.generate_test_batches(batch_size=32,
                                                  num_batches=3,
                                                  random_state=r)
    test_hashes = sampling.hash_batches(test_batches)
    train_batch = sampling.generate_training_batch(test_hashes=test_hashes,
                                                   batch_size=32,
                                                   random_state=r)
    print('train_batch', train_batch)
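
The docstring's idea of testing by resetting the random state can be made concrete. A minimal sketch, using only the sampling calls that already appear in these examples (overlaps is shown in Example #3); it assumes generate_training_batch skips or resamples any example whose hash is in test_hashes, and the test name is new:

def test_training_batch_avoids_test_set():
    r = np.random.RandomState(123)
    test_batches = sampling.generate_test_batches(batch_size=32,
                                                  num_batches=3,
                                                  random_state=r)
    test_hashes = sampling.hash_batches(test_batches)
    # reset the random state, so the training draw revisits the same samples
    # that produced the test batches
    r = np.random.RandomState(123)
    train_batch = sampling.generate_training_batch(test_hashes=test_hashes,
                                                   batch_size=32,
                                                   random_state=r)
    # anything that collided with the test set should have been filtered out
    assert not sampling.overlaps(test_hashes=test_hashes, batch=train_batch)
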
Example #3
def test_checkoverlap():
    r = np.random.RandomState(123)
    batches = sampling.generate_test_batches(batch_size=32,
                                             num_batches=5,
                                             random_state=r)
    test_batches = batches[:2]
    train_batches = batches[2:]
    test_hashes = sampling.hash_batches(test_batches)
    assert sampling.overlaps(test_hashes=test_hashes, batch=test_batches[1])
    assert not sampling.overlaps(test_hashes=test_hashes,
                                 batch=train_batches[0])
    # grab one example from test batch, copy to train batch, check now overlaps
    src = test_batches[1]
    dest = train_batches[0]
    dest['pool'][5] = src['pool'][12]
    dest['utilities'][0][5] = src['utilities'][0][12]
    dest['utilities'][1][5] = src['utilities'][1][12]
    dest['N'][5] = src['N'][12]
    assert sampling.overlaps(test_hashes=test_hashes, batch=train_batches[0])
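
Note that the copy covers every field that defines an example (the pool, both agents' utilities, and N); presumably the hash is computed over the complete example, so copying only part of it would not register as an overlap.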
Example #4
def run(enable_proposal, enable_comms, seed, prosocial, logfile, model_file,
        batch_size, term_entropy_reg, utterance_entropy_reg,
        proposal_entropy_reg, enable_cuda, no_load, testing, test_seed,
        render_every_seconds):
    """
    The --testing option will:
    - use argmax, i.e. disable stochastic draws
    - not run the optimizers
    - not save the model
    """
    type_constr = torch.cuda if enable_cuda else torch
    if seed is not None:
        np.random.seed(seed)
        torch.manual_seed(seed)
        train_r = np.random.RandomState(seed)
    else:
        train_r = np.random

    test_r = np.random.RandomState(test_seed)
    test_batches = sampling.generate_test_batches(batch_size=batch_size,
                                                  num_batches=5,
                                                  random_state=test_r)
    test_hashes = sampling.hash_batches(test_batches)

    episode = 0
    start_time = time.time()
    agent_models = []
    agent_opts = []
    for i in range(2):
        model = nets.AgentModel(enable_comms=enable_comms,
                                enable_proposal=enable_proposal,
                                term_entropy_reg=term_entropy_reg,
                                utterance_entropy_reg=utterance_entropy_reg,
                                proposal_entropy_reg=proposal_entropy_reg)
        if enable_cuda:
            model = model.cuda()
        agent_models.append(model)
        agent_opts.append(optim.Adam(params=agent_models[i].parameters()))
    if path.isfile(model_file) and not no_load:
        episode, start_time = load_model(model_file=model_file,
                                         agent_models=agent_models,
                                         agent_opts=agent_opts)
        print('loaded model')
    elif testing:
        print('')
        print('ERROR: must have loadable model to use --testing option')
        print('')
        return
    last_print = time.time()
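    # running totals for the periodic progress report; reset each time it is printed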
    rewards_sum = type_constr.FloatTensor(SEQ_LEN).fill_(0)
    steps_sum = 0
    count_sum = 0
    for d in ['logs', 'model_saves']:
        if not path.isdir(d):
            os.makedirs(d)
    f_log = open(logfile, 'w')
    f_log.write('meta: %s\n' % json.dumps({
        'enable_proposal': enable_proposal,
        'enable_comms': enable_comms,
        'prosocial': prosocial,
        'seed': seed
    }))
    last_save = time.time()
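    # moving-average reward baseline, subtracted from raw rewards to reduce the variance of the updates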
    baseline = type_constr.FloatTensor(SEQ_LEN).fill_(0)
    term_matches_argmax_count = 0
    num_policy_runs = 0
    utt_matches_argmax_count = 0
    utt_stochastic_draws = 0
    prop_matches_argmax_count = 0
    prop_stochastic_draws = 0
    while True:
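        # render (verbose episode printout plus test-set evaluation) at most once every render_every_seconds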
        render = time.time() - last_print >= render_every_seconds
        # render = True
        batch = sampling.generate_training_batch(batch_size=batch_size,
                                                 test_hashes=test_hashes,
                                                 random_state=train_r)
        actions, rewards, steps, alive_masks, entropy_loss_by_agent, \
                _term_matches_argmax_count, _num_policy_runs, _utt_matches_argmax_count, _utt_stochastic_draws, \
                _prop_matches_argmax_count, _prop_stochastic_draws = run_episode(
            batch=batch,
            enable_cuda=enable_cuda,
            enable_comms=enable_comms,
            enable_proposal=enable_proposal,
            agent_models=agent_models,
            prosocial=prosocial,
            # batch_size=batch_size,
            render=render,
            testing=testing)
        term_matches_argmax_count += _term_matches_argmax_count
        utt_matches_argmax_count += _utt_matches_argmax_count
        utt_stochastic_draws += _utt_stochastic_draws
        num_policy_runs += _num_policy_runs
        prop_matches_argmax_count += _prop_matches_argmax_count
        prop_stochastic_draws += _prop_stochastic_draws

        if not testing:
            for i in range(2):
                agent_opts[i].zero_grad()
            reward_loss_by_agent = [0, 0]
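            # advantage-style signal: subtract the running baseline from the raw rewards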
            baselined_rewards = rewards - baseline
            rewards_by_agent = []
            for i in range(2):
                if prosocial:
                    rewards_by_agent.append(baselined_rewards[:, 2])
                else:
                    rewards_by_agent.append(baselined_rewards[:, i])
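            # replay the per-timestep alive masks: global_idxes maps the examples still
            # alive at step t back to their positions in the full batch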
            sieve_playback = alive_sieve.SievePlayback(alive_masks,
                                                       enable_cuda=enable_cuda)
            for t, global_idxes in sieve_playback:
                agent = t % 2
                if len(actions[t]) > 0:
                    for action in actions[t]:
                        _rewards = rewards_by_agent[agent]
                        _reward = _rewards[global_idxes].float().contiguous().view(
                            sieve_playback.batch_size, 1)
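                        # policy-gradient style loss: weight the action term by its
                        # baselined reward, negate, and sum (minimized by the optimizer)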
                        _reward_loss = -(action * Variable(_reward))
                        _reward_loss = _reward_loss.sum()
                        reward_loss_by_agent[agent] += _reward_loss
            for i in range(2):
                loss = entropy_loss_by_agent[i] + reward_loss_by_agent[i]
                loss.backward()
                agent_opts[i].step()

        rewards_sum += rewards.sum(0)
        steps_sum += steps.sum()
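        # exponential moving average of the per-batch mean rewards (decay 0.7)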
        baseline = 0.7 * baseline + 0.3 * rewards.mean(0)
        count_sum += batch_size

        if render:
            """
            run the test batches, print the results
            """
            test_rewards_sum = 0
            for test_batch in test_batches:
                actions, test_rewards, steps, alive_masks, entropy_loss_by_agent, \
                        _term_matches_argmax_count, _num_policy_runs, _utt_matches_argmax_count, _utt_stochastic_draws, \
                        _prop_matches_argmax_count, _prop_stochastic_draws = run_episode(
                    batch=test_batch,
                    enable_cuda=enable_cuda,
                    enable_comms=enable_comms,
                    enable_proposal=enable_proposal,
                    agent_models=agent_models,
                    prosocial=prosocial,
                    render=True,
                    testing=True)
                test_rewards_sum += test_rewards[:, 2].mean()
            print('test reward=%.3f' % (test_rewards_sum / len(test_batches)))

            time_since_last = time.time() - last_print
            if prosocial:
                baseline_str = '%.2f' % baseline[2]
                # rewards_str = '%.2f' % (rewards_sum[2] / count_sum)
            else:
                baseline_str = '%.2f,%.2f' % (baseline[0], baseline[1])
            rewards_str = '%.2f,%.2f,%.2f' % (rewards_sum[0] / count_sum,
                                              rewards_sum[1] / count_sum,
                                              rewards_sum[2] / count_sum)
            print(
                'e=%s train=%s b=%s games/sec %s avg steps %.4f argmaxp term=%.4f utt=%.4f prop=%.4f'
                % (episode, rewards_str, baseline_str,
                   int(count_sum / time_since_last), steps_sum / count_sum,
                   term_matches_argmax_count / num_policy_runs,
                   safe_div(utt_matches_argmax_count, utt_stochastic_draws),
                   prop_matches_argmax_count / prop_stochastic_draws))
            # f_log.write(json.dumps({
            #     'episode': episode,
            #     'avg_reward_0': (rewards_sum[2] / count_sum).tolist(),
            #     'test_reward': (test_rewards_sum / len(test_batches)).tolist(),
            #     'avg_steps': (steps_sum / count_sum).tolist(),
            #     'games_sec': count_sum / time_since_last,
            #     'elapsed': time.time() - start_time,
            #     'argmaxp_term': (term_matches_argmax_count / num_policy_runs).tolist(),
            #     'argmaxp_utt': safe_div(utt_matches_argmax_count, utt_stochastic_draws),
            #     # 'argmaxp_prop': (prop_matches_argmax_count / prop_stochastic_draws)
            #     'argmaxp_prop': (prop_matches_argmax_count.tolist() / prop_stochastic_draws)
            # }) + '\n')
            # f_log.flush()
            last_print = time.time()
            steps_sum = 0
            rewards_sum.fill_(0)
            term_matches_argmax_count = 0
            num_policy_runs = 0
            utt_matches_argmax_count = 0
            utt_stochastic_draws = 0
            prop_matches_argmax_count = 0
            prop_stochastic_draws = 0
            count_sum = 0
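        # checkpoint the model and optimizer state at most every 30 seconds (skipped with --testing)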
        if not testing and time.time() - last_save >= 30.0:
            save_model(model_file=model_file,
                       agent_models=agent_models,
                       agent_opts=agent_opts,
                       start_time=start_time,
                       episode=episode)
            print('saved model')
            last_save = time.time()

        episode += 1
    f_log.close()