def test_train(self):
    """Train Q-learning against a random opponent and check the greedy policy is optimal."""
    q = MemoryOnlyHashQ('test', 3)
    algo = AlgoQLearning(q)
    # Train on 100 games; the starting player is randomized so both
    # first- and second-move positions get visited.
    for _ in range(100):
        ep = Episode(algo, PolicyRandom())
        ep.run(StateTest(random.choice([True, False])))
    # The greedy (exploit-only) policy over the learned Q-table should
    # pick the known optimal action in each of these positions.
    policy = PolicyExploit(q)
    for cells, optimal_action in [
        ([0, 0, 0], 1),
        ([2, 0, 0], 1),
        ([0, 0, 2], 1),
    ]:
        state = StateTest()
        state.cells = cells
        self.assertEqual(policy.play(state)[0], optimal_action, state)
def episodes_from_fns(fns, limit=None, split='dev'):
    """Yield one Episode per scene of `split`, running the agent pipeline `fns`.

    If any scripted-tell component is present in `fns`, scenes are paired with
    their human scripts and replayed via `Episode.run_script`; otherwise bare
    scenes are run via `Episode.run`.

    Args:
        fns: sequence of episode-step callables (may include scripted tellers).
        limit: if not None, only the first `limit` scenes are used.
        split: dataset split name, e.g. 'dev'.

    Yields:
        Episode objects, produced under torch.no_grad() (inference only).
    """
    use_scripts = (scripted_tell in fns) or (scripted_tell_before_peek in fns)
    # The after-peek variant needs scenes paired with their post-peek scripts,
    # so it selects a different data source as well as enabling script replay.
    if scripted_tell_after_peek in fns:
        use_scripts = True
        run_from = codraw_data.get_scenes_and_scripts_with_peek(split)
    elif use_scripts:
        run_from = codraw_data.get_scenes_and_scripts(split)
    else:
        run_from = codraw_data.get_scenes(split)
    if limit is not None:
        run_from = run_from[:limit]
    # Note: the original kept a dead `sims = []` accumulator here; this is a
    # generator, so episodes are yielded directly and no list is needed.
    with torch.no_grad():
        for run_from_single in run_from:
            if use_scripts:
                episode = Episode.run_script(run_from_single, fns)
            else:
                episode = Episode.run(run_from_single, fns)
            yield episode
def collect_episodes(fns, dg, scenes=None, batch_size=16, utterance_penalty=0.25, gamma=0.99, uninformative_penalty=0.3):
    """Run `fns` on a random batch of scenes and build a training example batch.

    Args:
        fns: sequence of episode-step callables to run per scene.
        dg: datagen object forwarded to examples_from_episodes.
        scenes: pool of scenes to sample from; defaults to the 'dev' split.
            (Resolved at call time — the original eagerly evaluated
            `codraw_data.get_scenes('dev')` as the default, loading the dev
            set at import and sharing one list across all calls.)
        batch_size: number of scenes sampled (with replacement, via np.random.choice).
        utterance_penalty, gamma, uninformative_penalty: reward-shaping
            hyperparameters forwarded to examples_from_episodes.

    Returns:
        (episodes, example_batch) tuple.
    """
    if scenes is None:
        scenes = codraw_data.get_scenes('dev')
    with torch.no_grad():
        episodes = []
        for scene in np.random.choice(scenes, batch_size):
            ep = Episode.run(scene, fns)
            episodes.append(ep)
        example_batch = examples_from_episodes(
            episodes,
            dg=dg,
            utterance_penalty=utterance_penalty,
            gamma=gamma,
            uninformative_penalty=uninformative_penalty,
        )
    return episodes, example_batch