import numpy as np
from unittest.mock import Mock

# Experience and ExperienceBatcher are the project classes under test;
# target_computer_class_mock is injected from outside, e.g. by a
# @mock.patch decorator on the target-computer class (not shown here).


def test_experiences_to_batches(target_computer_class_mock):
    compute = target_computer_class_mock.return_value.compute
    compute.return_value = np.array([42, 43])

    state1 = np.arange(16).reshape((4, 4)) + 1
    state2 = np.arange(16).reshape((4, 4)) + 2
    state3 = np.arange(16).reshape((4, 4)) + 3
    experiences = [Experience(state1, 1, 2, state2, False, False, [3]),
                   Experience(state2, 3, 4, state3, True, False, [])]
    run_inference = Mock(
        side_effect=[np.array([[0, 0, 0, -0.5], [0, 0, 0, 0]])])
    batcher = ExperienceBatcher(None, run_inference, None, 1.0 / 15.0)

    state_batch, targets, actions = batcher.experiences_to_batches(experiences)

    # Expected arguments passed to the (mocked) target computer.
    reward_batch = np.array([2, 4])
    bad_action_batch = np.array([False, True])
    next_state_batch = np.array([state2.flatten(), state3.flatten()]) / 15.0
    available_actions_batch = np.array([[False, False, False, True],
                                        [False, False, False, False]])

    assert (compute.call_args_list[0][0][0] == reward_batch).all()
    assert (compute.call_args_list[0][0][1] == bad_action_batch).all()
    assert (compute.call_args_list[0][0][2] == next_state_batch).all()
    assert (compute.call_args_list[0][0][3] == available_actions_batch).all()

    expected_state_batch = np.array([state1.flatten(), state2.flatten()]) / 15.0
    assert (state_batch == expected_state_batch).all()
    assert (targets == np.array([42, 43])).all()
    assert (actions == np.array([1, 3])).all()
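# The assertions above pin down the contract of experiences_to_batches:
# states are flattened and scaled by the normalize factor, actions and
# rewards are copied through, an action counts as "bad" when the game ended
# or the action was unavailable, and targets come from the target computer
# that the test mocks. Below is a minimal free-function sketch of that
# logic. The Experience field names (game_over, not_available,
# next_state_available_actions) are assumptions inferred from the test, and
# target_computer stands in for whatever object the patched class would
# construct; none of these names are taken from the real implementation.
def experiences_to_batches_sketch(experiences, state_normalize_factor,
                                  target_computer):
    """Convert Experience tuples into (state_batch, targets, actions)."""
    n = len(experiences)
    state_batch = np.zeros((n, 16))
    next_state_batch = np.zeros((n, 16))
    actions = np.zeros((n,), dtype=np.int_)
    reward_batch = np.zeros((n,))
    bad_action_batch = np.zeros((n,), dtype=np.bool_)
    available_actions_batch = np.zeros((n, 4), dtype=np.bool_)

    for i, experience in enumerate(experiences):
        state_batch[i, :] = experience.state.flatten() * state_normalize_factor
        next_state_batch[i, :] = (experience.next_state.flatten() *
                                  state_normalize_factor)
        actions[i] = experience.action
        reward_batch[i] = experience.reward
        bad_action_batch[i] = experience.game_over or experience.not_available
        available_actions_batch[i, experience.next_state_available_actions] = True

    targets = target_computer.compute(reward_batch, bad_action_batch,
                                      next_state_batch,
                                      available_actions_batch)
    return state_batch, targets, actions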
def run_training(train_dir):
    """Run the training loop, resuming from train_dir if a checkpoint exists."""
    resume = os.path.exists(train_dir)

    with tf.Graph().as_default():
        model = FeedModel()
        saver = tf.train.Saver()
        session = tf.Session()
        summary_writer = tf.summary.FileWriter(train_dir,
                                               graph=session.graph,
                                               flush_secs=10)

        if resume:
            print("Resuming: ", train_dir)
            saver.restore(session, tf.train.latest_checkpoint(train_dir))
        else:
            print("Starting new training: ", train_dir)
            session.run(model.init)

        run_inference = make_run_inference(session, model)
        get_q_values = make_get_q_values(session, model)

        experience_collector = ExperienceCollector()
        batcher = ExperienceBatcher(experience_collector, run_inference,
                                    get_q_values, STATE_NORMALIZE_FACTOR)
        test_experiences = experience_collector.collect(play.random_strategy,
                                                        100)

        for state_batch, targets, actions in batcher.get_batches_stepwise():
            global_step, _ = session.run(
                [model.global_step, model.train_op],
                feed_dict={
                    model.state_batch_placeholder: state_batch,
                    model.targets_placeholder: targets,
                    model.actions_placeholder: actions,
                })

            # Checkpoint and report every 1000 steps.
            if global_step % 1000 == 0 and global_step != 0:
                saver.save(session, train_dir + "/checkpoint",
                           global_step=global_step)
                loss = write_summaries(session, batcher, model,
                                       test_experiences, summary_writer)
                print("Step:", global_step, "Loss:", loss)
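# run_inference and get_q_values are built by factory functions that close
# over the session and the model. A minimal sketch of both follows; it
# assumes FeedModel exposes a q_values output tensor alongside the
# placeholders fed in the training loop above. The q_values attribute name
# is an assumption (only state_batch_placeholder appears in this file).
def make_run_inference(session, model):
    """Return a function mapping a normalized state batch to Q-value rows."""
    def run_inference(state_batch):
        return session.run(
            model.q_values,
            feed_dict={model.state_batch_placeholder: state_batch})
    return run_inference


def make_get_q_values(session, model):
    """Return a function mapping a single 4x4 state to its four Q-values."""
    run_inference = make_run_inference(session, model)
    def get_q_values(state):
        state_vector = state.flatten() * STATE_NORMALIZE_FACTOR
        return run_inference(np.array([state_vector]))[0]
    return get_q_values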