Example #1
 def test_gridworld_generate_samples(self):
     env = Gridworld()
     num_samples = 1000
     num_steps = 5
     samples = env.generate_samples(
         num_samples,
         epsilon=1.0,
         discount_factor=0.9,
         multi_steps=num_steps,
         include_shorter_samples_at_start=True,
         include_shorter_samples_at_end=True,
     )
     self._check_samples(samples, num_samples, num_steps, False)
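For reference, _check_samples is a test helper whose body is not shown in this example. A minimal sketch of the kind of consistency check it could perform follows; the field names mirror the Samples object used in Example #2, and everything else here is an assumption rather than the helper's real logic.

def _check_samples_sketch(samples, num_samples, num_steps):
    # Hypothetical helper: every per-transition field should carry one entry
    # per generated sample.
    assert len(samples.mdp_ids) == num_samples
    assert len(samples.states) == len(samples.actions) == len(samples.rewards)
    # With multi_steps=num_steps, each sample may carry up to num_steps future
    # transitions; shorter sequences can appear near episode boundaries when
    # include_shorter_samples_at_start/_end are enabled.
    for next_states in samples.next_states:
        assert 1 <= len(next_states) <= num_steps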
Example #2
    def test_predictor_torch_export(self):
        """Verify that q-values before model export equal q-values after
        model export. Meant to catch issues with export logic."""
        environment = Gridworld()
        samples = Samples(
            mdp_ids=["0"],
            sequence_numbers=[0],
            sequence_number_ordinals=[1],
            states=[{0: 1.0, 1: 1.0, 2: 1.0, 3: 1.0, 4: 1.0, 5: 1.0, 15: 1.0, 24: 1.0}],
            actions=["D"],
            action_probabilities=[0.5],
            rewards=[0],
            possible_actions=[["R", "D"]],
            next_states=[{5: 1.0}],
            next_actions=["U"],
            terminals=[False],
            possible_next_actions=[["R", "U", "D"]],
        )
        tdps = environment.preprocess_samples(samples, 1)
        assert len(tdps) == 1, "Invalid number of data pages"

        trainer = self.get_trainer(environment, {}, False, False, False)
        q_input = rlt.FeatureData(tdps[0].states)

        pre_export_q_values = trainer.q_network(q_input).detach().numpy()

        preprocessor = Preprocessor(environment.normalization, False)
        cpu_q_network = trainer.q_network.cpu_model()
        cpu_q_network.eval()
        dqn_with_preprocessor = DiscreteDqnWithPreprocessor(cpu_q_network, preprocessor)
        serving_module = DiscreteDqnPredictorWrapper(
            dqn_with_preprocessor, action_names=environment.ACTIONS
        )

        with tempfile.TemporaryDirectory() as tmpdirname:
            buf = export_module_to_buffer(serving_module)
            tmp_path = os.path.join(tmpdirname, "model")
            with open(tmp_path, "wb") as f:
                f.write(buf.getvalue())
            predictor = DiscreteDqnTorchPredictor(torch.jit.load(tmp_path))

        post_export_q_values = predictor.predict([samples.states[0]])

        for i, action in enumerate(environment.ACTIONS):
            self.assertAlmostEqual(
                float(pre_export_q_values[0][i]),
                float(post_export_q_values[0][action]),
                places=4,
            )
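The round trip above depends on export_module_to_buffer, whose implementation is not shown here. A minimal sketch of what such a helper could look like with plain TorchScript follows; the real ReAgent helper may differ, and the function name below is only illustrative.

import io

import torch


def export_module_to_buffer_sketch(module: torch.nn.Module) -> io.BytesIO:
    # Script the serving module and serialize it into an in-memory buffer so
    # the caller decides where to persist it; the test above writes it to a
    # temporary file and reloads it with torch.jit.load.
    buffer = io.BytesIO()
    torch.jit.save(torch.jit.script(module), buffer)
    buffer.seek(0)
    return buffer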
Example #3
 def test_doubly_robust(self):
     """Both the logged and model policies are epsilon-greedy policies where
     greedy = optimal, but the epsilon values are different. We test a variety
     of epsilon pairs to check the estimator's ability to evaluate model policies
     that are much different than the logged policies that generated the data. By
     computing the true values associated with both epsilon policies, we can
     see the performance and compute a percentage error.
     """
     environment = Gridworld()
     dr = DoublyRobustEstimator()
     epsilon_test_pairs = [
         [1.0, 0.05],
         [0.8, 0.2],
         [0.6, 0.4],
         [0.5, 0.5],
         [0.4, 0.6],
         [0.2, 0.8],
         [0.05, 1.0],
     ]
     for epsilon_pair in epsilon_test_pairs:
         epsilon_logged = epsilon_pair[0]
         epsilon_model = epsilon_pair[1]
         samples_logged = environment.generate_samples(
             10000, epsilon_logged, DISCOUNT)
         edp = self.create_edp(environment, samples_logged, epsilon_model)
         cpe_drs = dr.estimate(edp)
         true_logged_value = environment.true_q_epsilon_values(
             DISCOUNT, epsilon_logged)
         true_model_value = environment.true_q_epsilon_values(
             DISCOUNT, epsilon_model)
         ratio = true_model_value[0] / true_logged_value[0]
         cpe_drs_names = [
             "One-step direct method",
             "One-step inverse propensity",
             "One-step doubly robust",
         ]
          for name, cpe_dr in zip(cpe_drs_names, cpe_drs):
              percent_err = (cpe_dr.normalized - ratio) / ratio * 100
              logger.info(
                  f"{name}: epsilon_pair = ({epsilon_logged}, {epsilon_model});\n"
                  f"true ratio = {ratio}, computed ratio = {cpe_dr.normalized}, "
                  f"percent error = {percent_err}."
              )
              self.assertLessEqual(np.absolute(percent_err), 1000)
              self.assertLessEqual(cpe_dr.normalized_std_error, cpe_dr.normalized)
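For context, the one-step doubly robust estimate reported above combines a direct (model-based) value estimate with an importance-weighted correction on the logged action. A rough numpy sketch of that formula follows; the array names and shapes are illustrative and are not ReAgent's DoublyRobustEstimator API.

import numpy as np


def one_step_doubly_robust_sketch(
    rewards, logged_propensities, model_propensities, model_q_logged, model_values
):
    # Importance ratio between the evaluated (model) policy and the logged
    # policy for the action that was actually taken.
    importance_weights = model_propensities / logged_propensities
    # Direct-method estimate plus an importance-weighted correction of the
    # model's error on the logged action.
    dr_terms = model_values + importance_weights * (rewards - model_q_logged)
    return float(np.mean(dr_terms))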
Example #4
 def test_magic(self):
     """Both the logged and model policies are epsilon-greedy policies where
     greedy = optimal, but the epsilon values are different. We test a variety
     of epsilon pairs to check the estimator's ability to evaluate model policies
     that are much different than the logged policies that generated the data. By
     computing the true values associated with both epsilon policies, we can
     see the performance and compute a percentage error.
     """
     environment = Gridworld()
     weighted_sequential_dr = WeightedSequentialDoublyRobustEstimator(
         DISCOUNT)
     epsilon_test_pairs = [
         [1.0, 0.05],
         [0.8, 0.2],
         [0.6, 0.4],
         [0.5, 0.5],
         [0.4, 0.6],
         [0.2, 0.8],
         [0.05, 1.0],
     ]
     for epsilon_pair in epsilon_test_pairs:
         epsilon_logged = epsilon_pair[0]
         epsilon_model = epsilon_pair[1]
         samples_logged = environment.generate_samples(
             10000, epsilon_logged, DISCOUNT)
         edp = self.create_edp(environment, samples_logged, epsilon_model)
         cpe_magic = weighted_sequential_dr.estimate(
             edp, TestGridworldCPE.NUM_J_STEPS_FOR_MAGIC_ESTIMATOR, True)
         true_logged_value = environment.true_q_epsilon_values(
             DISCOUNT, epsilon_logged)
         true_model_value = environment.true_q_epsilon_values(
             DISCOUNT, epsilon_model)
         ratio = true_model_value[0] / true_logged_value[0]
         percent_err = (cpe_magic.normalized - ratio) / ratio * 100
         logger.info("Magic: epsilon_pair = (" + str(epsilon_logged) +
                     ", " + str(epsilon_model) + ");\n" + "true ratio = " +
                     str(ratio) + ", computed ratio = " +
                     str(cpe_magic.normalized) + ", percent error = " +
                     str(percent_err) + ".")
         self.assertLessEqual(np.absolute(percent_err), 100)
         self.assertLessEqual(cpe_magic.normalized_std_error,
                              cpe_magic.normalized)
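Unlike the one-step estimators, the MAGIC (weighted sequential doubly robust) estimator works over whole trajectories, weighting step t by the product of per-step importance ratios up to t and then blending j-step doubly robust returns. A tiny sketch of the cumulative-weight part follows; it is illustrative only and not the WeightedSequentialDoublyRobustEstimator interface.

import numpy as np


def cumulative_importance_weights_sketch(logged_propensities, model_propensities):
    # Per-step ratio between the evaluated policy and the logged policy, then
    # the running product up to each step of the trajectory.
    step_ratios = np.asarray(model_propensities) / np.asarray(logged_propensities)
    return np.cumprod(step_ratios)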
Example #5
 def _test_reward_boost(self, use_gpu=False, use_all_avail_gpus=False):
     environment = Gridworld()
     reward_boost = {"L": 100, "R": 200, "U": 300, "D": 400}
     trainer = self.get_trainer(
         environment,
         reward_boost,
         dueling=False,
         categorical=False,
         quantile=False,
         use_gpu=use_gpu,
         use_all_avail_gpus=use_all_avail_gpus,
     )
     evaluator = GridworldEvaluator(
         env=environment, assume_optimal_policy=False, gamma=DISCOUNT
     )
     self.evaluate_gridworld(environment, evaluator, trainer, use_gpu)
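Here reward_boost maps each gridworld action to a bonus added to the logged reward for that action, and the test checks that training still converges with the shifted rewards. A hypothetical illustration of the boost itself follows; ReAgent applies it inside the trainer's preprocessing, not through a standalone function like this.

def apply_reward_boost_sketch(rewards, actions, reward_boost):
    # Add the per-action bonus to each logged reward; actions without an
    # entry in the boost table are left unchanged.
    return [r + reward_boost.get(a, 0.0) for r, a in zip(rewards, actions)]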
Example #6
 def _test_evaluator_ground_truth(
     self,
     dueling=False,
     categorical=False,
     quantile=False,
     use_gpu=False,
     use_all_avail_gpus=False,
     clip_grad_norm=None,
 ):
     environment = Gridworld()
     evaluator = GridworldEvaluator(environment, False, DISCOUNT)
     trainer = self.get_trainer(
         environment,
         {},
         dueling=dueling,
         categorical=categorical,
         quantile=quantile,
         use_gpu=use_gpu,
         use_all_avail_gpus=use_all_avail_gpus,
         clip_grad_norm=clip_grad_norm,
     )
     self.evaluate_gridworld(environment, evaluator, trainer, use_gpu)
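GridworldEvaluator scores the trained network against ground truth: the gridworld is small enough that true values can be computed exactly. A minimal sketch of that kind of comparison follows; it is illustrative and not the evaluator's actual interface.

import numpy as np


def ground_truth_error_sketch(predicted_values, true_values):
    # Mean absolute error between the network's predicted state values and
    # the exact values computed from the gridworld's known dynamics.
    predicted = np.asarray(predicted_values, dtype=float)
    expected = np.asarray(true_values, dtype=float)
    return float(np.mean(np.abs(predicted - expected)))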