Example #1
def edp_to_contextual_bandit_log(edp: EvaluationDataPage,
                                 device=None) -> BanditsEstimatorInput:
    log = []
    n = edp.model_rewards.shape[0]
    for idx in range(n):
        # Action is only 1 if tgt policy and log policy took same action?
        action = torch.argmax(edp.action_mask[idx]).item()
        if edp.action_mask[idx][action] == 0.0:
            action = None
        logged_propensities = torch.zeros(
            edp.model_propensities[idx].shape, device=device)
        if action is not None:
            logged_propensities[action] = edp.logged_propensities[idx]
        log.append(
            LogSample(
                context=None if edp.contexts is None else edp.contexts[idx],
                log_action=Action(action),
                log_reward=edp.logged_rewards[idx],
                log_action_probabilities=ActionDistribution(
                    logged_propensities),
                tgt_action_probabilities=ActionDistribution(
                    edp.model_propensities[idx]),
                tgt_action=Action(action),
                model_outputs=ModelOutputs(
                    tgt_reward_from_log_action=(
                        edp.model_rewards_for_logged_action[idx]),
                    tgt_rewards=edp.model_rewards[idx],
                ),
                # item features not specified since the EDP came from a
                # trained reward model
            ))
    return BanditsEstimatorInput(ActionSpace(edp.action_mask.shape[1]),
                                 log, True)
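For context, a small standalone sketch (torch only; the action_mask values below are made up, not from a real EvaluationDataPage) of the action-recovery step above: argmax over a one-hot mask returns the logged action, and the follow-up check catches all-zero rows, where argmax would otherwise silently report action 0.

import torch

# Hypothetical one-hot action masks for three samples; the last row has no
# valid logged action at all.
action_mask = torch.tensor([[0.0, 1.0, 0.0],
                            [1.0, 0.0, 0.0],
                            [0.0, 0.0, 0.0]])

for idx in range(action_mask.shape[0]):
    action = torch.argmax(action_mask[idx]).item()
    # argmax of an all-zero row is 0, so verify the mask is actually set there
    if action_mask[idx][action] == 0.0:
        action = None
    print(idx, action)  # -> (0, 1), (1, 0), (2, None)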
Example #2
    def test_gridworld_sequential_adapter(self):
        """
        Create a gridworld environment, logging policy, and target policy
        Evaluates target policy using the direct OPE sequential doubly robust estimator,
        then transforms the log into an evaluation data page which is passed to the ope adapter.

        This test is meant to verify the adaptation of EDPs into RLEstimatorInputs as employed
        by ReAgent since ReAgent provides EDPs to Evaluators. Going from EDP -> RLEstimatorInput
        is more involved than RLEstimatorInput -> EDP since the EDP does not store the state
        at each timestep in each MDP, only the corresponding logged outputs & model outputs.
        Thus, the adapter must do some tricks to represent these timesteps as states so the
        ope module can extract the correct outputs.

        Note that there is some randomness in the model outputs since the model is purposefully
        noisy. However, the same target policy is being evaluated on the same logged walks through
        the gridworld, so the two results should be close in value (within 1).

        """
        random.seed(0)
        np.random.seed(0)
        torch.random.manual_seed(0)

        device = torch.device("cuda") if torch.cuda.is_available() else None

        gridworld = GridWorld.from_grid(
            [
                ["s", "0", "0", "0", "0"],
                ["0", "0", "0", "W", "0"],
                ["0", "0", "0", "0", "0"],
                ["0", "W", "0", "0", "0"],
                ["0", "0", "0", "0", "g"],
            ],
            max_horizon=TestOPEModuleAlgs.MAX_HORIZON,
        )

        action_space = ActionSpace(4)
        opt_policy = TabularPolicy(action_space)
        trainer = DPTrainer(gridworld, opt_policy)
        value_func = trainer.train(gamma=TestOPEModuleAlgs.GAMMA)

        behavior_policy = RandomRLPolicy(action_space)
        target_policy = EpsilonGreedyRLPolicy(opt_policy,
                                              TestOPEModuleAlgs.NOISE_EPSILON)
        model = NoiseGridWorldModel(
            gridworld,
            action_space,
            epsilon=TestOPEModuleAlgs.NOISE_EPSILON,
            max_horizon=TestOPEModuleAlgs.MAX_HORIZON,
        )
        value_func = DPValueFunction(target_policy, model,
                                     TestOPEModuleAlgs.GAMMA)
        ground_truth = DPValueFunction(target_policy, gridworld,
                                       TestOPEModuleAlgs.GAMMA)

        log = []
        log_generator = PolicyLogGenerator(gridworld, behavior_policy)
        num_episodes = TestOPEModuleAlgs.EPISODES
        for state in gridworld.states:
            for _ in range(num_episodes):
                log.append(log_generator.generate_log(state))

        estimator_input = RLEstimatorInput(
            gamma=TestOPEModuleAlgs.GAMMA,
            log=log,
            target_policy=target_policy,
            value_function=value_func,
            ground_truth=ground_truth,
        )

        edp = rlestimator_input_to_edp(estimator_input,
                                       len(model.action_space))

        dr_estimator = SeqDREstimator(weight_clamper=None,
                                      weighted=False,
                                      device=device)

        module_results = SequentialOPEstimatorAdapter.estimator_results_to_cpe_estimate(
            dr_estimator.evaluate(estimator_input))
        adapter_results = SequentialOPEstimatorAdapter(
            dr_estimator, TestOPEModuleAlgs.GAMMA, device=device).estimate(edp)

        self.assertAlmostEqual(
            adapter_results.raw,
            module_results.raw,
            delta=TestOPEModuleAlgs.CPE_PASS_BAR,
            msg=f"OPE adapter results differed too much from underlying module "
            f"(Diff: {abs(adapter_results.raw - module_results.raw)} > "
            f"{TestOPEModuleAlgs.CPE_PASS_BAR})",
        )
        self.assertLess(
            adapter_results.raw,
            TestOPEModuleAlgs.CPE_MAX_VALUE,
            msg=f"OPE adapter results are too large "
            f"({adapter_results.raw} > {TestOPEModuleAlgs.CPE_MAX_VALUE})",
        )
Example #3
            ["0", "0", "0", "0", "0"],
            ["0", "W", "0", "0", "0"],
            ["0", "0", "0", "0", "g"],
        ],
        # [
        #     ["s", "0", "0", "0"],
        #     ["0", "0", "0", "0"],
        #     ["0", "0", "0", "0"],
        #     ["0", "0", "0", "g"],
        # ],
        max_horizon=1000,
    )
    # gridworld = ThomasGridWorld()
    logging.info(f"GridWorld:\n{gridworld}")

    action_space = ActionSpace(4)
    opt_policy = TabularPolicy(action_space)
    trainer = DPTrainer(gridworld, opt_policy)
    value_func = trainer.train(gamma=GAMMA)

    logging.info(f"Opt Policy:\n{gridworld.dump_policy(opt_policy)}")
    logging.info(f"Opt state values:\n{gridworld.dump_value_func(value_func)}")

    behavior_policy = RandomRLPolicy(action_space)
    target_policy = EpsilonGreedyRLPolicy(opt_policy, 0.3)
    model = NoiseGridWorldModel(gridworld,
                                action_space,
                                epsilon=0.3,
                                max_horizon=1000)
    value_func = DPValueFunction(target_policy, model, GAMMA)
    ground_truth = DPValueFunction(target_policy, gridworld, GAMMA)
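As a reference for the epsilon=0.3 policies constructed above, a minimal epsilon-greedy selection sketch (plain Python, illustrative only; not ReAgent's EpsilonGreedyRLPolicy implementation):

import random

def epsilon_greedy_action(optimal_action: int, num_actions: int, epsilon: float) -> int:
    # With probability epsilon pick uniformly at random, otherwise exploit.
    if random.random() < epsilon:
        return random.randrange(num_actions)
    return optimal_action

print(epsilon_greedy_action(optimal_action=2, num_actions=4, epsilon=0.3))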
Example #4
def evaluate_all(
    experiments: Iterable[Tuple[Iterable[Estimator], int]],
    dataset: UCIMultiClassDataset,
    log_trainer: Trainer,
    log_epsilon: float,
    tgt_trainer: Trainer,
    tgt_epsilon: float,
    max_num_workers: int,
    device=None,
):
    action_space = ActionSpace(dataset.num_actions)
    config_path = PurePath(dataset.config_file)
    data_name = config_path.stem
    log_model_name = data_name + "_" + log_trainer.__class__.__name__ + ".pickle"
    log_model_file = str(config_path.with_name(log_model_name))
    tgt_model_name = data_name + "_" + tgt_trainer.__class__.__name__ + ".pickle"
    tgt_model_file = str(config_path.with_name(tgt_model_name))

    log_trainer.load_model(log_model_file)
    tgt_trainer.load_model(tgt_model_file)
    if not log_trainer.is_trained or not tgt_trainer.is_trained:
        (
            train_x,
            train_y,
            train_r,
            val_x,
            val_y,
            val_r,
            test_x,
            test_y,
            test_r,
        ) = dataset.train_val_test_split((0.8, 0.8))
        trainer_data = TrainingData(train_x, train_y, None, val_x, val_y, None)
        if not log_trainer.is_trained:
            log_trainer.train(trainer_data)
            log_trainer.save_model(log_model_file)
        if not tgt_trainer.is_trained:
            tgt_trainer.train(trainer_data)
            tgt_trainer.save_model(tgt_model_file)

    log_results = log_trainer.predict(dataset.features)
    log_policy = MultiClassPolicy(action_space, log_results.probabilities,
                                  log_epsilon)

    tgt_results = tgt_trainer.predict(dataset.features)
    tgt_policy = MultiClassPolicy(action_space, tgt_results.probabilities,
                                  tgt_epsilon)

    inputs = []
    tasks = []
    total_queries = len(dataset)
    for estimators, num_samples in experiments:
        samples = []
        for i in range(num_samples):
            qid = random.randrange(total_queries)
            label = int(dataset.labels[qid].item())
            log_action, log_action_probabilities = log_policy(qid)
            log_reward = 1.0 if log_action.value == label else 0.0
            tgt_action, tgt_action_probabilities = tgt_policy(qid)
            ground_truth_reward = 1.0 if tgt_action.value == label else 0.0
            item_feature = dataset.features[qid]
            samples.append(
                LogSample(
                    context=qid,
                    log_action=log_action,
                    log_reward=log_reward,
                    log_action_probabilities=log_action_probabilities,
                    tgt_action_probabilities=tgt_action_probabilities,
                    tgt_action=tgt_action,
                    ground_truth_reward=ground_truth_reward,
                    item_feature=item_feature,
                ))
        tasks.append(
            (estimators, BanditsEstimatorInput(action_space, samples, False)))

    logging.info("start evaluating...")
    st = time.perf_counter()
    evaluator = Evaluator(tasks, max_num_workers)
    results = evaluator.evaluate()
    Evaluator.report_results(results)
    logging.info(f"evaluating done in {time.perf_counter() - st}s")
    return results
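The core of the sampling loop above is an accuracy-as-reward construction: an action earns reward 1.0 exactly when it matches the query's class label. A toy sketch (plain Python with made-up data, not the ReAgent types):

import random

labels = [2, 0, 1, 3]  # hypothetical class label per query

def fake_policy(qid: int) -> int:
    # Stand-in for a MultiClassPolicy call; returns some action index.
    return random.randrange(4)

qid = random.randrange(len(labels))
log_action = fake_policy(qid)
log_reward = 1.0 if log_action == labels[qid] else 0.0
print(qid, log_action, log_reward)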
Example #5
def evaluate_all(
    experiments: Iterable[Tuple[Iterable[Estimator], int]],
    dataset: UCIMultiClassDataset,
    log_trainer: Trainer,
    log_epsilon: float,
    tgt_trainer: Trainer,
    tgt_epsilon: float,
    max_num_workers: int,
    random_reward_prob: float = 0.0,
    device=None,
):
    action_space = ActionSpace(dataset.num_actions)
    config_path = PurePath(dataset.config_file)
    data_name = config_path.stem
    log_model_name = data_name + "_" + log_trainer.__class__.__name__ + ".pickle"
    log_model_file = str(config_path.with_name(log_model_name))
    tgt_model_name = data_name + "_" + tgt_trainer.__class__.__name__ + ".pickle"
    tgt_model_file = str(config_path.with_name(tgt_model_name))

    log_trainer.load_model(log_model_file)
    tgt_trainer.load_model(tgt_model_file)
    if not log_trainer.is_trained or not tgt_trainer.is_trained:
        (
            train_x,
            train_y,
            train_r,
            val_x,
            val_y,
            val_r,
            test_x,
            test_y,
            test_r,
            train_choices,
        ) = dataset.train_val_test_split((0.2, 0.8))
        trainer_data = TrainingData(train_x, train_y, None, val_x, val_y, None)
        if not log_trainer.is_trained:
            log_trainer.train(trainer_data)
            log_trainer.save_model(log_model_file)
        if not tgt_trainer.is_trained:
            tgt_trainer.train(trainer_data)
            tgt_trainer.save_model(tgt_model_file)

    log_results = log_trainer.predict(dataset.features)
    assert log_results.probabilities is not None
    log_policy = MultiClassPolicy(action_space, log_results.probabilities,
                                  log_epsilon)

    tgt_results = tgt_trainer.predict(dataset.features)
    assert tgt_results.probabilities is not None
    tgt_policy = MultiClassPolicy(action_space, tgt_results.probabilities,
                                  tgt_epsilon)

    tasks = []
    # pyre-fixme[61]: `train_choices` may not be initialized here.
    test_queries = list(set(range(len(dataset))) - set(train_choices))
    for estimators, num_samples in experiments:
        samples = []
        for _ in range(num_samples):
            qid = random.choice(test_queries)  # pick a single held-out query id
            label = int(dataset.labels[qid].item())
            log_action, log_action_probabilities = log_policy(qid)
            log_reward = 1.0 if log_action.value == label else 0.0
            tgt_action, tgt_action_probabilities = tgt_policy(qid)
            ground_truth_reward = 1.0 if tgt_action.value == label else 0.0
            item_feature = dataset.features[qid]
            random_reward = random.random() < random_reward_prob
            samples.append(
                LogSample(
                    context=qid,
                    log_action=log_action,
                    log_reward=random.randint(0, 1)
                    if random_reward else log_reward,
                    log_action_probabilities=log_action_probabilities,
                    tgt_action_probabilities=tgt_action_probabilities,
                    tgt_action=tgt_action,
                    ground_truth_reward=ground_truth_reward,
                    item_feature=item_feature,
                ))
        tasks.append(
            (estimators, BanditsEstimatorInput(action_space, samples, False)))

    evaluator = Evaluator(tasks, max_num_workers)
    results = evaluator.evaluate()
    Evaluator.report_results(results)
    return results
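Relative to Example #4, the differences here are sampling from held-out test queries and the optional reward randomization controlled by random_reward_prob. A sketch of that randomization step in isolation (plain Python, hypothetical values):

import random

random_reward_prob = 0.25
log_reward = 1.0  # hypothetical true logged reward for a sampled query

# With probability random_reward_prob, replace the logged reward with coin-flip noise.
random_reward = random.random() < random_reward_prob
logged = float(random.randint(0, 1)) if random_reward else log_reward
print(logged)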
Example #6
    def __init__(self,
                 num_actions: int,
                 model_propensities: torch.Tensor,
                 device=None):
        super().__init__(ActionSpace(num_actions), device)
        self.model_propensities = model_propensities
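The model_propensities tensor passed in above presumably holds one probability distribution over the action space per row; a quick sanity-check sketch under that assumption (torch only, with made-up values):

import torch

num_actions = 4
# Hypothetical propensities: one row per context, each row a distribution over actions.
model_propensities = torch.softmax(torch.randn(5, num_actions), dim=1)
assert torch.allclose(model_propensities.sum(dim=1), torch.ones(5))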
Example #7
    torch.random.manual_seed(1234)

    dataset = UCIMultiClassDataset(params["dataset"])

    episodes = DEFAULT_ITERATIONS
    if "iterations" in params:
        episodes = params["iterations"]

    training_iterations = 10
    training_test_split_ratio = 0.5
    train_x, train_y, train_r, val_x, val_y, val_r, test_x, test_y, test_r = dataset.train_val_test_split(
        (0.8, 0.8))

    trainer_data = TrainingData(train_x, train_y, None, val_x, val_y, None)

    action_space = ActionSpace(dataset.num_actions)
    gt_model = MultiClassModel(test_x, test_r)

    log_trainer = LogisticRegressionTrainer()
    log_trainer.train(trainer_data)
    log_results = log_trainer.predict(test_x)
    score = log_trainer.score(test_y, log_results.predictions)
    logging.info(f"Model trainer score: {score}")
    log_model = MultiClassModel(test_x, log_results.probabilities)
    log_policy = MultiClassPolicy(action_space, log_results.probabilities, 1.0)

    target_trainer = SGDClassifierTrainer()
    # target_trainer = SGDClassifierTrainer(500, 'modified_huber')
    target_trainer.train(trainer_data)
    target_results = target_trainer.predict(test_x)
    score = target_trainer.score(test_y, target_results.predictions)
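The score logged for each trainer is presumably plain classification accuracy on the held-out predictions; a minimal sketch of that computation (numpy only, illustrative, not the Trainer.score implementation):

import numpy as np

def accuracy(y_true: np.ndarray, y_pred: np.ndarray) -> float:
    # Fraction of predictions matching the ground-truth labels.
    return float(np.mean(y_true == y_pred))

print(accuracy(np.array([0, 1, 2, 1]), np.array([0, 1, 1, 1])))  # -> 0.75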