Example #1
    def estimator_results_to_cpe_estimate(
        estimator_results: EstimatorResults, ) -> CpeEstimate:
        scores = torch.tensor(
            [r.estimated_reward for r in estimator_results.results],
            dtype=torch.double)
        log_scores = torch.tensor(
            [r.log_reward for r in estimator_results.results],
            dtype=torch.double)

        dr_score = float(torch.mean(scores).item())
        dr_score_std_error = bootstrapped_std_error_of_mean(scores)

        log_score = float(torch.mean(log_scores).item())
        if log_score < 1e-6:
            logger.warning("Can't normalize SDR-CPE because of small"
                           f" or negative logged_policy_score ({log_score})."
                           f" Episode values: {log_scores}.")
            return CpeEstimate(
                raw=dr_score,
                normalized=0.0,
                raw_std_error=dr_score_std_error,
                normalized_std_error=0.0,
            )
        return CpeEstimate(
            raw=dr_score,
            normalized=dr_score / log_score,
            raw_std_error=dr_score_std_error,
            normalized_std_error=dr_score_std_error / log_score,
        )
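
The conversion above reads only two per-episode fields: estimated_reward (the estimate under the target policy) and log_reward (the logged return). A minimal, self-contained sketch of what it computes, using made-up numbers and a hypothetical stand-in dataclass rather than the real EstimatorResults:

from dataclasses import dataclass

# Hypothetical stand-in for the per-episode results; only the two fields
# read by estimator_results_to_cpe_estimate are modeled.
@dataclass
class FakeResult:
    estimated_reward: float
    log_reward: float

results = [FakeResult(1.2, 1.0), FakeResult(0.8, 1.1), FakeResult(1.5, 0.9)]

raw = sum(r.estimated_reward for r in results) / len(results)   # mean DR score
logged = sum(r.log_reward for r in results) / len(results)      # mean logged score
normalized = raw / logged if logged >= 1e-6 else 0.0            # same guard as above

# normalized > 1.0 suggests the target policy is estimated to beat the logged policy
print(raw, normalized)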
Example #2
    def _compute_metric_data(
            self, tgt_rewards: Tensor, logged_rewards: Tensor,
            tgt_score: float) -> Tuple[float, float, float, float]:
        """
        Given a sequence of scores, normalizes the target score by the average logged score
        and computes the standard error of the target score. Normalizing by the logged score
        can provide a better metric to compare models against.
        """
        logged_policy_score = float(torch.mean(logged_rewards))
        if logged_policy_score < SCORE_THRESHOLD:
            normalizer = 0.0
        else:
            normalizer = 1.0 / logged_policy_score
        std_err = bootstrapped_std_error_of_mean(
            tgt_rewards, num_samples=tgt_rewards.shape[0])
        return (tgt_score, tgt_score * normalizer, std_err,
                std_err * normalizer)
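
bootstrapped_std_error_of_mean is called throughout these examples but never shown. A minimal sketch of what such a helper could look like, assuming it resamples the scores with replacement and reports the standard deviation of the resampled means; the function name, parameter names, and defaults here are illustrative, not the library's actual signature:

import torch

def bootstrapped_std_error_of_mean_sketch(data: torch.Tensor,
                                           sample_percent: float = 0.5,
                                           num_samples: int = 1000) -> float:
    # Draw `num_samples` resamples (with replacement) of size
    # `sample_percent * len(data)` and take the std of their means.
    n = max(1, int(len(data) * sample_percent))
    means = torch.stack(
        [data[torch.randint(len(data), (n,))].mean() for _ in range(num_samples)])
    return float(means.std())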
Example #3
    def _compute_metric_data(
            self, tgt_rewards: Tensor,
            logged_score: float) -> Tuple[float, float, float]:
        """
        Given a sequence of scores, normalizes the target score by the average logged score
        and computes the standard error of the target score. Normalizing by the logged score
        can provide a better metric to compare models against.
        """
        if len(tgt_rewards.shape) > 1:
            assert tgt_rewards.shape[1] == 1
            tgt_rewards = tgt_rewards.reshape((tgt_rewards.shape[0], ))
        if logged_score < SCORE_THRESHOLD:
            normalizer = 0.0
        else:
            normalizer = 1.0 / logged_score
        std_err = bootstrapped_std_error_of_mean(tgt_rewards)
        return (
            torch.mean(tgt_rewards).item() * normalizer,
            std_err,
            std_err * normalizer,
        )
Example #4
    def estimate(self, edp: EvaluationDataPage) -> CpeEstimate:
        # For details, visit https://arxiv.org/pdf/1511.03722.pdf
        logged_rewards = edp.logged_rewards.squeeze()
        logged_propensities = edp.logged_propensities.squeeze()

        num_examples = edp.logged_rewards.shape[0]

        estimated_state_values = torch.sum(edp.model_propensities *
                                           edp.model_values,
                                           dim=1)

        estimated_q_values_for_logged_action = torch.sum(edp.model_values *
                                                         edp.action_mask,
                                                         dim=1)

        target_propensity_for_action = torch.sum(edp.model_propensities *
                                                 edp.action_mask,
                                                 dim=1)

        assert target_propensity_for_action.shape == logged_propensities.shape, (
            "Invalid shape: " + str(target_propensity_for_action.shape) +
            " != " + str(logged_propensities.shape))
        assert (target_propensity_for_action.shape ==
                estimated_q_values_for_logged_action.shape), (
                    "Invalid shape: " +
                    str(target_propensity_for_action.shape) + " != " +
                    str(estimated_q_values_for_logged_action.shape))
        assert target_propensity_for_action.shape == logged_rewards.shape, (
            "Invalid shape: " + str(target_propensity_for_action.shape) +
            " != " + str(logged_rewards.shape))
        importance_weight = target_propensity_for_action / logged_propensities

        doubly_robusts: List[float] = []
        episode_values: List[float] = []

        assert edp.mdp_id is not None
        i = 0
        last_episode_end = -1
        while i < num_examples:
            # calculate the doubly-robust Q-value for one episode
            if i == num_examples - 1 or edp.mdp_id[i] != edp.mdp_id[i + 1]:
                episode_end = i
                episode_value = 0.0
                doubly_robust = 0.0
                for j in range(episode_end, last_episode_end, -1):
                    doubly_robust = estimated_state_values[
                        j] + importance_weight[j] * (
                            logged_rewards[j] + self.gamma * doubly_robust -
                            estimated_q_values_for_logged_action[j])
                    episode_value *= self.gamma
                    episode_value += logged_rewards[j]
                if episode_value > 1e-6 or episode_value < -1e-6:
                    doubly_robusts.append(float(doubly_robust))
                    episode_values.append(float(episode_value))
                last_episode_end = episode_end
            i += 1

        assert len(doubly_robusts) > 0, (
            f"No valid doubly robusts data is generated. "
            f"Logged_rewards={logged_rewards}, importance_weight={importance_weight},"
            f" estimated_q_values_for_logged_action={estimated_q_values_for_logged_action}"
            f" estimated_state_values={estimated_state_values}, gamma={self.gamma}"
            f" Did you specify wrong metric names?")

        doubly_robusts = np.array(doubly_robusts)  # type: ignore
        dr_score = float(np.mean(doubly_robusts))
        dr_score_std_error = bootstrapped_std_error_of_mean(doubly_robusts)

        episode_values = np.array(episode_values)  # type: ignore
        logged_policy_score = np.mean(episode_values)
        if logged_policy_score < 1e-6:
            logger.warning(
                "Can't normalize SDR-CPE because of small or negative logged_policy_score"
            )
            return CpeEstimate(
                raw=dr_score,
                normalized=0.0,
                raw_std_error=dr_score_std_error,
                normalized_std_error=0.0,
            )
        return CpeEstimate(
            raw=dr_score,
            normalized=dr_score / logged_policy_score,
            raw_std_error=dr_score_std_error,
            normalized_std_error=dr_score_std_error / logged_policy_score,
        )
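
The backward loop above implements the sequential doubly robust recursion from the referenced paper (https://arxiv.org/pdf/1511.03722.pdf). In the code's terms, with \rho_j = importance_weight[j], \hat{V}(s_j) = estimated_state_values[j], and \hat{Q}(s_j, a_j) = estimated_q_values_for_logged_action[j], each backward step computes

    V_{\mathrm{DR}}^{(j)} = \hat{V}(s_j) + \rho_j \left( r_j + \gamma \, V_{\mathrm{DR}}^{(j+1)} - \hat{Q}(s_j, a_j) \right),
    \qquad
    \rho_j = \frac{\pi_{\mathrm{target}}(a_j \mid s_j)}{\pi_{\mathrm{logged}}(a_j \mid s_j)},

with V_{\mathrm{DR}} taken to be 0 past the episode end. The value at the first step of each episode is what gets appended to doubly_robusts, while episode_values accumulates the discounted logged return used for normalization.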
Example #5
    def estimate(self, edp: EvaluationDataPage) -> CpeEstimate:
        # For details, visit https://arxiv.org/pdf/1511.03722.pdf
        logged_rewards = edp.logged_rewards.squeeze()
        logged_propensities = edp.logged_propensities.squeeze()

        num_examples = edp.logged_rewards.shape[0]

        estimated_state_values = torch.sum(edp.model_propensities *
                                           edp.model_values,
                                           dim=1)

        estimated_q_values_for_logged_action = torch.sum(edp.model_values *
                                                         edp.action_mask,
                                                         dim=1)

        target_propensity_for_action = torch.sum(edp.model_propensities *
                                                 edp.action_mask,
                                                 dim=1)

        assert target_propensity_for_action.shape == logged_propensities.shape, (
            "Invalid shape: " + str(target_propensity_for_action.shape) +
            " != " + str(logged_propensities.shape))
        assert (target_propensity_for_action.shape ==
                estimated_q_values_for_logged_action.shape), (
                    "Invalid shape: " +
                    str(target_propensity_for_action.shape) + " != " +
                    str(estimated_q_values_for_logged_action.shape))
        assert target_propensity_for_action.shape == logged_rewards.shape, (
            "Invalid shape: " + str(target_propensity_for_action.shape) +
            " != " + str(logged_rewards.shape))
        importance_weight = target_propensity_for_action / logged_propensities

        doubly_robusts: List[float] = []
        episode_values: List[float] = []

        assert edp.mdp_id is not None
        i = 0
        last_episode_end = -1
        while i < num_examples:
            # calculate the doubly-robust Q-value for one episode
            # pyre-ignore [16]: Optional type has no attribute `__getitem__`
            if i == num_examples - 1 or edp.mdp_id[i] != edp.mdp_id[i + 1]:
                episode_end = i
                episode_value = 0.0
                doubly_robust = 0.0
                for j in range(episode_end, last_episode_end, -1):
                    doubly_robust = estimated_state_values[
                        j] + importance_weight[j] * (
                            logged_rewards[j] + self.gamma * doubly_robust -
                            estimated_q_values_for_logged_action[j])
                    episode_value *= self.gamma
                    episode_value += logged_rewards[j]

                doubly_robusts.append(float(doubly_robust))
                episode_values.append(float(episode_value))
                last_episode_end = episode_end
            i += 1

        if len(doubly_robusts) == 0:
            torch.set_printoptions(profile="full")
            zipped_data = list(
                zip(*map(
                    lambda x: x.tolist(),
                    [
                        edp.mdp_id,
                        logged_rewards,
                        estimated_state_values,
                        estimated_q_values_for_logged_action,
                        importance_weight,
                    ],
                )))
            raise RuntimeError(
                f"No valid doubly robusts data is generated.\n"
                f"mdp_ids x logged_rewards x estimated_state_values x "
                f"estimated_q_values_for_logged_action x importance_weight:\n"
                f"{zipped_data};\n"
                f"gamma={self.gamma};\n"
                f"Did you specify wrong metric names?")

        # pyre-fixme[9]: doubly_robusts has type `List[float]`; used as `ndarray`.
        doubly_robusts = np.array(doubly_robusts)
        dr_score = float(np.mean(doubly_robusts))
        dr_score_std_error = bootstrapped_std_error_of_mean(doubly_robusts)

        # pyre-fixme[9]: episode_values has type `List[float]`; used as `ndarray`.
        episode_values = np.array(episode_values)
        logged_policy_score = np.mean(episode_values)
        if logged_policy_score < 1e-6:
            logger.warning(
                "Can't normalize SDR-CPE because of small"
                f" or negative logged_policy_score ({logged_policy_score})."
                f"Episode values: {episode_values}.")
            return CpeEstimate(
                raw=dr_score,
                normalized=0.0,
                raw_std_error=dr_score_std_error,
                normalized_std_error=0.0,
            )
        return CpeEstimate(
            raw=dr_score,
            normalized=dr_score / logged_policy_score,
            raw_std_error=dr_score_std_error,
            normalized_std_error=dr_score_std_error / logged_policy_score,
        )
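
Both versions of estimate rely on the same episode segmentation: rows are assumed to be grouped by mdp_id, and an episode ends where the id changes or at the last row. A toy illustration of that boundary detection, with made-up ids:

import numpy as np

mdp_id = np.array([0, 0, 0, 1, 1, 2])  # rows grouped by episode id
num_examples = len(mdp_id)

episodes = []
last_episode_end = -1
for i in range(num_examples):
    if i == num_examples - 1 or mdp_id[i] != mdp_id[i + 1]:
        # rows (last_episode_end, i] form one episode
        episodes.append(list(range(last_episode_end + 1, i + 1)))
        last_episode_end = i

print(episodes)  # [[0, 1, 2], [3, 4], [5]]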
Example #6
    def _get_importance_sampling_estimates(
            self, isd: ImportanceSamplingData, hp: DoublyRobustHP
    ) -> Tuple[CpeEstimate, CpeEstimate, CpeEstimate]:
        # The score we would get if we evaluate the logged policy against itself
        logged_policy_score = float(
            torch.mean(isd.logged_rewards
                       ))  # logged_rewards is N*1 tensor of historical rewards
        if logged_policy_score < 1e-6:
            logger.warning(
                "Can't normalize DR-CPE because of small or negative " +
                "logged_policy_score")
            normalizer = 0.0
        else:
            normalizer = 1.0 / logged_policy_score

        if isd.model_rewards is None:
            # Fill with zero, equivalent to just doing IPS
            direct_method_values = torch.zeros(
                [isd.model_propensities.shape[0], 1], dtype=torch.float32)
        else:
            # model rewards is (N_samples)*N_actions tensor of predicted
            # counterfactual rewards for each possible action at each
            # historical context
            direct_method_values = torch.sum(isd.model_propensities *
                                             isd.model_rewards,
                                             dim=1,
                                             keepdim=True)

        direct_method_score = float(torch.mean(direct_method_values))
        direct_method_std_error = bootstrapped_std_error_of_mean(
            direct_method_values.squeeze(),
            sample_percent=hp.bootstrap_sample_percent,
            num_samples=hp.bootstrap_num_samples,
        )
        direct_method_estimate = CpeEstimate(
            raw=direct_method_score,
            normalized=direct_method_score * normalizer,
            raw_std_error=direct_method_std_error,
            normalized_std_error=direct_method_std_error * normalizer,
        )

        ips = isd.importance_weight * isd.logged_rewards  # N*1

        doubly_robust = (
            isd.importance_weight *
            (isd.logged_rewards -
             isd.model_rewards_for_logged_action)) + direct_method_values
        # model_rewards_for_logged_action is N*1 of estimated rewards for target
        # policy

        ips_score = float(torch.mean(ips))
        ips_score_std_error = bootstrapped_std_error_of_mean(
            ips.squeeze(),
            sample_percent=hp.bootstrap_sample_percent,
            num_samples=hp.bootstrap_num_samples,
        )
        inverse_propensity_estimate = CpeEstimate(
            raw=ips_score,
            normalized=ips_score * normalizer,
            raw_std_error=ips_score_std_error,
            normalized_std_error=ips_score_std_error * normalizer,
        )

        dr_score = float(torch.mean(doubly_robust))
        dr_score_std_error = bootstrapped_std_error_of_mean(
            doubly_robust.squeeze(),
            sample_percent=hp.bootstrap_sample_percent,
            num_samples=hp.bootstrap_num_samples,
        )
        doubly_robust_estimate = CpeEstimate(
            raw=dr_score,
            normalized=dr_score * normalizer,
            raw_std_error=dr_score_std_error,
            normalized_std_error=dr_score_std_error * normalizer,
        )

        return (
            direct_method_estimate,
            inverse_propensity_estimate,
            doubly_robust_estimate,
        )
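
For reference, a minimal self-contained sketch of the three one-step estimators computed above, on toy tensors; the variable names mirror the fields of ImportanceSamplingData, but the numbers are made up:

import torch

# Toy batch of N=4 logged decisions over 2 actions.
logged_rewards = torch.tensor([[1.0], [0.0], [2.0], [1.0]])
importance_weight = torch.tensor([[1.5], [0.5], [2.0], [1.0]])  # pi_target / pi_logged
model_propensities = torch.tensor([[0.7, 0.3], [0.4, 0.6], [0.9, 0.1], [0.5, 0.5]])
model_rewards = torch.tensor([[1.1, 0.2], [0.3, 0.8], [1.8, 0.5], [0.9, 1.0]])
model_rewards_for_logged_action = torch.tensor([[1.1], [0.8], [1.8], [0.9]])

# Direct method: expected model reward under the target policy.
direct_method_values = torch.sum(
    model_propensities * model_rewards, dim=1, keepdim=True)

# Inverse propensity scoring: reweighted logged rewards.
ips = importance_weight * logged_rewards

# Doubly robust: DM baseline plus an importance-weighted correction term.
dr = importance_weight * (
    logged_rewards - model_rewards_for_logged_action) + direct_method_values

print(direct_method_values.mean().item(), ips.mean().item(), dr.mean().item())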