def estimator_results_to_cpe_estimate(
    estimator_results: EstimatorResults,
) -> CpeEstimate:
    scores = torch.tensor(
        [r.estimated_reward for r in estimator_results.results], dtype=torch.double
    )
    log_scores = torch.tensor(
        [r.log_reward for r in estimator_results.results], dtype=torch.double
    )
    dr_score = float(torch.mean(scores).item())
    dr_score_std_error = bootstrapped_std_error_of_mean(scores)
    log_score = float(torch.mean(log_scores).item())
    if log_score < 1e-6:
        logger.warning(
            "Can't normalize SDR-CPE because of small"
            f" or negative logged_policy_score ({log_score})."
            f" Episode values: {log_scores}."
        )
        return CpeEstimate(
            raw=dr_score,
            normalized=0.0,
            raw_std_error=dr_score_std_error,
            normalized_std_error=0.0,
        )
    return CpeEstimate(
        raw=dr_score,
        normalized=dr_score / log_score,
        raw_std_error=dr_score_std_error,
        normalized_std_error=dr_score_std_error / log_score,
    )
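# Illustrative sketch only: the function above relies on bootstrapped_std_error_of_mean
# to turn per-result scores into a standard error. The snippet below shows one common
# way such a bootstrap can be computed; the name, defaults, and resampling scheme are
# assumptions for illustration, not the library's actual implementation.
import torch


def _bootstrap_std_error_of_mean_sketch(
    data: torch.Tensor, sample_percent: float = 0.5, num_samples: int = 1000
) -> float:
    # Repeatedly resample (with replacement) a fraction of the data, take the mean
    # of each resample, and report the standard deviation of those means.
    n = max(1, int(len(data) * sample_percent))
    means = torch.stack(
        [data[torch.randint(len(data), (n,))].mean() for _ in range(num_samples)]
    )
    return float(means.std(unbiased=True))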
def _compute_metric_data(
    self, tgt_rewards: Tensor, logged_rewards: Tensor, tgt_score: float
) -> Tuple[float, float, float, float]:
    """
    Given a sequence of scores, normalizes the target score by the average
    logged score and computes the standard error of the target score.
    Normalizing by the logged score can provide a better metric to compare
    models against.
    """
    logged_policy_score = float(torch.mean(logged_rewards))
    if logged_policy_score < SCORE_THRESHOLD:
        normalizer = 0.0
    else:
        normalizer = 1.0 / logged_policy_score
    std_err = bootstrapped_std_error_of_mean(
        tgt_rewards, num_samples=tgt_rewards.shape[0]
    )
    return (tgt_score, tgt_score * normalizer, std_err, std_err * normalizer)
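# Tiny worked sketch of the normalization performed above, on made-up numbers
# (these tensors and scores are illustrative, not real evaluation output): with a
# logged mean reward of 2.0 and a target score of 3.0, the normalized score is 1.5,
# i.e. the target policy looks 50% better than the logged policy.
import torch

logged_rewards_example = torch.tensor([1.0, 2.0, 3.0])  # assumed logged rewards
tgt_score_example = 3.0                                  # assumed target score
normalizer = 1.0 / float(torch.mean(logged_rewards_example))  # 1 / 2.0 = 0.5
normalized_score = tgt_score_example * normalizer             # 3.0 * 0.5 = 1.5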
def _compute_metric_data(
    self, tgt_rewards: Tensor, logged_score: float
) -> Tuple[float, float, float]:
    """
    Given a sequence of scores, normalizes the target score by the average
    logged score and computes the standard error of the target score.
    Normalizing by the logged score can provide a better metric to compare
    models against.
    """
    if len(tgt_rewards.shape) > 1:
        assert tgt_rewards.shape[1] == 1
        tgt_rewards = tgt_rewards.reshape((tgt_rewards.shape[0],))
    if logged_score < SCORE_THRESHOLD:
        normalizer = 0.0
    else:
        normalizer = 1.0 / logged_score
    std_err = bootstrapped_std_error_of_mean(tgt_rewards)
    return (
        torch.mean(tgt_rewards).item() * normalizer,
        std_err,
        std_err * normalizer,
    )
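# Sketch of the shape handling in this variant (example data is made up): a column
# tensor of target rewards with shape (N, 1) is flattened to (N,) before the
# bootstrap, and the first returned element is mean(tgt_rewards) / logged_score.
import torch

tgt_rewards_example = torch.tensor([[1.5], [2.5], [2.0]])                # shape (3, 1)
flat = tgt_rewards_example.reshape((tgt_rewards_example.shape[0],))      # shape (3,)
normalized = flat.mean().item() * (1.0 / 2.0)  # logged_score assumed 2.0 -> 1.0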
def estimate(self, edp: EvaluationDataPage) -> CpeEstimate:
    # For details, visit https://arxiv.org/pdf/1511.03722.pdf
    logged_rewards = edp.logged_rewards.squeeze()
    logged_propensities = edp.logged_propensities.squeeze()
    num_examples = edp.logged_rewards.shape[0]

    estimated_state_values = torch.sum(
        edp.model_propensities * edp.model_values, dim=1
    )
    estimated_q_values_for_logged_action = torch.sum(
        edp.model_values * edp.action_mask, dim=1
    )
    target_propensity_for_action = torch.sum(
        edp.model_propensities * edp.action_mask, dim=1
    )

    assert target_propensity_for_action.shape == logged_propensities.shape, (
        "Invalid shape: "
        + str(target_propensity_for_action.shape)
        + " != "
        + str(logged_propensities.shape)
    )
    assert (
        target_propensity_for_action.shape
        == estimated_q_values_for_logged_action.shape
    ), (
        "Invalid shape: "
        + str(target_propensity_for_action.shape)
        + " != "
        + str(estimated_q_values_for_logged_action.shape)
    )
    assert target_propensity_for_action.shape == logged_rewards.shape, (
        "Invalid shape: "
        + str(target_propensity_for_action.shape)
        + " != "
        + str(logged_rewards.shape)
    )
    importance_weight = target_propensity_for_action / logged_propensities

    doubly_robusts: List[float] = []
    episode_values: List[float] = []

    assert edp.mdp_id is not None
    i = 0
    last_episode_end = -1
    while i < num_examples:
        # Calculate the doubly-robust Q-value for one episode at a time,
        # working backwards from the episode's last step.
        if i == num_examples - 1 or edp.mdp_id[i] != edp.mdp_id[i + 1]:
            episode_end = i
            episode_value = 0.0
            doubly_robust = 0.0
            for j in range(episode_end, last_episode_end, -1):
                doubly_robust = estimated_state_values[j] + importance_weight[j] * (
                    logged_rewards[j]
                    + self.gamma * doubly_robust
                    - estimated_q_values_for_logged_action[j]
                )
                episode_value *= self.gamma
                episode_value += logged_rewards[j]
            if episode_value > 1e-6 or episode_value < -1e-6:
                doubly_robusts.append(float(doubly_robust))
                episode_values.append(float(episode_value))
            last_episode_end = episode_end
        i += 1

    assert len(doubly_robusts) > 0, (
        f"No valid doubly robusts data is generated. "
        f"Logged_rewards={logged_rewards}, importance_weight={importance_weight},"
        f" estimated_q_values_for_logged_action={estimated_q_values_for_logged_action}"
        f" estimated_state_values={estimated_state_values}, gamma={self.gamma}"
        f" Did you specify wrong metric names?"
    )

    doubly_robusts = np.array(doubly_robusts)  # type: ignore
    dr_score = float(np.mean(doubly_robusts))
    dr_score_std_error = bootstrapped_std_error_of_mean(doubly_robusts)

    episode_values = np.array(episode_values)  # type: ignore
    logged_policy_score = np.mean(episode_values)
    if logged_policy_score < 1e-6:
        logger.warning(
            "Can't normalize SDR-CPE because of small or negative logged_policy_score"
        )
        return CpeEstimate(
            raw=dr_score,
            normalized=0.0,
            raw_std_error=dr_score_std_error,
            normalized_std_error=0.0,
        )
    return CpeEstimate(
        raw=dr_score,
        normalized=dr_score / logged_policy_score,
        raw_std_error=dr_score_std_error,
        normalized_std_error=dr_score_std_error / logged_policy_score,
    )
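# Minimal standalone sketch of the backward recursion used in the estimator above
# (sequential doubly-robust evaluation, Jiang & Li, arXiv:1511.03722): for a single
# episode, walking from the last step back to the first,
#   DR_t = V_hat(s_t) + rho_t * (r_t + gamma * DR_{t+1} - Q_hat(s_t, a_t)),
# where rho_t is the per-step importance weight. Function and argument names here
# are illustrative only, not part of the library API.
from typing import List


def sequential_dr_for_episode(
    state_values: List[float],        # V_hat(s_t)
    q_values_logged: List[float],     # Q_hat(s_t, a_t) for the logged action
    rewards: List[float],             # r_t
    importance_weights: List[float],  # rho_t = pi_target(a_t|s_t) / pi_logged(a_t|s_t)
    gamma: float,
) -> float:
    dr = 0.0
    for t in reversed(range(len(rewards))):
        dr = state_values[t] + importance_weights[t] * (
            rewards[t] + gamma * dr - q_values_logged[t]
        )
    return dr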
def estimate(self, edp: EvaluationDataPage) -> CpeEstimate:
    # For details, visit https://arxiv.org/pdf/1511.03722.pdf
    logged_rewards = edp.logged_rewards.squeeze()
    logged_propensities = edp.logged_propensities.squeeze()
    num_examples = edp.logged_rewards.shape[0]

    estimated_state_values = torch.sum(
        edp.model_propensities * edp.model_values, dim=1
    )
    estimated_q_values_for_logged_action = torch.sum(
        edp.model_values * edp.action_mask, dim=1
    )
    target_propensity_for_action = torch.sum(
        edp.model_propensities * edp.action_mask, dim=1
    )

    assert target_propensity_for_action.shape == logged_propensities.shape, (
        "Invalid shape: "
        + str(target_propensity_for_action.shape)
        + " != "
        + str(logged_propensities.shape)
    )
    assert (
        target_propensity_for_action.shape
        == estimated_q_values_for_logged_action.shape
    ), (
        "Invalid shape: "
        + str(target_propensity_for_action.shape)
        + " != "
        + str(estimated_q_values_for_logged_action.shape)
    )
    assert target_propensity_for_action.shape == logged_rewards.shape, (
        "Invalid shape: "
        + str(target_propensity_for_action.shape)
        + " != "
        + str(logged_rewards.shape)
    )
    importance_weight = target_propensity_for_action / logged_propensities

    doubly_robusts: List[float] = []
    episode_values: List[float] = []

    assert edp.mdp_id is not None
    i = 0
    last_episode_end = -1
    while i < num_examples:
        # calculate the doubly-robust Q-value for one episode
        # pyre-ignore [16]: Optional type has no attribute `__getitem__`
        if i == num_examples - 1 or edp.mdp_id[i] != edp.mdp_id[i + 1]:
            episode_end = i
            episode_value = 0.0
            doubly_robust = 0.0
            for j in range(episode_end, last_episode_end, -1):
                doubly_robust = estimated_state_values[j] + importance_weight[j] * (
                    logged_rewards[j]
                    + self.gamma * doubly_robust
                    - estimated_q_values_for_logged_action[j]
                )
                episode_value *= self.gamma
                episode_value += logged_rewards[j]
            doubly_robusts.append(float(doubly_robust))
            episode_values.append(float(episode_value))
            last_episode_end = episode_end
        i += 1

    if len(doubly_robusts) == 0:
        torch.set_printoptions(profile="full")
        zipped_data = list(
            zip(
                *map(
                    lambda x: x.tolist(),
                    [
                        edp.mdp_id,
                        logged_rewards,
                        estimated_state_values,
                        estimated_q_values_for_logged_action,
                        importance_weight,
                    ],
                )
            )
        )
        raise RuntimeError(
            f"No valid doubly robusts data is generated.\n"
            f"mdp_ids x logged_rewards x estimated_state_values x "
            f"estimated_q_values_for_logged_action x importance_weight:\n"
            f"{zipped_data};\n"
            f"gamma={self.gamma};\n"
            f"Did you specify wrong metric names?"
        )

    # pyre-fixme[9]: doubly_robusts has type `List[float]`; used as `ndarray`.
    doubly_robusts = np.array(doubly_robusts)
    dr_score = float(np.mean(doubly_robusts))
    dr_score_std_error = bootstrapped_std_error_of_mean(doubly_robusts)

    # pyre-fixme[9]: episode_values has type `List[float]`; used as `ndarray`.
    episode_values = np.array(episode_values)
    logged_policy_score = np.mean(episode_values)
    if logged_policy_score < 1e-6:
        logger.warning(
            "Can't normalize SDR-CPE because of small"
            f" or negative logged_policy_score ({logged_policy_score})."
            f" Episode values: {episode_values}."
        )
        return CpeEstimate(
            raw=dr_score,
            normalized=0.0,
            raw_std_error=dr_score_std_error,
            normalized_std_error=0.0,
        )
    return CpeEstimate(
        raw=dr_score,
        normalized=dr_score / logged_policy_score,
        raw_std_error=dr_score_std_error,
        normalized_std_error=dr_score_std_error / logged_policy_score,
    )
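# Sketch of the episode segmentation the while-loop above relies on: consecutive rows
# sharing the same mdp_id form one episode, and an episode boundary is detected when
# the id changes (or at the last row). The mdp_id values below are made up purely for
# illustration.
mdp_id_example = ["a", "a", "a", "b", "b", "c"]
episode_ends = [
    i
    for i in range(len(mdp_id_example))
    if i == len(mdp_id_example) - 1 or mdp_id_example[i] != mdp_id_example[i + 1]
]
assert episode_ends == [2, 4, 5]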
def _get_importance_sampling_estimates(
    self, isd: ImportanceSamplingData, hp: DoublyRobustHP
) -> Tuple[CpeEstimate, CpeEstimate, CpeEstimate]:
    # The score we would get if we evaluate the logged policy against itself.
    # logged_rewards is an N*1 tensor of historical rewards.
    logged_policy_score = float(torch.mean(isd.logged_rewards))
    if logged_policy_score < 1e-6:
        logger.warning(
            "Can't normalize DR-CPE because of small or negative "
            + "logged_policy_score"
        )
        normalizer = 0.0
    else:
        normalizer = 1.0 / logged_policy_score

    if isd.model_rewards is None:
        # Fill with zero, equivalent to just doing IPS
        direct_method_values = torch.zeros(
            [isd.model_propensities.shape[0], 1], dtype=torch.float32
        )
    else:
        # model_rewards is an (N_samples)*N_actions tensor of predicted
        # counterfactual rewards for each possible action at each
        # historical context
        direct_method_values = torch.sum(
            isd.model_propensities * isd.model_rewards, dim=1, keepdim=True
        )

    direct_method_score = float(torch.mean(direct_method_values))
    direct_method_std_error = bootstrapped_std_error_of_mean(
        direct_method_values.squeeze(),
        sample_percent=hp.bootstrap_sample_percent,
        num_samples=hp.bootstrap_num_samples,
    )
    direct_method_estimate = CpeEstimate(
        raw=direct_method_score,
        normalized=direct_method_score * normalizer,
        raw_std_error=direct_method_std_error,
        normalized_std_error=direct_method_std_error * normalizer,
    )

    ips = isd.importance_weight * isd.logged_rewards  # N*1

    # model_rewards_for_logged_action is an N*1 tensor of estimated rewards
    # for the target policy
    doubly_robust = (
        isd.importance_weight
        * (isd.logged_rewards - isd.model_rewards_for_logged_action)
    ) + direct_method_values

    ips_score = float(torch.mean(ips))
    ips_score_std_error = bootstrapped_std_error_of_mean(
        ips.squeeze(),
        sample_percent=hp.bootstrap_sample_percent,
        num_samples=hp.bootstrap_num_samples,
    )
    inverse_propensity_estimate = CpeEstimate(
        raw=ips_score,
        normalized=ips_score * normalizer,
        raw_std_error=ips_score_std_error,
        normalized_std_error=ips_score_std_error * normalizer,
    )

    dr_score = float(torch.mean(doubly_robust))
    dr_score_std_error = bootstrapped_std_error_of_mean(
        doubly_robust.squeeze(),
        sample_percent=hp.bootstrap_sample_percent,
        num_samples=hp.bootstrap_num_samples,
    )
    doubly_robust_estimate = CpeEstimate(
        raw=dr_score,
        normalized=dr_score * normalizer,
        raw_std_error=dr_score_std_error,
        normalized_std_error=dr_score_std_error * normalizer,
    )

    return (
        direct_method_estimate,
        inverse_propensity_estimate,
        doubly_robust_estimate,
    )
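# Tiny worked sketch of the three bandit estimators combined above, on made-up data
# (2 contexts, 2 actions). This mirrors only the math, not the library API:
#   DM  = mean over contexts of sum_a pi_target(a|x) * r_hat(x, a)
#   IPS = mean of w * r_logged, with w = pi_target(a_logged|x) / pi_logged(a_logged|x)
#   DR  = mean of w * (r_logged - r_hat(x, a_logged)) + DM term per example
import torch

model_propensities = torch.tensor([[0.7, 0.3], [0.2, 0.8]])   # pi_target(a|x), assumed
model_rewards = torch.tensor([[1.0, 0.0], [0.5, 1.5]])        # r_hat(x, a), assumed
logged_rewards = torch.tensor([[1.0], [2.0]])                 # observed rewards, assumed
model_rewards_for_logged_action = torch.tensor([[1.0], [1.5]])
importance_weight = torch.tensor([[1.4], [1.6]])

dm_values = torch.sum(model_propensities * model_rewards, dim=1, keepdim=True)  # [[0.7], [1.3]]
ips = importance_weight * logged_rewards                                        # [[1.4], [3.2]]
dr = importance_weight * (logged_rewards - model_rewards_for_logged_action) + dm_values  # [[0.7], [2.1]]

print(dm_values.mean().item(), ips.mean().item(), dr.mean().item())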