def run_simulations_empirical_rewards(num_sims, reward_file, experiment_id, reward_header,
                                      is_cost, outfile_directory, prior_mean=0,
                                      forceActions=0, shuffle_data=False):
    '''
    Runs num_sims bandit simulations using the empirical rewards in reward_file.
    The bandit uses the thompson_ng sampling policy (Normal-Gamma model).
    Assumes reward_file is formatted like ASSISTments data, where the reward is
    present under the column reward_header. Each simulation runs for as many steps
    as there are reward samples available.
    '''
    num_actions = 2
    max_steps = -1
    means = []
    variance = []
    for i in range(num_sims):
        # Empirical per-arm reward lists read from the ASSISTments-style file
        arm_1_rewards, arm_2_rewards = get_assistments_rewards.read_assistments_rewards(
            reward_file, reward_header, experiment_id, is_cost)
        if shuffle_data:
            # Shuffle within each arm so the simulated arrival order varies across runs
            random.shuffle(arm_1_rewards)
            random.shuffle(arm_2_rewards)
        max_steps = len(arm_1_rewards) + len(arm_2_rewards)
        means = [np.mean(arm_1_rewards), np.mean(arm_2_rewards)]
        variance = [np.var(arm_1_rewards), np.var(arm_2_rewards)]

        if forceActions != 0:
            print("Forcing actions:", forceActions)
            forced = make_forced_actions(
                num_actions, len(arm_1_rewards) + len(arm_2_rewards), forceActions)
        else:
            forced = forced_actions()

        cur_output_file = get_output_filename(
            outfile_directory, len(arm_1_rewards) + len(arm_2_rewards), i)
        # One Normal-Gamma posterior model per arm
        models = [ng_normal.NGNormal(mu=prior_mean, k=1, alpha=1, beta=1)
                  for _ in range(num_actions)]
        thompson_ng_policy.calculate_thompson_single_bandit_empirical_params(
            arm_1_rewards, arm_2_rewards,
            num_actions=num_actions,
            dest=cur_output_file,
            models=models,
            action_mode=thompson_ng_policy.ActionSelectionMode.prob_is_best,
            relearn=True,
            forced=forced)
    return max_steps, means, variance
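
# A minimal usage sketch for the Normal-Gamma variant above, assuming an
# ASSISTments-style CSV export with a continuous reward column. The path,
# experiment id, and column name are hypothetical placeholders, not values
# taken from the original code.
def _example_ng_run():
    max_steps, means, variance = run_simulations_empirical_rewards(
        num_sims=500,
        reward_file='assistments_rewards.csv',  # hypothetical path
        experiment_id='exp_1',                  # hypothetical experiment id
        reward_header='problem_time',           # hypothetical continuous reward column
        is_cost=True,                           # lower values are better (e.g. time on task)
        outfile_directory='simulation_output/',
        prior_mean=0,
        shuffle_data=True)
    print('steps per simulation:', max_steps)
    print('empirical means:', means, 'variances:', variance)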
def run_simulations_empirical_rewards(num_sims, reward_file, experiment_id, reward_header,
                                      is_cost, outfile_directory, successPrior=1,
                                      failurePrior=1, forceActions=0, shuffle_data=False):
    '''
    Runs num_sims bandit simulations using the empirical rewards in reward_file.
    The bandit uses Thompson sampling with a Beta-Bernoulli model per arm.
    Assumes reward_file is formatted like ASSISTments data, where the reward is
    present under the column reward_header. Each simulation runs for as many steps
    as there are reward samples available.
    '''
    num_actions = 2
    max_steps = -1
    means = []
    variance = []
    for i in range(num_sims):
        # Empirical per-arm reward lists read from the ASSISTments-style file
        arm_1_rewards, arm_2_rewards = get_assistments_rewards.read_assistments_rewards(
            reward_file, reward_header, experiment_id, is_cost)
        if shuffle_data:
            # Shuffle within each arm so the simulated arrival order varies across runs
            random.shuffle(arm_1_rewards)
            random.shuffle(arm_2_rewards)
        max_steps = len(arm_1_rewards) + len(arm_2_rewards)
        means = [np.mean(arm_1_rewards), np.mean(arm_2_rewards)]
        variance = [np.var(arm_1_rewards), np.var(arm_2_rewards)]

        if forceActions != 0:
            print("Forcing actions:", forceActions)
            forced = run_effect_size_simulations.make_forced_actions(
                num_actions, len(arm_1_rewards) + len(arm_2_rewards), forceActions)
        else:
            forced = forced_actions()

        cur_output_file = get_output_filename(
            outfile_directory, len(arm_1_rewards) + len(arm_2_rewards), i)
        # One Beta-Bernoulli posterior model per arm
        models = [beta_bernoulli.BetaBern(success=successPrior, failure=failurePrior)
                  for _ in range(num_actions)]
        thompson_policy.calculate_thompson_single_bandit_empirical_params(
            arm_1_rewards, arm_2_rewards,
            num_actions=num_actions,
            dest=cur_output_file,
            models=models,
            action_mode=thompson_policy.ActionSelectionMode.prob_is_best,
            relearn=True,
            forced=forced)
    return max_steps, means, variance
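
# A matching sketch for the Beta-Bernoulli variant above, assuming the reward
# column holds binary (0/1) outcomes such as problem correctness. As before,
# the concrete arguments are hypothetical placeholders, not values from the
# original code.
def _example_bernoulli_run():
    max_steps, means, variance = run_simulations_empirical_rewards(
        num_sims=500,
        reward_file='assistments_rewards.csv',  # hypothetical path
        experiment_id='exp_1',                  # hypothetical experiment id
        reward_header='correct',                # hypothetical binary reward column
        is_cost=False,
        outfile_directory='simulation_output/',
        successPrior=1,                         # uniform Beta(1, 1) prior on each arm
        failurePrior=1,
        shuffle_data=True)
    print('steps per simulation:', max_steps)
    print('empirical success rates:', means, 'variances:', variance)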