import random

import numpy as np

# The modules below are local to the surrounding project; their exact import
# paths are an assumption based on how they are referenced in the code.
import get_assistments_rewards
import ng_normal
import beta_bernoulli
import thompson_ng_policy
import thompson_policy
import run_effect_size_simulations
# Which module defines forced_actions, make_forced_actions, and
# get_output_filename is not shown in this snippet; importing them from
# run_effect_size_simulations is an assumption.
from run_effect_size_simulations import (forced_actions, make_forced_actions,
                                         get_output_filename)


def run_simulations_empirical_rewards(num_sims,
                                      reward_file,
                                      experiment_id,
                                      reward_header,
                                      is_cost,
                                      outfile_directory,
                                      prior_mean=0,
                                      forceActions=0,
                                      shuffle_data=False):
    '''
    Runs num_sims bandit simulations using the thompson_ng (Normal-Gamma Thompson
    sampling) policy on empirical rewards. Assumes reward_file is formatted like
    ASSISTments data, with the reward stored in the column reward_header. Each
    simulation runs for as many steps as there are reward samples available.
    '''
    num_actions = 2
    max_steps = -1
    means = []
    variance = []
    for i in range(num_sims):
        arm_1_rewards, arm_2_rewards = get_assistments_rewards.read_assistments_rewards(
            reward_file, reward_header, experiment_id, is_cost)
        if shuffle_data:
            random.shuffle(arm_1_rewards)
            random.shuffle(arm_2_rewards)
        max_steps = len(arm_1_rewards) + len(arm_2_rewards)
        means = [np.mean(arm_1_rewards), np.mean(arm_2_rewards)]
        variance = [np.var(arm_1_rewards), np.var(arm_2_rewards)]
        if forceActions != 0:
            print("Forcing actions:", forceActions)
            forced = make_forced_actions(
                num_actions,
                len(arm_1_rewards) + len(arm_2_rewards), forceActions)
        else:
            forced = forced_actions()

        cur_output_file = get_output_filename(
            outfile_directory,
            len(arm_1_rewards) + len(arm_2_rewards), i)
        models = [
            ng_normal.NGNormal(mu=prior_mean, k=1, alpha=1, beta=1)
            for _ in range(num_actions)
        ]
        thompson_ng_policy.calculate_thompson_single_bandit_empirical_params(
            arm_1_rewards,
            arm_2_rewards,
            num_actions=num_actions,
            dest=cur_output_file,
            models=models,
            action_mode=thompson_ng_policy.ActionSelectionMode.prob_is_best,
            relearn=True,
            forced=forced)
    return max_steps, means, variance
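

# Usage sketch (not part of the original example): the file path, experiment
# id, and reward column below are hypothetical placeholders showing how the
# Normal-Gamma variant above might be called on continuous rewards.
max_steps, arm_means, arm_variances = run_simulations_empirical_rewards(
    num_sims=100,
    reward_file='assistments_rewards.csv',   # hypothetical input file
    experiment_id='experiment_1',            # hypothetical experiment id
    reward_header='time_on_task',            # hypothetical reward column
    is_cost=False,
    outfile_directory='simulation_output/',
    prior_mean=0,
    shuffle_data=True)
print('max steps per simulation:', max_steps)
print('empirical arm means:', arm_means)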

Example #2

def run_simulations_empirical_rewards(num_sims,
                                      reward_file,
                                      experiment_id,
                                      reward_header,
                                      is_cost,
                                      outfile_directory,
                                      successPrior=1,
                                      failurePrior=1,
                                      forceActions=0,
                                      shuffle_data=False):
    '''
    Runs num_sims bandit simulations using the Thompson sampling policy with
    Beta-Bernoulli models on empirical rewards. Assumes reward_file is formatted
    like ASSISTments data, with the reward stored in the column reward_header.
    Each simulation runs for as many steps as there are reward samples available.
    '''
    num_actions = 2
    max_steps = -1
    means = []
    variance = []
    for i in range(num_sims):
        arm_1_rewards, arm_2_rewards = get_assistments_rewards.read_assistments_rewards(
            reward_file, reward_header, experiment_id, is_cost)
        if shuffle_data:
            random.shuffle(arm_1_rewards)
            random.shuffle(arm_2_rewards)
        max_steps = len(arm_1_rewards) + len(arm_2_rewards)
        means = [np.mean(arm_1_rewards), np.mean(arm_2_rewards)]
        variance = [np.var(arm_1_rewards), np.var(arm_2_rewards)]
        if forceActions != 0:
            print("Forcing actions:", forceActions)
            forced = run_effect_size_simulations.make_forced_actions(
                num_actions,
                len(arm_1_rewards) + len(arm_2_rewards), forceActions)
        else:
            forced = forced_actions()

        cur_output_file = get_output_filename(
            outfile_directory,
            len(arm_1_rewards) + len(arm_2_rewards), i)
        models = [
            beta_bernoulli.BetaBern(success=successPrior, failure=failurePrior)
            for _ in range(num_actions)
        ]
        thompson_policy.calculate_thompson_single_bandit_empirical_params(
            arm_1_rewards,
            arm_2_rewards,
            num_actions=num_actions,
            dest=cur_output_file,
            models=models,
            action_mode=thompson_policy.ActionSelectionMode.prob_is_best,
            relearn=True,
            forced=forced)
    return max_steps, means, variance
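

# Usage sketch (not part of the original example): the same hypothetical
# placeholders as above, but for binary (0/1) rewards modeled with a uniform
# Beta(1, 1) prior on each arm via successPrior and failurePrior.
max_steps, arm_means, arm_variances = run_simulations_empirical_rewards(
    num_sims=100,
    reward_file='assistments_rewards.csv',   # hypothetical input file
    experiment_id='experiment_1',            # hypothetical experiment id
    reward_header='correct',                 # hypothetical binary reward column
    is_cost=False,
    outfile_directory='simulation_output/',
    successPrior=1,
    failurePrior=1,
    shuffle_data=True)
print('empirical arm means:', arm_means)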