def create_models_binary(actions_df, prior, num_actions):
    assert num_actions == 2

    all_models = []
    cache_keys = [[] for _ in range(actions_df.shape[0])]

    for action in range(num_actions):
        successes_col = actions_df.loc[:, H_ALGO_ACTION_SUCCESS.format(action + 1)]
        failures_col = actions_df.loc[:, H_ALGO_ACTION_FAILURE.format(action + 1)]

        # Extend each row's cache key with this action's (successes, failures) counts.
        for i, (successes, failures) in enumerate(zip(successes_col, failures_col)):
            cache_keys[i].extend((successes, failures))

        cur_models = [beta_bernoulli.BetaBern(successes, failures)
                      for successes, failures in zip(successes_col, failures_col)]
        # add in the one for the prior
        cur_models.insert(0, beta_bernoulli.BetaBern(prior[0], prior[1]))
        all_models.append(cur_models)
    # Add in a cache key for the prior
    cache_keys.insert(0, prior * num_actions)
    return all_models, cache_keys
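# Usage sketch for create_models_binary above (illustrative only). It assumes the
# H_ALGO_ACTION_SUCCESS / H_ALGO_ACTION_FAILURE constants are format strings such as
# 'Action{}SuccessCount' / 'Action{}FailureCount' (matching the column names read in
# get_models_from_simulation further down); the toy DataFrame and its values are hypothetical.
def _example_create_models_binary():
    toy_actions_df = pd.DataFrame({
        'Action1SuccessCount': [1, 2],
        'Action1FailureCount': [0, 1],
        'Action2SuccessCount': [0, 1],
        'Action2FailureCount': [2, 2],
    })
    all_models, cache_keys = create_models_binary(toy_actions_df, prior=[1, 1], num_actions=2)
    # all_models[action][0] is the prior-only model; all_models[action][i + 1] matches row i.
    # cache_keys[0] is the prior's key; cache_keys[i + 1] holds row i's (successes, failures)
    # pairs for both actions.
    print(len(all_models), cache_keys[0], cache_keys[1])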
def run_simulations(num_sims, prob_per_arm, step_sizes, outfile_directory,
    successPrior = 1, failurePrior = 1, softmax_beta = None,
    reordering_fn = None, forceActions = 0, batch_size = 1, burn_in_size = 1,
    random_dur=0, random_start=0, mode='', epsilon = 0.1, resample = True):
    '''
    Runs num_sims bandit simulations with several different sample sizes (those in the list step_sizes),
    keeping results in memory. Uses thompson_policy.two_phase_random_thompson_policy with
    Beta-Bernoulli models; mode == 'uniform' makes the whole run uniform random.
    '''
    csv_output_file_names = []
    sim_results_dfs_list = []

    for num_steps in step_sizes:
        sim_results = []
        for i in range(num_sims):
            if forceActions != 0:
                forced = run_effect_size_simulations.make_forced_actions(len(prob_per_arm), num_steps, forceActions)
            else:
                forced = forced_actions()

            if softmax_beta is not None:
                # reward reordering via softmax is not implemented in this in-memory (fast) mode
                raise ValueError("softmax_beta is not supported in fast mode.")

            if mode=='uniform':
                models = [beta_bernoulli.BetaBern(success=1, failure=1) for _ in range(len(prob_per_arm))]
                random_dur = num_steps
            else:
                models = [beta_bernoulli.BetaBern(success=successPrior, failure=failurePrior) for _ in range(len(prob_per_arm))]


            sim_result, column_names,_ = \
                thompson_policy.two_phase_random_thompson_policy(
                            prob_per_arm=prob_per_arm,
                            users_count=num_steps,
                            random_dur=random_dur,
                            models=models,
                            random_start=random_start,
                            action_mode=thompson_policy.ActionSelectionMode.prob_is_best,
                            relearn=True,
                            forced = forced,
                            batch_size = batch_size, epsilon=epsilon,
                            decreasing_epsilon=1)

            sim_results.extend(sim_result)

        sim_results_df = pd.DataFrame(sim_results, columns=column_names)
        sim_results_df.index = list(range(num_steps)) * num_sims
        sim_results_dfs_list.append(sim_results_df)

        cur_output_file = get_output_filename(outfile_directory, num_steps, None, mode)
        csv_output_file_names.append(cur_output_file)

    return sim_results_dfs_list, csv_output_file_names
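# Minimal driver sketch for the fast-mode run_simulations above. The arm probabilities,
# step sizes, and output directory are illustrative placeholders; it assumes
# get_output_filename returns writable paths inside outfile_directory.
def _example_run_simulations_fast():
    dfs, out_files = run_simulations(num_sims=10,
                                     prob_per_arm=[0.5, 0.6],
                                     step_sizes=[100, 200],
                                     outfile_directory='sim_output',
                                     batch_size=1,
                                     mode='')
    for df, out_file in zip(dfs, out_files):
        df.to_csv(out_file)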
def run_simulations_uniform_random(num_sims, prob_per_arm, step_sizes, outfile_directory, forceActions = 0):
    '''
    Runs num_sims bandit simulations with several different sample sizes (those in the list step_sizes).
    Actions are chosen uniformly at random (epsilon = 1.0 in the Thompson policy).
    '''

    for i in range(num_sims):
        for num_steps in step_sizes:
            if forceActions != 0:
                print("Forcing actions:", forceActions)
                forced = run_effect_size_simulations.make_forced_actions(len(prob_per_arm), num_steps, forceActions)
            else:
                forced = forced_actions()
            cur_reward_file = get_rewards_filename(outfile_directory, num_steps, i)
            generate_single_bandit.generate_file(np.array(prob_per_arm),
                                                 num_steps,        
                                                 cur_reward_file)
            cur_output_file = get_output_filename(outfile_directory, num_steps, i)
            models = [beta_bernoulli.BetaBern(success=1, failure=1) for _ in range(len(prob_per_arm))]
            thompson_policy.calculate_thompson_single_bandit(cur_reward_file, 
                                         num_actions=len(prob_per_arm), 
                                         dest= cur_output_file, 
                                         models=models, 
                                         action_mode=thompson_policy.ActionSelectionMode.prob_is_best,
                                         epsilon = 1.0, 
                                         relearn=True,
                                         forced = forced)
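# Sketch of calling the uniform-random variant above. It passes epsilon=1.0 to
# calculate_thompson_single_bandit, which makes action selection uniformly random while the
# BetaBern(1, 1) models only accumulate counts. Paths and sizes are illustrative.
def _example_run_uniform_random():
    run_simulations_uniform_random(num_sims=5,
                                   prob_per_arm=[0.5, 0.5],
                                   step_sizes=[100],
                                   outfile_directory='uniform_output')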
def non_parametric_confidence_interval(actions_df,
                                       stat_fn,
                                       prior,
                                       is_binary=True,
                                       num_permutations=5,
                                       epsilon=0,
                                       ci_size=.95,
                                       grid_size=.05,
                                       forced_actions=None):
    in_ci = []
    non_offset_tau_0 = 0
    for grid_offset in np.arange(-3, 3.001, grid_size):
        tau_0 = non_offset_tau_0 + grid_offset

        rewards = actions_df.loc[:, H_ALGO_OBSERVED_REWARD]
        original_actions = actions_df.loc[:, H_ALGO_ACTION]
        rewards_mod = rewards.copy()
        rewards_mod.loc[original_actions ==
                        1] = rewards_mod.loc[original_actions == 1] - tau_0
        actual_stat = stat_fn(original_actions, rewards_mod)

        all_stats = []
        more_extreme_count = 0
        for i in range(num_permutations):
            # num_actions and debug (used below) are assumed to be module-level globals.
            if is_binary:
                models = [
                    beta_bernoulli.BetaBern(prior[0], prior[1])
                    for _ in range(num_actions)
                ]
            else:
                models = [
                    ng_normal.NGNormal(mu=prior[0],
                                       k=prior[1],
                                       alpha=prior[2],
                                       beta=prior[3])
                    for _ in range(num_actions)
                ]

            chosen_actions, models = calculate_thompson_single_bandit_permutation_testing(
                rewards,
                models,
                epsilon=epsilon,
                forced_actions=forced_actions)
            cur_stat = stat_fn(chosen_actions, rewards_mod)
            if cur_stat >= actual_stat:
                more_extreme_count += 1
            all_stats.append(cur_stat)
            if debug and (i % 100) == 0:
                print(i, "/ num_permutations:", more_extreme_count)
        pvalue = more_extreme_count / num_permutations
        if np.isnan(actual_stat):
            pvalue = np.nan
        if (1 - pvalue) <= ci_size:
            in_ci.append(tau_0)

    return in_ci
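# Sketch of turning the grid returned by non_parametric_confidence_interval into interval
# endpoints: the accepted tau_0 values are the grid points the permutation test fails to
# reject, so the CI runs from their minimum to their maximum. The mean-difference statistic
# and the actions_df columns follow the same conventions as permutation_test below; it also
# assumes module-level num_actions and debug are defined, since the function reads them.
def _example_confidence_interval(actions_df):
    def mean_difference(actions, rewards):
        # difference in mean observed reward between arm 1 and the other arm
        return rewards[actions == 1].mean() - rewards[actions != 1].mean()

    accepted = non_parametric_confidence_interval(actions_df,
                                                  mean_difference,
                                                  prior=[1, 1],
                                                  is_binary=True,
                                                  num_permutations=100)
    if len(accepted) > 0:
        print("CI for the treatment effect:", min(accepted), "to", max(accepted))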
def run_simulations(num_sims, prob_per_arm, step_sizes, outfile_directory, successPrior = 1, failurePrior = 1, softmax_beta = None, \
    reordering_fn = None, forceActions = 0, batch_size = 1, burn_in_size = 1):
    '''
    Runs num_sims bandit simulations with several different sample sizes (those in the list step_sizes).
    Writes a reward file per run and uses thompson_policy.old_two_phase_random_thompson_policy
    with Beta-Bernoulli models.
    '''

    for i in range(num_sims):
      #  num_steps_prev = 0
        for num_steps in step_sizes:
            if forceActions != 0:
#                 print("Forcing actions:", forceActions)
                forced = run_effect_size_simulations.make_forced_actions(len(prob_per_arm), num_steps, forceActions)
            else:
                forced = forced_actions()
            cur_reward_file = get_rewards_filename(outfile_directory, num_steps, i)
            generate_single_bandit.generate_file(np.array(prob_per_arm),
                                                 num_steps,        
                                                 cur_reward_file)
            if softmax_beta is not None:
                # reorder rewards
                reordered_reward_file = get_reordered_rewards_filename(outfile_directory, num_steps, i)
                reorder_samples_in_rewards.reorder_rewards_by_quartile(cur_reward_file, 
                                                                       reordered_reward_file, 
                                                                       reordering_fn, 
                                                                       softmax_beta)
            else:
                reordered_reward_file = cur_reward_file
            cur_output_file = get_output_filename(outfile_directory, num_steps, i)
            models = [beta_bernoulli.BetaBern(success=successPrior, failure=failurePrior) for _ in range(len(prob_per_arm))]


            '''thompson_policy.calculate_thompson_single_bandit(reordered_reward_file, 
                                         num_actions=len(prob_per_arm), 
                                         dest= cur_output_file, 
                                         models=models, 
                                         action_mode=thompson_policy.ActionSelectionMode.prob_is_best, 
                                         relearn=True,
                                         forced = forced,
                                         batch_size = batch_size, 
                                         burn_in_size = burn_in_size)
            '''
            # num_steps_prev = num_steps
            thompson_policy.old_two_phase_random_thompson_policy(reordered_reward_file, 
                                         num_actions=len(prob_per_arm), 
                                         dest= cur_output_file, 
                                         random_dur=0,
                                         models=models,
                                         random_start=0,
                                         action_mode=thompson_policy.ActionSelectionMode.prob_is_best, 
                                         relearn=True,
                                         forced = forced,
                                         batch_size = batch_size, 
                                         burn_in_size = burn_in_size)
def run_simulations_empirical_rewards(num_sims,
                                      reward_file,
                                      experiment_id,
                                      reward_header,
                                      is_cost,
                                      outfile_directory,
                                      successPrior=1,
                                      failurePrior=1,
                                      forceActions=0,
                                      shuffle_data=False):
    '''
    Runs num_sims bandit simulations over the empirical rewards read from reward_file
    (optionally shuffled). Uses Thompson sampling with Beta-Bernoulli models and returns
    the total number of steps plus the per-arm reward means and variances.
    '''
    num_actions = 2
    max_steps = -1
    means = []
    variance = []
    for i in range(num_sims):
        arm_1_rewards, arm_2_rewards = get_assistments_rewards.read_assistments_rewards(
            reward_file, reward_header, experiment_id, is_cost)
        if shuffle_data:
            random.shuffle(arm_1_rewards)
            random.shuffle(arm_2_rewards)
        max_steps = len(arm_1_rewards) + len(arm_2_rewards)
        means = [np.mean(arm_1_rewards), np.mean(arm_2_rewards)]
        variance = [np.var(arm_1_rewards), np.var(arm_2_rewards)]
        if forceActions != 0:
            print("Forcing actions:", forceActions)
            forced = run_effect_size_simulations.make_forced_actions(
                num_actions,
                len(arm_1_rewards) + len(arm_2_rewards), forceActions)
        else:
            forced = forced_actions()

        cur_output_file = get_output_filename(
            outfile_directory,
            len(arm_1_rewards) + len(arm_2_rewards), i)
        models = [
            beta_bernoulli.BetaBern(success=successPrior, failure=failurePrior)
            for _ in range(num_actions)
        ]
        thompson_policy.calculate_thompson_single_bandit_empirical_params(
            arm_1_rewards,
            arm_2_rewards,
            num_actions=num_actions,
            dest=cur_output_file,
            models=models,
            action_mode=thompson_policy.ActionSelectionMode.prob_is_best,
            relearn=True,
            forced=forced)
    return max_steps, means, variance
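# Hypothetical call to run_simulations_empirical_rewards above; the reward file, header,
# and experiment id are placeholders for whatever
# get_assistments_rewards.read_assistments_rewards expects in this codebase.
def _example_run_empirical():
    max_steps, means, variances = run_simulations_empirical_rewards(
        num_sims=5,
        reward_file='empirical_rewards.csv',
        experiment_id='experiment_1',
        reward_header='reward',
        is_cost=False,
        outfile_directory='empirical_output',
        shuffle_data=True)
    print(max_steps, means, variances)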
def run_simulations(num_sims, prob_per_arm, step_sizes, outfile_directory, successPrior = 1, failurePrior = 1, softmax_beta = None, \
    reordering_fn = None, forceActions = 0, batch_size = 1, burn_in_size = 1, c = 0.1, resample = True):
    '''
    Runs num_sims bandit simulations with several different sample sizes (those in the list step_sizes).
    Writes a reward file per run and uses ppd.calculate_epsilon_single_bandit with Beta-Bernoulli models.
    '''

    for i in range(num_sims):
        #  num_steps_prev = 0
        for num_steps in step_sizes:
            if forceActions != 0:
                #                 print("Forcing actions:", forceActions)
                forced = run_effect_size_simulations.make_forced_actions(
                    len(prob_per_arm), num_steps, forceActions)
            else:
                forced = forced_actions()
            cur_reward_file = get_rewards_filename(outfile_directory,
                                                   num_steps, i)
            generate_single_bandit.generate_file(np.array(prob_per_arm),
                                                 num_steps, cur_reward_file)
            if softmax_beta is not None:
                # reorder rewards
                reordered_reward_file = get_reordered_rewards_filename(
                    outfile_directory, num_steps, i)
                reorder_samples_in_rewards.reorder_rewards_by_quartile(
                    cur_reward_file, reordered_reward_file, reordering_fn,
                    softmax_beta)
            else:
                reordered_reward_file = cur_reward_file
            cur_output_file = get_output_filename(outfile_directory, num_steps,
                                                  i)
            models = [
                beta_bernoulli.BetaBern(success=successPrior,
                                        failure=failurePrior)
                for _ in range(len(prob_per_arm))
            ]

            # If no models are passed, ppd.calculate_epsilon_single_bandit falls back to greedy selection.
            # (Previous thresholds used here: thresh = 0.03, or thresh = 0.1 for a small effect,
            # es = 0.1: 0.55 - 0.45 = 0.10.)
            ppd.calculate_epsilon_single_bandit(reordered_reward_file,
                                                models=models,
                                                num_actions=len(prob_per_arm),
                                                dest=cur_output_file,
                                                forced=forced,
                                                c=c,
                                                resample=resample)
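# Sketch call for the file-based PPD/epsilon variant above; c is the exploration parameter
# and resample the resampling flag, both passed straight through to
# ppd.calculate_epsilon_single_bandit. Values and paths are illustrative.
def _example_run_ppd_file_based():
    run_simulations(num_sims=5,
                    prob_per_arm=[0.45, 0.55],
                    step_sizes=[100],
                    outfile_directory='ppd_output',
                    c=0.1,
                    resample=True)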
def run_simulations_uniform_random_binary(
        num_sims,
        prob_per_arm,
        steps_before_switch,
        steps_after_switch,
        outfile_directory,
        forceActions=0,
        switch_to_best_if_nonsignificant=True):
    '''
    Runs num_sims bandit simulations of steps_before_switch + steps_after_switch steps each.
    Samples uniformly at random for steps_before_switch steps, then switches to a fixed policy
    via thompson_policy.calculate_thompson_switch_to_fixed_policy.
    '''
    num_steps = steps_before_switch + steps_after_switch

    for i in range(num_sims):
        if forceActions != 0:
            print("Forcing actions:", forceActions)
            forced = run_effect_size_simulations.make_forced_actions(
                len(prob_per_arm), num_steps, forceActions)
        else:
            forced = forced_actions()

        cur_reward_file = get_rewards_filename(outfile_directory, num_steps, i)
        generate_single_bandit.generate_file(np.array(prob_per_arm), num_steps,
                                             cur_reward_file)
        cur_output_file = get_output_filename(outfile_directory, num_steps, i)
        models = [
            beta_bernoulli.BetaBern(success=1, failure=1)
            for _ in range(len(prob_per_arm))
        ]
        thompson_policy.calculate_thompson_switch_to_fixed_policy(
            cur_reward_file,
            num_actions=len(prob_per_arm),
            dest=cur_output_file,
            num_actions_before_switch=steps_before_switch,
            models=models,
            action_mode=thompson_policy.ActionSelectionMode.prob_is_best,
            epsilon=1.0,
            switch_to_best_if_nonsignificant=switch_to_best_if_nonsignificant,
            forced=forced)
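# Sketch call for run_simulations_uniform_random_binary above: each run samples uniformly at
# random for steps_before_switch steps and then spends the remaining steps_after_switch steps
# in the fixed phase. Values and paths are illustrative.
def _example_run_uniform_then_fixed():
    run_simulations_uniform_random_binary(num_sims=5,
                                          prob_per_arm=[0.4, 0.6],
                                          steps_before_switch=50,
                                          steps_after_switch=150,
                                          outfile_directory='switch_output')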
def get_models_from_simulation(simulation_out_file, is_binary=True):
    df = pd.read_csv(simulation_out_file, header=1)
    last_row = df.iloc[-1, :]
    if is_binary:
        # Columns: Action{i}SuccessCount / Action{i}FailureCount
        models = [
            beta_bernoulli.BetaBern(
                last_row.loc['Action' + str(i) + 'SuccessCount'],
                last_row.loc['Action' + str(i) + 'FailureCount'])
            for i in range(1, 3)
        ]
    else:
        models = [
            ng_normal.NGNormal(
                mu=last_row.loc['Action' + str(i) + 'EstimatedMu'],
                k=last_row.loc['Action' + str(i) + 'EstimatedVariance'],
                alpha=last_row.loc['Action' + str(i) + 'EstimatedAlpha'],
                beta=last_row.loc['Action' + str(i) + 'EstimatedBeta'])
            for i in range(1, 3)
        ]
    return models
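# Sketch of reloading posteriors from a finished simulation's output file. It assumes the
# layout produced by the policies above: one extra leading row (hence header=1) and, for
# binary rewards, per-action columns named Action{i}SuccessCount / Action{i}FailureCount.
# The path is hypothetical.
def _example_reload_models():
    models = get_models_from_simulation('sim_output/results_n=100_0.csv', is_binary=True)
    for i, model in enumerate(models, start=1):
        print("arm", i, "posterior:", model)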
def permutation_test(actions_df,
                     stat_fn,
                     prior,
                     is_binary=True,
                     num_permutations=5,
                     epsilon=0,
                     forced_actions=None):
    rewards = actions_df.loc[:, H_ALGO_OBSERVED_REWARD]
    #"ObservedRewardofAction"
    original_actions = actions_df.loc[:, H_ALGO_ACTION]  #"AlgorithmAction"
    actual_stat = stat_fn(original_actions, rewards)

    all_stats = []
    more_extreme_count = 0
    for i in range(num_permutations):
        # num_actions and debug (used below) are assumed to be module-level globals.
        if is_binary:
            models = [
                beta_bernoulli.BetaBern(prior[0], prior[1])
                for _ in range(num_actions)
            ]
        else:
            models = [
                ng_normal.NGNormal(mu=prior[0],
                                   k=prior[1],
                                   alpha=prior[2],
                                   beta=prior[3]) for _ in range(num_actions)
            ]

        chosen_actions, models = calculate_thompson_single_bandit_permutation_testing(
            rewards, models, epsilon=epsilon, forced_actions=forced_actions)
        cur_stat = stat_fn(chosen_actions, rewards)
        if cur_stat >= actual_stat:
            more_extreme_count += 1
        all_stats.append(cur_stat)
        if debug and (i % 100) == 0:
            print(i, "/ num_permutations:", more_extreme_count)
    pvalue = more_extreme_count / num_permutations
    if np.isnan(actual_stat):
        pvalue = np.nan
    return pvalue, all_stats, actual_stat
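# Example test statistic for permutation_test: the difference in mean observed reward
# between the two arms. Any function of the (actions, rewards) Series works; this is a
# common, simple choice. The call assumes module-level num_actions and debug are defined,
# since permutation_test reads them, and that actions_df uses the H_ALGO_* column names.
def mean_reward_difference(actions, rewards):
    return rewards[actions == 1].mean() - rewards[actions != 1].mean()

def _example_permutation_test(actions_df):
    pvalue, null_stats, observed_stat = permutation_test(actions_df,
                                                         mean_reward_difference,
                                                         prior=[1, 1],
                                                         is_binary=True,
                                                         num_permutations=1000)
    print("observed stat:", observed_stat, "permutation p-value:", pvalue)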
def switch_bandit_queue(immediate_input, true_input, immediate_output, true_output, 
    time_step_switch, total_time_steps, num_actions = 3):
    
    #Reward info for samples
    samples_with_true_reward = read_reward_file(true_input, num_actions)
    samples_with_immediate_reward = read_reward_file(immediate_input, num_actions)
    
    # Store the samples we have but that haven't yet arrived
    samples = []
    cur_sample_number = 0
    
    # Keep track of what actions are chosen so we can write out the results at the end
    chosen_actions = []
    sampling_distributions = []
    
    #Queue algorithm variables
    num_samples = 1
    mixing_weight = .01 # user-defined mixing weight for how much to trust the heuristic policy (alpha in paper)
    queues = [[] for _ in range(num_actions)] # queues for holding samples
    max_queue_size = 1 # limit on queue size (B in paper)
    queue_sizes = np.zeros(num_actions)
    delays = [] # records how long it is between sample being selected and arriving (L in paper) 
    models_heuristic = [beta_bernoulli.BetaBern(success = 1, failure = 1) for _ in range(num_actions)] # Thompson sampling stats for heuristic(immediate); h in paper
    models_base = [beta_bernoulli.BetaBern(success = 1, failure = 1) for _ in range(num_actions)] # Thompson sampling stats for base(delayed)
    heuristic_dist = get_thompson_sample_distribution(models_heuristic) # approx distribution over arms for heuristic; h in paper
    #base_dist = get_thompson_sample_distribution(models_base) # approx distribution over arms for base; p in paper
    
    arm_choice = get_thompson_arm_choice(models_base) #Draw first action choice from base distribution (I in paper)
    while cur_sample_number < total_time_steps:
        while len(queues[arm_choice]) != 0:
            reward = queues[arm_choice].pop(0) # Get the new reward
            queue_sizes[arm_choice] -= 1
            # update base model with this reward, converted to {-1, 1}; standard Thompson
            # sampling code does this, though the BetaBernoulli implementation does not require it
            models_base[arm_choice].update_posterior(0, 2 * reward - 1)
            
            # resample arm_choice
            arm_choice = get_thompson_arm_choice(models_base) 
            
        # resample base arm distribution
        base_dist = get_thompson_sample_distribution(models_base, arm_choice=arm_choice)
        heuristic_dist = get_thompson_sample_distribution(models_heuristic) # approx distribution over arms for heuristic; h in paper
    
#         for base_model,i  in zip(models_base, range(len(models_base))):
#             print("Base model", i, "successes", base_model.success, "failures", base_model.failure)
#         for heuristic_model,i  in zip(models_heuristic, range(len(models_heuristic))):
#             print("Heur model", i, "successes", heuristic_model.success, "failures", heuristic_model.failure)
#             
    
    
        sampling_dist = get_sampling_dist(heuristic_dist, base_dist, num_actions, arm_choice, queue_sizes, max_queue_size, mixing_weight)
        if sum(sampling_dist) < .995:
            print("Sampling dist not a probability distribution")
            sampling_dist = get_sampling_dist(heuristic_dist, base_dist, num_actions, arm_choice, queue_sizes, max_queue_size, mixing_weight)
        # sample from the environment (one sample per step for online updates, or a full batch otherwise, per the paper)
        for _ in range(num_samples):
            i = np.argmax(nprand.multinomial(1, sampling_dist))
            # need to get this sample and put it in our samples list
            # sample stores when it was selected, the time step it will arrive, the arm choice, the immediate reward, and the final reward
            sample = (cur_sample_number, max(time_step_switch, cur_sample_number), i, 
                      2*samples_with_immediate_reward[cur_sample_number][i]-1,  2*samples_with_true_reward[cur_sample_number][i] - 1)
            # observe the reward for the heuristic bandit if we aren't yet past the switch time (this isn't part of the paper)
            # we always observe it here because we'll always remove it when the sample arrives
            models_heuristic[i].update_posterior(0, sample[IMMEDIATE_REWARD])
            
            # store the sample so we'll be able to check it when it arrives
            samples.append(sample)
            
            # store the action so we can write it out to report results
            chosen_actions.append(i)
            sampling_distributions.append(sampling_dist)
            
            # increment queue size and sample counts based on sample
            queue_sizes[i] += 1
            cur_sample_number += 1
            

            
        # now need to find out which of the samples have arrived - i.e., delayed reward has come in
        samples_to_remove = []
        for sample in samples:
            # check if the arrival time is here
            if sample[ARRIVAL_TIME] <= cur_sample_number:
                # this sample has arrived - need to update the heuristic model, put it in a queue and mark it for removal
                queues[sample[ARM_INDEX]].append(sample[FINAL_REWARD])
                samples_to_remove.append(sample)
                # update the heuristic by dropping immediate reward from model and adding final reward
                models_heuristic[sample[ARM_INDEX]].remove_from_model(0, sample[IMMEDIATE_REWARD])
                models_heuristic[sample[ARM_INDEX]].update_posterior(0, sample[FINAL_REWARD])
                
                # record the delay of this sample
                delays.append(cur_sample_number - sample[SAMPLE_TIME])
        
        # remove any of the samples that arrived this time around
        for sample in samples_to_remove:
            samples.remove(sample)
            
        # set max_queue_size to maximum delay
        # Problem: can't take the max of an empty list
        # Bigger semantic problem: what should the maximum delay be if nothing has yet arrived?
        if len(delays) != 0:
            max_queue_size = max(delays)
        else:
            max_queue_size += num_samples # increment max queue size based on how many samples we've seen so far
    
    # At the end, write out the chosen actions and sampling distributions against both reward files
    writeOutFile(true_input, true_output, chosen_actions, num_actions, sampling_distributions)
    writeOutFile(immediate_input, immediate_output, chosen_actions, num_actions, sampling_distributions)
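# switch_bandit_queue indexes each sample tuple via module-level constants that are not
# shown in this file. From the order in which the tuple is built above (selection time,
# arrival time, arm index, immediate reward, final reward), the assumed layout is:
SAMPLE_TIME = 0       # time step at which the sample was selected
ARRIVAL_TIME = 1      # time step at which its delayed reward arrives
ARM_INDEX = 2         # index of the chosen arm
IMMEDIATE_REWARD = 3  # immediate reward, mapped to {-1, 1}
FINAL_REWARD = 4      # delayed (true) reward, mapped to {-1, 1}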
def run_simulations(num_sims, prob_per_arm, step_sizes, outfile_directory,
    successPrior = 1, failurePrior = 1, softmax_beta = None,
    reordering_fn = None, forceActions = 0, batch_size = 1, burn_in_size = 1,
    random_dur=0, random_start=0, mode='', c = 0.1, resample = True, ns_stop = 0):
    '''
    Runs num_sims bandit simulations with several different sample sizes (those in the list step_sizes),
    keeping results in memory. Uses thompson_policy.ppd_two_phase_random_thompson_policy with
    Beta-Bernoulli models; mode == 'uniform' makes the whole run uniform random.
    '''
    csv_output_file_names = []
    sim_results_dfs_list = []

    for num_steps in step_sizes:
        sim_results = []
        for i in range(num_sims):
            if forceActions != 0:
                forced = run_effect_size_simulations.make_forced_actions(len(prob_per_arm), num_steps, forceActions)
            else:
                forced = forced_actions()

            if softmax_beta is not None:
                # reward reordering via softmax is not implemented in this in-memory (fast) mode
                raise ValueError("softmax_beta is not supported in fast mode.")

            if mode=='uniform':
                models = [beta_bernoulli.BetaBern(success=1, failure=1) for _ in range(len(prob_per_arm))]
                random_dur = num_steps
            else:
                models = [beta_bernoulli.BetaBern(success=successPrior, failure=failurePrior) for _ in range(len(prob_per_arm))]


            
            sim_result, column_names,_ = \
                thompson_policy.ppd_two_phase_random_thompson_policy(
                            prob_per_arm=prob_per_arm,
                            users_count=num_steps,
                            random_dur=random_dur,
                            models=models,
                            random_start=random_start,
                            action_mode='Greedy',
                            relearn=True,
                            forced = forced,
                            batch_size = batch_size, c=c, resample = resample, ns_stop = ns_stop)

            # do IPW here? sim_result is the equivalent of the old actions file (actions_df)
            # sim_result_df = pd.DataFrame(sim_result, columns=column_names)  # not used yet
            # calculate_ipw_by_step_size(actions_root=sim_result_df, num_samples=1000,
            #                            num_actions=2, cached_probs={}, prior=prior,
            #                            binary_rewards=is_binary, config=config, n=n,
            #                            num_sims=num_sims, batch_size=bs)
            sim_results.extend(sim_result)

        sim_results_df = pd.DataFrame(sim_results, columns=column_names)
        sim_results_df.index = list(range(num_steps)) * num_sims
        sim_results_dfs_list.append(sim_results_df)

        cur_output_file = get_output_filename(outfile_directory, num_steps, None, mode)
        csv_output_file_names.append(cur_output_file)

    return sim_results_dfs_list, csv_output_file_names
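# Driver sketch for the fast-mode PPD variant above; c and ns_stop are passed straight
# through to thompson_policy.ppd_two_phase_random_thompson_policy. Directory and parameter
# values are illustrative.
def _example_run_ppd_fast():
    dfs, out_files = run_simulations(num_sims=10,
                                     prob_per_arm=[0.45, 0.55],
                                     step_sizes=[100],
                                     outfile_directory='ppd_fast_output',
                                     c=0.1,
                                     resample=True,
                                     ns_stop=0)
    for df, out_file in zip(dfs, out_files):
        df.to_csv(out_file)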