def make_stats_row_from_df(cur_data, include_power, effect_size = None, alpha = None):
    '''
    Build one dictionary of summary statistics for a two-arm data frame.

    Arms are the rows whose action_header column equals 1 or 2; rewards are
    read from the obs_reward_header column (both names are module-level).

    cur_data: data frame holding the action and observed-reward columns.
    include_power: when truthy, a 'power' entry is added via
        smp.GofChisquarePower().solve_power.
    effect_size, alpha: forwarded to the power calculation; only used when
        include_power is truthy.

    Returns a dict with per-arm sample sizes and means, the Wald-type
    statistic and its two-sided p-value, the total reward, the arm-size
    ratio, optionally the power, the observed effect size, and the result of
    the nominal-association test on the 2x2 success/failure table.
    '''
    arms = (1, 2)

    # Reward column restricted to each arm, reused for sums and means below.
    rewards_by_arm = [
        cur_data[cur_data[action_header] == arm][obs_reward_header]
        for arm in arms
    ]
    sample_sizes = np.array(
        [np.sum(cur_data[action_header] == arm) for arm in arms])
    successes = np.array([np.sum(rewards) for rewards in rewards_by_arm])
    mean_1, mean_2 = (np.mean(rewards) for rewards in rewards_by_arm)

    cur_row = {
        'sample_size_1': sample_sizes[0],
        'sample_size_2': sample_sizes[1],
        'mean_1': mean_1,
        'mean_2': mean_2,
    }

    # Wald-type statistic:
    #   SE = sqrt(p_A * (1 - p_A) / N_A + p_B * (1 - p_B) / N_B)
    #   z  = (p_A - p_B) / SE
    variance_1 = mean_1 * (1 - mean_1) / sample_sizes[0]
    variance_2 = mean_2 * (1 - mean_2) / sample_sizes[1]
    standard_error = np.sqrt(variance_1 + variance_2)
    z_score = (mean_1 - mean_2) / standard_error
    cur_row['wald_type_stat'] = z_score  # TODO add in Delta
    # Two-sided p-value from the symmetric standard normal distribution.
    cur_row['wald_pval'] = (1 - scipy.stats.norm.cdf(np.abs(z_score))) * 2

    cur_row['total_reward'] = np.sum(cur_data[obs_reward_header])
    cur_row['ratio'] = sample_sizes[0] / sample_sizes[1]

    if include_power:
        # (rows - 1) * (cols - 1) + 1 bins for the 2x2 table.
        bin_count = (2 - 1) * (2 - 1) + 1
        cur_row['power'] = smp.GofChisquarePower().solve_power(
            effect_size,
            nobs=sum(sample_sizes),
            n_bins=bin_count,
            alpha=alpha)
    cur_row['actual_es'] = calculate_effect_size(sample_sizes, successes)

    # Chi-squared contingency test: one row per arm, columns are
    # (successes, failures).
    failures = sample_sizes - successes
    contingency = sms.Table(np.stack((successes, failures)).T)
    association = contingency.test_nominal_association()
    cur_row['stat'] = association.statistic
    cur_row['pvalue'] = association.pvalue
    cur_row['df'] = association.df

    # Duplicate the test results under the "UnequalVar" keys so the row has
    # the same columns as the normal-rewards output.
    cur_row['statUnequalVar'] = cur_row['stat']
    cur_row['pvalueUnequalVar'] = cur_row['pvalue']
    cur_row['dfUnequalVar'] = cur_row['df']
    return cur_row
def make_stats_row_from_df(simulations_df, include_power, action_count, effect_size=None, alpha=None):
    '''
    Build summary-statistic rows for every simulation at four checkpoints.

    simulations_df is treated as a stack of simulations, each occupying
    step_size = max(index) + 1 consecutive rows. step_size is assumed to be
    4n; for each of the sizes ceil(n/2), n, 2n and 4n the first that many
    rows of every simulation are summarised into one dict.

    simulations_df: data frame with the action_header, obs_reward_header,
        ppd_exp_header and "SampleNumber" columns.
    include_power: when truthy, a 'power' entry is added via
        smp.GofChisquarePower().solve_power.
    action_count: number of arms; arms are labelled 1..action_count.
    effect_size, alpha: forwarded to the power calculation; only used when
        include_power is truthy.

    Returns a list of dicts, grouped by checkpoint size and, within a size,
    ordered by simulation number.
    '''
    step_size = max(simulations_df.index) + 1  # rows per simulation (4n)
    n = step_size / 4  # assumes step_size was set to 4n
    n_size_list = [math.ceil(n / 2), int(n), int(2 * n), int(4 * n)]
    print("step_size", step_size)
    print("n_size_list", n_size_list)

    trials_count = len(simulations_df)  # step_size * number of simulations
    print("trials_count", trials_count)
    print(simulations_df.columns)

    arms = range(1, action_count + 1)
    all_rows = []
    for n_size in n_size_list:
        sim_starts = range(0, trials_count, step_size)
        for sim_num, start in enumerate(sim_starts):
            # First n_size rows of this simulation.
            one_sim_df = simulations_df[start:start + n_size].copy()
            assert (len(one_sim_df) == n_size)

            # Cumulative share of the ppd_exp_header column over the window.
            prop_exploring_ppd_cuml = (
                np.sum(one_sim_df[ppd_exp_header]) / n_size)
            # Snapshot taken at the last sample of the window; SampleNumber
            # is compared against n_size - 1, i.e. treated as zero-based.
            last_sample = one_sim_df[one_sim_df["SampleNumber"] == n_size - 1]
            exploring_this_n = last_sample[ppd_exp_header].iloc[0]

            # Per-arm reward columns, then float arrays of counts, sums and
            # means (appending to an empty array yields float64).
            rewards_by_arm = [
                one_sim_df[one_sim_df[action_header] == arm][obs_reward_header]
                for arm in arms
            ]
            sample_sizes = np.append(
                np.array([]),
                [np.sum(one_sim_df[action_header] == arm) for arm in arms])
            successes = np.append(
                np.array([]), [np.sum(rewards) for rewards in rewards_by_arm])
            means = np.append(
                np.array([]), [np.mean(rewards) for rewards in rewards_by_arm])

            cur_row = {}
            for arm_pos in range(action_count):
                arm_label = arm_pos + 1
                cur_row['sample_size_{}'.format(arm_label)] = \
                    sample_sizes[arm_pos]
                cur_row['mean_{}'.format(arm_label)] = means[arm_pos]

            if action_count == 2:
                # Wald-type statistic:
                #   SE = sqrt(p_A*(1 - p_A)/N_A + p_B*(1 - p_B)/N_B)
                #   z  = (p_A - p_B) / SE
                variance_a = means[0] * (1 - means[0]) / sample_sizes[0]
                variance_b = means[1] * (1 - means[1]) / sample_sizes[1]
                standard_error = np.sqrt(variance_a + variance_b)
                z_score = (means[0] - means[1]) / standard_error
                cur_row['wald_type_stat'] = z_score
                # Two-sided p-value from the symmetric standard normal.
                cur_row['wald_pval'] = (
                    1 - scipy.stats.norm.cdf(np.abs(z_score))) * 2

            cur_row['total_reward'] = np.sum(one_sim_df[obs_reward_header])
            cur_row['ratio'] = sample_sizes[0] / sample_sizes[1]

            if include_power:
                # (rows - 1) * (cols - 1) + 1 bins, fixed for a 2x2 table.
                bin_count = (2 - 1) * (2 - 1) + 1
                cur_row['power'] = smp.GofChisquarePower().solve_power(
                    effect_size,
                    nobs=sum(sample_sizes),
                    n_bins=bin_count,
                    alpha=alpha)
            cur_row['actual_es'] = calculate_effect_size(
                sample_sizes, successes)

            # Chi-squared contingency test: one row per arm, columns are
            # (successes, failures).
            failures = sample_sizes - successes
            contingency = sms.Table(np.stack((successes, failures)).T)
            association = contingency.test_nominal_association()
            cur_row['stat'] = association.statistic
            cur_row['pvalue'] = association.pvalue
            cur_row['df'] = association.df

            # Duplicate the test results under the "UnequalVar" keys so the
            # row has the same columns as the normal-rewards output.
            cur_row['statUnequalVar'] = cur_row['stat']
            cur_row['pvalueUnequalVar'] = cur_row['pvalue']
            cur_row['dfUnequalVar'] = cur_row['df']

            cur_row['num_steps'] = max(one_sim_df.index) + 1
            cur_row['sim'] = sim_num
            cur_row["prop_exploring_ppd_cuml"] = prop_exploring_ppd_cuml
            cur_row["exploring_ppd_at_this_n"] = exploring_this_n
            all_rows.append(cur_row)
    return all_rows
def make_stats_row_from_df(simulations_df, include_power, action_count, effect_size=None, alpha=None):
    '''
    Build one summary-statistics row per simulation in simulations_df.

    simulations_df is treated as a stack of simulations, each occupying
    step_size = max(index) + 1 consecutive rows; every such slice is
    summarised into one dict.

    simulations_df: data frame with the action_header and obs_reward_header
        columns.
    include_power: when truthy, a 'power' entry is added via
        smp.GofChisquarePower().solve_power.
    action_count: number of arms; arms are labelled 1..action_count.
    effect_size, alpha: forwarded to the power calculation; only used when
        include_power is truthy.

    Returns a list of dicts ordered by simulation number.
    '''
    step_size = max(simulations_df.index) + 1  # rows per simulation
    trials_count = len(simulations_df)
    arms = range(1, action_count + 1)

    all_rows = []
    sim_starts = range(0, trials_count, step_size)
    for sim_num, start in enumerate(sim_starts):
        one_sim_df = simulations_df[start:start + step_size].copy()

        # Per-arm reward columns, then float arrays of counts, sums and
        # means (appending to an empty array yields float64).
        rewards_by_arm = [
            one_sim_df[one_sim_df[action_header] == arm][obs_reward_header]
            for arm in arms
        ]
        sample_sizes = np.append(
            np.array([]),
            [np.sum(one_sim_df[action_header] == arm) for arm in arms])
        successes = np.append(
            np.array([]), [np.sum(rewards) for rewards in rewards_by_arm])
        means = np.append(
            np.array([]), [np.mean(rewards) for rewards in rewards_by_arm])

        cur_row = {}
        for arm_pos in range(action_count):
            arm_label = arm_pos + 1
            cur_row['sample_size_{}'.format(arm_label)] = sample_sizes[arm_pos]
            cur_row['mean_{}'.format(arm_label)] = means[arm_pos]

        if action_count == 2:
            # Wald-type statistic:
            #   SE = sqrt(p_A*(1 - p_A)/N_A + p_B*(1 - p_B)/N_B)
            #   z  = (p_A - p_B) / SE
            variance_a = means[0] * (1 - means[0]) / sample_sizes[0]
            variance_b = means[1] * (1 - means[1]) / sample_sizes[1]
            standard_error = np.sqrt(variance_a + variance_b)
            z_score = (means[0] - means[1]) / standard_error
            cur_row['wald_type_stat'] = z_score
            # Two-sided p-value from the symmetric standard normal.
            cur_row['wald_pval'] = (
                1 - scipy.stats.norm.cdf(np.abs(z_score))) * 2

        cur_row['total_reward'] = np.sum(one_sim_df[obs_reward_header])
        cur_row['ratio'] = sample_sizes[0] / sample_sizes[1]

        if include_power:
            # (rows - 1) * (cols - 1) + 1 bins, fixed for a 2x2 table.
            bin_count = (2 - 1) * (2 - 1) + 1
            cur_row['power'] = smp.GofChisquarePower().solve_power(
                effect_size,
                nobs=sum(sample_sizes),
                n_bins=bin_count,
                alpha=alpha)
        cur_row['actual_es'] = calculate_effect_size(sample_sizes, successes)

        # Chi-squared contingency test: one row per arm, columns are
        # (successes, failures).
        failures = sample_sizes - successes
        contingency = sms.Table(np.stack((successes, failures)).T)
        association = contingency.test_nominal_association()
        cur_row['stat'] = association.statistic
        cur_row['pvalue'] = association.pvalue
        cur_row['df'] = association.df

        # Duplicate the test results under the "UnequalVar" keys so the row
        # has the same columns as the normal-rewards output.
        cur_row['statUnequalVar'] = cur_row['stat']
        cur_row['pvalueUnequalVar'] = cur_row['pvalue']
        cur_row['dfUnequalVar'] = cur_row['df']

        cur_row['num_steps'] = max(one_sim_df.index) + 1
        cur_row['sim'] = sim_num
        all_rows.append(cur_row)
    return all_rows