def get_pis_epsilon(all_counts, all_sums, clipping=0):
    """Return the clipped greedy selection probability of arm index 1.

    For every arm ``k`` in ``range(args.K)`` the empirical mean is formed as
    ``dict_sum(all_sums[k]) / dict_sum(all_counts[k])``; NaN entries (e.g.
    from 0/0) are replaced through ``np.nan_to_num``.  Within each row, every
    arm whose mean equals the row maximum receives ``1 - clipping`` and every
    other arm receives ``clipping`` (tied arms all receive ``1 - clipping``).

    Args:
        all_counts: per-arm containers accepted by ``dict_sum``.
        all_sums: per-arm containers accepted by ``dict_sum``.
        clipping: lower bound on any arm's probability.

    Returns:
        Column 1 of the ``(args.N, args.K)`` probability table.
    """
    per_arm_means = [
        np.divide(dict_sum(all_sums[arm]), dict_sum(all_counts[arm]))
        for arm in range(args.K)
    ]
    # After the transpose, rows are expected to have one entry per arm
    # (the broadcast below requires shape compatibility with (N, K)).
    means = np.nan_to_num(per_arm_means).T

    # Start every entry at the clipping floor.
    pis = np.array([[clipping] * args.K for _ in range(args.N)])

    row_best = np.expand_dims(means.max(axis=1), 1)
    row_best = np.broadcast_to(row_best, (args.N, args.K))
    is_best = np.equal(row_best, means)

    # Lift the arg-max arm(s) from `clipping` up to `1 - clipping`.
    pis += (1 - 2 * clipping) * is_best
    return pis[:, 1]
def get_pis_TS(all_counts, all_sums, var, clipping=0):
    """Return the posterior probability that arm 1 beats arm 0.

    For each arm ``k`` in ``range(args.K)`` a Normal posterior is formed from
    the module-level ``prior_means[k]`` / ``prior_vars[k]``, the noise
    variance ``var`` and the observed counts and sums.  The result is
    ``Phi((pm[1] - pm[0]) / sqrt(pv[1] + pv[0]))``, i.e. the probability that
    a draw from arm 1's posterior exceeds an independent draw from arm 0's.
    Only arms 0 and 1 enter the returned value.

    Args:
        all_counts: per-arm containers accepted by ``dict_sum``.
        all_sums: per-arm containers accepted by ``dict_sum``.
        var: reward noise variance used in the posterior update.
        clipping: when > 0, probabilities are limited to
            ``[clipping, 1 - clipping]``.

    Returns:
        The (optionally clipped) probabilities as returned by
        ``stats.norm.cdf``.
    """
    pm = []  # posterior means, one entry per arm
    pv = []  # posterior variances, one entry per arm
    for k in range(args.K):
        counts_k = dict_sum(all_counts[k])
        sums_k = dict_sum(all_sums[k])
        # Empirical mean; entries with zero observations stay at 0.
        mean_k = np.divide(sums_k, counts_k,
                           out=np.zeros_like(sums_k), where=counts_k != 0)

        # Both posterior moments share this denominator.
        denom = var + prior_vars[k] * counts_k
        # Posterior mean
        pm.append(np.divide(
            prior_means[k] * var + prior_vars[k] * mean_k * counts_k, denom))
        # Posterior variance
        pv.append(np.divide(prior_vars[k] * var, denom))

    # Mean and variance of (arm 1 draw - arm 0 draw).
    diff_mean = pm[1] - pm[0]
    diff_var = pv[1] + pv[0]

    # Calculate sampling probability
    pis = stats.norm.cdf(np.divide(diff_mean, np.sqrt(diff_var)))
    if clipping > 0:
        pis = np.clip(pis, clipping, 1 - clipping)
    return pis
def compute_statistics(self, chromosome: str, start_position: int):
    """
    Calculate statistics for fragments overlapping at given position.

    Counts are gathered with ``self.collect_counts``; the Watson and Crick
    four-mer counts are merged through ``dict_sum(..., inplace=False)``, and
    the wild-type / variant bases come from
    ``self._determine_wild_type_variant_bases``.

    Returns a dict with the keys ``fragment_length``, ``watson_fourmer``,
    ``crick_fourmer``, ``fourmer``, ``chromosome``, ``position``,
    ``wild_type_base`` and ``variant_bases``.
    """
    tallies = self.collect_counts(chromosome, start_position)
    combined_fourmers = dict_sum(
        tallies.watson_fourmer, tallies.crick_fourmer, inplace=False
    )
    wild_type_base, variant_bases = self._determine_wild_type_variant_bases(
        tallies.fragment_length
    )

    summary = {}
    summary["fragment_length"] = count_fragments(tallies.fragment_length)
    summary["watson_fourmer"] = count_fragments(tallies.watson_fourmer)
    summary["crick_fourmer"] = count_fragments(tallies.crick_fourmer)
    summary["fourmer"] = count_fragments(combined_fourmers)
    summary["chromosome"] = chromosome
    summary["position"] = start_position
    summary["wild_type_base"] = wild_type_base
    summary["variant_bases"] = variant_bases
    return summary
def Wdecorrelated_inference(simulation_dict, alphas, power_dict, noise_std):
    """Compute the W-decorrelated test statistic and its power for each t.

    For every ``t`` in the module-level ``Tvals`` the per-arm sample means are
    corrected by a geometrically weighted sum of residuals (weights
    ``R * (1 - R) ** j`` with ``R = 1 / (1 + lam)``), the difference is scaled
    by the matching variance term and by ``noise_std``, and the result is
    handed to ``calculate_power``.

    When the module flag ``null`` is truthy, ``lam`` is the ``1 / (n * t)``
    quantile of ``min(counts0, counts1) / log(n * t)`` and is recorded;
    otherwise it is read from ``lambda.json`` under ``save_f_null``.  The
    lambdas are always written to ``lambda.json`` under ``save_f``.  With
    ``args.adjust`` set, cutoff adjustments are computed and saved (null run)
    or loaded and applied (non-null run).

    Args:
        simulation_dict: mapping with keys ``'all_sums'``, ``'all_counts'``
            and ``'all_rewards'``.
        alphas: significance levels.
        power_dict: results container; key ``'Wdecorrelated'`` is (re)created.
        noise_std: divisor applied to the statistic.
    """
    power_dict['Wdecorrelated'] = {
        t: {alpha: {} for alpha in alphas} for t in Tvals
    }
    all_sums = simulation_dict['all_sums']
    all_counts = simulation_dict['all_counts']
    all_rewards = simulation_dict['all_rewards']

    if null:
        null_lambdas = {}
    else:
        with open(os.path.join(save_f_null, 'lambda.json'), 'r') as f:
            null_lambdas = json.load(f)

    if args.adjust:
        if null:
            adjusted_cutoffs = {}
        else:
            cutoff_path = os.path.join(
                save_f_null, 'cutoff_adjustments', 'Wdecorrelated.json')
            with open(cutoff_path, 'r') as f:
                adjusted_cutoffs = json.load(f)

    def weighted_residual_sum(arm, arm_mean, n_batches, weights):
        # Residuals of one arm over batches 1..n_batches, zero-padded to a
        # common width of args.n * n_batches, then weighted and summed per row.
        width = args.n * n_batches
        padded_rows = []
        for i in range(args.N):
            resid = np.concatenate(
                [all_rewards[arm][k][i] - arm_mean[i]
                 for k in range(1, n_batches + 1)])
            padded_rows.append(
                np.concatenate([resid, np.zeros(width - len(resid))]))
        return np.sum(np.multiply(weights, np.array(padded_rows)), axis=1)

    def truncated_weight_mass(squared_weights, counts):
        # Sum of the first counts[i] squared weights, for each row i.
        return np.array(
            [np.sum(squared_weights[:counts[i]]) for i in range(args.N)])

    print('\nW-decorrelated')
    for t in Tvals:
        sums0 = dict_sum(all_sums[0], t)
        counts0 = dict_sum(all_counts[0], t)
        sums1 = dict_sum(all_sums[1], t)
        counts1 = dict_sum(all_counts[1], t)

        horizon = args.n * t
        if null:
            lam = np.quantile(
                np.minimum(counts0, counts1) / np.log(horizon), 1 / horizon)
            null_lambdas[t] = lam
        else:
            # JSON object keys are strings, hence str(t).
            lam = null_lambdas[str(t)]

        mean0 = np.divide(sums0, counts0)
        mean1 = np.divide(sums1, counts1)

        R = 1 / (1 + lam)
        Rvec = np.array([R * (1 - R) ** j for j in range(horizon)])
        RvecSquare = np.square(
            [R * (1 - R) ** j for j in range(args.n * args.T)])

        W0_est = mean0 + weighted_residual_sum(0, mean0, t, Rvec)
        W1_est = mean1 + weighted_residual_sum(1, mean1, t, Rvec)
        arm0_var = truncated_weight_mass(RvecSquare, counts0)
        arm1_var = truncated_weight_mass(RvecSquare, counts1)

        W_stat = (W1_est - W0_est) / np.sqrt(arm0_var + arm1_var)
        W_stat = W_stat / noise_std

        if args.T <= 5 or t % 5 == 0:
            make_hist('Wdecorrelated_distribution_t={}'.format(t),
                      W_stat, power=True)

        cutoffs = [math.fabs(scipy.stats.norm.ppf(alpha / 2))
                   for alpha in alphas]
        if args.adjust:
            if null:
                adjusted_cutoffs[t] = calculate_cutoff_adjustment(
                    alphas, W_stat, orig_cutoffs=cutoffs)
            else:
                # get adjusted cutoffs
                cutoffs = [v for key, v in adjusted_cutoffs[str(t)].items()
                           if float(key) in alphas]

        calculate_power(alphas, cutoffs, W_stat,
                        power_dict['Wdecorrelated'][t])
        print_results(t, alphas, print_dict=power_dict['Wdecorrelated'])

    with open(os.path.join(save_f, 'lambda.json'), 'w') as f:
        json.dump(null_lambdas, f, indent=4)

    if args.adjust and null:
        out_path = os.path.join(
            save_f, 'cutoff_adjustments', 'Wdecorrelated.json')
        with open(out_path, 'w') as f:
            json.dump(adjusted_cutoffs, f, indent=4)
def awaipw_inference(simulation_dict, alphas, power_dict):
    """Compute the AW-AIPW test statistic and its power for each t in Tvals.

    The estimators Q1 / Q0 are weighted averages (weights ``sqrt(all_pis)``
    and ``sqrt(1 - all_pis)``) of an inverse-probability-weighted term plus a
    model-augmentation term, where the model for batch k is the running
    sample mean through batch k-1 (zeros before the first batch).  The test
    statistic is ``(Q1 - Q0) / sqrt(v1 + v0 - 2 * cov)``.

    Args:
        simulation_dict: mapping with keys ``'all_sums'``, ``'all_counts'``
            and ``'all_rewards'``.
        alphas: significance levels.
        power_dict: results container; key ``'awaipw'`` is (re)created and
            each ``power_dict['awaipw'][t]`` is passed to ``calculate_power``.

    Returns:
        None.

    Notes:
        ``all_pis``, ``args``, ``Tvals``, ``null``, ``save_f`` and
        ``save_f_null`` are read from enclosing (module) scope.
        NOTE(review): ``all_pis[k-1]`` appears to be the arm-1 sampling
        probability used for batch k -- confirm against the simulation code.
    """
    power_dict['awaipw'] = { t:{ alpha: {} for alpha in alphas } for t in Tvals }
    all_sums = simulation_dict['all_sums']
    all_counts = simulation_dict['all_counts']
    all_rewards = simulation_dict['all_rewards']
    # Index 0 holds the "model" available before any batch is seen: all zeros.
    all_mu1 = { 0: np.zeros(args.N) }
    all_mu0 = { 0: np.zeros(args.N) }
    if args.adjust:
        if null:
            # Null run: adjustments are computed below and saved at the end.
            adjusted_cutoffs = {}
        else:
            # Non-null run: reuse the adjustments saved by a null run.
            with open( os.path.join( save_f_null, 'cutoff_adjustments', 'awaipw.json' ), 'r' ) as f:
                adjusted_cutoffs = json.load( f )
    for t in range(1, args.T+1):
        # Update model mu
        sums0 = dict_sum(all_sums[0], t)
        counts0 = dict_sum(all_counts[0], t)
        sums1 = dict_sum(all_sums[1], t)
        counts1 = dict_sum(all_counts[1], t)
        # NOTE(review): unguarded division -- a zero count would give
        # inf/nan if these are numpy arrays; confirm counts are positive.
        all_mu1[t] = sums1 / counts1
        all_mu0[t] = sums0 / counts0
    print( '\nAW-AIPW' )
    for t in Tvals:
        # weights: h_t = sqrt(pi); mu_hat is sample mean
        hsum1 = 0; hsum0 = 0; Q1 = 0; Q0 = 0
        # First, we calculate the estimators Q1, Q0
        for k in range(1,t+1):
            # IPW portion
            ipw1 = all_sums[1][k] / all_pis[k-1]
            ipw0 = all_sums[0][k] / (1-all_pis[k-1])
            # Augmented model portion
            # Uses the model fitted on batches before k (all_mu*[k-1]).
            aug1 = all_mu1[k-1] * ( ( 1 - 1 / all_pis[k-1] ) * all_counts[1][k] + all_counts[0][k] )
            aug0 = all_mu0[k-1] * ( ( 1 - 1 / (1-all_pis[k-1]) ) * all_counts[0][k] + all_counts[1][k] )
            ht1 = np.sqrt( all_pis[k-1] )
            ht0 = np.sqrt( 1-all_pis[k-1] )
            Q1 = Q1 + ht1 * ( ipw1 + aug1 )
            Q0 = Q0 + ht0 * ( ipw0 + aug0 )
            # Normaliser: args.n times the weight, accumulated over batches.
            hsum1 += args.n * ht1
            hsum0 += args.n * ht0
        Q1 = Q1 / hsum1
        Q0 = Q0 / hsum0
        # Second, accumulate the variance and covariance numerators.
        v1_num = 0; v0_num = 0; cov_num = 0
        for k in range(1, t+1):
            all_err0 = []; all_err1 = []; all_cov = []
            for i in range(args.N):
                # Arm-0 error vector: entries built from arm-0 rewards come
                # first, followed by one model-only entry per arm-1 pull
                # (list repetition by the count all_counts[1][k][i]).
                ipw_err0 = ( all_rewards[0][k][i] - Q0[i] ) / (1-all_pis[k-1][i])
                aug_err0 = ( 1 - 1/(1-all_pis[k-1][i]) ) * ( all_mu0[k-1][i] - Q0[i] )
                augonly_err0 = np.array( all_counts[1][k][i] * [ all_mu0[k-1][i] - Q0[i] ] )
                err0 = np.hstack( [ ipw_err0 + aug_err0, augonly_err0 ] )
                all_err0.append( sum( np.square( err0 ) ) )
                # Arm-1 error vector uses the opposite block order (model-only
                # entries first), so err1 * err0 below multiplies entries at
                # matching positions.
                # NOTE(review): this alignment assumes len(all_rewards[a][k][i])
                # equals all_counts[a][k][i] -- confirm.
                ipw_err1 = ( all_rewards[1][k][i] - Q1[i] ) / all_pis[k-1][i]
                aug_err1 = ( 1 - 1/all_pis[k-1][i] ) * ( all_mu1[k-1][i] - Q1[i] )
                augonly_err1 = np.array( all_counts[0][k][i] * [ all_mu1[k-1][i] - Q1[i] ] )
                err1 = np.hstack( [ augonly_err1, ipw_err1 + aug_err1 ] )
                all_err1.append( sum( np.square( err1 ) ) )
                all_cov.append( sum( err1 * err0 ) )
            # Squared weights: h_t**2 equals the probabilities themselves.
            ht1_square = all_pis[k-1]
            ht0_square = 1-all_pis[k-1]
            v1_num += ht1_square * np.array(all_err1)
            v0_num += ht0_square * np.array(all_err0)
            cov_num += np.sqrt( ht1_square ) * np.sqrt( ht0_square ) * np.array(all_cov)
        v1 = v1_num / np.square( hsum1 )
        v0 = v0_num / np.square( hsum0 )
        cov = cov_num / ( hsum1 * hsum0 )
        # Calculate test statistic
        awaipw_stat = ( Q1 - Q0 ) / np.sqrt( v1 + v0 -2 * cov )
        # Two-sided Normal critical values, one per alpha.
        cutoffs = [ math.fabs( scipy.stats.norm.ppf( alpha / 2 ) ) for alpha in alphas ]
        if args.adjust:
            if null:
                adjusted_cutoffs[t] = calculate_cutoff_adjustment(alphas, awaipw_stat, \
                        orig_cutoffs=cutoffs)
            else:
                # get adjusted cutoffs
                # NOTE(review): the resulting order follows the loaded JSON
                # object, not `alphas` -- confirm the two agree.
                cutoffs = [ v for k, v in adjusted_cutoffs[str(t)].items() if float(k) in alphas ]
        calculate_power(alphas, cutoffs, awaipw_stat, power_dict['awaipw'][t])
        print_results(t, alphas, print_dict=power_dict['awaipw'])
    if args.adjust:
        if null:
            # Persist the adjustments computed during this null run.
            with open( os.path.join( save_f, 'cutoff_adjustments', 'awaipw.json' ), 'w' ) as f:
                json.dump( adjusted_cutoffs, f, indent=4 )
def bols_inference(simulation_dict, alphas, power_dict, nste=False):
    """Compute batched-OLS (BOLS) statistics and their power for each t.

    A per-batch Z-type statistic ``(est1 - est0) / sd`` is formed for every
    batch ``1..args.T``.  For each ``t`` in the module-level ``Tvals`` the
    first ``t`` batch statistics are scaled by ``noise_std`` and combined:

    * ``nste=False`` (stationary treatment effect): sum divided by
      ``sqrt(t)``, compared with Normal cutoffs;
    * ``nste=True`` (non-stationary treatment effect): sum of squares,
      compared with chi-square(``t``) cutoffs.

    When ``args.estvar`` is set, the noise scale is estimated per batch from
    residuals and the cutoffs are simulated from Student-t draws with
    ``args.n - 2`` degrees of freedom instead.

    Args:
        simulation_dict: mapping with keys ``'all_sums'`` and
            ``'all_counts'`` (plus ``'all_rewards_array'`` and ``'mask0'``
            when ``args.estvar`` is set).
        alphas: significance levels.
        power_dict: results container; key ``'bols'`` or ``'bols_nste'`` is
            (re)created.
        nste: select the non-stationary variant.

    Returns:
        Dict with lists ``'est0'``, ``'est1'`` and ``'stats'`` holding one
        entry per batch.
    """
    # Find cutoff values using Student-t distribution
    if args.estvar:
        est_cutoffs = {}
        t_sample = np.array(
            [np.random.standard_t(df=args.n - 2, size=100 * args.N)
             for _ in range(args.T)])
        for t in Tvals:
            # The pooled sample depends only on t, so it is built once here
            # rather than once per alpha.
            if nste:
                # Non-stationary treatment effect
                pooled = np.sum(np.square(t_sample[:t]), axis=0)
                label = 'BOLS NSTE cutoff'
            else:
                # Stationary treatment effect
                pooled = np.sum(t_sample[:t], axis=0) / math.sqrt(t)
                label = 'BOLS cutoff'

            est_cutoffs[t] = []
            for alpha in alphas:
                # Simulate cutoffs using Student-t distribution
                prob = 1 - alpha if nste else 1 - alpha / 2
                cutoff = stats.mstats.mquantiles(pooled, prob=prob)[0]
                est_cutoffs[t].append(cutoff)
                if args.verbose:
                    # Diagnostic only; kept as in the original formula.
                    # NOTE(review): divides by args.N although the sample
                    # has 100 * args.N draws -- confirm intended scaling.
                    se = np.std(pooled > cutoff) / args.N
                    print(label, 't={}'.format(t),
                          'alpha={}'.format(alpha), cutoff, se)

    result_key = 'bols_nste' if nste else 'bols'
    power_dict[result_key] = {
        t: {alpha: {} for alpha in alphas} for t in Tvals
    }

    bols_dict = {'est0': [], 'est1': [], 'stats': []}

    all_sums = simulation_dict['all_sums']
    all_counts = simulation_dict['all_counts']
    if args.estvar:
        all_rewards_array = simulation_dict['all_rewards_array']
        mask0 = simulation_dict['mask0']

    print('\nBOLS NSTE' if nste else '\nBOLS')

    # Per-batch estimates: only the batch's own sums and counts are needed.
    for t in range(1, args.T + 1):
        batch_n0 = all_counts[0][t]
        batch_n1 = all_counts[1][t]
        bols_t_est0 = np.divide(all_sums[0][t], batch_n0)
        bols_t_est1 = np.divide(all_sums[1][t], batch_n1)
        bols_dict['est0'].append(bols_t_est0)
        bols_dict['est1'].append(bols_t_est1)

        bols_t_est = bols_t_est1 - bols_t_est0
        # Standard deviation of the difference in means for unit noise.
        bols_t_sd = 1 / np.sqrt(batch_n0 * batch_n1 / (batch_n0 + batch_n1))
        bols_dict['stats'].append(bols_t_est / bols_t_sd)

    for t in Tvals:
        # Estimate variance
        if args.estvar:
            all_batch_var = []
            for b in range(1, t + 1):
                residuals = all_rewards_array[b-1] \
                    - mask0[b-1] * np.expand_dims(bols_dict['est0'][b-1], 1) \
                    - (1 - mask0[b-1]) * np.expand_dims(bols_dict['est1'][b-1], 1)
                # ddof=2: two means were fitted within the batch.
                all_batch_var.append(np.var(residuals, axis=1, ddof=2))
            # One row of standard deviations per batch.
            noise_std = np.sqrt(np.concatenate(
                [np.expand_dims(x, 0) for x in all_batch_var], axis=0))
        else:
            noise_std = np.ones(args.N) * math.sqrt(args.var)

        # Simulate cutoffs
        if args.estvar:
            cutoffs = est_cutoffs[t]
        elif not nste:
            # Stationary treatment effect, variance known:
            # cutoffs from Normal distribution
            cutoffs = [math.fabs(scipy.stats.norm.ppf(alpha / 2))
                       for alpha in alphas]
        else:
            # Non-stationary treatment effect, variance known
            cutoffs = [stats.chi2.ppf(1 - alpha, df=t) for alpha in alphas]

        scaled = np.array(bols_dict['stats'][:t]) / noise_std
        if not nste:
            # Stationary treatment effect
            bols_stat = np.sum(scaled, axis=0) / math.sqrt(t)
            calculate_power(alphas, cutoffs, bols_stat, power_dict['bols'][t])
            if args.T <= 5 or t % 5 == 0:
                make_hist('bols_distribution_t={}'.format(t),
                          bols_stat, power=True)
            if t % 5 == 0:
                print_results(t, alphas, print_dict=power_dict['bols'])
        else:
            # Non-stationary treatment effect
            bols_nste_stat = np.sum(np.square(scaled), axis=0)
            calculate_power(alphas, cutoffs, bols_nste_stat,
                            power_dict['bols_nste'][t])
            if t % 5 == 0:
                print_results(t, alphas, print_dict=power_dict['bols_nste'])

    return bols_dict
def ols_inference(simulation_dict, alphas, power_dict):
    """Compute the pooled OLS difference-in-means statistic and its power.

    For each ``t`` in the module-level ``Tvals`` the arm means over batches
    ``1..t`` are differenced, scaled by ``sqrt(n0 * n1 / (n0 + n1))`` and by
    the noise standard deviation (estimated from residuals when
    ``args.estvar`` is set, otherwise ``sqrt(args.var)``), and passed to
    ``calculate_power`` with two-sided Normal cutoffs.  With ``args.adjust``
    set, cutoff adjustments are computed and saved (when the module flag
    ``null`` is truthy) or loaded and applied (otherwise).

    Args:
        simulation_dict: mapping with keys ``'all_sums'`` and
            ``'all_counts'`` (plus ``'all_rewards_array'`` and ``'mask0'``
            when ``args.estvar`` is set).
        alphas: significance levels.
        power_dict: results container; key ``'ols'`` is (re)created.

    Returns:
        The noise standard deviation used for the last ``t`` in ``Tvals``.
    """
    power_dict['ols'] = {t: {alpha: {} for alpha in alphas} for t in Tvals}

    all_sums = simulation_dict['all_sums']
    all_counts = simulation_dict['all_counts']
    if args.estvar:
        all_rewards_array = simulation_dict['all_rewards_array']
        mask0 = simulation_dict['mask0']

    if args.adjust:
        if null:
            adjusted_cutoffs = {}
        else:
            load_path = os.path.join(
                save_f_null, 'cutoff_adjustments', 'ols.json')
            with open(load_path, 'r') as f:
                adjusted_cutoffs = json.load(f)

    print('\nOLS')
    for t in Tvals:
        total0 = dict_sum(all_sums[0], t)
        n0 = dict_sum(all_counts[0], t)
        total1 = dict_sum(all_sums[1], t)
        n1 = dict_sum(all_counts[1], t)

        # Arm means; entries with no observations are left at 0.
        mean1 = np.divide(total1, n1, out=np.zeros_like(total1), where=n1 != 0)
        mean0 = np.divide(total0, n0, out=np.zeros_like(total0), where=n0 != 0)
        margin = mean1 - mean0

        if args.estvar:
            # Residuals of every batch up to t around the pooled arm means,
            # joined along axis 1.
            pooled_residuals = np.concatenate(
                [all_rewards_array[k-1]
                 - mask0[k-1] * np.expand_dims(mean0, 1)
                 - (1 - mask0[k-1]) * np.expand_dims(mean1, 1)
                 for k in range(1, t + 1)],
                1)
            # ddof=2: two means were fitted.
            noise_std = np.std(pooled_residuals, ddof=2, axis=1)
        else:
            noise_std = np.ones(args.N) * math.sqrt(args.var)

        scale = np.sqrt(n0 * n1 / (n0 + n1))
        ols_margin_stat = scale * (margin / noise_std)

        cutoffs = [math.fabs(scipy.stats.norm.ppf(alpha / 2))
                   for alpha in alphas]
        if args.adjust:
            if null:
                adjusted_cutoffs[t] = calculate_cutoff_adjustment(
                    alphas, ols_margin_stat, orig_cutoffs=cutoffs)
            else:
                # get adjusted cutoffs
                cutoffs = [v for key, v in adjusted_cutoffs[str(t)].items()
                           if float(key) in alphas]

        calculate_power(alphas, cutoffs, ols_margin_stat, power_dict['ols'][t])

        if args.T <= 5 or t % 5 == 0:
            make_hist('ols_distribution_t={}'.format(t),
                      ols_margin_stat, power=True)
        if t % 5 == 0:
            print_results(t, alphas, print_dict=power_dict['ols'])

    if args.adjust and null:
        save_path = os.path.join(save_f, 'cutoff_adjustments', 'ols.json')
        with open(save_path, 'w') as f:
            json.dump(adjusted_cutoffs, f, indent=4)

    return noise_std