def get_pis_epsilon(all_counts, all_sums, clipping=0):
    # Epsilon-greedy sampling probabilities for arm 1, clipped to [clipping, 1 - clipping]
    # Empirical mean reward of each arm, pooled over batches (0/0 -> 0 via nan_to_num)
    all_means = []
    for k in range(args.K):
        all_means.append(
            np.divide(dict_sum(all_sums[k]), dict_sum(all_counts[k])))
    all_means = np.transpose(np.nan_to_num(all_means))

    # Every arm starts at the clipping floor; the empirically best arm gets 1 - clipping
    pis = np.full((args.N, args.K), float(clipping))
    max_vals = np.broadcast_to(np.expand_dims(np.max(all_means, axis=1), 1),
                               (args.N, args.K))
    pis += (1 - 2 * clipping) * np.equal(max_vals, all_means)
    return pis[:, 1]
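
# A minimal, self-contained sketch of the epsilon-greedy logic above, with
# hand-built inputs standing in for args and dict_sum (both assumed to come
# from the surrounding codebase): the empirically best arm gets probability
# 1 - clipping, every other arm gets the clipping floor.
def _demo_get_pis_epsilon():
    import numpy as np
    clipping = 0.1
    # all_means: N = 3 simulated runs (rows) x K = 2 arms (columns)
    all_means = np.array([[0.2, 0.5],
                          [0.7, 0.1],
                          [0.4, 0.9]])
    N, K = all_means.shape
    pis = np.full((N, K), clipping)
    max_vals = np.max(all_means, axis=1, keepdims=True)
    pis += (1 - 2 * clipping) * (all_means == max_vals)
    print(pis[:, 1])  # P(arm 1) per run: [0.9, 0.1, 0.9]
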
def get_pis_TS(all_counts, all_sums, var, clipping=0):
    # Calculate posterior to get probability of sampling arm
    pm = []
    pv = []
    for k in range(args.K):
        counts_k = dict_sum(all_counts[k])
        sums_k = dict_sum(all_sums[k])
        # Empirical mean of arm k, pooled over batches (0/0 -> 0)
        mean_k = np.divide(sums_k,
                           counts_k,
                           out=np.zeros_like(sums_k),
                           where=counts_k != 0)

        # Posterior mean
        pm_temp = np.divide(
            prior_means[k] * var + prior_vars[k] * mean_k * counts_k,
            var + prior_vars[k] * counts_k)
        # Posterior variance
        pv_temp = np.divide(prior_vars[k] * var,
                            var + prior_vars[k] * counts_k)
        pm.append(pm_temp)
        pv.append(pv_temp)

    pv = np.array(pv)  # Posterior variance
    pm = np.array(pm)  # Posterior mean

    post_mean = pm[1] - pm[0]
    post_var = pv[1] + pv[0]
    # Sampling probability: P(arm 1's posterior draw exceeds arm 0's)
    ratio = np.divide(post_mean, np.sqrt(post_var))
    pis = stats.norm.cdf(ratio)

    if clipping > 0:
        pis = np.minimum(np.maximum(pis, clipping), 1 - clipping)

    return pis
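
# The Thompson-sampling probability above follows from the Normal-Normal
# conjugate update with known reward variance. A minimal sketch for a single
# run; prior_means/prior_vars from the surrounding code are replaced by the
# local mu0/tau2 below, which are illustrative stand-ins:
def _demo_get_pis_TS():
    import numpy as np
    from scipy import stats
    var = 1.0                        # known reward noise variance
    mu0, tau2 = 0.0, 1.0             # prior mean and variance (both arms)
    counts = np.array([10.0, 12.0])  # pulls of arm 0 and arm 1 so far
    means = np.array([0.3, 0.6])     # empirical mean rewards
    # Posterior mean and variance of each arm's mean reward
    pm = (mu0 * var + tau2 * means * counts) / (var + tau2 * counts)
    pv = (tau2 * var) / (var + tau2 * counts)
    # P(arm 1 sampled) = P(arm 1's posterior draw beats arm 0's)
    print(stats.norm.cdf((pm[1] - pm[0]) / np.sqrt(pv[1] + pv[0])))
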
Example #3
    def compute_statistics(self, chromosome: str, start_position: int):
        """
        Calculate statistics for fragments overlapping at given position.
        """
        counts = self.collect_counts(chromosome, start_position)

        all_fourmers = dict_sum(counts.watson_fourmer,
                                counts.crick_fourmer,
                                inplace=False)

        wild_type, variants = self._determine_wild_type_variant_bases(
            counts.fragment_length)

        return {
            "fragment_length": count_fragments(counts.fragment_length),
            "watson_fourmer": count_fragments(counts.watson_fourmer),
            "crick_fourmer": count_fragments(counts.crick_fourmer),
            "fourmer": count_fragments(all_fourmers),
            "chromosome": chromosome,
            "position": start_position,
            "wild_type_base": wild_type,
            "variant_bases": variants,
        }
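
# A hypothetical call site for compute_statistics; the enclosing object
# ("caller" here), its construction, and collect_counts are assumptions not
# shown in this snippet:
def _demo_compute_statistics(caller):
    result = caller.compute_statistics(chromosome="chr1", start_position=12345)
    print(result["wild_type_base"], result["variant_bases"])
    print(result["fourmer"])  # combined Watson + Crick fourmer counts
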
def Wdecorrelated_inference(simulation_dict, alphas, power_dict, noise_std):
    power_dict['Wdecorrelated'] = { t:{ alpha: {} for alpha in alphas } for t in Tvals }
    
    all_sums = simulation_dict['all_sums']
    all_counts = simulation_dict['all_counts']
    all_rewards = simulation_dict['all_rewards']
   
    # Under the null we compute and save the per-t lambda values; otherwise
    # we reuse the values saved from the null run.
    if null:
        null_lambdas = {}
    else:
        with open( os.path.join( save_f_null, 'lambda.json' ), 'r') as f:
            null_lambdas = json.load(f)
    
    if args.adjust:
        if null:
            adjusted_cutoffs = {}
        else:
            with open( os.path.join( save_f_null, 'cutoff_adjustments', 'Wdecorrelated.json' ), 'r' ) as f:
                adjusted_cutoffs = json.load( f )
    
    print( '\nW-decorrelated' )
    for t in Tvals:
        sums0 = dict_sum(all_sums[0], t)
        counts0 = dict_sum(all_counts[0], t)
        sums1 = dict_sum(all_sums[1], t)
        counts1 = dict_sum(all_counts[1], t)
        
        if null:
            lam = np.quantile( np.minimum( counts0, counts1 ) / np.log( args.n*t ), 1/(args.n*t) )
            null_lambdas[t] = lam
        else:
            lam = null_lambdas[str(t)]

        mean0 = np.divide( sums0, counts0 )
        mean1 = np.divide( sums1, counts1 )

        # Geometrically decaying decorrelation weights w_j = R(1-R)^j
        R = 1/(1+lam)
        Rvec = np.array([ R*(1-R)**j for j in range(args.n * t) ])

        residual0 = [ np.concatenate( [ all_rewards[0][k][i] - mean0[i]
                                        for k in range(1, t+1) ] )
                      for i in range(args.N) ]
        residual1 = [ np.concatenate( [ all_rewards[1][k][i] - mean1[i]
                                        for k in range(1, t+1) ] )
                      for i in range(args.N) ]
        # Zero-pad each run's residual vector to length n*t
        all_residuals0_padded = np.array(
            [ np.concatenate( [ residual0[i], np.zeros( args.n*t - len(residual0[i]) ) ] )
              for i in range(args.N) ] )
        all_residuals1_padded = np.array(
            [ np.concatenate( [ residual1[i], np.zeros( args.n*t - len(residual1[i]) ) ] )
              for i in range(args.N) ] )

        arm0_correction = np.sum( np.multiply( Rvec, all_residuals0_padded ), axis=1 )
        arm1_correction = np.sum( np.multiply( Rvec, all_residuals1_padded ), axis=1 )

        RvecSquare = np.square( [ R*(1-R)**j for j in range(args.n * args.T) ] )
        arm0_var = np.array( [ np.sum( RvecSquare[:counts0[i]] ) for i in range(args.N) ] )
        arm1_var = np.array( [ np.sum( RvecSquare[:counts1[i]] ) for i in range(args.N) ] )

        W0_est = mean0 + arm0_correction
        W1_est = mean1 + arm1_correction

        W_stat = ( W1_est - W0_est ) / np.sqrt( arm0_var + arm1_var )
        W_stat = W_stat / noise_std

        if args.T <= 5 or t % 5 == 0:
            make_hist( 'Wdecorrelated_distribution_t={}'.format(t), W_stat, power=True )

        cutoffs = [ math.fabs( scipy.stats.norm.ppf( alpha / 2 ) ) for alpha in alphas ]
        
        if args.adjust:
            if null:
                adjusted_cutoffs[t] = calculate_cutoff_adjustment(alphas, W_stat, \
                        orig_cutoffs=cutoffs)
            else:
                # get adjusted cutoffs
                cutoffs = [ v for k, v in adjusted_cutoffs[str(t)].items() if float(k) in alphas ]
        
        calculate_power(alphas, cutoffs, W_stat, power_dict['Wdecorrelated'][t])
        print_results(t, alphas, print_dict=power_dict['Wdecorrelated'])
        
    with open( os.path.join( save_f, 'lambda.json' ), 'w' ) as f:
        json.dump(null_lambdas, f, indent=4)
    
    if args.adjust:
        if null:
            with open( os.path.join( save_f, 'cutoff_adjustments', 'Wdecorrelated.json' ), 'w' ) as f:
                json.dump( adjusted_cutoffs, f, indent=4 )
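
# A self-contained sketch of the W-decorrelated correction computed above,
# for one arm of a single run; the rewards are made-up stand-ins for
# all_rewards, and lam plays the same role as the quantile-based lambda:
def _demo_wdecorrelated_arm():
    import numpy as np
    rng = np.random.default_rng(0)
    rewards = rng.normal(loc=0.5, scale=1.0, size=40)  # one arm's pooled rewards
    lam = 5.0
    R = 1 / (1 + lam)
    mean_hat = rewards.mean()
    # Geometrically decaying weights applied to the residuals
    Rvec = R * (1 - R) ** np.arange(len(rewards))
    W_est = mean_hat + np.sum(Rvec * (rewards - mean_hat))
    W_var = np.sum(np.square(Rvec))  # matches arm*_var above (up to noise scale)
    print(W_est, np.sqrt(W_var))
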
def awaipw_inference(simulation_dict, alphas, power_dict):
    power_dict['awaipw'] = { t:{ alpha: {} for alpha in alphas } for t in Tvals }
    
    all_sums = simulation_dict['all_sums']
    all_counts = simulation_dict['all_counts']
    all_rewards = simulation_dict['all_rewards']

    all_mu1 = { 0: np.zeros(args.N) }
    all_mu0 = { 0: np.zeros(args.N) }
    
    if args.adjust:
        if null:
            adjusted_cutoffs = {}
        else:
            with open( os.path.join( save_f_null, 'cutoff_adjustments', 'awaipw.json' ), 'r' ) as f:
                adjusted_cutoffs = json.load( f )
    
    for t in range(1, args.T+1):
        # Update model mu
        sums0 = dict_sum(all_sums[0], t)
        counts0 = dict_sum(all_counts[0], t)
        sums1 = dict_sum(all_sums[1], t)
        counts1 = dict_sum(all_counts[1], t)

        # Sample-mean model estimates, guarding 0/0 as elsewhere
        all_mu1[t] = np.divide( sums1, counts1, out=np.zeros_like(sums1), where=counts1 != 0 )
        all_mu0[t] = np.divide( sums0, counts0, out=np.zeros_like(sums0), where=counts0 != 0 )

    print( '\nAW-AIPW' )
    for t in Tvals:
        # weights: h_t = sqrt(pi); mu_hat is sample mean
        hsum1 = hsum0 = 0
        Q1 = Q0 = 0

        # First, we calculate the estimators Q1, Q0
        for k in range(1,t+1):
            # IPW portion
            ipw1 = all_sums[1][k] / all_pis[k-1]
            ipw0 = all_sums[0][k] / (1-all_pis[k-1])

            # Augmented model portion
            aug1 = all_mu1[k-1] * ( ( 1 - 1 / all_pis[k-1] ) * all_counts[1][k] + all_counts[0][k] )
            aug0 = all_mu0[k-1] * ( ( 1 - 1 / (1-all_pis[k-1]) ) * all_counts[0][k] + all_counts[1][k] )

            ht1 = np.sqrt( all_pis[k-1] )
            ht0 = np.sqrt( 1-all_pis[k-1] )

            Q1 = Q1 + ht1 * ( ipw1 + aug1 )
            Q0 = Q0 + ht0 * ( ipw0 + aug0 )

            hsum1 += args.n * ht1
            hsum0 += args.n * ht0

        Q1 = Q1 / hsum1
        Q0 = Q0 / hsum0
        
        v1_num = v0_num = cov_num = 0
        for k in range(1, t+1):
            all_err0 = []
            all_err1 = []
            all_cov = []
            for i in range(args.N):
                ipw_err0 = ( all_rewards[0][k][i] - Q0[i] ) / (1-all_pis[k-1][i])
                aug_err0 = ( 1 - 1/(1-all_pis[k-1][i]) ) * ( all_mu0[k-1][i] - Q0[i] )
                augonly_err0 = np.array( all_counts[1][k][i] * [ all_mu0[k-1][i] - Q0[i] ] )
                err0 = np.hstack( [ ipw_err0 + aug_err0, augonly_err0 ] )
                all_err0.append( sum( np.square( err0 ) ) )

                ipw_err1 = ( all_rewards[1][k][i] - Q1[i] ) / all_pis[k-1][i]
                aug_err1 = ( 1 - 1/all_pis[k-1][i] ) * ( all_mu1[k-1][i] - Q1[i] )
                augonly_err1 = np.array( all_counts[0][k][i] * [ all_mu1[k-1][i] - Q1[i] ] )
                err1 = np.hstack( [ augonly_err1, ipw_err1 + aug_err1 ] )
                all_err1.append( sum( np.square( err1 ) ) )

                all_cov.append( sum( err1 * err0 ) )

            ht1_square = all_pis[k-1]
            ht0_square = 1-all_pis[k-1]

            v1_num += ht1_square * np.array(all_err1)
            v0_num += ht0_square * np.array(all_err0)
            cov_num += np.sqrt( ht1_square ) * np.sqrt( ht0_square ) * np.array(all_cov)

        v1 = v1_num / np.square( hsum1 )
        v0 = v0_num / np.square( hsum0 )
        cov = cov_num / ( hsum1 * hsum0 )
        
        # Calculate test statistic
        awaipw_stat = ( Q1 - Q0 ) / np.sqrt( v1 + v0 - 2 * cov )
        cutoffs = [ math.fabs( scipy.stats.norm.ppf( alpha / 2 ) ) for alpha in alphas ]
        
        if args.adjust:
            if null:
                adjusted_cutoffs[t] = calculate_cutoff_adjustment(alphas, awaipw_stat, \
                        orig_cutoffs=cutoffs)
            else:
                # get adjusted cutoffs
                cutoffs = [ v for k, v in adjusted_cutoffs[str(t)].items() if float(k) in alphas ]
        
        calculate_power(alphas, cutoffs, awaipw_stat, power_dict['awaipw'][t])
        print_results(t, alphas, print_dict=power_dict['awaipw'])
    
    if args.adjust:
        if null:
            with open( os.path.join( save_f, 'cutoff_adjustments', 'awaipw.json' ), 'w' ) as f:
                json.dump( adjusted_cutoffs, f, indent=4 )
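
# A minimal sketch of a single batch's AW-AIPW term for arm 1 in one run;
# names mirror the loop above but every input is a made-up stand-in:
def _demo_awaipw_batch_term():
    import numpy as np
    pi = 0.7           # P(arm 1) used when this batch was sampled
    n = 20             # batch size
    count1 = 14        # arm-1 pulls in the batch
    sum1 = 8.4         # summed arm-1 rewards in the batch
    mu1_prev = 0.55    # model estimate for arm 1 from earlier batches
    # IPW part over the arm-1 pulls plus the augmentation term, matching
    # aug1 above with counts0 = n - count1:
    gamma1 = sum1 / pi + mu1_prev * ((1 - 1 / pi) * count1 + (n - count1))
    h1 = np.sqrt(pi)   # adaptive weight h_t = sqrt(pi_t)
    print(h1 * gamma1, n * h1)  # contribution to Q1's numerator and to hsum1
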
def bols_inference(simulation_dict, alphas, power_dict, nste=False):
    # Find cutoff values using Student-t distribution
    if args.estvar:
        est_cutoffs = {}
        t_sample = np.array( [ np.random.standard_t(df=args.n-2, size=100*args.N) for x in range(args.T) ] )
        for t in Tvals:
            est_cutoffs[t] = []
            if not nste:
                # Stationary treatment effect
                for alpha in alphas:
                    # Simulate cutoffs using Student-t distribution
                    ave_t_sample = np.sum( t_sample[:t], axis=0 ) / math.sqrt(t)
                    cutoff = stats.mstats.mquantiles( ave_t_sample, prob=1-alpha/2 )[0]
                    est_cutoffs[t].append(cutoff)
                    se = np.std( ave_t_sample > cutoff ) / args.N
                    if args.verbose:
                        print('BOLS cutoff', 't={}'.format(t), 'alpha={}'.format(alpha), cutoff, se)
            else:
                # Non-stationary treatment effect
                for alpha in alphas:
                    # Simulate cutoffs using Student-t distribution
                    squared_t_sample = np.sum( np.square(t_sample[:t]), axis=0 )
                    cutoff = stats.mstats.mquantiles( squared_t_sample, prob=1-alpha )[0]
                    est_cutoffs[t].append(cutoff)
                    se = np.std( squared_t_sample > cutoff ) / args.N
                    if args.verbose:
                        print('BOLS NSTE cutoff', 't={}'.format(t), 'alpha={}'.format(alpha), cutoff, se)
    
    if nste:
        power_dict['bols_nste'] = { t:{ alpha: {} for alpha in alphas } for t in Tvals }
    else:
        power_dict['bols'] = { t:{ alpha: {} for alpha in alphas } for t in Tvals }
    bols_dict = { 'est0': [], 'est1': [], 'stats': [] }
    
    all_sums = simulation_dict['all_sums']
    all_counts = simulation_dict['all_counts']
    if args.estvar:
        all_rewards_array = simulation_dict['all_rewards_array']
        mask0 = simulation_dict['mask0']
   
    if nste:
        print( '\nBOLS NSTE' )
    else:
        print( '\nBOLS' )
    for t in range(1, args.T+1):
        # Per-batch OLS estimates for each arm
        bols_t_est0 = np.divide( all_sums[0][t], all_counts[0][t] )
        bols_t_est1 = np.divide( all_sums[1][t], all_counts[1][t] )
        bols_dict['est0'].append( bols_t_est0 )
        bols_dict['est1'].append( bols_t_est1 )

        bols_t_est = bols_t_est1 - bols_t_est0
        bols_t_sd = 1/np.sqrt( all_counts[0][t] * all_counts[1][t] / ( all_counts[0][t] + all_counts[1][t] ) )
        bols_t_stat = bols_t_est / bols_t_sd
        bols_dict['stats'].append( bols_t_stat )

    for t in Tvals:
        # Estimate variance
        if args.estvar:
            all_batch_var = []
            for b in range(1,t+1):
                residuals = all_rewards_array[b-1] \
                        - mask0[b-1] * np.expand_dims( bols_dict['est0'][b-1], 1 ) \
                        - (1-mask0[b-1]) * np.expand_dims( bols_dict['est1'][b-1], 1 )
                batch_var = np.var(residuals, axis=1, ddof=2)
                all_batch_var.append( batch_var )
            noise_std = np.sqrt( np.concatenate( [np.expand_dims(x,0) for x in all_batch_var], axis=0) )
        else:
            noise_std = np.ones(args.N)*math.sqrt(args.var)
        
        # Simulate cutoffs
        if args.estvar:
            cutoffs = est_cutoffs[t]
        else:
            # variance known
            if not nste:
                # Stationary treatment effect
                # Cutoffs from Normal distribution
                cutoffs = [ math.fabs( scipy.stats.norm.ppf( alpha / 2 ) ) for alpha in alphas ]
            else:
                # Non-stationary treatment effect
                cutoffs = [ stats.chi2.ppf(1-alpha, df=t) for alpha in alphas ]
   
        if not nste:
            # Stationary treatment effect
            bols_stat = np.sum( np.array( bols_dict['stats'][:t] ) / noise_std, axis=0) / math.sqrt(t)
            calculate_power(alphas, cutoffs, bols_stat, power_dict['bols'][t])
        
            if args.T <= 5 or t % 5 == 0:
                make_hist( 'bols_distribution_t={}'.format(t), bols_stat, power=True )

            if t % 5 == 0:
                print_results(t, alphas, print_dict=power_dict['bols'])
        else:
            # Non-stationary treatment effect
            bols_nste_stat = np.sum( np.square( np.array( bols_dict['stats'][:t] ) / noise_std ), axis=0)
            calculate_power(alphas, cutoffs, bols_nste_stat, power_dict['bols_nste'][t])
            if t % 5 == 0:
                print_results(t, alphas, print_dict=power_dict['bols_nste'])

    return bols_dict
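
# A self-contained sketch of the stationary BOLS statistic for one run with
# known noise_std = 1; batch data are simulated stand-ins:
def _demo_bols_stat():
    import numpy as np
    rng = np.random.default_rng(1)
    T, n = 5, 30
    z_stats = []
    for _ in range(T):
        arm = rng.integers(0, 2, size=n)
        y = rng.normal(loc=0.4 * arm, scale=1.0)
        n1 = arm.sum()
        n0 = n - n1
        est = y[arm == 1].mean() - y[arm == 0].mean()
        sd = 1 / np.sqrt(n0 * n1 / (n0 + n1))  # batch standard error (sigma = 1)
        z_stats.append(est / sd)
    print(np.sum(z_stats) / np.sqrt(T))  # compare to +/- norm.ppf(alpha / 2)
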
def ols_inference(simulation_dict, alphas, power_dict):
    power_dict['ols'] = { t:{ alpha: {} for alpha in alphas } for t in Tvals }
    
    all_sums = simulation_dict['all_sums']
    all_counts = simulation_dict['all_counts']
    if args.estvar:
        all_rewards_array = simulation_dict['all_rewards_array']
        mask0 = simulation_dict['mask0']

    if args.adjust:
        if null:
            adjusted_cutoffs = {}
        else:
            with open( os.path.join( save_f_null, 'cutoff_adjustments', 'ols.json' ), 'r' ) as f:
                adjusted_cutoffs = json.load( f )

    print( '\nOLS' )
    for t in Tvals:
        sums0 = dict_sum(all_sums[0], t)
        counts0 = dict_sum(all_counts[0], t)
        sums1 = dict_sum(all_sums[1], t)
        counts1 = dict_sum(all_counts[1], t)

        ols1_est = np.divide( sums1, counts1, out=np.zeros_like(sums1), where=counts1!=0 )
        ols0_est = np.divide( sums0, counts0, out=np.zeros_like(sums0), where=counts0!=0 )
        ols_margin_est = ols1_est - ols0_est

        if args.estvar:
            all_residuals = []
            for k in range(1,t+1):
                residuals_k = all_rewards_array[k-1] - mask0[k-1] * np.expand_dims(ols0_est,1) \
                        - (1-mask0[k-1]) * np.expand_dims(ols1_est,1)
                all_residuals.append(residuals_k)
            all_residuals = np.concatenate( all_residuals, 1 )
            noise_std = np.std(all_residuals, ddof=2, axis=1)
        else:
            noise_std = np.ones(args.N)*math.sqrt(args.var)

        ols_margin_stat = np.sqrt( counts0 * counts1 / (counts0 + counts1) ) * ( ols_margin_est / noise_std )
        cutoffs = [ math.fabs( scipy.stats.norm.ppf( alpha / 2 ) ) for alpha in alphas ]
        
        if args.adjust:
            if null:
                adjusted_cutoffs[t] = calculate_cutoff_adjustment(alphas, ols_margin_stat, \
                        orig_cutoffs=cutoffs)
            else:
                # get adjusted cutoffs
                cutoffs = [ v for k, v in adjusted_cutoffs[str(t)].items() if float(k) in alphas ]
        
        calculate_power(alphas, cutoffs, ols_margin_stat, power_dict['ols'][t])
        
        if args.T <= 5 or t % 5 == 0:
            make_hist( 'ols_distribution_t={}'.format(t), ols_margin_stat, power=True )

        if t % 5 == 0:
            print_results(t, alphas, print_dict=power_dict['ols'])
        
    if args.adjust:
        if null:
            with open( os.path.join( save_f, 'cutoff_adjustments', 'ols.json' ), 'w' ) as f:
                json.dump( adjusted_cutoffs, f, indent=4 )

    return noise_std
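
# A minimal sketch of the pooled OLS margin statistic above for one run, with
# made-up pooled counts/sums and known noise_std:
def _demo_ols_stat():
    import numpy as np
    from scipy import stats
    counts0, counts1 = 60.0, 90.0   # pooled pulls of each arm through batch t
    sums0, sums1 = 24.0, 49.5       # pooled reward sums through batch t
    noise_std = 1.0
    margin = sums1 / counts1 - sums0 / counts0
    ols_stat = np.sqrt(counts0 * counts1 / (counts0 + counts1)) * margin / noise_std
    cutoff = abs(stats.norm.ppf(0.05 / 2))  # two-sided test at alpha = 0.05
    print(ols_stat, abs(ols_stat) > cutoff)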