def test_one_sided_y_greater_x():
    """Check the one-sided statistic mean(y) - mean(x), given once as a
    callable and once as the string alias "x_mean < y_mean"."""
    expected = 1 - 0.03

    def mean_gap(x, y):
        return np.mean(y) - np.mean(x)

    for statistic in (mean_gap, "x_mean < y_mean"):
        p = permutation_test(treatment, control, func=statistic)
        assert round(p, 3) == expected, p
def test_one_sided_x_greater_y():
    """Check the one-sided statistic mean(x) - mean(y), given once as a
    callable and once as the string alias "x_mean > y_mean"."""
    expected = 0.0274

    def mean_gap(x, y):
        return np.mean(x) - np.mean(y)

    for statistic in (mean_gap, "x_mean > y_mean"):
        p = permutation_test(treatment, control, func=statistic)
        assert round(p, 4) == expected, p
def test_one_sided_x_greater_y():
    """Both spellings of the mean(x) - mean(y) statistic (callable and the
    "x_mean > y_mean" alias) must round to the same p-value."""

    def x_minus_y(x, y):
        return np.mean(x) - np.mean(y)

    p_callable = permutation_test(treatment, control, func=x_minus_y)
    assert round(p_callable, 4) == 0.0274, p_callable

    p_alias = permutation_test(treatment, control, func="x_mean > y_mean")
    assert round(p_alias, 4) == 0.0274, p_alias
def test_two_sided():
    """Check the two-sided statistic |mean(x) - mean(y)|, given once as a
    callable and once as the string alias "x_mean != y_mean"."""
    expected = 0.055

    def abs_mean_gap(x, y):
        return np.abs(np.mean(x) - np.mean(y))

    for statistic in (abs_mean_gap, "x_mean != y_mean"):
        p = permutation_test(treatment, control, func=statistic)
        assert round(p, 3) == expected, p
def test_one_sided_y_greater_x():
    """Both spellings of the mean(y) - mean(x) statistic (callable and the
    "x_mean < y_mean" alias) must round to the same p-value."""

    def y_minus_x(x, y):
        return np.mean(y) - np.mean(x)

    p_callable = permutation_test(treatment, control, func=y_minus_x)
    assert round(p_callable, 3) == 1 - 0.03, p_callable

    p_alias = permutation_test(treatment, control, func="x_mean < y_mean")
    assert round(p_alias, 3) == 1 - 0.03, p_alias
def test_two_sided():
    """Both spellings of the |mean(x) - mean(y)| statistic (callable and the
    "x_mean != y_mean" alias) must round to the same p-value."""

    def abs_difference(x, y):
        return np.abs(np.mean(x) - np.mean(y))

    p_callable = permutation_test(treatment, control, func=abs_difference)
    assert round(p_callable, 3) == 0.055, p_callable

    p_alias = permutation_test(treatment, control, func="x_mean != y_mean")
    assert round(p_alias, 3) == 0.055, p_alias
def analyze_by_tissue(tissue_list, positive_genes, negative_genes):
    """Compare a positive and a negative gene set, one tissue at a time.

    For each tissue name the matching column is read from the CSV named by
    the module-level ``filename`` (indexed by its ``Genes`` column), rows
    with missing values are dropped, and three numbers are recorded:

    * fold change: mean(positive) / mean(negative)
    * SEM of the positive values after dividing by mean(negative)
    * p-value of an approximate permutation test (10000 rounds, seed 0)

    :param tissue_list: column names to analyse
    :param positive_genes: index labels forming the positive group
    :param negative_genes: index labels forming the negative group
    :return: DataFrame with columns Tissues (capitalized names),
        Fold_Change, Significance and SEM
    """
    fold_changes = []
    p_values = []
    sems = []

    for tissue in tissue_list:
        print(tissue)
        expression = pd.read_csv(filename, usecols=['Genes', tissue])
        expression = expression.set_index('Genes')
        expression['Mean'] = expression.mean(axis=1)

        positive_rows = expression.loc[positive_genes].dropna()
        positive_values = np.array(positive_rows['Mean'].tolist())

        negative_rows = expression.loc[negative_genes].dropna()
        negative_values = np.array(negative_rows['Mean'].tolist())

        negative_mean = np.mean(negative_values)
        fold_changes.append(np.mean(positive_values) / negative_mean)
        sems.append(stats.sem(positive_values / negative_mean))
        p_values.append(
            permutation_test(positive_values,
                             negative_values,
                             method='approximate',
                             num_rounds=10000,
                             seed=0))

    return pd.DataFrame({
        'Tissues': [name.capitalize() for name in tissue_list],
        'Fold_Change': fold_changes,
        'Significance': p_values,
        'SEM': sems
    })
def gridcell_history(array_stack_1, array_stack_2):
    """Run a permutation test for every grid cell of two array stacks.

    For each cell (i, j) the values ``arr[i, j]`` are collected across all
    arrays of ``array_stack_1`` and, separately, across all arrays of
    ``array_stack_2``. The two resulting vectors are compared with an
    approximate permutation test (1000 rounds, seed 0).

    The grid shape is taken from ``array_stack_1[0]``; both stacks must
    share that n*m shape, but they may contain a different number of
    arrays.

    Progress, the p-value and the per-cell run time are printed for every
    cell.

    :param array_stack_1: sequence of 2-D arrays
    :param array_stack_2: sequence of 2-D arrays
    :return: n*m array of p-values
    """
    x, y = np.shape(array_stack_1[0])
    pvals = np.zeros([x, y])
    count = 0
    total = x * y
    for i in range(x):
        for j in range(y):
            t = time.time()
            # "history" of this grid cell through each stack
            sample_1 = np.asarray([arr[i, j] for arr in array_stack_1])
            sample_2 = np.asarray([arr[i, j] for arr in array_stack_2])
            # print() with one argument behaves the same on Python 2 and 3;
            # the former print statements were a SyntaxError on Python 3.
            print("doing permutation test {}/{}".format(count, total))
            pvals[i, j] = permutation_test(sample_1,
                                           sample_2,
                                           method='approximate',
                                           num_rounds=1000,
                                           seed=0)
            elapsed = time.time() - t
            print(pvals[i, j])
            print("time required to do one operation is {}".format(elapsed))
            count += 1
    return pvals
def permutation_ttest(W, B):
    """Flag whether two samples differ, using a permuted t statistic.

    Runs an approximate permutation test (100 rounds, seed 0) whose test
    statistic is the t statistic of ``stats.ttest_ind(W, B)``.

    :param W: first sample
    :param B: second sample
    :return: 1 if the permutation p-value is below 0.05, otherwise 0
    """

    def t_statistic(first, second):
        # ttest_ind returns a (statistic, pvalue) result. permutation_test
        # compares the returned value against the reference statistic, so
        # it must be a scalar -- hand back the statistic only.
        return stats.ttest_ind(first, second)[0]

    p_value = permutation_test(W,
                               B,
                               method='approximate',
                               num_rounds=100,
                               func=t_statistic,
                               seed=0)
    return 1 if p_value < 0.05 else 0
def trend_diff(Score, n_ch_comp):
    """Test consecutive entries of ``Score`` against each other.

    For i in 0 .. n_ch_comp - 1, ``Score[i + 1]`` (treatment) is compared
    with ``Score[i]`` (control) by an approximate permutation test. No seed
    is passed, so the p-values are not pinned to a fixed random state.

    :param Score: indexable collection of samples
    :param n_ch_comp: number of consecutive comparisons
    :return: (p_value list, significance flags, alpha, indices of the
        comparisons whose p-value is below alpha)
    """
    # alpha defines a significant change; it could be scaled by the number
    # of comparisons (alpha / n_ch_comp)
    alpha = 0.01
    rounds = int(alpha**(-1) * 100)

    p_value = [
        permutation_test(Score[step + 1],
                         Score[step],
                         method='approximate',
                         num_rounds=rounds)
        for step in range(n_ch_comp)
    ]
    significance = [p < alpha for p in p_value]
    print('Number of changes: ', sum(significance))

    # indices of the comparisons where a change was detected
    idx = np.nonzero(significance)[0].tolist()
    print('Changes after n number of channels: ', np.array(idx) + 1)
    return p_value, significance, alpha, idx
def p_value_one_sided(treatment, control):
    """Return the p-value of an approximate permutation test (10000 rounds,
    seed 42) whose statistic is mean(second sample) - mean(first sample),
    with ``treatment`` passed first and ``control`` second."""

    def second_minus_first_mean(x, y):
        return np.mean(y) - np.mean(x)

    return permutation_test(treatment,
                            control,
                            method='approximate',
                            num_rounds=10000,
                            seed=42,
                            func=second_minus_first_mean)
def calculate_permutation_test(single_predictions, joint_predictions):
    """Return the p-value of an approximate permutation test (10000 rounds,
    seed 0) between the two prediction samples."""
    settings = {'method': 'approximate', 'num_rounds': 10000, 'seed': 0}
    return permutation_test(single_predictions, joint_predictions, **settings)
def plot_sim_score_dist():
    """Pool the 'mean' scores of group 1 and group 0 over five runs, test
    them against each other and plot both distributions.

    ``make_avg_score_df(i)`` is called for i = 0..4. The resulting frame
    and its two group subsets are printed on every pass, followed by the
    permutation p-value (approximate, 10000 rounds, seed 0).
    """
    pos_list = []
    neg_list = []
    for run in range(5):
        final = make_avg_score_df(run)
        print(final)

        pos = final[final['group'] == 1]
        print(pos)
        pos_list.extend(pos['mean'].tolist())

        neg = final[final['group'] == 0]
        print(neg)
        neg_list.extend(neg['mean'].tolist())

    p_value = permutation_test(pos_list,
                               neg_list,
                               method='approximate',
                               num_rounds=10000,
                               seed=0)
    print(p_value)
    plot_distributions(pos_list, neg_list)
def find_permutation(positives, negatives):
    """Return the p-value of an approximate permutation test (10000 rounds,
    seed 0) comparing ``positives`` with ``negatives``."""
    return permutation_test(positives,
                            negatives,
                            method='approximate',
                            num_rounds=10000,
                            seed=0)
def test_approximate():
    """Approximate one-sided test (mean(x) > mean(y)) with 5000 rounds and
    seed 123.

    Uses the ``func`` / ``num_rounds`` keywords that every other
    permutation_test call in this file uses, instead of the
    ``alt_hypothesis`` / ``num_permutations`` spelling.
    """
    p = permutation_test(treatment,
                         control,
                         func="x_mean > y_mean",
                         method='approximate',
                         num_rounds=5000,
                         seed=123)
    assert round(p, 3) == 0.028, round(p, 4)
def test_approximateone_sided_x_greater_y():
    """Approximate test (5000 rounds, seed 123) with the callable statistic
    mean(x) - mean(y)."""

    def mean_gap(x, y):
        return np.mean(x) - np.mean(y)

    p = permutation_test(treatment,
                         control,
                         func=mean_gap,
                         method='approximate',
                         num_rounds=5000,
                         seed=123)
    assert round(p, 3) == 0.028, p
def test_approximateone_sided_x_greater_y():
    """The mean(x) - mean(y) statistic, run approximately with a fixed seed,
    must round to the pinned p-value."""
    settings = {'method': 'approximate', 'num_rounds': 5000, 'seed': 123}

    def x_minus_y(x, y):
        return np.mean(x) - np.mean(y)

    p = permutation_test(treatment, control, func=x_minus_y, **settings)
    assert round(p, 3) == 0.028, p
def calculate_permutation_test(hyperpartisan_valid_predictions,
                               joint_valid_predictions):
    """Return the p-value of an approximate permutation test (10000 rounds,
    seed 0) between the two sets of validation predictions."""
    return permutation_test(hyperpartisan_valid_predictions,
                            joint_valid_predictions,
                            method='approximate',
                            num_rounds=10000,
                            seed=0)
def test_paired_runs_approximate():
    """Paired approximate test (100000 rounds, seed 0) on two fixed runs of
    seven measurements each."""
    first_run = [3.67, 1.72, 3.46, 2.60, 2.03, 2.10, 3.01]
    second_run = [2.11, 1.79, 2.71, 1.89, 1.69, 1.71, 2.01]
    settings = {
        'paired': True,
        'method': "approximate",
        'seed': 0,
        'num_rounds': 100000,
    }
    p = permutation_test(first_run, second_run, **settings)
    assert round(p, 3) == 0.031
def _permutation_test(self, x, y):
    """Return the correlation coefficient of ``x`` and ``y`` and a
    permutation p-value for it.

    The statistic is the off-diagonal entry of ``np.corrcoef``. The p-value
    comes from an approximate permutation test (10000 rounds, seed 0) on
    that same statistic.

    :return: (coefficient rounded to 3 places, p-value rounded to 4 places)
    """

    def correlation(first, second):
        return np.corrcoef(first, second)[1][0]

    r_score = correlation(x, y)
    p_value = permutation_test(x,
                               y,
                               method='approximate',
                               num_rounds=10000,
                               func=correlation,
                               seed=0)
    return round(r_score, 3), round(p_value, 4)
def perm_test(x,y):
    """Permutation-test the ``hist`` rows selected by ``x[0]`` and ``y[0]``.

    The two keys are printed and appended to the module-level lists ``X``
    and ``Y``. The rows of ``hist`` whose index equals each key are then
    compared with an approximate permutation test (100000 rounds, seed 0).

    :return: the p-value
    """
    first_key, second_key = x[0], y[0]
    print(first_key, second_key)
    X.append(first_key)
    Y.append(second_key)

    first_rows = hist[hist.index == first_key].values
    second_rows = hist[hist.index == second_key].values
    return permutation_test(first_rows,
                            second_rows,
                            method='approximate',
                            num_rounds=100000,
                            seed=0)
def make_pheno_df(phe_list1, phe_list2):
    """Print a permutation p-value for the two lists and stack them into one
    frame.

    The p-value (approximate, 10000 rounds, seed 0) is only printed, not
    returned.

    :param phe_list1: values labelled 'Syndromic'
    :param phe_list2: values labelled 'Non-Syndromic'
    :return: DataFrame indexed by 'Group' with a single 'Phe_No' column,
        first-list rows first
    """
    print(
        permutation_test(phe_list1,
                         phe_list2,
                         method='approximate',
                         num_rounds=10000,
                         seed=0))

    labelled = (('Syndromic', phe_list1), ('Non-Syndromic', phe_list2))
    frames = [
        pd.DataFrame({'Group': label, 'Phe_No': values}).set_index('Group')
        for label, values in labelled
    ]
    return pd.concat(frames, axis=0)
def kruskal_similar_distribution(self,
                                 column,
                                 pvalue_threshold=0.05,
                                 num_rounds=3):
    """Tell whether ``column`` looks alike in the new and historical data.

    An approximate permutation test (seed 0) is run on
    ``self.new_data[column]`` versus ``self.historical_data[column]`` with
    the Kruskal-Wallis statistic as the test statistic.

    :param column: column name present in both data sets
    :param pvalue_threshold: p-values below this count as "different"
    :param num_rounds: number of permutation rounds
    :return: False when the p-value is below the threshold, True otherwise
    """

    def kruskal_statistic(x, y):
        return stats.kruskal(x, y).statistic

    p_value = permutation_test(self.new_data[column],
                               self.historical_data[column],
                               method="approximate",
                               num_rounds=num_rounds,
                               func=kruskal_statistic,
                               seed=0)
    return not (p_value < pvalue_threshold)
def non_parametric_permutation_test(recurrence_group, no_recurrence_group):
    """
    Run three approximate permutation tests on two patient groups.

    Every test uses 10000 random permutation rounds with seed 0, i.e. the
    permutations are sampled rather than fully enumerated:

    1) permutation_test's default statistic (no ``func`` given)
    2) the 'x_mean < y_mean' statistic
    3) the 'x_mean > y_mean' statistic

    where x is ``recurrence_group`` and y is ``no_recurrence_group``.

    :param recurrence_group: measurements of the first group
    :param no_recurrence_group: measurements of the second group
    :return: tuple (global_p_value, lower_p_value, upper_p_value)
    """
    settings = {'method': 'approximate', 'num_rounds': 10000, 'seed': 0}

    def run(**extra):
        return permutation_test(recurrence_group, no_recurrence_group,
                                **extra, **settings)

    global_p_value = run()
    lower_p_value = run(func='x_mean < y_mean')
    upper_p_value = run(func='x_mean > y_mean')
    return global_p_value, lower_p_value, upper_p_value
def _calc_weat_pvalue(first_associations,
                      second_associations,
                      method=PVALUE_DEFUALT_METHOD):
    """Return a permutation p-value for the difference of association sums.

    The test statistic is ``sum(x) - sum(y)``.

    :param first_associations: first sample of association values
    :param second_associations: second sample of association values
    :param method: must be a member of ``PVALUE_METHODS``
    :raises ValueError: if ``method`` is not in ``PVALUE_METHODS``
    """
    if method in PVALUE_METHODS:

        def sum_gap(x, y):
            return sum(x) - sum(y)

        # NOTE(review): the seed presumably has no effect when the method
        # is exact -- confirm against permutation_test.
        return permutation_test(first_associations,
                                second_associations,
                                func=sum_gap,
                                method=method,
                                seed=RANDOM_STATE)

    raise ValueError('method should be one of {}, {} was given'.format(
        PVALUE_METHODS, method))
def pearson_similar_correlation(self,
                                column,
                                correlation_lower_bound,
                                pvalue_threshold=0.05,
                                num_rounds=3):
    """Tell whether ``column`` is similarly correlated in the new and
    historical data.

    The Pearson coefficient of ``self.new_data[column]`` and
    ``self.historical_data[column]`` is computed directly. A p-value for it
    comes from an approximate permutation test (seed 0) that uses the
    Pearson coefficient as its statistic.

    :param column: column name present in both data sets
    :param correlation_lower_bound: minimum accepted coefficient
    :param pvalue_threshold: maximum accepted p-value
    :param num_rounds: number of permutation rounds
    :return: False when the p-value exceeds the threshold or the
        coefficient is below the bound, True otherwise
    """
    new_values = self.new_data[column]
    old_values = self.historical_data[column]

    def pearson_r(x, y):
        return stats.pearsonr(x, y)[0]

    coefficient = stats.pearsonr(new_values, old_values)[0]
    p_value = permutation_test(new_values,
                               old_values,
                               method="approximate",
                               num_rounds=num_rounds,
                               func=pearson_r,
                               seed=0)

    p_value_too_high = p_value > pvalue_threshold
    correlation_too_low = coefficient < correlation_lower_bound
    return not (p_value_too_high or correlation_too_low)
def displacement_diff(Score1, Score2, n_ch_comp):
    """Test matching entries of two score collections against each other.

    For i in 0 .. n_ch_comp - 1, ``Score1[i]`` (treatment) is compared with
    ``Score2[i]`` (control) by an approximate permutation test. No seed is
    passed, so the p-values are not pinned to a fixed random state.

    :return: (p_value list, significance flags, alpha)
    """
    # alpha defines a significant change; it could be scaled by the number
    # of comparisons (alpha / n_ch_comp)
    alpha = 0.01
    rounds = int(alpha**(-1) * 100)

    p_value = [
        permutation_test(Score1[step],
                         Score2[step],
                         method='approximate',
                         num_rounds=rounds)
        for step in range(n_ch_comp)
    ]
    significance = [p < alpha for p in p_value]
    return p_value, significance, alpha
def analyze_cell_lines(data, positives, negatives, nonbrain_cells, group):
    """Compare positive and negative labels within every listed cell line.

    For each name in ``nonbrain_cells`` the column ``data[name]`` is split
    into the values at ``positives`` and at ``negatives``. Recorded per cell
    line: the fold change mean(pos) / mean(neg), the SEM of the positive
    values divided by mean(neg), and the p-value of an approximate
    permutation test (10000 rounds, seed 0). The positive values, each
    p-value and the final frame are printed.

    ``group`` is accepted but not used in the body.

    :return: DataFrame with columns Cell_Line, Fold_Change, SEM and
        Significance
    """
    fold_changes, sems, p_values = [], [], []

    for cell_line in nonbrain_cells:
        column = data[cell_line]

        pos = column.loc[positives]
        print(pos)
        pos_list = pos.tolist()
        neg_list = column.loc[negatives].tolist()

        p_value = permutation_test(pos_list,
                                   neg_list,
                                   method='approximate',
                                   num_rounds=10000,
                                   seed=0)
        print('permutation test p', p_value)

        neg_mean = np.mean(neg_list)
        fold_changes.append(np.mean(pos_list) / neg_mean)
        sems.append(stats.sem(pos_list / neg_mean))
        p_values.append(p_value)

    df = pd.DataFrame({
        'Cell_Line': nonbrain_cells,
        'Fold_Change': fold_changes,
        'SEM': sems,
        'Significance': p_values
    })
    print(df)
    return df
def _permutation_test(
        x1,
        x2,
        alternative=ALTERNATIVE,
        num_rounds=NUM_ROUNDS,
        random_state=RANDOM_STATE,
):
    """Return an approximate permutation p-value for a difference in means.

    The alternative hypothesis selects the statistic that is permuted:

    * 'two-sided': mean(x1) != mean(x2)
    * 'greater':   mean(x1) >  mean(x2)
    * 'less':      mean(x1) <  mean(x2)

    The one-sided p-values come from the matching directional statistic.
    Halving the two-sided p-value is only valid when the observed effect
    already points in the hypothesised direction, so it is not used.

    :param x1: first sample
    :param x2: second sample
    :param alternative: 'two-sided', 'greater' or 'less'
    :param num_rounds: number of permutation rounds (converted with int())
    :param random_state: seed passed to permutation_test
    :raises ValueError: if ``alternative`` is not one of the three values
        (raised before any permutation work is done)
    :return: the p-value
    """
    statistic_by_alternative = {
        'two-sided': 'x_mean != y_mean',
        'greater': 'x_mean > y_mean',
        'less': 'x_mean < y_mean',
    }
    # Validate before running the (potentially slow) permutation rounds.
    if alternative not in statistic_by_alternative:
        raise ValueError(f'{alternative}')

    return permutation_test(
        x1,
        x2,
        func=statistic_by_alternative[alternative],
        method='approximate',
        num_rounds=int(num_rounds),
        seed=random_state,
    )
def test(args, model, train_dataloader, eval_dataset, K=10):
    """Compare two head-masking procedures over K slices of ``eval_dataset``.

    The dataset is cut into K contiguous index ranges. On the first slice
    ``mask_heads`` (A) and ``unmask_dpp`` (B) are run to obtain scores,
    sparsities and head masks. On every later slice those head masks are
    re-evaluated with ``evaluate``. For each slice the AUC of scores over
    sparsities is stored per procedure.

    The two AUC lists are compared with an exact permutation test whose
    statistic is mean(B) - mean(A), and both lists are saved as ``A.npy``
    and ``B.npy`` in ``args.output_dir``.

    :return: the permutation p-value
    """
    lims = np.linspace(0, len(eval_dataset), K + 1).astype('int')
    A, B = [], []

    for fold in range(K):
        fold_indices = list(range(lims[fold], lims[fold + 1]))
        sub_dataset = Subset(eval_dataset, fold_indices)
        if args.local_rank == -1:
            sub_sampler = SequentialSampler(sub_dataset)
        else:
            sub_sampler = DistributedSampler(sub_dataset)
        sub_dataloader = DataLoader(sub_dataset,
                                    sampler=sub_sampler,
                                    batch_size=args.batch_size,
                                    collate_fn=default_data_collator)

        if fold != 0:
            # Later slices reuse the masks and sparsities found on slice 0.
            A_scores = [
                evaluate(args, model, sub_dataloader, head_mask=head_mask)
                for head_mask in A_head_masks
            ]
            A.append(auc(A_sparsities, A_scores).cpu())
            B_scores = [
                evaluate(args, model, sub_dataloader, head_mask=head_mask)
                for head_mask in B_head_masks
            ]
            B.append(auc(B_sparsities, B_scores).cpu())
        else:
            A_scores, A_sparsities, A_head_masks = mask_heads(
                args, model, train_dataloader, sub_dataloader)
            A.append(auc(A_sparsities, A_scores).cpu())
            B_scores, B_sparsities, B_head_masks = unmask_dpp(
                args, model, train_dataloader, sub_dataloader)
            B.append(auc(B_sparsities, B_scores).cpu())

    def b_minus_a(x, y):
        return np.mean(y) - np.mean(x)

    p_value = permutation_test(A, B, method='exact', func=b_minus_a)
    for stem, values in (("A.npy", A), ("B.npy", B)):
        np.save(os.path.join(args.output_dir, stem), values)
    return p_value
def get_permutation_of_pdists(temp_df1, temp_df2):
    """
    Permutation-test the two outputs of ``get_pdists`` for the given frames.

    ``get_pdists(temp_df1, temp_df2)`` is called first (presumably it yields
    the pairwise distances of each frame -- confirm against its definition).
    Its two results are then compared with an approximate permutation test
    (1000 rounds, seed 0).

    :param temp_df1: Dataframe
    :param temp_df2: Dataframe
    :return: p value of the permutation test
    """
    first_pdist, second_pdist = get_pdists(temp_df1, temp_df2)
    return permutation_test(first_pdist,
                            second_pdist,
                            method='approximate',
                            num_rounds=1000,
                            seed=0)
def get_p_values(results, model_name, n_vars=10):
    """
    Compare the multipass scores with the second-place scores, row by row.

    Rows 0 .. n_vars - 1 of ``results['multipass_scores__<model_name>']``
    are tested against the same rows of
    ``results['second_place_scores__<model_name>']`` with an approximate
    permutation test (1000 rounds, seed 0). A verdict line is printed for
    every row.

    :return: boolean array, True where the row's p-value is above 0.05
    """
    first_place = results[f'multipass_scores__{model_name}'].values
    second_place = results[f'second_place_scores__{model_name}'].values

    collected = []
    for row in range(n_vars):
        p_value = permutation_test(first_place[row, :],
                                   second_place[row, :],
                                   method='approximate',
                                   num_rounds=1000,
                                   seed=0)
        collected.append(p_value)
        verdict = ('Probably the same distribution\n'
                   if p_value > 0.05 else 'Probably different distributions\n')
        print(verdict)

    return np.array(collected) > 0.05
def GetStats(atoms,SelectedHingeResidues,filename='Output'):
    """Write B-factor statistics for hinge and non-hinge atoms and test them.

    The backbone atoms of the selected residues form the hinge group. Every
    other atom of ``atoms`` forms the non-hinge group. For all atoms, the
    hinge group and the non-hinge group, the count, min, max, mean, mode,
    median and standard deviation of the B-factors are written to
    ``outputfile`` as one tab-separated line each and collected in the
    returned table. The hinge and non-hinge B-factors are then compared with
    an approximate permutation test (10000 rounds, seed 0), and the p-value
    is written as well.

    NOTE(review): ``filename`` is not used in the body, and ``outputfile``
    is not defined in this function -- it is presumably an open file handle
    provided by the enclosing module; confirm.

    Args:
        atoms ([packman.molecule.Atom]) : Set of atoms.
        SelectedHingeResidues ([packman.molecule.Residue]): Predicted hinge residues.
        filename (str, optional) : Output file name. Defaults to 'Output'.

    Returns:
        (p-value, stats): p-value of the permutation test, and the list of
        statistics rows (header row first).
    """
    hinge_atoms = [
        atom for residue in SelectedHingeResidues
        for atom in residue.get_backbone()
    ]
    non_hinge_atoms = list(set(atoms) - set(hinge_atoms))

    all_atoms_bfactor = [atom.get_bfactor() for atom in atoms]
    hinge_atoms_bfactor = [atom.get_bfactor() for atom in hinge_atoms]
    non_hinge_atoms_bfactor = [atom.get_bfactor() for atom in non_hinge_atoms]

    outputfile.write('\nSTATISTICS\n\t\tN\tMin\tMax\tMean\tMode\tMedian\tSTDDev\n')
    return_stats = [['', 'N', 'Min', 'Max', 'Mean', 'Mode', 'Median', 'STDDev']]

    # (label as written to the file, label stored in the table, values);
    # the written labels for Total and Hinge carry a trailing space.
    groups = (
        ('Total ', 'Total', all_atoms_bfactor),
        ('Hinge ', 'Hinge', hinge_atoms_bfactor),
        ('NonHinge', 'NonHinge', non_hinge_atoms_bfactor),
    )
    for written_label, label, bfactors in groups:
        row = [
            label,
            len(bfactors),
            numpy.min(bfactors),
            numpy.max(bfactors),
            numpy.mean(bfactors),
            mode(bfactors)[0][0],
            numpy.median(bfactors),
            numpy.std(bfactors),
        ]
        outputfile.write(written_label + '\t' +
                         '\t'.join(str(value) for value in row[1:]) + '\n')
        return_stats.append(row)

    p_value = permutation_test(hinge_atoms_bfactor,
                               non_hinge_atoms_bfactor,
                               method='approximate',
                               num_rounds=10000,
                               seed=0)
    outputfile.write('\np-value:\t' + str(p_value) + '\n')
    return p_value, return_stats
def test_default():
    """permutation_test called with only the two samples must round to the
    pinned p-value."""
    expected = 0.055
    p_value = permutation_test(treatment, control)
    assert round(p_value, 3) == expected, p_value
def test_default():
    """Check the p-value obtained when no optional argument is passed to
    permutation_test."""
    result = permutation_test(treatment, control)
    rounded = round(result, 3)
    assert rounded == 0.055, result