def multipletests(pvals, alpha=0.05, method="hs", is_sorted=False):
    """
    Test results and p-value correction for multiple tests.

    Parameters
    ----------
    pvals : array_like
        Uncorrected p-values.
    alpha : float
        FWER, family-wise error rate, e.g. ``0.1``.
    method : string
        Method used for testing and adjustment of pvalues. Can be either
        the full name or initial letters. Available methods are ::

        `bonferroni` : one-step correction
        `sidak` : one-step correction
        `holm-sidak` : step down method using Sidak adjustments
        `holm` : step-down method using Bonferroni adjustments
        `simes-hochberg` : step-up method (independent)
        `hommel` : closed method based on Simes tests (non-negative)
        `fdr_bh` : Benjamini/Hochberg (non-negative)
        `fdr_by` : Benjamini/Yekutieli (negative)
        `fdr_tsbh` : two stage fdr correction (non-negative)
        `fdr_tsbky` : two stage fdr correction (non-negative)

    is_sorted : bool
        If ``False`` (default), the p_values will be sorted, but the
        corrected pvalues are in the original order. If ``True``, then it
        is assumed that the pvalues are already sorted in ascending order.

    Returns
    -------
    reject : ndarray, boolean
        ``True`` for hypothesis that can be rejected for given alpha.
    pvals_corrected : ndarray
        P-values corrected for multiple tests.
    alphacSidak : float
        Corrected alpha for Sidak method.
    alphacBonf : float
        Corrected alpha for Bonferroni method.

    Notes
    -----
    This is a wrapper around a function from the `statsmodels`_ package.
    The import is done lazily so that statsmodels is only required when
    this function is actually called.

    .. _statsmodels: https://www.statsmodels.org
    """
    # Prefer the public, documented location of the function; the sandbox
    # namespace is statsmodels' experimental area and is only used as a
    # fallback for installations that do not expose the public module.
    try:
        from statsmodels.stats.multitest import multipletests as mt
    except ImportError:
        from statsmodels.sandbox.stats.multicomp import multipletests as mt
    return mt(pvals, alpha=alpha, method=method, is_sorted=is_sorted)
def filter_low_coverage(self, alpha=0.25):
    """
    Flag records at positions enriched for "lonely" triplets.

    Only records whose ``self.data['status']`` equals 0 are examined. A
    (gene, cell, rmt) combination that is observed at exactly one position
    is called a lonely triplet. For every (gene, position) that holds at
    least one lonely triplet, ``self._hypergeom_wrapper`` is applied to a
    row carrying the lonely/total counts at that position and in that gene;
    ``1 - result`` is used as the p-value and corrected with the
    Benjamini/Hochberg procedure (``fdr_bh``). All examined records that
    sit on a rejected (gene, position) are flagged.

    Parameters
    ----------
    alpha : float
        Level passed to the ``fdr_bh`` multiple-testing correction.

    Returns
    -------
    None
        ``self.data['status']`` is modified in place: the value of
        ``self.filter_codes['lonely_triplet']`` is OR-ed into the status of
        every flagged record. Nothing is changed when no record has status
        0 or when no lonely triplet exists.
    """
    use_inds = np.where(self.data['status'] == 0)[0]
    if use_inds.size == 0:
        # Nothing left to examine; avoid grouping/testing an empty frame.
        return

    # One row per examined record; row i corresponds to use_inds[i].
    df = pd.DataFrame({
        'gene': self.genes[use_inds],
        'cell': self.data['cell'][use_inds],
        'position': self.positions[use_inds],
        'rmt': self.data['rmt'][use_inds],
    })

    # Number of triplets at each (gene, position).
    # The aggregate-then-rename form replaces the dict-renaming
    # ``agg({...})`` call, which pandas >= 1.0 rejects.
    # NOTE(review): np.count_nonzero is kept from the original; it equals the
    # group size only while positions are non-zero - confirm 0 is never a
    # valid position (otherwise ``.size()`` is the right aggregate).
    num_per_position = (
        df.groupby(['gene', 'position'])['position']
        .agg(np.count_nonzero)
        .rename('Num Triplets at Pos')
        .reset_index()
    )

    # Total number of triplets in each gene, attached to every position row.
    trips_in_gene = (
        num_per_position.groupby('gene')['Num Triplets at Pos']
        .sum()
        .rename('Num Triplets at Gene')
        .reset_index()
    )
    num_per_position = num_per_position.merge(trips_in_gene, how='left')

    # For each (gene, cell, rmt) collect the distinct positions it was seen
    # at; a combination with exactly one distinct position is lonely.
    lonely_triplets = (
        df.groupby(['gene', 'cell', 'rmt'])['position']
        .apply(np.unique)
        .rename('lonely position')
        .reset_index()
    )
    lonely_triplets['num'] = lonely_triplets['lonely position'].apply(len)
    # ``.copy()`` so the column assignment below does not write to a view.
    lonely_triplets = lonely_triplets.loc[lonely_triplets['num'] == 1].copy()
    if lonely_triplets.empty:
        # No lonely triplets means there is nothing to test or flag.
        return

    # Each remaining entry is a length-1 array; unwrap it to a scalar.
    # ``ndarray.item()`` is what the removed ``np.asscalar`` used to call.
    lonely_triplets['lonely position'] = (
        lonely_triplets['lonely position'].apply(lambda arr: arr.item())
    )

    # Number of lonely (cell, rmt) pairs at each (gene, position).
    l_num_at_position = (
        lonely_triplets.groupby(['gene', 'lonely position'])['cell']
        .count()
        .rename('lonely triplets at pos')
        .reset_index()
        .rename(columns={'lonely position': 'position'})
    )

    # Number of lonely pairs in each gene.
    l_num_at_gene = (
        lonely_triplets.groupby('gene')['lonely position']
        .count()
        .rename('lonely triplets at gen')
        .reset_index()
    )

    # One row per (gene, position) with all four counts side by side.
    total = l_num_at_position.merge(l_num_at_gene, how='left')
    total = total.merge(num_per_position, how='left')

    # Prefer the public statsmodels module; fall back to the sandbox path
    # for installations that only expose the function there.
    try:
        from statsmodels.stats.multitest import multipletests as mt
    except ImportError:
        from statsmodels.sandbox.stats.multicomp import multipletests as mt

    # Row-wise test (the original comment refers to scipy's hypergeom; the
    # wrapper itself is defined elsewhere), turned into an upper-tail value.
    p = 1 - total.apply(self._hypergeom_wrapper, axis=1)
    reject = mt(p, alpha=alpha, method='fdr_bh')[0]
    total['remove'] = np.asarray(reject)
    flagged = total.loc[total['remove']]

    # Left-merge keeps df's row order and yields a fresh 0..n-1 index, and
    # (gene, position) is unique in ``flagged``; hence the surviving index
    # values are row numbers of df, i.e. offsets into use_inds.
    final = df.merge(flagged, how='left')
    final = final[final['remove'].eq(True)]

    # Indices to remove
    remove_inds = use_inds[final.index.values]
    self.data['status'][remove_inds] |= self.filter_codes['lonely_triplet']
# NOTE(review): this chunk starts mid-script. The statement below is presumably
# the body of a loop over patient identifiers ``p`` whose header precedes this
# view - confirm its indentation against that loop.
# It records the minimum ``age`` among rows whose ``patient_id`` contains ``p``.
patients_start_age.append(
    np.min(df.loc[df.index[df.patient_id.str.contains(p)]].age))
# Report mean +/- standard deviation of the collected per-patient minimum ages.
print "age: ", np.mean(patients_start_age), "+/-", np.std(
    patients_start_age)

# Distinct values of the ``crs_group`` column, in sorted order (np.unique).
groups = np.unique(df.crs_group)
# Maps column name -> uncorrected p-value of the group comparison.
p_vals = dict()
# Columns from index 8 onward are the ones tested
# (presumably the first 8 columns are metadata - verify against the frame).
for key_id in xrange(8, len(df.columns)):
    key = df.columns[key_id]
    # One Series of this column's values per crs_group value.
    y = [df[key][df.crs_group == score] for score in groups]
    # print stats.ttest_ind(y[0].dropna(), y[2].dropna())
    # Mann-Whitney U test between the 2nd and 3rd groups only (indices 1, 2);
    # missing values are dropped before testing.
    s = stats.mannwhitneyu(y[1].dropna(), y[2].dropna())
    # Element 1 of the test result is stored as the p-value.
    p_vals[key] = s[1]
    print key, np.mean(y[1]), np.mean(y[2]), ": p-value =", s[1]

# ``mt`` is imported outside this chunk; presumably statsmodels'
# ``multipletests`` (Benjamini/Hochberg correction here) - confirm.
p_fdr = mt(p_vals.values(), alpha=0.05, method='fdr_bh')
p_vals_fdr = dict()
# Pair each key with element k of p_fdr[1] (presumably the corrected p-values).
# This relies on keys() and values() iterating in the same order, which holds
# because the dict is not modified between the two calls.
for k, key in enumerate(p_vals.keys()):
    p_vals_fdr[key] = p_fdr[1][k]
#sorted_p = OrderedDict(sorted(p_vals_fdr.items(), key=lambda(k,v):(v,k)))
# (column, corrected p-value) pairs, ascending by p-value.
sorted_p = sorted(p_vals_fdr.items(), key=operator.itemgetter(1))
# Same pairs with the p-value formatted to 5 decimals; the significance
# filter is currently disabled (see the trailing comment).
selected_p = [(param, '%.5f' % p_value)
              for (param, p_value) in sorted_p]  # if p_value < 0.05]
print selected_p
# Column names ordered by corrected p-value, followed by the group column.
param_names = [param for (param, p_value) in selected_p]
# param_names = param_names[:7]
param_names.append('crs_group')
# df.replace('', np.nan, inplace=True)
# df.drop(["frequency_mse_spindle"], axis=1, inplace=True)