def get_threshold_go_list(go_list, stats_file, which_statistic, threshold):
    """
    Returns a list of GOs above a given threshold for a given statistic
    from an input list of GOs and their previously generated statistics.

    Parameters
    ----------
    go_list : sequence
        GO identifiers to consider; expected to be index labels of the
        stats table.
    stats_file : str
        Path to the TAB/CSV statistics file (rows indexed by GO id).
    which_statistic : str
        Column name of the single statistic to threshold on.
        Note: Only one statistic can be passed at a time!
    threshold : float
        Exclusive lower bound; only finite values strictly above it kept.

    Returns
    -------
    list
        GO ids whose statistic is finite and greater than the threshold.

    Examples
    --------
    Return a list of all input GOs 'GO0001,GO0002' whose 'log2_OR' is
    above a threshold of 2.0.
    """
    ## Create a DataFrame from the input TAB/CSV stats file
    full_stats_df = csv_io.csv_to_data_frame(stats_file)

    ## Select a subset of entries as specified from the go_list.
    ## FIX: .ix was deprecated/removed in pandas; .loc is the label-based
    ## replacement.  NOTE(review): unlike .ix, .loc raises KeyError for
    ## labels missing from the index -- confirm go_list is always a
    ## subset of the stats file index.
    stats_df = full_stats_df.loc[go_list]

    ## Return a list of those GOs whose statistic was above the threshold
    ## (NaN/inf entries are excluded explicitly).
    col = stats_df[which_statistic]
    return list(stats_df[(col > threshold) & np.isfinite(col)].index)
def calc_statistics(abund_file, meta_file, stats_file, control_id, which_statistics):
    """Runs the main analysis on the two input files.

    Reads an abundance table and a sample metadata table, pseudo-count
    shifts and column-normalizes the abundances, computes odds ratio and
    log2 odds ratio for control vs. disease samples, runs each statistic
    named in `which_statistics`, writes the statistics table to
    `stats_file`, and returns (go_list, err).

    Parameters
    ----------
    abund_file : str
        Path to TAB/CSV abundance table (rows = GOs, columns = samples).
    meta_file : str
        Path to TAB/CSV metadata mapping sample ids to class ids; exactly
        two unique class ids are expected (control and disease).
    stats_file : str
        Output path for the computed statistics table.
    control_id : str (or convertible to str)
        Class id in the metadata file to treat as the control group.
    which_statistics : sequence of str
        Statistics to compute; recognized names are 'ttest', 'wilcoxon',
        'ranksums', 'mean_ratio' (plus the always-computed 'odds_ratio'
        and 'log2_OR' columns, selectable by name).

    Returns
    -------
    (go_list, err) : (list or str, str)
        On success: (list of abundance-table row ids, '').
        On failure: ('', error message).
    """
    ## Initialize go_list as empty, which will stay empty if there are any
    ## errors with the input files and/or the statistics.
    ## NOTE(review): the failure sentinel is an empty *string* while the
    ## success value is a *list* -- callers should test err, not the type.
    go_list = ''

    ## Read data files into DataFrames.
    raw_abund_df = csv_io.csv_to_data_frame(abund_file)
    meta_df = csv_io.csv_to_data_frame(meta_file)

    ## Normalize data values (i.e., each row sums to 1.0)
    ## Note: For rows that sum to 0.0, force the values for each cell to be
    ## 0.0 instead of NaN
    #abund_df = DataFrame(np.nan_to_num(
    #    [ tmp_abund_df.ix[i] / tmp_abund_df.ix[i].sum()
    #      for i in tmp_abund_df.index ]),
    #    columns=tmp_abund_df.columns,
    #    index=tmp_abund_df.index)
    #abund_df = abund_df.apply(lambda x: x + abund_df.min(1))

    ## Per-column shift values from the project helper min_gt (presumably
    ## the smallest value greater than 0 in each column, used as a
    ## pseudo-count -- TODO confirm against min_gt's definition).
    min_vals = list(raw_abund_df.apply(
        lambda x: min_gt(x, 0), axis=0).values.tolist())
    #abund_df = abund_df.add(min_vals, axis=1)
    ## Adding a plain list broadcasts across columns (legacy pandas
    ## semantics).
    raw_abund_df += min_vals

    ## Column sums after the shift; dividing by them makes each *column*
    ## (sample) sum to 1.0.
    sum_columns_list = raw_abund_df.sum(0).values.tolist()
    #abund_df = abund_df.apply(lambda x: x / abund_df.sum(1))
    abund_df = raw_abund_df / sum_columns_list

    ## Run various checks on the input files. Exit printing errors.
    err = vet_data(meta_df)
    if err != '':
        return go_list, err

    ## Create a list of "row names" from the various DataFrames to use as
    ## indices or column names
    index_ids = list([i for i in abund_df.index])
    sample_ids = list([i for i in meta_df.index])
    ## Class ids come from the *first* column of the metadata table.
    class_ids = list(set([ c[0] for c in meta_df.values.tolist() ]))

    ## Check to see if the user supplied control_id exists in the metafile
    control_id = str(control_id)
    if control_id not in class_ids:
        err = "control_id = '%s' not found in metafile " % control_id
        err += "from available class_ids: %s" % (','.join(class_ids))
        return go_list, err

    ## Assign the disease_id to the opposite of the control_id (there should
    ## only be 2 unique class ids listed in the metafile!)
    disease_id = [ c for c in class_ids if not c == control_id ][0]

    ## Create a dicionary containing the class_ids as the keys and the
    ## sample_ids as the values.
    ## E.g., {'Control': ['S1', 'S2'], 'Disease': ['S3', 'S4']}
    ## NOTE(review): meta_df.ix[s] == c compares a row Series to a scalar;
    ## truth-testing the result only works cleanly for a single-column
    ## metafile, and .ix was removed in modern pandas -- confirm before
    ## upgrading pandas.
    sample_dict = {}
    for c in class_ids:
        sample_dict[c] = [ s for s in sample_ids if meta_df.ix[s] == c ]

    ## Create separate DataFrame for the "control" and "disease" samples
    control_df = abund_df[[ s for s in sample_dict[control_id] ]]
    disease_df = abund_df[[ s for s in sample_dict[disease_id] ]]

    ## Calculate the odds ratio, log2(odds ratio), and update the DataFrame
    odds_ratio = statistics.calc_odds_ratio(control_df, disease_df)
    print "100% ODDS_RATIO"
    print odds_ratio
    print "-"*30
    log2_OR = np.log2(odds_ratio)
    ## FIXME: Figure out how to round values in a Series
    #odds_ratio = Series([ round(i, 6) for i in odds_ratio.tolist() ])
    #log2_OR = Series([ round(i, 6) for i in log2_OR.tolist() ])
    abund_df['odds_ratio'] = odds_ratio
    abund_df['log2_OR'] = log2_OR
    #print "OR>=1.0: ",sum(log2_OR.dropna() >= 1.0)  ## DEBUG

    ## Perform whichever tests/statistics are defined in 'which_statistics'
    ## (e.g., t-Test, Wilcoxon, mean ratios, etc.) and update the DataFrame
    ## FIXME: This is a _REALLY_ bad way to do this!
    if 'ttest' in which_statistics:
        results = abund_df.apply(statistics.calc_ttest,
                                 control=list(control_df.columns),
                                 disease=list(disease_df.columns), axis=1)
        abund_df['ttest'] = results['ttest']
    if 'wilcoxon' in which_statistics:
        results = abund_df.apply(statistics.calc_wilcoxon,
                                 control=list(control_df.columns),
                                 disease=list(disease_df.columns), axis=1)
        abund_df['wilcoxon'] = results['wilcoxon']
    if 'ranksums' in which_statistics:
        results = abund_df.apply(statistics.calc_ranksums,
                                 control=list(control_df.columns),
                                 disease=list(disease_df.columns), axis=1)
        abund_df['ranksums'] = results['ranksums']
    if 'mean_ratio' in which_statistics:
        results = abund_df.apply(statistics.calc_mean_ratio,
                                 control=list(control_df.columns),
                                 disease=list(disease_df.columns), axis=1)
        abund_df['mean_ratio'] = results['mean_ratio']

    ## DataFrame containing all of the statistics run on the abund_filename
    stats_df = abund_df[which_statistics]

    ## Save statistics DataFrame to file
    csv_io.data_frame_to_csv(stats_df, stats_file)

    ## Save list of GOs to file (one GO per line)
    #f = open(out_go_list_filename, 'w')
    #f.write('\n'.join(index_ids))
    #f.close()

    ## Return go_list and, hopefully, an empty err string
    return list(index_ids), err
import csv_io

## Debug/smoke-test scratch for csv_io: load 'chk.tab' both through the
## project reader and by hand, then visualize the raw values.
## NOTE(review): 'plt' is not imported here -- presumably matplotlib's
## pyplot from the interactive session; confirm before running as a
## script.  plt.errorbar also normally needs (x, y) arguments.
dat = csv_io.csv_to_data_frame('chk.tab')

## Re-parse the same file by hand as a flat list of floats.
## FIX: use a context manager so the file handle is closed (the original
## called open() and never closed it); the overwritten 'dat' from above
## is kept only to exercise csv_io.
with open('chk.tab', 'r') as fh:
    dat = [float(line.strip()) for line in fh]

plt.boxplot(dat)
plt.errorbar(dat)
def calc_hypergeometric(pathway_file, stats_file, go_list, threshold_go_list, pathway_ids_list):
    """
    Calculates the hypergeometric probability mass function evaluated at k

    N = number of GOs in study
    n = number of GOs in given pathway
    m = number of disease-associated GOs
    k = number of disease-associated GOs in given pathway

    Parameters
    ----------
    pathway_file : str
        Comma-delimited membership table (rows = GO ids, columns =
        pathway ids, values summable as 0/1 indicators).
    stats_file : str
        TAB/CSV statistics file indexed by GO id.
    go_list : sequence
        All GO ids in the study.
    threshold_go_list : sequence
        Disease-associated GO ids (those that passed the threshold).
    pathway_ids_list : sequence of str
        Pathway columns to test.

    Returns
    -------
    (pvalues_df, err) : (pandas.DataFrame or str, str)
        DataFrame of N/n/m/k and p-values indexed by pathway id on
        success; ('', error message) on failure.
    """
    ## Initialize an empty pvalues DataFrame (really just an empty string)
    ## to return when there are errors
    pvalues_df = ''
    err = ''

    ## Convert TAB/CSV files into DataFrames
    pathway_df = csv_io.csv_to_data_frame(pathway_file, delimiter=',')
    stats_df = csv_io.csv_to_data_frame(stats_file)

    ## Check that user supplied pathway_ids exist in the pathway_df
    known_pathways = set(pathway_df.columns.tolist())
    diff = [x for x in pathway_ids_list if x not in known_pathways]
    if diff:
        err = "ERROR: %s does not contain pathway_ids: %s" % (
            pathway_file, diff)
        return pvalues_df, err

    ## Number of GOs in study
    N = len(go_list)
    ## Number of disease-associated GOs (this is a sub-set DataFrame of
    ## only those values in the stats_file whose chosen statistic was
    ## above the threshold).
    ## FIX: .ix was deprecated/removed in pandas; .loc is the label-based
    ## replacement used throughout this function.
    m = len(stats_df.loc[threshold_go_list])

    ## Accumulate one row of counts/p-values per pathway id
    hyper_dict = {'N': [], 'n': [], 'm': [], 'k': [],
                  'p_upper': [], 'p_lower': [], 'pvalue': []}

    ## Loop over pathway ids in the DataFrame
    for pw_id in pathway_ids_list:
        ## Number of disease-associated GOs in pathway pw_id
        ## (.loc[rows, col] also avoids the original's chained indexing)
        k = int(pathway_df.loc[threshold_go_list, pw_id].sum())
        ## Number of GOs in pathway pw_id
        n = int(pathway_df.loc[go_list, pw_id].sum())

        ## Upper tail: P(X >= k); lower tail via the complement, adding
        ## pmf(k) back so the point k is counted in both tails.
        ## hypergeom.pmf(k, M, n, N') = pmf(value, population, successes,
        ## draws) => here population N, successes m, draws n.
        p_upper = float(sum(hypergeom.pmf(range(k, min(m, n) + 1), N, m, n)))
        p_lower = float(1 - p_upper + hypergeom.pmf(k, N, m, n))
        pvalue = min(p_upper, p_lower)

        ## Save p-values to dictionary
        hyper_dict['N'].append(N)
        hyper_dict['n'].append(n)
        hyper_dict['m'].append(m)
        hyper_dict['k'].append(k)
        hyper_dict['p_upper'].append(p_upper)
        hyper_dict['p_lower'].append(p_lower)
        hyper_dict['pvalue'].append(pvalue)

    ## Format dictionary as a DataFrame (one row per pathway id)
    pvalues_df = pd.DataFrame(hyper_dict, index=pathway_ids_list)

    ## Save DataFrame to output filename
    #pvalues_df.to_csv('path_pvals.dat', na_rep='NaN', sep='\t')

    return pvalues_df, err