Exemplo n.º 1
0
def get_threshold_go_list(go_list, stats_file, which_statistic, threshold):
    """
    Return the subset of GOs from go_list whose value for which_statistic
    (read from stats_file) is finite and strictly greater than threshold.

    Examples
    --------
    Return a list of all input GOs 'GO0001,GO0002' whose 'log2_OR' is above a
    threshold of 2.0.

    Note: Only one statistic can be passed at a time!
    """
    ## Load the previously generated statistics table and restrict it to the
    ## requested GO ids
    stats_df = csv_io.csv_to_data_frame(stats_file).ix[go_list]

    ## Keep only rows whose chosen statistic is finite and above threshold
    stat_values = stats_df[which_statistic]
    keep = (stat_values > threshold) & np.isfinite(stat_values)

    return list(stats_df[keep].index)
Exemplo n.º 2
0
def calc_statistics(abund_file, meta_file, stats_file, control_id, 
                    which_statistics):
    """Runs the main analysis on the two input files.

    Reads an abundance table and a metadata table, normalizes the abundance
    values per column (after adding a per-column pseudocount), computes the
    odds ratio / log2(odds ratio) between the control and disease sample
    groups, runs each statistic named in `which_statistics`, and writes the
    resulting statistics DataFrame to `stats_file`.

    Parameters
    ----------
    abund_file : str
        Path to the TAB/CSV abundance table (rows are GO ids, columns are
        sample ids).
    meta_file : str
        Path to the TAB/CSV metadata table mapping sample ids to exactly two
        class ids (control vs. disease).
    stats_file : str
        Output path for the per-GO statistics table.
    control_id : value convertible to str
        Class id from the metadata file to treat as the control group.
    which_statistics : sequence of str
        Statistics to compute; recognized names are 'ttest', 'wilcoxon',
        'ranksums', and 'mean_ratio'.

    Returns
    -------
    (go_list, err) : tuple
        On success: (list of GO ids analyzed, '').  On a validation error:
        ('', error message string).
    """
    ## Initialize go_list as empty, which will stay empty if there are any
    ## errors with the input files and/or the statistics
    go_list = ''

    ## Read data files into DataFrames.
    raw_abund_df = csv_io.csv_to_data_frame(abund_file)
    meta_df = csv_io.csv_to_data_frame(meta_file)

    ## Normalize data values (i.e., each row sums to 1.0)
    ## Note: For rows that sum to 0.0, force the values for each cell to be
    ## 0.0 instead of NaN
    #abund_df = DataFrame(np.nan_to_num(
    #                     [ tmp_abund_df.ix[i] / tmp_abund_df.ix[i].sum() 
    #                       for i in tmp_abund_df.index ]),
    #                     columns=tmp_abund_df.columns,
    #                     index=tmp_abund_df.index)
    #abund_df = abund_df.apply(lambda x: x + abund_df.min(1))
    ## Smallest value strictly greater than 0 in each column (min_gt is a
    ## project helper); added below as a pseudocount so zero counts do not
    ## break the ratio-based statistics.
    min_vals = list(raw_abund_df.apply(
                   lambda x: min_gt(x, 0), axis=0).values.tolist())
    #abund_df = abund_df.add(min_vals, axis=1)
    ## NOTE(review): list `+=` broadcasts min_vals across the rows — assumes
    ## len(min_vals) == number of columns; verify on the pandas version in use
    raw_abund_df += min_vals
    sum_columns_list = raw_abund_df.sum(0).values.tolist()
    #abund_df = abund_df.apply(lambda x: x / abund_df.sum(1))
    ## Divide each column by its sum so every column totals 1.0
    abund_df = raw_abund_df / sum_columns_list

    ## Run various checks on the input files. Exit printing errors.
    err = vet_data(meta_df)
    if err != '':
        return go_list, err

    ## Create a list of "row names" from the various DataFrames to use as
    ## indices or column names
    index_ids = list([i for i in abund_df.index])
    sample_ids = list([i for i in meta_df.index])
    class_ids = list(set([ c[0] for c in meta_df.values.tolist() ]))

    ## Check to see if the user supplied control_id exists in the metafile
    control_id = str(control_id)
    if control_id not in class_ids:
        err  = "control_id = '%s' not found in metafile " % control_id
        err += "from available class_ids: %s" % (','.join(class_ids))
        return go_list, err

    ## Assign the disease_id to the opposite of the control_id (there should
    ## only be 2 unique class ids listed in the metafile!)
    disease_id = [ c for c in class_ids if not c == control_id ][0]

    ## Create a dicionary containing the class_ids as the keys and the 
    ## sample_ids as the values.
    ## E.g., {'Control': ['S1', 'S2'], 'Disease': ['S3', 'S4']}
    ## NOTE(review): `meta_df.ix[s] == c` compares a whole row to a scalar —
    ## presumably the metafile has a single class column; confirm.
    sample_dict = {}
    for c in class_ids:
        sample_dict[c] = [ s for s in sample_ids if meta_df.ix[s] == c ]

    ## Create separate DataFrame for the "control" and "disease" samples
    control_df = abund_df[[ s for s in sample_dict[control_id] ]]
    disease_df = abund_df[[ s for s in sample_dict[disease_id] ]]

    ## Calculate the odds ratio, log2(odds ratio), and update the DataFrame
    odds_ratio = statistics.calc_odds_ratio(control_df, disease_df)
    print "100% ODDS_RATIO"
    print odds_ratio
    print "-"*30
    log2_OR = np.log2(odds_ratio)
    ## FIXME: Figure out how to round values in a Series
    #odds_ratio = Series([ round(i, 6) for i in odds_ratio.tolist() ])
    #log2_OR = Series([ round(i, 6) for i in log2_OR.tolist() ])
    abund_df['odds_ratio'] = odds_ratio
    abund_df['log2_OR'] = log2_OR
    #print "OR>=1.0: ",sum(log2_OR.dropna() >= 1.0) ## DEBUG

    ## Perform whichever tests/statistics are defined in 'which_statistics'
    ## (e.g., t-Test, Wilcoxon, mean ratios, etc.) and update the DataFrame
    ## FIXME: This is a _REALLY_ bad way to do this!
    ## Each statistic is applied row-wise; the helper receives the control
    ## and disease column names to split each row into the two groups.
    if 'ttest' in which_statistics:
        results = abund_df.apply(statistics.calc_ttest,
                                control=list(control_df.columns), 
                                disease=list(disease_df.columns), 
                                axis=1)
        abund_df['ttest'] = results['ttest']

    if 'wilcoxon' in which_statistics:
        results = abund_df.apply(statistics.calc_wilcoxon,
                                control=list(control_df.columns), 
                                disease=list(disease_df.columns), 
                                axis=1)
        abund_df['wilcoxon'] = results['wilcoxon']

    if 'ranksums' in which_statistics:
        results = abund_df.apply(statistics.calc_ranksums,
                                control=list(control_df.columns), 
                                disease=list(disease_df.columns), 
                                axis=1)
        abund_df['ranksums'] = results['ranksums']

    if 'mean_ratio' in which_statistics:
        results = abund_df.apply(statistics.calc_mean_ratio,
                                control=list(control_df.columns), 
                                disease=list(disease_df.columns), 
                                axis=1)
        abund_df['mean_ratio'] = results['mean_ratio']

    ## DataFrame containing all of the statistics run on the abund_filename
    stats_df = abund_df[which_statistics]

    ## Save statistics DataFrame to file
    csv_io.data_frame_to_csv(stats_df, stats_file)

    ## Save list of GOs to file (one GO per line)
    #f = open(out_go_list_filename, 'w')
    #f.write('\n'.join(index_ids))
    #f.close()

    ## Return go_list and, hopefully, an empty err string
    return list(index_ids), err
Exemplo n.º 3
0
import csv_io
## Scratch/demo snippet: loads 'chk.tab' twice — once as a DataFrame, once
## as a plain list of floats — and plots the list.
dat = csv_io.csv_to_data_frame('chk.tab')
lines = open('chk.tab','r').readlines()
## Re-bind `dat` to one float per line of the same file
dat = list([ float(l.strip()) for l in lines ])
## Bare expressions — only meaningful in an interactive (REPL) session
dat[0]
len(dat)
## NOTE(review): `plt` is not imported here — presumably matplotlib.pyplot
## from the interactive session this snippet was pasted from
plt.boxplot(dat)
plt.errorbar(dat)
def calc_hypergeometric(pathway_file, stats_file, go_list, threshold_go_list,
                        pathway_ids_list):
    """
    Calculates the hypergeometric probability mass function evaluated at k

    N = number of GOs in study
    n = number of GOs in given pathway
    m = number of disease-associated GOs
    k = number of disease-associated GOs in given pathway

    Returns a tuple (pvalues_df, err): on success a DataFrame indexed by
    pathway id with columns N, n, m, k, p_upper, p_lower, pvalue and an
    empty error string; on failure ('', error message).
    """
    ## Empty-string placeholders returned when validation fails
    pvalues_df = ''
    err = ''

    ## Load the pathway membership matrix and the per-GO statistics table
    pathway_df = csv_io.csv_to_data_frame(pathway_file, delimiter=',')
    stats_df = csv_io.csv_to_data_frame(stats_file)

    ## DEBUG
    #combined_full_df = stats_df.combineAdd(pathway_df.ix[list(go_list)])
    #print combined_full_df.to_string()

    ## Every requested pathway id must be a column of the pathway matrix
    known_ids = set(pathway_df.columns.tolist())
    unknown_ids = [pw for pw in pathway_ids_list if pw not in known_ids]
    if len(unknown_ids) != 0:
        err = "ERROR: %s does not contain pathway_ids: %s" % (
               pathway_file, unknown_ids)
        return pvalues_df, err

    ## N = number of GOs in the study
    N = len(go_list)

    ## m = number of disease-associated GOs (rows of the stats table whose
    ## chosen statistic was above the threshold)
    m = len(stats_df.ix[threshold_go_list])

    ## Column-oriented accumulator for the per-pathway results
    hyper_dict = {'N': [], 'n': [], 'm': [], 'k': [], 
                  'p_upper': [], 'p_lower': [], 'pvalue': []}

    for pw_id in pathway_ids_list:
        ## k = disease-associated GOs in pathway pw_id
        #print pathway_df.ix[threshold_go_list][pw_id]
        k = int(pathway_df.ix[threshold_go_list][pw_id].sum())

        ## n = total GOs in pathway pw_id
        n = int(pathway_df.ix[go_list][pw_id].sum())

        ## Upper tail P(X >= k), lower tail P(X <= k); report the smaller
        p_upper = float(sum(hypergeom.pmf(range(k, min(m, n) + 1), N, m, n)))
        p_lower = float(1 - p_upper + hypergeom.pmf(k, N, m, n))
        pvalue = min(p_upper, p_lower)

        ## Append this pathway's numbers to every accumulator column
        for col, val in (('N', N), ('n', n), ('m', m), ('k', k),
                         ('p_upper', p_upper), ('p_lower', p_lower),
                         ('pvalue', pvalue)):
            hyper_dict[col].append(val)

        ## DEBUG
        #test = sum(hypergeom.pmf(range(0,k+1), N, m, n))
        #print "[%s] f(%s; %s, %s, %s) = %s vs. %s (%s)" % (
        #    pw_id, k, N, m, n, p_upper, p_lower, test)

    ## Format dictionary as a DataFrame
    pvalues_df = pd.DataFrame(hyper_dict, index=pathway_ids_list)

    ## Save DataFrame to output filename
    #pvalues_df.to_csv('path_pvals.dat', na_rep='NaN', sep='\t')

    return pvalues_df, err