예제 #1
0
def main(args):
    """Compute per-group CVs, plot them and flag features with a big CV.

    :Arguments:
        :type args: argparse.Namespace (or compatible)
        :param args: Parsed command line options. Attributes read here:
                     input, design, uniqID, group, levels, CVcutoff, figure
                     and flag.

    Writes a multi-page PDF to ``args.figure`` and a tab-separated flag file
    to ``args.flag``.
    """
    # Build the list of design columns used for coloring: the group first,
    # followed by any extra levels. Without a group nothing is used.
    if args.levels and args.group:
        levels = [args.group] + args.levels
    elif args.group and not args.levels:
        levels = [args.group]
    else:
        levels = []
    logger.info(u"Groups used to color by: {0}".format(",".join(levels)))

    # Import data
    dat = wideToDesign(args.input,
                       args.design,
                       args.uniqID,
                       group=args.group,
                       anno=args.levels,
                       logger=logger)

    # Remove groups with just one element
    dat.removeSingle()

    # Cleaning from missing data
    dat.dropMissing()

    # Round the data to 3 digits. DataFrame.round is the vectorised form of
    # the former element-wise applymap(round) call.
    dat.wide = dat.wide.round(3)

    # Get colors
    palette.getColors(dat.design, levels)

    # Use group separation or not depending on user input
    CV, CVcutoff = calculateCV(data=dat.wide,
                               design=palette.design,
                               cutoff=args.CVcutoff,
                               levels=palette.combName)

    # Plot CVplots for each group and a distribution plot for all groups together
    logger.info("Plotting Data")
    with PdfPages(args.figure) as pdf:
        plotCVplots(data=CV, cutoff=CVcutoff, palette=palette, pdf=pdf)
        plotDistributions(data=CV, cutoff=CVcutoff, palette=palette, pdf=pdf)

    # Create flag file instance and output flags by group
    logger.info("Creatting Flags")
    flag = Flags(index=CV['cv'].index)
    for name, _ in palette.design.groupby(palette.combName):
        groupCV = CV['cv_' + name]
        # A feature is flagged when its CV exceeds the group cutoff or when
        # the CV is missing. ``.values`` replaces Series.get_values(), which
        # was removed in pandas 1.0.
        flag.addColumn(column="flag_feature_big_CV_{0}".format(name),
                       mask=((groupCV.values > CVcutoff[name])
                             | groupCV.isnull()))

    # Write flag file
    flag.df_flags.to_csv(args.flag, sep='\t')

    # Finishing script
    logger.info("Script Complete!")
def main(args):
    """Compare group means against a blank-based limit of detection (LOD).

    :Arguments:
        :type args: argparse.Namespace (or compatible)
        :param args: Parsed command line options. Attributes read here:
                     input, design, uniqID, group, blank, bff, criteria,
                     outbff and outflags.

    :Raises:
        ValueError: if no group in the design equals ``args.blank``.

    Writes the BFF values to ``args.outbff`` and the flags to
    ``args.outflags`` (both tab-separated).
    """
    # Importing data
    logger.info("Importing data with the Interface")
    dat = wideToDesign(args.input,
                       args.design,
                       args.uniqID,
                       args.group,
                       logger=logger)

    # Cleaning from missing data
    dat.dropMissing()

    # Calculate the means of each group but blanks; the blank samples are
    # kept apart because the LOD is derived from them.
    logger.info("Calcualting group means")
    df_blank = None
    df_nobMeans = pd.DataFrame(index=dat.wide.index)
    for name, group in dat.design.groupby(dat.group):
        if name == args.blank:
            df_blank = dat.wide[group.index].copy()
        else:
            df_nobMeans[name] = dat.wide[group.index].mean(axis=1)

    # Without blank samples there is no LOD; fail with an explicit message
    # instead of an unbound-variable error further down.
    if df_blank is None:
        message = "Blank group [{0}] was not found in the design.".format(
            args.blank)
        logger.error(message)
        raise ValueError(message)

    # Calculating the LOD: average of the blanks plus 3 times their SD.
    # Where the calculated value is 0 the default LOD (args.bff) is used
    # instead. The LOD is kept as its own Series so the replacement does not
    # depend on an in-place update of a column selection.
    logger.info(
        "Calculating limit of detection for each group default value [{0}].".
        format(args.bff))
    lod = np.average(df_blank, axis=1) + (3 * np.std(df_blank, ddof=1, axis=1))
    lod = lod.where(lod != 0, args.bff)

    # Apply the limit of detection to the rest of the data; these values are
    # compared against the criteria value for flagging.
    logger.info(
        "Comparing value of limit of detection to criteria [{0}].".format(
            args.criteria))
    nob_bff = df_nobMeans.sub(lod, axis=0).div(lod, axis=0)

    # We create flags based on the criteria value (user customizable)
    logger.info("Creating flags.")
    df_offFlags = Flags(index=nob_bff.index)
    for group in nob_bff:
        # format() keeps this working when group labels are not strings
        df_offFlags.addColumn(column='flag_bff_{0}_off'.format(group),
                              mask=(nob_bff[group] < args.criteria))

    # Output BFF values and flags
    nob_bff.to_csv(args.outbff, sep='\t')
    df_offFlags.df_flags.to_csv(args.outflags, sep='\t')
    logger.info("Script Complete!")
def main(args):
    """Adjust a column of p-values with three multiple-testing corrections.

    :Arguments:
        :type args: argparse.Namespace (or compatible)
        :param args: Parsed command line options. Attributes read here:
                     input, uniqID, pval, alpha, outadjusted and flags.

    The adjusted p-values (Bonferroni, Benjamini/Hochberg and
    Benjamini/Yekutieli) are written to ``args.outadjusted`` and the
    significance flags to ``args.flags`` (both tab-separated).
    """
    # Load the table, index it by the unique ID and pull out the raw p-values
    logger.info("Importing data")
    toCorrect_df = pd.read_csv(args.input, sep="\t")
    toCorrect_df.set_index(args.uniqID, inplace=True)
    rawPvals = toCorrect_df[args.pval].values

    # (column suffix, statsmodels method) for every correction, in output order
    # http://statsmodels.sourceforge.net/devel/generated/statsmodels.sandbox.stats.multicomp.multipletests.html
    corrections = (
        ("_bonferroni", "bonferroni"),
        ("_bHochberg", "fdr_bh"),
        ("_bYekutieli", "fdr_by"),
    )

    logger.info("Runnig corrections")
    adjusted = []
    for suffix, method in corrections:
        # Element [1] of the multipletests result holds the corrected p-values
        corrected = stm.multipletests(rawPvals,
                                      alpha=args.alpha,
                                      returnsorted=False,
                                      method=method)[1]
        adjusted.append(pd.Series(corrected,
                                  name=args.pval + suffix,
                                  index=toCorrect_df.index))

    # One flag column per correction: adjusted p-value below alpha
    logger.info("Getting Flags")
    significance_flags = Flags(index=toCorrect_df.index)
    for series in adjusted:
        columnName = "flag_{0}_significant".format(series.name)
        significance_flags.addColumn(column=columnName,
                                     mask=(series < args.alpha))

    # Original p-values followed by the adjusted ones
    results = pd.concat([toCorrect_df[args.pval]] + adjusted, axis=1)

    # Saving data
    logger.info("Saving results and flags")
    results.to_csv(args.outadjusted, sep="\t")
    significance_flags.df_flags.to_csv(args.flags, sep="\t")
    logger.info("Script Complete!")
예제 #4
0
def main(args):
    """Merge several flag files into a single tab-separated file.

    :Arguments:
        :type args: argparse.Namespace (or compatible)
        :param args: Parsed command line options. Attributes read here:
                     flagFiles, filename, flagUniqID and mergedFile.

    When ``args.filename`` is given, each cleaned name is appended to the
    column names of the matching flag file; otherwise the column names are
    left as they are.
    """
    # Need to take each arg and turn into data frame and add to new list
    flagDataFrameList = []
    logger.info("Importing data")

    # A single comma-separated argument is split into the list of files
    if ',' in args.flagFiles[0]:
        args.flagFiles = args.flagFiles[0].split(',')
    print(args.flagFiles)

    if args.filename:
        filenames = [cleanStr(x=fname) for fname in args.filename]
        print(filenames)
    else:
        # No names supplied: pair every file with None so the loop below
        # still runs and simply skips the renaming step.
        filenames = [None] * len(args.flagFiles)

    for flagFile, filename in zip(args.flagFiles, filenames):
        dataFrame = pd.read_table(flagFile)
        if args.flagUniqID:
            try:
                dataFrame.set_index(args.flagUniqID, inplace=True)
            except KeyError:
                # Missing ID column: report it and keep the default index
                logger.error("Index {0} does not exist on file.".format(
                    args.flagUniqID))
        if filename is not None:
            dataFrame.columns = [
                name + "_" + filename for name in dataFrame.columns
            ]
        flagDataFrameList.append(dataFrame)

    mergedFlags = Flags.merge(flagDataFrameList)

    # NOTE: Pandas cannot store NANs as an int. If there are NANs from the
    # merge, then the column becomes a float. Here I change the float output to
    # look like an int.
    mergedFlags.to_csv(args.mergedFile, float_format='%.0f', sep='\t')
    logger.info("Script Complete!")
def saveFlags(count):
    """
    Build the digit-count flag and write it to disk.

        :Arguments:
            :type count: pandas.DataFrame.
            :param count: DataFrame with the counted digits; only its index
                            and its "diff" column are read here.

    The output path is taken from the module-level ``args.flags``.
    """
    digitFlags = Flags(index=count.index)

    # A row is flagged when its "diff" value is 2 or more
    tooDifferent = count["diff"] >= 2
    digitFlags.addColumn(column="flag_feature_count_digits",
                         mask=tooDifferent)

    # Write the flags as a tab-separated file
    outPath = os.path.abspath(args.flags)
    digitFlags.df_flags.to_csv(outPath, sep="\t")
def main(args):
    """Flag features that are mostly below a cutoff within each group.

    :Arguments:
        :type args: argparse.Namespace (or compatible)
        :param args: Parsed command line options. Attributes read here:
                     input, design, uniqID, group, cutoff and output.

    Writes one ``flag_feature_<group>_off`` column per group to
    ``args.output`` (tab-separated).
    """
    # Import data
    logger.info("Importing data with the interface")
    dat = wideToDesign(args.input, args.design, args.uniqID)

    # Cleaning from missing data
    dat.dropMissing()

    # Iterate through each group and flag a feature as "off" when more than
    # half of the group's samples are below the cutoff.
    logger.info("Running threshold based flags")
    df_offFlags = Flags(index=dat.wide.index)
    for title, group in dat.design.groupby(args.group):
        belowCutoff = (dat.wide[group.index] < args.cutoff)
        # Mean of a boolean mask = fraction of samples below the cutoff
        fractionBelow = belowCutoff.mean(axis=1)
        # format() keeps this working when group labels are not strings
        df_offFlags.addColumn(column='flag_feature_{0}_off'.format(title),
                              mask=fractionBelow > 0.5)

    logger.info("Creating output")
    df_offFlags.df_flags.to_csv(args.output, sep="\t")
예제 #7
0
def main(args):
    """Merge several flag files into a single tab-separated file.

    :Arguments:
        :type args: argparse.Namespace (or compatible)
        :param args: Parsed command line options. Attributes read here:
                     flagFiles, filename, flagUniqID and mergedFile.

    When ``args.filename`` is given, each cleaned name is appended to the
    column names of the matching flag file; otherwise the column names are
    left as they are.
    """
    # Need to take each arg and turn into data frame and add to new list
    flagDataFrameList = []
    logger.info("Importing data")

    # Check for commas, commas are used in galaxy. If there are commas separate
    # the list by commas
    if ',' in args.flagFiles[0]:
        args.flagFiles = args.flagFiles[0].split(',')

        print(args.flagFiles)

    # If args.filename is provided then use it to add its name to column names
    # This parameter should be used only on galaxy
    if args.filename:
        # Cleaning weird characters on file names and replacing them with '_'.
        filenames = [cleanStr(x=fname) for fname in args.filename]

        print(filenames)
    else:
        # No names supplied: pair every file with None so the loop below
        # still runs and simply skips the renaming step.
        filenames = [None] * len(args.flagFiles)

    # Convert files into dataframes and populate into new list
    for flagFile, filename in zip(args.flagFiles, filenames):
        # Read table
        dataFrame = pd.read_table(flagFile)

        # Flag uniqID
        if args.flagUniqID:
            try:
                dataFrame.set_index(args.flagUniqID, inplace=True)
            except KeyError:
                # Missing ID column: report it and keep the default index
                logger.error("Index {0} does not exist on file.".format(
                    args.flagUniqID))

        # Tag the columns with the file name only when one was supplied
        if filename is not None:
            dataFrame.columns = [
                name + "_" + filename for name in dataFrame.columns
            ]

        # List of frame
        flagDataFrameList.append(dataFrame)

    # Merge flags using Flags class
    mergedFlags = Flags.merge(flagDataFrameList)

    # Export merged flags
    # NOTE: Pandas cannot store NANs as an int. If there are NANs from the
    # merge, then the column becomes a float. Here I change the float output to
    # look like an int.
    mergedFlags.to_csv(args.mergedFile, float_format='%.0f', sep='\t')
    logger.info("Script Complete!")
예제 #8
0
def main(args):
    """Regress every feature on run order, then plot and flag the results.

    :Arguments:
        :type args: argparse.Namespace (or compatible)
        :param args: Parsed command line options. Attributes read here:
                     input, design, uniqID, group, order, levels, figure,
                     table and flags.

    Writes a PDF to ``args.figure``, the regression table to ``args.table``
    and the flags to ``args.flags``.
    """
    # Design columns used for coloring: the group plus any extra levels,
    # or nothing at all when no group was given.
    if not args.group:
        levels = []
    elif args.levels:
        levels = [args.group] + args.levels
    else:
        levels = [args.group]

    # Load the wide and design files through the interface
    logger.info("Loading data with the Interface")
    dat = wideToDesign(args.input,
                       args.design,
                       args.uniqID,
                       args.group,
                       runOrder=args.order,
                       anno=args.levels,
                       logger=logger)

    # Drop missing data
    dat.dropMissing()

    # Assign colors
    palette.getColors(dat.design, levels)

    # Transposed data: run order becomes the index and the group column is
    # removed from the transposed table.
    trans = dat.transpose()
    trans.set_index(dat.runOrder, inplace=True)
    trans.drop(dat.group, axis=1, inplace=True)

    # Fit the regressions
    logger.info("Running Regressions")
    ror_df = runRegression(trans)

    # One flag column per p-value threshold (0.05 and 0.01)
    ror_flags = Flags(index=ror_df.index)
    thresholds = (("flag_feature_runOrder_pval_05", 0.05),
                  ("flag_feature_runOrder_pval_01", 0.01))
    for columnName, pvalLimit in thresholds:
        ror_flags.addColumn(column=columnName,
                            mask=(ror_df["pval"] <= pvalLimit))

    # Multi-page PDF with the plots
    logger.info("Plotting Results")
    with PdfPages(args.figure) as pdf:
        plotSignificantROR(ror_df, pdf, palette)

        # Nothing was plotted: add a single page saying so
        if not pdf.get_pagecount():
            fig = plt.figure()
            fig.text(0.5,
                     0.4,
                     "There were no features significant for plotting.",
                     fontsize=12)
            pdf.savefig(fig)

    # Write results and flags to TSV files
    ror_df.to_csv(args.table,
                  sep="\t",
                  float_format="%.4f",
                  index_label=args.uniqID,
                  columns=["pval", "rsq", "slope"])
    ror_flags.df_flags.to_csv(args.flags, sep="\t", index_label=args.uniqID)
예제 #9
0
def main(args):
    """Summarize a flag file with per-row sum, mean, any and all columns.

    :Arguments:
        :type args: argparse.Namespace (or compatible)
        :param args: Parsed command line options. Attributes read here:
                     flagFile and outSummary.

    Writes the input flags plus the four summary columns to
    ``args.outSummary`` (tab-separated).
    """
    # Convert flag file to DataFrame, using the first column as index.
    # pd.DataFrame.from_csv was removed in pandas 1.0; read_csv with
    # index_col=0 is its replacement.
    # NOTE(review): from_csv also applied parse_dates=True to the index; that
    # is not carried over here, assuming the index holds identifiers rather
    # than dates -- confirm against real flag files.
    df_inp_flags = pd.read_csv(args.flagFile, sep='\t', index_col=0)

    # Creating flag object
    summaryFlags = Flags(index=df_inp_flags.index)

    # Get flags for sum, mean, any and all
    logger.info("Creating flags")
    summaryFlags.addColumn("flag_sum", df_inp_flags.sum(axis=1))
    # The mean is written straight into df_flags instead of going through
    # addColumn, as in the original implementation.
    summaryFlags.df_flags.loc[:, "flag_mean"] = df_inp_flags.mean(axis=1)
    summaryFlags.addColumn("flag_any_off", df_inp_flags.any(axis=1))
    summaryFlags.addColumn("flag_all_off", df_inp_flags.all(axis=1))

    # Concatenate flags and summary flags
    offFlags_df = pd.concat([df_inp_flags, summaryFlags.df_flags], axis=1)

    # Output flags
    offFlags_df.to_csv(args.outSummary, sep='\t')

    # Finishing script
    logger.info("Script complete.")
예제 #10
0
def main(args):
    """Run pairwise Bland-Altman comparisons and write plots and flags.

    :Arguments:
        :type args: argparse.Namespace (or compatible)
        :param args: Parsed command line options. Attributes read here:
                     input, design, uniqID, group, processOnly, baName,
                     distName, sampleCutoff, featureCutoff, flagSample and
                     flagFeature.

    Writes the per-pair plots to ``args.baName``, the flag distribution plot
    to ``args.distName`` and the sample/feature flags to ``args.flagSample``
    and ``args.flagFeature``.
    """
    # Import data
    dat = wideToDesign(args.input,
                       args.design,
                       args.uniqID,
                       args.group,
                       logger=logger)

    # Get a list of samples to process, if processOnly is specified only
    # analyze specified group.
    if args.processOnly:
        dat.design = dat.design[dat.design[args.group].isin(args.processOnly)]
        toProcess = dat.design.index
        dat.sampleIDs = toProcess.tolist()

    # Create dataframe with sampleIDs that are to be analyzed.
    dat.keep_sample(dat.sampleIDs)

    # Get list of pairwise combinations. If group is specified, only do
    # within group combinations.
    combos = []
    if args.group:
        # If group is given, only do within group pairwise combinations
        logger.info('Only doing within group, pairwise comparisons.')
        for _, dfGroup in dat.design.groupby(dat.group):
            combos.extend(combinations(dfGroup.index, 2))
    else:
        logger.info('Doing all pairwise comparisons. This could take a while!')
        # Get all pairwise combinations for all samples
        combos.extend(combinations(dat.sampleIDs, 2))

    # Loop over combinations, generate plots and collect the flags.
    # The list is built eagerly while the PDF is still open: on Python 3 a
    # bare map() is lazy, so the plotting would otherwise only happen when
    # the flags are merged, after the PDF has been closed.
    logger.info('Generating flags and plots.')
    with PdfPages(args.baName) as ppBA:
        flags = [iterateCombo(dat, combo, ppBA) for combo in combos]

    # Merge flags
    logger.info('Merging outlier flags.')
    merged = Flags.merge(flags)

    # Summarize flags
    logger.info('Summarizing outlier flags.')
    (propSample, propFeature, propSample_p, propFeature_p, propSample_c,
     propFeature_c, propSample_d, propFeature_d) = summarizeFlags(
         dat, merged, combos)
    plotFlagDist(propSample, propFeature, args.distName)

    # Create sample level flags
    flag_sample = Flags(index=dat.sampleIDs)
    flag_sample.addColumn(column='flag_sample_BA_outlier',
                          mask=(propSample >= args.sampleCutoff))
    flag_sample.addColumn(column='flag_sample_BA_pearson',
                          mask=(propSample_p >= args.sampleCutoff))
    flag_sample.addColumn(column='flag_sample_BA_cooks',
                          mask=(propSample_c >= args.sampleCutoff))
    flag_sample.addColumn(column='flag_sample_BA_dffits',
                          mask=(propSample_d >= args.sampleCutoff))
    flag_sample.df_flags.index.name = "sampleID"
    flag_sample.df_flags.to_csv(args.flagSample, sep='\t')

    # Create metabolite level flags
    flag_metabolite = Flags(dat.wide.index)
    flag_metabolite.addColumn(column='flag_feature_BA_outlier',
                              mask=(propFeature >= args.featureCutoff))
    flag_metabolite.addColumn(column='flag_feature_BA_pearson',
                              mask=(propFeature_p >= args.featureCutoff))
    flag_metabolite.addColumn(column='flag_feature_BA_cooks',
                              mask=(propFeature_c >= args.featureCutoff))
    flag_metabolite.addColumn(column='flag_feature_BA_dffits',
                              mask=(propFeature_d >= args.featureCutoff))
    flag_metabolite.df_flags.to_csv(args.flagFeature, sep='\t')

    # Finish Script
    logger.info("Script Complete!")
예제 #11
0
def iterateCombo(dat, combo, pdf):
    """ Plot one pair of samples and build the flags for that pair.

    :Arguments:
        :type dat: interface.wideToDesign
        :param dat: A wideToDesign object containing wide and design information.

        :param tuple combo: The pair of sample IDs to compare; the first two
                            elements are used.

        :type pdf: matplotlib.backends.backend_pdf.PdfPages
        :param pdf: Handler for multi-page PDF that will contain all plots.

    :Updates:
        :type pdf: matplotlib.backends.backend_pdf.PdfPages
        :param pdf: The figure built here is added to this PDF.

    :Returns:
        :rtype: the ``df_flags`` attribute of an interface.Flags object
        :return: One column per mask returned by makeBA (outlier, pearson,
                 cooks, dffits), indexed like ``dat.wide``.

    """

    # Sample IDs of the pair being compared
    c1, c2 = combo[0], combo[1]

    # Figure with two axes
    fh = figureHandler(proj='2d',
                       numAx=2,
                       numRow=2,
                       numCol=2,
                       arrangement=[(0, 0, 1, 2), (0, 1, 1, 2)])

    # First axis: scatter plot of c1 vs c2
    makeScatter(dat.wide.loc[:, c1], dat.wide.loc[:, c2], fh.ax[0], fh)

    # Second axis: BA plot of c1 vs c2, which also yields the four masks
    masks = makeBA(dat.wide.loc[:, c1], dat.wide.loc[:, c2], fh.ax[1], fh)
    outlier, pearson, cooks, dffits = masks

    # Title the figure
    fh.formatAxis(figTitle=buildTitle(dat, c1, c2))

    # Layout adjustments, then send the figure to the PDF
    plt.tight_layout(pad=2, w_pad=.05)
    fh.shrink(top=.85, bottom=.25, left=.15, right=.9)
    fh.addToPdf(dpi=90, pdfPages=pdf)

    # One flag column per mask, named after the pair of samples
    pairFlags = Flags(index=dat.wide.index)
    columnTemplates = ('flag_{0}_{1}',
                       'flag_pearson_{0}_{1}',
                       'flag_cooks_{0}_{1}',
                       'flag_dffits_{0}_{1}')
    for template, mask in zip(columnTemplates,
                              (outlier, pearson, cooks, dffits)):
        pairFlags.addColumn(column=template.format(c1, c2), mask=mask)

    return pairFlags.df_flags