def main(args):
    """Flag features with a large coefficient of variation (CV).

    Loads the wide/design data, computes per-group CVs, plots them to a
    multi-page PDF, and writes a flag file marking features whose CV
    exceeds the per-group cutoff (or is missing).

    :param args: parsed command-line arguments (input, design, uniqID,
        group, levels, CVcutoff, figure, flag).
    """
    # Build the list of annotation levels used for coloring.
    if args.levels and args.group:
        levels = [args.group] + args.levels
    elif args.group and not args.levels:
        levels = [args.group]
    else:
        levels = []
    logger.info(u"Groups used to color by: {0}".format(",".join(levels)))

    # Import data
    dat = wideToDesign(args.input, args.design, args.uniqID, group=args.group,
                       anno=args.levels, logger=logger)

    # Remove groups with just one element
    dat.removeSingle()

    # Cleaning from missing data
    dat.dropMissing()

    # Treat everything as float and round it to 3 digits
    dat.wide = dat.wide.applymap(lambda x: round(x, 3))

    # Get colors
    palette.getColors(dat.design, levels)

    # Use group separation or not depending on user input
    CV, CVcutoff = calculateCV(data=dat.wide, design=palette.design,
                               cutoff=args.CVcutoff, levels=palette.combName)

    # Plot CV plots for each group and a distribution plot for all groups
    logger.info("Plotting Data")
    with PdfPages(args.figure) as pdf:
        plotCVplots(data=CV, cutoff=CVcutoff, palette=palette, pdf=pdf)
        plotDistributions(data=CV, cutoff=CVcutoff, palette=palette, pdf=pdf)

    # Create flag file instance and output flags by group.
    # (Typo "Creatting" fixed in the log message.)
    logger.info("Creating Flags")
    flag = Flags(index=CV['cv'].index)
    for name, group in palette.design.groupby(palette.combName):
        # .values replaces .get_values(), which was deprecated in pandas
        # 0.25 and removed in 1.0. A missing CV also gets flagged.
        flag.addColumn(column="flag_feature_big_CV_{0}".format(name),
                       mask=((CV['cv_' + name].values > CVcutoff[name]) |
                             CV['cv_' + name].isnull()))

    # Write flag file
    flag.df_flags.to_csv(args.flag, sep='\t')

    # Finishing script
    logger.info("Script Complete!")
def main(args):
    """Apply a blank feature filter (BFF) and flag low-signal features.

    Computes a limit of detection (LOD) from the blank samples, compares
    each non-blank group mean against it, and writes both the BFF values
    and the resulting flags to TSV files.

    :param args: parsed command-line arguments (input, design, uniqID,
        group, blank, bff, criteria, outbff, outflags).
    """
    # Importing data
    logger.info("Importing data with the Interface")
    dat = wideToDesign(args.input, args.design, args.uniqID, args.group,
                       logger=logger)

    # Cleaning from missing data
    dat.dropMissing()

    # Calculate the means of each group except the blanks.
    # (Typo "Calcualting" fixed in the log message.)
    logger.info("Calculating group means")
    df_nobMeans = pd.DataFrame(index=dat.wide.index)
    df_blank = None
    for name, group in dat.design.groupby(dat.group):
        if name == args.blank:
            df_blank = dat.wide[group.index].copy()
        else:
            df_nobMeans[name] = dat.wide[group.index].mean(axis=1)

    # Originally a missing blank group surfaced later as an opaque
    # NameError; fail early with a clear message instead.
    if df_blank is None:
        raise ValueError("Blank group '{0}' was not found in the design "
                         "file.".format(args.blank))

    # Calculating the LOD:
    # average of the blanks plus 3 times the SD of the same. If the value
    # calculated is 0 then use the default LOD (args.bff, default 5000).
    # NOTE: the ["lod"] != 0 condition means everything that is not 0 is
    # fine and should remain as it is; everything that is 0 is replaced.
    # http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.where.html
    logger.info(
        "Calculating limit of detection for each group default value [{0}].".
        format(args.bff))
    df_blank.loc[:, "lod"] = np.average(
        df_blank, axis=1) + (3 * np.std(df_blank, ddof=1, axis=1))
    df_blank["lod"].where(df_blank["lod"] != 0, args.bff, inplace=True)

    # Apply the limit of detection to the rest of the data; these values
    # will be compared against the criteria value for flagging.
    logger.info(
        "Comparing value of limit of detection to criteria [{0}].".format(
            args.criteria))
    nob_bff = pd.DataFrame(index=dat.wide.index, columns=df_nobMeans.columns)
    for group in nob_bff:
        nob_bff.loc[:, group] = (df_nobMeans[group] -
                                 df_blank["lod"]) / df_blank["lod"]

    # Create flags based on the (user customizable) criteria value.
    logger.info("Creating flags.")
    df_offFlags = Flags(index=nob_bff.index)
    for group in nob_bff:
        df_offFlags.addColumn(column='flag_bff_' + group + '_off',
                              mask=(nob_bff[group] < args.criteria))

    # Output BFF values and flags
    nob_bff.to_csv(args.outbff, sep='\t')
    df_offFlags.df_flags.to_csv(args.outflags, sep='\t')
    logger.info("Script Complete!")
def main(args):
    """Apply multiple-testing corrections to a p-value column.

    Computes Bonferroni, Benjamini/Hochberg, and Benjamini/Yekutieli
    adjusted p-values, flags significant features, and writes both the
    adjusted values and the flags to TSV files.

    :param args: parsed command-line arguments (input, uniqID, pval,
        alpha, outadjusted, flags).
    """
    # Reading input data, set the unique ID as index; correct just the
    # p-value column.
    logger.info("Importing data")
    toCorrect_df = pd.read_csv(args.input, sep="\t")
    toCorrect_df.set_index(args.uniqID, inplace=True)
    justPvals = toCorrect_df[args.pval].values

    # Run Bonferroni, Benjamini/Hochberg, and Benjamini/Yekutieli.
    # alpha is the FWER (family-wise error rate), e.g. 0.1.
    # http://statsmodels.sourceforge.net/devel/generated/statsmodels.sandbox.stats.multicomp.multipletests.html
    # (Typo "Runnig" fixed in the log message; the three near-identical
    # multipletests calls are collapsed into one loop.)
    logger.info("Running corrections")
    corrections = []
    for method, suffix in [("bonferroni", "_bonferroni"),
                           ("fdr_bh", "_bHochberg"),
                           ("fdr_by", "_bYekutieli")]:
        adjusted = stm.multipletests(justPvals, alpha=args.alpha,
                                     returnsorted=False, method=method)[1]
        corrections.append(pd.Series(adjusted, name=args.pval + suffix,
                                     index=toCorrect_df.index))

    # Create a Flags object with one column per correction.
    logger.info("Getting Flags")
    significance_flags = Flags(index=toCorrect_df.index)
    for test in corrections:
        significance_flags.addColumn(
            column="flag_{0}_significant".format(test.name),
            mask=(test < args.alpha))

    # Concatenate the adjusted results with the original p-values.
    results = pd.concat([toCorrect_df[args.pval]] + corrections, axis=1)

    # Saving data
    logger.info("Saving results and flags")
    results.to_csv(args.outadjusted, sep="\t")
    significance_flags.df_flags.to_csv(args.flags, sep="\t")
    logger.info("Script Complete!")
def main(args):
    """Merge multiple flag files into a single TSV, suffixing columns.

    :param args: parsed command-line arguments (flagFiles, filename,
        flagUniqID, mergedFile).
    """
    # Convert each input file into a DataFrame and collect them here.
    flagDataFrameList = []
    logger.info("Importing data")

    # Galaxy passes the file list as one comma-separated string.
    if ',' in args.flagFiles[0]:
        args.flagFiles = args.flagFiles[0].split(',')
    print(args.flagFiles)

    # BUG FIX: `filenames` was only bound when args.filename was given,
    # causing a NameError in the zip() below otherwise. Fall back to the
    # flag file paths themselves as column suffixes.
    if args.filename:
        filenames = [cleanStr(x=fname) for fname in args.filename]
    else:
        filenames = [cleanStr(x=fname) for fname in args.flagFiles]
    print(filenames)

    for flagFile, filename in zip(args.flagFiles, filenames):
        # pd.read_table is deprecated; read_csv with sep='\t' is identical.
        dataFrame = pd.read_csv(flagFile, sep='\t')
        if args.flagUniqID:
            try:
                dataFrame.set_index(args.flagUniqID, inplace=True)
            except KeyError:
                # The bare except here used to swallow the error and keep
                # going with a mis-set index, producing a corrupt merge.
                logger.error("Index {0} does not exist on file.".format(
                    args.flagUniqID))
                raise
        dataFrame.columns = [name + "_" + filename
                             for name in dataFrame.columns]
        flagDataFrameList.append(dataFrame)

    mergedFlags = Flags.merge(flagDataFrameList)

    # NOTE: Pandas cannot store NANs as an int. If there are NANs from the
    # merge, then the column becomes a float. Here I change the float output
    # to look like an int.
    mergedFlags.to_csv(args.mergedFile, float_format='%.0f', sep='\t')
    logger.info("Script Complete!")
def saveFlags(count):
    """Build digit-count flags and write them to the flags file.

    :Arguments:
        :type count: pandas.DataFrame.
        :param count: DataFrame with the counted digits and the min, max
            and diff among rows.
    """
    # One flag column: rows whose digit-count spread ("diff") is 2+.
    digit_flags = Flags(index=count.index)
    digit_flags.addColumn(column="flag_feature_count_digits",
                          mask=count["diff"] >= 2)

    # Export the flags (args is the module-level parsed-arguments object).
    digit_flags.df_flags.to_csv(os.path.abspath(args.flags), sep="\t")
def main(args):
    """Flag each feature/group pair where most values fall below a cutoff.

    A group earns an '_off' flag for a feature when more than half of the
    group's samples are below args.cutoff.
    """
    # Load wide and design data through the interface.
    logger.info("Importing data with the interface")
    dat = wideToDesign(args.input, args.design, args.uniqID)

    # Drop rows with missing data.
    dat.dropMissing()

    # For every group, flag features where over half the samples sit
    # below the cutoff.
    logger.info("Running threshold based flags")
    df_offFlags = Flags(index=dat.wide.index)
    for title, group in dat.design.groupby(args.group):
        below_cutoff = dat.wide[group.index] < args.cutoff
        fraction_below = below_cutoff.mean(axis=1)
        df_offFlags.addColumn(column='flag_feature_' + title + '_off',
                              mask=fraction_below > 0.5)

    # Write the flag table.
    logger.info("Creating output")
    df_offFlags.df_flags.to_csv(args.output, sep="\t")
def main(args):
    """Merge multiple flag files into a single TSV, suffixing columns.

    :param args: parsed command-line arguments (flagFiles, filename,
        flagUniqID, mergedFile).
    """
    # Need to take each arg and turn into a data frame and add to new list
    flagDataFrameList = []
    logger.info("Importing data")

    # Check for commas; commas are used in galaxy. If there are commas,
    # separate the list by commas.
    if ',' in args.flagFiles[0]:
        args.flagFiles = args.flagFiles[0].split(',')
    print(args.flagFiles)

    # If args.filename is provided then use it to add its name to column
    # names. This parameter should be used only on galaxy.
    # BUG FIX: `filenames` was previously only bound inside the if-branch,
    # so a falsy args.filename crashed the zip() below with a NameError.
    if args.filename:
        # Cleaning weird characters on file names, replacing them with '_'.
        filenames = [cleanStr(x=fname) for fname in args.filename]
    else:
        filenames = [cleanStr(x=fname) for fname in args.flagFiles]
    print(filenames)

    # Convert files into dataframes and populate into new list
    for flagFile, filename in zip(args.flagFiles, filenames):
        # Read table (pd.read_table is deprecated; read_csv is identical).
        dataFrame = pd.read_csv(flagFile, sep='\t')

        # Flag uniqID
        if args.flagUniqID:
            try:
                dataFrame.set_index(args.flagUniqID, inplace=True)
            except KeyError:
                # The former bare except silently continued with the wrong
                # index, which misaligns the merge; log and re-raise.
                logger.error("Index {0} does not exist on file.".format(
                    args.flagUniqID))
                raise
        dataFrame.columns = [
            name + "_" + filename for name in dataFrame.columns
        ]

        # List of frames
        flagDataFrameList.append(dataFrame)

    # Merge flags using the Flags class
    mergedFlags = Flags.merge(flagDataFrameList)

    # Export merged flags
    # NOTE: Pandas cannot store NANs as an int. If there are NANs from the
    # merge, then the column becomes a float. Here I change the float output
    # to look like an int.
    mergedFlags.to_csv(args.mergedFile, float_format='%.0f', sep='\t')
    logger.info("Script Complete!")
def main(args):
    """Flag features whose intensity correlates with run order.

    Regresses each feature against run order, flags features with small
    p-values, plots the significant fits to a PDF, and writes the
    regression results and flags to TSV files.
    """
    # Build the list of annotation levels used for coloring.
    if args.levels and args.group:
        levels = [args.group]+args.levels
    elif args.group and not args.levels:
        levels = [args.group]
    else:
        levels = []

    # Parsing data with interface
    logger.info("Loading data with the Interface")
    dat = wideToDesign(args.input, args.design, args.uniqID, args.group,
                       runOrder=args.order, anno=args.levels, logger=logger)

    # Cleaning from missing data
    dat.dropMissing()

    # Get colors
    palette.getColors(dat.design,levels)

    # Transpose data so compounds are columns, set the runOrder as index
    # and drop the column with the groups from the transposed wide.
    trans = dat.transpose()
    trans.set_index(dat.runOrder, inplace=True)
    trans.drop(dat.group, axis=1, inplace=True)

    # Run regressions (one per feature, against run order).
    logger.info("Running Regressions")
    ror_df = runRegression(trans)

    # Creating flags for significant run-order p-values.
    # NOTE(review): the original comment said thresholds "0.05 and 0.1",
    # but the second cutoff below is 0.01 (and the column is named _01) —
    # confirm which cutoff is intended.
    ror_flags = Flags(index=ror_df.index)
    ror_flags.addColumn(column="flag_feature_runOrder_pval_05",
                        mask=(ror_df["pval"]<=0.05))
    ror_flags.addColumn(column="flag_feature_runOrder_pval_01",
                        mask=(ror_df["pval"]<=0.01))

    # Plot results: open a multi-page PDF for plots.
    logger.info("Plotting Results")
    with PdfPages(args.figure) as pdf:
        plotSignificantROR(ror_df, pdf, palette)

        # If nothing was significant, emit a placeholder page so the PDF
        # is never empty.
        if pdf.get_pagecount() == 0:
            fig = plt.figure()
            fig.text(0.5, 0.4,
                     "There were no features significant for plotting.",
                     fontsize=12)
            pdf.savefig(fig)

    # Write results and flags to TSV files.
    ror_df.to_csv(args.table, sep="\t", float_format="%.4f",
                  index_label=args.uniqID, columns=["pval","rsq","slope"])
    ror_flags.df_flags.to_csv(args.flags, sep="\t", index_label=args.uniqID)
def main(args):
    """Summarize a flag file with sum/mean/any/all columns.

    Reads an existing flag TSV, appends summary columns (row sum, mean,
    any-off, all-off), and writes the combined table to args.outSummary.
    """
    # Convert flag file to DataFrame.
    # BUG FIX: pd.DataFrame.from_csv was deprecated in pandas 0.21 and
    # removed in 1.0; read_csv with index_col=0 is the documented
    # replacement with the same behavior.
    df_inp_flags = pd.read_csv(args.flagFile, sep='\t', index_col=0)

    # Creating flag object
    offFlags_df = Flags(index=df_inp_flags.index)

    # Get flags for sum, mean, any and all
    logger.info("Creating flags")
    offFlags_df.addColumn("flag_sum", df_inp_flags.sum(axis=1))
    # flag_mean is assigned directly since it is not a boolean mask.
    offFlags_df.df_flags.loc[:, "flag_mean"] = df_inp_flags.mean(axis=1)
    offFlags_df.addColumn("flag_any_off", df_inp_flags.any(axis=1))
    offFlags_df.addColumn("flag_all_off", df_inp_flags.all(axis=1))

    # Concatenate original flags and summary flags
    offFlags_df = pd.concat([df_inp_flags, offFlags_df.df_flags], axis=1)

    # Output flags
    offFlags_df.to_csv(args.outSummary, sep='\t')

    # Finishing script
    logger.info("Script complete.")
def main(args):
    """Run pairwise Bland-Altman comparisons and flag outliers.

    Generates a BA plot per sample pair, merges the per-pair outlier
    flags, summarizes them, and writes sample- and feature-level flag
    files plus a flag-distribution plot.
    """
    # Import data
    dat = wideToDesign(args.input, args.design, args.uniqID, args.group,
                       logger=logger)

    # Get a list of samples to process; if processOnly is specified only
    # analyze the specified group.
    if args.processOnly:
        dat.design = dat.design[dat.design[args.group].isin(args.processOnly)]
        toProcess = dat.design.index
        dat.sampleIDs = toProcess.tolist()

    # Keep only the samples that are to be analyzed.
    dat.keep_sample(dat.sampleIDs)

    # Get list of pairwise combinations. If group is specified, only do
    # within-group combinations.
    combos = list()
    if args.group:
        logger.info('Only doing within group, pairwise comparisons.')
        for groupName, dfGroup in dat.design.groupby(dat.group):
            combos.extend(list(combinations(dfGroup.index, 2)))
    else:
        logger.info('Doing all pairwise comparisons. This could take a while!')
        combos.extend(list(combinations(dat.sampleIDs, 2)))

    # Open a multiple page PDF for plots
    ppBA = PdfPages(args.baName)

    # Loop over combinations, generating plots and collecting flags.
    logger.info('Generating flags and plots.')
    # BUG FIX: `flags = map(...)` is lazy in Python 3, so the plots would
    # only be rendered when Flags.merge consumed the iterator — after
    # ppBA.close(). Materialize the list while the PDF is still open.
    flags = [iterateCombo(dat, combo, ppBA) for combo in combos]

    # Close PDF with plots
    ppBA.close()

    # Merge flags
    logger.info('Merging outlier flags.')
    merged = Flags.merge(flags)

    # Summarize flags
    logger.info('Summarizing outlier flags.')
    propSample, propFeature, propSample_p, propFeature_p, propSample_c, \
        propFeature_c, propSample_d, propFeature_d = summarizeFlags(
            dat, merged, combos)
    plotFlagDist(propSample, propFeature, args.distName)

    # Create sample level flags
    flag_sample = Flags(index=dat.sampleIDs)
    flag_sample.addColumn(column='flag_sample_BA_outlier',
                          mask=(propSample >= args.sampleCutoff))
    flag_sample.addColumn(column='flag_sample_BA_pearson',
                          mask=(propSample_p >= args.sampleCutoff))
    flag_sample.addColumn(column='flag_sample_BA_cooks',
                          mask=(propSample_c >= args.sampleCutoff))
    flag_sample.addColumn(column='flag_sample_BA_dffits',
                          mask=(propSample_d >= args.sampleCutoff))
    flag_sample.df_flags.index.name = "sampleID"
    flag_sample.df_flags.to_csv(args.flagSample, sep='\t')

    # Create metabolite level flags
    flag_metabolite = Flags(dat.wide.index)
    flag_metabolite.addColumn(column='flag_feature_BA_outlier',
                              mask=(propFeature >= args.featureCutoff))
    flag_metabolite.addColumn(column='flag_feature_BA_pearson',
                              mask=(propFeature_p >= args.featureCutoff))
    flag_metabolite.addColumn(column='flag_feature_BA_cooks',
                              mask=(propFeature_c >= args.featureCutoff))
    flag_metabolite.addColumn(column='flag_feature_BA_dffits',
                              mask=(propFeature_d >= args.featureCutoff))
    flag_metabolite.df_flags.to_csv(args.flagFeature, sep='\t')

    # Finish Script
    logger.info("Script Complete!")
def iterateCombo(dat, combo, pdf):
    """ Generate the plots and outlier flags for one sample pair.

        :Arguments:
            :type dat: interface.wideToDesign
            :param dat: A wideToDesign object containing wide and design
                information.

            :param tuple combo: A tuple of pairwise combination for the
                current sample.

            :type pdf: matplotlib.backends.backend_pdf.PdfPages
            :param pdf: Handler for multi-page PDF that will contain all
                plots.

        :Updates:
            :type pdf: matplotlib.backends.backend_pdf.PdfPages
            :param pdf: Handler for multi-page PDF that will contain all
                plots.

        :Returns:
            :rtype flag: interface.Flags
            :param flag: A Flags object with outlier flags.
    """
    # Unpack the two samples of the current combination.
    sample_a, sample_b = combo

    # Figure with two side-by-side subplots.
    fh = figureHandler(proj='2d', numAx=2, numRow=2, numCol=2,
                       arrangement=[(0, 0, 1, 2), (0, 1, 1, 2)])

    # Left panel: scatter plot of sample_a vs sample_b.
    makeScatter(dat.wide.loc[:, sample_a], dat.wide.loc[:, sample_b],
                fh.ax[0], fh)

    # Right panel: Bland-Altman plot; also yields the outlier masks.
    outlier, pearson, cooks, dffits = makeBA(dat.wide.loc[:, sample_a],
                                             dat.wide.loc[:, sample_b],
                                             fh.ax[1], fh)

    # Title the figure, tighten the layout, shrink, and append to the PDF.
    fh.formatAxis(figTitle=buildTitle(dat, sample_a, sample_b))
    plt.tight_layout(pad=2, w_pad=.05)
    fh.shrink(top=.85, bottom=.25, left=.15, right=.9)
    fh.addToPdf(dpi=90, pdfPages=pdf)

    # One flag column per outlier criterion for this sample pair.
    flag = Flags(index=dat.wide.index)
    for template, mask in (('flag_{0}_{1}', outlier),
                           ('flag_pearson_{0}_{1}', pearson),
                           ('flag_cooks_{0}_{1}', cooks),
                           ('flag_dffits_{0}_{1}', dffits)):
        flag.addColumn(column=template.format(sample_a, sample_b), mask=mask)

    return flag.df_flags