def main(args):
    # Import data through the interface
    logger.info("Loading data through the interface")
    dat = wideToDesign(args.input, args.design, args.uniqID, logger=logger)

    # Clean missing data
    dat.dropMissing()

    # Transpose data to normalize
    toNormalize_df = dat.wide.T

    # Select the normalization method
    logger.info("Normalizing data using {0} method".format(args.method))
    if args.method == "mean":
        toNormalize_df[args.method] = toNormalize_df.mean(axis=1)
    elif args.method == "sum":
        toNormalize_df[args.method] = toNormalize_df.sum(axis=1)
    elif args.method == "median":
        toNormalize_df[args.method] = toNormalize_df.median(axis=1)

    # Divide each sample by its normalization factor
    toNormalize_df = toNormalize_df.apply(lambda x: x / x[args.method], axis=1)

    # Drop the extra factor column
    toNormalize_df.drop(args.method, axis=1, inplace=True)

    # Transpose the normalized data back
    normalized_df = toNormalize_df.T

    # Save data
    normalized_df.to_csv(args.out, sep="\t")
    logger.info("Script Complete!")
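
# A minimal sketch (not used by the script) of the row-wise normalization
# above, on a made-up wide table: features as rows, samples as columns.
def _normalization_sketch():
    import pandas as pd
    toy = pd.DataFrame({"s1": [1.0, 2.0], "s2": [3.0, 6.0]}, index=["f1", "f2"])
    trans = toy.T                             # samples become rows
    trans["mean"] = trans.mean(axis=1)        # per-sample normalization factor
    trans = trans.apply(lambda x: x / x["mean"], axis=1)
    trans.drop("mean", axis=1, inplace=True)  # drop the factor column
    return trans.T                            # back to features-as-rows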
def main(args):
    """Run the coefficient of variation (CV) flagging workflow."""
    # Check for additional levels to color by
    if args.levels and args.group:
        levels = [args.group] + args.levels
    elif args.group and not args.levels:
        levels = [args.group]
    else:
        levels = []
    logger.info(u"Groups used to color by: {0}".format(",".join(levels)))

    # Import data
    dat = wideToDesign(args.input, args.design, args.uniqID, group=args.group,
                       anno=args.levels, logger=logger)

    # Remove groups with just one element
    dat.removeSingle()

    # Clean missing data
    dat.dropMissing()

    # Treat everything as float and round it to 3 digits
    dat.wide = dat.wide.applymap(lambda x: round(x, 3))

    # Get colors
    palette.getColors(dat.design, levels)

    # Use group separation or not depending on user input
    CV, CVcutoff = calculateCV(data=dat.wide, design=palette.design,
                               cutoff=args.CVcutoff, levels=palette.combName)

    # Plot CV plots for each group and a distribution plot for all groups together
    logger.info("Plotting Data")
    with PdfPages(args.figure) as pdf:
        plotCVplots(data=CV, cutoff=CVcutoff, palette=palette, pdf=pdf)
        plotDistributions(data=CV, cutoff=CVcutoff, palette=palette, pdf=pdf)

    # Create flag file instance and output flags by group
    logger.info("Creating Flags")
    flag = Flags(index=CV['cv'].index)
    for name, group in palette.design.groupby(palette.combName):
        flag.addColumn(column="flag_feature_big_CV_{0}".format(name),
                       mask=((CV['cv_' + name].values > CVcutoff) |
                             CV['cv_' + name].isnull()))

    # Write flag file
    flag.df_flags.to_csv(args.flag, sep='\t')

    # Finish script
    logger.info("Script Complete!")
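
# A minimal sketch (not part of the pipeline) of the per-feature coefficient
# of variation the flags above are built on: CV = std / mean per row, with a
# simple 90th-percentile cutoff standing in for calculateCV's logic.
def _cv_sketch():
    import pandas as pd
    wide = pd.DataFrame({"s1": [10.0, 5.0], "s2": [12.0, 50.0],
                         "s3": [11.0, 6.0]}, index=["f1", "f2"])
    cv = wide.std(axis=1) / wide.mean(axis=1)
    cutoff = cv.quantile(0.9)
    return (cv > cutoff).astype(int)  # 1 flags a large-CV feature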
def main(args):
    # Import data
    logger.info("Importing data with the Interface")
    dat = wideToDesign(args.input, args.design, args.uniqID, args.group,
                       logger=logger)

    # Clean missing data
    dat.dropMissing()

    # Calculate the means of each group except the blanks
    logger.info("Calculating group means")
    df_nobMeans = pd.DataFrame(index=dat.wide.index)
    for name, group in dat.design.groupby(dat.group):
        if name == args.blank:
            df_blank = dat.wide[group.index].copy()
        else:
            df_nobMeans[name] = dat.wide[group.index].mean(axis=1)

    # Calculating the LOD
    # Calculates the average of the blanks plus 3 times the SD of the same.
    # If the calculated value is 0 then use the default lod (default = 5000).
    # NOTE: the ["lod"] != 0 expression means that everything that is not 0
    # should remain as it is, and everything that is 0 should be replaced.
    # http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.where.html
    logger.info("Calculating limit of detection for each group, default value [{0}].".
                format(args.bff))
    df_blank.loc[:, "lod"] = np.average(df_blank, axis=1) + \
        (3 * np.std(df_blank, ddof=1, axis=1))
    df_blank["lod"].where(df_blank["lod"] != 0, args.bff, inplace=True)

    # Apply the limit of detection to the rest of the data; these values will
    # be compared against the criteria value for flagging.
    logger.info("Comparing value of limit of detection to criteria [{0}].".format(
        args.criteria))
    nob_bff = pd.DataFrame(index=dat.wide.index, columns=df_nobMeans.columns)
    for group in nob_bff:
        nob_bff.loc[:, group] = (df_nobMeans[group] - df_blank["lod"]) / \
            df_blank["lod"]

    # Create flags based on the criteria value (user customizable)
    logger.info("Creating flags.")
    df_offFlags = Flags(index=nob_bff.index)
    for group in nob_bff:
        df_offFlags.addColumn(column='flag_bff_' + group + '_off',
                              mask=(nob_bff[group] < args.criteria))

    # Output BFF values and flags
    nob_bff.to_csv(args.outbff, sep='\t')
    df_offFlags.df_flags.to_csv(args.outflags, sep='\t')
    logger.info("Script Complete!")
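
# A minimal numeric sketch of the blank feature filtering (BFF) math above:
# lod = mean(blanks) + 3 * sd(blanks), then bff = (group_mean - lod) / lod;
# all values below are made up.
def _bff_sketch():
    import numpy as np
    blanks = np.array([100.0, 120.0, 110.0])      # blank intensities
    lod = blanks.mean() + 3 * blanks.std(ddof=1)  # limit of detection
    group_mean = 500.0                            # a non-blank group mean
    bff = (group_mean - lod) / lod
    return bff                                    # flag '_off' if bff < criteria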
def main(args):
    # Import data through the SECIMTools interface
    dat = wideToDesign(wide=args.input, design=args.design, uniqID=args.uniqID,
                       logger=logger)
    logger.info('Number of variables: {0}'.format(dat.wide.shape[0]))
    logger.info('Number of observations per variable: {0}'.format(
        dat.wide.shape[1]))

    # If there is no variance in a row, the correlations cannot be computed;
    # use the sum of squared deviations from the row mean as the spread
    # measure (squaring *after* summing would always give ~0, since
    # deviations from the mean sum to zero).
    dat.wide["variance"] = dat.wide.apply(lambda x: ((x - x.mean())**2).sum(),
                                          axis=1)
    dat.wide = dat.wide[dat.wide["variance"] != 0.0]
    dat.wide.drop("variance", axis=1, inplace=True)
    logger.info("Table arranged")

    # Compute the matrix of correlation coefficients.
    C = dat.wide.T.corr(method=args.correlation).values
    logger.info("Correlated")

    # For now, ignore the possibility that a variable
    # will have negligible variation.
    mask = np.ones(dat.wide.shape[0], dtype=bool)

    # Count the number of variables not excluded from the clustering.
    p = np.count_nonzero(mask)

    # Consider all values of the tuning parameter sigma in this array.
    sigmas, step = np.linspace(args.sigmaLow, args.sigmaHigh,
                               num=args.sigmaNum, retstep=True)

    # Compute the clustering for each of the several values of sigma.
    # Each sigma corresponds to a different affinity matrix,
    # so the modularity matrix is also different for each sigma.
    # The goal is to find the clustering whose modularity is greatest
    # across all joint (sigma, partition) pairs.
    # In practice, we will look for an approximation of this global optimum.
    logger.info("Begin clustering")
    clustering, sigma, m = get_clustering(C, sigmas)

    # Report a summary of the results of the technical analysis.
    logger.info("After partition refinement:")
    logger.info("Sigma: {0}".format(sigma))
    logger.info("Number of clusters: {0}".format(clustering.max() + 1))
    logger.info("Modulated modularity: {0}".format(m))

    # Run the nontechnical analysis using the data frame and the less nerdy
    # of the outputs from the technical analysis.
    nontechnical_analysis(args, dat.wide, mask, C, clustering)
    logger.info("Script Complete!")
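
# A quick self-contained illustration of why the variance filter above sums
# *squared* deviations: deviations from the mean sum to (numerically) zero,
# so squaring after summing would discard every row.
def _variance_sketch():
    import numpy as np
    x = np.array([1.0, 4.0, 7.0])
    wrong = ((x - x.mean()).sum()) ** 2   # ~0 for any data
    right = ((x - x.mean()) ** 2).sum()   # 18.0, a real spread measure
    return wrong, right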
def main(args):
    # Check for additional levels to color by
    if args.levels and args.group:
        levels = [args.group] + args.levels
    elif args.group and not args.levels:
        levels = [args.group]
    else:
        levels = []

    # Load data through the Interface
    dat = wideToDesign(args.input, args.design, args.uniqID, group=args.group,
                       anno=args.levels, logger=logger)

    # Treat everything as numeric
    dat.wide = dat.wide.applymap(float)

    # Clean missing data
    dat.dropMissing()

    # Get colors for each sample based on the group
    palette.getColors(design=dat.design, groups=levels)

    # Transpose data
    dat.trans = dat.transpose()

    # Run PLS
    df_scores, df_weights = runPLS(dat.trans, dat.group, args.toCompare,
                                   args.nComp)

    # Update the palette after dropping the groups not selected in toCompare
    palette.design = palette.design.T[df_scores.index].T
    palette.ugColors = {ugc: palette.ugColors[ugc]
                        for ugc in palette.ugColors.keys()
                        if ugc in args.toCompare}

    # Plot scatter plot of the scores
    with PdfPages(args.figure) as pdfOut:
        logger.info(u"Plotting PLS scores")
        plotScores(data=df_scores, palette=palette, pdf=pdfOut)

    # Save df_scores and df_weights to TSV files
    df_scores.to_csv(args.outScores, sep="\t", index_label='sampleID')
    df_weights.to_csv(args.outWeights, sep="\t", index_label=dat.uniqID)

    # End script
    logger.info(u"Finished running PLS")
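
# A minimal sketch of a two-group PLS fit with scikit-learn; runPLS above is
# project code, so this stand-in only shows the shape of the scores/weights
# outputs on random data (names and parameters here are illustrative).
def _pls_sketch():
    import numpy as np
    from sklearn.cross_decomposition import PLSRegression
    rng = np.random.RandomState(0)
    X = rng.randn(20, 5)               # 20 samples x 5 features
    y = np.repeat([0, 1], 10)          # two encoded groups to compare
    pls = PLSRegression(n_components=2).fit(X, y)
    return pls.transform(X), pls.x_weights_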
def main(args):
    """Main script"""
    # Import data
    dat = wideToDesign(args.input, args.design, args.uniqID, logger=logger)

    # Run stats
    logger.info("Calculating stats")
    RTstat = runStats(args, dat.wide, dat)

    # Clean missing data
    dat.dropMissing()

    # Set RT flags
    logger.info("Creating Flags")
    flag = Flags(index=RTstat.index)
    if args.p90p10:
        flag.addColumn(column='flag_RT_Q90Q10_outlier',
                       mask=(RTstat['p90p10'] > args.minutes))
    else:
        flag.addColumn(column='flag_RT_Q95Q05_outlier',
                       mask=(RTstat['p95p05'] > args.minutes))

    flag.addColumn(column='flag_RT_max_gt_threshold',
                   mask=(RTstat['max'] - RTstat['median'] > args.minutes / 2))
    flag.addColumn(column='flag_RT_min_lt_threshold',
                   mask=(RTstat['min'] - RTstat['median'] < -args.minutes / 2))
    flag.addColumn(
        column='flag_RT_min_max_outlier',
        mask=((RTstat['max'] - RTstat['mean'] > 3 * RTstat['std']) |
              (RTstat['min'] - RTstat['mean'] < -3 * RTstat['std'])))

    # If no CV cutoff was given, use the 90th percentile of the CVs rounded
    # to three significant digits.
    if not args.CVcutoff:
        CVcutoff = np.nanpercentile(RTstat['cv'].values, q=90)
        CVcutoff = round(CVcutoff, -int(floor(log(CVcutoff, 10))) + 2)
    else:
        CVcutoff = args.CVcutoff
    flag.addColumn(column='flag_RT_big_CV', mask=(RTstat['cv'] > CVcutoff))

    # Plot data
    logger.info("Plotting data")
    with PdfPages(args.figure) as pdfOut:
        plotCV(data=RTstat, cutoff=CVcutoff, pdf=pdfOut)

    # Output flags
    flag.df_flags.to_csv(args.flag, sep="\t")
    logger.info("Script Complete!")
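
# A minimal sketch of the significant-digit rounding used for the CV cutoff
# above: round(v, -int(floor(log(v, 10))) + 2) keeps three significant digits.
def _round_sketch():
    from math import floor, log
    for v in (0.0123456, 456.789):
        print(round(v, -int(floor(log(v, 10))) + 2))  # 0.0123, then 457.0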
def main(args):
    """ Function to call all other functions """
    # Checking if levels
    if args.levels and args.group:
        levels = [args.group] + args.levels
    elif args.group and not args.levels:
        levels = [args.group]
    else:
        levels = []
    logger.info(u"Groups used to color by: {0}".format(",".join(levels)))

    # Parsing files with interface
    logger.info(u"Loading data with the Interface")
    dat = wideToDesign(args.input, args.design, args.uniqID, args.group,
                       anno=args.levels, runOrder=args.order, logger=logger)

    # Cleaning from missing data
    dat.dropMissing()

    # Sort data by runOrder if provided
    if args.order:
        logger.info(u"Sorting by runOrder")
        dat.sortByRunOrder()

    # Get colors for each sample based on the group
    palette.getColors(design=dat.design, groups=levels)

    # Open PDF pages to output figures
    with PdfPages(args.figure) as pdf:
        # Plot density plot
        logger.info(u"Plotting density for sample distribution")
        plotDensityDistribution(pdf=pdf, wide=dat.wide, palette=palette)

        # Plot boxplots
        logger.info(u"Plotting boxplot for sample distribution")
        plotBoxplotDistribution(pdf=pdf, wide=dat.wide, palette=palette)

    logger.info(u"Script complete!")
def main(args):
    """Run everything."""
    # Import data
    dat = wideToDesign(args.input, args.design, args.uniqID, logger=logger)

    # Clean missing data
    dat.dropMissing()

    # Get labels to drop from arguments
    x = True
    y = True
    if "x" in args.labels:
        x = False
    if "y" in args.labels:
        y = False

    # Plot hierarchical cluster heatmap (HCH) with a dendrogram
    logger.info("Plotting heatmaps")
    if args.dendogram:
        fh = hm.plotHCHeatmap(dat.wide, hcheatmap=True,
                              cmap=palette.mpl_colormap, xlbls=x, ylbls=y)
        fh.savefig(args.fig, format="pdf")
    # Plot a single heatmap without a dendrogram
    else:
        # Create figureHandler object
        fh = figureHandler(proj='2d', figsize=(14, 14))

        # Create plot
        hm.plotHeatmap(dat.wide, fh.ax[0], cmap=palette.mpl_colormap,
                       xlbls=x, ylbls=y)

        # Format axis
        fh.formatAxis(xTitle="sampleID")

        # Save figure
        fh.export(out=args.fig, dpi=300)

    # Finish script
    logger.info("Script Complete!")
def main(args):
    """ Main Script """
    # Get palettes for data and cutoffs
    global cutPalette
    cutPalette = ch.colorHandler(pal="tableau", col="TrafficLight_9")

    # Check for additional levels to color by
    if args.levels and args.group:
        levels = [args.group] + args.levels
    elif args.group and not args.levels:
        levels = [args.group]
    else:
        levels = []

    # Parse data with the interface
    dat = wideToDesign(args.input, args.design, args.uniqID, group=args.group,
                       anno=args.levels, logger=logger, runOrder=args.order)

    # Drop missing values and remove groups with just one sample
    dat.dropMissing()
    if args.group:
        dat.removeSingle()

    # Select colors for data
    dataPalette.getColors(design=dat.design, groups=levels)
    dat.design = dataPalette.design

    # Open PdfPages and calculate SED
    with PdfPages(os.path.abspath(args.figure)) as pdf:
        SEDtoMean, SEDpairwise = calculateSED(dat, dataPalette.ugColors,
                                              dataPalette.combName, pdf, args.p)

    # Output TSV files
    SEDtoMean.to_csv(os.path.abspath(args.toMean), index_label="sampleID",
                     columns=["SED_to_Mean"], sep='\t')
    SEDpairwise.drop(["colors"], axis=1, inplace=True)
    if args.group:
        SEDpairwise.drop(["colors_x", "colors_y"], axis=1, inplace=True)
    SEDpairwise.to_csv(os.path.abspath(args.pairwise), index_label="sampleID",
                       sep='\t')

    # End script
    logger.info("Script complete.")
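
# A minimal sketch of a standardized Euclidean distance (SED) between samples
# on made-up data. This uses scipy's 'seuclidean' metric as a stand-in; the
# project's calculateSED helper may differ in detail.
def _sed_sketch():
    import numpy as np
    from scipy.spatial.distance import pdist, squareform
    X = np.random.RandomState(1).randn(4, 3)        # 4 samples x 3 features
    pairwise = squareform(pdist(X, metric="seuclidean"))
    Z = (X - X.mean(axis=0)) / X.std(axis=0, ddof=1)
    to_mean = np.sqrt((Z ** 2).sum(axis=1))         # each sample vs the mean
    return pairwise, to_mean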
def main(args):
    """ Function to call all other functions """
    # Load files with the interface
    logger.info(u"Loading data with the Interface")
    dat = wideToDesign(args.input, args.design, args.uniqID, group=args.group,
                       logger=logger)

    # Clean missing data
    dat.dropMissing()

    # Subset wide files with more than 50 features down to a 50-feature sample
    if len(dat.wide.index) > 50:
        wide = dat.wide.sample(n=50, axis=0)
        wide = wide.T
    else:
        wide = dat.wide.T

    # Save figure
    with PdfPages(args.figure) as pdf:
        # Iterate over groups
        if args.group:
            # Get colors for groups
            palette.getColors(design=dat.design, groups=[dat.group])

            # Iterate over groups
            for name, group in dat.design.groupby(args.group):
                logger.info(u"Plotting for group {0}".format(name))

                # Plot density and box plot for the group
                plotDensity(data=wide.T[group.index], name=name, pdf=pdf)

        # Get colors for each feature for "all samples"
        logger.info(u"Plotting for group {0}".format("samples"))
        palette.getColors(design=dat.design, groups=[])

        # Plot density and boxplots for all samples
        plotDensity(data=wide, name="samples", pdf=pdf)

    # End script
    logger.info(u"Ending script")
def main(args):
    # Check for additional levels to color by
    if args.levels and args.group:
        levels = [args.group] + args.levels
    elif args.group and not args.levels:
        levels = [args.group]
    else:
        levels = []

    # Load data through the Interface
    dat = wideToDesign(args.input, args.design, args.uniqID, group=args.group,
                       anno=args.levels, logger=logger)

    # Clean missing data
    dat.dropMissing()

    # Get colors for each sample based on the group
    palette.getColors(design=dat.design, groups=levels)

    # Transpose matrix
    dat.wide = dat.wide.T

    # Run PCA
    df_scores, df_loadings, df_summary = runPCA(dat.wide)

    # Plot 2D and 3D scatter plots of the scores
    logger.info(u"Plotting PCA scores")
    with PdfPages(args.figure) as pdfOut:
        plotScatterplot2D(data=df_scores, palette=palette, pdf=pdfOut)
        plotScatterplot3D(data=df_scores, palette=palette, pdf=pdfOut)

    # Save scores, loadings and summary
    df_scores.to_csv(args.score_out, sep="\t", index_label='sampleID')
    df_loadings.to_csv(args.load_out, sep="\t", index_label=dat.uniqID)
    df_summary.to_csv(args.summary_out, sep="\t", index_label="PCs")

    # End script
    logger.info(u"Finished running PCA")
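
# A minimal sketch of a PCA decomposition with scikit-learn; runPCA above is
# project code, so this stand-in only shows equivalent scores, loadings, and
# per-component summary pieces on random data.
def _pca_sketch():
    import numpy as np
    from sklearn.decomposition import PCA
    X = np.random.RandomState(0).randn(10, 4)   # samples x features
    pca = PCA(n_components=3).fit(X)
    scores = pca.transform(X)                   # sample scores
    loadings = pca.components_.T                # feature loadings
    summary = pca.explained_variance_ratio_     # variance explained per PC
    return scores, loadings, summary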
def main(args):
    # Import data
    logger.info("Importing data with the interface")
    dat = wideToDesign(args.input, args.design, args.uniqID)

    # Clean missing data
    dat.dropMissing()

    # Iterate through each group and flag a feature as "off" when more than
    # half of the group's values fall below the cutoff
    logger.info("Running threshold based flags")
    df_offFlags = Flags(index=dat.wide.index)
    for title, group in dat.design.groupby(args.group):
        mask = (dat.wide[group.index] < args.cutoff)
        propOff = mask.mean(axis=1)
        df_offFlags.addColumn(column='flag_feature_' + title + '_off',
                              mask=propOff > 0.5)

    logger.info("Creating output")
    df_offFlags.df_flags.to_csv(args.output, sep="\t")
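
# A minimal sketch of the flag logic above: the row mean of a boolean mask is
# the proportion of the group's samples below the cutoff, so > 0.5 means over
# half the group is "off"; numbers are made up.
def _threshold_flag_sketch():
    import pandas as pd
    group = pd.DataFrame({"s1": [10, 900], "s2": [20, 800], "s3": [950, 700]},
                         index=["f1", "f2"])
    mask = group < 100                 # below-cutoff indicator
    prop_off = mask.mean(axis=1)       # f1 -> 2/3, f2 -> 0
    return (prop_off > 0.5).astype(int)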
def main(args):
    # Import data with the interface
    dat = wideToDesign(wide=args.input, design=args.design, uniqID=args.uniqID,
                       logger=logger)

    # Clean missing data
    dat.dropMissing()

    # Read flag file
    df_flags = pd.read_table(args.flags)

    # Set the index on the flag file; if none was given, raise an error
    if args.flagUniqID:
        df_flags.set_index(args.flagUniqID, inplace=True)
    else:
        logger.error("No flagUniqID provided")
        raise ValueError("No flagUniqID provided")

    # Drop either rows or columns
    logger.info("Running drop flags by {0}".format(args.flagfiletype))
    if args.flagfiletype == "column":
        kpd_wide, kpd_flag = dropColumns(df_wide=dat.wide, df_flags=df_flags,
                                         cut_value=args.value,
                                         condition=args.condition, args=args)
    else:
        kpd_wide, kpd_flag = dropRows(df_wide=dat.wide, df_flags=df_flags,
                                      cut_value=args.value,
                                      condition=args.condition, args=args)

    # Output wide data and flags
    kpd_wide.to_csv(args.outWide, sep='\t')
    kpd_flag.to_csv(args.outFlags, sep='\t')

    # Finish script
    logger.info("Script complete.")
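
# A minimal sketch of flag-driven row dropping; dropRows/dropColumns above are
# project helpers, so this stand-in simply keeps rows whose flag equals 0.
def _drop_by_flag_sketch():
    import pandas as pd
    wide = pd.DataFrame({"s1": [1, 2, 3]}, index=["f1", "f2", "f3"])
    flags = pd.DataFrame({"flag_bad": [0, 1, 0]}, index=["f1", "f2", "f3"])
    return wide.loc[flags["flag_bad"] == 0]   # drops the flagged f2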
def main(args):
    # Check for additional levels to color by
    if args.levels and args.group:
        levels = [args.group] + args.levels
    elif args.group and not args.levels:
        levels = [args.group]
    else:
        levels = []

    # Load data through the Interface
    dat = wideToDesign(args.input, args.design, args.uniqID, group=args.group,
                       anno=args.levels, logger=logger)

    # Treat everything as numeric
    dat.wide = dat.wide.applymap(float)

    # Clean missing data
    dat.dropMissing()

    # Get colors for each sample based on the group
    palette.getColors(design=dat.design, groups=levels)

    # Run LDA
    logger.info(u"Running LDA on data")
    scores_df = runLDA(dat, nComp=args.nComponents)

    # Plot scatter plot of the scores
    logger.info(u"Plotting LDA scores")
    with PdfPages(args.figure) as pdfOut:
        plotScores(data=scores_df, palette=palette, pdf=pdfOut)

    # Save scores
    scores_df.to_csv(args.out, sep="\t", index_label="sampleID")

    # End script
    logger.info(u"Finished running LDA")
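
# A minimal sketch of an LDA projection with scikit-learn; runLDA above is
# project code, so this stand-in only shows the scores on random two-group
# data (with k classes, at most k - 1 components exist).
def _lda_sketch():
    import numpy as np
    from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
    rng = np.random.RandomState(0)
    X = rng.randn(20, 4)
    y = np.repeat(["a", "b"], 10)
    lda = LinearDiscriminantAnalysis(n_components=1).fit(X, y)
    return lda.transform(X)   # per-sample scores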
def main(args):
    # Import data through the wideToDesign data manager
    logger.info("Importing data through wideToDesign data manager")
    dat = wideToDesign(args.input, args.design, args.uniqID, logger=logger)

    # Clean missing data
    dat.dropMissing()

    # Make sure all the groups to drop actually exist in the design column
    if args.group:
        for todrop in args.drops:
            if todrop in list(set(dat.design[args.group].values)):
                pass
            else:
                logger.error("The group '{0}' is not located in the column '{1}' "
                             "of your design file".format(todrop, args.group))
                raise ValueError

    # If subsetting is done by group, select the sampleIDs from the design file
    logger.info(u"Getting sampleNames to drop")
    if args.group:
        iToDrop = list()
        for name, group in dat.design.groupby(args.group):
            if name in args.drops:
                iToDrop += group.index.tolist()
    else:
        iToDrop = args.drops

    # Remove weird characters
    iToDrop = [cleanStr(x) for x in iToDrop]

    # Drop the selected elements
    selectedDesign = dat.design.drop(iToDrop, axis=0, inplace=False)

    # Output wide results
    logger.info("Output wide file")
    selectedDesign.to_csv(args.out, sep='\t')
    logger.info("Script Complete!")
def main(args):
    # Parse data with the interface
    dat = wideToDesign(wide=args.input, design=args.design, uniqID=args.uniqID,
                       group=args.group, logger=logger)

    # Remove groups with just one element from dat
    dat.removeSingle()

    # Create a folder for counts if an html output was requested
    if args.html is not None:
        logger.info(u"Using html output file")
        folderDir = args.htmlPath
        try:
            os.makedirs(folderDir)
        except Exception as e:
            logger.error("Error. {}".format(e))

        # Initiate the html file for the zipped counts
        html = createHTML()
        folderDir = folderDir + "/" + args.counts
def main(args):
    ## This part is not required for now, but if we want to implement
    ## additional features it may be helpful.
    # Checking if levels
    #if args.levels and args.group:
    #    levels = [args.group] + args.levels
    #elif args.group and not args.levels:
    #    levels = [args.group]
    #else:
    #    levels = []

    # Import data through interface
    dat = wideToDesign(args.input, args.design, args.uniqID, args.group,
                       anno=args.levels, clean_string=True, logger=logger)

    # Clean missing data
    dat.dropMissing()

    # Run Random Forest Classifier on the data
    logger.info('Creating classifier')
    df_rev, df_transf, df_importance = runRFC(dat, nStim=args.snum)

    # Plot feature importances
    logger.info('Plotting Variable Importance Plot')
    with PdfPages(args.figure) as pdfOut:
        plotVarImportance(data=df_importance, pdf=pdfOut, var=args.num)

    # Export transformed data and df_rev data
    logger.info('Exporting data to TSV format')
    df_transf.to_csv(args.oname, index=False, sep='\t', float_format="%.4f")
    df_rev.to_csv(args.oname2, index=False, sep='\t')
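
# A minimal sketch of a random forest fit and its feature importances with
# scikit-learn; runRFC above is project code, so this stand-in only shows
# where importance scores come from on random data.
def _rfc_sketch():
    import numpy as np
    from sklearn.ensemble import RandomForestClassifier
    rng = np.random.RandomState(0)
    X = rng.randn(30, 5)
    y = np.repeat(["a", "b"], 15)
    rf = RandomForestClassifier(n_estimators=100, random_state=0).fit(X, y)
    return rf.feature_importances_   # one importance score per feature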
def main(args):
    # Import data
    dat = wideToDesign(args.input, args.design, args.uniqID, logger=logger)

    # Clean missing data
    dat.dropMissing()

    # Generate formula
    preFormula, categorical, numerical, levels, dat.design = preProcessing(
        design=dat.design, factorTypes=args.ftypes, factorNames=args.factors)

    # Transpose data
    dat.trans = dat.transpose()

    # If interactions are requested, collapse the categorical factors into a
    # single "_treatment_" factor
    if args.interactions:
        logger.info("Running ANOVA on interactions")
        dat.trans["_treatment_"] = dat.trans.apply(lambda x: \
            "_".join(map(str, x[categorical].values)), axis=1)
        dat.design["_treatment_"] = dat.design.apply(lambda x: \
            "_".join(map(str, x[categorical].values)), axis=1)

        # If there are numerical factors, add them to the formula
        if len(numerical) > 0:
            formula = ["C(_treatment_)"] + numerical
        else:
            formula = ["C(_treatment_)"]

        # Concatenate the formula
        formula = "+".join(formula)

        # Get a new per-feature formula for the interactions
        dictFormula = {feature: "{0}~{1}".format(str(feature), formula)
                       for feature in dat.wide.index.tolist()}

        # Create level combinations
        levels = sorted(list(set(dat.trans["_treatment_"].tolist())))
        reverseLevels = copy.copy(levels)
        reverseLevels.reverse()
        lvlComb = list()
        generateDinamicCmbs([levels], lvlComb)

        # Run ANOVA
        logger.info('Running anova models')
        results, significant, residDat, fitDat = runANOVA(
            dat=dat, categorical=["_treatment_"], levels=[levels],
            lvlComb=lvlComb, formula=dictFormula, numerical=numerical)
    else:
        logger.info("Running ANOVA without interactions")

        # Create combination of groups
        nLevels = [list(itertools.chain.from_iterable(levels))]
        reverseLevels = copy.copy(nLevels)
        reverseLevels.reverse()
        lvlComb = list()
        generateDinamicCmbs(reverseLevels, lvlComb)

        # Map every metabolite to its formula
        dictFormula = {feature: "{0}~{1}".format(str(feature), preFormula)
                       for feature in dat.wide.index.values}

        # Run ANOVA
        logger.info('Running anova models')
        results, significant, residDat, fitDat = runANOVA(
            dat=dat, categorical=categorical, levels=levels,
            lvlComb=lvlComb, formula=dictFormula, numerical=numerical)

    # QQ plots
    logger.info('Generating q-q plots.')
    qqPlot(residDat.T, fitDat.T, args.ofig)

    # Generate volcano plots
    logger.info('Generating volcano plots.')
    volcano(lvlComb, results, args.ofig2)

    # Round results to 4 digits and save
    results = results.round(4)
    results.index.name = dat.uniqID
    results.to_csv(args.oname, sep="\t")

    # Flags
    significant.index.name = dat.uniqID
    significant.to_csv(args.flags, sep="\t")
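
# A minimal sketch of fitting one per-feature formula like the ones built
# above ("feature ~ C(_treatment_)") with statsmodels; the data frame here is
# made up, and runANOVA wraps considerably more logic than this.
def _anova_sketch():
    import pandas as pd
    import statsmodels.api as sm
    from statsmodels.formula.api import ols
    df = pd.DataFrame({"feature": [1.0, 1.2, 3.1, 2.9],
                       "_treatment_": ["a", "a", "b", "b"]})
    fit = ols("feature ~ C(_treatment_)", data=df).fit()
    return sm.stats.anova_lm(fit, typ=2)   # per-term F tests and p-values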
def main(args):
    # Check for additional levels to color by
    if args.levels and args.group:
        levels = [args.group] + args.levels
    elif args.group and not args.levels:
        levels = [args.group]
    else:
        levels = []

    # Parse data with the interface
    logger.info("Loading data with the Interface")
    dat = wideToDesign(args.input, args.design, args.uniqID, args.group,
                       runOrder=args.order, anno=args.levels, logger=logger)

    # Clean missing data
    dat.dropMissing()

    # Get colors
    palette.getColors(dat.design, levels)

    # Transpose data so compounds are columns, set the runOrder as index,
    # and drop the column with the groups from the transposed wide.
    trans = dat.transpose()
    trans.set_index(dat.runOrder, inplace=True)
    trans.drop(dat.group, axis=1, inplace=True)

    # Run regressions
    logger.info("Running Regressions")
    ror_df = runRegression(trans)

    # Create flags for p-values 0.05 and 0.1
    ror_flags = Flags(index=ror_df.index)
    ror_flags.addColumn(column="flag_feature_runOrder_pval_05",
                        mask=(ror_df["pval"] <= 0.05))
    ror_flags.addColumn(column="flag_feature_runOrder_pval_01",
                        mask=(ror_df["pval"] <= 0.1))

    # Plot results
    # Open a multiple page PDF for plots
    logger.info("Plotting Results")
    with PdfPages(args.figure) as pdf:
        plotSignificantROR(ror_df, pdf, palette)

        # If no pages were plotted, output a placeholder page
        if pdf.get_pagecount() == 0:
            fig = plt.figure()
            fig.text(0.5, 0.4,
                     "There were no features significant for plotting.",
                     fontsize=12)
            pdf.savefig(fig)

    # Write results and flags to TSV files
    ror_df.to_csv(args.table, sep="\t", float_format="%.4f",
                  index_label=args.uniqID, columns=["pval", "rsq", "slope"])
    ror_flags.df_flags.to_csv(args.flags, sep="\t", index_label=args.uniqID)
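
# A minimal sketch of one run-order regression like those performed by
# runRegression above: regress a feature on injection order and keep the
# slope, p-value, and R-squared; the data below is simulated.
def _run_order_sketch():
    import numpy as np
    import statsmodels.api as sm
    order = np.arange(10, dtype=float)                   # injection order
    y = 0.05 * order + np.random.RandomState(0).randn(10)
    fit = sm.OLS(y, sm.add_constant(order)).fit()
    return fit.params[1], fit.pvalues[1], fit.rsquared   # slope, pval, rsq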
def main(args):
    # Get R ready
    # Get the current path
    myPath = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))

    # Establish the path for the LASSO script
    my_r_script_path = os.path.join(myPath, "lasso_enet.R")
    logger.info(my_r_script_path)

    # Activate pandas2ri
    pandas2ri.activate()

    # Load the LASSO/Elastic Net R script
    with open(my_r_script_path, 'r') as f:
        rFile = f.read()
    lassoEnetScript = STAP(rFile, "lasso_enet")

    # Import data through the interface
    dat = wideToDesign(args.input, args.design, args.uniqID, group=args.group,
                       logger=logger)

    # Clean missing data
    dat.dropMissing()

    # Transpose data
    dat.trans = dat.transpose()
    dat.trans.columns.name = ""

    # Drop nan columns from design
    removed = dat.design[dat.design[dat.group] == "nan"]
    dat.design = dat.design[dat.design[dat.group] != "nan"]
    dat.trans.drop(removed.index.values, axis=0, inplace=True)
    logger.info("{0} removed from analysis".format(removed.index.values))
    dat.design.rename(columns={dat.group: "group"}, inplace=True)
    dat.trans.rename(columns={dat.group: "group"}, inplace=True)

    # Generate a group list
    groupList = [title for title, group in dat.design.groupby("group")
                 if len(group.index) > 2]

    # Turn the group list into pairwise combinations
    comboMatrix = np.array(list(it.combinations(groupList, 2)))
    comboLength = len(comboMatrix)

    # Run R
    returns = lassoEnetScript.lassoEN(dat.trans, dat.design, comboMatrix,
                                      comboLength, args.alpha, args.plots)
    robjects.r['write.table'](returns[0], file=args.coefficients, sep='\t',
                              quote=False, row_names=False, col_names=True)
    robjects.r['write.table'](returns[1], file=args.flags, sep='\t',
                              quote=False, row_names=False, col_names=True)

    # Finish
    logger.info("Script Complete!")
def main(args):
    # Import data
    dat = wideToDesign(args.input, args.design, args.uniqID, args.group,
                       logger=logger)

    # Get a list of samples to process, if processOnly is specified only
    # analyze specified group.
    if args.processOnly:
        dat.design = dat.design[dat.design[args.group].isin(args.processOnly)]
        toProcess = dat.design.index
        dat.sampleIDs = toProcess.tolist()

    # Create dataframe with sampleIDs that are to be analyzed.
    dat.keep_sample(dat.sampleIDs)

    # Get list of pairwise combinations. If group is specified, only do
    # within group combinations.
    combos = list()
    if args.group:
        # If group is given, only do within group pairwise combinations
        logger.info('Only doing within group, pairwise comparisons.')
        for groupName, dfGroup in dat.design.groupby(dat.group):
            combos.extend(list(combinations(dfGroup.index, 2)))
    else:
        logger.info('Doing all pairwise comparisons. This could take a while!')
        # Get all pairwise combinations for all samples
        combos.extend(list(combinations(dat.sampleIDs, 2)))

    # Open a multiple page PDF for plots
    ppBA = PdfPages(args.baName)

    # Loop over combinations and generate plots and return a list of flags.
    logger.info('Generating flags and plots.')
    flags = map(lambda combo: iterateCombo(dat, combo, ppBA), combos)

    # Close PDF with plots
    ppBA.close()

    # Merge flags
    logger.info('Merging outlier flags.')
    merged = Flags.merge(flags)

    # Summarize flags
    logger.info('Summarizing outlier flags.')
    propSample, propFeature, propSample_p, propFeature_p, propSample_c, \
        propFeature_c, propSample_d, propFeature_d = summarizeFlags(
            dat, merged, combos)
    plotFlagDist(propSample, propFeature, args.distName)

    # Create sample level flags
    flag_sample = Flags(index=dat.sampleIDs)
    flag_sample.addColumn(column='flag_sample_BA_outlier',
                          mask=(propSample >= args.sampleCutoff))
    flag_sample.addColumn(column='flag_sample_BA_pearson',
                          mask=(propSample_p >= args.sampleCutoff))
    flag_sample.addColumn(column='flag_sample_BA_cooks',
                          mask=(propSample_c >= args.sampleCutoff))
    flag_sample.addColumn(column='flag_sample_BA_dffits',
                          mask=(propSample_d >= args.sampleCutoff))
    flag_sample.df_flags.index.name = "sampleID"
    flag_sample.df_flags.to_csv(args.flagSample, sep='\t')

    # Create metabolite level flags
    flag_metabolite = Flags(dat.wide.index)
    flag_metabolite.addColumn(column='flag_feature_BA_outlier',
                              mask=(propFeature >= args.featureCutoff))
    flag_metabolite.addColumn(column='flag_feature_BA_pearson',
                              mask=(propFeature_p >= args.featureCutoff))
    flag_metabolite.addColumn(column='flag_feature_BA_cooks',
                              mask=(propFeature_c >= args.featureCutoff))
    flag_metabolite.addColumn(column='flag_feature_BA_dffits',
                              mask=(propFeature_d >= args.featureCutoff))
    flag_metabolite.df_flags.to_csv(args.flagFeature, sep='\t')

    # Finish Script
    logger.info("Script Complete!")
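
# A minimal sketch of the within-group pairwise combinations built above,
# with made-up group membership.
def _combos_sketch():
    from itertools import combinations
    groups = {"g1": ["s1", "s2", "s3"], "g2": ["s4", "s5"]}
    combos = []
    for name, ids in groups.items():
        combos.extend(combinations(ids, 2))
    return combos   # [('s1','s2'), ('s1','s3'), ('s2','s3'), ('s4','s5')]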
def main(args):
    # Load test dataset
    test_design = read_table(args.test_design)

    # Load the target dataset through the interface
    if args.group in test_design.columns:
        target = wideToDesign(wide=args.test_wide, design=args.test_design,
                              uniqID=args.uniqID, group=args.group,
                              logger=logger)
    else:
        target = wideToDesign(wide=args.test_wide, design=args.test_design,
                              uniqID=args.uniqID, logger=logger)

    # Load the training dataset through the interface
    train = wideToDesign(wide=args.train_wide, design=args.train_design,
                         uniqID=args.uniqID, group=args.group, logger=logger)

    # Drop missing values
    train.dropMissing()
    train = train.transpose()

    # Drop missing values
    target.dropMissing()
    target = target.transpose()

    # Make sure test and train have the same features
    for i in target.columns:
        if i not in train.columns:
            del target[i]

    # Train the SVM
    classes = train[args.group].copy()
    del train[args.group]
    try:
        logger.info("Running SVM model")
        model = svm.SVC(kernel=args.kernel, C=float(args.C),
                        gamma=float(args.a), coef0=float(args.b),
                        degree=int(args.degree))
    except:
        logger.info("Model failed with gamma = {0}, trying automatic gamma "
                    "instead.".format(float(args.a)))
        model = svm.SVC(kernel=args.kernel, C=float(args.C), gamma="auto",
                        coef0=float(args.b), degree=int(args.degree))
    model.fit(train, classes)

    # Predict classes with the SVM
    if args.group in target.columns:
        del target[args.group]
    try:
        target['predicted_class'] = model.predict(target)
    except:
        print("Error: the train set and target set do not appear to have the "
              "same features (attributes)")
    target.to_csv(args.outfile1, index=False, sep='\t')

    # Compute the accuracy on the training set
    train['predicted_class'] = model.predict(train)
    train[args.group] = classes
    accuracy = str(getAccuracy(train) * 100) + ' percent'
    os.system("echo %s > %s" % (accuracy, args.accuracy_on_training))

    # Finish script
    logger.info("Script Complete!")
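
# A minimal sketch of the SVC train/predict cycle above on made-up data
# (the kernel, C, and gamma values here are illustrative, not the script's
# command-line arguments).
def _svm_sketch():
    import numpy as np
    from sklearn import svm
    rng = np.random.RandomState(0)
    X_train = rng.randn(20, 4)
    y_train = np.repeat(["a", "b"], 10)
    model = svm.SVC(kernel="rbf", C=1.0, gamma="auto")
    model.fit(X_train, y_train)
    predicted = model.predict(X_train)
    return (predicted == y_train).mean()   # training accuracy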