def main(args):
    # Import data with interface
    logger.info("Importing data with interface")
    dat = wideToDesign(args.input, args.design, uniqID=args.uniqID,
                       group=args.group, logger=logger)

    # Preprocessing
    logger.info("Preprocessing")
    dat.wide = preprocess(noz=args.noZero, non=args.noNegative,
                          ex=args.exclude, data=dat.wide)

    # Choose the imputation method; knn is handled separately.
    logger.info("Imputing")
    if args.strategy == "knn":
        pdFull = imputeKNN(rc=float(args.rowCutoff), cc=float(args.colCutoff),
                           k=int(args.knn), dat=dat)
    else:
        # Iterate over groups and perform either a mean or median imputation.
        pdFull = iterateGroups(dat=dat, strategy=args.strategy,
                               dist=args.dist, rc=args.rowCutoff)

    # Convert the dataframe to float and round results to 4 digits.
    # (applymap returns a new frame; the result must be assigned back.)
    pdFull = pdFull.applymap(float)
    pdFull = pdFull.round(4)

    # Make sure that the output has the same unique ID
    pdFull.index.name = args.uniqID

    # Save the imputed data
    pdFull.to_csv(args.output, sep="\t")
    logger.info("Script Complete!")
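# The project-specific imputeKNN above is not shown here. As a rough,
# hypothetical sketch of the same technique (k-nearest-neighbor imputation of
# missing values), scikit-learn's KNNImputer can be used; the row/column
# missingness cutoffs (rc/cc) would be applied beforehand, and the table
# orientation (features as rows vs. columns) is an assumption. This is an
# illustration, not the actual SECIMTools implementation.
import pandas as pd
from sklearn.impute import KNNImputer

def knn_impute_sketch(wide, k=5):
    """Impute NaNs in a wide table using the mean of the k nearest rows."""
    imputer = KNNImputer(n_neighbors=k)
    imputed = imputer.fit_transform(wide.values)
    return pd.DataFrame(imputed, index=wide.index, columns=wide.columns)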
def main(args): """ Function to input all the arguments""" # Checking if levels if args.levels and args.group: levels = [args.group] + args.levels elif args.group and not args.levels: levels = [args.group] else: levels = [] logger.info(u"Groups used to color by: {0}".format(",".join(levels))) # Import data dat = wideToDesign(args.input, args.design, args.uniqID, group=args.group, anno=args.levels, logger=logger) # Remove groups with just one element dat.removeSingle() # Cleaning from missing data dat.dropMissing() # Treat everything as float and round it to 3 digits dat.wide = dat.wide.applymap(lambda x: round(x, 3)) # Get colors palette.getColors(dat.design, levels) # Use group separation or not depending on user input CV, CVcutoff = calculateCV(data=dat.wide, design=palette.design, cutoff=args.CVcutoff, levels=palette.combName) # Plot CVplots for each group and a distribution plot for all groups together logger.info("Plotting Data") with PdfPages(args.figure) as pdf: plotCVplots(data=CV, cutoff=CVcutoff, palette=palette, pdf=pdf) plotDistributions(data=CV, cutoff=CVcutoff, palette=palette, pdf=pdf) # Create flag file instance and output flags by group logger.info("Creatting Flags") flag = Flags(index=CV['cv'].index) for name, group in palette.design.groupby(palette.combName): flag.addColumn(column="flag_feature_big_CV_{0}".format(name), mask=((CV['cv_' + name].get_values() > CVcutoff[name]) | CV['cv_' + name].isnull())) # Write flag file flag.df_flags.to_csv(args.flag, sep='\t') # Finishing script logger.info("Script Complete!")
def main(args):
    # Import data
    dat = wideToDesign(args.input, args.design, args.uniqID, logger=logger)

    # Convert objects to numeric
    norm = dat.wide.applymap(float)

    # The following steps depend on whether we perform a log or a g-log
    # transformation.

    # LOG transformation
    if args.transformation == 'log':
        # Perform the log transformation with the selected log base
        if args.log_base == 'log':
            logger.info(u"Running Log transformation with log e")
            norm = norm.apply(lambda x: np.log(x))
        elif args.log_base == 'log2':
            logger.info(u"Running Log transformation with log 2")
            norm = norm.apply(lambda x: np.log2(x))
        elif args.log_base == 'log10':
            logger.info(u"Running Log transformation with log 10")
            norm = norm.apply(lambda x: np.log10(x))

    # G-LOG transformation
    # The generalized log transformation formula is: log(y + sqrt(y^2 + lambda_value)).
    # When lambda_value = 0 it reduces to log(y + y) = log(2y) = log(2) + log(y),
    # i.e. a plain log shifted by the constant log(2).
    if args.transformation == 'glog':
        # Perform the g-log transformation with the selected log base
        if args.log_base == 'log':
            logger.info(u"Running G-Log transformation with log e")
            norm = np.log(norm + np.sqrt(np.square(norm) + float(args.lambda_value)))
        elif args.log_base == 'log2':
            logger.info(u"Running G-Log transformation with log 2")
            norm = np.log2(norm + np.sqrt(np.square(norm) + float(args.lambda_value)))
        elif args.log_base == 'log10':
            logger.info(u"Running G-Log transformation with log 10")
            norm = np.log10(norm + np.sqrt(np.square(norm) + float(args.lambda_value)))

    # Round results to 8 digits
    norm = norm.apply(lambda x: x.round(8))

    # Treat inf as NaN
    norm.replace([np.inf, -np.inf], np.nan, inplace=True)

    # Save file to TSV
    norm.to_csv(args.oname, sep="\t")
    logger.info("Finishing Script")
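# A quick, self-contained check of the lambda = 0 identity stated above,
# assuming nothing beyond numpy:
import numpy as np

y = np.array([0.5, 1.0, 10.0, 1e4])
glog0 = np.log(y + np.sqrt(np.square(y) + 0.0))   # g-log with lambda = 0
plain = np.log(2.0) + np.log(y)                   # log(2) + log(y)
assert np.allclose(glog0, plain)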
def main(args): """ Main Script """ #Getting palettes for data and cutoffs global cutPalette cutPalette = ch.colorHandler(pal="tableau", col="TrafficLight_9") # Checking if levels if args.levels and args.group: levels = [args.group] + args.levels elif args.group and not args.levels: levels = [args.group] else: levels = [] #Parsing data with interface dat = wideToDesign(args.input, args.design, args.uniqID, group=args.group, anno=args.levels, logger=logger, runOrder=args.order) #Dropping missing values and remove groups with just one sample dat.dropMissing() if args.group: dat.removeSingle() #Select colors for data dataPalette.getColors(design=dat.design, groups=levels) dat.design = dataPalette.design #Open pdfPages Calculate SED with PdfPages(os.path.abspath(args.figure)) as pdf: SEDtoMean, SEDpairwise = calculateSED(dat, dataPalette.ugColors, dataPalette.combName, pdf, args.p) #Outputing files for tsv files SEDtoMean.to_csv(os.path.abspath(args.toMean), index_label="sampleID", columns=["SED_to_Mean"], sep='\t') SEDpairwise.drop(["colors"], axis=1, inplace=True) if args.group: SEDpairwise.drop(["colors_x", "colors_y"], axis=1, inplace=True) SEDpairwise.to_csv(os.path.abspath(args.pairwise), index_label="sampleID", sep='\t') #Ending script logger.info("Script complete.")
def main(args):
    # Import data
    logger.info("Importing data with the Interface")
    dat = wideToDesign(args.input, args.design, args.uniqID, args.group,
                       logger=logger)

    # Clean the data of missing values
    dat.dropMissing()

    # Calculate the means of each group except the blanks
    logger.info("Calculating group means")
    df_nobMeans = pd.DataFrame(index=dat.wide.index)
    for name, group in dat.design.groupby(dat.group):
        if name == args.blank:
            df_blank = dat.wide[group.index].copy()
        else:
            df_nobMeans[name] = dat.wide[group.index].mean(axis=1)

    # Calculate the LOD (limit of detection):
    # the average of the blanks plus 3 times their standard deviation.
    # If the calculated value is 0 then use the default LOD (default = 5000).
    # NOTE: the ["lod"] != 0 condition means that everything that is not 0 is
    # fine and should remain as it is, and everything that is 0 should be
    # replaced.
    # http://pandas.pydata.org/pandas-docs/stable/generated/pandas.DataFrame.where.html
    logger.info("Calculating limit of detection for each group, default value [{0}].".format(args.bff))
    df_blank.loc[:, "lod"] = np.average(df_blank, axis=1) + \
                             (3 * np.std(df_blank, ddof=1, axis=1))
    df_blank["lod"].where(df_blank["lod"] != 0, args.bff, inplace=True)

    # Apply the limit of detection to the rest of the data; these values will
    # be compared against the criteria value for flagging.
    logger.info("Comparing value of limit of detection to criteria [{0}].".format(args.criteria))
    nob_bff = pd.DataFrame(index=dat.wide.index, columns=df_nobMeans.columns)
    for group in nob_bff:
        nob_bff.loc[:, group] = (df_nobMeans[group] - df_blank["lod"]) / df_blank["lod"]

    # Create flags based on the (user-customizable) criteria value
    logger.info("Creating flags.")
    df_offFlags = Flags(index=nob_bff.index)
    for group in nob_bff:
        df_offFlags.addColumn(column='flag_bff_' + group + '_off',
                              mask=(nob_bff[group] < args.criteria))

    # Output BFF values and flags
    nob_bff.to_csv(args.outbff, sep='\t')
    df_offFlags.df_flags.to_csv(args.outflags, sep='\t')
    logger.info("Script Complete!")
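# A minimal, self-contained illustration of the LOD rule above (mean of the
# blanks plus three standard deviations, with a fallback default when the
# result is 0); the numbers are made up.
import numpy as np

blanks = np.array([[120.0, 130.0, 110.0],   # feature 1, three blank samples
                   [0.0,   0.0,   0.0]])    # feature 2, all-zero blanks
default_lod = 5000.0

lod = blanks.mean(axis=1) + 3 * blanks.std(axis=1, ddof=1)
lod[lod == 0] = default_lod                  # fall back to the default
print(lod)  # feature 1: 120 + 3*10 = 150; feature 2: 5000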
def main(args):
    # Import data through the SECIMTools interface
    dat = wideToDesign(wide=args.input, design=args.design,
                       uniqID=args.uniqID, logger=logger)
    logger.info('Number of variables: {0}'.format(dat.wide.shape[0]))
    logger.info('Number of observations per variable: {0}'.format(dat.wide.shape[1]))

    # If there is no variance in a row, the correlations cannot be computed,
    # so drop zero-variance rows. (The screen is the sum of squared
    # deviations; summing the deviations before squaring would always give ~0.)
    dat.wide["variance"] = dat.wide.apply(lambda x: ((x - x.mean())**2).sum(), axis=1)
    dat.wide = dat.wide[dat.wide["variance"] != 0.0]
    dat.wide.drop("variance", axis=1, inplace=True)
    logger.info("Table arranged")

    # Compute the matrix of correlation coefficients.
    C = dat.wide.T.corr(method=args.correlation).values
    logger.info("Correlated")

    # For now, ignore the possibility that a variable will have negligible
    # variation.
    mask = np.ones(dat.wide.shape[0], dtype=bool)

    # Count the number of variables not excluded from the clustering.
    p = np.count_nonzero(mask)

    # Consider all values of the tuning parameter sigma in this array.
    sigmas, step = np.linspace(args.sigmaLow, args.sigmaHigh,
                               num=args.sigmaNum, retstep=True)

    # Compute the clustering for each of the several values of sigma.
    # Each sigma corresponds to a different affinity matrix, so the modularity
    # matrix is also different for each sigma. The goal is to find the
    # clustering whose modularity is greatest across all joint
    # (sigma, partition) pairs. In practice, we look for an approximation of
    # this global optimum.
    logger.info("Begin clustering")
    clustering, sigma, m = get_clustering(C, sigmas)

    # Report a summary of the results of the technical analysis.
    logger.info("After partition refinement:")
    logger.info("Sigma: {0}".format(sigma))
    logger.info("Number of clusters: {0}".format(clustering.max() + 1))
    logger.info("Modulated modularity: {0}".format(m))

    # Run the nontechnical analysis using the data frame and the less nerdy
    # of the outputs from the technical analysis.
    nontechnical_analysis(args, dat.wide, mask, C, clustering)
    logger.info("Script Complete!")
def main(args):
    # Check grouping levels
    if args.levels and args.group:
        levels = [args.group] + args.levels
    elif args.group and not args.levels:
        levels = [args.group]
    else:
        levels = []

    # Parse data with interface
    logger.info("Loading data with the Interface")
    dat = wideToDesign(args.input, args.design, args.uniqID, args.group,
                       runOrder=args.order, anno=args.levels, logger=logger)

    # Clean the data of missing values
    dat.dropMissing()

    # Get colors
    palette.getColors(dat.design, levels)

    # Transpose the data so compounds are columns, set the runOrder as index,
    # and drop the column with the groups from the transposed wide.
    trans = dat.transpose()
    trans.set_index(dat.runOrder, inplace=True)
    trans.drop(dat.group, axis=1, inplace=True)

    # Run regressions
    logger.info("Running Regressions")
    ror_df = runRegression(trans)

    # Create flags for p-values 0.05 and 0.01
    ror_flags = Flags(index=ror_df.index)
    ror_flags.addColumn(column="flag_feature_runOrder_pval_05",
                        mask=(ror_df["pval"] <= 0.05))
    ror_flags.addColumn(column="flag_feature_runOrder_pval_01",
                        mask=(ror_df["pval"] <= 0.01))

    # Plot results: open a multiple-page PDF for plots
    logger.info("Plotting Results")
    with PdfPages(args.figure) as pdf:
        plotSignificantROR(ror_df, pdf, palette)

        # If there are no pages, emit a placeholder page
        if pdf.get_pagecount() == 0:
            fig = plt.figure()
            fig.text(0.5, 0.4, "There were no features significant for plotting.",
                     fontsize=12)
            pdf.savefig(fig)

    # Write results and flags to TSV files
    ror_df.to_csv(args.table, sep="\t", float_format="%.4f",
                  index_label=args.uniqID, columns=["pval", "rsq", "slope"])
    ror_flags.df_flags.to_csv(args.flags, sep="\t", index_label=args.uniqID)
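# runRegression above is project-specific and not shown. As a hypothetical
# sketch of the technique (a per-feature linear regression of intensity on run
# order, producing the pval/rsq/slope columns written out above), scipy's
# linregress would look roughly like this; the numeric run-order index is an
# assumption.
import pandas as pd
from scipy.stats import linregress

def run_order_regression_sketch(trans):
    """trans: samples-by-features table indexed by run order."""
    rows = {}
    for feature in trans.columns:
        res = linregress(trans.index.astype(float), trans[feature].astype(float))
        rows[feature] = {"pval": res.pvalue, "rsq": res.rvalue ** 2,
                         "slope": res.slope}
    return pd.DataFrame.from_dict(rows, orient="index")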
def main(args): """ Function to call all other functions """ # Checking if levels if args.levels and args.group: levels = [args.group] + args.levels elif args.group and not args.levels: levels = [args.group] else: levels = [] logger.info(u"Groups used to color by: {0}".format(",".join(levels))) # Parsing files with interface logger.info(u"Loading data with the Interface") dat = wideToDesign(args.input, args.design, args.uniqID, args.group, anno=args.levels, runOrder=args.order, logger=logger) # Cleaning from missing data dat.dropMissing() # Sort data by runOrder if provided if args.order: logger.info(u"Sorting by runOrder") design_final = dat.design.sort_values(by=args.order, axis=0) wide_final = dat.wide.reindex(columns=design_final.index) else: design_final = dat.design wide_final = dat.wide # Get colors for each sample based on the group palette.getColors(design=design_final, groups=levels) # Open PDF pages to output figures with PdfPages(args.figure) as pdf: # Plot density plot logger.info(u"Plotting density for sample distribution") plotDensityDistribution(pdf=pdf, wide=wide_final, palette=palette) # Plot boxplots logger.info(u"Plotting boxplot for sample distribution") plotBoxplotDistribution(pdf=pdf, wide=wide_final, palette=palette) logger.info(u"Script complete!")
def main(args):
    # Check grouping levels
    if args.levels and args.group:
        levels = [args.group] + args.levels
    elif args.group and not args.levels:
        levels = [args.group]
    else:
        levels = []

    # Load data through the Interface
    dat = wideToDesign(args.input, args.design, args.uniqID, group=args.group,
                       anno=args.levels, logger=logger)

    # Treat everything as numeric
    dat.wide = dat.wide.applymap(float)

    # Clean the data of missing values
    dat.dropMissing()

    # Get colors for each sample based on the group
    palette.getColors(design=dat.design, groups=levels)

    # Transpose data
    dat.trans = dat.transpose()

    # Run PLS
    df_scores, df_weights, df_classification = runPLS(dat.trans, dat.group,
                                                      args.toCompare,
                                                      args.nComp,
                                                      args.cross_validation)

    # Update the palette after dropping groups not selected in toCompare
    palette.design = palette.design.T[df_scores.index].T
    palette.ugColors = {ugc: palette.ugColors[ugc]
                        for ugc in palette.ugColors.keys()
                        if ugc in args.toCompare}

    # Plot scatter plot for scores
    with PdfPages(args.figure) as pdfOut:
        logger.info(u"Plotting PLS scores")
        plotScores(data=df_scores, palette=palette, pdf=pdfOut)

    # Save df_scores, df_weights and df_classification to TSV files.
    df_scores.to_csv(args.outScores, sep="\t", index_label='sampleID')
    df_weights.to_csv(args.outWeights, sep="\t", index_label=dat.uniqID)
    df_classification.to_csv(args.outClassification, sep="\t",
                             index_label='sampleID')

    # Compute the classification accuracy: the percentage of samples whose
    # predicted group matches the observed group (the == comparison counts
    # matches, not mismatches).
    classification_accuracy_percent = 100 * sum(
        df_classification['Group_Observed'] ==
        df_classification['Group_Predicted_Rounded']) / df_classification.shape[0]
    classification_accuracy_string = str(classification_accuracy_percent) + ' Percent'
    os.system("echo %s > %s" % (classification_accuracy_string,
                                args.outClassificationAccuracy))

    # End the script
    logger.info(u"Finishing running of PLS")
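# runPLS above is project-specific. A rough, hypothetical sketch of the
# underlying technique with scikit-learn's PLSRegression: encode the two
# compared groups as 0/1, fit, and read scores (x_scores_) and weights
# (x_weights_). An illustration only, not the SECIMTools implementation.
import pandas as pd
from sklearn.cross_decomposition import PLSRegression

def run_pls_sketch(trans, group_col, to_compare, n_comp=2):
    """trans: samples-by-features table including the group column."""
    sub = trans[trans[group_col].isin(to_compare)]
    X = sub.drop(group_col, axis=1).astype(float).values
    y = (sub[group_col] == to_compare[1]).astype(float).values
    pls = PLSRegression(n_components=n_comp).fit(X, y)
    scores = pd.DataFrame(pls.x_scores_, index=sub.index)
    weights = pd.DataFrame(pls.x_weights_, index=sub.columns.drop(group_col))
    return scores, weights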
def main(args):
    # Import data with interface
    dat = wideToDesign(wide=args.input, design=args.design,
                       uniqID=args.uniqID, logger=logger)

    # Drop the requested rows/columns
    kpd_wide = dropRowCol(df_col_UPD=dat.wide, rowID=args.row,
                          colID=args.col, args=args)

    # Output the new wide dataset
    kpd_wide.to_csv(args.outWide, sep='\t')
def main(args): """Runs eveything""" # Importing data dat = wideToDesign(args.input, args.design, args.uniqID, logger=logger) # Cleaning from missing data dat.dropMissing() # Getting labels to drop from arguments x = True y = True if "x" in args.labels: x = False if "y" in args.labels: y = False print("x =", x) print("y =", y) #Plotting with dendogram Hierarchical cluster heatmap (HCH) logger.info("Plotting heatmaps") if args.dendogram == True: fh = hm.plotHCHeatmap(dat.wide, hcheatmap=True, cmap=palette.mpl_colormap, xlbls=x, ylbls=y) fh.savefig(args.fig, format="pdf") #Plotting without a dendogram single heatmap else: # Creating figure Handler object fh = figureHandler(proj='2d', figsize=(14, 14)) # Creating plot hm.plotHeatmap(dat.wide, fh.ax[0], cmap=palette.mpl_colormap, xlbls=x, ylbls=y) # formating axis fh.formatAxis(xTitle="sampleID") # Saving figure fh.export(out=args.fig, dpi=300) # Finishing script logger.info("Script Complete!")
def main(args): """ Function to call all other functions """ # Loading files with interface logger.info(u"Loading data with the Interface") dat = wideToDesign(args.input, args.design, args.uniqID, group=args.group, logger=logger) # Cleaning from missing data dat.dropMissing() # Subseting wide to get features for wide files with more that 50 features if len(dat.wide.index) > 50: wide = dat.wide.sample(n=50, axis=0) wide = wide.T else: wide = dat.wide.T # Saving figure with PdfPages(args.figure) as pdf: # Iterating over groups if args.group: # Getting colors for groups palette.getColors(design=dat.design, groups=[dat.group]) # Iterating over groups for name, group in dat.design.groupby(args.group): logger.info(u"Plotting for group {0}".format(name)) # Plotting Density and Box plot for the group plotDensity(data=wide.T[group.index], name=name, pdf=pdf) # Get colors for each feature for "All groups" logger.info(u"Plotting for group {0}".format("samples")) palette.getColors(design=dat.design, groups=[]) # Plotting density and boxplots for all plotDensity(data=wide, name="samples", pdf=pdf) #Ending script logger.info(u"Ending script")
def main(args):
    # Import data
    logger.info("Importing data with the interface")
    dat = wideToDesign(args.input, args.design, args.uniqID)

    # Clean the data of missing values
    dat.dropMissing()

    # Iterate through each group and flag a feature for a group when over
    # half of that group's values fall below the cutoff.
    logger.info("Running threshold based flags")
    df_offFlags = Flags(index=dat.wide.index)
    for title, group in dat.design.groupby(args.group):
        mask = (dat.wide[group.index] < args.cutoff)
        meanOn = mask.mean(axis=1)
        df_offFlags.addColumn(column='flag_feature_' + title + '_off',
                              mask=meanOn > 0.5)

    logger.info("Creating output")
    df_offFlags.df_flags.to_csv(args.output, sep="\t")
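# A minimal illustration of the rule above with made-up numbers: the mask
# marks values below the cutoff, and mean(axis=1) turns each row into the
# fraction of samples below it, so > 0.5 means "over half".
import pandas as pd

wide = pd.DataFrame({"s1": [10, 900], "s2": [20, 950], "s3": [800, 30]},
                    index=["featA", "featB"])
cutoff = 100
frac_below = (wide < cutoff).mean(axis=1)
print(frac_below > 0.5)  # featA: True (2/3 below), featB: False (1/3 below)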
def main(args):
    # Import data with the interface
    dat = wideToDesign(wide=args.input, design=args.design,
                       uniqID=args.uniqID, logger=logger)

    # Clean the data of missing values
    dat.dropMissing()

    # Read flag file
    df_flags = pd.read_table(args.flags)

    # Set the index on the flag file; if none was provided, raise an error
    if args.flagUniqID:
        df_flags.set_index(args.flagUniqID, inplace=True)
    else:
        logger.error("No flagUniqID provided")
        raise ValueError("No flagUniqID provided")

    # Drop either rows or columns
    logger.info("Running drop flags by {0}".format(args.flagfiletype))
    if args.flagfiletype == "column":
        kpd_wide, kpd_flag = dropColumns(df_wide=dat.wide, df_flags=df_flags,
                                         cut_value=args.value,
                                         condition=args.condition, args=args)
    else:
        kpd_wide, kpd_flag = dropRows(df_wide=dat.wide, df_flags=df_flags,
                                      cut_value=args.value,
                                      condition=args.condition, args=args)

    # Output wide data and flags
    kpd_wide.to_csv(args.outWide, sep='\t')
    kpd_flag.to_csv(args.outFlags, sep='\t')

    # Finish the script
    logger.info("Script complete.")
def main(args):
    # Import data through the wideToDesign data manager
    logger.info("Importing data through wideToDesign data manager")
    dat = wideToDesign(args.input, args.design, args.uniqID, logger=logger)

    # Clean the data of missing values
    dat.dropMissing()

    # Make sure all the groups to drop actually exist in the design column
    if args.group:
        for todrop in args.drops:
            if todrop not in set(dat.design[args.group].values):
                logger.error("The group '{0}' is not located in the column '{1}' "
                             "of your design file".format(todrop, args.group))
                raise ValueError

    # If the subsetting is going to be done by group, select the sampleIDs
    # from the design file
    logger.info(u"Getting sampleNames to drop")
    if args.group:
        iToDrop = list()
        for name, group in dat.design.groupby(args.group):
            if name in args.drops:
                iToDrop += group.index.tolist()
    else:
        iToDrop = args.drops

    # Remove weird characters
    iToDrop = [cleanStr(x) for x in iToDrop]

    # Drop the selected elements
    selectedDesign = dat.design.drop(iToDrop, axis=0, inplace=False)

    # Output the results
    logger.info("Output wide file")
    selectedDesign.to_csv(args.out, sep='\t')
    logger.info("Script Complete!")
def main(args):
    # Check grouping levels
    if args.levels and args.group:
        levels = [args.group] + args.levels
    elif args.group and not args.levels:
        levels = [args.group]
    else:
        levels = []

    # Load data through the Interface
    dat = wideToDesign(args.input, args.design, args.uniqID, group=args.group,
                       anno=args.levels, logger=logger)

    # Clean the data of missing values
    dat.dropMissing()

    # Get colors for each sample based on the group
    palette.getColors(design=dat.design, groups=levels)

    # Transpose the matrix
    dat.wide = dat.wide.T

    # Run PCA
    df_scores, df_loadings, df_summary = runPCA(dat.wide)

    # Plot 2D and 3D scatter plots of the scores
    logger.info(u"Plotting PCA scores")
    with PdfPages(args.figure) as pdfOut:
        plotScatterplot2D(data=df_scores, palette=palette, pdf=pdfOut)
        plotScatterplot3D(data=df_scores, palette=palette, pdf=pdfOut)

    # Save scores, loadings and summary
    df_scores.to_csv(args.score_out, sep="\t", index_label='sampleID')
    df_loadings.to_csv(args.load_out, sep="\t", index_label=dat.uniqID)
    df_summary.to_csv(args.summary_out, sep="\t", index_label="PCs")

    # End the script
    logger.info(u"Finishing running of PCA")
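# runPCA above is project-specific. A rough, hypothetical sketch of the same
# step with scikit-learn: scores are the projected samples, loadings the
# components, and the summary the explained-variance ratios. An illustration
# only, not the SECIMTools implementation.
import pandas as pd
from sklearn.decomposition import PCA

def run_pca_sketch(wide_T, n_components=3):
    """wide_T: samples-by-features table (already transposed)."""
    pca = PCA(n_components=n_components)
    scores = pd.DataFrame(pca.fit_transform(wide_T.values), index=wide_T.index,
                          columns=["PC{0}".format(i + 1) for i in range(n_components)])
    loadings = pd.DataFrame(pca.components_.T, index=wide_T.columns,
                            columns=scores.columns)
    summary = pd.DataFrame({"explained_variance_ratio": pca.explained_variance_ratio_},
                           index=scores.columns)
    return scores, loadings, summary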
def main(args):
    # Parse data with interface
    dat = wideToDesign(wide=args.input, design=args.design,
                       uniqID=args.uniqID, group=args.group, logger=logger)

    # Remove groups with just one element from dat
    dat.removeSingle()

    # Create folder for counts if an html output was requested
    if args.html is not None:
        logger.info(u"Using html output file")
        folderDir = args.htmlPath
        try:
            os.makedirs(folderDir)
        except Exception as e:
            logger.error("Error. {}".format(e))

        # Initiate the zip files
        html = createHTML()
        folderDir = folderDir + "/" + args.counts
def main(args):
    # Load data through the Interface
    dat = wideToDesign(args.input, args.design, args.uniqueID,
                       group=args.group, logger=logger)

    # Treat everything as numeric
    dat.wide = dat.wide.applymap(float)

    # Clean the data of missing values
    dat.dropMissing()

    # Get the unique groups and all pairwise combinations so that we can feed
    # them to Kruskal-Wallis.
    group_values_series = dat.transpose()[dat.group].T.squeeze()
    group_values_series_unique = group_values_series.unique()
    number_of_unique_groups = group_values_series_unique.shape[0]
    groups_pairwise = list(combinations(group_values_series_unique, 2))
    number_of_groups_pairwise = len(groups_pairwise)

    # Extract data from the interface.
    data_frame = dat.transpose()
    # Extract the number of features (the group column is excluded).
    number_of_features = data_frame.shape[1] - 1

    # Run the overall Kruskal-Wallis test for all group levels combined.
    # Create p-value and flag lists for 3 significance levels as empty lists
    # of length equal to number_of_features. These will be used for all groups.
    p_value_all = [0] * number_of_features
    H_value_all = [0] * number_of_features
    mean_value_all = [0] * number_of_features
    variance_value_all = [0] * number_of_features
    flag_value_all_0p01 = [0] * number_of_features
    flag_value_all_0p05 = [0] * number_of_features
    flag_value_all_0p10 = [0] * number_of_features

    for j in range(0, number_of_features):
        # Create a duplicate for manipulation.
        data_frame_manipulate = data_frame

        # Drop the column that characterizes the group so only feature columns
        # remain. We also transpose here so it will be easier to operate with.
        data_frame_manipulate_transpose = data_frame_manipulate.drop(
            args.group, 1).transpose()
        # Pull the indexes list from the current data frame.
        indexes_list_complete = data_frame_manipulate_transpose.index.tolist()

        # Compute dataset summaries.
        mean_value_all[j] = np.mean(
            data_frame_manipulate_transpose.loc[indexes_list_complete[j]])
        variance_value_all[j] = np.var(
            data_frame_manipulate_transpose.loc[indexes_list_complete[j]],
            ddof=1)

        for i in range(0, number_of_unique_groups):
            # Extract the piece of the data frame that belongs to the i-th
            # unique group.
            data_frame_current_group = data_frame.loc[data_frame[
                args.group].isin([group_values_series_unique[i]])]

            # Drop the group column so only feature columns remain, and
            # transpose for easier handling.
            data_frame_current_group = data_frame_current_group.drop(
                args.group, 1).transpose()

            # Pull the indexes list from the current data frame.
            indexes_list = data_frame_current_group.index.tolist()

            # Series for group i and row (feature) j.
            series_current = data_frame_current_group.loc[indexes_list[j]]

            # Collect the series; start the list on the first group.
            if i == 0:
                series_total = [series_current]
            else:
                series_total.append(series_current)

        # Check whether the compared elements actually differ by combining
        # them and counting the unique values.
        combined_list = data_frame_manipulate_transpose.loc[
            indexes_list_complete[j]].tolist()
        combined_list_unique = np.unique(combined_list)

        # If there is exactly 1 unique element, Kruskal-Wallis is undefined
        # for feature j, so record NaN instead of running the test.
        if len(combined_list_unique) == 1:
            p_value_all[j] = float("nan")
            H_value_all[j] = float("nan")
            # NaN comparisons are always False, so these flags stay 0.
            if p_value_all[j] < 0.01: flag_value_all_0p01[j] = 1
            if p_value_all[j] < 0.05: flag_value_all_0p05[j] = 1
            if p_value_all[j] < 0.10: flag_value_all_0p10[j] = 1
        else:
            # Perform Kruskal-Wallis across all groups for feature j.
            # A single call returns (H, p); calling it twice would repeat the work.
            kruskal_wallis_args = series_total
            H_value_all[j], p_value_all[j] = kruskalwallis(*kruskal_wallis_args)
            if p_value_all[j] < 0.01: flag_value_all_0p01[j] = 1
            if p_value_all[j] < 0.05: flag_value_all_0p05[j] = 1
            if p_value_all[j] < 0.10: flag_value_all_0p10[j] = 1

    # The loop over features is finished. Convert the results into data frames.
    # The pairwise results will be added later.
    summary_df = pd.DataFrame(data=mean_value_all, columns=["GrandMean"],
                              index=indexes_list)
    summary_df['SampleVariance'] = variance_value_all
    summary_df['H_value_for_all'] = H_value_all
    summary_df['prob_greater_than_H_for_all'] = p_value_all

    flag_df = pd.DataFrame(data=flag_value_all_0p01,
                           columns=["flag_significant_0p01_on_all_groups"],
                           index=indexes_list)
    flag_df["flag_significant_0p05_on_all_groups"] = flag_value_all_0p05
    flag_df["flag_significant_0p10_on_all_groups"] = flag_value_all_0p10

    # Report that KW for all groups has been performed.
    logger.info(u"Kruskal-Wallis test for all groups together has been performed.")

    # Compute means for each group.
    # This part just produces summary statistics for the output table;
    # it has nothing to do with Kruskal-Wallis.
    for i in range(0, number_of_unique_groups):
        # Extract the piece of the data frame that belongs to the i-th group.
        data_frame_current_group = data_frame.loc[data_frame[args.group].isin(
            [group_values_series_unique[i]])]

        # Drop the group column so only feature columns remain, and transpose
        # for easier handling.
        data_frame_current_group = data_frame_current_group.drop(
            args.group, 1).transpose()

        # Pull the indexes list from the current group.
        indexes_list = data_frame_current_group.index.tolist()

        # Create the array of means for the current group that will be filled.
        means_value = [0] * number_of_features
        for j in range(0, number_of_features):
            series_current = data_frame_current_group.loc[indexes_list[j]]
            means_value[j] = series_current.mean()

        means_value_column_name_current = 'mean_treatment_' + group_values_series_unique[i]
        summary_df[means_value_column_name_current] = means_value

    # Run the pairwise Kruskal-Wallis test for all pairs of group levels that
    # are saved in groups_pairwise.
    for i in range(0, number_of_groups_pairwise):
        # Extract the pieces of the data frame that belong to the groups in
        # the i-th unique pair.
        groups_subset = groups_pairwise[i]
        data_frame_first_group = data_frame.loc[data_frame[args.group].isin(
            [groups_subset[0]])]
        data_frame_second_group = data_frame.loc[data_frame[args.group].isin(
            [groups_subset[1]])]

        # Drop the group column so only feature columns remain, and transpose
        # for easier handling.
        data_frame_first_group = data_frame_first_group.drop(args.group, 1).transpose()
        data_frame_second_group = data_frame_second_group.drop(args.group, 1).transpose()

        # Pull the indexes list from the first one (they are the same).
        indexes_list = data_frame_first_group.index.tolist()

        # Create the p_value, neg_log10_p_value, flag_value and
        # difference_value lists filled with 0s.
        p_value = [0] * number_of_features
        H_value = [0] * number_of_features
        neg_log10_p_value = [0] * number_of_features
        flag_value_0p01 = [0] * number_of_features
        flag_value_0p05 = [0] * number_of_features
        flag_value_0p10 = [0] * number_of_features
        difference_value = [0] * number_of_features

        for j in range(0, number_of_features):
            series_first = data_frame_first_group.loc[indexes_list[j]]
            series_second = data_frame_second_group.loc[indexes_list[j]]

            # Check whether the compared elements actually differ by combining
            # them and counting the unique values.
            first_list = data_frame_first_group.loc[indexes_list[j]].tolist()
            second_list = data_frame_second_group.loc[indexes_list[j]].tolist()
            combined_list = first_list + second_list
            combined_list_unique = np.unique(combined_list)

            # If there is exactly 1 unique element the test is undefined, so
            # record NaN (NaN comparisons are always False, so the flags stay 0).
            if len(combined_list_unique) == 1:
                p_value[j] = float("nan")
                H_value[j] = float("nan")
                neg_log10_p_value[j] = -np.log10(p_value[j])
                difference_value[j] = series_first.mean() - series_second.mean()
                if p_value[j] < 0.01: flag_value_0p01[j] = 1
                if p_value[j] < 0.05: flag_value_0p05[j] = 1
                if p_value[j] < 0.10: flag_value_0p10[j] = 1
            else:
                # A single call returns (H, p). For two groups this is
                # equivalent to kruskalwallis(series_first, series_second).
                kruskal_wallis_args = [series_first, series_second]
                H_value[j], p_value[j] = kruskalwallis(*kruskal_wallis_args)
                neg_log10_p_value[j] = -np.log10(p_value[j])
                difference_value[j] = series_first.mean() - series_second.mean()
                if p_value[j] < 0.01: flag_value_0p01[j] = 1
                if p_value[j] < 0.05: flag_value_0p05[j] = 1
                if p_value[j] < 0.10: flag_value_0p10[j] = 1

        # Add the current p_value and flag_value columns to the data frames
        # and assign the names.
        p_value_column_name_current = 'prob_greater_than_H_for_diff_' + groups_subset[0] + '_' + groups_subset[1]
        H_value_column_name_current = 'H_value_for_diff_' + groups_subset[0] + '_' + groups_subset[1]
        neg_log10_p_value_column_name_current = 'neg_log10_p_value_' + groups_subset[0] + '_' + groups_subset[1]
        difference_value_column_name_current = 'diff_of_' + groups_subset[0] + '_' + groups_subset[1]

        summary_df[p_value_column_name_current] = p_value
        summary_df[H_value_column_name_current] = H_value
        summary_df[neg_log10_p_value_column_name_current] = neg_log10_p_value
        summary_df[difference_value_column_name_current] = difference_value

        flag_value_column_name_current_0p01 = 'flag_significant_0p01_on_' + groups_subset[0] + '_' + groups_subset[1]
        flag_value_column_name_current_0p05 = 'flag_significant_0p05_on_' + groups_subset[0] + '_' + groups_subset[1]
        flag_value_column_name_current_0p10 = 'flag_significant_0p10_on_' + groups_subset[0] + '_' + groups_subset[1]

        flag_df[flag_value_column_name_current_0p01] = flag_value_0p01
        flag_df[flag_value_column_name_current_0p05] = flag_value_0p05
        flag_df[flag_value_column_name_current_0p10] = flag_value_0p10

    # Round the results to 4 precision digits.
    summary_df = summary_df.apply(lambda x: x.round(4))

    # Restore the name of the unique ID column that was there originally.
    summary_df.index.name = args.uniqueID
    flag_df.index.name = args.uniqueID

    # Save summary_df to the output.
    summary_df.to_csv(args.summaries, sep="\t")
    # Save flag_df to the output.
    flag_df.to_csv(args.flags, sep="\t")

    # Report that the pairwise KW tests have been performed.
    logger.info(u"Kruskal-Wallis test for all groups pairwise has been performed.")

    # Generate the indexing for volcano plots.
    # Get the data for the -log10 p-values.
    lpvals = {col.split("_value_")[-1]: summary_df[col]
              for col in summary_df.columns.tolist()
              if col.startswith("neg_log10_p_value")}

    # Get the data for the differences.
    difs = {col.split("_of_")[-1]: summary_df[col]
            for col in summary_df.columns.tolist()
            if col.startswith("diff_of_")}

    # The cutoff value for significance: -log10(p) > 2 corresponds to p < 0.01.
    cutoff = 2

    # Make the volcano plots.
    with PdfPages(args.volcano) as pdf:
        for i in range(0, number_of_groups_pairwise):
            # Set up the figure.
            volcanoPlot = figureHandler(proj="2d")
            groups_subset = groups_pairwise[i]
            current_key = groups_subset[0] + '_' + groups_subset[1]

            # Plot all results.
            scatter.scatter2D(x=list(difs[current_key]),
                              y=list(lpvals[current_key]),
                              colorList=list('b'), ax=volcanoPlot.ax[0])

            # Color results beyond the threshold red.
            cutLpvals = lpvals[current_key][lpvals[current_key] > cutoff]
            if not cutLpvals.empty:
                cutDiff = difs[current_key][cutLpvals.index]
                scatter.scatter2D(x=list(cutDiff), y=list(cutLpvals),
                                  colorList=list('r'), ax=volcanoPlot.ax[0])

            # Draw the cutoff line.
            lines.drawCutoffHoriz(y=cutoff, ax=volcanoPlot.ax[0])

            # Format the axis.
            volcanoPlot.formatAxis(
                axTitle=current_key, grid=False,
                yTitle="-log10(p-value) for Diff of treatment means for {0}".format(current_key),
                xTitle="Difference of treatment means for {0}".format(current_key))

            # Add the figure to the PDF.
            volcanoPlot.addToPdf(pdfPages=pdf)

    # Report that the volcano plots are done.
    logger.info(u"Pairwise volcano plots have been created.")

    # End the script.
    logger.info(u"Finishing running of Kruskal-Wallis tests.")
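# A minimal, self-contained example of the core test used above (the script
# calls scipy's kruskalwallis; scipy.stats.kruskal is the equivalent modern
# entry point): made-up intensities for one feature across three groups.
from scipy.stats import kruskal

g1 = [10.1, 9.8, 10.4]
g2 = [12.0, 12.3, 11.7]
g3 = [10.0, 10.2, 9.9]
H, p = kruskal(g1, g2, g3)
print(H, p)  # a small p suggests the group distributions differ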
def main(args):
    # Get R ready.
    # Get the current path.
    myPath = os.path.abspath(os.path.dirname(os.path.realpath(__file__)))

    # Establish the path for the LASSO script.
    my_r_script_path = os.path.join(myPath, "lasso_enet.R")
    logger.info(my_r_script_path)

    # Activate pandas2ri.
    pandas2ri.activate()

    # Load the LASSO R script.
    with open(my_r_script_path, 'r') as f:
        rFile = f.read()
    lassoEnetScript = STAP(rFile, "lasso_enet")

    # Import data through the interface.
    dat = wideToDesign(args.input, args.design, args.uniqID, group=args.group,
                       logger=logger)

    # Clean the data of missing values.
    dat.dropMissing()

    # Transpose the data.
    dat.trans = dat.transpose()
    dat.trans.columns.name = ""

    # Drop nan rows from the design.
    removed = dat.design[dat.design[dat.group] == "nan"]
    dat.design = dat.design[dat.design[dat.group] != "nan"]
    dat.trans.drop(removed.index.values, axis=0, inplace=True)
    logger.info("{0} removed from analysis".format(removed.index.values))
    dat.design.rename(columns={dat.group: "group"}, inplace=True)
    dat.trans.rename(columns={dat.group: "group"}, inplace=True)

    # Generate a group list (groups with more than 2 samples).
    groupList = [title for title, group in dat.design.groupby("group")
                 if len(group.index) > 2]

    # Turn the group list into pairwise combinations.
    comboMatrix = np.array(list(it.combinations(groupList, 2)))
    comboLength = len(comboMatrix)

    # Run R.
    correct_list_of_names = np.array(dat.trans.columns.values.tolist())
    returns = lassoEnetScript.lassoEN(dat.trans, dat.design, args.uniqID,
                                      correct_list_of_names, comboMatrix,
                                      comboLength, args.alpha, args.plots)
    robjects.r['write.table'](returns[0], file=args.coefficients, sep='\t',
                              quote=False, row_names=False, col_names=True)
    robjects.r['write.table'](returns[1], file=args.flags, sep='\t',
                              quote=False, row_names=False, col_names=True)

    # Finish.
    logger.info("Script Complete!")
def main(args):
    # Load data through the Interface.
    logger.info("Loading data with the Interface")
    dat = wideToDesign(args.input, args.design, args.uniqueID,
                       group=args.group, logger=logger)

    # Treat everything as numeric.
    dat.wide = dat.wide.applymap(float)

    # Clean the data of missing values.
    dat.dropMissing()

    # Unpaired permuted t-test. In this case there can be as many groups as
    # desired. The order variable is ignored and t-tests are performed
    # pairwise for each pair of groups.
    logger.info("Unpaired t-test will be performed for all groups pairwise.")

    # Get the unique groups and all pairwise combinations to feed to the
    # pairwise unpaired t-tests.
    group_values_series = dat.transpose()[dat.group].T.squeeze()
    group_values_series_unique = group_values_series.unique()
    number_of_unique_groups = group_values_series_unique.shape[0]
    groups_pairwise = list(combinations(group_values_series_unique, 2))
    number_of_groups_pairwise = len(groups_pairwise)

    # Extract data from the interface.
    data_frame = dat.transpose()
    # Extract the number of features (the group column is excluded).
    number_of_features = data_frame.shape[1] - 1

    # Compute overall summaries (mean and variance).
    # This part just produces summary statistics for the output table.
    mean_value_all = [0] * number_of_features
    variance_value_all = [0] * number_of_features

    for j in range(0, number_of_features):
        # Create a duplicate for manipulation.
        data_frame_manipulate = data_frame

        # Drop the group column so only feature columns remain, and transpose
        # for easier handling.
        data_frame_manipulate_transpose = data_frame_manipulate.drop(args.group, 1).transpose()
        # Pull the indexes list from the current data frame.
        indexes_list_complete = data_frame_manipulate_transpose.index.tolist()

        # Compute dataset summaries.
        mean_value_all[j] = np.mean(data_frame_manipulate_transpose.loc[indexes_list_complete[j]])
        variance_value_all[j] = np.var(data_frame_manipulate_transpose.loc[indexes_list_complete[j]], ddof=1)

    # Create the table and put the results there.
    summary_df = pd.DataFrame(data=mean_value_all, columns=["GrandMean"],
                              index=indexes_list_complete)
    summary_df['SampleVariance'] = variance_value_all

    # Compute means for each group and output them.
    # This part just produces summary statistics for the output table.
    for i in range(0, number_of_unique_groups):
        # Extract the piece of the data frame that belongs to the i-th group.
        data_frame_current_group = data_frame.loc[data_frame[args.group].isin([group_values_series_unique[i]])]

        # Drop the group column so only feature columns remain, and transpose
        # for easier handling.
        data_frame_current_group = data_frame_current_group.drop(args.group, 1).transpose()

        # Pull the indexes list from the current group.
        indexes_list = data_frame_current_group.index.tolist()

        # Create the array of means for the current group that will be filled.
        means_value = [0] * number_of_features
        for j in range(0, number_of_features):
            series_current = data_frame_current_group.loc[indexes_list[j]]
            means_value[j] = series_current.mean()

        # Add the current mean_value column to the data frame and assign the name.
        means_value_column_name_current = 'mean_treatment_' + group_values_series_unique[i]
        summary_df[means_value_column_name_current] = means_value

    # Run the pairwise unpaired (two-sample) permutation t-test for all pairs
    # of group levels saved in groups_pairwise.
    for i in range(0, number_of_groups_pairwise):
        # Extract the pieces of the data frame that belong to the groups in
        # the i-th unique pair.
        groups_subset = groups_pairwise[i]
        data_frame_first_group = data_frame.loc[data_frame[args.group].isin([groups_subset[0]])]
        data_frame_second_group = data_frame.loc[data_frame[args.group].isin([groups_subset[1]])]

        # Drop the group column so only feature columns remain, and transpose
        # for easier handling.
        data_frame_first_group = data_frame_first_group.drop(args.group, 1).transpose()
        data_frame_second_group = data_frame_second_group.drop(args.group, 1).transpose()

        # Pull the indexes list from the first one (they are the same).
        indexes_list = data_frame_first_group.index.tolist()

        # Create the p_value, neg_log10_p_value, flag_value and
        # difference_value lists filled with 0s.
        p_value = [0] * number_of_features
        t_value = [0] * number_of_features
        neg_log10_p_value = [0] * number_of_features
        flag_value_0p01 = [0] * number_of_features
        flag_value_0p05 = [0] * number_of_features
        flag_value_0p10 = [0] * number_of_features
        difference_value = [0] * number_of_features

        for j in range(0, number_of_features):
            series_first = data_frame_first_group.loc[indexes_list[j]]
            series_second = data_frame_second_group.loc[indexes_list[j]]

            # two_sample returns (p-value, test statistic); a single call
            # avoids running the (random) permutations twice for the same pair.
            p_value[j], t_value[j] = two_sample(series_first, series_second,
                                                reps=int(args.reps), stat='t',
                                                alternative='two-sided',
                                                seed=None)
            neg_log10_p_value[j] = -np.log10(p_value[j])
            difference_value[j] = series_first.mean() - series_second.mean()
            if p_value[j] < 0.01: flag_value_0p01[j] = 1
            if p_value[j] < 0.05: flag_value_0p05[j] = 1
            if p_value[j] < 0.10: flag_value_0p10[j] = 1

        # Create column names for the data frame.
        p_value_column_name_current = 'perm_greater_than_t_for_diff_' + groups_subset[0] + '_' + groups_subset[1]
        t_value_column_name_current = 't_value_for_diff_' + groups_subset[0] + '_' + groups_subset[1]
        neg_log10_p_value_column_name_current = 'neg_log10_p_value_' + groups_subset[0] + '_' + groups_subset[1]
        difference_value_column_name_current = 'diff_of_' + groups_subset[0] + '_' + groups_subset[1]
        flag_value_column_name_current_0p01 = 'flag_significant_0p01_on_' + groups_subset[0] + '_' + groups_subset[1]
        flag_value_column_name_current_0p05 = 'flag_significant_0p05_on_' + groups_subset[0] + '_' + groups_subset[1]
        flag_value_column_name_current_0p10 = 'flag_significant_0p10_on_' + groups_subset[0] + '_' + groups_subset[1]

        # Add the current p_value and flag_value columns to the data frames
        # and assign the names. If the flag data frame has not been created
        # yet (i.e. i == 0), create it on the fly.
        if i == 0:
            flag_df = pd.DataFrame(data=flag_value_0p01,
                                   columns=[flag_value_column_name_current_0p01],
                                   index=indexes_list)
        else:
            flag_df[flag_value_column_name_current_0p01] = flag_value_0p01

        # At this point the data frame exists, so only columns are added to it.
        summary_df[p_value_column_name_current] = p_value
        summary_df[t_value_column_name_current] = t_value
        summary_df[neg_log10_p_value_column_name_current] = neg_log10_p_value
        summary_df[difference_value_column_name_current] = difference_value
        flag_df[flag_value_column_name_current_0p05] = flag_value_0p05
        flag_df[flag_value_column_name_current_0p10] = flag_value_0p10

    # Round the results to 4 precision digits.
    summary_df = summary_df.apply(lambda x: x.round(4))

    # Restore the name of the unique ID column that was there originally.
    summary_df.index.name = args.uniqueID
    flag_df.index.name = args.uniqueID

    # Save summary_df to the output.
    summary_df.to_csv(args.summaries, sep="\t")
    # Save flag_df to the output.
    flag_df.to_csv(args.flags, sep="\t")

    # Generate the indexing for volcano plots.
    # Get the data for the -log10 p-values.
    lpvals = {col.split("_value_")[-1]: summary_df[col]
              for col in summary_df.columns.tolist()
              if col.startswith("neg_log10_p_value")}

    # Get the data for the differences.
    difs = {col.split("_of_")[-1]: summary_df[col]
            for col in summary_df.columns.tolist()
            if col.startswith("diff_of_")}

    # The cutoff value for significance: -log10(p) > 2 corresponds to p < 0.01.
    cutoff = 2

    # Make the volcano plots.
    with PdfPages(args.volcano) as pdf:
        for i in range(0, number_of_groups_pairwise):
            # Set up the figure.
            volcanoPlot = figureHandler(proj="2d")
            groups_subset = groups_pairwise[i]
            current_key = groups_subset[0] + '_' + groups_subset[1]

            # Plot all results.
            scatter.scatter2D(x=list(difs[current_key]),
                              y=list(lpvals[current_key]),
                              colorList=list('b'), ax=volcanoPlot.ax[0])

            # Color results beyond the threshold red.
            cutLpvals = lpvals[current_key][lpvals[current_key] > cutoff]
            if not cutLpvals.empty:
                cutDiff = difs[current_key][cutLpvals.index]
                scatter.scatter2D(x=list(cutDiff), y=list(cutLpvals),
                                  colorList=list('r'), ax=volcanoPlot.ax[0])

            # Draw the cutoff line.
            lines.drawCutoffHoriz(y=cutoff, ax=volcanoPlot.ax[0])

            # Format the axis.
            volcanoPlot.formatAxis(
                axTitle=current_key, grid=False,
                yTitle="-log10(p-value) for Diff of treatment means for {0}".format(current_key),
                xTitle="Difference of treatment means for {0}".format(current_key))

            # Add the figure to the PDF.
            volcanoPlot.addToPdf(pdfPages=pdf)

    # Report that the volcano plots are done.
    logger.info(u"Pairwise volcano plots have been created.")

    # End the script.
    logger.info(u"Finishing t-test run.")
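# A minimal, self-contained sketch of the permutation t-test idea used above
# (the script itself calls permute's two_sample): shuffle the pooled values,
# recompute the t statistic, and take the fraction of shuffles at least as
# extreme as the observed statistic.
import numpy as np
from scipy.stats import ttest_ind

def perm_ttest_sketch(a, b, reps=10000, rng=np.random.default_rng(0)):
    observed = ttest_ind(a, b)[0]
    pooled = np.concatenate([a, b])
    count = 0
    for _ in range(reps):
        rng.shuffle(pooled)
        t = ttest_ind(pooled[:len(a)], pooled[len(a):])[0]
        if abs(t) >= abs(observed):
            count += 1
    return count / reps, observed   # (two-sided p-value, observed t)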
def main(args):
    # Load data through the Interface.
    logger.info("Loading data with the Interface")
    dat = wideToDesign(args.input, args.design, args.uniqueID,
                       group=args.group, runOrder=args.order, logger=logger)

    # Treat everything as numeric.
    dat.wide = dat.wide.applymap(float)

    # Clean the data of missing values.
    dat.dropMissing()

    # SCENARIO 1: Unpaired t-test. In this case there can be as many groups
    # as desired. The order variable is ignored and t-tests are performed
    # pairwise for each pair of groups.
    if args.pairing == "unpaired":
        logger.info("Unpaired t-test will be performed for all groups pairwise.")

        # Get the unique groups and all pairwise combinations so that we can
        # feed them to the pairwise unpaired t-tests.
        group_values_series = dat.transpose()[dat.group].T.squeeze()
        group_values_series_unique = group_values_series.unique()
        number_of_unique_groups = group_values_series_unique.shape[0]
        groups_pairwise = list(combinations(group_values_series_unique, 2))
        number_of_groups_pairwise = len(groups_pairwise)

        # Extract data from the interface.
        data_frame = dat.transpose()
        # Extract the number of features. This depends on whether the user has
        # provided an ordering variable; the unpaired test does not use it, it
        # just adds an extra column to the data frame.
        if args.order == False:
            number_of_features = data_frame.shape[1] - 1
        else:
            number_of_features = data_frame.shape[1] - 2

        # Compute overall summaries (mean and variance).
        # This part just produces summary statistics for the output table;
        # it has nothing to do with the unpaired t-test.
        mean_value_all = [0] * number_of_features
        variance_value_all = [0] * number_of_features

        for j in range(0, number_of_features):
            # Create a duplicate for manipulation.
            data_frame_manipulate = data_frame

            # Drop the columns that characterize group (and order, if it was
            # provided) so only feature columns remain, and transpose for
            # easier handling.
            if args.order == False:
                data_frame_manipulate_transpose = data_frame_manipulate.drop(args.group, 1).transpose()
            else:
                data_frame_manipulate_transpose = data_frame_manipulate.drop([args.group, args.order], 1).transpose()

            # Pull the indexes list from the current data frame.
            indexes_list_complete = data_frame_manipulate_transpose.index.tolist()

            # Compute dataset summaries.
            mean_value_all[j] = np.mean(data_frame_manipulate_transpose.loc[indexes_list_complete[j]])
            variance_value_all[j] = np.var(data_frame_manipulate_transpose.loc[indexes_list_complete[j]], ddof=1)

        # Create the table and put the results there.
        summary_df = pd.DataFrame(data=mean_value_all, columns=["GrandMean"],
                                  index=indexes_list_complete)
        summary_df['SampleVariance'] = variance_value_all

        # Compute means for each group and output them.
        # This part just produces summary statistics for the output table;
        # it has nothing to do with the unpaired t-test.
        for i in range(0, number_of_unique_groups):
            # Extract the piece of the data frame that belongs to the i-th group.
            data_frame_current_group = data_frame.loc[data_frame[args.group].isin([group_values_series_unique[i]])]

            # Drop the columns that characterize group (and order, if provided)
            # so only feature columns remain, and transpose for easier handling.
            if args.order == False:
                data_frame_current_group = data_frame_current_group.drop(args.group, 1).transpose()
            else:
                data_frame_current_group = data_frame_current_group.drop([args.group, args.order], 1).transpose()

            # Pull the indexes list from the current group.
            indexes_list = data_frame_current_group.index.tolist()

            # Create the array of means for the current group that will be filled.
            means_value = [0] * number_of_features
            for j in range(0, number_of_features):
                series_current = data_frame_current_group.loc[indexes_list[j]]
                means_value[j] = series_current.mean()

            # Add the current mean_value column to the data frame and assign the name.
            means_value_column_name_current = 'mean_treatment_' + group_values_series_unique[i]
            summary_df[means_value_column_name_current] = means_value

        # Run the pairwise unpaired (two-sample) t-test for all pairs of group
        # levels saved in groups_pairwise.
        for i in range(0, number_of_groups_pairwise):
            # Extract the pieces of the data frame that belong to the groups
            # in the i-th unique pair.
            groups_subset = groups_pairwise[i]
            data_frame_first_group = data_frame.loc[data_frame[args.group].isin([groups_subset[0]])]
            data_frame_second_group = data_frame.loc[data_frame[args.group].isin([groups_subset[1]])]

            # Drop the columns that characterize group (and order, if provided)
            # so only feature columns remain, and transpose for easier handling.
            if args.order == False:
                data_frame_first_group = data_frame_first_group.drop(args.group, 1).transpose()
                data_frame_second_group = data_frame_second_group.drop(args.group, 1).transpose()
            else:
                data_frame_first_group = data_frame_first_group.drop([args.group, args.order], 1).transpose()
                data_frame_second_group = data_frame_second_group.drop([args.group, args.order], 1).transpose()

            # Pull the indexes list from the first one (they are the same).
            indexes_list = data_frame_first_group.index.tolist()

            # Create the p_value, neg_log10_p_value, flag_value and
            # difference_value lists filled with 0s.
            p_value = [0] * number_of_features
            t_value = [0] * number_of_features
            neg_log10_p_value = [0] * number_of_features
            flag_value_0p01 = [0] * number_of_features
            flag_value_0p05 = [0] * number_of_features
            flag_value_0p10 = [0] * number_of_features
            difference_value = [0] * number_of_features

            for j in range(0, number_of_features):
                series_first = data_frame_first_group.loc[indexes_list[j]]
                series_second = data_frame_second_group.loc[indexes_list[j]]

                # ttest_ind returns (t statistic, p-value); equivalent to
                # ttest_ind(series_first, series_second).
                ttest_ind_args = [series_first, series_second]
                t_value[j], p_value[j] = ttest_ind(*ttest_ind_args)
                neg_log10_p_value[j] = -np.log10(p_value[j])
                difference_value[j] = series_first.mean() - series_second.mean()
                if p_value[j] < 0.01: flag_value_0p01[j] = 1
                if p_value[j] < 0.05: flag_value_0p05[j] = 1
                if p_value[j] < 0.10: flag_value_0p10[j] = 1

            # Create column names for the data frame.
            p_value_column_name_current = 'prob_greater_than_t_for_diff_' + groups_subset[0] + '_' + groups_subset[1]
            t_value_column_name_current = 't_value_for_diff_' + groups_subset[0] + '_' + groups_subset[1]
            neg_log10_p_value_column_name_current = 'neg_log10_p_value_' + groups_subset[0] + '_' + groups_subset[1]
            difference_value_column_name_current = 'diff_of_' + groups_subset[0] + '_' + groups_subset[1]
            flag_value_column_name_current_0p01 = 'flag_significant_0p01_on_' + groups_subset[0] + '_' + groups_subset[1]
            flag_value_column_name_current_0p05 = 'flag_significant_0p05_on_' + groups_subset[0] + '_' + groups_subset[1]
            flag_value_column_name_current_0p10 = 'flag_significant_0p10_on_' + groups_subset[0] + '_' + groups_subset[1]

            # Add the current p_value and flag_value columns to the data frames
            # and assign the names. If the flag data frame has not been created
            # yet (i.e. i == 0), create it on the fly.
            if i == 0:
                flag_df = pd.DataFrame(data=flag_value_0p01,
                                       columns=[flag_value_column_name_current_0p01],
                                       index=indexes_list)
            else:
                flag_df[flag_value_column_name_current_0p01] = flag_value_0p01

            # At this point the data frame exists, so only columns are added to it.
            summary_df[p_value_column_name_current] = p_value
            summary_df[t_value_column_name_current] = t_value
            summary_df[neg_log10_p_value_column_name_current] = neg_log10_p_value
            summary_df[difference_value_column_name_current] = difference_value
            flag_df[flag_value_column_name_current_0p05] = flag_value_0p05
            flag_df[flag_value_column_name_current_0p10] = flag_value_0p10

    # SCENARIO 2: Paired t-test. In this case there should be EXACTLY TWO
    # groups. Each sample in one group should have exactly one matching pair
    # in the other group. The matching is controlled by the args.order variable.
    if args.pairing == "paired":
        logger.info("Paired test will be performed for two groups pairwise based on pairing variable: {0}.".format(args.order))

        # Get the number of unique groups. If it is not 2, warn and exit.
        group_values_series = dat.transpose()[dat.group].T.squeeze()
        group_values_series_unique = group_values_series.unique()
        number_of_unique_groups = group_values_series_unique.shape[0]
        if number_of_unique_groups != 2:
            logger.warning(u"The number of unique groups is {0} and not 2 as expected. The paired t-test cannot be performed.".format(number_of_unique_groups))
            exit()

        # This code is executed only if number_of_unique_groups is exactly 2,
        # so the group check is passed.
        # Create the pairwise combination of our two groups for later use.
        groups_pairwise = list(combinations(group_values_series_unique, 2))
        number_of_groups_pairwise = len(groups_pairwise)

        # Extract data from the interface.
        data_frame = dat.transpose()

        # Check that the required pairing variable has been provided.
        if args.order == False:
            logger.info("The required t-test pairing variable has not been provided: The paired t-test cannot be performed.")
            exit()

        # This code is executed only if args.order has been provided and the
        # check is passed. Define the number of features: the dimension of the
        # data frame minus the 2 columns that stand for args.group and args.order.
        number_of_features = data_frame.shape[1] - 2

        # At this point it is confirmed that there are only 2 groups and that
        # the pairing variable args.order has been provided. Now we need to
        # check that the pairing is correct, i.e.
        # that each pairID corresponds to exactly two samples from different
        # groups.

        # Get the unique pairIDs; pairs without exactly two samples will be
        # deleted below.
        pairid_values_series = dat.transpose()[dat.runOrder].T.squeeze()
        pairid_values_series_unique = pairid_values_series.unique()
        number_of_unique_pairid = pairid_values_series_unique.shape[0]

        # Extract data from the interface.
        data_frame = dat.transpose()
        # Extract the number of samples in the final frame.
        number_of_samples = data_frame.shape[0]

        # Clean the original data: remove samples that are not paired or do
        # not belong to the two groups.
        # If the dataset has 1 or 3 or more matches for a pairID, those
        # samples are removed with a warning.
        # If a pairID corresponds to exactly two samples (which is correct)
        # but the groupIDs are NOT different, those values are also removed.
        for i in range(0, number_of_unique_pairid):
            # Extract the piece of the data frame that belongs to the i-th
            # unique pairID.
            data_frame_current_pairid = data_frame.loc[data_frame[args.order].isin([pairid_values_series_unique[i]])]

            # Transpose for easier handling.
            data_frame_current_pairid = data_frame_current_pairid.transpose()
            sample_names_current_pairid = list(data_frame_current_pairid.columns.values)

            if data_frame_current_pairid.shape[1] != 2:
                logger.warning(u"Number of samples for the pairID: {0} is equal to {1} and NOT equal to 2. Sample(s) {2} will be removed from further analysis.".format(pairid_values_series_unique[i], data_frame_current_pairid.shape[1], sample_names_current_pairid))

                # Get the indexes we are trying to delete.
                boolean_indexes_to_delete = data_frame.index.isin(sample_names_current_pairid)
                # Delete the indexes and go to the next iteration of the loop.
                data_frame.drop(data_frame.index[boolean_indexes_to_delete], inplace=True)

            # This branch is executed if the number is correct, i.e.
            # data_frame_current_pairid.shape[1] == 2. Here we check whether
            # the groupIDs for the given pair are indeed different.
            elif data_frame_current_pairid.transpose()[args.group][0] == data_frame_current_pairid.transpose()[args.group][1]:
                logger.warning(u"Samples in pairID {0} have groupIDs: {1} and {2}. Should be different! Sample(s) {3} will be removed from further analysis.".format(pairid_values_series_unique[i], data_frame_current_pairid.transpose()[args.group][1], data_frame_current_pairid.transpose()[args.group][0], sample_names_current_pairid))

                # Get the indexes we are trying to delete.
                boolean_indexes_to_delete = data_frame.index.isin(sample_names_current_pairid)
                # Delete the indexes.
                data_frame.drop(data_frame.index[boolean_indexes_to_delete], inplace=True)

        # Check whether the data frame became empty after cleaning.
        if data_frame.shape[0] == 0:
            logger.warning(u"Number of paired samples in the final dataset is exactly 0! Please check the design file for accuracy! Exiting the program.")
            exit()

        # Compute overall summaries (mean and variance).
        # This part just produces summary statistics for the output table;
        # it has nothing to do with the paired t-test.
        mean_value_all = [0] * number_of_features
        variance_value_all = [0] * number_of_features

        for j in range(0, number_of_features):
            # Create a duplicate for manipulation.
            data_frame_manipulate = data_frame

            # Drop the group and order columns so only feature columns remain,
            # and transpose for easier handling.
            data_frame_manipulate_transpose = data_frame_manipulate.drop([args.group, args.order], axis=1).transpose()
            # Pulling the indexes list from the current data frame.
            indexes_list_complete = data_frame_manipulate_transpose.index.tolist()

            # Computing dataset summaries.
            mean_value_all[j] = np.mean(data_frame_manipulate_transpose.loc[indexes_list_complete[j]])
            variance_value_all[j] = np.var(data_frame_manipulate_transpose.loc[indexes_list_complete[j]], ddof=1)

        # Creating the table and putting the results there.
        summary_df = pd.DataFrame(data=mean_value_all, columns=["GrandMean"], index=indexes_list_complete)
        summary_df['SampleVariance'] = variance_value_all

        # Computing means for each group and outputting them.
        # This part only produces summary statistics for the output table;
        # it has nothing to do with the paired t-test itself.
        for i in range(0, number_of_unique_groups):
            # Extracting the piece of the data frame that belongs to the ith group.
            data_frame_current_group = data_frame.loc[data_frame[args.group].isin([group_values_series_unique[i]])]
            # Dropping the columns that characterize group and pairing; only feature columns remain.
            data_frame_current_group = data_frame_current_group.drop([args.group, args.order], axis=1).transpose()
            # Pulling the indexes list from the current group.
            indexes_list = data_frame_current_group.index.tolist()

            # Creating the array of means for the current group that will be filled.
            means_value = [0] * number_of_features
            for j in range(0, number_of_features):
                series_current = data_frame_current_group.loc[indexes_list[j]]
                means_value[j] = series_current.mean()

            # Adding the current mean_value column to the data frame and assigning the name.
            means_value_column_name_current = 'mean_treatment_' + group_values_series_unique[i]
            summary_df[means_value_column_name_current] = means_value

        # Performing the paired t-test for the two groups and saving the results.
        # Creating empty lists of length number_of_features for p-values and flag values.
        # These will be used for the two groups in the paired t-test.
        p_value = [0] * number_of_features
        t_value = [0] * number_of_features
        flag_value_0p01 = [0] * number_of_features
        flag_value_0p05 = [0] * number_of_features
        flag_value_0p10 = [0] * number_of_features
        neg_log10_p_value = [0] * number_of_features
        difference_value = [0] * number_of_features

        # Extracting the pieces of the data frame that belong to the first and second group.
        # This is done once, outside the feature loop, since the frames do not change per feature.
        data_frame_first_group = data_frame.loc[data_frame[args.group].isin([group_values_series_unique[0]])]
        data_frame_second_group = data_frame.loc[data_frame[args.group].isin([group_values_series_unique[1]])]

        # Sorting both data frames by the pairing variable (args.order).
        # This ensures the datasets are aligned by pair when fed to the t-test.
        data_frame_first_group = data_frame_first_group.sort_values(by=args.order)
        data_frame_second_group = data_frame_second_group.sort_values(by=args.order)

        # Dropping the group and pairing columns; only feature columns remain. Transposing for easier access.
        data_frame_first_group = data_frame_first_group.drop([args.group, args.order], axis=1).transpose()
        data_frame_second_group = data_frame_second_group.drop([args.group, args.order], axis=1).transpose()

        # Pulling the list of indexes. It is the same list for the first and the second group.
        indexes_list = data_frame_first_group.index.tolist()

        # Performing the paired t-test for each feature.
        for j in range(0, number_of_features):
            # Pulling the paired samples out.
            series_first = data_frame_first_group.loc[indexes_list[j]]
            series_second = data_frame_second_group.loc[indexes_list[j]]

            # Running the paired t-test for the two given samples.
            t_value[j], p_value[j] = ttest_rel(series_first, series_second)
            neg_log10_p_value[j] = -np.log10(p_value[j])
            difference_value[j] = series_first.mean() - series_second.mean()
            if p_value[j] < 0.01:
                flag_value_0p01[j] = 1
            if p_value[j] < 0.05:
                flag_value_0p05[j] = 1
            if p_value[j] < 0.10:
                flag_value_0p10[j] = 1

        # The loop over features is finished by now; converting the results into the data frame.
        # Creating column names for the data frame.
        p_value_column_name_current = 'prob_greater_than_t_for_diff_' + group_values_series_unique[0] + '_' + group_values_series_unique[1]
        t_value_column_name_current = 't_value_for_diff_' + group_values_series_unique[0] + '_' + group_values_series_unique[1]
        neg_log10_p_value_column_name_current = 'neg_log10_p_value_' + group_values_series_unique[0] + '_' + group_values_series_unique[1]
        difference_value_column_name_current = 'diff_of_' + group_values_series_unique[0] + '_' + group_values_series_unique[1]
        flag_value_column_name_current_0p01 = 'flag_value_diff_signif_' + group_values_series_unique[0] + '_' + group_values_series_unique[1] + '_0p01'
        flag_value_column_name_current_0p05 = 'flag_value_diff_signif_' + group_values_series_unique[0] + '_' + group_values_series_unique[1] + '_0p05'
        flag_value_column_name_current_0p10 = 'flag_value_diff_signif_' + group_values_series_unique[0] + '_' + group_values_series_unique[1] + '_0p10'

        summary_df[t_value_column_name_current] = t_value
        summary_df[p_value_column_name_current] = p_value
        summary_df[neg_log10_p_value_column_name_current] = neg_log10_p_value
        summary_df[difference_value_column_name_current] = difference_value
        flag_df = pd.DataFrame(data=flag_value_0p01, columns=[flag_value_column_name_current_0p01], index=indexes_list)
        flag_df[flag_value_column_name_current_0p05] = flag_value_0p05
        flag_df[flag_value_column_name_current_0p10] = flag_value_0p10

    # Rounding the results to 4 decimal places.
    summary_df = summary_df.apply(lambda x: x.round(4))

    # Restoring the name of the unique ID column that was there originally.
    summary_df.index.name = args.uniqueID
    flag_df.index.name = args.uniqueID

    # Saving summary_df to the output.
    summary_df.to_csv(args.summaries, sep="\t")
    # Saving flag_df to the output.
    flag_df.to_csv(args.flags, sep="\t")

    # Generating indexing for the volcano plots.
    # Getting data for lpvals.
    lpvals = {col.split("_value_")[-1]: summary_df[col]
              for col in summary_df.columns.tolist()
              if col.startswith("neg_log10_p_value")}
    # Getting data for diffs.
    difs = {col.split("_of_")[-1]: summary_df[col]
            for col in summary_df.columns.tolist()
            if col.startswith("diff_of_")}

    # The cutoff value for significance.
    cutoff = 2

    # Making the volcano plots.
    with PdfPages(args.volcano) as pdf:
        for i in range(0, number_of_groups_pairwise):
            # Set up the figure.
            volcanoPlot = figureHandler(proj="2d")
            groups_subset = groups_pairwise[i]
            current_key = groups_subset[0] + '_' + groups_subset[1]

            # Plot all results.
            scatter.scatter2D(x=list(difs[current_key]), y=list(lpvals[current_key]),
                              colorList=list('b'), ax=volcanoPlot.ax[0])

            # Color results beyond the threshold red.
            cutLpvals = lpvals[current_key][lpvals[current_key] > cutoff]
            if not cutLpvals.empty:
                cutDiff = difs[current_key][cutLpvals.index]
                scatter.scatter2D(x=list(cutDiff), y=list(cutLpvals),
                                  colorList=list('r'), ax=volcanoPlot.ax[0])

            # Drawing the cutoff line.
            lines.drawCutoffHoriz(y=cutoff, ax=volcanoPlot.ax[0])

            # Format the axis (volcanoPlot).
            volcanoPlot.formatAxis(axTitle=current_key, grid=False,
                                   yTitle="-log10(p-value) for Diff of treatment means for {0}".format(current_key),
                                   xTitle="Difference of treatment means for {0}".format(current_key))

            # Add the figure to the PDF.
            volcanoPlot.addToPdf(pdfPages=pdf)

    # Informing the user that the volcano plots are done.
    logger.info(u"Pairwise volcano plots have been created.")

    # Ending the script.
    logger.info(u"Finishing running of t-test.")
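# --- Illustrative sketch (not part of the tool above) -------------------------
# Minimal, self-contained illustration of the pairing logic used by the paired
# scenario: samples are aligned by a pairing column before scipy's paired
# t-test is applied per feature. The toy column names (pairID, group,
# feature_1, ...) are hypothetical stand-ins for the design-file columns.
import numpy as np
import pandas as pd
from scipy.stats import ttest_rel

toy = pd.DataFrame({
    "group": ["A", "A", "A", "B", "B", "B"],
    "pairID": [1, 2, 3, 3, 1, 2],
    "feature_1": [10.1, 9.8, 10.4, 11.0, 10.9, 11.2],
    "feature_2": [5.0, 5.2, 4.9, 5.1, 5.0, 5.2],
})

# Sorting each group by pairID aligns row k of group A with row k of group B.
first = toy[toy["group"] == "A"].sort_values(by="pairID")
second = toy[toy["group"] == "B"].sort_values(by="pairID")

for feature in ["feature_1", "feature_2"]:
    t_stat, p_val = ttest_rel(first[feature].values, second[feature].values)
    flag_0p05 = int(p_val < 0.05)
    print("{0}: t = {1:.3f}, p = {2:.4f}, flag_0p05 = {3}".format(
        feature, t_stat, p_val, flag_0p05))
# ------------------------------------------------------------------------------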
def main(args):
    """ Main Script """
    # Checking levels.
    if args.levels and args.group:
        levels = [args.group] + args.levels
    elif args.group and not args.levels:
        levels = [args.group]
    else:
        levels = []
    logger.info(u"Groups used to color by: {0}".format(",".join(levels)))

    # Parsing data with the interface.
    dat = wideToDesign(args.input, args.design, args.uniqID, group=args.group,
                       anno=args.levels, logger=logger, runOrder=args.order)

    # Removing groups with just one sample, then cleaning missing data.
    dat.removeSingle()
    dat.dropMissing()

    # Select colors for the data (dataPalette.design contains a copy of
    # dat.design with an additional column for colors).
    dataPalette.getColors(design=dat.design, groups=levels)

    # Getting the list of indexes used to subset the wide file.
    if args.group:
        disGroups = [(group.index, level) for level, group in
                     dataPalette.design.groupby(dataPalette.combName)]
    else:
        disGroups = [(dat.design.index, "samples")]

    # Iterating over subgroups.
    pairwise_disCuts = list()
    toMean_disCuts = list()
    for indexes, name in disGroups:
        # If there are fewer than 3 elements in the group, skip to the next one.
        if len(indexes) < 3:
            logger.error("Group {0} has fewer than 3 elements; it will not be"
                         " included in the analysis".format(name))
            continue

        # Subsetting wide.
        currentFrame = pd.DataFrame(dat.wide[indexes].copy())
        currentFrame.name = name

        # Calculate the penalized sigma.
        penalizedSigma = calculatePenalizedSigma(data=currentFrame, penalty=args.penalty)

        # Calculate distances ("dis" stands for distance).
        disToMean, disPairwise = calculateDistances(data=currentFrame, V_VI=penalizedSigma)

        # Calculate cutoffs.
        cutoff1, cutoff2 = calculateCutoffs(currentFrame, args.p)

        # Appending results.
        pairwise_disCuts.append([disPairwise, cutoff2])
        toMean_disCuts.append([disToMean, cutoff1])

    if args.group:
        # Splitting results into to-mean and pairwise.
        pairwise_dis = [distance for distance, cutoff in pairwise_disCuts]
        toMean_dis = [distance for distance, cutoff in toMean_disCuts]

        # Merging to get the distances for all pairwise comparisons.
        pairwise_dis_all = pd.DataFrame(columns=["group"])
        for dis in pairwise_dis:
            dis.loc[:, "group"] = [dis.name] * len(dis.columns)
            pairwise_dis_all = pd.DataFrame.merge(pairwise_dis_all, dis,
                                                  on=["group"], left_index=True,
                                                  right_index=True, how='outer', sort=False)
        pairwise_dis_all.sort_values(by="group", inplace=True)
        pairwise_dis_all.drop("group", axis=1, inplace=True)
        pairwise_dis_all.name = "samples"

        # Merging to get the distances of all samples to the mean.
        toMean_dis_all = pd.DataFrame(columns=["group", "distance_to_mean"])
        for dis in toMean_dis:
            dis.loc[:, "group"] = [dis.name] * len(dis.columns)
            toMean_dis_all = pd.DataFrame.merge(toMean_dis_all, dis,
                                                on=['distance_to_mean', 'group'],
                                                left_index=True, right_index=True,
                                                how='outer', sort=False)
        toMean_dis_all.sort_values(by="group", inplace=True)
        toMean_dis_all.drop("group", axis=1, inplace=True)
        toMean_dis_all.name = "samples"

        # Getting cutoffs for the distances.
        cutoff1, cutoff2 = calculateCutoffs(dat.wide, args.p)

        # Appending toMean_dis_all and pairwise_dis_all to toMean_disCuts and
        # pairwise_disCuts respectively.
        toMean_disCuts.append([toMean_dis_all, cutoff1])
        pairwise_disCuts.append([pairwise_dis_all, cutoff2])

    # Iterating over each pair of (distance, cutoff) for toMean and pairwise to plot distances.
    with PdfPages(args.figure) as pdf:
        # Iterating over the toMean and pairwise distances in parallel.
        for toMean, pairwise in zip(toMean_disCuts, pairwise_disCuts):
            # Making plots.
            plotDistances(df_distance=toMean[0], palette=dataPalette, p=args.p,
                          plotType="Scatterplot", disType="Mahalanobis",
                          cutoff=toMean[1], pdf=pdf)
            plotDistances(df_distance=pairwise[0], palette=dataPalette, p=args.p,
                          plotType="Scatterplot", disType="Mahalanobis",
                          cutoff=pairwise[1], pdf=pdf)
            plotDistances(df_distance=pairwise[0], palette=dataPalette, p=args.p,
                          plotType="Box-plots", disType="Mahalanobis",
                          cutoff=pairwise[1], pdf=pdf)

    # Since these are lists of [distance, cutoff] pairs and we are only interested in
    # the last pair, we use [-1] to access it and [0] to pull the data frame out of it.
    # Outputting distances to mean and pairwise.
    toMean_disCuts[-1][0].to_csv(args.toMean, index_label="sampleID", sep='\t')
    pairwise_disCuts[-1][0].to_csv(args.pairwise, index_label="sampleID", sep='\t')

    # Ending the script.
    logger.info("Script complete.")
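# --- Illustrative sketch (not part of the tool above) -------------------------
# A rough illustration of Mahalanobis distance to the group mean with a
# penalized covariance estimate. calculatePenalizedSigma is not shown in this
# document, so the ridge-style shrinkage below (a `penalty` added to the
# diagonal) is an assumption about its intent, not the tool's actual formula.
import numpy as np
import pandas as pd

rng = np.random.default_rng(0)
samples = pd.DataFrame(rng.normal(size=(10, 4)),
                       columns=["f1", "f2", "f3", "f4"])

penalty = 0.5  # hypothetical shrinkage strength
center = samples.mean(axis=0).values
cov = np.cov(samples.values, rowvar=False)
cov_penalized = cov + penalty * np.eye(cov.shape[0])  # keeps the matrix invertible
cov_inv = np.linalg.inv(cov_penalized)

# Distance of every sample to the group mean: sqrt(d^T * Sigma^-1 * d) per row.
diffs = samples.values - center
dist_to_mean = np.sqrt(np.einsum("ij,jk,ik->i", diffs, cov_inv, diffs))
print(pd.Series(dist_to_mean, index=samples.index, name="distance_to_mean"))
# ------------------------------------------------------------------------------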
def main(args):
    # Import data.
    dat = wideToDesign(args.input, args.design, args.uniqID, logger=logger)

    # Cleaning missing data.
    dat.dropMissing()

    # Generate formula.
    preFormula, categorical, numerical, levels, dat.design = preProcessing(
        design=dat.design, factorTypes=args.ftypes, factorNames=args.factors)

    # Transpose data.
    dat.trans = dat.transpose()

    # If interactions are requested:
    if args.interactions:
        logger.info("Running ANOVA on interactions")
        dat.trans["_treatment_"] = dat.trans.apply(lambda x:
            "_".join(map(str, x[categorical].values)), axis=1)
        dat.design["_treatment_"] = dat.design.apply(lambda x:
            "_".join(map(str, x[categorical].values)), axis=1)

        # If there are numerical factors, add them to the formula.
        if len(numerical) > 0:
            formula = ["C(_treatment_)"] + numerical
        else:
            formula = ["C(_treatment_)"]

        # Concatenating the formula.
        formula = "+".join(formula)

        # Getting a new formula for interactions.
        dictFormula = {feature: "{0}~{1}".format(str(feature), formula)
                       for feature in dat.wide.index.tolist()}

        # Creating the sorted list of treatment levels.
        levels = sorted(list(set(dat.trans["_treatment_"].tolist())))

        # Creating level combinations.
        reverseLevels = copy.copy(levels)
        reverseLevels.reverse()
        lvlComb = list()
        generateDinamicCmbs([levels], lvlComb)

        # Running ANOVA.
        logger.info('Running anova models')
        results, significant, residDat, fitDat = runANOVA(dat=dat,
            categorical=["_treatment_"], levels=[levels], lvlComb=lvlComb,
            formula=dictFormula, numerical=numerical)
    else:
        logger.info("Running ANOVA without interactions")

        # Create the combination of groups.
        nLevels = [list(itertools.chain.from_iterable(levels))]
        reverseLevels = copy.copy(nLevels)
        reverseLevels.reverse()
        lvlComb = list()
        generateDinamicCmbs(reverseLevels, lvlComb)

        # Map every metabolite to its formula.
        dictFormula = {feature: "{0}~{1}".format(str(feature), preFormula)
                       for feature in dat.wide.index.values}

        # Running ANOVA.
        logger.info('Running anova models')
        results, significant, residDat, fitDat = runANOVA(dat=dat,
            categorical=categorical, levels=levels, lvlComb=lvlComb,
            formula=dictFormula, numerical=numerical)

    # Q-Q plots.
    logger.info('Generating q-q plots.')
    qqPlot(residDat.T, fitDat.T, args.ofig)

    # Generate volcano plots.
    logger.info('Generating volcano plots.')
    volcano(lvlComb, results, args.ofig2)

    # Round the results to 4 digits and save.
    results = results.round(4)
    results.index.name = dat.uniqID
    results.to_csv(args.oname, sep="\t")

    # Flags.
    significant.index.name = dat.uniqID
    significant.to_csv(args.flags, sep="\t")
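# --- Illustrative sketch (not part of the tool above) -------------------------
# The per-feature formulas built above take the form "feature ~ C(group)".
# A minimal statsmodels version of one such fit for a single toy feature;
# runANOVA in the tool wraps this kind of model for every feature and adds
# contrasts, so this shows only the core idea, not the tool's implementation.
import pandas as pd
import statsmodels.api as sm
from statsmodels.formula.api import ols

toy = pd.DataFrame({
    "group": ["A"] * 4 + ["B"] * 4 + ["C"] * 4,
    "feature": [1.0, 1.2, 0.9, 1.1, 2.0, 2.1, 1.9, 2.2, 1.0, 1.1, 0.9, 1.2],
})

model = ols("feature ~ C(group)", data=toy).fit()
anova_table = sm.stats.anova_lm(model, typ=2)
print(anova_table)          # F statistic and p-value for the group effect
print(model.resid.head())   # residuals, the kind of values fed to the q-q plots
# ------------------------------------------------------------------------------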
def main(args):
    # Import data.
    dat = wideToDesign(args.input, args.design, args.uniqID, args.group, logger=logger)

    # Get the list of samples to process. If processOnly is specified, only
    # analyze the specified groups.
    if args.processOnly:
        dat.design = dat.design[dat.design[args.group].isin(args.processOnly)]
        toProcess = dat.design.index
        dat.sampleIDs = toProcess.tolist()

    # Keep only the sampleIDs that are to be analyzed.
    dat.keep_sample(dat.sampleIDs)

    # Get the list of pairwise combinations. If a group is specified, only do
    # within-group combinations.
    combos = list()
    if args.group:
        # If a group is given, only generate within-group pairwise combinations.
        logger.info('Only doing within group, pairwise comparisons.')
        for groupName, dfGroup in dat.design.groupby(dat.group):
            combos.extend(list(combinations(dfGroup.index, 2)))
    else:
        logger.info('Doing all pairwise comparisons. This could take a while!')
        # Get all pairwise combinations for all samples.
        combos.extend(list(combinations(dat.sampleIDs, 2)))

    # Open a multiple-page PDF for plots.
    ppBA = PdfPages(args.baName)

    # Loop over the combinations, generate plots, and collect the list of flags.
    logger.info('Generating flags and plots.')
    flags = list(map(lambda combo: iterateCombo(dat, combo, ppBA), combos))

    # Close the PDF with the plots.
    ppBA.close()

    # Merge flags.
    logger.info('Merging outlier flags.')
    merged = Flags.merge(flags)

    # Summarize flags.
    logger.info('Summarizing outlier flags.')
    propSample, propFeature, propSample_p, propFeature_p, propSample_c, \
        propFeature_c, propSample_d, propFeature_d = summarizeFlags(dat, merged, combos)
    plotFlagDist(propSample, propFeature, args.distName)

    # Create sample-level flags.
    flag_sample = Flags(index=dat.sampleIDs)
    flag_sample.addColumn(column='flag_sample_BA_outlier',
                          mask=(propSample >= args.sampleCutoff))
    flag_sample.addColumn(column='flag_sample_BA_pearson',
                          mask=(propSample_p >= args.sampleCutoff))
    flag_sample.addColumn(column='flag_sample_BA_cooks',
                          mask=(propSample_c >= args.sampleCutoff))
    flag_sample.addColumn(column='flag_sample_BA_dffits',
                          mask=(propSample_d >= args.sampleCutoff))
    flag_sample.df_flags.index.name = "sampleID"
    flag_sample.df_flags.to_csv(args.flagSample, sep='\t')

    # Create metabolite-level flags.
    flag_metabolite = Flags(dat.wide.index)
    flag_metabolite.addColumn(column='flag_feature_BA_outlier',
                              mask=(propFeature >= args.featureCutoff))
    flag_metabolite.addColumn(column='flag_feature_BA_pearson',
                              mask=(propFeature_p >= args.featureCutoff))
    flag_metabolite.addColumn(column='flag_feature_BA_cooks',
                              mask=(propFeature_c >= args.featureCutoff))
    flag_metabolite.addColumn(column='flag_feature_BA_dffits',
                              mask=(propFeature_d >= args.featureCutoff))
    flag_metabolite.df_flags.to_csv(args.flagFeature, sep='\t')

    # Finish the script.
    logger.info("Script Complete!")
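# --- Illustrative sketch (not part of the tool above) -------------------------
# The flags above come from per-pair Bland-Altman comparisons. A bare-bones
# version for one pair of samples: per-feature means and differences plus the
# textbook 1.96*SD limits of agreement. The outlier rule here is the standard
# one and is only assumed to approximate what iterateCombo flags internally.
import numpy as np

sample_x = np.array([10.0, 12.1, 9.8, 11.5, 10.7, 30.0])
sample_y = np.array([10.2, 12.0, 10.1, 11.3, 10.9, 12.0])

mean = (sample_x + sample_y) / 2.0   # x-axis of a Bland-Altman plot
diff = sample_x - sample_y           # y-axis of a Bland-Altman plot
bias = diff.mean()
loa = 1.96 * diff.std(ddof=1)        # limits of agreement around the bias

outlier_mask = np.abs(diff - bias) > loa
print("bias = {0:.3f}, limits = +/-{1:.3f}".format(bias, loa))
print("flagged feature positions:", np.where(outlier_mask)[0])
# ------------------------------------------------------------------------------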
def main(args):
    # If the user provides a grouping variable, we test each group against the
    # null (mu supplied by the user; 0 is the default).
    if args.group != False:
        logger.info(u"t-test will be performed for all groups saved in [{0}] variable in the design file pairwise with the H_0: mu = {1}.".format(args.group, args.mu))

        # Loading data through the Interface.
        logger.info("Loading data with the Interface")
        dat = wideToDesign(args.input, args.design, args.uniqueID,
                           group=args.group, logger=logger)

        # Treat everything as numeric.
        dat.wide = dat.wide.applymap(float)

        # Cleaning missing data.
        dat.dropMissing()

        # Getting the unique group values so that we can feed them to the t-tests.
        group_values_series = dat.transpose()[dat.group].T.squeeze()
        group_values_series_unique = group_values_series.unique()
        number_of_unique_groups = group_values_series_unique.shape[0]

        # Extracting data from the interface.
        data_frame = dat.transpose()
        # Extracting the number of features. We subtract 1 since args.group occupies one column.
        number_of_features = data_frame.shape[1] - 1

        # Computing overall summaries (mean and variance).
        # This part only produces summary statistics for the output table;
        # it has nothing to do with the single-sample t-test itself.
        mean_value_all = [0] * number_of_features
        variance_value_all = [0] * number_of_features

        for j in range(0, number_of_features):
            # Creating a duplicate for manipulation.
            data_frame_manipulate = data_frame
            # Dropping the column that characterizes group; only feature columns remain.
            # We also transpose here so the frame is easier to operate on.
            data_frame_manipulate_transpose = data_frame_manipulate.drop(args.group, axis=1).transpose()
            # Pulling the indexes list from the current data frame.
            indexes_list_complete = data_frame_manipulate_transpose.index.tolist()

            # Computing dataset summaries for feature j.
            mean_value_all[j] = np.mean(data_frame_manipulate_transpose.loc[indexes_list_complete[j]])
            variance_value_all[j] = np.var(data_frame_manipulate_transpose.loc[indexes_list_complete[j]], ddof=1)

        # Creating the table and putting the results there.
        summary_df = pd.DataFrame(data=mean_value_all, columns=["GrandMean"],
                                  index=indexes_list_complete)
        summary_df['SampleVariance'] = variance_value_all

        # Running the single-sample t-test for all groups.
        # We also compute the means for each group and output them.
        for i in range(0, number_of_unique_groups):
            # Extracting the piece of the data frame that belongs to the ith group.
            data_frame_current_group = data_frame.loc[data_frame[args.group].isin([group_values_series_unique[i]])]
            # Dropping the column that characterizes group; only feature columns remain.
            # We also transpose here so the frame is easier to operate on.
            data_frame_current_group = data_frame_current_group.drop(args.group, axis=1).transpose()
            # Pulling the indexes list from the current group.
            indexes_list = data_frame_current_group.index.tolist()

            # Creating the array of means for the current group that will be filled.
            # Creating the p-value, difference, neg_log10_p_value, t-value, and flag lists filled with zeros.
            means_value = [0] * number_of_features
            difference_value = [0] * number_of_features
            p_value = [0] * number_of_features
            t_value = [0] * number_of_features
            neg_log10_p_value = [0] * number_of_features
            flag_value_0p01 = [0] * number_of_features
            flag_value_0p05 = [0] * number_of_features
            flag_value_0p10 = [0] * number_of_features

            for j in range(0, number_of_features):
                series_current = data_frame_current_group.loc[indexes_list[j]]
                means_value[j] = series_current.mean()

                # Performing the one-sample t-test.
                t_value[j], p_value[j] = ttest_1samp(series_current, float(args.mu))
                neg_log10_p_value[j] = -np.log10(p_value[j])
                difference_value[j] = means_value[j] - float(args.mu)
                if p_value[j] < 0.01:
                    flag_value_0p01[j] = 1
                if p_value[j] < 0.05:
                    flag_value_0p05[j] = 1
                if p_value[j] < 0.10:
                    flag_value_0p10[j] = 1

            # Creating names for the current analysis columns and adding the result columns to the data frame.
            means_value_column_name_current = 'mean_treatment_' + group_values_series_unique[i]
            p_value_column_name_current = 'prob_greater_than_t_for_diff_' + group_values_series_unique[i] + '_' + args.mu
            t_value_column_name_current = 't_value_for_diff_' + group_values_series_unique[i] + '_' + args.mu
            neg_log10_p_value_column_name_current = 'neg_log10_p_value_' + group_values_series_unique[i] + '_' + args.mu
            difference_value_column_name_current = 'diff_of_' + group_values_series_unique[i] + '_' + args.mu
            flag_value_column_name_current_0p01 = 'flag_significant_0p01_on_' + group_values_series_unique[i] + '_' + args.mu
            flag_value_column_name_current_0p05 = 'flag_significant_0p05_on_' + group_values_series_unique[i] + '_' + args.mu
            flag_value_column_name_current_0p10 = 'flag_significant_0p10_on_' + group_values_series_unique[i] + '_' + args.mu

            # Adding the flag_value column to the data frame and assigning the name.
            # If the flag data frame has not been created yet (i.e. i == 0), create it on the fly.
            if i == 0:
                flag_df = pd.DataFrame(data=flag_value_0p01,
                                       columns=[flag_value_column_name_current_0p01],
                                       index=indexes_list)
            else:
                flag_df[flag_value_column_name_current_0p01] = flag_value_0p01

            # At this point both data frames (summary and flags) exist, so columns are simply added.
            summary_df[means_value_column_name_current] = means_value
            summary_df[p_value_column_name_current] = p_value
            summary_df[t_value_column_name_current] = t_value
            summary_df[neg_log10_p_value_column_name_current] = neg_log10_p_value
            summary_df[difference_value_column_name_current] = difference_value
            flag_df[flag_value_column_name_current_0p05] = flag_value_0p05
            flag_df[flag_value_column_name_current_0p10] = flag_value_0p10

    # If the user does not provide a grouping variable, we test the whole dataset
    # as a single group against the null (mu supplied by the user; 0 is the default).
    if args.group == False:
        logger.info(u"t-test will be performed for the entire dataset since a grouping variable was not provided.")

        # Loading data through the interface.
        logger.info("Loading data with the Interface")
        dat = wideToDesign(args.input, args.design, args.uniqueID, logger=logger)

        # Treat everything as numeric.
        dat.wide = dat.wide.applymap(float)

        # Cleaning missing data.
        dat.dropMissing()

        # Saving the number of unique groups that will be used for plotting.
        # Since no grouping variable was supplied, it is exactly one.
        number_of_unique_groups = 1

        # Extracting data from the interface.
        data_frame = dat.wide.transpose()
        # Extracting the number of features.
        # We do not subtract 1 since args.group was not provided.
        number_of_features = data_frame.shape[1]

        # Computing overall summaries (mean and variance).
        # This part only produces summary statistics for the output table;
        # it has nothing to do with the single-sample t-test itself.
        mean_value_all = [0] * number_of_features
        variance_value_all = [0] * number_of_features

        # Creating the p_value, t_value, neg_log10_p_value, difference_value, and flag lists filled with zeros.
        p_value = [0] * number_of_features
        t_value = [0] * number_of_features
        neg_log10_p_value = [0] * number_of_features
        difference_value = [0] * number_of_features
        flag_value_0p01 = [0] * number_of_features
        flag_value_0p05 = [0] * number_of_features
        flag_value_0p10 = [0] * number_of_features

        # Transposing once, outside the loop, so the data is easier to operate on.
        data_frame_manipulate_transpose = data_frame.transpose()
        # Pulling the indexes list from the current data frame.
        indexes_list_complete = data_frame_manipulate_transpose.index.tolist()

        for j in range(0, number_of_features):
            # Computing dataset summaries.
            mean_value_all[j] = np.mean(data_frame_manipulate_transpose.loc[indexes_list_complete[j]])
            variance_value_all[j] = np.var(data_frame_manipulate_transpose.loc[indexes_list_complete[j]], ddof=1)

            # Performing the one-sample t-test for the entire dataset.
            t_value[j], p_value[j] = ttest_1samp(
                data_frame_manipulate_transpose.loc[indexes_list_complete[j]], float(args.mu))
            neg_log10_p_value[j] = -np.log10(p_value[j])
            difference_value[j] = mean_value_all[j] - float(args.mu)
            if p_value[j] < 0.01:
                flag_value_0p01[j] = 1
            if p_value[j] < 0.05:
                flag_value_0p05[j] = 1
            if p_value[j] < 0.10:
                flag_value_0p10[j] = 1

        # Creating the table and putting the results there.
        summary_df = pd.DataFrame(data=mean_value_all, columns=["GrandMean"],
                                  index=indexes_list_complete)
        summary_df['SampleVariance'] = variance_value_all

        # Creating names for the current analysis columns and adding the result columns to the data frame.
        means_value_column_name_current = 'mean_treatment_all'
        p_value_column_name_current = 'prob_greater_than_t_for_diff_all_' + args.mu
        t_value_column_name_current = 't_value_for_diff_all_' + args.mu
        neg_log10_p_value_column_name_current = 'neg_log10_p_value_all_' + args.mu
        difference_value_column_name_current = 'diff_of_all_' + args.mu
        flag_value_column_name_current_0p01 = 'flag_significant_0p01_on_all_' + args.mu
        flag_value_column_name_current_0p05 = 'flag_significant_0p05_on_all_' + args.mu
        flag_value_column_name_current_0p10 = 'flag_significant_0p10_on_all_' + args.mu

        summary_df[means_value_column_name_current] = mean_value_all
        summary_df[p_value_column_name_current] = p_value
        summary_df[t_value_column_name_current] = t_value
        summary_df[neg_log10_p_value_column_name_current] = neg_log10_p_value
        summary_df[difference_value_column_name_current] = difference_value

        flag_df = pd.DataFrame(data=flag_value_0p01,
                               columns=[flag_value_column_name_current_0p01],
                               index=indexes_list_complete)
        flag_df[flag_value_column_name_current_0p05] = flag_value_0p05
        flag_df[flag_value_column_name_current_0p10] = flag_value_0p10

    # Rounding the results to 4 decimal places.
    summary_df = summary_df.apply(lambda x: x.round(4))

    # Restoring the name of the unique ID column that was there originally.
    summary_df.index.name = args.uniqueID
    flag_df.index.name = args.uniqueID

    # Saving summary_df to the output.
    summary_df.to_csv(args.summaries, sep="\t")
    # Saving flag_df to the output.
    flag_df.to_csv(args.flags, sep="\t")

    # Generating indexing for the volcano plots.
    # Getting data for lpvals.
    lpvals = {col.split("_value_")[-1]: summary_df[col]
              for col in summary_df.columns.tolist()
              if col.startswith("neg_log10_p_value")}
    # Getting data for diffs.
    difs = {col.split("_of_")[-1]: summary_df[col]
            for col in summary_df.columns.tolist()
            if col.startswith("diff_of_")}

    # The cutoff value for significance.
    cutoff = 2

    # Making the volcano plots.
    with PdfPages(args.volcano) as pdf:
        for i in range(0, number_of_unique_groups):
            # Set up the figure.
            volcanoPlot = figureHandler(proj="2d")

            # If no grouping variable was provided, there is a single key.
            if number_of_unique_groups == 1:
                current_key = 'all_' + args.mu
            else:
                current_key = group_values_series_unique[i] + '_' + args.mu

            # Plot all results.
            scatter.scatter2D(x=list(difs[current_key]), y=list(lpvals[current_key]),
                              colorList=list('b'), ax=volcanoPlot.ax[0])

            # Color results beyond the threshold red.
            cutLpvals = lpvals[current_key][lpvals[current_key] > cutoff]
            if not cutLpvals.empty:
                cutDiff = difs[current_key][cutLpvals.index]
                scatter.scatter2D(x=list(cutDiff), y=list(cutLpvals),
                                  colorList=list('r'), ax=volcanoPlot.ax[0])

            # Drawing the cutoff line.
            lines.drawCutoffHoriz(y=cutoff, ax=volcanoPlot.ax[0])

            # Format the axis (volcanoPlot).
            volcanoPlot.formatAxis(axTitle=current_key, grid=False,
                                   yTitle="-log10(p-value) for Diff of treatment means for {0}".format(current_key),
                                   xTitle="Difference of the means from H0 for {0}".format(current_key))

            # Add the figure to the PDF.
            volcanoPlot.addToPdf(pdfPages=pdf)

    logger.info(u"Volcano plots have been created.")
    logger.info(u"Finishing running of t-test.")
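# --- Illustrative sketch (not part of the tool above) -------------------------
# Core of the per-feature computation above: scipy's one-sample t-test of each
# feature against H0: mu = args.mu, with flags at the three alpha levels. The
# toy wide-format frame (features in rows, samples in columns) mirrors the
# layout the tool works with; the numbers are made up.
import numpy as np
import pandas as pd
from scipy.stats import ttest_1samp

mu = 0.0
wide = pd.DataFrame(
    [[0.1, -0.2, 0.3, 0.1], [2.1, 1.8, 2.2, 2.0]],
    index=["feature_1", "feature_2"],
    columns=["s1", "s2", "s3", "s4"],
)

rows = []
for feature, values in wide.iterrows():
    t_stat, p_val = ttest_1samp(values.values, mu)
    rows.append({
        "t_value": round(t_stat, 4),
        "p_value": round(p_val, 4),
        "neg_log10_p_value": round(-np.log10(p_val), 4),
        "diff_from_mu": round(values.mean() - mu, 4),
        "flag_0p01": int(p_val < 0.01),
        "flag_0p05": int(p_val < 0.05),
        "flag_0p10": int(p_val < 0.10),
    })
print(pd.DataFrame(rows, index=wide.index))
# ------------------------------------------------------------------------------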
def main(args):
    # Loading the target (test) and training datasets.
    target = wideToDesign(wide=args.test_wide, design=args.test_design,
                          uniqID=args.uniqID, group=args.group, logger=logger)
    train = wideToDesign(wide=args.train_wide, design=args.train_design,
                         uniqID=args.uniqID, group=args.group, logger=logger)

    # Treat everything as numeric.
    train.wide = train.wide.applymap(float)
    target.wide = target.wide.applymap(float)

    # Dropping missing data and transposing.
    train.dropMissing()
    train = train.transpose()
    target.dropMissing()
    target = target.transpose()

    # Dropping target features that are absent from the training set.
    for i in target.columns:
        if i not in train.columns:
            del target[i]

    cv_status = args.cross_validation
    kernel_final = args.kernel
    gamma_final = float(args.a)
    coef0_final = float(args.b)
    degree_final = int(args.degree)

    # Defining the data used for model training.
    train_classes_to_feed = train[args.group].copy()
    train_data_to_feed = train
    del train_data_to_feed[args.group]

    # Defining the data used as the model target.
    target_classes_to_feed = target[args.group].copy()
    target_data_to_feed = target
    del target_data_to_feed[args.group]

    if cv_status == "none":
        logger.info(u"Using the value of C specified by the user.")
        C_final = float(args.C)

    if cv_status == "single":
        logger.info(u"Using the value of C determined via a single cross-validation.")
        if len(train_classes_to_feed) < 100:
            logger.info(u"The required number of samples for a single cross-validation procedure is at least 100. The dataset has {0}.".format(len(train_classes_to_feed)))
            logger.info(u"Exiting the tool.")
            exit(1)

        C_lower = float(args.C_lower_bound)
        C_upper = float(args.C_upper_bound)
        C_list_of_values = np.linspace(C_lower, C_upper, 20)

        # Grid with the user-specified gamma.
        gamma_param_dict = {"kernel": [kernel_final], "C": C_list_of_values,
                            "gamma": [gamma_final], "coef0": [coef0_final],
                            "degree": [degree_final]}
        # Fallback grid where gamma is determined automatically.
        auto_gamma_param_dict = {"kernel": [kernel_final], "C": C_list_of_values,
                                 "gamma": ["auto"], "coef0": [coef0_final],
                                 "degree": [degree_final]}

        # An invalid gamma raises ValueError at fit time, not at construction,
        # so the fit has to live inside the try block for the fallback to trigger.
        try:
            logger.info("Running SVM model")
            internal_cv = GridSearchCV(estimator=SVC(), param_grid=gamma_param_dict)
            internal_cv.fit(train_data_to_feed, train_classes_to_feed)
        except ValueError:
            logger.info("Model failed with gamma = {0}; trying automatic gamma instead.".format(float(args.a)))
            internal_cv = GridSearchCV(estimator=SVC(), param_grid=auto_gamma_param_dict)
            internal_cv.fit(train_data_to_feed, train_classes_to_feed)
        C_final = internal_cv.best_params_['C']

    if cv_status == "double":
        logger.info(u"Using the value of C determined via a double cross-validation.")
        if len(train_classes_to_feed) < 100:
            logger.info(u"The required number of samples for a double cross-validation procedure is at least 100. The dataset has {0}.".format(len(train_classes_to_feed)))
            logger.info(u"Exiting the tool.")
            exit()

        C_lower = float(args.C_lower_bound)
        C_upper = float(args.C_upper_bound)
        C_list_of_values = np.linspace(C_lower, C_upper, 20)
        C_final = C_list_of_values[0]

        for index_current in range(0, 20):
            C_list_of_values_current = np.linspace(C_list_of_values[0],
                                                   C_list_of_values[index_current],
                                                   (index_current + 1))

            # Creating the dictionary for the single cross-validation procedure.
            # In this dictionary gamma is specified by the user.
            gamma_param_dict = {"kernel": [kernel_final],
                                "C": C_list_of_values_current,
                                "gamma": [gamma_final], "coef0": [coef0_final],
                                "degree": [degree_final]}
            # gamma is determined automatically if the first dictionary fails.
            auto_gamma_param_dict = {"kernel": [kernel_final],
                                     "C": C_list_of_values_current,
                                     "gamma": ["auto"], "coef0": [coef0_final],
                                     "degree": [degree_final]}

            # As above, the fit lives inside the try block so that the
            # automatic-gamma fallback is actually reachable.
            try:
                logger.info("Running SVM model")
                internal_cv = GridSearchCV(estimator=SVC(), param_grid=gamma_param_dict)
                internal_cv.fit(train_data_to_feed, train_classes_to_feed)
            except ValueError:
                logger.info("Model failed with gamma = {0}; trying automatic gamma instead.".format(float(args.a)))
                internal_cv = GridSearchCV(estimator=SVC(), param_grid=auto_gamma_param_dict)
                internal_cv.fit(train_data_to_feed, train_classes_to_feed)

            external_cv = cross_val_score(internal_cv, train_data_to_feed, train_classes_to_feed)

            if index_current == 0:
                best_prediction_proportion = external_cv.mean()
            else:
                if external_cv.mean() > best_prediction_proportion:
                    best_prediction_proportion = external_cv.mean()
                    C_final = C_list_of_values[index_current]

    C_final = float(C_final)
    print("The value of C used for the SVM classifier is {}".format(C_final))

    # Fitting the final SVM model, falling back to automatic gamma if the fit fails.
    try:
        logger.info("Running SVM model")
        svm_model = svm.SVC(kernel=args.kernel, C=C_final, gamma=float(args.a),
                            coef0=float(args.b), degree=int(args.degree))
        svm_model.fit(train_data_to_feed, train_classes_to_feed)
    except ValueError:
        logger.info("Model failed with gamma = {0}; trying automatic gamma instead.".format(float(args.a)))
        svm_model = svm.SVC(kernel=args.kernel, C=C_final, gamma="auto",
                            coef0=float(args.b), degree=int(args.degree))
        svm_model.fit(train_data_to_feed, train_classes_to_feed)

    # Classification of the training set.
    train_fitted_values = svm_model.predict(train_data_to_feed)
    train_fitted_values_series = pd.Series(train_fitted_values, index=train_classes_to_feed.index)
    train_classes_to_feed_series = pd.Series(train_classes_to_feed, index=train_classes_to_feed.index)
    classification_df = pd.DataFrame({'Group_Observed': train_classes_to_feed_series,
                                      'Group_Predicted': train_fitted_values_series})
    classification_df.to_csv(args.outClassification, index_label='sampleID', sep='\t')

    # Proportion of observed groups matching the predicted ones, i.e. the accuracy.
    classification_accuracy_percent = 100.0 * sum(
        classification_df['Group_Observed'] ==
        classification_df['Group_Predicted']) / classification_df.shape[0]
    classification_accuracy_percent_string = str(classification_accuracy_percent) + ' Percent'
    with open(args.outClassificationAccuracy, "w") as accuracy_file:
        accuracy_file.write(classification_accuracy_percent_string)

    # Prediction of the target set.
    target_fitted_values = svm_model.predict(target_data_to_feed)
    target_fitted_values_series = pd.Series(target_fitted_values, index=target_classes_to_feed.index)
    target_classes_to_feed_series = pd.Series(target_classes_to_feed, index=target_classes_to_feed.index)
    prediction_df = pd.DataFrame({'Group_Observed': target_classes_to_feed_series,
                                  'Group_Predicted': target_fitted_values_series})
    prediction_df.to_csv(args.outPrediction, index_label='sampleID', sep='\t')

    prediction_accuracy_percent = 100.0 * sum(
        prediction_df['Group_Observed'] ==
        prediction_df['Group_Predicted']) / prediction_df.shape[0]
    prediction_accuracy_percent_string = str(prediction_accuracy_percent) + ' Percent'
    with open(args.outPredictionAccuracy, "w") as accuracy_file:
        accuracy_file.write(prediction_accuracy_percent_string)

    logger.info("Script Complete!")
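# --- Illustrative sketch (not part of the tool above) -------------------------
# The "single" vs "double" cross-validation above boils down to: tune C with an
# inner GridSearchCV, and (for the double case) score that whole tuning
# procedure with an outer cross_val_score, i.e. nested cross-validation. A
# compact sklearn rendition on synthetic data; the grid below is an assumption,
# not the tool's exact search space.
import numpy as np
from sklearn.datasets import make_classification
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.svm import SVC

X, y = make_classification(n_samples=120, n_features=10, random_state=0)

param_grid = {"kernel": ["rbf"], "C": np.linspace(0.1, 10.0, 20), "gamma": ["auto"]}

# Single CV: the best C according to the inner grid search alone.
inner = GridSearchCV(estimator=SVC(), param_grid=param_grid)
inner.fit(X, y)
print("single-CV choice of C:", inner.best_params_["C"])

# Double (nested) CV: the same search, scored by an outer CV loop so that the
# reported accuracy is not biased by the tuning itself.
nested_scores = cross_val_score(GridSearchCV(SVC(), param_grid), X, y)
print("nested-CV accuracy: {0:.3f}".format(nested_scores.mean()))
# ------------------------------------------------------------------------------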
def main(args):
    # Importing data through the interface.
    logger.info("Loading data through the interface")
    dat = wideToDesign(args.input, args.design, args.uniqID, logger=logger)

    # Cleaning missing data.
    dat.dropMissing()

    # Transposing the data to normalize.
    toNormalize_df = dat.wide.T

    # Telling the user about the selected normalization method.
    logger.info("Normalizing data using {0} method.".format(args.method))

    # mean, median, and sum are applied per sample across features.
    if args.method == "mean" or args.method == "sum" or args.method == "median":
        if args.method == "mean":
            toNormalize_df[args.method] = toNormalize_df.mean(axis=1)
            logger.info("Mean scaling is used for each sample across features.")
        if args.method == "sum":
            toNormalize_df[args.method] = toNormalize_df.sum(axis=1)
            logger.info("Sum scaling is used for each sample across features.")
        if args.method == "median":
            toNormalize_df[args.method] = toNormalize_df.median(axis=1)
            logger.info("Median scaling is used for each sample across features.")

        # Dividing by the scaling factor.
        toNormalize_df = toNormalize_df.apply(lambda x: x / x[args.method], axis=1)

        # Dropping the extra column.
        toNormalize_df.drop(args.method, axis=1, inplace=True)

    # "centering", "auto", "range", "pareto", "level", and "vast" are performed per feature across samples.
    else:
        # Computing the mean for each feature.
        feature_value_means = toNormalize_df.mean(axis=0)

        if args.method == "centering":
            # Performing centering for each feature.
            # The value of each feature will have mean zero across samples.
            logger.info("Centering is used for each feature across samples.")
            toNormalize_df = toNormalize_df - feature_value_means

        if args.method == "auto":
            # Computing the standard deviation for each feature.
            feature_value_std = toNormalize_df.std(axis=0, ddof=1)
            # Performing autoscaling.
            # The value of each feature will have mean zero and std = 1 across samples.
            logger.info("Autoscaling is used for each feature across samples.")
            toNormalize_df = (toNormalize_df - feature_value_means) / feature_value_std

        if args.method == "pareto":
            # Computing the standard deviation and its square root for each feature.
            feature_value_std = toNormalize_df.std(axis=0, ddof=1)
            feature_value_std_sqrt = np.sqrt(feature_value_std)
            # Performing Pareto scaling. The only difference from autoscaling is that we
            # divide by sqrt(standard_deviation), so each feature will have mean zero but
            # its std will NOT be 1 across samples.
            logger.info("Pareto scaling is used for each feature across samples.")
            toNormalize_df = (toNormalize_df - feature_value_means) / feature_value_std_sqrt

        if args.method == "range":
            # Computing the min, max, and range for each feature.
            feature_value_min = toNormalize_df.min(axis=0)
            feature_value_max = toNormalize_df.max(axis=0)
            feature_value_max_min = feature_value_max - feature_value_min
            # Performing range scaling: each feature is centered and divided by its range.
            logger.info("Range scaling is used for each feature across samples.")
            toNormalize_df = (toNormalize_df - feature_value_means) / feature_value_max_min

        if args.method == "level":
            # Performing level scaling: each feature is centered and divided by its mean.
            logger.info("Level scaling is used for each feature across samples.")
            toNormalize_df = (toNormalize_df - feature_value_means) / feature_value_means

        if args.method == "vast":
            # Computing the standard deviation and the coefficient of variation for each feature.
            feature_value_std = toNormalize_df.std(axis=0, ddof=1)
            feature_value_cv = feature_value_std / feature_value_means
            # Performing variance stabilizing (VAST) scaling: each feature is autoscaled
            # and then divided by its coefficient of variation.
            logger.info("VAST scaling is used for each feature across samples.")
            toNormalize_df = (toNormalize_df - feature_value_means) / feature_value_std
            toNormalize_df = toNormalize_df / feature_value_cv

    # Transposing the normalized data back.
    normalized_df = toNormalize_df.T

    # Saving the data.
    normalized_df.to_csv(args.out, sep="\t")
    logger.info("Script Complete!")
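# --- Illustrative sketch (not part of the tool above) -------------------------
# The feature-wise scalers above, condensed into one-liners on a toy frame in
# the same samples-by-features orientation the code above normalizes in. Each
# expression mirrors the corresponding branch; the toy values are made up.
import numpy as np
import pandas as pd

df = pd.DataFrame({"f1": [1.0, 2.0, 3.0, 4.0], "f2": [10.0, 30.0, 20.0, 40.0]})
m = df.mean(axis=0)
s = df.std(axis=0, ddof=1)

centered = df - m                                    # mean 0 per feature
autoscaled = (df - m) / s                            # mean 0, std 1 per feature
pareto = (df - m) / np.sqrt(s)                       # softer than autoscaling
range_scaled = (df - m) / (df.max(axis=0) - df.min(axis=0))
level = (df - m) / m
vast = ((df - m) / s) / (s / m)                      # autoscale, then divide by CV

# Sanity check: autoscaled features have mean 0 and std 1.
print(autoscaled.mean().round(8), autoscaled.std(ddof=1).round(8), sep="\n")
# ------------------------------------------------------------------------------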