def plotDistributions(data, cutoff, palette, pdf):
    """
    Overlay the per-group CV density curves on a single figure.

    :Arguments:
        :param data: Table holding a ``cv`` column (used for the x limits)
                     and one ``cv_<group>`` column per group.

        :param cutoff: x position of the vertical cutoff line; also printed
                       in the line label.

        :param palette: Object exposing ``design``, ``combName`` and
                        ``ugColors`` (colors keyed by group name).

        :param pdf: PDF object the finished figure is added to.
    """
    # One figure, one axis
    fh = figureHandler(proj='2d', figsize=(14, 8))
    axis = fh.ax[0]

    # x limits are derived from the 99th percentile of the overall CV
    p99 = np.nanpercentile(data['cv'].values, 99)
    xmin = -p99 * 0.2
    xmax = p99 * 1.5

    # One density curve per group found in the design
    for name, _ in palette.design.groupby(palette.combName):
        dist.plotDensityDF(data=data["cv_" + name],
                           ax=axis,
                           colors=palette.ugColors[name],
                           lb="{0}".format(name))

    # Vertical cutoff line and legend
    lines.drawCutoffVert(ax=axis, x=cutoff,
                         lb="Cutoff at: {0}".format(cutoff))
    fh.makeLegendLabel(ax=axis)

    # Titles and limits
    figTitle = "Density Plot of Coefficients of Variation by {0}".format(
        palette.combName)
    fh.formatAxis(yTitle="Density", xlim=(xmin, xmax), figTitle=figTitle)

    # Shrink the figure, then store it
    fh.shrink()
    fh.addToPdf(pdfPages=pdf)
def plotDensity(data, name, pdf):
    """
    Plot a horizontal boxplot and a density plot of ``data`` on one page.

    :Arguments:
        :type data: pandas.DataFrame
        :param data: Data to plot; it is transposed before plotting.

        :param name: Group name, used in the figure title and as the key
                     into ``palette.ugColors``.

        :param pdf: PDF object the figure is added to.

    NOTE(review): ``palette`` is not a parameter; it is read from the
    enclosing module scope.
    """
    # Two stacked axes: (x, y, colspan, rowspan)
    layout = [(0, 0, 1, 3), (3, 0, 1, 1)]
    figure = figureHandler(proj='2d', numAx=2, numRow=4, numCol=1,
                           figsize=(8, 13), arrangement=layout)

    # Title goes on the first axis
    figTitle = "Distribution by Features {0}".format(name)
    figure.formatAxis(figTitle=figTitle, xlim="ignore", ylim="ignore",
                      axnum=0, showX=False)

    # The group color repeated once per row of data
    colors = [palette.ugColors[name]] * len(data.index)

    # Boxplot on the first axis
    box.boxDF(ax=figure.ax[0], colors=colors, dat=data.T, vert=False, rot=0)

    # Density of all values on the second axis
    density.plotDensityDF(data=data.T.unstack(), ax=figure.ax[1],
                          colors=colors[0])

    # Store the page
    figure.addToPdf(pdf)
def plotDensityDistribution(pdf, wide, palette):
    """
    Add a density plot of ``wide`` to ``pdf``.

    :Arguments:
        :param pdf: PDF object the figure is added to.

        :param wide: Data handed to ``density.plotDensityDF``.

        :param palette: Object exposing ``design["colors"]``, ``ugColors``
                        and ``combName``.
    """
    figure = figureHandler(proj="2d", figsize=(12, 7))
    axis = figure.ax[0]

    # Title only; axis limits are left alone
    figure.formatAxis(figTitle="Distribution by Samples Density",
                      xlim="ignore",
                      ylim="ignore",
                      grid=False)

    # Density curves colored from the design
    density.plotDensityDF(colors=palette.design["colors"], ax=axis, data=wide)

    # Legend for the groups
    figure.makeLegend(ax=axis, ucGroups=palette.ugColors,
                      group=palette.combName)

    # Shrink the figure, then store it
    figure.shrink()
    figure.addToPdf(pdf, dpi=600)
def plotScatterplot3D(data, palette, pdf):
    """
    Plot PC1, PC2 and PC3 of ``data`` as a 3D scatter plot.

    :Arguments:
        :type data: pandas.DataFrame
        :param data: Table with ``PC1``, ``PC2`` and ``PC3`` columns.

        :param palette: Object exposing ``design.colors``, ``ugColors`` and
                        ``combName``.

        :type pdf: pdf object
        :param pdf: PDF object to save the generated figure.
    """
    # Figure with a 3D projection
    fh = figureHandler(proj="3d", figsize=(14, 8))
    axis = fh.ax[0]

    # One point per row, colored from the design
    pointColors = palette.design.colors.tolist()
    scatter.scatter3D(ax=axis,
                      colorList=pointColors,
                      x=list(data["PC1"]),
                      y=list(data["PC2"]),
                      z=list(data["PC3"]))

    # Group legend
    fh.makeLegend(ax=axis, ucGroups=palette.ugColors, group=palette.combName)

    # Axis titles
    fh.format3D(xTitle="PC1", yTitle="PC2", zTitle="PC3")

    # Store the page
    fh.addToPdf(dpi=600, pdfPages=pdf)
def main(args):
    """
    Draw a 3D scatter plot of three columns of the wide file into a PDF.

    :Arguments:
        :param args: Parsed command line arguments. The attributes read here
                     are ``design``, ``input``, ``group``, ``uniqID``, ``x``,
                     ``y``, ``z``, ``rotation``, ``elevation`` and ``figure``.
    """
    # Loading design.
    # pd.read_csv(..., index_col=0, parse_dates=True) is the documented
    # replacement for DataFrame.from_csv, which was removed in pandas 1.0.
    if args.design:
        design = pd.read_csv(args.design, sep="\t", index_col=0,
                             parse_dates=True)
        design.reset_index(inplace=True)
    else:
        design = False

    # Loading wide file
    wide = pd.read_csv(args.input, sep="\t", index_col=0, parse_dates=True)

    # Open Figure handler
    fh = figureHandler(proj="3d", figsize=(14, 8))

    # Color by group only when a group is given and the uniqID is "sampleID".
    # NOTE(review): this path indexes ``design``; it fails if --group is
    # passed without a design file -- confirm the argument parser forbids it.
    colorByGroup = args.group and args.uniqID == "sampleID"
    if colorByGroup:
        uGroup = sorted(set(design[args.group]))
        colorList, ucGroups = palette.getColorsByGroup(design=design,
                                                       group=args.group,
                                                       uGroup=uGroup)
    else:
        colorList = palette.mpl_colors[0]
        ucGroups = dict()

    # Plot scatterplot 3D
    scatter.scatter3D(ax=fh.ax[0],
                      x=list(wide[args.x]),
                      y=list(wide[args.y]),
                      z=list(wide[args.z]),
                      colorList=colorList)

    # Despine axis (spine = tick)
    fh.despine(fh.ax[0])

    # Give format to the plot
    fh.format3D(title=args.x + " vs " + args.y + " vs " + args.z,
                xTitle=args.x,
                yTitle=args.y,
                zTitle=args.z,
                rotation=float(args.rotation),
                elevation=float(args.elevation))

    # If groups are provided create a legend and shrink the figure
    if colorByGroup:
        fh.makeLegend(ax=fh.ax[0], ucGroups=ucGroups, group=args.group)
        fh.shrink()

    # Saving figure to file
    with PdfPages(args.figure) as pdfOut:
        fh.addToPdf(dpi=600, pdfPages=pdfOut)

    logger.info("Script Complete!")
def plotScores(data, palette, pdf):
    """
    Draw one 2D scatter plot per pair of columns in ``data``.

    :Arguments:
        :type data: pandas.DataFrame
        :param data: Scores; every pairwise combination of its columns
                     gets its own page.

        :type palette: colorManager.object
        :param palette: Object exposing ``design.colors``, ``ugColors`` and
                        ``combName``.

        :type pdf: pdf object
        :param pdf: PDF object to save all the generated figures.

    Nothing is returned; the figures are added to ``pdf``.
    """
    columnNames = data.columns.tolist()
    for x, y in combinations(columnNames, 2):
        # One figure per pair of columns
        fh = figureHandler(proj="2d", figsize=(14, 8))
        axis = fh.ax[0]
        title = "{0} vs {1}".format(x, y)

        # Scatter of the two columns, colored from the design
        scatter.scatter2D(x=list(data[x]),
                          y=list(data[y]),
                          colorList=palette.design.colors.tolist(),
                          ax=axis)

        # Legend, then shrink the figure
        fh.makeLegend(ax=axis, ucGroups=palette.ugColors,
                      group=palette.combName)
        fh.shrink()

        # Despine and label the axis
        fh.despine(axis)
        fh.formatAxis(figTitle=title,
                      xTitle="Scores on {0}".format(x),
                      yTitle="Scores on {0}".format(y),
                      grid=False)

        # Store the page
        fh.addToPdf(dpi=600, pdfPages=pdf)
def plotScores(data, palette, pdf):
    """
    Add one 2D scatter plot to ``pdf`` for every pair of columns in ``data``
    (for three components: PC1 vs PC2, PC1 vs PC3, PC2 vs PC3).

    :Arguments:
        :type data: pandas.core.frame.DataFrame
        :param data: Data frame with the data to plot.

        :param palette: Object exposing ``design.colors``, ``ugColors`` and
                        ``combName``.

        :param pdf: PDF object the figures are added to.

    Nothing is returned.
    """
    pairs = itertools.combinations(data.columns.tolist(), 2)
    for x, y in pairs:
        # Figure and title for this pair
        fh = figureHandler(proj="2d", figsize=(14, 8))
        axis = fh.ax[0]
        title = "{0} vs {1}".format(x, y)

        # Scatterplot 2D colored from the design
        scatter.scatter2D(ax=axis,
                          x=list(data[x]),
                          y=list(data[y]),
                          colorList=palette.design.colors.tolist())

        # Despine before adding the legend
        fh.despine(axis)
        fh.makeLegend(ax=axis, ucGroups=palette.ugColors,
                      group=palette.combName)

        # Shrink the figure
        fh.shrink()

        # Titles
        fh.formatAxis(figTitle=title,
                      xTitle="Scores on {0}".format(x),
                      yTitle="Scores on {0}".format(y),
                      grid=False)

        # Store the page
        fh.addToPdf(dpi=90, pdfPages=pdf)
def plotVarImportance(data, pdf, var):
    """
    Plot a horizontal bar chart of variable importance.

    :Arguments:
        :type data: pandas.DataFrame
        :param data: Table with ``ranked_importance`` and ``feature``
                     columns.

        :type pdf: pdf object
        :param pdf: PDF object to save the generated figure.

        :type var: int
        :param var: Number of variables to plot (the first ``var`` rows of
                    ``data`` are kept).

    Nothing is returned. NOTE(review): ``palette`` and ``quickHBar`` are
    read from the enclosing module scope, and ``palette.chompColors`` is
    called on that shared palette on every call.
    """
    # Keep the first ``var`` rows, then order them by importance
    ranked = data[:var].sort_values(by="ranked_importance",
                                    ascending=True,
                                    axis=0)

    fh = figureHandler(proj='2d', figsize=(8, 8))

    # Chomp the palette and get the bar colors from the proportions
    palette.chompColors(start=3, end=palette.number)
    colors = palette.getColorsCmapPalette(ranked["ranked_importance"])

    # Percentages instead of proportions
    ranked["ranked_importance"] = ranked["ranked_importance"] * 100

    # Horizontal bars, one per feature
    quickHBar(ax=fh.ax[0],
              values=ranked["ranked_importance"],
              xticks=ranked["feature"],
              colors=colors,
              lw=0)

    # Titles
    fh.formatAxis(figTitle="Variable Importance Plot",
                  xTitle="%",
                  grid=False,
                  yTitle="Features")

    # Store the page
    fh.addToPdf(dpi=600, pdfPages=pdf)
def plotScatterplot2D(data, palette, pdf, nloads=3):
    """
    Plot a 2D scatter plot for each pair among the first ``nloads`` columns.

    :Arguments:
        :type data: pandas.DataFrame
        :param data: Loadings of the PCA.

        :param palette: Object exposing ``design.colors``, ``ugColors`` and
                        ``combName``.

        :type pdf: pdf object
        :param pdf: PDF object to save all the generated figures.

        :type nloads: int
        :param nloads: Number of leading columns to combine pairwise.
    """
    leading = data.columns.tolist()[:nloads]
    for x, y in combinations(leading, 2):
        # One figure per pair
        fh = figureHandler(proj="2d", figsize=(14, 8))
        axis = fh.ax[0]
        title = "{0} vs {1}".format(x, y)

        # Scatter of the pair, colored from the design
        scatter.scatter2D(x=list(data[x]),
                          y=list(data[y]),
                          colorList=palette.design.colors.tolist(),
                          ax=axis)

        # Legend, then shrink the figure
        fh.makeLegend(ax=axis, ucGroups=palette.ugColors,
                      group=palette.combName)
        fh.shrink()

        # Despine and label
        fh.despine(axis)
        fh.formatAxis(figTitle=title,
                      xTitle="Scores on {0}".format(x),
                      yTitle="Scores on {0}".format(y),
                      grid=False)

        # Store the page
        fh.addToPdf(dpi=600, pdfPages=pdf)
def main(args):
    """
    Plot a heatmap of the wide data, with or without a dendogram.

    :Arguments:
        :param args: Parsed command line arguments. The attributes read here
                     are ``input``, ``design``, ``uniqID``, ``labels``,
                     ``dendogram`` and ``fig``.
    """
    # Import the data and drop missing values
    dat = wideToDesign(args.input, args.design, args.uniqID, logger=logger)
    dat.dropMissing()

    # An axis keeps its labels unless it is named in args.labels
    x = "x" not in args.labels
    y = "y" not in args.labels

    logger.info("Plotting heatmaps")
    if args.dendogram == True:
        # Hierarchical cluster heatmap (HCH), saved directly
        fh = hm.plotHCHeatmap(dat.wide,
                              hcheatmap=True,
                              cmap=palette.mpl_colormap,
                              xlbls=x,
                              ylbls=y)
        fh.savefig(args.fig, format="pdf")
    else:
        # Single heatmap on a figure handler
        fh = figureHandler(proj='2d', figsize=(14, 14))
        hm.plotHeatmap(dat.wide,
                       fh.ax[0],
                       cmap=palette.mpl_colormap,
                       xlbls=x,
                       ylbls=y)
        fh.formatAxis(xTitle="sampleID")
        fh.export(out=args.fig, dpi=300)

    # Finishing script
    logger.info("Script Complete!")
def plotCVplots(data, cutoff, palette, pdf):
    """
    Add one page per group with the histogram and density of its CV.

    :Arguments:
        :param data: Table with one ``cv_<group>`` column per group.

        :param cutoff: x position of the vertical cutoff line; also printed
                       in the line label.

        :param palette: Object exposing ``design``, ``combName`` and
                        ``ugColors`` (colors keyed by group name).

        :param pdf: PDF object the figures are added to.
    """
    for name, _ in palette.design.groupby(palette.combName):
        column = 'cv_' + name

        # New figure for this group
        fh = figureHandler(proj='2d', figsize=(14, 8))
        axis = fh.ax[0]

        # x limits come from the 99th percentile of the group CV
        p99 = np.nanpercentile(data[column].values, 99)
        xmin = -p99 * 0.2
        xmax = p99 * 1.5

        # Histogram
        hist.serHist(ax=axis,
                     dat=data[column],
                     color='grey',
                     normed=1,
                     range=(xmin, xmax),
                     bins=15)

        # Density curve in the group color
        dist.plotDensityDF(data=data[column],
                           ax=axis,
                           lb="CV density",
                           colors=palette.ugColors[name])

        # Cutoff line and legend
        lines.drawCutoffVert(ax=axis, x=cutoff,
                             lb="Cutoff at: {0}".format(cutoff))
        fh.makeLegendLabel(ax=axis)

        # Titles and limits
        figTitle = "Density Plot of Coefficients of Variation in {0}".format(
            name)
        fh.formatAxis(yTitle='Density',
                      xlim=(xmin, xmax),
                      ylim="ignore",
                      figTitle=figTitle)

        # Shrink the figure, then store it
        fh.shrink()
        fh.addToPdf(pdfPages=pdf)
def plotBoxplotDistribution(pdf, wide, palette):
    """
    Add a boxplot of ``wide`` to ``pdf``.

    :Arguments:
        :param pdf: PDF object the figure is added to.

        :param wide: Data handed to ``box.boxDF``; its number of columns
                     sets the figure width.

        :param palette: Object exposing ``design["colors"]``.
    """
    # A quarter of a unit per column, never narrower than 12
    figWidth = max(len(wide.columns) / 4, 12)
    figure = figureHandler(proj="2d", figsize=(figWidth, 7))

    # Title only; axis limits are left alone
    figure.formatAxis(figTitle="Distribution by Samples Boxplot",
                      ylim="ignore",
                      grid=False,
                      xlim="ignore")

    # Boxplot colored from the design
    box.boxDF(ax=figure.ax[0], colors=palette.design["colors"], dat=wide)

    # Shrink the figure, then store it
    figure.shrink()
    figure.addToPdf(pdf, dpi=600)
def plotCV(data, cutoff, pdf):
    """
    Plot the histogram and density of the ``cv`` column with a cutoff line.

    :Arguments:
        :param data: Table with a ``cv`` column.

        :param cutoff: x position of the vertical cutoff line; also printed
                       in the line label.

        :param pdf: PDF object the figure is added to.

    NOTE(review): ``palette`` is read from the enclosing module scope.
    """
    fh = figureHandler(proj='2d')
    axis = fh.ax[0]

    # x limits come from the 99th percentile of the CV
    p99 = np.nanpercentile(data['cv'].values, 99)
    xmin = -p99 * 0.2
    xmax = p99 * 1.5

    # Histogram
    hist.serHist(ax=axis,
                 dat=data['cv'],
                 range=(xmin, xmax),
                 bins=15,
                 normed=1,
                 color=palette.mpl_colors[8])

    # Density curve
    dist.plotDensityDF(data=data['cv'],
                       ax=axis,
                       colors=palette.mpl_colors[1],
                       lb='CV density')

    # Cutoff line
    lines.drawCutoffVert(ax=axis,
                         x=cutoff,
                         cl=palette.mpl_colors[0],
                         lb="Cutoff at:\n{0}".format(cutoff))

    # Legend
    fh.makeLegendLabel(ax=axis)

    # Titles and limits
    fh.formatAxis(
        yTitle='Density',
        xlim=(xmin, xmax),
        ylim="ignore",
        figTitle="Density Plot of Coef. of Variation of the Retention Time")

    # Shrink the figure, then store it
    fh.shrink()
    fh.addToPdf(dpi=600, pdfPages=pdf)
def plotCDhistogram(count, pdf, group):
    """
    Plot a histogram of the ``diff`` column of ``count``.

    When no value in ``count['diff']`` is truthy, no figure is produced and
    a warning is logged instead.

    :Arguments:
        :type count: pandas.DataFrame.
        :param count: DataFrame with the counted digits and min, max and
                      diff among rows.

        :type pdf: matplotlib.backends.backend_pdf.PdfPages.
        :param pdf: PDF object to plot figures in.

        :type group: str.
        :param group: Name of the group to plot.
    """
    # Creating title
    title = "Distribution of difference between \n(min and max) for {0} compounds".format(group)

    if count['diff'].any():
        # Opening figure handler
        fh = figureHandler(proj='2d')

        # Plot histogram
        hist.quickHist(ax=fh.ax[0], dat=count['diff'])

        # Giving format to the axis
        fh.formatAxis(xTitle='Difference in Number of Digits (max - min)',
                      yTitle='Number of Features',
                      figTitle=title,
                      ylim="ignore")

        # Export figure
        fh.addToPdf(pdf, dpi=600)
    else:
        # Logger.warn is a deprecated alias (removed in Python 3.13);
        # Logger.warning is the supported spelling.
        logger.warning(
            "There were no differences in digit counts for {0}, no plot will be generated"
            .format(group))
def makePlots(SEDData, design, pdf, groupName, cutoff, p, plotType, ugColors, levels):
    """
    Draw one SED figure of the requested type and add it to ``pdf``.

    :Arguments:
        :type SEDData: pandas.dataFrame
        :param SEDData: Contains SED data either to Mean or pairwise. A
                        ``colors`` column is written into it by this call.

        :type design: pandas.dataFrame
        :param design: Design file after getColor (must hold ``colors``).

        :type pdf: PDF object
        :param pdf: PDF for output plots

        :type groupName: string
        :param groupName: Name of the group (figure title).

        :type cutoff: pandas.dataFrame
        :param cutoff: Cutoff values; ``plotCutoffs`` is applied to it along
                       axis 0.

        :type p: float
        :param p: Percentile for cutoff, forwarded to ``plotCutoffs``.

        :type plotType: string
        :param plotType: One of "scatterToMean", "scatterPairwise" or
                         "boxplotPairwise"; any other value draws only the
                         cutoffs and legend.

        :param ugColors: Forwarded to ``figure.makeLegend``.

        :param levels: Forwarded to ``figure.makeLegend``.
    """
    # Figure width grows with the number of rows, never below 16
    nFeatures = len(SEDData.index)
    figWidth = max(nFeatures/2, 16)
    figure = figureHandler(proj='2d', figsize=(figWidth, 8))
    axis = figure.ax[0]

    # Keep the colors aligned with the data (mutates the caller's frame)
    SEDData["colors"] = design["colors"]

    # Axis settings shared by every plot type; only the title differs
    axisOptions = dict(xlim=(-0.5, -0.5 + nFeatures),
                       ylim="ignore",
                       xticks=SEDData.index.values,
                       xTitle="Index",
                       yTitle="Standardized Euclidean Distance")

    if plotType == "scatterToMean":
        # Scatter plot of the distance to the mean
        figure.formatAxis(
            figTitle="Standardized Euclidean Distance from samples {} to the mean".format(groupName),
            **axisOptions)
        scatter.scatter2D(ax=axis,
                          colorList=SEDData["colors"],
                          x=range(len(SEDData.index)),
                          y=SEDData["SED_to_Mean"])
    elif plotType == "scatterPairwise":
        # One scatter series per index label
        figure.formatAxis(
            figTitle="Pairwise standardized Euclidean Distance from samples {}".format(groupName),
            **axisOptions)
        for index in SEDData.index.values:
            scatter.scatter2D(ax=axis,
                              colorList=design["colors"][index],
                              x=range(len(SEDData.index)),
                              y=SEDData[index])
    elif plotType == "boxplotPairwise":
        # Box plots of the pairwise distances
        figure.formatAxis(
            figTitle="Box-plots for pairwise standardized Euclidean Distance from samples {}".format(groupName),
            **axisOptions)
        box.boxDF(ax=axis, colors=SEDData["colors"].values, dat=SEDData)

    # Cutoff lines, applied over axis 0 of the cutoff table
    cutoff.apply(lambda x: plotCutoffs(x, ax=axis, p=p), axis=0)
    figure.shrink()

    # Legend (always drawn)
    figure.makeLegend(axis, ugColors, levels)

    # Store the page
    figure.addToPdf(pdf)
def iterateCombo(dat, combo, pdf):
    """
    Plot one pairwise combination and build its outlier flags.

    :Arguments:
        :type dat: interface.wideToDesign
        :param dat: A wideToDesign object containing wide and design
                    information.

        :param tuple combo: A tuple of pairwise combination for current
                            sample; only its first two items are used.

        :type pdf: matplotlib.backends.backend_pdf.PdfPages
        :param pdf: Handler for multi-page PDF that will contain all plots.

    :Updates:
        :type pdf: matplotlib.backends.backend_pdf.PdfPages
        :param pdf: One page holding both plots is added.

    :Returns:
        :return: ``df_flags`` of a Flags object holding four flag columns
                 for this combination.
    """
    c1, c2 = combo[0], combo[1]

    # Figure with two axes
    layout = [(0, 0, 1, 2), (0, 1, 1, 2)]
    fh = figureHandler(proj='2d', numAx=2, numRow=2, numCol=2,
                       arrangement=layout)

    # Scatter plot of c1 vs c2 on the first axis
    makeScatter(dat.wide.loc[:, c1], dat.wide.loc[:, c2], fh.ax[0], fh)

    # BA plot of c1 vs c2 on the second axis; it also yields the masks
    outlier, pearson, cooks, dffits = makeBA(dat.wide.loc[:, c1],
                                             dat.wide.loc[:, c2],
                                             fh.ax[1], fh)

    # Title, layout and margins
    fh.formatAxis(figTitle=buildTitle(dat, c1, c2))
    plt.tight_layout(pad=2, w_pad=.05)
    fh.shrink(top=.85, bottom=.25, left=.15, right=.9)

    # Store the page
    fh.addToPdf(dpi=90, pdfPages=pdf)

    # One flag column per mask, in this order
    flag = Flags(index=dat.wide.index)
    flagColumns = (('flag_{0}_{1}', outlier),
                   ('flag_pearson_{0}_{1}', pearson),
                   ('flag_cooks_{0}_{1}', cooks),
                   ('flag_dffits_{0}_{1}', dffits))
    for template, mask in flagColumns:
        flag.addColumn(column=template.format(c1, c2), mask=mask)

    return flag.df_flags
def plotFlagDist(propSample, propFeature, pdf):
    """
    Plot the distribution of proportion of samples and features that were
    outliers.

    :Arguments:
        :param propSample: Proportion of features flagged as an outlier,
                           per sample. Sorted in place, descending.

        :param propFeature: Proportion of samples flagged as an outlier,
                            per feature. Sorted in place, descending.

        :type pdf: string
        :param pdf: Filename of pdf to save plots.

    :Returns:
        Nothing; two bar plots (top 30 samples, top 30 features) are saved
        to ``pdf``.
    """
    # Sort in place, as before, so callers still see the sorted order
    propSample.sort_values(inplace=True, ascending=False)
    propFeature.sort_values(inplace=True, ascending=False)

    # (values, x title, y title) for each page, in output order
    panels = (
        (propSample, "Sample ID",
         "Proportion of features that were outliers."),
        (propFeature, "Feature ID",
         "Proportion of samples that a feature was an outlier."),
    )

    # The context manager closes the PDF even if a plot raises
    with PdfPages(pdf) as ppFlag:
        for proportions, xTitle, yTitle in panels:
            # Open figure handler instance
            fh = figureHandler(proj='2d')

            # Only the 30 largest proportions are shown
            top = proportions.head(30)
            keys = list(top.keys())

            # .values replaces get_values(), which was removed in pandas 1.0
            bar.quickBar(ax=fh.ax[0], y=list(top.values), x=keys)

            # Formatting axis
            fh.formatAxis(xlim=(0, len(keys) + 1),
                          ylim="ignore",
                          xTitle=xTitle,
                          yTitle=yTitle)

            # Save figure in PDF
            ppFlag.savefig(fh.fig, bbox_inches='tight')
def qqPlot(tresid, tfit, oname):
    """
    Plot the residual diagnostic plots by sample.

    Output q-q plot, boxplots and distributions of the residuals. These
    plots will be used diagnose if residuals are approximately normal.

    :Arguments:
        :param tresid: Pearson normalized residuals (transposed)
                       (residuals / sqrt(MSE)). Must expose ``.columns``;
                       one page is produced per column.

        :type tfit: pandas DataFrame
        :param tfit: output of the ANOVA (transposed), indexed by the same
                     column names as ``tresid``.

        :type oname: string
        :param oname: Name of the output file in pdf format.

    :Returns:
        :rtype: PDF
        :returns: Outputs a pdf file containing all plots.
    """
    # Open pdf
    with PdfPages(oname) as pdf:
        # Three small axes on the first row, one wide axis on the second
        axisLayout = [(0, 0, 1, 1), (0, 1, 1, 1), (0, 2, 1, 1), (1, 0, 3, 1)]

        for col in tresid.columns:
            # Creating figure
            fig = figureHandler(proj='2d', numAx=4, numRow=2, numCol=3,
                                arrangement=axisLayout)

            data = tresid[col].values.ravel()
            df_data = pd.DataFrame(data)

            # Plot qqplot on axis 0
            sm.graphics.qqplot(tresid[col], fit=True, line='r', ax=fig.ax[0])

            # Plot boxplot on axis 1
            box.boxSeries(ser=data, ax=fig.ax[1])

            # Plot histogram on axis 2
            hist.quickHist(ax=fig.ax[2], dat=df_data, orientation='horizontal')

            # Plot scatterplot on axis 3 (all points blue)
            scatter.scatter2D(ax=fig.ax[3], x=tfit[col], y=tresid[col],
                              colorList=list('b'))

            # Draw cutoff line for scatterplot on axis 3
            lines.drawCutoffHoriz(ax=fig.ax[3], y=0)

            # Format axis 0
            fig.formatAxis(figTitle=col, axnum=0, grid=False, showX=True,
                           yTitle="Sample Quantiles", xTitle=" ")

            # Format axis 1
            fig.formatAxis(axnum=1, axTitle="Standardized Residuals",
                           grid=False, showX=False, showY=True, xTitle=" ")

            # Format axis 2
            fig.formatAxis(axnum=2, grid=False, showX=True, showY=True,
                           axTitle=" ", xTitle=" ")

            # Format axis 3
            fig.formatAxis(axnum=3,
                           axTitle="Fitted Values vs Residual Values",
                           xTitle="Fitted Values", yTitle="Residual Values",
                           grid=False)

            # Add figure to pdf
            fig.addToPdf(pdfPages=pdf)
def nontechnical_analysis(args, df, mask, C, clustering):
    """
    Re-order the clustering results for the user, write the module table
    and draw three correlation heatmaps.

    :Arguments:
        :param args: Parsed arguments; ``args.out`` (table path) and
                     ``args.figure`` (PDF path) are read here.

        :param df: Original data frame; its index provides the row names.

        :param mask: Selector applied as ``df[mask]`` to get the rows that
                     took part in the clustering.

        :param C: Correlation matrix of the clustered variables.

        :param clustering: Cluster label of each clustered variable; labels
                           are used as integers 0 .. max.
    """
    import sys

    # Get the map from the name to the original row index.
    all_row_names = df.index.values
    row_index_map = {s: i for i, s in enumerate(all_row_names)}

    # If some variables are uninformative for clustering,
    # the correlation matrix and the cluster vector will have smaller
    # dimensions than the number of rows in the original data frame.
    remaining_row_names = df[mask].index.values

    # Count the variables included in the clustering.
    p = clustering.shape[0]

    # Count the clusters.
    k = clustering.max() + 1

    # To sort the modules and to sort the variables within the modules,
    # we want to use absolute values of correlations.
    C_abs = np.abs(C)

    # For each cluster, get its indices and the degree of each member.
    selections = []
    degrees = np.zeros(p, dtype=float)
    for i in range(k):
        selection = np.flatnonzero(clustering == i)
        selections.append(selection)
        submatrix = C_abs[np.ix_(selection, selection)]
        if selection.size > 1:
            denom = selection.size - 1
            # The 1 subtracted is the variable's correlation with itself.
            degrees[selection] = (submatrix.sum(axis=0) - 1) / denom

    # Modules should be reordered according to decreasing "average degree".
    average_degrees = [degrees[selection].mean() for selection in selections]
    module_to_cluster = np.argsort(average_degrees)[::-1]
    cluster_to_module = {
        cluster: module for module, cluster in enumerate(module_to_cluster)
    }

    # Sort by module, then by decreasing degree, then by original position.
    triples = [(
        cluster_to_module[clustering[i]],
        -degrees[i],
        i,
    ) for i in range(p)]
    _a, _b, new_to_old_idx = zip(*sorted(triples))

    # Write the tab-separated module table.
    # On Python 3 the csv module needs a text-mode file opened with
    # newline=''; writing to a 'wb' handle raises TypeError there.
    # Python 2 keeps the binary mode it requires.
    header = ('Gene', 'Module', 'Entry Index', 'Average Degree', 'Degree')
    if sys.version_info[0] >= 3:
        fout = open(args.out, 'w', newline='')
    else:
        fout = open(args.out, 'wb')
    with fout:
        writer = csv.writer(fout, 'excel-tab')
        writer.writerow(header)
        for old_i in new_to_old_idx:
            name = remaining_row_names[old_i]
            cluster = clustering[old_i]
            row = (
                name,
                cluster_to_module[cluster] + 1,
                row_index_map[name] + 1,
                average_degrees[cluster],
                degrees[old_i],
            )
            writer.writerow(row)

    # Create Output
    fh1 = figureHandler(proj="2d")
    fh2 = figureHandler(proj="2d")
    fh3 = figureHandler(proj="2d")

    # Prepare to create the sorted heatmaps. (fh2)
    C_sorted = C[np.ix_(new_to_old_idx, new_to_old_idx)]
    clustering_new = clustering[np.ix_(new_to_old_idx)]

    # Make a smoothed correlation array. (fh3)
    S = expansion(clustering_new)
    block_mask = S.dot(S.T)
    denom = np.outer(S.sum(axis=0), S.sum(axis=0))
    small = S.T.dot(C_sorted).dot(S) / denom
    C_all_smoothed = S.dot(small).dot(S.T)
    # Keep the sorted values where block_mask is set, smoothed ones elsewhere.
    C_smoothed = (C_all_smoothed * (1 - block_mask) + C_sorted * block_mask)

    # Getting list of names for heatmaps 2 and 3
    hpnames = [remaining_row_names[old_i] for old_i in new_to_old_idx]

    # Plot using something like http://stackoverflow.com/questions/15988413/
    # Draw first heatmap [C]
    hm.plotHeatmap(C,
                   fh1.ax[0],
                   cmap=palette.mpl_colormap,
                   xlbls=remaining_row_names,
                   ylbls=remaining_row_names)
    fh1.formatAxis(xTitle="sampleID", figTitle="Correlations")

    # Draw second heatmap [C_sorted](reordered according to the clustering).
    hm.plotHeatmap(C_sorted,
                   fh2.ax[0],
                   cmap=palette.mpl_colormap,
                   xlbls=hpnames,
                   ylbls=hpnames)
    fh2.formatAxis(xTitle="sampleID", figTitle="Re-Ordered correlations")

    # Draw the heatmap [C_smoothed](smoothed version of C_sorted)
    hm.plotHeatmap(C_smoothed,
                   fh3.ax[0],
                   cmap=palette.mpl_colormap,
                   xlbls=hpnames,
                   ylbls=hpnames)
    fh3.formatAxis(xTitle="sampleID", figTitle="Smoothed correlations")

    # Create output from maps
    with PdfPages(args.figure) as pdf:
        fh1.addToPdf(pdf)
        fh2.addToPdf(pdf)
        fh3.addToPdf(pdf)
def plotSignificantROR(data, pdf, palette):
    """
    Plot a scatter plot of x vs y for every significant row of ``data``.

    Rows whose ``pval`` is greater than 0.05 are skipped. Each plotted page
    shows the points, the fitted line and the prediction bands.

    :Arguments:
        :type data: pandas DataFrame
        :param data: One row per regression, with the entries ``pval``,
                     ``res``, ``x``, ``y``, ``fitted``, ``name``, ``slope``
                     and ``rsq``.

        :type pdf: PdfPages
        :param pdf: pdf object to store scatterplots

        :param palette: Object exposing ``list_colors``, ``ugColors`` and
                        ``combName``.
    """
    for index, row in data.iterrows():
        # Only rows with a p-value of 0.05 or less are plotted
        if row["pval"] > 0.05:
            continue

        # Get the prediction bands (the standard error is not used)
        prstd, lower, upper = wls_prediction_std(row["res"])

        # Sort the bands by x so they are drawn left to right
        toPlot = pd.DataFrame({"x": row["x"], "lower": lower, "upper": upper})
        toPlot.sort_values(by="x", inplace=True)

        # Create plot
        fh = figureHandler(proj="2d", figsize=(14, 8))

        # Plot scatterplot
        scatter.scatter2D(ax=fh.ax[0], x=row["x"], y=row["y"],
                          colorList=palette.list_colors)

        # Fitted line and both bands
        lines.drawCutoff(ax=fh.ax[0], x=row["x"], y=row["fitted"], c="c")
        lines.drawCutoff(ax=fh.ax[0], x=toPlot["x"], y=toPlot["lower"], c="r")
        lines.drawCutoff(ax=fh.ax[0], x=toPlot["x"], y=toPlot["upper"], c="r")

        # Formatting: the upper y limit is scaled by 1.2
        ymin, ymax = fh.ax[0].get_ylim()
        fh.formatAxis(xTitle="Run Order", yTitle="Value",
                      ylim=(ymin, ymax*1.2),
                      figTitle=u"{} Scatter plot (fitted regression line and prediction bands"
                               " included)".format(row["name"]))

        # Shrink figure
        fh.shrink()

        # Add legend to figure
        fh.makeLegend(ax=fh.ax[0], ucGroups=palette.ugColors,
                      group=palette.combName)

        # Add text to the ax. The R^2 spec was "{2:4f}" (width 4, six
        # decimals); "{2:.4f}" matches the slope and p-value fields.
        fh.ax[0].text(.7, .85,
                      u"Slope= {0:.4f}\n(p-value = {1:.4f})\n"
                      "$R^2$ = {2:.4f}".format(round(row["slope"], 4),
                                               round(row["pval"], 4),
                                               round(row["rsq"], 4)),
                      transform=fh.ax[0].transAxes, fontsize=12)

        # Save to PDF
        fh.addToPdf(pdf)
def volcano(combo, results, oname, cutoff=4):
    """
    Plot volcano plots.

    Creates one volcano plot per ``diff_of`` column of ``results``, pairing
    it with the ``-log10_p-value_`` column that shares its last
    underscore-separated token.

    :Arguments:
        :type combo: dictionary
        :param combo: A dictionary of dictionaries with all possible
                      pairwise combinations. Not read by this function.

        :type results: pandas DataFrame
        :param results: Table holding the ``diff_of...`` and
                        ``-log10_p-value_...`` columns.

        :type oname: string
        :param oname: Name of the output file in pdf format.

        :type cutoff: int
        :param cutoff: The cutoff value for significance.

    :Returns:
        :rtype: PDF
        :returns: Outputs a pdf file containing all plots.
    """
    # Collect both column families in one pass, keyed by the last token
    lpvals = {}
    difs = {}
    for col in results.columns.tolist():
        key = col.rsplit("_", 1)[-1]
        if col.startswith("-log10_p-value_"):
            lpvals[key] = results[col]
        elif col.startswith("diff_of"):
            difs[key] = results[col]

    # Making plots
    with PdfPages(oname) as pdf:
        for key in sorted(difs):
            # Set up figure
            volcanoPlot = figureHandler(proj="2d")
            axis = volcanoPlot.ax[0]

            # All results in blue
            scatter.scatter2D(x=list(difs[key]),
                              y=list(lpvals[key]),
                              colorList=['b'],
                              ax=axis)

            # Results beyond the threshold are drawn again in red
            cutLpvals = lpvals[key][lpvals[key] > cutoff]
            if not cutLpvals.empty:
                cutDiff = difs[key][cutLpvals.index]
                scatter.scatter2D(x=list(cutDiff),
                                  y=list(cutLpvals),
                                  colorList=['r'],
                                  ax=axis)

            # Horizontal cutoff line
            lines.drawCutoffHoriz(y=cutoff, ax=axis)

            # Titles
            yTitle = "-log10(p-value) for Diff of treatment = {0}".format(key)
            xTitle = "Diff of treatment = {0}".format(key)
            volcanoPlot.formatAxis(axTitle=key, grid=False,
                                   yTitle=yTitle, xTitle=xTitle)

            # Add figure to PDF
            volcanoPlot.addToPdf(pdfPages=pdf)