def plotDistributions(data, cutoff, palette, pdf):
    """
    Overlay one CV density curve per group on a single axis.

    :Arguments:
        :type data: pandas.DataFrame (assumed)
        :param data: Needs a 'cv' column plus one 'cv_<group>' column for
                     every group produced by grouping the design.

        :type cutoff: number
        :param cutoff: x position of the vertical cutoff line.

        :type palette: colour manager object
        :param palette: Provides design, combName and ugColors.

        :type pdf: pdf object
        :param pdf: PDF object the finished figure is added to.
    """
    # Single-axis figure
    fh = figureHandler(proj='2d', figsize=(14, 8))

    # x limits are scaled from the 99th percentile of the overall CV
    cv99 = np.nanpercentile(data['cv'].values, 99)
    xmin = -cv99 * 0.2
    xmax = cv99 * 1.5

    # One density curve per group; only the group key is needed here
    for name, _ in palette.design.groupby(palette.combName):
        dist.plotDensityDF(ax=fh.ax[0],
                           data=data["cv_" + name],
                           lb="{0}".format(name),
                           colors=palette.ugColors[name])

    # Vertical line marking the cutoff
    lines.drawCutoffVert(ax=fh.ax[0],
                         x=cutoff,
                         lb="Cutoff at: {0}".format(cutoff))

    # Legend built from the labels given above
    fh.makeLegendLabel(ax=fh.ax[0])

    # Titles and x range
    figTitle = "Density Plot of Coefficients of Variation by {0}".format(
        palette.combName)
    fh.formatAxis(figTitle=figTitle, xlim=(xmin, xmax), yTitle="Density")

    # Shrink figure, then store it
    fh.shrink()
    fh.addToPdf(pdfPages=pdf)
예제 #2
0
def plotDensity(data, name, pdf):
    """
    Draw a horizontal boxplot and a density plot of ``data`` on one figure
    and add that figure to ``pdf``.

    The colour comes from the module-level ``palette`` object, which is not
    a parameter of this function.

    :Arguments:
        :type data: pandas.DataFrame
        :param data: Values to plot; it is transposed before plotting.

        :type name: string
        :param name: Group name, used in the title and as the key into
                     palette.ugColors.

        :type pdf: pdf object
        :param pdf: PDF object the figure is added to.
    """
    # Two axes arranged on a 4x1 grid (tuple meaning per the original
    # comment: x, y, colspan, rowspan)
    layout = [(0,0,1,3),(3,0,1,1)]
    fh = figureHandler(proj='2d', numAx=2, numRow=4, numCol=1,
                       figsize=(8,13), arrangement=layout)

    # Title goes on the first axis
    fh.formatAxis(figTitle="Distribution by Features {0}".format(name),
                  xlim="ignore", ylim="ignore", axnum=0, showX=False)

    # The same group colour repeated once per row of data
    colors = [palette.ugColors[name]] * len(data.index)

    transposed = data.T

    # Boxplot on the first axis
    box.boxDF(ax=fh.ax[0], colors=colors, dat=transposed,
              vert=False, rot=0)

    # Density of all values together on the second axis
    density.plotDensityDF(data=transposed.unstack(),
                          ax=fh.ax[1], colors=colors[0])

    # Store the figure
    fh.addToPdf(pdf)
예제 #3
0
def plotDensityDistribution(pdf, wide, palette):
    """
    Draw the density plot of ``wide`` and add it to ``pdf``.

    :Arguments:
        :type pdf: pdf object
        :param pdf: PDF object the figure is added to.

        :type wide: pandas.DataFrame (assumed)
        :param wide: Data handed to the density plot.

        :type palette: colour manager object
        :param palette: Provides design["colors"], ugColors and combName.
    """
    fh = figureHandler(proj="2d", figsize=(12, 7))
    axis = fh.ax[0]

    # Title only; axis limits are left alone and the grid is off
    fh.formatAxis(figTitle="Distribution by Samples Density",
                  grid=False,
                  xlim="ignore",
                  ylim="ignore")

    # Density curves coloured with the design colours
    density.plotDensityDF(data=wide,
                          ax=axis,
                          colors=palette.design["colors"])

    # Legend by group
    fh.makeLegend(ax=axis,
                  group=palette.combName,
                  ucGroups=palette.ugColors)

    # Shrink, then store at 600 dpi
    fh.shrink()
    fh.addToPdf(pdf, dpi=600)
예제 #4
0
def plotScatterplot3D(data, palette, pdf):
    """
    Draw a 3D scatter plot of the PC1, PC2 and PC3 columns of ``data``.

    :Arguments:
        :type data: pandas.DataFrame
        :param data: Must contain the columns "PC1", "PC2" and "PC3".

        :type palette: colour manager object
        :param palette: Provides design.colors, ugColors and combName.

        :type pdf: pdf object
        :param pdf: PDF object to save all the generated figures.
    """
    # Figure with a 3D projection
    fh = figureHandler(proj="3d", figsize=(14, 8))

    # One coordinate list per component, in x, y, z order
    xs, ys, zs = (list(data[pc]) for pc in ("PC1", "PC2", "PC3"))

    # Scatter the points coloured with the design colours
    scatter.scatter3D(ax=fh.ax[0],
                      x=xs,
                      y=ys,
                      z=zs,
                      colorList=palette.design.colors.tolist())

    # Legend by group
    fh.makeLegend(ax=fh.ax[0],
                  group=palette.combName,
                  ucGroups=palette.ugColors)

    # Axis titles
    fh.format3D(xTitle="PC1", yTitle="PC2", zTitle="PC3")

    # Store the figure
    fh.addToPdf(dpi=600, pdfPages=pdf)
예제 #5
0
def main(args):
    """
    Draw a 3D scatter plot of three columns of a wide file and save it as PDF.

    :Arguments:
        :type args: argparse.Namespace (assumed)
        :param args: Reads the attributes design, input, group, uniqID,
                     x, y, z, rotation, elevation and figure.

    Points are coloured by group only when ``args.group`` is set and
    ``args.uniqID`` equals "sampleID"; a legend is added in that case.
    """
    # Loading design.
    # pd.DataFrame.from_csv was removed in pandas 1.0; read_csv with
    # index_col=0 and parse_dates=True is its documented replacement.
    if args.design:
        design = pd.read_csv(args.design,
                             sep="\t",
                             index_col=0,
                             parse_dates=True)
        design.reset_index(inplace=True)
    else:
        design = False

    # Loading wide file
    wide = pd.read_csv(args.input, sep="\t", index_col=0, parse_dates=True)

    # Open Figure handler
    fh = figureHandler(proj="3d", figsize=(14, 8))

    # If design file with group and the uniqID is "sampleID" then color by group
    colorByGroup = args.group and args.uniqID == "sampleID"
    if colorByGroup:
        glist = list(design[args.group])
        colorList, ucGroups = palette.getColorsByGroup(design=design,
                                                       group=args.group,
                                                       uGroup=sorted(
                                                           set(glist)))
    else:
        # No grouping: one colour for every point and nothing for the legend
        colorList = palette.mpl_colors[0]
        ucGroups = dict()

    # Plot scatterplot 3D
    scatter.scatter3D(ax=fh.ax[0],
                      x=list(wide[args.x]),
                      y=list(wide[args.y]),
                      z=list(wide[args.z]),
                      colorList=colorList)

    # Despine axis (spine = tick)
    fh.despine(fh.ax[0])

    # Give format to the plot
    fh.format3D(title=args.x + " vs " + args.y + " vs " + args.z,
                xTitle=args.x,
                yTitle=args.y,
                zTitle=args.z,
                rotation=float(args.rotation),
                elevation=float(args.elevation))

    # If groups are provided create a legend
    if colorByGroup:
        fh.makeLegend(ax=fh.ax[0], ucGroups=ucGroups, group=args.group)
        fh.shrink()

    # Saving figure to file
    with PdfPages(args.figure) as pdfOut:
        fh.addToPdf(dpi=600, pdfPages=pdfOut)
    logger.info("Script Complete!")
예제 #6
0
def plotScores(data, palette, pdf):
    """
    Draw one 2D scatter plot for every pair of columns in ``data``.

    :Arguments:
        :type data: pandas.DataFrame
        :param data: Scores; each pair of columns gives one figure.

        :type palette: colorManager.object
        :param palette: Object from color manager

        :type pdf: pdf object
        :param pdf: PDF object to save all the generated figures.

    Nothing is returned; each figure is added to ``pdf``.
    """
    columnNames = data.columns.tolist()

    # Every unordered pair of columns gets its own figure
    for xName, yName in combinations(columnNames, 2):
        fh = figureHandler(proj="2d", figsize=(14, 8))

        # Scatter the two score columns against each other
        scatter.scatter2D(ax=fh.ax[0],
                          x=list(data[xName]),
                          y=list(data[yName]),
                          colorList=palette.design.colors.tolist())

        # Legend by group
        fh.makeLegend(ax=fh.ax[0],
                      group=palette.combName,
                      ucGroups=palette.ugColors)

        # Shrink axis to fit legend, then despine
        fh.shrink()
        fh.despine(fh.ax[0])

        # Titles
        fh.formatAxis(figTitle="{0} vs {1}".format(xName, yName),
                      xTitle="Scores on {0}".format(xName),
                      yTitle="Scores on {0}".format(yName),
                      grid=False)

        # Store the figure
        fh.addToPdf(dpi=600, pdfPages=pdf)
예제 #7
0
def plotScores(data, palette, pdf):
    """
    Add one 2D scatter plot to ``pdf`` for every pair of columns in
    ``data`` (e.g. PC1 vs PC2, PC1 vs PC3, PC2 vs PC3 for three columns).

    :Arguments:
        :type data: pandas.core.frame.DataFrame
        :param data: Data frame with the data to plot.

        :type palette: colour manager object
        :param palette: Provides design.colors, ugColors and combName.

        :type pdf: pdf object
        :param pdf: PDF object the figures are added to.

    Nothing is returned.
    """
    columnNames = data.columns.tolist()

    for xName, yName in itertools.combinations(columnNames, 2):
        # New single-axis figure for this pair
        fh = figureHandler(proj="2d", figsize=(14, 8))

        # Scatter the two columns against each other
        scatter.scatter2D(colorList=palette.design.colors.tolist(),
                          x=list(data[xName]),
                          y=list(data[yName]),
                          ax=fh.ax[0])

        # Despine axis
        fh.despine(fh.ax[0])

        # Legend by group
        fh.makeLegend(ax=fh.ax[0],
                      group=palette.combName,
                      ucGroups=palette.ugColors)

        # Shrink the plot so everything fits
        fh.shrink()

        # Titles
        fh.formatAxis(figTitle="{0} vs {1}".format(xName, yName),
                      xTitle="Scores on {0}".format(xName),
                      yTitle="Scores on {0}".format(yName),
                      grid=False)

        # Store the figure (90 dpi)
        fh.addToPdf(dpi=90, pdfPages=pdf)
예제 #8
0
def plotVarImportance(data, pdf, var):
    """
    Draw a horizontal bar chart of feature importances and add it to ``pdf``.

    Uses (and modifies, via chompColors) the module-level ``palette``.

    :Arguments:
        :type data: pandas.DataFrame
        :param data: Must contain the columns "ranked_importance" and
                     "feature".

        :type pdf: pdf object
        :param pdf: PDF object to save all the generated figures.

        :type var: int
        :param var: Number of variables to plot.

    Nothing is returned.
    """
    # Keep the first ``var`` rows, then order them by increasing importance
    selected = data[:var]
    selected = selected.sort_values(by="ranked_importance",
                                    ascending=True,
                                    axis=0)

    # Single-axis square figure
    fh = figureHandler(proj='2d', figsize=(8, 8))

    # Restrict the palette, then derive one colour per bar
    palette.chompColors(start=3, end=palette.number)
    barColors = palette.getColorsCmapPalette(selected["ranked_importance"])

    # Proportions become percentages (after the colours were derived)
    selected["ranked_importance"] = selected["ranked_importance"] * 100

    # Bars
    quickHBar(ax=fh.ax[0],
              values=selected["ranked_importance"],
              xticks=selected["feature"],
              colors=barColors,
              lw=0)

    # Titles
    fh.formatAxis(figTitle="Variable Importance Plot",
                  xTitle="%",
                  yTitle="Features",
                  grid=False)

    # Store the figure
    fh.addToPdf(dpi=600, pdfPages=pdf)
예제 #9
0
def plotScatterplot2D(data, palette, pdf, nloads=3):
    """
    Draw a 2D scatter plot for every pair among the first ``nloads``
    columns of ``data``.

    :Arguments:
        :type data: pandas.DataFrame
        :param data: Loadings of the PCA.

        :type palette: colour manager object
        :param palette: Provides design.colors, ugColors and combName.

        :type pdf: pdf object
        :param pdf: PDF object to save all the generated figures.

        :type nloads: int
        :param nloads: Number of leading columns to combine pairwise.
    """
    # Only the leading columns take part in the pairwise combinations
    leadingColumns = data.columns.tolist()[:nloads]

    for xName, yName in combinations(leadingColumns, 2):
        # New single-axis figure for this pair
        fh = figureHandler(proj="2d", figsize=(14, 8))

        # Scatter the two columns against each other
        scatter.scatter2D(ax=fh.ax[0],
                          x=list(data[xName]),
                          y=list(data[yName]),
                          colorList=palette.design.colors.tolist())

        # Legend by group
        fh.makeLegend(ax=fh.ax[0],
                      group=palette.combName,
                      ucGroups=palette.ugColors)

        # Shrink axis to fit legend, then despine
        fh.shrink()
        fh.despine(fh.ax[0])

        # Titles
        fh.formatAxis(figTitle="{0} vs {1}".format(xName, yName),
                      xTitle="Scores on {0}".format(xName),
                      yTitle="Scores on {0}".format(yName),
                      grid=False)

        # Store the figure
        fh.addToPdf(dpi=600, pdfPages=pdf)
def main(args):
    """
    Load the input through wideToDesign and save a heatmap of its wide table.

    Reads args.input, args.design, args.uniqID, args.labels, args.dendogram
    and args.fig. With args.dendogram the hierarchical-cluster heatmap is
    drawn, otherwise a plain heatmap.
    """
    # Import the data and call its dropMissing() clean-up
    dat = wideToDesign(args.input, args.design, args.uniqID, logger=logger)
    dat.dropMissing()

    # An axis' labels flag is switched off when its letter is in args.labels
    showX = "x" not in args.labels
    showY = "y" not in args.labels

    logger.info("Plotting heatmaps")
    if args.dendogram == True:
        # Heatmap with dendogram (hierarchical cluster heatmap)
        fh = hm.plotHCHeatmap(dat.wide,
                              hcheatmap=True,
                              cmap=palette.mpl_colormap,
                              xlbls=showX,
                              ylbls=showY)
        fh.savefig(args.fig, format="pdf")
    else:
        # Single heatmap on a square figure
        fh = figureHandler(proj='2d', figsize=(14, 14))
        hm.plotHeatmap(dat.wide,
                       fh.ax[0],
                       cmap=palette.mpl_colormap,
                       xlbls=showX,
                       ylbls=showY)

        # Axis title, then export
        fh.formatAxis(xTitle="sampleID")
        fh.export(out=args.fig, dpi=300)

    # Finishing script
    logger.info("Script Complete!")
def plotCVplots(data, cutoff, palette, pdf):
    """
    For every group, draw a histogram plus density curve of its CV column
    on a separate figure and add it to ``pdf``.

    :Arguments:
        :type data: pandas.DataFrame (assumed)
        :param data: Needs one 'cv_<group>' column per group.

        :type cutoff: number
        :param cutoff: x position of the vertical cutoff line.

        :type palette: colour manager object
        :param palette: Provides design, combName and ugColors.

        :type pdf: pdf object
        :param pdf: PDF object the figures are added to.
    """
    # Only the group key is needed from the groupby
    for name, _ in palette.design.groupby(palette.combName):
        fh = figureHandler(proj='2d', figsize=(14, 8))

        groupCV = data['cv_' + name]

        # x limits are scaled from the group's 99th percentile
        cv99 = np.nanpercentile(groupCV.values, 99)
        xmin = -cv99 * 0.2
        xmax = cv99 * 1.5

        # Histogram restricted to the same range as the axis
        hist.serHist(ax=fh.ax[0],
                     dat=groupCV,
                     bins=15,
                     range=(xmin, xmax),
                     normed=1,
                     color='grey')

        # Density curve in the group colour
        dist.plotDensityDF(ax=fh.ax[0],
                           data=groupCV,
                           colors=palette.ugColors[name],
                           lb="CV density")

        # Vertical line marking the cutoff
        lines.drawCutoffVert(ax=fh.ax[0],
                             x=cutoff,
                             lb="Cutoff at: {0}".format(cutoff))

        # Legend built from the labels given above
        fh.makeLegendLabel(ax=fh.ax[0])

        # Titles and x range
        figTitle = "Density Plot of Coefficients of Variation in {0}".format(
            name)
        fh.formatAxis(figTitle=figTitle,
                      xlim=(xmin, xmax),
                      ylim="ignore",
                      yTitle='Density')

        # Shrink figure to fit legend, then store it
        fh.shrink()
        fh.addToPdf(pdfPages=pdf)
예제 #12
0
def plotBoxplotDistribution(pdf, wide, palette):
    """
    Draw the boxplot of ``wide`` and add it to ``pdf``.

    :Arguments:
        :type pdf: pdf object
        :param pdf: PDF object the figure is added to.

        :type wide: pandas.DataFrame (assumed)
        :param wide: Data handed to the boxplot.

        :type palette: colour manager object
        :param palette: Provides design["colors"].
    """
    # Width grows with the number of columns but never drops below 12
    figWidth = max(len(wide.columns) / 4, 12)
    fh = figureHandler(proj="2d", figsize=(figWidth, 7))

    # Title only; axis limits are left alone and the grid is off
    fh.formatAxis(figTitle="Distribution by Samples Boxplot",
                  xlim="ignore",
                  ylim="ignore",
                  grid=False)

    # Boxes coloured with the design colours
    box.boxDF(dat=wide, ax=fh.ax[0], colors=palette.design["colors"])

    # Shrink, then store at 600 dpi
    fh.shrink()
    fh.addToPdf(pdf, dpi=600)
예제 #13
0
def plotCV(data, cutoff, pdf):
    """
    Draw a histogram and a density curve of the 'cv' column with a cutoff
    line, and add the figure to ``pdf``.

    Colours come from the module-level ``palette`` (not a parameter).

    :Arguments:
        :type data: pandas.DataFrame (assumed)
        :param data: Needs a 'cv' column.

        :type cutoff: number
        :param cutoff: x position of the vertical cutoff line.

        :type pdf: pdf object
        :param pdf: PDF object the figure is added to.
    """
    fh = figureHandler(proj='2d')

    cvValues = data['cv']

    # x limits are scaled from the 99th percentile of the CV
    cv99 = np.nanpercentile(cvValues.values, 99)
    xmin = -cv99 * 0.2
    xmax = cv99 * 1.5

    # Histogram restricted to the same range as the axis
    hist.serHist(ax=fh.ax[0],
                 dat=cvValues,
                 color=palette.mpl_colors[8],
                 normed=1,
                 bins=15,
                 range=(xmin, xmax))

    # Density curve
    dist.plotDensityDF(ax=fh.ax[0],
                       data=cvValues,
                       lb='CV density',
                       colors=palette.mpl_colors[1])

    # Vertical line marking the cutoff
    lines.drawCutoffVert(ax=fh.ax[0],
                         x=cutoff,
                         lb="Cutoff at:\n{0}".format(cutoff),
                         cl=palette.mpl_colors[0])

    # Legend built from the labels given above
    fh.makeLegendLabel(ax=fh.ax[0])

    # Titles and x range
    fh.formatAxis(
        figTitle="Density Plot of Coef. of Variation of the Retention Time",
        xlim=(xmin, xmax),
        ylim="ignore",
        yTitle='Density')

    # Shrink, then store at 600 dpi
    fh.shrink()
    fh.addToPdf(dpi=600, pdfPages=pdf)
def plotCDhistogram(count, pdf, group):
    """
    Plot a histogram of the 'diff' column of ``count`` and add it to ``pdf``.

    When no value in count['diff'] is truthy (i.e. every difference is
    zero) no figure is produced and a warning is logged instead.

        :Arguments:
            :type count: pandas.DataFrame.
            :param count: DataFrame with the counted digits and min, max and
                            diff among rows.

            :type pdf: matplotlib.backends.backend_pdf.PdfPages.
            :param pdf: PDF object to plot figures in.

            :type group: str.
            :param group: Name of the group to plot.
    """
    # Nothing to plot: report it and stop
    if not count['diff'].any():
        # Logger.warn is a deprecated alias; warning() is the supported name
        logger.warning(
            "There were no differences in digit counts for {0}, no plot will be generated"
            .format(group))
        return

    # Creating title
    title = "Distribution of difference between \n(min and max) for {0} compounds".\
            format(group)

    # Opening figure handler
    fh = figureHandler(proj='2d')

    # Plot histogram
    hist.quickHist(ax=fh.ax[0], dat=count['diff'])

    # Giving format to the axis
    fh.formatAxis(xTitle='Difference in Number of Digits (max - min)',
                  yTitle='Number of Features',
                  figTitle=title,
                  ylim="ignore")

    # Export figure
    fh.addToPdf(pdf, dpi=600)
예제 #15
0
def makePlots (SEDData, design, pdf, groupName, cutoff, p, plotType, ugColors, levels):
    """
    Draw one SED plot of the requested type and add it to ``pdf``.

    Note that a "colors" column is written into ``SEDData`` in place.

    :Arguments:
        :type SEDData: pandas.dataFrame
        :param SEDData: Contains SED data either to Mean or pairwise

        :type design: pandas.dataFrame
        :param design: Design file after getColor

        :type pdf: PDF object
        :param pdf: PDF for output plots

        :type groupName: string
        :param groupName: Name of the group (figure title).

        :type cutoff: pandas.dataFrame
        :param cutoff: Cutoff values, beta, chi-sqr and normal.

        :type p: float
        :param p: Percentil for cutoff.

        :type plotType: string
        :param plotType: "scatterToMean", "scatterPairwise" or
            "boxplotPairwise"; any other value draws only cutoffs and legend.

        :param ugColors: Passed to makeLegend as its second argument.

        :param levels: Passed to makeLegend as its third argument.
    """
    toMean = plotType == "scatterToMean"
    pairwiseScatter = plotType == "scatterPairwise"
    pairwiseBox = plotType == "boxplotPairwise"

    # Number of rows decides the figure width (never narrower than 16)
    nFeatures = len(SEDData.index)
    fh = figureHandler(proj='2d', figsize=(max(nFeatures/2, 16), 8))

    # Keeping the order on the colors
    SEDData["colors"] = design["colors"]

    # The three plot types share every axis setting except the title
    if toMean:
        titleTemplate = "Standardized Euclidean Distance from samples {} to the mean"
    elif pairwiseScatter:
        titleTemplate = "Pairwise standardized Euclidean Distance from samples {}"
    elif pairwiseBox:
        titleTemplate = "Box-plots for pairwise standardized Euclidean Distance from samples {}"
    else:
        titleTemplate = None

    if titleTemplate is not None:
        fh.formatAxis(figTitle=titleTemplate.format(groupName),
                      xlim=(-0.5, -0.5 + nFeatures),
                      ylim="ignore",
                      xticks=SEDData.index.values,
                      xTitle="Index",
                      yTitle="Standardized Euclidean Distance")

    if toMean:
        # One point per row: distance to the mean
        scatter.scatter2D(ax=fh.ax[0],
                          colorList=SEDData["colors"],
                          x=range(nFeatures),
                          y=SEDData["SED_to_Mean"])
    elif pairwiseScatter:
        # One scatter series per index label, read from the column of that name
        for label in SEDData.index.values:
            scatter.scatter2D(ax=fh.ax[0],
                              colorList=design["colors"][label],
                              x=range(nFeatures),
                              y=SEDData[label])
    elif pairwiseBox:
        # Box plot of the whole table
        box.boxDF(ax=fh.ax[0], colors=SEDData["colors"].values, dat=SEDData)

    # Cutoff lines: plotCutoffs is applied to each column of ``cutoff``
    cutoff.apply(lambda x: plotCutoffs(x, ax=fh.ax[0], p=p), axis=0)
    fh.shrink()

    # Legend
    fh.makeLegend(fh.ax[0], ugColors, levels)

    # Store the figure
    fh.addToPdf(pdf)
예제 #16
0
def iterateCombo(dat, combo, pdf):
    """ Plot one pair of samples and build the flags for that pair.

    :Arguments:
        :type dat: interface.wideToDesign
        :param dat: A wideToDesign object containing wide and design information.

        :param tuple combo: A tuple of pairwise combination for current sample.

        :type pdf: matplotlib.backends.backend_pdf.PdfPages
        :param pdf: Handler for multi-page PDF that will contain all plots.

    :Updates:
        :type pdf: matplotlib.backends.backend_pdf.PdfPages
        :param pdf: One figure (scatter plot and BA plot) is added.

    :Returns:
        :return: The ``df_flags`` attribute of the Flags object built here,
            holding four flag columns for this pair.

    """
    # The two columns being compared
    c1, c2 = combo[0], combo[1]
    first = dat.wide.loc[:, c1]
    second = dat.wide.loc[:, c2]

    # Figure with two axes
    fh = figureHandler(proj='2d',
                       numAx=2,
                       numRow=2,
                       numCol=2,
                       arrangement=[(0, 0, 1, 2), (0, 1, 1, 2)])

    # Scatter plot on the first axis, BA plot on the second
    makeScatter(first, second, fh.ax[0], fh)
    outlier, pearson, cooks, dffits = makeBA(first, second, fh.ax[1], fh)

    # Figure title
    fh.formatAxis(figTitle=buildTitle(dat, c1, c2))

    # Tight layout, then shrink to fixed margins
    plt.tight_layout(pad=2, w_pad=.05)
    fh.shrink(top=.85, bottom=.25, left=.15, right=.9)

    # Output figure to pdf
    fh.addToPdf(dpi=90, pdfPages=pdf)

    # One flag column per mask returned by makeBA
    flag = Flags(index=dat.wide.index)
    flagColumns = (
        ('flag_{0}_{1}', outlier),
        ('flag_pearson_{0}_{1}', pearson),
        ('flag_cooks_{0}_{1}', cooks),
        ('flag_dffits_{0}_{1}', dffits),
    )
    for template, mask in flagColumns:
        flag.addColumn(column=template.format(c1, c2), mask=mask)

    return flag.df_flags
예제 #17
0
def plotFlagDist(propSample, propFeature, pdf):
    """
    Plot the distribution of proportion of samples and features that
    were outliers.

    Both inputs are sorted in place (descending), so the caller's objects
    are reordered. Only the 30 largest entries of each are plotted.

    :Arguments:
        :type propSample: pandas.Series (presumably; sort_values is called
            without a ``by`` argument)
        :param propSample: Proportion of features flagged as an outlier,
            per sample.

        :type propFeature: pandas.Series (presumably, as above)
        :param propFeature: Proportion of samples flagged as an outlier,
            per feature.

        :type pdf: string
        :param pdf: Filename of pdf to save plots.

    :Returns:
        Nothing. Two bar plots are saved to the file named by ``pdf``.

    """
    # Sort samples and features, largest proportion first
    propSample.sort_values(inplace=True, ascending=False)
    propFeature.sort_values(inplace=True, ascending=False)

    # (data, x axis title, y axis title) for each of the two pages
    pages = (
        (propSample, "Sample ID",
         "Proportion of features that were outliers."),
        (propFeature, "Feature ID",
         "Proportion of samples that a feature was an outlier."),
    )

    # The with-block closes the PDF even if plotting raises
    with PdfPages(pdf) as ppFlag:
        for proportions, xTitle, yTitle in pages:
            top = proportions.head(30)
            keys = list(top.keys())

            # Open figure handler instance
            fh = figureHandler(proj='2d')

            # Bar plot; .values replaces get_values(), removed in pandas 1.0
            bar.quickBar(ax=fh.ax[0], y=list(top.values), x=keys)

            # Formatting axis
            fh.formatAxis(xlim=(0, len(keys) + 1),
                          ylim="ignore",
                          xTitle=xTitle,
                          yTitle=yTitle)

            # Save figure in PDF
            ppFlag.savefig(fh.fig, bbox_inches='tight')
예제 #18
0
def qqPlot(tresid, tfit, oname):
    """
    Plot the residual diagnostic plots, one page per column of ``tresid``.

    Each page holds a q-q plot, a boxplot and a histogram of the residuals,
    plus a scatter of fitted values against residuals. These plots are
    used to diagnose whether residuals are approximately normal.

    :Arguments:
        :type tresid: pandas.DataFrame (its ``columns`` are iterated)
        :param tresid: Pearson normalized residuals. (transposed)
                        (residuals / sqrt(MSE))

        :type tfit: pandas DataFrame
        :param tfit: output of the ANOVA (transposed)

        :type oname: string
        :param oname: Name of the output file in pdf format.

    :Returns:
        Nothing. A pdf file containing all plots is written to ``oname``.

    """
    # Arrangement of the four axes on a 2x3 grid
    layout = [(0,0,1,1),(0,1,1,1),(0,2,1,1),(1,0,3,1)]

    with PdfPages(oname) as pdf:
        for col in tresid.columns:
            fh = figureHandler(proj='2d', numAx=4, numRow=2, numCol=3,
                               arrangement=layout)

            residuals = tresid[col]
            flatResiduals = residuals.values.ravel()

            # Axis 0: q-q plot
            sm.graphics.qqplot(residuals, fit=True, line='r', ax=fh.ax[0])

            # Axis 1: boxplot
            box.boxSeries(ser=flatResiduals, ax=fh.ax[1])

            # Axis 2: horizontal histogram
            hist.quickHist(ax=fh.ax[2],
                           dat=pd.DataFrame(flatResiduals),
                           orientation='horizontal')

            # Axis 3: fitted values vs residuals, with a line at y = 0
            scatter.scatter2D(ax=fh.ax[3],
                              x=tfit[col],
                              y=residuals,
                              colorList=['b'])
            lines.drawCutoffHoriz(ax=fh.ax[3], y=0)

            # Format axis 0 (also carries the figure title)
            fh.formatAxis(axnum=0, figTitle=col, grid=False, showX=True,
                          yTitle="Sample Quantiles", xTitle=" ")

            # Format axis 1
            fh.formatAxis(axnum=1, axTitle="Standardized Residuals",
                          grid=False, showX=False, showY=True, xTitle=" ")

            # Format axis 2
            fh.formatAxis(axnum=2, grid=False, showX=True, showY=True,
                          axTitle=" ", xTitle=" ")

            # Format axis 3
            fh.formatAxis(axnum=3,
                          axTitle="Fitted Values vs Residual Values",
                          xTitle="Fitted Values", yTitle="Residual Values",
                          grid=False)

            # Add figure to pdf
            fh.addToPdf(pdfPages=pdf)
def nontechnical_analysis(args, df, mask, C, clustering):
    """
    Reorder the clustering results for the user, write them to a
    tab-delimited table and plot three correlation heatmaps.

    :Arguments:
        :type args: argparse.Namespace (assumed)
        :param args: ``args.out`` is the path of the table and
                     ``args.figure`` the path of the PDF.

        :type df: pandas.DataFrame
        :param df: Original data; its index provides the row names.

        :type mask: boolean selector usable as ``df[mask]``
        :param mask: Selects the rows that took part in the clustering.

        :type C: numpy.ndarray
        :param C: Correlation matrix of the clustered rows (presumably
                  square, p x p -- confirm against the caller).

        :type clustering: numpy.ndarray
        :param clustering: Cluster index (0-based) of each clustered row.

    Nothing is returned; the table and the PDF are written to disk.
    """
    # Local import: 'sys' is only needed to pick the right file mode below.
    import sys

    # Get the map from the name to the original row index.
    all_row_names = df.index.values
    row_index_map = {s: i for i, s in enumerate(all_row_names)}

    # If some variables are uninformative for clustering,
    # the correlation matrix and the cluster vector will have smaller
    # dimensions than the number of rows in the original data frame.
    remaining_row_names = df[mask].index.values

    # Count the variables included in the clustering.
    p = clustering.shape[0]

    # Count the clusters.
    k = clustering.max() + 1

    # To sort the modules and to sort the variables within the modules,
    # we want to use absolute values of correlations.
    C_abs = np.abs(C)

    # For each cluster, get its indices and the "degree" of each member:
    # its column sum within the cluster's block of C_abs minus 1, divided
    # by the number of other members. (The -1 presumably removes the
    # self-correlation on the diagonal -- confirm.) Members of clusters of
    # size 1 keep a degree of 0.
    selections = []
    degrees = np.zeros(p, dtype=float)
    for i in range(k):
        selection = np.flatnonzero(clustering == i)
        selections.append(selection)
        if selection.size > 1:
            submatrix = C_abs[np.ix_(selection, selection)]
            denom = selection.size - 1
            degrees[selection] = (submatrix.sum(axis=0) - 1) / denom

    # Modules should be reordered according to decreasing "average degree".
    average_degrees = [degrees[selection].mean() for selection in selections]

    module_to_cluster = np.argsort(average_degrees)[::-1]
    cluster_to_module = {
        cluster: module for module, cluster in enumerate(module_to_cluster)
    }

    # Sort rows by module, then by decreasing degree, then by position.
    triples = [(
        cluster_to_module[clustering[i]],
        -degrees[i],
        i,
    ) for i in range(p)]

    new_to_old_idx = list(zip(*sorted(triples)))[2]

    # Write the tab-delimited table.
    # csv.writer needs a text-mode file on Python 3 (opened with newline=''
    # as the csv documentation requires) but a binary-mode file on Python 2.
    header = ('Gene', 'Module', 'Entry Index', 'Average Degree', 'Degree')
    if sys.version_info[0] >= 3:
        fout = open(args.out, 'w', newline='')
    else:
        fout = open(args.out, 'wb')
    with fout:
        writer = csv.writer(fout, 'excel-tab')
        writer.writerow(header)
        for old_i in new_to_old_idx:
            name = remaining_row_names[old_i]
            cluster = clustering[old_i]
            # Module and entry index are reported 1-based.
            row = (
                name,
                cluster_to_module[cluster] + 1,
                row_index_map[name] + 1,
                average_degrees[cluster],
                degrees[old_i],
            )
            writer.writerow(row)

    #Create Output
    fh1 = figureHandler(proj="2d")
    fh2 = figureHandler(proj="2d")
    fh3 = figureHandler(proj="2d")

    # Prepare to create the sorted heatmaps. (fh2)
    C_sorted = C[np.ix_(new_to_old_idx, new_to_old_idx)]
    clustering_new = clustering[np.ix_(new_to_old_idx)]

    # Make a smoothed correlation array. (fh3)
    # Outside the within-cluster blocks the values come from
    # C_all_smoothed; inside the blocks (block_mask) C_sorted is kept.
    S = expansion(clustering_new)
    block_mask = S.dot(S.T)
    denom = np.outer(S.sum(axis=0), S.sum(axis=0))
    small = S.T.dot(C_sorted).dot(S) / denom
    C_all_smoothed = S.dot(small).dot(S.T)
    C_smoothed = (C_all_smoothed * (1 - block_mask) + C_sorted * block_mask)

    # Getting list of names for heatmaps 2 and 3
    hpnames = [remaining_row_names[old_i] for old_i in new_to_old_idx]

    # Plot using something like http://stackoverflow.com/questions/15988413/
    # Drawing heatmaps
    # Draw first heatmap [C]
    hm.plotHeatmap(C,
                   fh1.ax[0],
                   cmap=palette.mpl_colormap,
                   xlbls=remaining_row_names,
                   ylbls=remaining_row_names)
    fh1.formatAxis(xTitle="sampleID", figTitle="Correlations")

    # Draw second heatmap [C_sorted](reordered according to the clustering).
    hm.plotHeatmap(C_sorted,
                   fh2.ax[0],
                   cmap=palette.mpl_colormap,
                   xlbls=hpnames,
                   ylbls=hpnames)
    fh2.formatAxis(xTitle="sampleID", figTitle="Re-Ordered correlations")

    # Draw the heatmap [C_smoothed](smoothed version of C_sorted)
    hm.plotHeatmap(C_smoothed,
                   fh3.ax[0],
                   cmap=palette.mpl_colormap,
                   xlbls=hpnames,
                   ylbls=hpnames)
    fh3.formatAxis(xTitle="sampleID", figTitle="Smoothed correlations")

    #Create output from maps
    with PdfPages(args.figure) as pdf:
        fh1.addToPdf(pdf)
        fh2.addToPdf(pdf)
        fh3.addToPdf(pdf)
예제 #20
0
def plotSignificantROR(data, pdf, palette):
    """
    Plot one scatter plot per significant row of ``data``.

    Rows whose p-value is greater than 0.05 (or is NaN) are skipped. For each
    remaining row a figure is drawn with the observations, the fitted line
    and the lower/upper bands returned by ``wls_prediction_std``; the figure
    is then appended to ``pdf``.

    :Arguments:

        :type data: pandas DataFrame
        :param data: One regression per row. The columns read here are
            "pval", "res", "x", "y", "fitted", "name", "slope" and "rsq".
            "res" is handed unchanged to ``wls_prediction_std`` (presumably
            a fitted regression results object -- confirm against caller).

        :type pdf: PdfPages
        :param pdf: pdf object to store scatterplots

        :type palette: palette object
        :param palette: Supplies ``list_colors`` for the scatter points and
            ``ugColors`` / ``combName`` for the legend.
    """
    # Iterates over all rows in the dataframe
    for _, row in data.iterrows():
        # Only significant rows are plotted. The test is written as a negated
        # "<=" so that a NaN p-value (for which every comparison is False)
        # is skipped instead of being plotted as if it were significant.
        if not (row["pval"] <= 0.05):
            continue

        # Get the prediction bands around the fit
        _, lower, upper = wls_prediction_std(row["res"])

        # Sort the bands by x so they are drawn as continuous lines
        toPlot = pd.DataFrame({"x": row["x"], "lower": lower, "upper": upper})
        toPlot.sort_values(by="x", inplace=True)

        # Create plot
        fh = figureHandler(proj="2d", figsize=(14, 8))

        # Plot scatterplot
        scatter.scatter2D(ax=fh.ax[0],
                          x=row["x"],
                          y=row["y"],
                          colorList=palette.list_colors)

        # Plot fitted line (cyan) and lower/upper bands (red)
        lines.drawCutoff(ax=fh.ax[0], x=row["x"], y=row["fitted"], c="c")
        lines.drawCutoff(ax=fh.ax[0], x=toPlot["x"], y=toPlot["lower"], c="r")
        lines.drawCutoff(ax=fh.ax[0], x=toPlot["x"], y=toPlot["upper"], c="r")

        # Formatting: the upper y limit is stretched by 20%
        # (presumably to leave room for the text box added below)
        ymin, ymax = fh.ax[0].get_ylim()
        fh.formatAxis(xTitle="Run Order", yTitle="Value", ylim=(ymin, ymax * 1.2),
                      figTitle=u"{} Scatter plot (fitted regression line and prediction bands"
                               " included)".format(row["name"]))

        # Shrink figure
        fh.shrink()

        # Add legend to figure
        fh.makeLegend(ax=fh.ax[0],
                      ucGroups=palette.ugColors,
                      group=palette.combName)

        # Add text to the ax (axes coordinates). All three statistics are
        # printed with 4 decimals; R^2 previously used "{2:4f}", which is a
        # field width of 4 with the default 6-decimal precision.
        fh.ax[0].text(.7, .85,
                      u"Slope= {0:.4f}\n(p-value = {1:.4f})\n"
                      "$R^2$ = {2:.4f}".format(round(row["slope"], 4),
                                               round(row["pval"], 4),
                                               round(row["rsq"], 4)),
                      transform=fh.ax[0].transAxes, fontsize=12)

        # Save to PDF
        fh.addToPdf(pdf)
예제 #21
0
def volcano(combo, results, oname, cutoff=4):
    """
    Write one volcano plot per pairwise difference to a pdf file.

    Each plot shows the difference on the x axis against the -log10 p-value
    on the y axis. Points whose -log10 p-value is above ``cutoff`` are drawn
    again in red, and a horizontal line marks the cutoff.

    :Arguments:

        :type combo: dictionary
        :param combo: A dictionary of dictionaries with all possible pairwise
            combinations. It is not read inside this function.

        :type results: pandas DataFrame
        :param results: Results table. Columns starting with
            "-log10_p-value_" and columns starting with "diff_of" are used;
            the text after the last "_" in the column name identifies the
            comparison and pairs the two kinds of column.

        :type oname: string
        :param oname: Name of the output file in pdf format.

        :type cutoff: int
        :param cutoff: The cutoff value for significance.

    :Returns:
        :rtype: None
        :returns: Nothing; a pdf file containing all plots is written.

    """
    # Collect both kinds of column in one pass, keyed by comparison name
    lpvals = {}
    difs = {}
    for column in results.columns:
        if column.startswith("-log10_p-value_"):
            lpvals[column.split("_")[-1]] = results[column]
        elif column.startswith("diff_of"):
            difs[column.split("_")[-1]] = results[column]

    # One page per comparison, in sorted order
    with PdfPages(oname) as pdf:
        for comparison in sorted(difs):
            diffValues = difs[comparison]
            logPvalues = lpvals[comparison]

            # New figure for this comparison
            volcanoPlot = figureHandler(proj="2d")
            axis = volcanoPlot.ax[0]

            # Every result in blue
            scatter.scatter2D(ax=axis,
                              x=list(diffValues),
                              y=list(logPvalues),
                              colorList=['b'])

            # Results above the threshold are over-plotted in red
            significant = logPvalues[logPvalues > cutoff]
            if not significant.empty:
                significantDiff = diffValues[significant.index]
                scatter.scatter2D(ax=axis,
                                  x=list(significantDiff),
                                  y=list(significant),
                                  colorList=['r'])

            # Horizontal line at the cutoff
            lines.drawCutoffHoriz(y=cutoff, ax=axis)

            # Titles and axis labels
            xLabel = "Diff of treatment = {0}".format(comparison)
            yLabel = "-log10(p-value) for Diff of treatment = {0}".format(
                comparison)
            volcanoPlot.formatAxis(axTitle=comparison, grid=False,
                                   yTitle=yLabel, xTitle=xLabel)

            # Append the page to the pdf
            volcanoPlot.addToPdf(pdfPages=pdf)