Example #1
def plotDistributions(data, cutoff, palette, pdf):
    # Open new figureHandler instance
    fh = figureHandler(proj='2d', figsize=(14, 8))

    #Get xmin and xmax
    xmin = -np.nanpercentile(data['cv'].values, 99) * 0.2
    xmax = np.nanpercentile(data['cv'].values, 99) * 1.5

    # Split design file by treatment group and plot density plot
    for name, group in palette.design.groupby(palette.combName):
        dist.plotDensityDF(data=data["cv_" + name],
                           ax=fh.ax[0],
                           colors=palette.ugColors[name],
                           lb="{0}".format(name))

    # Plot legend
    fh.makeLegendLabel(ax=fh.ax[0])

    # Give format to the axis
    fh.formatAxis(
        yTitle="Density",
        xlim=(xmin, xmax),
        ylim="ignore",
        figTitle="Density Plot of Coefficients of Variation by {0}".format(
            palette.combName))

    # Shrink figure
    fh.shrink()

    # Add figure to PDF
    fh.addToPdf(pdfPages=pdf)
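
The x-limits above window the plot to the 99th percentile of the CV values, with a 20% pad below zero and a 50% pad above. A minimal standalone sketch of that windowing logic with plain numpy (the cv values here are synthetic):

import numpy as np

cv = np.array([0.10, 0.25, 0.40, np.nan, 0.80])  # synthetic CV values with a missing entry
p99 = np.nanpercentile(cv, 99)                   # percentile that ignores NaNs
xmin, xmax = -p99 * 0.2, p99 * 1.5               # pad 20% below zero, 50% above
print(xmin, xmax)
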
Example #2
def plotScatterplot3D(data, palette, pdf):
    """
    Plots Scatterplots 3D for a given number of loadngs for PCA.

    :Arguments:
        :type data: pandas.DataFrame
        :param data: Loadings of the PCA.

        :type pdf: pdf object
        :param pdf: PDF object to save all the generated figures.
    """
    
    # Open figure handler with 3D projection
    fh = figureHandler(proj="3d", figsize=(14,8))

    # Plot scatterplot3D 
    ax = scatter.scatter3D(ax=fh.ax[0], colorList=palette.design.colors.tolist(),
                        x=list(data["PC1"]), y=list(data["PC2"]),
                        z=list(data["PC3"]))

    # Make legends
    fh.makeLegend(ax=fh.ax[0], ucGroups=palette.ugColors, group=palette.combName)

    # Add Titles to the PCA
    fh.format3D(xTitle="PC1",yTitle="PC2",zTitle="PC3")

    # Add figure to the PDF
    fh.addToPdf(dpi=600, pdfPages=pdf)
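
figureHandler and scatter.scatter3D are project helpers; a rough plain-matplotlib equivalent of this 3D loadings plot (an assumption about what the helpers wrap, not their actual implementation) looks like:

import matplotlib.pyplot as plt

fig = plt.figure(figsize=(14, 8))
ax = fig.add_subplot(projection="3d")        # proj="3d"
ax.scatter([1, 2, 3], [4, 5, 6], [7, 8, 9],  # PC1, PC2, PC3 loadings
           c=["red", "green", "blue"])       # one color per sample
ax.set_xlabel("PC1"); ax.set_ylabel("PC2"); ax.set_zlabel("PC3")
plt.show()
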
Example #3
def plotDensityDistribution(pdf, wide, palette):
    # Instantiating a figureHandler object
    figure = figureHandler(proj="2d", figsize=(12, 7))

    # Formatting axis
    figure.formatAxis(figTitle="Distribution by Samples Density",
                      xlim="ignore",
                      ylim="ignore",
                      grid=False)

    # Plotting density plot
    density.plotDensityDF(colors=palette.design["colors"],
                          ax=figure.ax[0],
                          data=wide)

    # Add legend to the plot
    figure.makeLegend(ax=figure.ax[0],
                      ucGroups=palette.ugColors,
                      group=palette.combName)

    # Shrinking figure
    figure.shrink()

    # Adding to PDF
    figure.addToPdf(pdf, dpi=600)
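
density.plotDensityDF evidently draws one kernel density curve per column; pandas can produce the same kind of plot directly. A sketch, assuming a wide table with samples in columns:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

wide = pd.DataFrame(np.random.randn(200, 3), columns=["s1", "s2", "s3"])
ax = wide.plot.kde(color=["red", "green", "blue"])  # one KDE curve per sample column
ax.set_title("Distribution by Samples Density")
plt.show()
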
Example #4
def plotDensity(data, name, pdf):
    """
    This function takes a pandas dataframe and plots a
    density plot and a boxplot.
    """
    # Establishing figure layout (x, y, colspan, rowspan)
    axisLayout = [(0, 0, 1, 3), (3, 0, 1, 1)]

    # Creating a figure template
    figure = figureHandler(proj='2d',
                           numAx=2,
                           numRow=4,
                           numCol=1,
                           figsize=(8, 13),
                           arrangement=axisLayout)
    # Adding figure Title
    figure.formatAxis(figTitle="Distribution by Features {0}".format(name),
                      xlim="ignore",
                      ylim="ignore",
                      axnum=0,
                      showX=False)

    # Creating a list with the group's color repeated for each row of the data
    colors = [palette.ugColors[name]] * len(data.index)

    # Plotting boxPlot
    box.boxDF(ax=figure.ax[0], colors=colors, dat=data.T, vert=False, rot=0)

    # Plotting density plot
    density.plotDensityDF(data=data.T.unstack(),
                          ax=figure.ax[1],
                          colors=colors[0])

    # Adding figure to pdf object
    figure.addToPdf(pdf)
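
The axisLayout tuples appear to encode (row, col, colspan, rowspan) on a numRow x numCol grid, so axis 0 spans the top three rows and axis 1 takes the bottom row. A comparable layout with matplotlib's GridSpec, under that assumption:

import matplotlib.pyplot as plt
from matplotlib.gridspec import GridSpec

fig = plt.figure(figsize=(8, 13))
gs = GridSpec(4, 1, figure=fig)         # numRow=4, numCol=1
ax_box = fig.add_subplot(gs[0:3, 0])    # (0, 0, 1, 3): spans rows 0-2
ax_density = fig.add_subplot(gs[3, 0])  # (3, 0, 1, 1): bottom row
plt.show()
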
Example #5
def plotVenn2(data, title, name1, name2, innerLabels=None, circles=None):
    """ 
    Plots venn diagram for 2 sets (2 circles).

    :Arguments:

        :type data: list
        :param data: list of values for the venn regions (Ab, aB, AB)
        
        :type title: str
        :param title: Title for the plot
        
        :type name1: str
        :param name1: Name of the first category (circle)
        
        :type name2: str
        :param name2: Name of the second category (circle)
        
        :type innerLabels: list
        :param innerLabels: List of labels for the inside of circles.
        
        :type circles: boolean
        :param circles: If True, draws the edges of the circles.


    :Returns:
        :rtype figInstance: figureHandler object
        :returns figInstance: Outputs a figureHandler object with the plot.

    """

    #Get figure instances
    figInstance = figureHandler(proj="2d")

    #Setting format of the figure
    figInstance.formatAxis(xTitle=name1, yTitle=name2, axTitle=title)

    #Plotting venn
    venn2fig = venn2(subsets=data,
                     set_labels=(name1, name2),
                     ax=figInstance.ax[0])

    #Plot circles
    if circles:
        circles = venn2_circles(subsets=data,
                                linestyle='dotted',
                                ax=figInstance.ax[0])

    # If no inner labels are provided, use the data values as strings
    if innerLabels is None:
        innerLabels = list(map(str, data))

    #Art of the venn diagram
    _artVenn2(venn2fig, innerLabels=innerLabels)

    #Return Plot
    return figInstance
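
venn2 and venn2_circles come from the matplotlib_venn package; a self-contained usage example with hard-coded region sizes:

import matplotlib.pyplot as plt
from matplotlib_venn import venn2, venn2_circles

subsets = (10, 5, 3)                                # (Ab, aB, AB) region sizes
v = venn2(subsets=subsets, set_labels=("set A", "set B"))
venn2_circles(subsets=subsets, linestyle="dotted")  # draw the circle edges
plt.title("Two-set Venn")
plt.show()
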
Example #6
def main(args):
    # Loading design
    if args.design:
        design = pd.read_csv(args.design, sep="\t", index_col=0)
        design.reset_index(inplace=True)
    else:
        design = False

    # Loading wide file
    wide = pd.read_csv(args.input, sep="\t", index_col=0)

    # Open Figure handler
    fh = figureHandler(proj="3d", figsize=(14, 8))

    # If design file with group and the uniqID is "sampleID" then color by group
    if args.group and args.uniqID == "sampleID":
        glist = list(design[args.group])
        colorList, ucGroups = palette.getColorsByGroup(design=design,
                                                       group=args.group,
                                                       uGroup=sorted(
                                                           set(glist)))
    else:
        glist = list()
        colorList = palette.mpl_colors[0]
        ucGroups = dict()

    # Plot scatterplot 3D
    scatter.scatter3D(ax=fh.ax[0],
                      x=list(wide[args.x]),
                      y=list(wide[args.y]),
                      z=list(wide[args.z]),
                      colorList=colorList)

    # Despine the axis (remove the outer spines)
    fh.despine(fh.ax[0])

    # Give format to the plot
    fh.format3D(title=args.x + " vs " + args.y + " vs " + args.z,
                xTitle=args.x,
                yTitle=args.y,
                zTitle=args.z,
                rotation=float(args.rotation),
                elevation=float(args.elevation))

    # If groups are provided create a legend
    if args.group and args.uniqID == "sampleID":
        fh.makeLegend(ax=fh.ax[0], ucGroups=ucGroups, group=args.group)
        fh.shrink()

    # Saving figure to file
    with PdfPages(args.figure) as pdfOut:
        fh.addToPdf(dpi=600, pdfPages=pdfOut)
    logger.info("Script Complete!")
Example #7
def main(args):
    """Runs eveything"""
    # Importing data
    dat = wideToDesign(args.input, args.design, args.uniqID, logger=logger)

    # Cleaning from missing data
    dat.dropMissing()

    # Getting labels to drop from arguments
    x = True
    y = True
    if "x" in args.labels:
        x = False
    if "y" in args.labels:
        y = False

    print("x =", x)
    print("y =", y)

    # Plotting hierarchical cluster heatmap (HCH) with a dendrogram
    logger.info("Plotting heatmaps")
    if args.dendogram:
        fh = hm.plotHCHeatmap(dat.wide,
                              hcheatmap=True,
                              cmap=palette.mpl_colormap,
                              xlbls=x,
                              ylbls=y)
        fh.savefig(args.fig, format="pdf")

    # Plotting a single heatmap without a dendrogram
    else:
        # Creating figure Handler object
        fh = figureHandler(proj='2d', figsize=(14, 14))

        # Creating plot
        hm.plotHeatmap(dat.wide,
                       fh.ax[0],
                       cmap=palette.mpl_colormap,
                       xlbls=x,
                       ylbls=y)

        # Formatting axis
        fh.formatAxis(xTitle="sampleID")

        # Saving figure
        fh.export(out=args.fig, dpi=300)

    # Finishing script
    logger.info("Script Complete!")
Example #8
def plotCVplots(data, cutoff, palette, pdf):
    #Iterate over groups
    for name, group in palette.design.groupby(palette.combName):
        # Open figure handler
        fh = figureHandler(proj='2d', figsize=(14, 8))

        # Get xmin and xmax
        xmin = -np.nanpercentile(data['cv_' + name].values, 99) * 0.2
        xmax = np.nanpercentile(data['cv_' + name].values, 99) * 1.5

        # Plot histogram
        hist.serHist(ax=fh.ax[0],
                     dat=data['cv_' + name],
                     color='grey',
                     normed=1,
                     range=(xmin, xmax),
                     bins=15)

        # Plot density plot
        dist.plotDensityDF(data=data['cv_' + name],
                           ax=fh.ax[0],
                           lb="CV density",
                           colors=palette.ugColors[name])

        # Plot cutoff
        lines.drawCutoffVert(ax=fh.ax[0],
                             x=cutoff[name],
                             lb="Cutoff at: {0}".format(cutoff[name]))

        # Plot legend
        fh.makeLegendLabel(ax=fh.ax[0])

        # Give format to the axis
        fh.formatAxis(
            yTitle='Density',
            xlim=(xmin, xmax),
            ylim="ignore",
            figTitle="Density Plot of Coefficients of Variation in {0}".format(
                name))

        # Shrink figure to fit legend
        fh.shrink()

        # Add plot to PDF
        fh.addToPdf(pdfPages=pdf)
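
The histogram-plus-density-plus-cutoff overlay can be reproduced with plain matplotlib; a minimal sketch with synthetic CV values (density=True is the modern spelling of the normed=1 seen above):

import numpy as np
import matplotlib.pyplot as plt

cv = np.random.gamma(2.0, 0.1, 500)                        # synthetic CV values
fig, ax = plt.subplots(figsize=(14, 8))
ax.hist(cv, bins=15, color="grey", density=True)           # normalized histogram
ax.axvline(x=0.3, linestyle="--", label="Cutoff at: 0.3")  # vertical cutoff line
ax.legend()
plt.show()
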
Example #9
def plotScores(data, palette, pdf):
    """
    This function creates a PDF file with 3 scatter plots for the combinations 
    of the 3 principal components. PC1 vs PC2, PC1 vs PC3, PC2 vs PC3.

    :Arguments:
        :type data: pandas.core.frame.DataFrame
        :param data: Data frame with the data to plot.
        
        :type outpath: string
        :param outpath: Path for the output file

        :type group: string
        :param group: Name of the column that contains the group information on the design file.

    :Return:
        :rtype PDF: file
        :retrn PDF: file with the 3 scatter plots for PC1 vs PC2, PC1 vs PC3, PC2  vs PC3.
    """
    for x, y in itertools.combinations(data.columns.tolist(), 2):
        # Creating a figure handler object
        fh = figureHandler(proj="2d", figsize=(14,8))

        # Creating title for the figure
        title = "{0} vs {1}".format(x,y)
        
        # Creating the scatterplot 2D
        scatter.scatter2D(ax=fh.ax[0], x=list(data[x]), y=list(data[y]),
                        colorList=palette.design.colors.tolist())

        # Despine axis
        fh.despine(fh.ax[0])

        fh.makeLegend(ax=fh.ax[0], ucGroups=palette.ugColors, group=palette.combName)

        # Shrinking the plot so everything fits
        fh.shrink()

        # Format Axis
        fh.formatAxis(figTitle=title, xTitle="Scores on {0}".format(x),
                    yTitle="Scores on {0}".format(y), grid=False)

        # Adding figure to pdf
        fh.addToPdf(dpi=90,pdfPages=pdf)
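
This function and plotScatterplot2D below both iterate over pairwise column combinations via itertools.combinations; for three components that yields exactly the three advertised pairs:

import itertools

cols = ["PC1", "PC2", "PC3"]
print(list(itertools.combinations(cols, 2)))
# [('PC1', 'PC2'), ('PC1', 'PC3'), ('PC2', 'PC3')]
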
Example #10
def plotScatterplot2D(data, palette, pdf, nloads=3):
    """
    Plots 2D scatterplots for a number of PCA loadings.

    :Arguments:
        :type data: pandas.DataFrame
        :param data: Loadings of the PCA.

        :type pdf: pdf object
        :param pdf: PDF object to save all the generated figures.

        :type nloads: int
        :param nloads: Number of principal components to create pairwise combinations from.
    """

    # Selecting the pairwise combinations of loadings to draw scatterplots for.
    for x, y in list(combinations(data.columns.tolist()[:nloads],2)):

        # Create a single-figure figure handler object
        fh = figureHandler(proj="2d", figsize=(14,8))

        # Create a title for the figure
        title = "{0} vs {1}".format(x,y)

        # Plot the scatterplot based on data
        scatter.scatter2D(x=list(data[x]), y=list(data[y]),
                         colorList=palette.design.colors.tolist(), ax=fh.ax[0])

        # Create legend
        fh.makeLegend(ax=fh.ax[0], ucGroups=palette.ugColors, group=palette.combName)

        # Shrink axis to fit legend
        fh.shrink()

        # Despine axis
        fh.despine(fh.ax[0])

        # Formatting axis
        fh.formatAxis(figTitle=title,xTitle="Scores on {0}".format(x),
            yTitle="Scores on {0}".format(y),grid=False)

        # Adding figure to pdf
        fh.addToPdf(dpi=600,pdfPages=pdf)
Example #11
def plotBoxplotDistribution(pdf, wide, palette):
    # Instantiating a figureHandler object
    figure = figureHandler(proj="2d",
                           figsize=(max(len(wide.columns) / 4, 12), 7))

    # Formatting axis
    figure.formatAxis(figTitle="Distribution by Samples Boxplot",
                      ylim="ignore",
                      grid=False,
                      xlim="ignore")

    # Plotting boxplot
    box.boxDF(ax=figure.ax[0], colors=palette.design["colors"], dat=wide)

    # Shrinking figure
    figure.shrink()

    #Adding to PDF
    figure.addToPdf(pdf, dpi=600)
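
box.boxDF presumably wraps matplotlib's boxplot over the DataFrame columns; a rough standalone equivalent of the sample-wise boxplot, including the width heuristic above:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

wide = pd.DataFrame(np.random.randn(50, 8))
fig, ax = plt.subplots(figsize=(max(len(wide.columns) / 4, 12), 7))
ax.boxplot([wide[c].dropna() for c in wide.columns])  # one box per sample column
ax.set_title("Distribution by Samples Boxplot")
plt.show()
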
Example #12
def plotCDhistogram(count, pdf, group):
    """
    Plots the distribution of digit-count differences (max - min) for a group.

        :Arguments:
            :type count: pandas.DataFrame.
            :param count: DataFrame with the counted digits and the min, max and
                            diff among rows.

            :type pdf: matplotlib.backends.backend_pdf.PdfPages.
            :param pdf: PDF object to plot figures in.

            :type group: str.
            :param group: Name of the group to plot.
    """
    #Creating title
    title="Distribution of difference between \n(min and max) for {0} compounds".\
            format(group)
    if count['diff'].any():

        #Opening figure handler
        fh = figureHandler(proj='2d')

        #Plot histogram
        hist.quickHist(ax=fh.ax[0], dat=count['diff'])

        #Giving format to the axis
        fh.formatAxis(xTitle='Difference in Number of Digits (max - min)',
                      yTitle='Number of Features',
                      figTitle=title,
                      ylim="ignore")

        # Export figure
        fh.addToPdf(pdf, dpi=600)

    else:
        logger.warning(
            "There were no differences in digit counts for {0}; no plot will be generated"
            .format(group))
Example #13
def main(args):

    # Loading data through the interface
    dat = wideToDesign(args.input,
                       args.design,
                       args.uniqueID,
                       group=args.group,
                       logger=logger)

    # Treat everything as numeric
    dat.wide = dat.wide.applymap(float)

    # Cleaning from missing data
    dat.dropMissing()

    # Getting the unique groups and all pairwise combinations
    # so that we can feed them to Kruskal-Wallis.
    group_values_series = dat.transpose()[dat.group].T.squeeze()
    group_values_series_unique = group_values_series.unique()
    number_of_unique_groups = group_values_series_unique.shape[0]
    groups_pairwise = list(combinations(group_values_series_unique, 2))
    number_of_groups_pairwise = len(groups_pairwise)

    # Extracting data from the interface.
    data_frame = dat.transpose()
    # Extracting number of features.
    number_of_features = data_frame.shape[1] - 1
    # Saving treatment group name from the arguments.

    # Running the overall Kruskal-Wallis test for all group levels combined.

    # Creating p_value_all and flag_value_all for 3 significance levels as empty lists of length number_of_features.
    # This will be used for all groups.
    p_value_all = [0] * number_of_features
    H_value_all = [0] * number_of_features
    mean_value_all = [0] * number_of_features
    variance_value_all = [0] * number_of_features
    flag_value_all_0p01 = [0] * number_of_features
    flag_value_all_0p05 = [0] * number_of_features
    flag_value_all_0p10 = [0] * number_of_features

    for j in range(0, number_of_features):

        # Creating duplicate for manipulation.
        data_frame_manipulate = data_frame

        # Dropping columns that characterize group. Only feature columns will remain.
        # We also transpose here so it will be easier to operate with.
        data_frame_manipulate_transpose = data_frame_manipulate.drop(
            args.group, axis=1).transpose()
        # Pulling indexes list from the current data frame.
        indexes_list_complete = data_frame_manipulate_transpose.index.tolist()

        # Computing dataset summaries.
        mean_value_all[j] = np.mean(
            data_frame_manipulate_transpose.loc[indexes_list_complete[j]])
        variance_value_all[j] = np.var(
            data_frame_manipulate_transpose.loc[indexes_list_complete[j]],
            ddof=1)

        for i in range(0, number_of_unique_groups):

            # Extracting the pieces of the data frame that belong to ith unique group.
            data_frame_current_group = data_frame.loc[data_frame[
                args.group].isin([group_values_series_unique[i]])]

            # Dropping columns that characterize group. Only feature columns will remain.
            # We also transpose here so it will be easier to operate with.
            data_frame_current_group = data_frame_current_group.drop(
                args.group, axis=1).transpose()
            # Pulling indexes list from the current data frame.
            indexes_list = data_frame_current_group.index.tolist()

            # Series current for group i and row (feature) j.
            series_current = data_frame_current_group.loc[indexes_list[j]]

            # This piece of code depends on whether it is the first group in the list or not.
            if i == 0:
                series_total = [series_current]
            else:
                series_total.append(series_current)

        # Checking if the compared elements are different.
        # Combining for checking.
        combined_list = data_frame_manipulate_transpose.loc[
            indexes_list_complete[j]].tolist()
        combined_list_unique = np.unique(combined_list)
        # Checking if the number of unique elements is exactly 1.
        if len(combined_list_unique) == 1:
            # All values are identical: Kruskal-Wallis is undefined, so record NaN.
            # (NaN compares False against every cutoff, so the flags stay 0.)
            p_value_all[j] = float("nan")
            H_value_all[j] = float("nan")

        else:
            # Performing Kruskal-Wallis for all groups for feature j.
            kruscal_wallis_args = series_total
            p_value_all[j] = kruskalwallis(*kruscal_wallis_args)[1]
            H_value_all[j] = kruskalwallis(*kruscal_wallis_args)[0]
            if p_value_all[j] < 0.01: flag_value_all_0p01[j] = 1
            if p_value_all[j] < 0.05: flag_value_all_0p05[j] = 1
            if p_value_all[j] < 0.10: flag_value_all_0p10[j] = 1

    # The loop over features has to be finished by now. Converting them into the data frame.
    # The pairwise results will be added later.
    summary_df = pd.DataFrame(data=mean_value_all,
                              columns=["GrandMean"],
                              index=indexes_list)
    summary_df['SampleVariance'] = variance_value_all
    summary_df['H_value_for_all'] = H_value_all
    summary_df['prob_greater_than_H_for_all'] = p_value_all
    flag_df = pd.DataFrame(data=flag_value_all_0p01,
                           columns=["flag_significant_0p01_on_all_groups"],
                           index=indexes_list)
    flag_df["flag_significant_0p05_on_all_groups"] = flag_value_all_0p05
    flag_df["flag_significant_0p10_on_all_groups"] = flag_value_all_0p10

    # Informing that KW for all groups has been performed.
    logger.info(
        u"Kruskal-Wallis test for all groups together has been performed.")

    # Computing means for each group
    # This part just produces summary statistics for the output table.
    # This has nothing to do with Kruskal-Wallis.

    for i in range(0, number_of_unique_groups):

        # Extracting the pieces of the data frame that belong to ith group.
        data_frame_current_group = data_frame.loc[data_frame[args.group].isin(
            [group_values_series_unique[i]])]

        # Dropping columns that characterize group. Only feature columns will remain.
        # We also transpose here so it will be easier to operate with.
        data_frame_current_group = data_frame_current_group.drop(
            args.group, axis=1).transpose()
        # Pulling indexes list from the current group.
        indexes_list = data_frame_current_group.index.tolist()

        # Creating array of means for the current group that will be filled.
        means_value = [0] * number_of_features

        for j in range(0, number_of_features):

            series_current = data_frame_current_group.loc[indexes_list[j]]
            means_value[j] = series_current.mean()

        means_value_column_name_current = 'mean_treatment_' + group_values_series_unique[
            i]
        summary_df[means_value_column_name_current] = means_value

    # Running pairwise Kruskal-Wallis tests for all pairs of group levels that are saved in groups_pairwise.

    for i in range(0, number_of_groups_pairwise):

        # Extracting the pieces of the data frame that belong to groups saved in the i-th unique pair.
        groups_subset = groups_pairwise[i]
        data_frame_first_group = data_frame.loc[data_frame[args.group].isin(
            [groups_subset[0]])]
        data_frame_second_group = data_frame.loc[data_frame[args.group].isin(
            [groups_subset[1]])]

        # Dropping columns that characterize group. Only feature columns will remain.
        # We also transpose here so it will be easier to operate with.
        data_frame_first_group = data_frame_first_group.drop(args.group,
                                                             axis=1).transpose()
        data_frame_second_group = data_frame_second_group.drop(args.group,
                                                               axis=1).transpose()
        # Pulling indexes list from the first one (they are the same)
        indexes_list = data_frame_first_group.index.tolist()

        # Creating p_values, neg_log10_p_value, flag_values, difference_value lists filled with zeros.
        p_value = [0] * number_of_features
        H_value = [0] * number_of_features
        neg_log10_p_value = [0] * number_of_features
        flag_value_0p01 = [0] * number_of_features
        flag_value_0p05 = [0] * number_of_features
        flag_value_0p10 = [0] * number_of_features
        difference_value = [0] * number_of_features

        for j in range(0, number_of_features):

            series_first = data_frame_first_group.loc[indexes_list[j]]
            series_second = data_frame_second_group.loc[indexes_list[j]]

            # Checking if the compared elements are different.
            # Combining for checking.
            first_list = data_frame_first_group.loc[indexes_list[j]].tolist()
            second_list = data_frame_second_group.loc[indexes_list[j]].tolist()
            combined_list = first_list + second_list
            combined_list_unique = np.unique(combined_list)
            # Checking if the number of unique elements is exactly 1.
            if len(combined_list_unique) == 1:
                # All values are identical: the test is undefined, so record NaN.
                # (NaN comparisons are False, so the significance flags stay 0.)
                p_value[j] = float("nan")
                H_value[j] = float("nan")
                neg_log10_p_value[j] = -np.log10(p_value[j])
                difference_value[j] = series_first.mean() - series_second.mean()

            else:
                kruscal_wallis_args = [series_first, series_second]
                p_value[j] = kruskalwallis(*kruscal_wallis_args)[1]
                H_value[j] = kruskalwallis(*kruscal_wallis_args)[0]
                # Possible alternative for two groups.
                # p_value[j] = kruskalwallis(series_first, series_second)[1]
                neg_log10_p_value[j] = -np.log10(p_value[j])
                difference_value[j] = series_first.mean() - series_second.mean()
                if p_value[j] < 0.01: flag_value_0p01[j] = 1
                if p_value[j] < 0.05: flag_value_0p05[j] = 1
                if p_value[j] < 0.10: flag_value_0p10[j] = 1

        # Adding current p_value and flag_value column to the data frame and assigning the name
        p_value_column_name_current = 'prob_greater_than_H_for_diff_' + groups_subset[
            0] + '_' + groups_subset[1]
        H_value_column_name_current = 'H_value_for_diff_' + groups_subset[
            0] + '_' + groups_subset[1]
        neg_log10_p_value_column_name_current = 'neg_log10_p_value_' + groups_subset[
            0] + '_' + groups_subset[1]
        difference_value_column_name_current = 'diff_of_' + groups_subset[
            0] + '_' + groups_subset[1]
        summary_df[p_value_column_name_current] = p_value
        summary_df[H_value_column_name_current] = H_value
        summary_df[neg_log10_p_value_column_name_current] = neg_log10_p_value
        summary_df[difference_value_column_name_current] = difference_value

        flag_value_column_name_current_0p01 = 'flag_significant_0p01_on_' + groups_subset[
            0] + '_' + groups_subset[1]
        flag_value_column_name_current_0p05 = 'flag_significant_0p05_on_' + groups_subset[
            0] + '_' + groups_subset[1]
        flag_value_column_name_current_0p10 = 'flag_significant_0p10_on_' + groups_subset[
            0] + '_' + groups_subset[1]
        flag_df[flag_value_column_name_current_0p01] = flag_value_0p01
        flag_df[flag_value_column_name_current_0p05] = flag_value_0p05
        flag_df[flag_value_column_name_current_0p10] = flag_value_0p10

    # Rounding the results to 4 decimal digits.
    summary_df = summary_df.apply(lambda x: x.round(4))

    # Adding name for the unique ID column that was there originally.
    summary_df.index.name = args.uniqueID
    flag_df.index.name = args.uniqueID

    # Save summary_df to the output
    summary_df.to_csv(args.summaries, sep="\t")
    # Save flag_df to the output
    flag_df.to_csv(args.flags, sep="\t")

    # Informing that pairwise KW has been performed.
    logger.info(
        u"Kruskal-Wallis test for all groups pairwise has been performed.")

    # Generating Indexing for volcano plots.

    # Getting data for lpvals
    lpvals = {col.split("_value_")[-1]:summary_df[col] for col in summary_df.columns.tolist() \
            if col.startswith("neg_log10_p_value")}

    # Getting data for diffs
    difs   = {col.split("_of_")[-1]:summary_df[col] for col in summary_df.columns.tolist() \
            if col.startswith("diff_of_")}

    # The cutoff value for significance.
    cutoff = 2

    # Making volcano plots
    with PdfPages(args.volcano) as pdf:
        for i in range(0, number_of_groups_pairwise):
            # Set Up Figure
            volcanoPlot = figureHandler(proj="2d")

            groups_subset = groups_pairwise[i]
            current_key = groups_subset[0] + '_' + groups_subset[1]

            # Plot all results
            scatter.scatter2D(x=list(difs[current_key]),
                              y=list(lpvals[current_key]),
                              colorList=list('b'),
                              ax=volcanoPlot.ax[0])

            # Color results beyond the threshold red
            cutLpvals = lpvals[current_key][lpvals[current_key] > cutoff]
            if not cutLpvals.empty:
                cutDiff = difs[current_key][cutLpvals.index]
                scatter.scatter2D(x=list(cutDiff),
                                  y=list(cutLpvals),
                                  colorList=list('r'),
                                  ax=volcanoPlot.ax[0])

            # Drawing cutoffs
            lines.drawCutoffHoriz(y=cutoff, ax=volcanoPlot.ax[0])

            # Format axis (volcanoPlot)
            volcanoPlot.formatAxis(
                axTitle=current_key,
                grid=False,
                yTitle="-log10(p-value) for Diff of treatment means for {0}".
                format(current_key),
                xTitle="Difference of treatment means for {0}".format(
                    current_key))

            # Add figure to PDF
            volcanoPlot.addToPdf(pdfPages=pdf)

    # Informing that the volcano plots are done
    logger.info(u"Pairwise volcano plots have been created.")

    # Ending script
    logger.info(u"Finished running Kruskal-Wallis tests.")
Example #14
def qqPlot(tresid, tfit, oname):
    """ 
    Plot the residual diagnostic plots by sample.

    Output q-q plot, boxplots and distributions of the residuals. These plots
    will be used to diagnose whether residuals are approximately normal.

    :Arguments:
        :type tresid: pandas.Series
        :param tresid: Pearson normalized residuals. (transposed)
                        (residuals / sqrt(MSE))

        :type tfit: pandas DataFrame
        :param tfit: output of the ANOVA (transposed)

        :type oname: string
        :param oname: Name of the output file in pdf format.

    :Returns:
        :rtype: PDF
        :returns: Outputs a pdf file containing all plots.

    """
    #Open pdf
    with PdfPages(oname) as pdf:

        # Establishing axisLayout
        axisLayout = [(0, 0, 1, 1), (0, 1, 1, 1), (0, 2, 1, 1), (1, 0, 3, 1)]

        # Start plotting
        for col in tresid.columns:
            #Creating figure
            fig = figureHandler(proj='2d',
                                numAx=4,
                                numRow=2,
                                numCol=3,
                                arrangement=axisLayout)

            data = tresid[col].values.ravel()
            noColors = ['b'] * len(data)  # blue for every point
            df_data = pd.DataFrame(data)

            # Removing missing so that it will plot correctly.
            mask_nan_data = np.isnan(data)
            data = data[~mask_nan_data]

            # Plot qqplot on axis 0
            sm.graphics.qqplot(data, fit=True, line='r', ax=fig.ax[0])

            # Plot boxplot on axis 1
            box.boxSeries(ser=data, ax=fig.ax[1])

            # Plot histogram on axis 2
            hist.quickHist(ax=fig.ax[2], dat=df_data, orientation='horizontal')

            # Plot scatterplot on axis 3
            scatter.scatter2D(ax=fig.ax[3],
                              x=tfit[col],
                              y=tresid[col],
                              colorList=list('b'))

            # Draw cutoff line for scatterplot on axis 3
            lines.drawCutoffHoriz(ax=fig.ax[3], y=0)

            # Format axis 0
            fig.formatAxis(figTitle=col,
                           axnum=0,
                           grid=False,
                           showX=True,
                           yTitle="Sample Quantiles",
                           xTitle=" ")

            # Format axis 1
            fig.formatAxis(axnum=1,
                           axTitle="Standardized Residuals",
                           grid=False,
                           showX=False,
                           showY=True,
                           xTitle=" ")

            # Format axis 2
            fig.formatAxis(axnum=2,
                           grid=False,
                           showX=True,
                           showY=True,
                           axTitle=" ",
                           xTitle=" ")

            # Format axis 3
            fig.formatAxis(axnum=3,
                           axTitle="Predicted Values vs Residual Values",
                           xTitle="Predicted Values",
                           yTitle="Residual Values",
                           grid=False)

            #Add figure to pdf
            fig.addToPdf(pdfPages=pdf)
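
sm.graphics.qqplot on axis 0 is statsmodels' Q-Q plot; a minimal standalone version of that residual diagnostic:

import numpy as np
import statsmodels.api as sm
import matplotlib.pyplot as plt

resid = np.random.randn(200)                          # stand-in for Pearson residuals
fig, ax = plt.subplots()
sm.graphics.qqplot(resid, fit=True, line="r", ax=ax)  # fit loc/scale, add regression line
plt.show()
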
Example #15
def nontechnical_analysis(args, df, mask, C, clustering):
    # Re-order things more palatably for the user,
    # based on the results of the technical analysis.

    # Get the map from the name to the original row index.
    all_row_names = df.index.values
    row_index_map = {s: i for i, s in enumerate(all_row_names)}

    # If some variables are uninformative for clustering,
    # the correlation matrix and the cluster vector will have smaller
    # dimensions than the number of rows in the original data frame.
    remaining_row_names = df[mask].index.values

    # Count the variables included in the clustering.
    p = clustering.shape[0]

    # Count the clusters.
    k = clustering.max() + 1

    # To sort the modules and to sort the variables within the modules,
    # we want to use absolute values of correlations.
    C_abs = np.abs(C)

    # For each cluster, get its indices and its submatrix of C_abs.
    selections = []
    submatrices = []
    degrees = np.zeros(p, dtype=float)
    for i in range(k):
        selection = np.flatnonzero(clustering == i)
        selections.append(selection)
        submatrix = C_abs[np.ix_(selection, selection)]
        submatrices.append(submatrix)
        if selection.size > 1:
            denom = selection.size - 1
            degrees[selection] = (submatrix.sum(axis=0) - 1) / denom

    # Modules should be reordered according to decreasing "average degree".
    cluster_sizes = []
    average_degrees = []
    for selection in selections:
        cluster_sizes.append(selection.size)
        average_degrees.append(degrees[selection].mean())

    module_to_cluster = np.argsort(average_degrees)[::-1]
    cluster_to_module = {v: k for k, v in enumerate(module_to_cluster)}

    triples = [(
        cluster_to_module[clustering[i]],
        -degrees[i],
        i,
    ) for i in range(p)]

    _a, _b, new_to_old_idx = zip(*sorted(triples))

    # Make a csv file if requested.
    header = ('Gene', 'Module', 'Entry Index', 'Average Degree', 'Degree')
    with open(args.out, 'w', newline='') as fout:
        writer = csv.writer(fout, 'excel-tab')  # the 'excel-tab' dialect writes TSV
        writer.writerow(header)
        for old_i in new_to_old_idx:
            name = remaining_row_names[old_i]
            cluster = clustering[old_i]
            row = (
                name,
                cluster_to_module[cluster] + 1,
                row_index_map[name] + 1,
                average_degrees[cluster],
                degrees[old_i],
            )
            writer.writerow(row)

    #Create Output
    fh1 = figureHandler(proj="2d")
    fh2 = figureHandler(proj="2d")
    fh3 = figureHandler(proj="2d")

    # Prepare to create the sorted heatmaps. (fh2)
    C_sorted = C[np.ix_(new_to_old_idx, new_to_old_idx)]
    clustering_new = clustering[np.ix_(new_to_old_idx)]

    # Draw the third heatmap (smoothed).
    # Make a smoothed correlation array. (fh3)
    S = expansion(clustering_new)
    block_mask = S.dot(S.T)
    denom = np.outer(S.sum(axis=0), S.sum(axis=0))
    small = S.T.dot(C_sorted).dot(S) / denom
    C_all_smoothed = S.dot(small).dot(S.T)
    C_smoothed = (C_all_smoothed * (1 - block_mask) + C_sorted * block_mask)

    # Getting list of names for heatmaps 2 and 3
    hpnames = [remaining_row_names[old_i] for old_i in new_to_old_idx]

    # Plot using something like http://stackoverflow.com/questions/15988413/
    # Drawing heatmaps
    # Draw first heatmap [C]
    hm.plotHeatmap(C,
                   fh1.ax[0],
                   cmap=palette.mpl_colormap,
                   xlbls=remaining_row_names,
                   ylbls=remaining_row_names)
    fh1.formatAxis(xTitle="sampleID", figTitle="Correlations")

    # Draw second heatmap [C_sorted](reordered according to the clustering).
    hm.plotHeatmap(C_sorted,
                   fh2.ax[0],
                   cmap=palette.mpl_colormap,
                   xlbls=hpnames,
                   ylbls=hpnames)
    fh2.formatAxis(xTitle="sampleID", figTitle="Re-Ordered correlations")

    # Draw the heatmap [C_smoothed](smoothed version of C_sorted)
    hm.plotHeatmap(C_smoothed,
                   fh3.ax[0],
                   cmap=palette.mpl_colormap,
                   xlbls=hpnames,
                   ylbls=hpnames)
    fh3.formatAxis(xTitle="sampleID", figTitle="Smoothed correlations")

    #Create output from maps
    with PdfPages(args.figure) as pdf:
        fh1.addToPdf(pdf)
        fh2.addToPdf(pdf)
        fh3.addToPdf(pdf)
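
The expansion helper is not shown here; from its use it evidently maps the cluster-label vector to a one-hot indicator matrix S (p x k), so that S.T.dot(C).dot(S) divided by the outer product of cluster sizes gives the per-block mean correlation. A numpy sketch under that assumption:

import numpy as np

def expansion(labels):
    # One-hot indicator: S[i, c] == 1 iff sample i belongs to cluster c (assumed behavior).
    return np.eye(labels.max() + 1)[labels]

clustering_new = np.array([0, 0, 1, 1, 1])
C_sorted = np.random.rand(5, 5)
S = expansion(clustering_new)
sizes = S.sum(axis=0)
small = S.T.dot(C_sorted).dot(S) / np.outer(sizes, sizes)  # block-wise mean correlation
C_all_smoothed = S.dot(small).dot(S.T)                     # broadcast back to full p x p
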
Example #16
def main(args):

    # Loading data through the interface
    logger.info("Loading data with the Interface")
    dat = wideToDesign(args.input, args.design, args.uniqueID, group = args.group, 
                        runOrder=args.order, logger=logger)

    # Treat everything as numeric
    dat.wide = dat.wide.applymap(float)
    
    # Cleaning from missing data
    dat.dropMissing()


    # SCENARIO 1: Unpaired t-test. In this case there can be as many groups as possible. 
    # Order variable is ignored and t-tests are performed pairwise for each pair of groups.

    if args.pairing == "unpaired":
       logger.info("Unpaired t-test will be performed for all groups pairwise.")
  

       # Getting the unique groups and all pairwise combinations
       # so that we can feed them to pairwise unpaired t-tests.
       group_values_series = dat.transpose()[dat.group].T.squeeze()
       group_values_series_unique = group_values_series.unique()
       number_of_unique_groups = group_values_series_unique.shape[0]
       groups_pairwise = list(combinations(group_values_series_unique,2) ) 
       number_of_groups_pairwise = len(groups_pairwise)

       # Extracting data from the interface.
       data_frame = dat.transpose()
       # Extracting number of features. This will depend on whether the user has provided the ordering variable or not.
       # This variable is useless for the unpaired test; it just adds an extra column to the data frame.
       if args.order == False:
          number_of_features = data_frame.shape[1] - 1
       else:
          number_of_features = data_frame.shape[1] - 2
       # Saving treatment group name from the arguments.



       # Computing overall summaries (mean and variance).
       # This part just produces summary statistics for the output table.
       # This has nothing to do with unpaired t-test. This is just summary for the table.
       mean_value_all = [0] * number_of_features
       variance_value_all = [0] * number_of_features

       for j in range(0, number_of_features ):
  

           # Creating duplicate for manipulation.
           data_frame_manipulate = data_frame

           # Dropping columns that characterize group. Only feature columns will remain.
           # We also transpose here so it will be easier to operate with.
           # We drop either 1 or 2 columns depending on whether the order variable was supplied.
           if args.order == False:
              data_frame_manipulate_transpose  = data_frame_manipulate.drop(  args.group, axis=1 ).transpose()
           else:
              data_frame_manipulate_transpose  = data_frame_manipulate.drop(  [args.group, args.order], axis=1 ).transpose()
           # Pulling indexes list from the current data frame.
           indexes_list_complete = data_frame_manipulate_transpose.index.tolist()

           # Computing dataset summaries.
           mean_value_all[j] = np.mean(data_frame_manipulate_transpose.loc[ indexes_list_complete[j] ]) 
           variance_value_all[j] = np.var(data_frame_manipulate_transpose.loc[ indexes_list_complete[j] ], ddof = 1)



       # Creating the table and putting the results there.
       summary_df     =  pd.DataFrame(data = mean_value_all, columns = ["GrandMean"], index = indexes_list_complete )    
       summary_df['SampleVariance'] =  variance_value_all


       # Computing means for each group and outputting them.
       # This part just produces summary statistics for the output table.
       # This has nothing to do with unpaired t-test. This is just summary for the table.

       for i in range(0, number_of_unique_groups ):
        

           # Extracting the pieces of the data frame that belong to the ith group.
           data_frame_current_group  = data_frame.loc[data_frame[args.group].isin( [group_values_series_unique[i]]  )]

           # Dropping columns that characterize group. Only feature columns will remain.
           # We also transpose here so it will be easier to operate with.
           # We drop either 1 or 2 columns depending on whether the order variable was supplied.
           if args.order == False:
              data_frame_current_group  = data_frame_current_group.drop(  args.group, axis=1 ).transpose()
           else:
              data_frame_current_group  = data_frame_current_group.drop(  [args.group, args.order], axis=1 ).transpose()

           # Pulling indexes list from the current group.
           indexes_list = data_frame_current_group.index.tolist()

           # Creating array of means for the current group that will be filled.
           means_value  = [0] * number_of_features
    
           for j in range(0, number_of_features ):
  
               series_current = data_frame_current_group.loc[ indexes_list[j] ] 
               means_value[j] = series_current.mean()


           # Adding current mean_value column to the data frame and assigning the name.
           means_value_column_name_current  = 'mean_treatment_' + group_values_series_unique[i] 
           summary_df[means_value_column_name_current] = means_value
           
           


       # Running pairwise unpaired (two-sample) t-test for all pairs of group levels that are saved in groups_pairwise.
       for i in range(0, number_of_groups_pairwise ):
        
           # Extracting the pieces of the data frame that belong to groups saved in the i-th unique pair.
           groups_subset = groups_pairwise[i] 
           data_frame_first_group  = data_frame.loc[data_frame[args.group].isin( [groups_subset[0]]  )]
           data_frame_second_group = data_frame.loc[data_frame[args.group].isin( [groups_subset[1]]  )]

           # Dropping columns that characterize group. Only feature columns will remain.
           # We also transpose here so it will be easier to operate with.
           # We drop either 1 or 2 columns depending on whether the order variable was supplied.
           if args.order == False:
              data_frame_first_group  = data_frame_first_group.drop(  args.group, axis=1 ).transpose()
              data_frame_second_group = data_frame_second_group.drop( args.group, axis=1 ).transpose()
           else:
              data_frame_first_group  = data_frame_first_group.drop(  [args.group, args.order], axis=1 ).transpose()
              data_frame_second_group = data_frame_second_group.drop( [args.group, args.order], axis=1 ).transpose()

           # Pulling indexes list from the first one (they are the same)
           indexes_list = data_frame_first_group.index.tolist()

           # Creating p_values, neg_log10_p_value, flag_values, difference_value lists filled with zeros.
           p_value           = [0] * number_of_features
           t_value           = [0] * number_of_features
           neg_log10_p_value = [0] * number_of_features
           flag_value_0p01   = [0] * number_of_features
           flag_value_0p05   = [0] * number_of_features
           flag_value_0p10   = [0] * number_of_features
           difference_value  = [0] * number_of_features


           for j in range(0, number_of_features ):
       
               series_first  = data_frame_first_group.loc[ indexes_list[j] ] 
               series_second = data_frame_second_group.loc[ indexes_list[j] ]

               ttest_ind_args = [series_first, series_second]
               p_value[j] = ttest_ind( *ttest_ind_args )[1]
               t_value[j] = ttest_ind( *ttest_ind_args )[0]
               # Possible alternative for two groups.
               # p_value[j] = ttest_ind_args(series_first, series_second)[1]
               neg_log10_p_value[j] = - np.log10(p_value[j])
               difference_value[j] = series_first.mean() - series_second.mean()
               if p_value[j] < 0.01: flag_value_0p01[j] = 1
               if p_value[j] < 0.05: flag_value_0p05[j] = 1
               if p_value[j] < 0.10: flag_value_0p10[j] = 1


           # Creating column names for the data frame.
           p_value_column_name_current           = 'prob_greater_than_t_for_diff_' + groups_subset[0] + '_' + groups_subset[1]
           t_value_column_name_current           = 't_value_for_diff_' + groups_subset[0] + '_' + groups_subset[1]
           neg_log10_p_value_column_name_current = 'neg_log10_p_value_' + groups_subset[0] + '_' + groups_subset[1]
           difference_value_column_name_current  = 'diff_of_' + groups_subset[0] + '_' + groups_subset[1]
           flag_value_column_name_current_0p01 = 'flag_significant_0p01_on_' + groups_subset[0] + '_' + groups_subset[1] 
           flag_value_column_name_current_0p05 = 'flag_significant_0p05_on_' + groups_subset[0] + '_' + groups_subset[1] 
           flag_value_column_name_current_0p10 = 'flag_significant_0p10_on_' + groups_subset[0] + '_' + groups_subset[1] 

           # Adding current p_value and flag_value column to the data frame and assigning the name.
           # If the data frame has not been created yet we create it on the fly. i.e. if i == 0 create it.
           if i == 0:
              flag_df     =  pd.DataFrame(data = flag_value_0p01, columns = [flag_value_column_name_current_0p01], index = indexes_list )    
           else:
              flag_df[flag_value_column_name_current_0p01] = flag_value_0p01

           # At this point data frame exists so only columns are added to the existing data frame.
           summary_df[p_value_column_name_current]           = p_value
           summary_df[t_value_column_name_current]           = t_value
           summary_df[neg_log10_p_value_column_name_current] = neg_log10_p_value
           summary_df[difference_value_column_name_current]  = difference_value
           flag_df[flag_value_column_name_current_0p05] = flag_value_0p05
           flag_df[flag_value_column_name_current_0p10] = flag_value_0p10
  


    # SCENARIO 2: Paired t-test. In this case there should be EXACTLY TWO groups.
    # Each sample in one group should have exactly one matching pair in the other group.
    # The matching is controlled by args.order variable.

    if args.pairing == "paired":
       logger.info("Paired test will be performed for two groups pairwise based on pairing variable: {0}.".format(args.order))


       # Getting the number of unique groups. If it is bigger than 2 return the warning and exit.
       group_values_series = dat.transpose()[dat.group].T.squeeze()
       group_values_series_unique = group_values_series.unique()
       number_of_unique_groups = group_values_series_unique.shape[0]
       if number_of_unique_groups != 2:
          logger.warning(u"The number of unique groups is {0} and not 2 as expected. The paired t-test cannot be performed.".format(number_of_unique_groups) )
          exit()	
 
       # This piece of code will be executed only if the number_of_unique_groups is exactly 2 so the group check is passed. 

       # Creating pairwise combination of our two groups that we will use in the future.
       groups_pairwise = list( combinations(group_values_series_unique,2) ) 
       number_of_groups_pairwise = len(groups_pairwise)

       # Extracting data from the interface.
       data_frame = dat.transpose()
       # Extracting number of features. This will depend on whether the user has provided ordering variable or not.
       # Checking that the required pairing variable has been provided.
       if args.order == False:
          logger.info("The required t-test pairing variable has not been provided: The paired t-test cannot be performed.")
          exit()	


       # This piece of code will be executed only if the args.order has been provided and the check is passed. 

       # Defining the number of features. It should be the dimension of the data frame minus 2 columns that stand for arg.group and args.order
       number_of_features = data_frame.shape[1] - 2

       # At this point it is confirmed that there are only 2 groups and that the pairing variable args.order has been provided.
       # Now we need to check that pairing is correct, i.e. that each pairID corresponds to exactly two samples from different groups.

       # Getting the unique pairIDs; pairIDs that do not have exactly two samples are removed below.
       pairid_values_series = dat.transpose()[dat.runOrder].T.squeeze()
       pairid_values_series_unique = pairid_values_series.unique()
       number_of_unique_pairid = pairid_values_series_unique.shape[0]


       # Extracting data from the interface.
       data_frame = dat.transpose()
  
       # Extracting the number of samples in the final frame.
       number_of_samples = data_frame.shape[0]


       # Performing the cleaning of the original data. We are removing samples that are not paired or do not belong to the two groups.
       # If the dataset has 1 or 3 or more matches for a pairID, those samples are removed with a warning.
       # If a pairID corresponds to exactly two samples (which is correct) but the groupIDs are NOT different, those values will also be removed.
       for i in range(0, number_of_unique_pairid ):
       
           # Extracting the pieces of the data frame that belong to ith unique pairid.
           data_frame_current_pairid = data_frame.loc[data_frame[args.order].isin( [ pairid_values_series_unique[i] ]  )]

           # We transpose here so it will be easier to operate with.
           data_frame_current_pairid  = data_frame_current_pairid.transpose()
           sample_names_current_pairid = list(data_frame_current_pairid.columns.values)
       
           if data_frame_current_pairid.shape[1] != 2:

              # Pulling indexes list from the current data frame.
              logger.warning(u"Number of samples for the pairID: {0} is equal to {1} and NOT equal to 2. Sample(s) {2} will be removed from further analysis.".format(pairid_values_series_unique[i],
                               data_frame_current_pairid.shape[1], sample_names_current_pairid)  )

              # Getting indexes we are trying to delete.
              boolean_indexes_to_delete = data_frame.index.isin( sample_names_current_pairid )  
              # Deleting the indexes and in the for loop going to next iteration.
              data_frame.drop(data_frame.index[boolean_indexes_to_delete], inplace=True)
    
           # This piece is executed if the number is correct, i.e. data_frame_current_pairid.shape[1] == 2.
           # Here we are checking if the groupIDs for the given pair are indeed different.

           elif data_frame_current_pairid.transpose()[args.group][0] == data_frame_current_pairid.transpose()[args.group][1]:

                logger.warning(u"Samples in pairID {0} have groupIDs: {1} and {2}. Should be different! Sample(s) {3} will be removed from further analysis.".format(
                                 pairid_values_series_unique[i],
                                 data_frame_current_pairid.transpose()[args.group][1],
                                 data_frame_current_pairid.transpose()[args.group][0],
                                 sample_names_current_pairid))
                   
                # Getting indexes we are trying to delete.
                boolean_indexes_to_delete = data_frame.index.isin( sample_names_current_pairid )  
                # Deleting the indexes.
                data_frame.drop(data_frame.index[boolean_indexes_to_delete], inplace=True)


        
       # Checking if the data frame became empty after cleaning.
       if data_frame.shape[0] == 0:
          logger.warning(u"Number of paired samples in the final dataset is exactly 0! Please check the design file for accuracy! Exiting the program."  )
          exit()	
   


       # Computing overall summaries (mean and variance).
       # This part just produces summary statistics for the output table.
       # This has nothing to do with paired t-test. This is just summary for the table.
       mean_value_all = [0] * number_of_features
       variance_value_all = [0] * number_of_features

       for j in range(0, number_of_features ):

           # Creating duplicate for manipulation.
           data_frame_manipulate = data_frame

           # Dropping columns that characterize group. Only feature columns will remain.
           # We also transpose here so it will be easier to operate with.
           data_frame_manipulate_transpose  = data_frame_manipulate.drop( [args.group,args.order], axis=1 ).transpose()
           # Pulling indexes list from the current data frame.
           indexes_list_complete = data_frame_manipulate_transpose.index.tolist()

           # Computing dataset summaries.
           mean_value_all[j] = np.mean(data_frame_manipulate_transpose.loc[ indexes_list_complete[j] ]) 
           variance_value_all[j] = np.var(data_frame_manipulate_transpose.loc[ indexes_list_complete[j] ], ddof = 1)



       # Creating the table and putting the results there.
       summary_df     =  pd.DataFrame(data = mean_value_all, columns = ["GrandMean"], index = indexes_list_complete )    
       summary_df['SampleVariance'] =  variance_value_all


       # Computing means for each group and outputting them.
       # This part just produces summary statistics for the output table.
       # This has nothing to do with paired t-test. This is just summary for the table.

       for i in range(0, number_of_unique_groups ):
        

           # Extracting the pieces of the data frame that belong to the ith group.
           data_frame_current_group  = data_frame.loc[data_frame[args.group].isin( [group_values_series_unique[i]]  )]

           # Dropping columns that characterize group. Only feature columns will remain.
           data_frame_current_group  = data_frame_current_group.drop(  [args.group, args.order], axis=1 ).transpose()

           # Pulling indexes list from the current group.
           indexes_list = data_frame_current_group.index.tolist()

           # Creating array of means for the current group that will be filled.
           means_value  = [0] * number_of_features
    
           for j in range(0, number_of_features ):
  
               series_current = data_frame_current_group.loc[ indexes_list[j] ] 
               means_value[j] = series_current.mean()


           # Adding current mean_value column to the data frame and assigning the name.
           means_value_column_name_current  = 'mean_treatment_' + group_values_series_unique[i] 
           summary_df[means_value_column_name_current] = means_value




       # Performing paired t-test for the two groups and saving the results.

       # Creating p_values and flag_values as empty lists of length number_of_features.
       # These will be used for the two groups in the paired t-test.
       p_value = [0] * number_of_features
       t_value = [0] * number_of_features
       flag_value_0p01   = [0] * number_of_features
       flag_value_0p05   = [0] * number_of_features
       flag_value_0p10   = [0] * number_of_features
       neg_log10_p_value = [0] * number_of_features
       difference_value  = [0] * number_of_features

       # Performing paired t-test for each pair of features.
       for j in range(0, number_of_features ):

  
           # Extracting the pieces of the data frame that belong to the 1st and 2nd groups.
           data_frame_first_group  = data_frame.loc[data_frame[args.group].isin( [group_values_series_unique[0]]  )]
           data_frame_second_group = data_frame.loc[data_frame[args.group].isin( [group_values_series_unique[1]]  )]

        
           # Sorting both data frames by the pairing variable (args.order).
           # This will ensure datasets are aligned by pair when fed to the t-test.
           data_frame_first_group  = data_frame_first_group.sort_values(args.order)
           data_frame_second_group = data_frame_second_group.sort_values(args.order)


           # Sorting data frame by args.group index 
           data_frame_first_group  = data_frame_first_group.drop(  [args.group,args.order], 1 ).transpose()
           data_frame_second_group = data_frame_second_group.drop( [args.group,args.order], 1 ).transpose()
         
           # Pulling list of indexes. This is the same list for the first and for the second.
           indexes_list = data_frame_first_group.index.tolist()

           # Pullinng the samples out
           series_first  = data_frame_first_group.loc[ indexes_list[j] ] 
           series_second = data_frame_second_group.loc[ indexes_list[j] ]


           # Running t-test for the two given samples
           paired_ttest_args = [series_first, series_second]
           p_value[j] = ttest_rel( *paired_ttest_args )[1]
           t_value[j] = ttest_rel( *paired_ttest_args )[0]
           neg_log10_p_value[j] = - np.log10(p_value[j])
           difference_value[j] = series_first.mean() - series_second.mean()
           if p_value[j] < 0.01: flag_value_0p01[j] = 1
           if p_value[j] < 0.05: flag_value_0p05[j] = 1
           if p_value[j] < 0.10: flag_value_0p10[j] = 1


       # The loop over features is finished. Converting the results into data frames.


       # Creating column names for the data frame.
       p_value_column_name_current           = 'prob_greater_than_t_for_diff_' + group_values_series_unique[0] + '_' + group_values_series_unique[1]
       t_value_column_name_current           = 't_value_for_diff_' + group_values_series_unique[0] + '_' + group_values_series_unique[1]
       neg_log10_p_value_column_name_current = 'neg_log10_p_value_' + group_values_series_unique[0] + '_' + group_values_series_unique[1]
       difference_value_column_name_current  = 'diff_of_' + group_values_series_unique[0] + '_' + group_values_series_unique[1]
       flag_value_column_name_current_0p01 = 'flag_value_diff_signif_' + group_values_series_unique[0] + '_' + group_values_series_unique[1] + '_0p01'
       flag_value_column_name_current_0p05 = 'flag_value_diff_signif_' + group_values_series_unique[0] + '_' + group_values_series_unique[1] + '_0p05'
       flag_value_column_name_current_0p10 = 'flag_value_diff_signif_' + group_values_series_unique[0] + '_' + group_values_series_unique[1] + '_0p10'


       summary_df[t_value_column_name_current] = t_value
       summary_df[p_value_column_name_current] = p_value
       summary_df[neg_log10_p_value_column_name_current] = neg_log10_p_value
       summary_df[difference_value_column_name_current] = difference_value

       flag_df  =  pd.DataFrame(data = flag_value_0p01, columns = [flag_value_column_name_current_0p01], index = indexes_list )    
       flag_df[flag_value_column_name_current_0p05] = flag_value_0p05
       flag_df[flag_value_column_name_current_0p10] = flag_value_0p10



   
     # Rounding the results to 4 decimal digits.
    summary_df = summary_df.apply(lambda x: x.round(4))

     # Adding name for the unique ID column that was there originally.
    summary_df.index.name    =  args.uniqueID
    flag_df.index.name =  args.uniqueID

     # Save summary_df to the output
    summary_df.to_csv(args.summaries, sep="\t")
    # Save flag_df to the output
    flag_df.to_csv(args.flags, sep="\t")




    # Generating Indexing for volcano plots.

    # Getting data for lpvals
    lpvals = {col.split("_value_")[-1]:summary_df[col] for col in summary_df.columns.tolist() \
            if col.startswith("neg_log10_p_value")}

     # Getting data for diffs
    difs   = {col.split("_of_")[-1]:summary_df[col] for col in summary_df.columns.tolist() \
            if col.startswith("diff_of_")}

     # The cutoff for significance on the -log10(p-value) scale; 2 corresponds to p = 0.01.
     cutoff = 2



    # Making volcano plots
    with PdfPages( args.volcano ) as pdf:
         for i in range(0, number_of_groups_pairwise ):
             # Set Up Figure
             volcanoPlot = figureHandler(proj="2d")


             groups_subset = groups_pairwise[i] 
             current_key =  groups_subset[0] + '_' + groups_subset[1]
             
             # Plot all results
             scatter.scatter2D(x=list(difs[current_key]), y=list(lpvals[current_key]), 
                                colorList=list('b'), ax=volcanoPlot.ax[0])

             # Color results beyond the threshold red
             cutLpvals = lpvals[current_key][lpvals[current_key]>cutoff]
             if not cutLpvals.empty:
                    cutDiff = difs[current_key][cutLpvals.index]
                    scatter.scatter2D(x=list(cutDiff), y=list(cutLpvals), 
                                      colorList=list('r'), ax=volcanoPlot.ax[0])

             # Drawing cutoffs
             lines.drawCutoffHoriz(y=cutoff, ax=volcanoPlot.ax[0])

             # Format axis (volcanoPlot)
             volcanoPlot.formatAxis(axTitle=current_key, grid=False,
                 yTitle="-log10(p-value) for Diff of treatment means for {0}".format(current_key),
                 xTitle="Difference of treatment means for {0}".format(current_key))

             # Add figure to PDF
             volcanoPlot.addToPdf(pdfPages=pdf)
  
    # Informing that the volcano plots are done
    logger.info(u"Pairwise volcano plots have been created.")



    # Ending script
    logger.info(u"Finishing running of t-test.")
Example #17
def plotDistances(df_distance, palette, plotType, disType, cutoff, p, pdf):
    # Getting number of samples in dataframe (ns stands for number of samples)
    ns = len(df_distance.index)

    # Calculate the width for the figure based on the number of samples
    figWidth = max(ns / 2, 16)

    # Keeping the order on the colors
    df_distance["colors"] = palette.design["colors"]

    # Create figure object with a single axis
    figure = figureHandler(proj='2d', figsize=(figWidth, 8))

    # Getting type of distance file
    if "distance_to_mean" in df_distance.columns:
        dataType = "to the mean"
    else:
        dataType = "pairwise"

    # Getting type of distance header
    if disType == "Mahalanobis":
        distType1 = "Penalized"
        distType2 = disType
    else:
        distType1 = "Standardized"
        distType2 = disType

    # Add figure title, x-axis limits, and set the xticks
    figure.formatAxis(figTitle="{0} for {1} {2} Distance for {3} {4}".format(
        plotType, distType1, distType2, df_distance.name, dataType),
                      yTitle="{0} {1} Distance".format(distType1, distType2),
                      xTitle="Index",
                      ylim="ignore",
                      xlim=(-0.5, -0.5 + ns),
                      xticks=df_distance.index)

    # If distance to mean
    if dataType == "to the mean":
        # Plot scatterplot quickplot
        scatter.scatter2D(ax=figure.ax[0],
                          colorList=df_distance["colors"],
                          x=range(len(df_distance.index)),
                          y=df_distance["distance_to_mean"])
    # if pairwise
    else:
        if plotType == "Scatterplot":
            # Plot scatterplot
            for index in df_distance.index:
                scatter.scatter2D(ax=figure.ax[0],
                                  colorList=df_distance["colors"][index],
                                  x=range(len(df_distance.index)),
                                  y=df_distance[index])

        elif plotType == "Box-plots":
            # Plot Box plot
            box.boxDF(ax=figure.ax[0],
                      colors=df_distance["colors"],
                      dat=df_distance)

    # Shrink figure
    figure.shrink()

    # Plot legend
    figure.makeLegend(figure.ax[0], palette.ugColors, palette.combName)

    # Add a cutoff line
    cutoff.apply(lambda x: plotCutoffs(x, ax=figure.ax[0], p=p), axis=0)

    # Add figure to PDF and close the figure afterwards
    figure.addToPdf(pdf)

    # Drop "color" column to no mess the results
    df_distance.drop("colors", axis=1, inplace=True)
Example #18
def iterateCombo(dat, combo, pdf):
    """ A function to iterate generate all plots and flags.

    :Arguments:
        :type dat: interface.wideToDesign
        :param dat: A wideToDesign object containing wide and design information.

        :param tuple combo: A tuple of pairwise combination for current sample.

        :type pdf: matplotlib.backends.backend_pdf.PdfPages
        :param pdf: Handler for multi-page PDF that will contain all plots.

    :Updates:
        :type pdf: matplotlib.backends.backend_pdf.PdfPages
        :param pdf: Handler for multi-page PDF that will contain all plots.

    :Returns:
        :rtype: pandas.DataFrame
        :returns: The flag data frame (flag.df_flags) with outlier flags.

    """

    # Current combination
    c1 = combo[0]
    c2 = combo[1]

    # Set up figure with 2 subplots
    fh = figureHandler(proj='2d',
                       numAx=2,
                       numRow=2,
                       numCol=2,
                       arrangement=[(0, 0, 1, 2), (0, 1, 1, 2)])

    # Scatter Plot of c1 vs c2
    makeScatter(dat.wide.loc[:, c1], dat.wide.loc[:, c2], fh.ax[0], fh)

    # BA plot of c1 vs c2
    outlier, pearson, cooks, dffits = makeBA(dat.wide.loc[:, c1],
                                             dat.wide.loc[:, c2], fh.ax[1], fh)

    # Build plot title
    title = buildTitle(dat, c1, c2)

    # Add plot title to the figure
    fh.formatAxis(figTitle=title)

    # Establishing a tight layout for the figure
    plt.tight_layout(pad=2, w_pad=.05)

    # Shrinking figure
    fh.shrink(top=.85, bottom=.25, left=.15, right=.9)

    # Output figure to pdf
    fh.addToPdf(dpi=90, pdfPages=pdf)

    # Create flags
    flag = Flags(index=dat.wide.index)
    flag.addColumn(column='flag_{0}_{1}'.format(c1, c2), mask=outlier)
    flag.addColumn(column='flag_pearson_{0}_{1}'.format(c1, c2), mask=pearson)
    flag.addColumn(column='flag_cooks_{0}_{1}'.format(c1, c2), mask=cooks)
    flag.addColumn(column='flag_dffits_{0}_{1}'.format(c1, c2), mask=dffits)

    return flag.df_flags
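
iterateCombo handles a single sample pair; a driver has to supply every pairwise combination and collect the returned flag frames. A minimal sketch of such a driver, assuming dat.wide columns are the sample names; the column-wise concat of the flag frames is an assumption for illustration, not the tool's own merge logic.

from itertools import combinations

import pandas as pd
from matplotlib.backends.backend_pdf import PdfPages

def run_all_pairs(dat, pdf_path):
    flag_frames = []
    with PdfPages(pdf_path) as pdf:
        # One scatter/BA page and one set of flag columns per sample pair.
        for combo in combinations(dat.wide.columns, 2):
            flag_frames.append(iterateCombo(dat, combo, pdf))
    # Merge the per-pair flags column-wise on the shared feature index (assumed).
    return pd.concat(flag_frames, axis=1)
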
Example #19
def plotFlagDist(propSample, propFeature, pdf):
    """ 
    Plot the distribution of proportion of samples and features that 
    were outliers.

    :Arguments:
        :type propSample: pandas.DataFrame
        :param propSample: Data frame of the proportion of samples flagged as
            an outlier.

        :type propFeature: pandas.DataFrame
        :param propFeature: Data frame of the proportion of features flagged as
            an outlier.

        :type pdf: string
        :param pdf: Filename of pdf to save plots.

    :Returns:
        :rtype: matplotlib.backends.backend_pdf.PdfPages
        :returns: Saves two bar plots to pdf.

    """
    # sort samples
    propSample.sort_values(inplace=True, ascending=False)

    # sort compounds
    propFeature.sort_values(inplace=True, ascending=False)

    # Make Plots
    ## Open pdf for plotting
    ppFlag = PdfPages(pdf)

    # Open figure handler instance
    fh = figureHandler(proj='2d')
    keys = list(propSample.head(30).keys())

    # Plotting quickBar
    bar.quickBar(ax=fh.ax[0], y=list(propSample.head(30).to_numpy()), x=keys)

    # Formating axis
    fh.formatAxis(xlim=(0, len(keys) + 1),
                  ylim="ignore",
                  xTitle="Sample ID",
                  yTitle="Proportion of features that were outliers.")

    # Save Figure in PDF
    ppFlag.savefig(fh.fig, bbox_inches='tight')

    ## Plot features
    # Open figure handler instance
    fh = figureHandler(proj='2d')
    keys = list(propFeature.head(30).keys())

    # Plot bar plot
    bar.quickBar(ax=fh.ax[0],
                 y=list(propFeature.head(30).to_numpy()),
                 x=keys)

    # Format Axis
    fh.formatAxis(
        xlim=(0, len(keys) + 1),
        ylim="ignore",
        xTitle="Feature ID",
        yTitle="Proportion of samples that a feature was an outlier.")

    # Save Figure in PDF
    ppFlag.savefig(fh.fig, bbox_inches="tight")

    ## Close pdf
    ppFlag.close()
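
plotFlagDist opens the PdfPages handle manually and closes it at the end; the same multi-page pattern can be written with a context manager so the file is closed even if plotting fails. A minimal sketch with the figure content reduced to a stub; the output file name is hypothetical.

import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages

# One savefig call per page; the file is closed automatically on exit.
with PdfPages("flag_distribution.pdf") as pp:
    for title in ("Sample ID", "Feature ID"):
        fig, ax = plt.subplots()
        ax.set_title(title)
        pp.savefig(fig, bbox_inches="tight")
        plt.close(fig)
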
Example #20
def main(args):
    # If the user provides a grouping variable we test each group against the null (mu supplied by the user; 0 is the default).
    if args.group != False:
        logger.info(
            u"""t-test will be performed for each group saved in the [{0}] variable in the design file, with H_0: mu = {1}."""
            .format(args.group, args.mu))

        # Loading data through the Interface.
        logger.info("Loading data with the Interface")
        dat = wideToDesign(args.input,
                           args.design,
                           args.uniqueID,
                           group=args.group,
                           logger=logger)

        # Treat everything as numeric.
        dat.wide = dat.wide.applymap(float)

        # Removing the missing data.
        dat.dropMissing()

        # Getting the unique group values so that we can feed them to the t-tests.
        group_values_series = dat.transpose()[dat.group].T.squeeze()
        group_values_series_unique = group_values_series.unique()
        number_of_unique_groups = group_values_series_unique.shape[0]

        # Extracting data from the interface.
        data_frame = dat.transpose()
        # Extracting number of features. We subtract 1 since we have provided args.group
        number_of_features = data_frame.shape[1] - 1

        # Computing overall summaries (mean and variance).
        # This part just produces summary statistics for the output table.
        # This has nothing to do with the single sample t-test.
        mean_value_all = [0] * number_of_features
        variance_value_all = [0] * number_of_features

        # Dropping columns that characterize group. Only feature columns will remain.
        # We also transpose here so it will be easier to operate with.
        data_frame_manipulate_transpose = data_frame.drop(columns=args.group).transpose()
        # Pulling indexes list from the data frame.
        indexes_list_complete = data_frame_manipulate_transpose.index.tolist()

        for j in range(0, number_of_features):

            # Computing dataset summaries for feature j.
            mean_value_all[j] = np.mean(
                data_frame_manipulate_transpose.loc[indexes_list_complete[j]])
            variance_value_all[j] = np.var(
                data_frame_manipulate_transpose.loc[indexes_list_complete[j]],
                ddof=1)

        # Creating the table and putting the results there.
        summary_df = pd.DataFrame(data=mean_value_all,
                                  columns=["GrandMean"],
                                  index=indexes_list_complete)
        summary_df['SampleVariance'] = variance_value_all

        # Running single sample t-test for all groups.
        # We are also computing means for each group and outputting them.
        for i in range(0, number_of_unique_groups):

            # Extracting the pieces of the data frame that belong to the ith group.
            data_frame_current_group = data_frame.loc[data_frame[
                args.group].isin([group_values_series_unique[i]])]

            # Dropping columns that characterize group. Only feature columns will remain.
            # We also transpose here so it will be easier to operate with.
            data_frame_current_group = data_frame_current_group.drop(
                columns=args.group).transpose()

            # Pulling indexes list from the current group.
            indexes_list = data_frame_current_group.index.tolist()

            # Creating array of means for the current group that will be filled.
            # Creating p-value, difference, neg_log10_p_value, t-value, and flag lists filled with zeros.
            means_value = [0] * number_of_features
            difference_value = [0] * number_of_features
            p_value = [0] * number_of_features
            t_value = [0] * number_of_features
            neg_log10_p_value = [0] * number_of_features
            flag_value_0p01 = [0] * number_of_features
            flag_value_0p05 = [0] * number_of_features
            flag_value_0p10 = [0] * number_of_features

            for j in range(0, number_of_features):
                series_current = data_frame_current_group.loc[indexes_list[j]]
                means_value[j] = series_current.mean()

                # Performing one-sample t-test; ttest_1samp returns (t, p).
                t_value[j], p_value[j] = ttest_1samp(series_current, float(args.mu))
                neg_log10_p_value[j] = -np.log10(p_value[j])
                difference_value[j] = means_value[j] - float(args.mu)
                if p_value[j] < 0.01: flag_value_0p01[j] = 1
                if p_value[j] < 0.05: flag_value_0p05[j] = 1
                if p_value[j] < 0.10: flag_value_0p10[j] = 1

            # Creating names for the current analysis columns and adding result columns to the data frame.
            means_value_column_name_current = 'mean_treatment_' + group_values_series_unique[
                i]
            p_value_column_name_current = 'prob_greater_than_t_for_diff_' + group_values_series_unique[
                i] + '_' + args.mu
            t_value_column_name_current = 't_value_for_diff_' + group_values_series_unique[
                i] + '_' + args.mu
            neg_log10_p_value_column_name_current = 'neg_log10_p_value_' + group_values_series_unique[
                i] + '_' + args.mu
            difference_value_column_name_current = 'diff_of_' + group_values_series_unique[
                i] + '_' + args.mu
            flag_value_column_name_current_0p01 = 'flag_significant_0p01_on_' + group_values_series_unique[
                i] + '_' + args.mu
            flag_value_column_name_current_0p05 = 'flag_significant_0p05_on_' + group_values_series_unique[
                i] + '_' + args.mu
            flag_value_column_name_current_0p10 = 'flag_significant_0p10_on_' + group_values_series_unique[
                i] + '_' + args.mu

            # Adding flag_value column to the data frame and assigning the name.
            # If the data frame for flags has not been created yet we create it on the fly. i.e. if i == 0 create it.
            if i == 0:
                flag_df = pd.DataFrame(
                    data=flag_value_0p01,
                    columns=[flag_value_column_name_current_0p01],
                    index=indexes_list)
            else:
                flag_df[flag_value_column_name_current_0p01] = flag_value_0p01

            # At this point data frames (summary and flags) exist so only columns are added to the existing data frame.
            summary_df[means_value_column_name_current] = means_value
            summary_df[p_value_column_name_current] = p_value
            summary_df[t_value_column_name_current] = t_value
            summary_df[
                neg_log10_p_value_column_name_current] = neg_log10_p_value
            summary_df[difference_value_column_name_current] = difference_value
            flag_df[flag_value_column_name_current_0p05] = flag_value_0p05
            flag_df[flag_value_column_name_current_0p10] = flag_value_0p10

    # If the user does not provide a grouping variable we test the entire dataset as a single group against the null (mu supplied by the user; 0 is the default).
    if args.group == False:
        logger.info(
            u"""t-test will be performed for the entire dataset since a grouping variable was not provided."""
        )

        # Loading data through the interface
        logger.info("Loading data with the Interface")
        dat = wideToDesign(args.input,
                           args.design,
                           args.uniqueID,
                           logger=logger)

        # Treat everything as numeric
        dat.wide = dat.wide.applymap(float)

        # Removing missing data
        dat.dropMissing()

        # Saving the number of unique groups that will be used for plotting.
        # Since we did not feed any grouping variable it is exactly one.
        number_of_unique_groups = 1

        # Extracting data from the interface.
        data_frame = dat.wide.transpose()
        # Extracting number of features. We do not subtract 1 since we have not provided args.group
        number_of_features = data_frame.shape[1]
        # Saving treatment group name from the arguments.

        # Computing overall summaries (mean and variance).
        # This part just produces summary statistics for the output table.
        # This has nothing to do with the single sample t-test. This is just a summary for the table.
        mean_value_all = [0] * number_of_features
        variance_value_all = [0] * number_of_features
        # Creating the arrays that will be filled.
        # Creating p_value, t_value, neg_log10_p_value, flag, and difference lists filled with zeros.
        p_value = [0] * number_of_features
        t_value = [0] * number_of_features
        neg_log10_p_value = [0] * number_of_features
        difference_value = [0] * number_of_features
        flag_value_0p01 = [0] * number_of_features
        flag_value_0p05 = [0] * number_of_features
        flag_value_0p10 = [0] * number_of_features

        # We transpose here so data will be easier to operate on.
        data_frame_manipulate_transpose = data_frame.transpose()
        # Pulling indexes list from the data frame.
        indexes_list_complete = data_frame_manipulate_transpose.index.tolist()

        for j in range(0, number_of_features):

            # Computing dataset summaries.
            mean_value_all[j] = np.mean(
                data_frame_manipulate_transpose.loc[indexes_list_complete[j]])
            variance_value_all[j] = np.var(
                data_frame_manipulate_transpose.loc[indexes_list_complete[j]],
                ddof=1)

            # Performing one-sample t-test for the entire dataset; ttest_1samp returns (t, p).
            t_value[j], p_value[j] = ttest_1samp(
                data_frame_manipulate_transpose.loc[indexes_list_complete[j]],
                float(args.mu))
            neg_log10_p_value[j] = -np.log10(p_value[j])
            difference_value[j] = mean_value_all[j] - float(args.mu)
            if p_value[j] < 0.01: flag_value_0p01[j] = 1
            if p_value[j] < 0.05: flag_value_0p05[j] = 1
            if p_value[j] < 0.10: flag_value_0p10[j] = 1

        # Creating the table and putting the results there.
        summary_df = pd.DataFrame(data=mean_value_all,
                                  columns=["GrandMean"],
                                  index=indexes_list_complete)
        summary_df['SampleVariance'] = variance_value_all

        # Creating names for the current analysis columns and adding result columns to the data frame.
        means_value_column_name_current = 'mean_treatment_all'
        p_value_column_name_current = 'prob_greater_than_t_for_diff_all_' + args.mu
        t_value_column_name_current = 't_value_for_diff_all_' + args.mu
        neg_log10_p_value_column_name_current = 'neg_log10_p_value_all_' + args.mu
        difference_value_column_name_current = 'diff_of_all_' + args.mu
        flag_value_column_name_current_0p01 = 'flag_significant_0p01_on_all_' + args.mu
        flag_value_column_name_current_0p05 = 'flag_significant_0p05_on_all_' + args.mu
        flag_value_column_name_current_0p10 = 'flag_significant_0p10_on_all_' + args.mu

        summary_df[means_value_column_name_current] = mean_value_all
        summary_df[p_value_column_name_current] = p_value
        summary_df[t_value_column_name_current] = t_value
        summary_df[neg_log10_p_value_column_name_current] = neg_log10_p_value
        summary_df[difference_value_column_name_current] = difference_value

        flag_df = pd.DataFrame(data=flag_value_0p01,
                               columns=[flag_value_column_name_current_0p01],
                               index=indexes_list_complete)
        flag_df[flag_value_column_name_current_0p05] = flag_value_0p05
        flag_df[flag_value_column_name_current_0p10] = flag_value_0p10

    # Rounding the results to 4 decimal digits.
    summary_df = summary_df.apply(lambda x: x.round(4))

    # Adding name for the unique ID column that was there originally.
    summary_df.index.name = args.uniqueID
    flag_df.index.name = args.uniqueID

    # Save summary_df to the output
    summary_df.to_csv(args.summaries, sep="\t")
    # Save flag_df to the output
    flag_df.to_csv(args.flags, sep="\t")

    # Generating Indexing for volcano plots.
    # Getting data for lpvals
    lpvals = {col.split("_value_")[-1]:summary_df[col] for col in summary_df.columns.tolist() \
              if col.startswith("neg_log10_p_value")}

    # Getting data for diffs
    difs   = {col.split("_of_")[-1]:summary_df[col] for col in summary_df.columns.tolist() \
              if col.startswith("diff_of_")}

    # The cutoff for significance on the -log10(p-value) scale; 2 corresponds to p = 0.01.
    cutoff = 2

    # Making volcano plots
    with PdfPages(args.volcano) as pdf:
        for i in range(0, number_of_unique_groups):
            # Set Up Figure
            volcanoPlot = figureHandler(proj="2d")

            # If no grouping variable is provided.
            if number_of_unique_groups == 1:
                current_key = 'all_' + args.mu
            else:
                current_key = group_values_series_unique[i] + '_' + args.mu

            # Plot all results
            scatter.scatter2D(x=list(difs[current_key]),
                              y=list(lpvals[current_key]),
                              colorList=list('b'),
                              ax=volcanoPlot.ax[0])

            # Color results beyond the threshold red
            cutLpvals = lpvals[current_key][lpvals[current_key] > cutoff]
            if not cutLpvals.empty:
                cutDiff = difs[current_key][cutLpvals.index]
                scatter.scatter2D(x=list(cutDiff),
                                  y=list(cutLpvals),
                                  colorList=list('r'),
                                  ax=volcanoPlot.ax[0])

            # Drawing cutoffs
            lines.drawCutoffHoriz(y=cutoff, ax=volcanoPlot.ax[0])

            # Format axis (volcanoPlot)
            volcanoPlot.formatAxis(
                axTitle=current_key,
                grid=False,
                yTitle="-log10(p-value) for Diff of treatment means for {0}".
                format(current_key),
                xTitle="Difference of the means from H0 for {0}".format(
                    current_key))

            # Add figure to PDF
            volcanoPlot.addToPdf(pdfPages=pdf)

    logger.info(u"Volcano plots have been created.")
    logger.info(u"Finishing running of t-test.")
Example #21
def main(args):

    # Loading data through Interface
    logger.info("Loading data with the Interface")
    dat = wideToDesign(args.input, args.design, args.uniqueID, group = args.group, logger=logger)

    # Treat everything as numeric
    dat.wide = dat.wide.applymap(float)

    # Removing missing data
    dat.dropMissing()

    # Unpaired permutation t-test. Any number of groups is allowed.
    # The order variable is ignored and t-tests are performed pairwise for each pair of groups.

    logger.info("Unpaired t-test will be performed for all groups pairwise.")

    # Getting the unique group values and all their pairwise combinations to feed to the unpaired t-tests.
    group_values_series = dat.transpose()[dat.group].T.squeeze()
    group_values_series_unique = group_values_series.unique()
    number_of_unique_groups = group_values_series_unique.shape[0]
    groups_pairwise = list(combinations(group_values_series_unique, 2))
    number_of_groups_pairwise = len(groups_pairwise)

    # Extracting data from the interface.
    data_frame = dat.transpose()
    # Extracting number of features.
    # We subtract 1 because the group column is still present in data_frame.
    number_of_features = data_frame.shape[1] - 1

    # Computing overall summaries (mean and variance).
    # This part just produces summary statistics for the output table.
    mean_value_all = [0] * number_of_features
    variance_value_all = [0] * number_of_features

    # Dropping columns that characterize group. Only feature columns will remain.
    # We also transpose here so it will be easier to operate with.
    data_frame_manipulate_transpose = data_frame.drop(columns=args.group).transpose()

    # Pulling indexes list from the data frame.
    indexes_list_complete = data_frame_manipulate_transpose.index.tolist()

    for j in range(0, number_of_features):

        # Computing dataset summaries.
        mean_value_all[j] = np.mean(data_frame_manipulate_transpose.loc[ indexes_list_complete[j] ]) 
        variance_value_all[j] = np.var(data_frame_manipulate_transpose.loc[ indexes_list_complete[j] ], ddof = 1)

    # Creating the table and putting the results there.
    summary_df = pd.DataFrame(data=mean_value_all, columns=["GrandMean"], index=indexes_list_complete)
    summary_df['SampleVariance'] = variance_value_all


    # Computing means for each group and outputting them.
    # This part just produces summary statistics for the output table.
    for i in range(0, number_of_unique_groups ):
       # Extracting the pieces of the data frame that belong to the ith group.
       data_frame_current_group  = data_frame.loc[data_frame[args.group].isin( [group_values_series_unique[i]]  )]

       # Dropping columns that characterize group. Only feature columns will remain.
       # We also transpose here so it will be easier to operate with.
       data_frame_current_group = data_frame_current_group.drop(columns=args.group).transpose()

       # Pulling indexes list from the current group.
       indexes_list = data_frame_current_group.index.tolist()

       # Creating array of means for the current group that will be filled.
       means_value  = [0] * number_of_features

       for j in range(0, number_of_features ):
           series_current = data_frame_current_group.loc[ indexes_list[j] ] 
           means_value[j] = series_current.mean()

       # Adding current mean_value column to the data frame and assigning the name.
       means_value_column_name_current  = 'mean_treatment_' + group_values_series_unique[i] 
       summary_df[means_value_column_name_current] = means_value


    # Running pairwise unpaired (two-sample) t-test for all pairs of group levels that are saved in groups_pairwise.
    for i in range(0, number_of_groups_pairwise ):
       # Extracting the pieces of the data frame that belong to groups saved in the i-th unique pair.
       groups_subset = groups_pairwise[i]
       data_frame_first_group  = data_frame.loc[data_frame[args.group].isin( [groups_subset[0]]  )]
       data_frame_second_group = data_frame.loc[data_frame[args.group].isin( [groups_subset[1]]  )]

       # Dropping columns that characterize group. Only feature columns will remain.
       # We also transpose here so it will be easier to operate with.
       data_frame_first_group  = data_frame_first_group.drop(columns=args.group).transpose()
       data_frame_second_group = data_frame_second_group.drop(columns=args.group).transpose()

       # Pulling indexes list from the first one (they are the same)
       indexes_list = data_frame_first_group.index.tolist()

       # Creating p_value, t_value, neg_log10_p_value, flag, and difference lists filled with zeros.
       p_value           = [0] * number_of_features
       t_value           = [0] * number_of_features
       neg_log10_p_value = [0] * number_of_features
       flag_value_0p01   = [0] * number_of_features
       flag_value_0p05   = [0] * number_of_features
       flag_value_0p10   = [0] * number_of_features
       difference_value  = [0] * number_of_features

       for j in range(0, number_of_features ):
           series_first  = data_frame_first_group.loc[ indexes_list[j] ]
           series_second = data_frame_second_group.loc[ indexes_list[j] ]

           # Running the permutation two-sample t-test once; two_sample returns (p-value, observed t).
           # A single call avoids drawing two different permutation sets for p and t.
           p_value[j], t_value[j] = two_sample(series_first, series_second, reps=int(args.reps),
                                               stat='t', alternative='two-sided', seed=None)

           neg_log10_p_value[j] = - np.log10(p_value[j])
           difference_value[j] = series_first.mean() - series_second.mean()
           if p_value[j] < 0.01: flag_value_0p01[j] = 1
           if p_value[j] < 0.05: flag_value_0p05[j] = 1
           if p_value[j] < 0.10: flag_value_0p10[j] = 1

       # Creating column names for the data frame.
       p_value_column_name_current           = 'perm_greater_than_t_for_diff_' + groups_subset[0] + '_' + groups_subset[1]
       t_value_column_name_current           = 't_value_for_diff_' + groups_subset[0] + '_' + groups_subset[1]
       neg_log10_p_value_column_name_current = 'neg_log10_p_value_' + groups_subset[0] + '_' + groups_subset[1]
       difference_value_column_name_current  = 'diff_of_' + groups_subset[0] + '_' + groups_subset[1]
       flag_value_column_name_current_0p01 = 'flag_significant_0p01_on_' + groups_subset[0] + '_' + groups_subset[1] 
       flag_value_column_name_current_0p05 = 'flag_significant_0p05_on_' + groups_subset[0] + '_' + groups_subset[1] 
       flag_value_column_name_current_0p10 = 'flag_significant_0p10_on_' + groups_subset[0] + '_' + groups_subset[1] 

       # Adding current p_value and flag_value column to the data frame and assigning the name.
       # If the data frame has not been created yet we create it on the fly. i.e. if i == 0 create it.
       if i == 0:
          flag_df = pd.DataFrame(data = flag_value_0p01, columns = [flag_value_column_name_current_0p01], index = indexes_list )    
       else:
          flag_df[flag_value_column_name_current_0p01] = flag_value_0p01

       # At this point data frame exists so only columns are added to the existing data frame.
       summary_df[p_value_column_name_current]           = p_value
       summary_df[t_value_column_name_current]           = t_value
       summary_df[neg_log10_p_value_column_name_current] = neg_log10_p_value
       summary_df[difference_value_column_name_current]  = difference_value
       flag_df[flag_value_column_name_current_0p05] = flag_value_0p05
       flag_df[flag_value_column_name_current_0p10] = flag_value_0p10




    # Rounding the results to 4 decimal digits.
    summary_df = summary_df.apply(lambda x: x.round(4))

    # Adding name for the unique ID column that was there originally.
    summary_df.index.name    =  args.uniqueID
    flag_df.index.name =  args.uniqueID

    # Save summary_df to the output
    summary_df.to_csv(args.summaries, sep="\t")
    # Save flag_df to the output
    flag_df.to_csv(args.flags, sep="\t")



    # Generating Indexing for volcano plots.
    # Getting data for lpvals
    lpvals = {col.split("_value_")[-1]:summary_df[col] for col in summary_df.columns.tolist() \
        if col.startswith("neg_log10_p_value")}

    # Getting data for diffs
    difs   = {col.split("_of_")[-1]:summary_df[col] for col in summary_df.columns.tolist() \
        if col.startswith("diff_of_")}

    # The cutoff for significance on the -log10(p-value) scale; 2 corresponds to p = 0.01.
    cutoff = 2

    # Making volcano plots
    with PdfPages( args.volcano ) as pdf:
        for i in range(0, number_of_groups_pairwise ):
            # Set Up Figure
            volcanoPlot = figureHandler(proj="2d")

            groups_subset = groups_pairwise[i]
            current_key =  groups_subset[0] + '_' + groups_subset[1]

            # Plot all results
            scatter.scatter2D(x=list(difs[current_key]), y=list(lpvals[current_key]), colorList=list('b'), ax=volcanoPlot.ax[0])

            # Color results beyond threshold red
            cutLpvals = lpvals[current_key][lpvals[current_key]>cutoff]
            if not cutLpvals.empty:
                cutDiff = difs[current_key][cutLpvals.index]
                scatter.scatter2D(x=list(cutDiff), y=list(cutLpvals), colorList=list('r'), ax=volcanoPlot.ax[0])

            # Drawing cutoffs
            lines.drawCutoffHoriz(y=cutoff, ax=volcanoPlot.ax[0])

            # Format axis (volcanoPlot)
            volcanoPlot.formatAxis(axTitle=current_key, grid=False,
                yTitle="-log10(p-value) for Diff of treatment means for {0}".format(current_key),
                xTitle="Difference of treatment means for {0}".format(current_key))

            # Add figure to PDF
            volcanoPlot.addToPdf(pdfPages=pdf)

    # Informing that the volcano plots are done
    logger.info(u"Pairwise volcano plots have been created.")

    # Ending script
    logger.info(u"Finishing t-test run.")
Example #22
def volcano(combo, results, oname, cutoff=2):
    """ 
    Plot volcano plots.

    Creates volcano plots to compare means, for all pairwise differences.

    :Arguments:

        :type combo: dictionary
        :param combo: A dictionary of dictionaries with all possible pairwise
            combinations. Used to create the various column headers in the
            results table.

        :type results: pandas DataFrame
        :param results: TODO

        :type oname: string
        :param oname: Name of the output file in pdf format.
       
        :type cutoff: int
        :param cutoff: The cutoff value for significance.

    :Returns:
        :rtype: None
        :returns: Outputs a pdf file containing all plots.

    """
    # Getting data for lpvals
    lpvals = {col.split("_")[-1]:results[col] for col in results.columns.tolist() \
            if col.startswith("-log10_p-value_")}

    # Getting data for diffs
    difs   = {col.split("_")[-1]:results[col] for col in results.columns.tolist() \
            if col.startswith("diff_of")}

    # Making plots
    with PdfPages(oname) as pdf:
        for key in sorted(difs.keys()):
            # Set Up Figure
            volcanoPlot = figureHandler(proj="2d")

            # Plot all results
            scatter.scatter2D(x=list(difs[key]),
                              y=list(lpvals[key]),
                              colorList=list('b'),
                              ax=volcanoPlot.ax[0])

            # Color results beyond the threshold red
            cutLpvals = lpvals[key][lpvals[key] > cutoff]
            if not cutLpvals.empty:
                cutDiff = difs[key][cutLpvals.index]
                scatter.scatter2D(x=list(cutDiff),
                                  y=list(cutLpvals),
                                  colorList=list('r'),
                                  ax=volcanoPlot.ax[0])

            # Drawing cutoffs
            lines.drawCutoffHoriz(y=cutoff, ax=volcanoPlot.ax[0])

            # Format axis (volcanoPlot)
            volcanoPlot.formatAxis(
                axTitle=key,
                grid=False,
                yTitle="-log10(p-value) for Diff of treatment = {0}".format(
                    key),
                xTitle="Diff of treatment = {0}".format(key))

            # Add figure to PDF
            volcanoPlot.addToPdf(pdfPages=pdf)
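
The fixed cutoff of 2 used by these volcano plots is just the p = 0.01 line expressed on the -log10 scale. A small numeric check with illustrative p-values:

import numpy as np

p_values = np.array([0.5, 0.02, 0.001])
neg_log10 = -np.log10(p_values)

# Points above y = 2 are exactly those with p < 0.01.
assert np.isclose(-np.log10(0.01), 2.0)
significant = neg_log10 > 2          # array([False, False, True])
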
Example #23
def plotSignificantROR(data, pdf, palette):
    """
    Plot a scatter plot of x vs y for each feature whose run-order regression is significant.

    :Arguments:

        :type data: pandas DataFrame
        :param data: Data frame with one row per feature, carrying the regression
            results (x, y, fitted values, slope, p-value, R^2).

        :type pdf: PdfPages
        :param pdf: pdf object to store scatterplots

        :type palette: palette object
        :param palette: carries the design colors and group names used for plotting
    """
    # Iterate over all rows in the dataframe and
    # make a scatter plot whenever the p-value is less than 0.05.
    for index, row in data.iterrows():
        if row["pval"] > 0.05: continue

        # Get 95% CI
        prstd, lower, upper = wls_prediction_std(row["res"])

        # Sort CIs for Plotting
        toPlot = pd.DataFrame({"x": row["x"], "lower": lower, "upper": upper})
        toPlot.sort_values(by="x", inplace=True)

        # Create plot
        fh = figureHandler(proj="2d", figsize=(14, 8))

        #Plot scatterplot
        scatter.scatter2D(ax=fh.ax[0],
                          x=row["x"],
                          y=row["y"],
                          colorList=palette.list_colors)

        # Plot cutoffs
        lines.drawCutoff(ax=fh.ax[0], x=row["x"], y=row["fitted"], c="c")
        lines.drawCutoff(ax=fh.ax[0], x=toPlot["x"], y=toPlot["lower"], c="r")
        lines.drawCutoff(ax=fh.ax[0], x=toPlot["x"], y=toPlot["upper"], c="r")

        # Formatting
        ymin, ymax = fh.ax[0].get_ylim()
        fh.formatAxis(xTitle="Run Order", yTitle="Value", ylim=(ymin,ymax*1.2),
        figTitle=u"{} Scatter plot (fitted regression line and prediction bands"\
        " included)".format(row["name"]))

        # Shrink figure
        fh.shrink()

        # Add legend to figure
        fh.makeLegend(ax=fh.ax[0],
                      ucGroups=palette.ugColors,
                      group=palette.combName)

        # Add text to the ax
        fh.ax[0].text(.7, .85, u"Slope = {0:.4f}\n(p-value = {1:.4f})\n"\
            "$R^2$ = {2:.4f}".format(round(row["slope"], 4), round(row["pval"], 4),
            round(row["rsq"], 4)), transform=fh.ax[0].transAxes, fontsize=12)

        # Save to PDF
        fh.addToPdf(pdf)
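
The 95% prediction bands in plotSignificantROR come from statsmodels' wls_prediction_std applied to a fitted regression (stored in row["res"]). A minimal sketch of producing the same lower/upper bands for a toy run-order regression; the data below are illustrative.

import numpy as np
import statsmodels.api as sm
from statsmodels.sandbox.regression.predstd import wls_prediction_std

# Hypothetical run-order data with a mild drift.
run_order = np.arange(10, dtype=float)
values = 2.0 + 0.3 * run_order + np.random.RandomState(0).normal(size=10)

# Fit value ~ run order, then pull the 95% prediction interval.
res = sm.OLS(values, sm.add_constant(run_order)).fit()
prstd, lower, upper = wls_prediction_std(res)
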
def makePlots(SEDData, design, pdf, groupName, cutoff, p, plotType, ugColors,
              levels):
    """
    Manage all the plots for this script

    :Arguments:
        :type SEDData: pandas.DataFrame
        :param SEDData: Contains SED data, either to the mean or pairwise.

        :type design: pandas.DataFrame
        :param design: Design file after getColor.

        :type pdf: PDF object
        :param pdf: PDF for output plots.

        :type groupName: string
        :param groupName: Name of the group (figure title).

        :type cutoff: pandas.DataFrame
        :param cutoff: Cutoff values: beta, chi-sqr and normal.

        :type p: float
        :param p: Percentile for cutoff.

        :type plotType: string
        :param plotType: Type of plot; the possible types are scatterplot to mean,
            scatterplot pairwise, and boxplot pairwise.

    """

    # Getting number of features in dataframe
    nFeatures = len(SEDData.index)

    # Calculate the width for the figure based on the number of features
    figWidth = max(nFeatures / 2, 16)

    # Create figure object with a single axis and initiate the figure
    figure = figureHandler(proj='2d', figsize=(figWidth, 8))

    # Keeping the order on the colors
    SEDData["colors"] = design["colors"]

    # Choose type of plot
    # Plot scatterplot to mean
    if (plotType == "scatterToMean"):
        # Add figure title, x-axis limits, and set the xticks
        figure.formatAxis(
            figTitle=
            "Standardized Euclidean Distance from samples {} to the mean".
            format(groupName),
            xlim=(-0.5, -0.5 + nFeatures),
            ylim="ignore",
            xticks=SEDData.index.values,
            xTitle="Index",
            yTitle="Standardized Euclidean Distance")

        #Plot scatterplot quickplot
        scatter.scatter2D(ax=figure.ax[0],
                          colorList=SEDData["colors"],
                          x=range(len(SEDData.index)),
                          y=SEDData["SED_to_Mean"])

    #Plot scatterplot pairwise
    elif (plotType == "scatterPairwise"):
        # Add figure title, x-axis limits, and set the xticks
        figure.formatAxis(
            figTitle="Pairwise standardized Euclidean Distance from samples {}"
            .format(groupName),
            xlim=(-0.5, -0.5 + nFeatures),
            ylim="ignore",
            xticks=SEDData.index.values,
            xTitle="Index",
            yTitle="Standardized Euclidean Distance")

        # Plot scatterplot
        for index in SEDData.index.values:
            scatter.scatter2D(ax=figure.ax[0],
                              colorList=design["colors"][index],
                              x=range(len(SEDData.index)),
                              y=SEDData[index])

    #Plot boxplot pairwise
    elif (plotType == "boxplotPairwise"):
        # Add figure title, x-axis limits, and set the xticks
        figure.formatAxis(
            figTitle=
            "Box-plots for pairwise standardized Euclidean Distance from samples {}"
            .format(groupName),
            xlim=(-0.5, -0.5 + nFeatures),
            ylim="ignore",
            xticks=SEDData.index.values,
            xTitle="Index",
            yTitle="Standardized Euclidean Distance")
        # Plot Box plot
        box.boxDF(ax=figure.ax[0],
                  colors=SEDData["colors"].values,
                  dat=SEDData)

    # Add a cutoff line
    cutoff.apply(lambda x: plotCutoffs(x, ax=figure.ax[0], p=p), axis=0)
    figure.shrink()
    # Plot legend
    figure.makeLegend(figure.ax[0], ugColors, levels)

    # Add figure to PDF and close the figure afterwards
    figure.addToPdf(pdf)