Пример #1
0
def main(args):
    """Load the dataset, drop missing values and save a heatmap figure.

    Reads ``args.input``, ``args.design`` and ``args.uniqID`` to build the
    data object, ``args.labels`` to decide which axis-label flags are passed
    to the plotting helpers, ``args.dendogram`` to choose between the
    hierarchical-cluster heatmap and the plain heatmap, and ``args.fig`` as
    the output path of the figure.
    """
    # Load the wide-format data together with its design information.
    dataset = wideToDesign(args.input, args.design, args.uniqID, logger=logger)

    # Discard missing data before plotting.
    dataset.dropMissing()

    # A label flag stays True unless its axis is named in args.labels.
    show_x = "x" not in args.labels
    show_y = "y" not in args.labels

    print("x =", show_x)
    print("y =", show_y)

    logger.info("Plotting heatmaps")
    with_dendrogram = args.dendogram == True
    if with_dendrogram:
        # Hierarchical cluster heatmap (HCH): the helper returns the figure,
        # which is saved directly as a PDF.
        figure = hm.plotHCHeatmap(
            dataset.wide,
            hcheatmap=True,
            cmap=palette.mpl_colormap,
            xlbls=show_x,
            ylbls=show_y,
        )
        figure.savefig(args.fig, format="pdf")
    else:
        # Single heatmap without a dendrogram, drawn on our own handler.
        handler = figureHandler(proj='2d', figsize=(14, 14))
        hm.plotHeatmap(
            dataset.wide,
            handler.ax[0],
            cmap=palette.mpl_colormap,
            xlbls=show_x,
            ylbls=show_y,
        )
        handler.formatAxis(xTitle="sampleID")
        handler.export(out=args.fig, dpi=300)

    logger.info("Script Complete!")
Пример #2
0
def nontechnical_analysis(args, df, mask, C, clustering):
    """Reorder clustered variables into modules and write a table and heatmaps.

    Based on the results of the technical analysis, the clusters are ranked
    by decreasing average within-cluster degree (computed from absolute
    correlations) and the variables inside each cluster by decreasing degree.
    The resulting order is used for a tab-separated table and three heatmaps.

    Args:
        args: Namespace providing ``out`` (path of the table to write) and
            ``figure`` (path of the PDF that receives the heatmaps).
        df: Data frame whose index holds the names of all variables.
        mask: Row selector for ``df`` that keeps the variables which took
            part in the clustering.
        C: Correlation matrix of the clustered variables.
        clustering: Array with one cluster label per clustered variable;
            labels are used as integers ``0 .. clustering.max()``.

    Returns:
        None. The results are written to ``args.out`` and ``args.figure``.
    """
    # Map every row name to its position in the original data frame.
    all_row_names = df.index.values
    row_index_map = {name: idx for idx, name in enumerate(all_row_names)}

    # If some variables are uninformative for clustering, the correlation
    # matrix and the cluster vector have fewer entries than the original
    # data frame has rows, so names are looked up in the masked frame.
    remaining_row_names = df[mask].index.values

    # Number of clustered variables and number of clusters.
    p = clustering.shape[0]
    k = clustering.max() + 1

    # Modules and the variables within them are ranked using the absolute
    # values of the correlations.
    C_abs = np.abs(C)

    # For each cluster, record its member indices and each member's degree.
    selections = []
    degrees = np.zeros(p, dtype=float)
    for i in range(k):
        selection = np.flatnonzero(clustering == i)
        selections.append(selection)
        if selection.size > 1:
            submatrix = C_abs[np.ix_(selection, selection)]
            # Subtracting 1 presumably removes the self-correlation on the
            # diagonal (assumes a unit diagonal -- TODO confirm); the column
            # sum is then averaged over the other members of the cluster.
            # Singleton clusters keep a degree of 0.
            degrees[selection] = (submatrix.sum(axis=0) - 1) / (selection.size - 1)

    # Modules are ordered by decreasing average degree.
    average_degrees = [degrees[selection].mean() for selection in selections]
    module_to_cluster = np.argsort(average_degrees)[::-1]
    cluster_to_module = {
        cluster: module for module, cluster in enumerate(module_to_cluster)
    }

    # Sort by module rank, then by decreasing degree, then by original index.
    ordered = sorted(
        (cluster_to_module[clustering[i]], -degrees[i], i) for i in range(p)
    )
    new_to_old_idx = [old_i for _module, _neg_degree, old_i in ordered]

    # Write the table. The csv module needs a text-mode file opened with
    # newline='' so that the dialect's own line terminator is written
    # unchanged; a binary-mode file makes writerow() raise TypeError.
    header = ('Gene', 'Module', 'Entry Index', 'Average Degree', 'Degree')
    with open(args.out, 'w', newline='') as fout:
        writer = csv.writer(fout, 'excel-tab')
        writer.writerow(header)
        for old_i in new_to_old_idx:
            name = remaining_row_names[old_i]
            cluster = clustering[old_i]
            # Module and entry index are reported 1-based.
            writer.writerow((
                name,
                cluster_to_module[cluster] + 1,
                row_index_map[name] + 1,
                average_degrees[cluster],
                degrees[old_i],
            ))

    # One figure handler per heatmap.
    fh1 = figureHandler(proj="2d")
    fh2 = figureHandler(proj="2d")
    fh3 = figureHandler(proj="2d")

    # Correlations and cluster labels in the new order (second heatmap).
    C_sorted = C[np.ix_(new_to_old_idx, new_to_old_idx)]
    clustering_new = clustering[np.ix_(new_to_old_idx)]

    # Smoothed correlations (third heatmap).
    # NOTE(review): expansion() is defined elsewhere; presumably it returns a
    # variable-by-cluster membership matrix, which would make block_mask mark
    # pairs in the same cluster and `small` hold block averages -- confirm.
    S = expansion(clustering_new)
    block_mask = S.dot(S.T)
    denom = np.outer(S.sum(axis=0), S.sum(axis=0))
    small = S.T.dot(C_sorted).dot(S) / denom
    C_all_smoothed = S.dot(small).dot(S.T)
    # Keep the sorted values where block_mask is 1, the smoothed ones elsewhere.
    C_smoothed = (C_all_smoothed * (1 - block_mask) + C_sorted * block_mask)

    # Names in the new order, used as labels for heatmaps 2 and 3.
    hpnames = [remaining_row_names[old_i] for old_i in new_to_old_idx]

    # Plot using something like http://stackoverflow.com/questions/15988413/
    # First heatmap: C in its original order.
    hm.plotHeatmap(C,
                   fh1.ax[0],
                   cmap=palette.mpl_colormap,
                   xlbls=remaining_row_names,
                   ylbls=remaining_row_names)
    fh1.formatAxis(xTitle="sampleID", figTitle="Correlations")

    # Second heatmap: C_sorted, reordered according to the clustering.
    hm.plotHeatmap(C_sorted,
                   fh2.ax[0],
                   cmap=palette.mpl_colormap,
                   xlbls=hpnames,
                   ylbls=hpnames)
    fh2.formatAxis(xTitle="sampleID", figTitle="Re-Ordered correlations")

    # Third heatmap: C_smoothed, the smoothed version of C_sorted.
    hm.plotHeatmap(C_smoothed,
                   fh3.ax[0],
                   cmap=palette.mpl_colormap,
                   xlbls=hpnames,
                   ylbls=hpnames)
    fh3.formatAxis(xTitle="sampleID", figTitle="Smoothed correlations")

    # Collect the three figures into a single PDF.
    with PdfPages(args.figure) as pdf:
        fh1.addToPdf(pdf)
        fh2.addToPdf(pdf)
        fh3.addToPdf(pdf)