예제 #1
0
파일: plot.py 프로젝트: ranikay/GSEApy
def dotplot(df, cutoff=0.05, figsize=(3.5, 6), top_term=10, scale=1):
    """Visualize enrichment results as a dot plot.

    Each retained term is drawn as one dot: the x position is
    ``-log10(Adjusted P-value)``, the dot area grows with the number of
    hits (``Count``) and the colour encodes the rounded ``Combined Score``.

    The input DataFrame is never modified; all derived columns are added
    to an internal copy.

    :param df: GSEApy DataFrame results. Either an enrichr-style table
               (columns ``Overlap``, ``Adjusted P-value``) or a gsea-style
               table (columns ``fdr``, ``matched_size``). In both cases the
               columns ``Term`` and ``Combined Score`` are read as well.
    :param cutoff: p-adjust cut-off; terms with a larger value are dropped.
    :param figsize: figure size passed on to matplotlib.
    :param top_term: number of most significant enriched terms to show.
    :param scale: dotplot point size scale.
    :return: the matplotlib figure, or ``None`` when no term passes the
             cut-off (a warning is logged in that case).

    """
    # Work on a copy so the caller's DataFrame is neither renamed nor extended.
    df = df.copy()

    if 'fdr' in df.columns:
        # gsea results: expose them under the enrichr column names that the
        # rest of the function reads.
        df = df.rename(columns={'fdr': 'Adjusted P-value'})
        df['Count'] = df['matched_size']
        # NOTE(review): 'Term' and 'Combined Score' are still required below;
        # confirm that gsea-style inputs actually provide them.
    else:
        # enrichr results: 'Overlap' is formatted as "<hits>/<background>"
        df['Count'] = df['Overlap'].str.split("/").str[0].astype(int)

    # pvalue cut off
    df = df[df['Adjusted P-value'] <= cutoff]
    if df.empty:
        logging.warning("Warning: No enrich terms when cuttoff = %s", cutoff)
        return None

    # Descending sort puts the most significant terms last, so they end up at
    # the top of the y axis; tail() keeps exactly those top_term terms.
    df = df.sort_values(by='Adjusted P-value', ascending=False)
    df = df.tail(top_term)

    # x axis values
    x = -df['Adjusted P-value'].apply(np.log10)
    combined_score = df['Combined Score'].round().astype('int')
    # y axis index and values
    y = list(range(len(df)))
    labels = df['Term'].values

    # Plain numpy arrays keep every later lookup strictly positional.
    counts = df['Count'].to_numpy()
    area = np.pi * (counts * scale) ** 2

    # create scatter plot
    if hasattr(sys, 'ps1'):
        # working inside python console, show figure
        fig, ax = plt.subplots(figsize=figsize)
    else:
        # If working on commandline, don't show figure
        fig = Figure(figsize=figsize)
        FigureCanvas(fig)  # attaches itself to ``fig``
        ax = fig.add_subplot(111)

    # The colour scale spans the full range of the plotted scores.
    vmin = combined_score.min()
    vmax = combined_score.max()
    sc = ax.scatter(x=x,
                    y=y,
                    s=area,
                    edgecolors='face',
                    c=combined_score,
                    cmap=plt.cm.RdBu,
                    vmin=vmin,
                    vmax=vmax)
    ax.set_xlabel("-log$_{10}$(Adjust P-value)", fontsize=16)
    ax.yaxis.set_major_locator(plt.FixedLocator(y))
    ax.yaxis.set_major_formatter(plt.FixedFormatter(labels))
    ax.set_yticklabels(labels, fontsize=16)
    ax.grid()

    # colorbar
    cax = fig.add_axes([0.93, 0.20, 0.07, 0.22])
    cbar = fig.colorbar(sc, cax=cax)
    # A boolean is required here: the string 'off' is truthy and would leave
    # the right-hand ticks switched on.
    cbar.ax.tick_params(right=False)
    cbar.ax.set_title('Com-\nscore', loc='left', fontsize=12)

    # Representative counts for the size legend: smallest, closest to the
    # mean and largest dot (or every dot when fewer than three are plotted).
    if len(df) >= 3:
        picks = [
            area.argmin(),
            np.abs(area - area.mean()).argmin(),
            area.argmax(),
        ]
    else:
        picks = range(len(df))
    # De-duplicate and order ascending so that each label lines up with a
    # legend marker of matching relative size.
    legend_counts = sorted({counts[i] for i in picks})

    # scale of dots
    ax2 = fig.add_axes([0.93, 0.55, 0.09, 0.06 * len(legend_counts)])
    marker_sizes = (10, 50, 100)[:len(legend_counts)]
    handles = [
        ax2.scatter([], [], s=size, edgecolors='none')
        for size in marker_sizes
    ]
    ax.legend(handles,
              [str(count) for count in legend_counts],
              frameon=True,
              fontsize=12,
              handlelength=2,
              loc=8,
              borderpad=1.8,
              handletextpad=1,
              title='Gene\nRatio',
              scatterpoints=1)

    return fig
예제 #2
0
파일: plot.py 프로젝트: jiawu/GSEApy
def dotplot(df,
            column='Adjusted P-value',
            title='',
            cutoff=0.05,
            top_term=10,
            sizes=None,
            norm=None,
            legend=True,
            figsize=(6, 5.5),
            cmap='RdBu_r',
            ofname=None,
            **kwargs):
    """Visualize enrichr results.

    Each retained term is drawn as one dot: the x position is the value of
    ``column`` (``-log10`` transformed for the two p-value columns), the dot
    size follows the number of hits parsed from ``Overlap`` and the colour
    encodes the rounded ``Combined Score``.

    The input DataFrame is never modified.

    :param df: GSEApy DataFrame results. The columns ``Overlap``
               (formatted as "<hits>/<background>"), ``Term`` and
               ``Combined Score`` are read in addition to ``column``.
    :param column: which column of DataFrame to show. Default: Adjusted P-value
    :param title: figure title
    :param cutoff: terms with 'column' value <= cut-off are shown. Only
                   applied when ``column`` is 'Adjusted P-value' or 'P-value'.
    :param top_term: number of enriched terms to show.
    :param sizes: tuple, (min, max) scatter size. Not functional for now
    :param norm: matplotlib.colors.Normalize object, a tuple of arguments
                 for one, or None for the default normalisation.
    :param legend: bool, whether to show legend.
    :param figsize: tuple, figure size.
    :param cmap: matplotlib colormap
    :param ofname: output file name. If None, don't save figure
    :param kwargs: accepted for call compatibility; currently unused.
    :return: the matplotlib Axes; ``None`` when the figure was saved to
             ``ofname``; or a warning message (str) when no term passes
             the cut-off.
    :raises ValueError: if a p-value column holds a value that cannot be
                        converted to float, or ``norm`` has an unsupported
                        type.

    """

    colname = column
    if colname in ['Adjusted P-value', 'P-value']:
        # Convert up front so that a non-numeric entry fails loudly.
        try:
            as_float = df[colname].map(float)
        except (TypeError, ValueError):
            raise ValueError(
                'some value in %s could not be typecast to `float`' % colname)
        # assign() returns a new frame, leaving the caller's data untouched.
        df = df.assign(**{colname: as_float})
        df = df[df[colname] <= cutoff]
        if len(df) < 1:
            msg = "Warning: No enrich terms when cutoff = %s" % cutoff
            return msg
        df = df.assign(logAP=-df[colname].apply(np.log10))
        colname = 'logAP'
    # sorting the dataframe for better visualization: ascending order keeps
    # the top_term largest values and places the largest at the top.
    df = df.sort_values(by=colname).iloc[-top_term:, :]

    overlap = df['Overlap'].str.split("/", expand=True).astype(int)
    df = df.assign(Hits=overlap.iloc[:, 0], Background=overlap.iloc[:, 1])

    # x axis values
    x = df.loc[:, colname].values
    combined_score = df['Combined Score'].round().astype('int')
    # y axis index and values
    y = list(range(len(df)))
    ylabels = df['Term'].values

    # control the size of scatter and legend marker
    levels = np.sort(df.Hits.unique())
    if norm is None:
        norm = Normalize()
    elif isinstance(norm, tuple):
        norm = Normalize(*norm)
    elif not isinstance(norm, Normalize):
        err = ("``size_norm`` must be None, tuple, " "or Normalize object.")
        raise ValueError(err)
    min_width, max_width = np.r_[20, 100] * plt.rcParams["lines.linewidth"]
    norm.clip = True
    if not norm.scaled():
        # Calling an unscaled norm once fixes vmin/vmax from the data.
        norm(np.asarray(levels))
    scaled = norm(levels)
    widths = np.asarray(min_width + scaled * (max_width - min_width))
    if scaled.mask.any():
        widths[scaled.mask] = 0
    size_by_hits = dict(zip(levels, widths))
    area = df.Hits.map(size_by_hits).values

    # create scatter plot
    if hasattr(sys, 'ps1') and (ofname is None):
        # working inside python console, show figure
        fig, ax = plt.subplots(figsize=figsize)
    else:
        # If working on commandline, don't show figure
        fig = Figure(figsize=figsize)
        FigureCanvas(fig)  # attaches itself to ``fig``
        ax = fig.add_subplot(111)

    # The colour scale spans the full range of the plotted scores.
    vmin = combined_score.min()
    vmax = combined_score.max()
    sc = ax.scatter(x=x,
                    y=y,
                    s=area,
                    edgecolors='face',
                    c=combined_score,
                    cmap=cmap,
                    vmin=vmin,
                    vmax=vmax)

    if column in ['Adjusted P-value', 'P-value']:
        xlabel = "-log$_{10}$(%s)" % column
    else:
        xlabel = column
    ax.set_xlabel(xlabel, fontsize=14, fontweight='bold')
    ax.yaxis.set_major_locator(plt.FixedLocator(y))
    ax.yaxis.set_major_formatter(plt.FixedFormatter(ylabels))
    ax.set_yticklabels(ylabels, fontsize=16)

    ax.grid()
    # colorbar
    cax = fig.add_axes([0.95, 0.20, 0.03, 0.22])
    cbar = fig.colorbar(sc, cax=cax)
    cbar.ax.tick_params(right=True)
    cbar.ax.set_title('Combined\nScore', loc='left', fontsize=12)

    # Representative rows for the size legend: largest dot, the one closest
    # to the mean size and the smallest (or every row when fewer than three).
    if len(df) >= 3:
        idx = [
            area.argmax(),
            np.abs(area - area.mean()).argmin(),
            area.argmin()
        ]
        idx = unique(idx)
    else:
        idx = range(len(df))
    label = df.iloc[idx, df.columns.get_loc('Hits')]

    if legend:
        # Empty scatters act as proxy artists carrying the real marker sizes.
        legend_markers = [
            ax.scatter([], [], s=area[ix], c='b') for ix in idx
        ]
        ax.legend(legend_markers, label, title='Hits')
    ax.set_title(title, fontsize=20, fontweight='bold')

    if ofname is not None:
        fig.savefig(ofname, bbox_inches='tight', dpi=300)
        return
    return ax