Example #1
def plot_vs_discrete(data_table,
                     discrete_metric_name,
                     metric_name,
                     segment_name,
                     title,
                     ylim=None,
                     aggregate="mean"
                     ):
    data_filtered = \
        data_table.loc[((pd.notnull(data_table[metric_name])) & (pd.notnull(data_table[discrete_metric_name])))][
            [discrete_metric_name, metric_name, segment_name]]

    data_filtered[[metric_name]] = data_filtered[[metric_name]].astype(float)
    result = data_filtered.groupby([discrete_metric_name, segment_name]).agg({metric_name: aggregate}).reset_index()
    result[metric_name] = round(result[metric_name], 3)

    gg_result = plot.ggplot(result) + plot.aes(x=discrete_metric_name,
                                               y=metric_name,
                                               fill=segment_name,
                                               label=metric_name
                                               ) + \
                plot.geom_bar(stat="identity", position="dodge") + \
                plot.geom_text(position=plot.position_dodge(width=.9), size=8) + \
                plot.labs(x=discrete_metric_name, y=aggregate + "(" + metric_name + ")", title=title)

    if ylim is not None:
        gg_result = gg_result + plot.ylim(ylim)

    return gg_result
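A minimal, hypothetical call sketch (not from the original repository): the dataframe and column names below are invented, and plotnine is assumed to be imported as `plot` and pandas as `pd`, as in the function above.

# Invented example data; column names are placeholders.
demo = pd.DataFrame({
    "device": ["phone", "phone", "desktop", "desktop"],
    "segment": ["new", "returning", "new", "returning"],
    "conversion": [0.10, 0.12, 0.20, 0.18],
})
fig = plot_vs_discrete(demo,
                       discrete_metric_name="device",
                       metric_name="conversion",
                       segment_name="segment",
                       title="Conversion by device and segment")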
Example #2
def plot_bargraph(count_plot_df, plot_df):
    """
    Plots the bargraph 
    Arguments:
        count_plot_df - The dataframe that contains lemma counts
        plot_df - the dataframe that contains the odds ratio and lemmas
    """

    graph = (
        p9.ggplot(count_plot_df.astype({"count": int}),
                  p9.aes(x="lemma", y="count")) +
        p9.geom_col(position=p9.position_dodge(width=0.5), fill="#253494") +
        p9.coord_flip() + p9.facet_wrap("repository", scales='free_x') +
        p9.scale_x_discrete(limits=(plot_df.sort_values(
            "odds_ratio", ascending=True).lemma.tolist())) +
        p9.scale_y_continuous(labels=custom_format('{:,.0g}')) +
        p9.labs(x=None) + p9.theme_seaborn(
            context='paper', style="ticks", font="Arial", font_scale=0.95) +
        p9.theme(
            # 640 x 480
            figure_size=(6.66, 5),
            strip_background=p9.element_rect(fill="white"),
            strip_text=p9.element_text(size=12),
            axis_title=p9.element_text(size=12),
            axis_text_x=p9.element_text(size=10),
        ))
    return graph
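A hedged usage sketch; these dataframes are invented stand-ins for the project's real inputs, and `custom_format` is assumed to be the tick-label helper defined elsewhere in the same project.

# Invented inputs: per-repository lemma counts and their odds ratios.
count_plot_df = pd.DataFrame({
    "lemma": ["gene", "cell", "gene", "cell"],
    "count": [120, 80, 60, 140],
    "repository": ["bioRxiv", "bioRxiv", "PMC", "PMC"],
})
plot_df = pd.DataFrame({"lemma": ["gene", "cell"], "odds_ratio": [1.4, 0.7]})
g = plot_bargraph(count_plot_df, plot_df)
# g.save("lemma_counts.png", dpi=300)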
Example #3
def plot_pointplot(plot_df, y_axis_label="", use_log10=False, limits=[0, 3.2]):
    """
    Plots the pointplot
    Arguments:
        plot_df - the dataframe that contains the odds ratio and lemmas
        y_axis_label - the label for the y axis
        use_log10 - use log10 for the y axis?
    """
    graph = (
        p9.ggplot(plot_df, p9.aes(x="lemma", y="odds_ratio")) +
        p9.geom_pointrange(p9.aes(ymin="lower_odds", ymax="upper_odds"),
                           position=p9.position_dodge(width=1),
                           size=0.3,
                           color="#253494") +
        p9.scale_x_discrete(limits=(plot_df.sort_values(
            "odds_ratio", ascending=True).lemma.tolist())) +
        (p9.scale_y_log10() if use_log10 else p9.scale_y_continuous(
            limits=limits)) +
        p9.geom_hline(p9.aes(yintercept=1), linetype='--', color='grey') +
        p9.coord_flip() + p9.theme_seaborn(
            context='paper', style="ticks", font_scale=1, font='Arial') +
        p9.theme(
            # 640 x 480
            figure_size=(6.66, 5),
            panel_grid_minor=p9.element_blank(),
            axis_title=p9.element_text(size=12),
            axis_text_x=p9.element_text(size=10)) +
        p9.labs(x=None, y=y_axis_label))
    return graph
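A similar hedged sketch for the point-range plot; the odds ratios and confidence bounds below are made-up illustrative values.

plot_df = pd.DataFrame({
    "lemma": ["gene", "cell", "protein"],
    "odds_ratio": [1.5, 0.8, 1.1],
    "lower_odds": [1.2, 0.6, 0.9],
    "upper_odds": [1.8, 1.0, 1.3],
})
g = plot_pointplot(plot_df, y_axis_label="Odds ratio")
# g.save("lemma_odds.png", dpi=300)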
Example #4
def test_dodge_preserve_single_text():
    df1 = pd.DataFrame({'x': ['a', 'b', 'b', 'b'], 'y': ['a', 'a', 'b', 'b']})

    d = position_dodge(preserve='single', width=0.9)
    p = (ggplot(df1, aes('x', fill='y')) + geom_bar(position=d) +
         geom_text(aes(y=after_stat('count'), label=after_stat('count')),
                   stat='count',
                   position=d,
                   va='bottom'))
    assert p + _theme == 'dodge_preserve_single_text'
Example #5
File: core.py Project: ozacas/asxtrade
def plot_cumulative_returns(wanted_stocks: Iterable[str],
                            ld: LazyDictionary) -> p9.ggplot:
    df = ld["cip_df"]
    df = df.filter(wanted_stocks, axis=0).filter(regex=r"^\d", axis=1)
    dates = set(df.columns)
    movers = df
    movers["asx_code"] = movers.index
    movers = movers.melt(id_vars="asx_code", value_vars=dates)
    movers = movers[(movers["value"] < -5.0) |
                    (movers["value"] > 5.0)]  # ignore small movers
    # print(movers)
    movers["fetch_date"] = pd.to_datetime(movers["fetch_date"],
                                          format="%Y-%m-%d")

    # need separate dataframes for positive and negative stocks - otherwise the plotnine plot will be wrong
    #print(df)
    pos_df = df.agg([positive_sum])
    neg_df = df.agg([negative_sum])
    pos_df = pos_df.melt(value_vars=dates)
    neg_df = neg_df.melt(value_vars=dates)
    pos_df["fetch_date"] = pd.to_datetime(pos_df["fetch_date"],
                                          format="%Y-%m-%d")
    neg_df["fetch_date"] = pd.to_datetime(neg_df["fetch_date"],
                                          format="%Y-%m-%d")

    plot = (p9.ggplot() + p9.geom_bar(
        p9.aes(x="fetch_date", y="value"),
        data=pos_df,
        stat="identity",
        fill="green",
    ) + p9.geom_bar(
        p9.aes(x="fetch_date", y="value"),
        data=neg_df,
        stat="identity",
        fill="red",
    ) + p9.geom_point(
        p9.aes(
            x="fetch_date",
            y="value",
            fill="asx_code",
        ),
        data=movers,
        size=3,
        position=p9.position_dodge(width=0.4),
        colour="black",
    ))
    return user_theme(
        plot,
        y_axis_label="Cumulative Return (%)",
        legend_position="right",
        asxtrade_want_cmap_d=False,
        asxtrade_want_fill_d=True,  # points (stocks) are filled with the user-chosen theme, but everything else is fixed
    )
Example #6
def plot_result_stats(results, title):
    stats = results.describe().unstack().reset_index().rename(columns={
        "level_0": "metric",
        "level_1": "group",
        0: "value"
    })
    stats = stats[~stats["group"].isin(["count", "min", "max"])]
    stats["value_presentation"] = round(stats["value"], 2)
    plot = (p9.ggplot(stats) + p9.aes("metric", "value", fill="group") +
            p9.geom_col(position="dodge") + p9.theme_bw() +
            p9.coord_cartesian(ylim=[0, 1.0]) + p9.ggtitle(title) +
            p9.geom_text(p9.aes(label="value_presentation"),
                         position=p9.position_dodge(width=0.9),
                         va="bottom"))
    return plot
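A hedged usage sketch: `results` is assumed to be a dataframe with one column per metric and one row per run, so that `describe()` yields the mean/std/quartile groups plotted above; the numbers are invented.

results = pd.DataFrame({
    "precision": [0.71, 0.74, 0.69],
    "recall": [0.55, 0.60, 0.58],
})
fig = plot_result_stats(results, title="Cross-validation summary")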
Example #7
def plot_vs_continuous(data_table,
                       continuous_metric_name,
                       breaks,
                       metric_name,
                       segment_name,
                       title,
                       aggregate="mean"):
    result = _aggregate_vs_continuous(data_table, continuous_metric_name, breaks, metric_name, segment_name, aggregate)
    gg_result = plot.ggplot(result) + plot.aes(x="level_0",
                                               y=metric_name,
                                               fill=segment_name,
                                               label=metric_name
                                               ) + \
                plot.geom_bar(stat="identity", position="dodge") + \
                plot.geom_text(position=plot.position_dodge(width=.9), size=8) + \
                plot.labs(x=continuous_metric_name, y=aggregate + "(" + metric_name + ")", title=title)
    return gg_result
Example #8
def plot_distributions_bar_plot_grid(dataframe, figure_size=(14, 4)):
    """
    Plot a grid of bar plots of stable vs. unstable counts per threshold, faceted by the number of iterations.
    """

    return (
        # Define the plot.
        p9.ggplot(dataframe, p9.aes(x='threshold', fill='value'))
        # Add the bars.
        + p9.geom_bar(position='dodge') +
        p9.geom_text(p9.aes(label='stat(count)'),
                     stat='count',
                     position=p9.position_dodge(0.9),
                     size=7,
                     va='bottom')
        # Rename the x axis.
        + p9.scale_x_discrete(name='Threshold')
        # Rename the y axis, give some space on top and bottom (mul_bottom, add_bottom, mul_top, add_top).
        + p9.scale_y_continuous(name='Count', expand=(0, 0, 0, 500))
        # Replace the names in the legend and set the colors of the bars.
        + p9.scale_fill_manual(values={
            0: '#009e73',
            1: '#d55e00'
        },
                               labels=lambda l: [{
                                   0: 'Stable',
                                   1: 'Unstable'
                               }[x] for x in l])
        # Place the plots in a grid, renaming the labels.
        + p9.facet_grid('. ~ iterations',
                        labeller=p9.labeller(cols=lambda x: f'iters = {x}'))
        # Define the theme for the plot.
        + p9.theme(
            # Remove the y axis name.
            axis_title_y=p9.element_blank(),
            # Set the size of x and y tick labels font.
            axis_text_x=p9.element_text(size=7),
            axis_text_y=p9.element_text(size=7),
            # Place the legend on top, without title, and reduce the margin.
            legend_title=p9.element_blank(),
            legend_position='top',
            legend_box_margin=2,
            # Set the size for the figure.
            figure_size=figure_size,
        ))
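A hedged example of the expected input: one row per observation with a `threshold`, a binary stability flag in `value` (0 = stable, 1 = unstable) and an `iterations` facet column; all values are invented.

demo = pd.DataFrame({
    "threshold": ["0.1", "0.1", "0.5", "0.5"] * 2,
    "value": [0, 1, 0, 1] * 2,
    "iterations": [10] * 4 + [100] * 4,
})
# Treat the flag as categorical so the manual fill scale applies cleanly.
demo = demo.astype({"value": "category"})
fig = plot_distributions_bar_plot_grid(demo, figure_size=(10, 4))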
Example #9
def main():
    """Run CLI."""
    parser = argparse.ArgumentParser(description="""
            Calculate and compare LISI across a series of reduced dims and
            categorical variables.
            """)

    parser.add_argument(
        '-v',
        '--version',
        action='version',
        version='%(prog)s {version}'.format(version=__version__))

    # parser.add_argument(
    #     '-h5', '--h5_anndata',
    #     action='store',
    #     dest='h5',
    #     required=True,
    #     help='H5 AnnData file.'
    # )

    parser.add_argument(
        '-rf',
        '--reduced_dims_tsv',
        action='store',
        dest='reduced_dims',
        required=True,
        help='List of tab-delimited files of reduced dimensions (e.g., PCs)\
            for each cell. First column is cell_barcode. List should be\
            split by "::" (e.g. file1.tsv.gz::file2.tsv.gz).')

    parser.add_argument(
        '-lbl',
        '--reduced_dims_tsv_labels',
        action='store',
        dest='reduced_dims_labels',
        required=True,
        help='String of labels for each reduced_dims_tsv file. List should be\
            split by "::".')

    parser.add_argument(
        '-mf',
        '--metadata_tsv',
        action='store',
        dest='metadata_tsv',
        required=True,
        help='Tab-delimited file of metadata for each cell. First column\
            is cell_barcode.')

    parser.add_argument(
        '-mv',
        '--metadata_columns',
        action='store',
        dest='metadata_columns',
        default='experiment_id',
        help='Comma separated string of categorical variables to calculate\
            LISI with.\
            (default: %(default)s)')

    parser.add_argument('-p',
                        '--perplexity',
                        action='store',
                        dest='perplexity',
                        default=30.0,
                        type=float,
                        help='Perplexity.\
            (default: %(default)s)')

    parser.add_argument(
        '-of',
        '--output_file',
        action='store',
        dest='of',
        default='',
        help='Basename of output files, assuming output in current working \
            directory.\
            (default: <metadata_tsv>-lisi)')

    options = parser.parse_args()

    # Fixed settings.
    # verbose = True

    # Get the out file base.
    out_file_base = options.of
    if out_file_base == '':
        base = os.path.basename(options.metadata_tsv)
        if base.endswith('.tsv.gz'):  # remove the suffix, not a character set
            base = base[:-len('.tsv.gz')]
        out_file_base = '{}-lisi'.format(base)

    # Get the columns to use
    lisi_columns = options.metadata_columns.split(',')
    # lisi_columns = ['experiment_id', 'batch']
    lisi_columns_dtype = dict(
        zip(lisi_columns, ['category'] * len(lisi_columns)))

    # Load the metadata file
    file_meta = options.metadata_tsv
    df_meta = pd.read_csv(file_meta,
                          sep='\t',
                          index_col='cell_barcode',
                          dtype=lisi_columns_dtype)

    # Load the reduced dims.
    files = options.reduced_dims.split('::')
    labels = options.reduced_dims_labels.split('::')
    assert len(files) == len(labels), 'ERROR: check files and labels input'

    # Make a dict of theoretical maximum LISI value for each label.
    lisi_limit = {}
    for col in lisi_columns:
        n_cat = len(df_meta[col].cat.categories)
        lisi_limit[col] = n_cat

    list_lisi = []
    for i in range(len(files)):
        df_reduced_dims = pd.read_csv(files[i],
                                      sep='\t',
                                      index_col='cell_barcode')

        # Run lisi and save results to dataframe
        _df_lisi = pd.DataFrame(hm.compute_lisi(
            df_reduced_dims.loc[df_meta.index, :], df_meta[lisi_columns],
            lisi_columns),
                                columns=lisi_columns)
        _df_lisi['file'] = files[i]
        _df_lisi['label'] = labels[i]
        _df_lisi['cell_barcode'] = df_meta.index
        list_lisi.append(_df_lisi)

    # Make one long dataframe.
    df_lisi = pd.concat(list_lisi)
    # Make cell_barcode the first column.
    cols = list(df_lisi.columns)
    cols = [cols[-1]] + cols[:-1]

    # Save the results
    df_lisi[cols].to_csv('{}.tsv.gz'.format(out_file_base),
                         sep='\t',
                         index=False,
                         quoting=csv.QUOTE_NONNUMERIC,
                         na_rep='',
                         compression='gzip')

    # Compare the lisi distributions
    n_labels = len(labels)
    for lisi_column in lisi_columns:
        # Make density plot.
        gplt = plt9.ggplot(df_lisi,
                           plt9.aes(
                               fill='label',
                               x='label',
                               y=lisi_column,
                           ))
        gplt = gplt + plt9.theme_bw(base_size=12)
        gplt = gplt + plt9.geom_violin(alpha=0.9)
        gplt = gplt + plt9.geom_boxplot(
            group='label',
            position=plt9.position_dodge(width=.9),
            width=.1,
            fill='white',
            outlier_alpha=0  # Do not know how to totally remove outliers.
        )
        # Add a line at the theoretical maximum
        gplt = gplt + plt9.geom_hline(
            plt9.aes(yintercept=lisi_limit[lisi_column]))
        # gplt = gplt + plt9.facet_grid('{} ~ .'.format(label))
        gplt = gplt + plt9.labs(x='Reduced dimensions', y='LISI', title='')
        gplt = gplt + plt9.theme(
            axis_text_x=plt9.element_text(angle=-45, hjust=0))
        gplt = gplt + plt9.theme(legend_position='none')
        if n_labels != 0 and n_labels < 9:
            gplt = gplt + plt9.scale_fill_brewer(palette='Dark2', type='qual')
        gplt.save(
            '{}-{}-violin.png'.format(out_file_base, lisi_column),
            dpi=300,
            width=4 * (n_labels / 4),
            height=10,
            # height=4*(n_samples/4),
            limitsize=False)

        # Make ecdf.
        gplt = plt9.ggplot(df_lisi, plt9.aes(
            x=lisi_column,
            color='label',
        ))
        gplt = gplt + plt9.theme_bw(base_size=12)
        gplt = gplt + plt9.stat_ecdf(alpha=0.8)
        gplt = gplt + plt9.labs(
            x='LISI',
            y='Cumulative density',
            # color='Reduction',
            title='')
        if n_labels != 0 and n_labels < 9:
            gplt = gplt + plt9.scale_color_brewer(palette='Dark2', type='qual')
        gplt.save('{}-{}-ecdf.pdf'.format(out_file_base, lisi_column),
                  dpi=300,
                  width=10,
                  height=4,
                  limitsize=False)
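A hedged invocation sketch; the script and input filenames are placeholders, while the flags and the `::` / comma separators come from the argument parser above.

import sys

# Placeholder filenames for two reduced-dimension embeddings and their labels.
sys.argv = [
    "compare_lisi.py",
    "--reduced_dims_tsv", "pca.tsv.gz::harmony.tsv.gz",
    "--reduced_dims_tsv_labels", "PCA::Harmony",
    "--metadata_tsv", "metadata.tsv.gz",
    "--metadata_columns", "experiment_id,batch",
]
main()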
Example #10
metadata_df["author_type"].value_counts()

# # BioRxiv Research Article Categories

# Categories assigned to each research article. Neuroscience accounts for the majority of the articles, as expected.

# In[9]:

category_list = metadata_df.category.value_counts().index.tolist()[::-1]

# plotnine doesn't implement a reverse keyword for scale_x_discrete
# ugh...
g = (
    p9.ggplot(metadata_df, p9.aes(x="category")) +
    p9.geom_bar(size=10, fill="#253494", position=p9.position_dodge(width=3)) +
    p9.scale_x_discrete(limits=category_list) + p9.coord_flip() +
    p9.theme_seaborn(
        context="paper", style="ticks", font="Arial", font_scale=1))
g.save("output/figures/preprint_category.png", dpi=500)
print(g)

# In[10]:

metadata_df["category"].value_counts()

# # New, Confirmatory, Contradictory Results?

# In[11]:

heading_list = metadata_df.heading.value_counts().index.tolist()[::-1]
Example #11
def test_dodge_preserve_single():
    df1 = pd.DataFrame({'x': ['a', 'b', 'b'], 'y': ['a', 'a', 'b']})
    p = (ggplot(df1, aes('x', fill='y')) +
         geom_bar(position=position_dodge(preserve='single')))
    assert p + _theme == 'dodge_preserve_single'
Example #12
                                                         max_depth=5,
                                                         min_samples_split=2,
                                                         max_features=5,
                                                         n_jobs=n_threads)

        sklearn_forest.fit(X, y)
        current_timing = (time.time() - start_time)
        if n >= n_burn_in:
            timing_data['implementation'].append('scikit-learn 0.23.1')
            timing_data['threads'].append(n_threads)
            timing_data['timing'].append(current_timing)

df = pd.DataFrame(data=timing_data)
df = df.groupby(['implementation', 'threads']).agg(['mean',
                                                    'std']).reset_index()
df.columns = ['Implementation', 'threads', 'mean', 'std']
print(df)

df['error_min'] = df['mean'] - df['std']
df['error_max'] = df['mean'] + df['std']
p = (ggplot(
    df,
    aes(x='threads', y='mean', group='Implementation',
        color='Implementation')) + geom_line() + geom_point() +
     geom_errorbar(aes(ymin='error_min', ymax='error_max'),
                   width=.2,
                   position=position_dodge(0.05)) +
     labs(x="Number of threads", y="timing [s]"))

p.save(filename='benchmark.png')
Example #13
gg_rep_act.save(os.path.join(dir_output, 'gg_rep_act.png'), width=8, height=4)

di_notes = {
    'chi2': 'χ2-correction',
    'insig': 'Erroneous',
    'specification': 'Specification',
    'non-replicable': 'Inconsistent'
}
# (ii) Breakdown of counts
tmp = acc_tt.merge(
    res_fisher.tt.value_counts().reset_index().rename(columns={
        'index': 'tt',
        'tt': 'n_lit'
    }))
tmp = tmp.assign(tt=lambda x: x.tt.map(di_tt),
                 notes=lambda x: x.notes.map(di_notes),
                 share=lambda x: x.n / x.n_lit)

gg_acc_notes = (
    pn.ggplot(tmp, pn.aes(x='notes', y='share', fill='tt')) + pn.theme_bw() +
    pn.scale_y_continuous(labels=percent_format(), limits=[0, 0.1]) +
    pn.scale_fill_discrete(name='Literature') +
    pn.geom_col(color='black', position=pn.position_dodge(0.5), width=0.5) +
    pn.labs(y='Percent', x='Investigation') +
    pn.theme(axis_text_x=pn.element_text(angle=45),
             axis_title_x=pn.element_blank()))
gg_acc_notes.save(os.path.join(dir_output, 'gg_acc_notes.png'),
                  width=7,
                  height=3)

print('~~~ End of 4_results_insig.py ~~~')
Example #14
# ## Global View of PCA plot

# In[5]:

g = (p9.ggplot(biorxiv_pca_method_section_df) +
     p9.aes(x="pca1", y="pca2", color="category") + p9.geom_point() +
     p9.theme_bw() + p9.labs(title="TSNE Methods Section (300 dim)"))
print(g)

# ## Neuroscience Methods Section

# In[6]:

g = (p9.ggplot(biorxiv_pca_method_section_df.query("category=='neuroscience'"))
     + p9.aes(x="pca1", y="pca2", color="section") +
     p9.geom_point(position=p9.position_dodge(width=0.2)) +
     p9.facet_wrap("section") + p9.theme_bw() +
     p9.theme(subplots_adjust={'wspace': 0.10}) +
     p9.scale_color_manual({
         "has_methods": "#d8b365",
         "no_methods": "#5ab4ac"
     }) + p9.labs(title="Neuroscience Methods Section"))
g.save("output/pca/neuroscience_missing_methods.png", dpi=500)
print(g)

# In[7]:

g = (p9.ggplot(biorxiv_pca_method_section_df.query("category=='neuroscience'"))
     + p9.aes(x="pca1", y="pca2", color="section") +
     p9.geom_point(position=p9.position_dodge(width=0.2)) + p9.theme_bw() +
     p9.scale_color_manual({
Example #15
        if group is None:
            g += p9.geom_crossbar(p9.aes(x="x",
                                         y='center',
                                         ymin='low',
                                         ymax='high'),
                                  colour=ez_colors(1)[0],
                                  na_rm=False)
        else:
            g += p9.geom_crossbar(p9.aes(x="x",
                                         y='center',
                                         ymin='low',
                                         ymax='high',
                                         group="factor(group_x)",
                                         colour="factor(group)",
                                         fill="factor(group)"),
                                  position=p9.position_dodge(
                                      0.7, preserve='single'),
                                  na_rm=True,
                                  alpha=0.2)

            g += p9.scale_fill_manual(values=ez_colors(g.n_groups('group')))
            g += p9.scale_colour_manual(values=ez_colors(g.n_groups('group')))

    elif geom == 'ribbon':

        g = EZPlot(gdata.dropna())

        # set groups
        if group is None:
            g += p9.geom_ribbon(p9.aes(x="x",
                                       y='center',
                                       ymin='low',
Example #16
def box_plot(df,
             x,
             y,
             group = None,
             facet_x = None,
             facet_y = None,
             dodge_groups=True,
             base_size = 10,
             figure_size = (6,3),
             **kwargs):
    '''
    Aggregates data in df and plots it as a box plot.

    Parameters
    ----------
    df : pd.DataFrame
      input dataframe
    x : str
      quoted expression to be plotted on the x axis
    y : str
      quoted expression to be plotted on the y axis
    group : str
      quoted expression to be used as group (ie color)
    facet_x : str
      quoted expression to be used as facet
    facet_y : str
      quoted expression to be used as facet
    dodge_groups : bool
      whether boxes for different groups are dodged at the same x position
    base_size : int
      base size for theme_ez
    figure_size :tuple of int
      figure size
    **kwargs : kwargs
      additional kwargs for geom_boxplot

    Returns
    -------
    g : EZPlot
      EZplot object

    '''

    # create a copy of the data
    dataframe = df.copy()

    # define groups and variables; remove and store (eventual) names
    names = {}
    groups = {}
    variables = {}

    for label, var in zip(['x', 'group', 'facet_x', 'facet_y'], [x, group, facet_x, facet_y]):
        names[label], groups[label] = unname(var)
    names['y'], variables['y'] = unname(y)

    # fix special cases
    if x == '.index':
        groups['x'] = '.index'
        names['x'] = dataframe.index.name if dataframe.index.name is not None else ''

    # aggregate data and reorder columns
    gdata = agg_data(dataframe, variables, groups, None, fill_groups=True)
    gdata = gdata[[c for c in ['x', 'y', 'group', 'facet_x', 'facet_y'] if c in gdata.columns]]

    # add group_x column
    if group is not None:
        gdata['group_x'] = gdata['group'].astype('str') + '_' + gdata['x'].astype(str)

    g = EZPlot(gdata)

    # set groups
    if group is None:
        g += p9.geom_boxplot(p9.aes(x="factor(x)", y="y", group="factor(x)"),
                             colour = ez_colors(1)[0],
                             na_rm = False,
                             **kwargs)
    else:
        if dodge_groups:
            g += p9.geom_boxplot(p9.aes(x="factor(x)", y="y", group="factor(group_x)", fill="factor(group)"),
                                 position=p9.position_dodge(0.9, preserve='single'),
                                 na_rm = True,
                                 **kwargs)
        else:
            g += p9.geom_boxplot(p9.aes(x="factor(x)", y="y", group="factor(group_x)", fill="factor(group)"),
                                 na_rm = True,
                                 **kwargs)
        g += p9.scale_fill_manual(values=ez_colors(g.n_groups('group')))

    # set facets
    if facet_x is not None and facet_y is None:
        g += p9.facet_wrap('~facet_x')
    if facet_x is not None and facet_y is not None:
        g += p9.facet_grid('facet_y~facet_x')

    # set x scale
    g += p9.scale_x_discrete()

    # set y scale
    g += p9.scale_y_continuous(labels=ez_labels)

    # set axis labels
    g += \
        p9.xlab(names['x']) + \
        p9.ylab(names['y'])

    # set theme
    g += theme_ez(figure_size = figure_size,
                  base_size = base_size,
                  legend_title=p9.element_text(text=names['group'], size=base_size))

    return g
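A hedged usage sketch, assuming the surrounding ezplot helpers (EZPlot, agg_data, unname, ez_colors, theme_ez) are importable; the dataframe is invented.

demo = pd.DataFrame({
    "month": ["Jan", "Jan", "Feb", "Feb", "Jan", "Jan", "Feb", "Feb"],
    "region": ["north", "south"] * 4,
    "sales": [100.0, 95.0, 110.0, 102.0, 98.0, 97.0, 112.0, 99.0],
})
g = box_plot(demo, x="month", y="sales", group="region", figure_size=(6, 3))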
Example #17
    def barchart_make(roi, df, list_rois, config, ylimit, save_function,
                      find_ylim_function):
        thisroi = list_rois[roi]

        current_df = df.loc[df['index'] == thisroi]

        current_df = current_df.sort_values([config.single_roi_fig_x_axis])
        current_df = current_df.reset_index(
            drop=True)  # Reset index to remove grouping
        current_df[config.single_roi_fig_x_axis] = pd.Categorical(
            current_df[config.single_roi_fig_x_axis],
            categories=current_df[config.single_roi_fig_x_axis].unique())

        figure = (
            pltn.ggplot(
                current_df,
                pltn.aes(x=config.single_roi_fig_x_axis,
                         y='Mean',
                         ymin="Mean-Conf_Int_95",
                         ymax="Mean+Conf_Int_95",
                         fill='factor({colour})'.format(
                             colour=config.single_roi_fig_colour))) +
            pltn.theme_538() + pltn.geom_col(position=pltn.position_dodge(
                preserve='single', width=0.8),
                                             width=0.8,
                                             na_rm=True) +
            pltn.geom_errorbar(size=1,
                               position=pltn.position_dodge(
                                   preserve='single', width=0.8)) +
            pltn.labs(x=config.single_roi_fig_label_x,
                      y=config.single_roi_fig_label_y,
                      fill=config.single_roi_fig_label_fill) +
            pltn.scale_x_discrete(labels=[]) +
            pltn.theme(panel_grid_major_x=pltn.element_line(alpha=0),
                       axis_title_x=pltn.element_text(
                           weight='bold', color='black', size=20),
                       axis_title_y=pltn.element_text(
                           weight='bold', color='black', size=20),
                       axis_text_y=pltn.element_text(size=20, color='black'),
                       legend_title=pltn.element_text(size=20, color='black'),
                       legend_text=pltn.element_text(size=18, color='black'),
                       subplots_adjust={'right': 0.85},
                       legend_position=(0.9, 0.8),
                       dpi=config.plot_dpi) +
            pltn.geom_text(pltn.aes(y=-.7, label=config.single_roi_fig_x_axis),
                           color='black',
                           size=20,
                           va='top') + pltn.scale_fill_manual(
                               values=config.colorblind_friendly_plot_colours))

        if ylimit:
            # Set y limit of figure (used to make it the same for every barchart)
            figure += pltn.ylim(None, ylimit)
            thisroi += '_same_ylim'

        returned_ylim = 0
        if config.use_same_axis_limits in ('Same limits',
                                           'Create both') and ylimit == 0:
            returned_ylim = find_ylim_function(thisroi, figure, 'yaxis')

        if config.use_same_axis_limits == 'Same limits' and ylimit == 0:
            return returned_ylim
        elif ylimit != 0:
            folder = 'Same_yaxis'
        else:
            folder = 'Different_yaxis'

        save_function(figure, thisroi, config, folder, 'barchart')

        return returned_ylim
Example #18
        rel
    })
edges_df = pd.DataFrame.from_records(datarows)
edges_df

# In[11]:

import math
g = (p9.ggplot(edges_df, p9.aes(x="relation", y="edges", fill="in_hetionet")) +
     p9.geom_col(position="dodge") +
     p9.scale_fill_manual(values={
         "Existing": color_map["Existing"],
         "Novel": color_map["Novel"]
     }) + p9.geom_text(p9.aes(label=(
         edges_df.apply(lambda x: f"{x['edges']}\n({x['recall']*100:.0f}%)"
                        if not math.isnan(x['recall']) else f"{x['edges']}",
                        axis=1))),
                       position=p9.position_dodge(width=0.9),
                       size=9,
                       va="bottom") + p9.scale_y_log10() +
     p9.labs(y="# of Edges",
             x="Relation Type",
             title="Reconstructing Edges in Hetionet") +
     p9.guides(fill=p9.guide_legend(title="In Hetionet?")) + p9.theme(
         axis_text_y=p9.element_blank(),
         axis_ticks_major=p9.element_blank(),
         rect=p9.element_blank(),
     ))
print(g)
g.save(filename="../edges_added.png", dpi=300)
Example #19
        arti_start = time.time()
        df, separated_peaks = er.proof_artificial(
            model,
            ad_partial,
            region_length=parameters['pad_to'],
            nb_datasets=parameters['artificial_nb_datasets'],
            nb_tfs=parameters['artificial_nb_tfs'],
            n_iter=500,
            squish_factor=parameters['squish_factor'])
        arti_end = time.time()
        print('Artificial data generalisation completed in ' +
              str(arti_end - arti_start) + ' s')

        # The plots
        a = ggplot(df, aes(x="type", y="rebuilt_value", fill="tf_group"))
        a1 = a + geom_violin(position=position_dodge(1), width=1)
        a2 = a + geom_boxplot(position=position_dodge(1), width=0.5)
        b = ggplot(df, aes(
            x="brothers", y="rebuilt_value",
            group="brothers")) + scale_fill_grey() + geom_boxplot(width=0.4)

        a2.save(filename=plot_output_path +
                'artifical_data_systematisation_value_per_type.png',
                height=10,
                width=14,
                units='in',
                dpi=400,
                verbose=False)
        b.save(filename=plot_output_path +
               'artifical_data_systematisation_value_per_brothers.png',
               height=10,
Example #20
def do_PCA(df, allowed_nas, groups, paired, outbasename, name, extra=None):
    """
    PCA analysis from PSIs quantifications based on the 2000 events with
    greatest variance. Additionally, plots of principal components based
    on the defined groups are drawn

    :param pd.DataFrame df: Ready df with per-event PSIs to calculate
    principal components
    :param int allowed_nas: Allowed NAs. If > 0, imputations will be
    performed based on the row mean (PCA doesn't accept missing values)
    :param dict groups: Dictionary with info about each sample
    and the group they represent
    :param bool paired: Whether samples of each group are paired (if so,
    order of samples per group must be preserved)
    :param str outbasename: Output basename
    :param str name: str to add to output (e.g. software name)
    :param str extra: Extra str to add (e.g. rMATS event type)
    :return:
    """
    print("Number of {} events that will be used in the PCA: {}".format(
        name, df.shape[0]))
    if allowed_nas > 0:
        print("Doing imputation of missing values")
        df = df.apply(lambda x: x.fillna(x.mean()), axis=1)

    df = df.loc[df.var(axis=1).nlargest(2000).index, ]
    pca = PCA(n_components=3)
    PCs = pca.fit_transform(df.T)
    print(
        "Amount of variance that the first PCs contain for {} data: {}".format(
            name, pca.explained_variance_ratio_))

    PC_df = pd.DataFrame(data=PCs,
                         columns=['PC1', 'PC2', 'PC3'],
                         index=df.T.index)

    cols_groups_df = ['Group', 'Ind'] if paired else ['Group']
    groups_df = pd.DataFrame.from_dict(groups,
                                       orient='index',
                                       columns=cols_groups_df)

    PC_df = PC_df.merge(groups_df, left_index=True,
                        right_index=True).rename_axis('Sample').reset_index()

    for pc_pair in [("PC1", "PC2"), ("PC2", "PC3")]:
        if paired:
            p1 = (p9.ggplot(
                PC_df,
                p9.aes(x=pc_pair[0], y=pc_pair[1], fill="Group", shape='Ind'))
                  + p9.geom_point(color="black",
                                  size=6,
                                  alpha=0.7,
                                  position=p9.position_dodge(
                                      width=0.3, preserve="total")))
        else:
            p1 = (p9.ggplot(
                PC_df,
                p9.aes(
                    x=pc_pair[0], y=pc_pair[1], fill="Group", shape='Sample'))
                  + p9.geom_point(color="black",
                                  size=6,
                                  alpha=0.7,
                                  position=p9.position_dodge(
                                      width=0.3, preserve="total")))

        if extra is not None:
            output = "{}_{}_{}_{}vs{}.pdf".format(outbasename, extra, name,
                                                  pc_pair[0], pc_pair[1])
        else:
            output = "{}_{}_{}vs{}.pdf".format(outbasename, name, pc_pair[0],
                                               pc_pair[1])
        p1.save(output, verbose=False)
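A hedged call sketch for do_PCA; the PSI matrix (rows = events, columns = samples) and the sample-to-group mapping below are invented to show the expected shapes.

psi = pd.DataFrame(
    [[0.10, 0.20, 0.80, 0.90],
     [0.30, 0.40, 0.70, 0.60],
     [0.50, 0.50, 0.20, 0.10],
     [0.90, 0.80, 0.30, 0.20],
     [0.20, 0.10, 0.60, 0.70],
     [0.40, 0.60, 0.50, 0.30]],
    columns=["ctrl_1", "ctrl_2", "treat_1", "treat_2"],
    index=["event_{}".format(i) for i in range(6)],
)
sample_groups = {"ctrl_1": "control", "ctrl_2": "control",
                 "treat_1": "treated", "treat_2": "treated"}
do_PCA(psi, allowed_nas=0, groups=sample_groups, paired=False,
       outbasename="demo", name="rmats")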
Example #21
    pdtypes.CategoricalDtype())

# Using only combis of each individual length

all_lengths = sorted(set(df['combi_length']))

# No point in going beyond, roughly, 12
all_lengths = [l for l in all_lengths if l <= 12]

for length in all_lengths:

    df_filtered = df.loc[df['combi_length'] == length, :]

    try:
        p = (ggplot(data=df_filtered, mapping=aes(x='entropy', y='fc')) +
             geom_violin(position=position_dodge(1), width=1) +
             geom_boxplot(position=position_dodge(1), width=0.25) +
             xlab("Entropy") + ylab("Fold change (log2)") +
             ggtitle("Order " + str(length)))

        p.save(filename=ROOT_PATH + "entropy_graph/entropy_length_" +
               str(length) + "_fc.png")

        p = (ggplot(data=df_filtered, mapping=aes(x='entropy', y='s')) +
             geom_violin(position=position_dodge(1), width=1) +
             geom_boxplot(position=position_dodge(1), width=0.25) +
             xlab("Entropy") + ylab("True total overlapping bp.") +
             ggtitle("Order " + str(length)))

        p.save(filename=ROOT_PATH + "entropy_graph/entropy_length_" +
               str(length) + "_s.png")
Example #22
def main():
    """Run CLI."""
    parser = argparse.ArgumentParser(description="""
            Fits logistic regression to predict labels.
            """)

    parser.add_argument(
        '-v',
        '--version',
        action='version',
        version='%(prog)s {version}'.format(version=__version__))

    parser.add_argument(
        '-h5',
        '--h5_anndata',
        action='store',
        dest='h5',
        required=True,
        help='H5 AnnData file where clusters have been saved to cluster slot.')

    # parser.add_argument(
    #     '-ncpu', '--number_cpu',
    #     action='store',
    #     dest='number_cpu',
    #     default=50,
    #     type=int,
    #     help='Number of CPUs to use. Since we are testing the dask backend,\
    #         this corresponds to the number of CPUs available across all of\
    #         the worker jobs we spin out.\
    #         (default: %(default)s)'
    # )

    parser.add_argument('-s',
                        '--sparsity_l1',
                        action='store',
                        dest='sparsity_l1',
                        default=0.0001,
                        type=float,
                        help='Smaller values specify stronger regularization.\
            (default: %(default)s)')

    parser.add_argument('-nepoch',
                        '--number_epoch',
                        action='store',
                        dest='number_epoch',
                        default=25,
                        type=int,
                        help='Number of epochs.\
            (default: %(default)s)')

    parser.add_argument(
        '-bs',
        '--batch_size',
        action='store',
        dest='batch_size',
        default=32,
        type=int,
        help='Batch size. Divides the dataset into n batches and updates the\
            weights at the end of each one.\
            (default: %(default)s)')

    parser.add_argument(
        '-tsc',
        '--train_size_cells',
        action='store',
        dest='train_size_cells',
        default=0,
        type=int,
        help='Number of cells to use for training set. If > 0 all\
            remaining cells not randomly selected for training will be used\
            for the test set. Overrides <train_size_fraction>.\
            (default: %(default)s)')

    parser.add_argument('-tsf',
                        '--train_size_fraction',
                        action='store',
                        dest='train_size_fraction',
                        default=0.67,
                        type=float,
                        help='Fraction of the data to use for training set.\
            (default: %(default)s)')

    parser.add_argument(
        '--dict_add',
        action='store',
        dest='dict_add',
        default='',
        type=str,
        help='Additional information to add to output model_report.\
            Format: key::value:::key2::value2.\
            Example: method::leiden:::resolution::3.0\
            (default: %(default)s)')

    parser.add_argument('--grid_search',
                        action='store_true',
                        dest='grid_search',
                        default=False,
                        help='Run a grid search of hyperparameters.\
            (default: %(default)s)')

    parser.add_argument('--memory_limit',
                        action='store',
                        dest='memory_limit',
                        default=50,
                        type=int,
                        help='Memory limit in Gb.\
            (default: %(default)s)')

    parser.add_argument(
        '-of',
        '--output_file',
        action='store',
        dest='of',
        default='',
        help='Basename of output files, assuming output in current working \
            directory.\
            (default: keras_model-<params>)')
    options = parser.parse_args()

    verbose = True

    # Set GPU memory limits
    gpus = tf.config.list_physical_devices('GPU')
    print(gpus)
    if gpus:
        # For TF v1
        # config = tf.ConfigProto()
        # config.gpu_options.allow_growth = True
        # session = tf.Session(config=config)

        # For TF v2
        try:
            # Method 1:
            # Currently, memory growth needs to be the same across GPUs
            for gpu in gpus:
                tf.config.experimental.set_memory_growth(gpu, True)

            # Method 2:
            # Restrict TensorFlow to only allocate 1GB of memory on the first
            # GPU
            # tf.config.experimental.set_virtual_device_configuration(
            #     gpus[0],
            #     [tf.config.experimental.VirtualDeviceConfiguration(
            #         memory_limit=options.memory_limit*1024
            #     )])
            # logical_gpus = tf.config.list_logical_devices('GPU')
            # print(
            #     len(gpus),
            #     "Physical GPUs,",
            #     len(logical_gpus),
            #     "Logical GPUs"
            # )
        except RuntimeError as e:
            # Virtual devices must be set before GPUs have been initialized
            print(e)
    else:
        raise Exception('ERROR: no GPUs detected.')

    # Get additional data we are going to append to the output model info
    dict_add = {}
    if options.dict_add != '':
        for item in options.dict_add.split(':::'):
            _tmp = item.split('::')
            if len(_tmp) != 2:
                raise Exception('ERROR: check dict_add.')
            else:
                dict_add[_tmp[0]] = _tmp[1]
    print(dict_add)

    # Load the AnnData file.
    # This file should already have clusters identified and saved to the
    # clusters slot.
    adata = sc.read_h5ad(filename=options.h5)

    # Set X to cp10k
    # adata.X = np.expm1(adata.layers['log1p_cp10k'])
    # Set X to ln(cp10k+1)
    # NOTE: Testing with 100k TI dataset, we were able to achieve higher
    # accuracy with log1p_cp10k - likely because of the better spread in the distribution.
    adata.X = adata.layers['log1p_cp10k']
    # Set X to raw counts
    # adata.X = adata.layers['counts']

    # Add some info from adata to dict_add
    for key, value in adata.uns['neighbors']['params'].items():
        dict_add['neighbors__{}'.format(key)] = value
    for key, value in adata.uns['cluster']['params'].items():
        dict_add['cluster__{}'.format(key)] = value

    # If train_size_cells, override the fraction so that the total number of
    # cells in the training set will be equal to train_size_cells.
    train_size_fraction = options.train_size_fraction
    if options.train_size_cells > 0:
        if options.train_size_cells >= adata.n_obs:
            raise Exception('Invalid train_size_cells.')
        train_size_fraction = (
            1 - ((adata.n_obs - options.train_size_cells) / adata.n_obs))
        if verbose:
            print(
                'Set train_size_fraction to: {}.'.format(train_size_fraction))
    if verbose:
        print('Number cells training ({}) and testing ({}).'.format(
            int(train_size_fraction * adata.n_obs),
            int((1 - train_size_fraction) * adata.n_obs)))

    # Set X and y
    X = adata.X
    y = adata.obs['cluster'].values

    # Set other variables
    sparsity_l1 = options.sparsity_l1
    n_epochs = options.number_epoch
    batch_size = options.batch_size

    # Center and scale the data
    if sp.sparse.issparse(X):
        X = X.todense()
    X_std = X
    scaler = preprocessing.StandardScaler(with_mean=True, with_std=True)
    X_std = scaler.fit_transform(X)
    if verbose:
        print('center={} scale={}'.format(True, True))

    # One hot encode y (the cell type classes)
    # encode class values as integers
    encoder = preprocessing.LabelEncoder()
    encoder.fit(y)
    print('Found {} clusters'.format(len(encoder.classes_)))

    # Define the model
    # NOTE: Defaults determined via grid search of 160k TI single cells
    def classification_model(optimizer='sgd',
                             activation='softmax',
                             loss='categorical_crossentropy',
                             sparsity_l1__activity=0.0001,
                             sparsity_l2__activity=0.0,
                             sparsity_l1__kernel=0.0,
                             sparsity_l2__kernel=0.0,
                             sparsity_l1__bias=0.0,
                             sparsity_l2__bias=0.0):
        # create model
        model = Sequential()
        # Use a “softmax” activation function in the output layer. This is to
        # ensure the output values are in the range of 0 and 1 and may be used
        # as predicted probabilities.
        #
        # https://developers.google.com/machine-learning/crash-course/multi-class-neural-networks/softmax
        # Softmax assigns decimal probabilities to each class in a multi-class
        # problem. Those decimal probabilities must add up to 1.0. This
        # additional constraint helps training converge more quickly than it
        # otherwise would. Softmax is implemented through a neural network
        # layer just before the output layer. The Softmax layer must have the
        # same number of nodes as the output layer.
        # Softmax assumes that each example is a member of exactly one class.
        #
        # Softmax should be used for multi-class prediction with single label
        # https://developers.google.com/machine-learning/crash-course/multi-class-neural-networks/video-lecture
        # NOTE: input dimension = number of features your data has
        model.add(
            Dense(
                len(encoder.classes_),  # output dim is number of classes
                use_bias=True,  # intercept
                activation=activation,  # softmax, sigmoid
                activity_regularizer=L1L2(l1=sparsity_l1__activity,
                                          l2=sparsity_l2__activity),
                kernel_regularizer=L1L2(l1=sparsity_l1__kernel,
                                        l2=sparsity_l2__kernel),
                bias_regularizer=L1L2(l1=sparsity_l1__bias,
                                      l2=sparsity_l2__bias),
                input_dim=X.shape[1]))
        # Example of adding additional layers
        # model.add(Dense(8, input_dim=4, activation='relu'))
        # model.add(Dense(3, activation='softmax'))

        # Metrics to check out over training epochs
        mets = [
            # loss,
            keras.metrics.CategoricalAccuracy(name='categorical_accuracy'),
            # keras.metrics.TruePositives(name='tp'),
            # keras.metrics.FalsePositives(name='fp'),
            # keras.metrics.TrueNegatives(name='tn'),
            # keras.metrics.FalseNegatives(name='fn'),
            # keras.metrics.Precision(name='precision'),
            # keras.metrics.Recall(name='recall'),
            # keras.metrics.AUC(name='auc'),
            keras.metrics.BinaryAccuracy(name='accuracy')
        ]
        # Use Adam gradient descent optimization algorithm with a logarithmic
        # loss function, which is called “categorical_crossentropy” in Keras.
        # UPDATE: sgd works better empirically.
        model.compile(
            optimizer=optimizer,  # adam, sgd
            loss=loss,
            metrics=mets)

        return model

    # Now, either call a grid search or specific model fit
    if options.grid_search:
        # Get the out file base.
        out_file_base = options.of
        if out_file_base == '':
            out_file_base = 'keras_model'
        out_file_base = '{}-grid_search'.format(out_file_base)

        # Call grid search of various parameters
        grid_result, df_grid_result = keras_grid(
            model_function=classification_model,
            encoder=encoder,
            X_std=X_std,
            y=y,
            n_epochs=n_epochs,
            batch_size=batch_size)

        # NOTE: This will fail because can't pickle KerasClassifier. This is
        # fine though because results are saved in tsv.gz format below.
        # Save the results
        # out_f = '{}-grid_result.gz'.format(out_file_base)
        # joblib.dump(
        #     grid_result,
        #     out_f,
        #     compress=('gzip', 3)
        # )
        # Load the model
        # lr = joblib.load(
        #     'test-lr_model.joblib.gz'
        # )
        # print(lr)

        # Save the results of our search to tsv
        out_f = '{}-grid_result.tsv.gz'.format(out_file_base)
        df_grid_result.to_csv(out_f,
                              sep='\t',
                              index=False,
                              quoting=csv.QUOTE_NONNUMERIC,
                              na_rep='',
                              compression=compression_opts)

        # Add a single columns that summarizes params
        param_columns = [
            col for col in df_grid_result.columns if 'param__' in col
        ]
        df_grid_result['params'] = df_grid_result[param_columns].astype(
            str).apply(lambda x: '-'.join(x), axis=1)

        # Plot the distribution of accuracy across folds
        split_columns = [
            col for col in df_grid_result.columns if 'split' in col
        ]
        split_columns = [col for col in split_columns if '_test_score' in col]
        df_plt = pd.melt(df_grid_result,
                         id_vars=['params'],
                         value_vars=split_columns)
        gplt = plt9.ggplot(df_plt, plt9.aes(x='params', y='value'))
        gplt = gplt + plt9.theme_bw()
        gplt = gplt + plt9.geom_boxplot(alpha=0.8)
        gplt = gplt + plt9.geom_jitter(alpha=0.75)
        gplt = gplt + plt9.scale_y_continuous(
            # trans='log10',
            # labels=comma_labels,
            minor_breaks=0
            # limits=[0, 1]
        )
        gplt = gplt + plt9.labs(x='Parameters', y='Score', title='')
        gplt = gplt + plt9.theme(
            axis_text_x=plt9.element_text(angle=-45, hjust=0))
        gplt.save('{}-score.png'.format(out_file_base),
                  dpi=300,
                  width=10,
                  height=4,
                  limitsize=False)

        # Plot the mean time and std err for fitting results
        gplt = plt9.ggplot(df_grid_result,
                           plt9.aes(x='params', y='mean_fit_time'))
        gplt = gplt + plt9.theme_bw()
        gplt = gplt + plt9.geom_point()
        gplt = gplt + plt9.geom_errorbar(plt9.aes(
            ymin='mean_fit_time-std_fit_time',
            ymax='mean_fit_time+std_fit_time'),
                                         width=0.2,
                                         position=plt9.position_dodge(0.05))
        gplt = gplt + plt9.scale_y_continuous(
            # trans='log10',
            # labels=comma_labels,
            minor_breaks=0)
        gplt = gplt + plt9.labs(x='Parameters', y='Mean fit time', title='')
        gplt = gplt + plt9.theme(
            axis_text_x=plt9.element_text(angle=-45, hjust=0))
        gplt.save('{}-fit_time.png'.format(out_file_base),
                  dpi=300,
                  width=10,
                  height=4,
                  limitsize=False)

    else:
        # Get the out file base.
        out_file_base = options.of
        if out_file_base == '':
            out_file_base = 'keras_model'
            # out_file_base = '{}-center={}-scale={}'.format(
            #     out_file_base,
            #     center,
            #     scale
            # )
            out_file_base = '{}-batch_size={}-epochs={}'.format(
                out_file_base, batch_size, n_epochs)
            out_file_base = '{}-sparsity_l1={}-train_size_fraction={}'.format(
                out_file_base,
                str(sparsity_l1).replace('.', 'pt'),
                str(train_size_fraction).replace('.', 'pt'))

        # Fit the specific model and save the results
        model, model_report, y_prob_df, history = fit_model_keras(
            model_function=classification_model,
            encoder=encoder,
            X_std=X_std,
            y=y,
            sparsity_l1=sparsity_l1,
            sparsity_l2=0.0,
            n_epochs=n_epochs,
            batch_size=batch_size,
            train_size_fraction=train_size_fraction)

        # Save the model, weights (coefficients), and bias (intercept)
        model.save('{}.h5'.format(out_file_base),
                   overwrite=True,
                   include_optimizer=True)

        # Save the model and weights (coefficients) separately
        # open('{}.json'.format(out_file_base), 'w').write(model.to_json())
        open('{}.yml'.format(out_file_base), 'w').write(model.to_yaml())
        model.save_weights('{}-weights.h5'.format(out_file_base))
        # Example read functions
        # model = model_from_yaml(open('my_model_architecture.yaml').read())
        # model.load_weights('my_model_weights.h5')

        # Save the model report
        # Add column telling us if this is cluster or summary value
        is_cluster = []
        for i in model_report.index:
            if i in encoder.classes_:
                is_cluster.append(True)
            else:
                is_cluster.append(False)
        model_report['is_cluster'] = is_cluster
        # Add in extra data
        model_report['sparsity_l1'] = sparsity_l1
        if dict_add:
            for key, value in dict_add.items():
                model_report[key] = value
        print(model_report)
        out_f = '{}-model_report.tsv.gz'.format(out_file_base)
        model_report.to_csv(out_f,
                            sep='\t',
                            index=True,
                            index_label='cell_label',
                            quoting=csv.QUOTE_NONNUMERIC,
                            na_rep='',
                            compression=compression_opts)
        if verbose:
            print('Completed: save {}.'.format(out_f))

        # Save the test results - each row is a cell and the columns are the
        # prob of that cell belonging to a particular class.
        # Add in extra data
        y_prob_df['sparsity_l1'] = sparsity_l1
        if dict_add:
            for key, value in dict_add.items():
                y_prob_df[key] = value
        out_f = '{}-test_result.tsv.gz'.format(out_file_base)
        y_prob_df.to_csv(
            out_f,
            sep='\t',
            index=False,  # NOTE: Not adding the label to test_result index.
            # index_label='cell_label',
            quoting=csv.QUOTE_NONNUMERIC,
            na_rep='',
            compression=compression_opts)
        if verbose:
            print('Completed: save {}.'.format(out_f))

        # Make a matrix of weights per gene
        # Columns = genes tested and rows = cell type label
        weight, bias = model.layers[-1].get_weights()
        # weight, bias = model.get_layer("output").get_weights()
        df_weights = pd.DataFrame.from_records(
            weight,
            index=adata.var.index,  # index is gene
            columns=encoder.classes_)
        # Save the weights dataframe.
        out_f = '{}-weights.tsv.gz'.format(out_file_base)
        df_weights.to_csv(out_f,
                          sep='\t',
                          index=True,
                          index_label='ensembl_gene_id',
                          quoting=csv.QUOTE_NONNUMERIC,
                          na_rep='',
                          compression=compression_opts)
        if verbose:
            print('Completed: save {}.'.format(out_f))

        # Plot the number of features with non-zero coefficients in each
        # cluster.
        out_f = '{}-n_features.png'.format(out_file_base)
        df_plt = pd.DataFrame({
            'classes': df_weights.columns,
            'features': (df_weights != 0).sum(axis=0)
        })
        df_plt = df_plt.set_index('classes')
        # print(df_plt)
        # Add in categories with no predictive model (e.g., because they were
        # too few in training).
        for i in adata.obs['cluster'].cat.categories:
            if i not in df_plt.index:
                df_plt = df_plt.append(
                    pd.Series([0], index=df_plt.columns, name=i))
        fig = plt.figure(figsize=(max(0.5 * len(df_plt.index), 5), 4))
        # plt.bar(lr.classes_, n_features)
        plt.bar(df_plt.index, df_plt['features'])
        plt.xlabel('Cluster')
        plt.ylabel('Features with coefficient != 0')
        plt.xticks(rotation=90)
        for i in df_plt.index:
            plt.annotate(str(df_plt.loc[i, 'features']),
                         xy=(i, df_plt.loc[i, 'features']))
        fig.savefig(out_f, dpi=300, bbox_inches='tight')
        plt.close(fig)

        # Plot ROC of the test and truth.
        out_f = '{}-roc.png'.format(out_file_base)
        fig = plt.figure()
        cell_label_true = y_prob_df.pop('cell_label_true')
        # Drop columns that are not cell type labels
        for i in y_prob_df.columns:
            if 'class__' not in i:
                del y_prob_df[i]
        plot_roc(y_prob_df.values, cell_label_true.values, y_prob_df.columns)
        fig.savefig(out_f, dpi=300, bbox_inches='tight')
        plt.close(fig)
        if verbose:
            print('Completed: save {}.'.format(out_f))

        # Plot metrics vs cluster size to see if smaller clusters have poorer
        # metric measures.
        df_plt = model_report.fillna(0)
        for i in df_plt.index:
            if i not in encoder.classes_:
                df_plt = df_plt.drop(i)
        for i in ['AUC', 'f1-score', 'average_precision_score', 'MCC']:
            out_f = '{}-cluster_size_{}.png'.format(out_file_base, i)
            fig = plt.figure()
            plt.scatter(df_plt['n_cells_full_dataset'], df_plt[i], alpha=0.5)
            plt.xlabel('Number of cells in cluster (full dataset)')
            plt.ylabel(i)
            if i in ['AUC', 'f1-score', 'average_precision_score']:
                plt.ylim(0, 1)
            elif i == 'MCC':
                plt.ylim(-1, 1)
            # Add annotation of the cluster
            for index, row in df_plt.iterrows():
                if row['n_cells_full_dataset'] == 0:
                    print('ERROR: n_cells_full_dataset = 0 for {}.'.format(
                        index))
                plt.annotate(
                    index,  # this is the text
                    (row['n_cells_full_dataset'], row[i]),  # point to label
                    textcoords='offset points',  # how to position the text
                    xytext=(0, 10),  # distance from text to points (x,y)
                    ha='center'  # horiz alignment can be left, right, center
                )
            fig.savefig(out_f, dpi=300, bbox_inches='tight')
            plt.xscale('log', basex=10)
            fig.savefig('{}-cluster_size_{}_log10.png'.format(
                out_file_base, i),
                        dpi=300,
                        bbox_inches='tight')
            plt.close(fig)
            if verbose:
                print('Completed: save {}.'.format(out_f))

        # Plot history of metrics over epochs
        for dat_i in history.history.keys():
            fig = plt.figure()
            plt.plot(history.history[dat_i])
            plt.ylabel(dat_i)
            plt.xlabel('Epoch')
            fig.savefig('{}-model_iter_{}.png'.format(out_file_base, dat_i),
                        dpi=300,
                        bbox_inches='tight')
            plt.close(fig)
Example #23
    int(grouped_candidates_pred_df.hetionet.value_counts()[1]),
    "relation":
    "DaG"
})
datarows.append({
    "edges": (grouped_candidates_pred_df.query(
        "pred_max > 0.5").hetionet.value_counts()[0]),
    "in_hetionet":
    "Novel",
    "relation":
    "DaG"
})
edges_df = pd.DataFrame.from_records(datarows)
edges_df

# In[20]:

g = (p9.ggplot(edges_df, p9.aes(x="relation", y="edges", fill="in_hetionet")) +
     p9.geom_col(position="dodge") + p9.geom_text(p9.aes(label=(
         edges_df.apply(lambda x: f"{x['edges']} ({x['recall']*100:.0f}%)"
                        if not math.isnan(x['recall']) else f"{x['edges']}",
                        axis=1))),
                                                  position=p9.position_dodge(
                                                      width=1),
                                                  size=9,
                                                  va="bottom") +
     p9.scale_y_log10() + p9.theme(axis_text_y=p9.element_blank(),
                                   axis_ticks_major=p9.element_blank(),
                                   rect=p9.element_blank()))
print(g)
Example #24
        # Different thresholds for the two cell lines

        if CELL_LINE == 'jurkat':
            if val > 0: toadd = 'a__0-0.5'
            if val > 0.5: toadd = 'b__0.5-1'
            if val > 1: toadd = 'c__1-2'
            if val > 2: toadd = 'd__2-3'
            if val > 3: toadd = 'e__3+'

        if CELL_LINE == "mcf7":
            if val > 0: toadd = 'a__0-3'
            if val > 3: toadd = 'b__3-10'
            if val > 5: toadd = 'c__5-10'
            if val > 10: toadd = 'd__10+'

        update_ratio_binarized += [toadd]

    sub['update_ratio_bin'] = update_ratio_binarized
    sub['sqrt_peak_score'] = np.sqrt(sub['peak_score'])

    # Now do violin plot
    p4 = (ggplot(data=sub[0:10000],
                 mapping=aes(x='update_ratio_bin', y='peak_score')) +
          geom_violin(position=position_dodge(1), width=1) + scale_y_log10() +
          geom_boxplot(position=position_dodge(1), width=0.25))

    p4.save(
        FIGURE_DIRECTORY +
        "peak_confirmation_nb_update_ratio_well_characterized_crm_violin_plot.pdf",
        verbose=False)
Example #25
best_result = list(filter(lambda x: x[1] == model.C_, enumerate(model.Cs_)))[0]
print(best_result)

print("Best CV Fold")
print(model.scores_["polka"][:, best_result[0]])
model.scores_["polka"][:, best_result[0]].mean()

model_weights_df = pd.DataFrame.from_dict({
    "weight": model.coef_[0],
    "pc": list(range(1, 51)),
})
model_weights_df["pc"] = pd.Categorical(model_weights_df["pc"])
model_weights_df.head()

g = (p9.ggplot(model_weights_df, p9.aes(x="pc", y="weight")) +
     p9.geom_col(position=p9.position_dodge(width=5), fill="#253494") +
     p9.coord_flip() +
     p9.scale_x_discrete(limits=list(sorted(range(1, 51), reverse=True))) +
     p9.theme_seaborn(
         context="paper", style="ticks", font_scale=1.1, font="Arial") +
     p9.theme(figure_size=(10, 8)) + p9.labs(title="Regression Model Weights",
                                             x="Princpial Component",
                                             y="Model Weight"))
# g.save("output/figures/pca_log_regression_weights.svg")
# g.save("output/figures/pca_log_regression_weights.png", dpi=250)
print(g)

fold_features = model.coefs_paths_["polka"].transpose(1, 0, 2)
model_performance_df = pd.DataFrame.from_dict({
    "feat_num": ((fold_features.astype(bool).sum(axis=1)) > 0).sum(axis=1),
    "C":
Example #26
def test_dodge_preserve_single():
    df1 = pd.DataFrame({'x': ['a', 'b', 'b'],
                        'y': ['a', 'a', 'b']})
    p = (ggplot(df1, aes('x', fill='y')) +
         geom_bar(position=position_dodge(preserve='single')))
    assert p + _theme == 'dodge_preserve_single'