示例#1
0
文件: protobowl.py 项目: Pinafore/qb
def plot():
    """Write per-user summary plots of the protobowl records to PDF files.

    The records returned by ``load_protobowl`` are averaged per ``uid`` and
    four figures are saved under ``output/protobowl/``: a scatter of accuracy
    against average buzzing position, followed by density histograms of the
    log record count, the accuracy and the average buzzing position.
    Progress messages are printed after each figure.
    """
    out_root = 'output/protobowl/'
    pathlib.Path(out_root).mkdir(parents=True, exist_ok=True)

    records = load_protobowl()
    # Only a literal True counts as a correct answer.
    records.result = records.result.apply(lambda outcome: outcome is True)
    records['log_n_records'] = records.user_n_records.apply(np.log)

    per_user = records.groupby('uid').agg(np.mean)
    print('{} users'.format(len(per_user)))
    print('{} records'.format(len(records)))

    # Scale the log record count into [.., 1] so it can drive transparency.
    top_log = per_user.log_n_records.max()
    per_user['alpha'] = pd.Series(
        per_user.log_n_records.apply(lambda value: value / top_log),
        index=per_user.index)

    # 2D user plot
    point_layer = geom_point(
        aes(x='relative_position', y='result', size='user_n_records',
            color='log_n_records', alpha='alpha'),
        show_legend={'color': False, 'alpha': False, 'size': False})
    scatter = (ggplot(per_user)
               + point_layer
               + scale_color_gradient(high='#e31a1c', low='#ffffcc')
               + labs(x='Average buzzing position', y='Accuracy')
               + theme(aspect_ratio=1))
    scatter.save(os.path.join(out_root, 'protobowl_users.pdf'))
    print('p0 done')

    # (tag, column, outline color, fill color, x label, file name)
    histograms = (
        ('p1', 'log_n_records', '#e6550d', '#fee6ce',
         'Log number of records', 'protobowl_hist.pdf'),
        ('p2', 'result', '#31a354', '#e5f5e0',
         'Accuracy', 'protobowl_acc.pdf'),
        ('p3', 'relative_position', '#3182bd', '#deebf7',
         'Average buzzing position', 'protobowl_pos.pdf'),
    )
    for tag, column, outline, fill, x_label, pdf_name in histograms:
        hist = (ggplot(per_user, aes(x=column, y='..density..'))
                + geom_histogram(color=outline, fill=fill)
                + geom_density()
                + labs(x=x_label, y='Density')
                + theme(aspect_ratio=0.3))
        hist.save(os.path.join(out_root, pdf_name))
        print('{} done'.format(tag))
示例#2
0
 def plot(i):
     """Build the i-th panel: ``colors[i]`` drives color when i == 2, stroke otherwise."""
     value = colors[i]
     aesthetic = 'color' if i == 2 else 'stroke'
     # The same aesthetic name is used for the mapping, its limits and its label.
     base = qplot(x, y, xlab='x', ylab='y', **{aesthetic: value})
     scaled = base + lims(**{aesthetic: (1, 7)}) + labs(**{aesthetic: aesthetic})
     return scaled + theme_minimal()
示例#3
0
 def plot(i):
     """Return the i-th panel: a minimal-theme qplot colored by ``colors[i]``."""
     figure = qplot(x, y, color=colors[i], xlab='x', ylab='y')
     additions = (
         lims(color=(1, 7)),
         labs(color='color'),
         theme_minimal(),
         _theme,
     )
     for addition in additions:
         figure = figure + addition
     return figure
示例#4
0
    def test_theme_linedraw(self):
        """Compare the base plot with the linedraw theme against 'theme_linedraw'."""
        titled = self.g + labs(title='Theme Linedraw')
        themed = titled + theme_linedraw()

        # Small displacement in title on Python 2, hence the extra 'tol' entry
        expected = ('theme_linedraw', {'tol': 8}) if six.PY2 else 'theme_linedraw'
        assert themed + _theme == expected
示例#5
0
    def plot(i):
        """Build the i-th panel; panel 2 has no color mapping, the others use ``colors[i]``."""
        if i == 2:
            return qplot(x, y, xlab='x', ylab='y') + theme_minimal()

        colored = qplot(x, y, color=colors[i], xlab='x', ylab='y')
        colored = colored + lims(color=(1, 7)) + labs(color='color')
        return colored + theme_minimal()
示例#6
0
    def test_theme_xkcd(self):
        """Compare the base plot with the xkcd theme against its reference name."""
        titled = self.g + labs(title='Theme Xkcd')
        themed = titled + theme_xkcd() + _theme

        if not os.environ.get('TRAVIS'):
            assert themed == 'theme_xkcd'
            return

        # Travis does not have the fonts, we still check
        # to catch any other errors
        assert themed != 'theme_gray'
示例#7
0
 def plot(i):
     """Build the i-th panel; panel 2 uses color limits (3, 7), the others (1, 7)."""
     lower = 3 if i == 2 else 1
     figure = qplot(x, y, color=colors[i], xlab='x', ylab='y')
     figure = figure + lims(color=(lower, 7)) + labs(color='color')
     return figure + theme_minimal() + _theme
示例#8
0
def test_geometries(tmpdir):
    """Plot a generated shapefile with three ``geom_map`` layers and compare to 'geometries'."""
    shapefile = '{0}/test_file.shp'.format(tmpdir)
    _create_test_input_files(shapefile)

    frame = GeoDataFrame.from_file(shapefile)
    additions = (
        aes(fill='geometry.bounds.miny'),
        geom_map(),
        geom_map(draw='Point', size=4),
        geom_map(draw='LineString', size=2),
        labs(fill='miny'),
    )
    figure = ggplot(frame)
    for addition in additions:
        figure = figure + addition

    assert figure + _theme == 'geometries'
示例#9
0
    def fit_curve(self):
        """Fit a cubic polynomial of weight against relative buzzing position.

        The records from ``load_protobowl`` are sorted by ``relative_position``.
        For each position, truncated to 1/1000 resolution, the weight is
        ``1 - c / n``, where ``c`` is the number of correct answers among the
        records sorted before the last record of that bucket and ``n`` is the
        total number of records. A degree-3 polynomial regression is fitted
        to these points, its coefficients are printed, and a plot of the
        points with the fitted function is saved to
        ``output/reporting/curve_score.pdf`` and drawn.

        Returns:
            The fitted ``Pipeline`` (polynomial features + linear regression).

        Raises:
            ValueError: if there are no records to fit.
        """
        import os

        df, _ = load_protobowl()
        # convert prompt to false
        df.result = df.result.apply(lambda x: x is True)

        xy = sorted(zip(df.relative_position.tolist(), df.result.tolist()),
                    key=lambda pair: pair[0])
        if not xy:
            raise ValueError('No protobowl records to fit a curve on')

        # Running count of correct answers seen before each record; a later
        # record in the same bucket overwrites the earlier value.
        ratios = dict()
        cnt = 0
        for x, y in xy:
            ratios[int(x * 1000)] = cnt
            cnt += y
        ratios = [(x / 1000, y) for x, y in sorted(ratios.items())]

        # NOTE(review): the original first computed the number of correct
        # answers and immediately overwrote it with the number of records.
        # The record count is what was actually used, so it is kept here;
        # confirm which denominator is intended.
        n_records = len(xy)
        curve = [(x, 1 - y / n_records) for x, y in ratios]

        X = np.asarray([position for position, _ in curve])
        y = np.asarray([weight for _, weight in curve])
        degree = 3
        polynomial_features = PolynomialFeatures(degree=degree, include_bias=False)
        linear_regression = LinearRegression()
        pipeline = Pipeline([("polynomial_features", polynomial_features),
                             ("linear_regression", linear_regression)])
        pipeline.fit(X[:, np.newaxis], y)
        print(pipeline.steps[1][1].coef_)

        def get_weight(x):
            return pipeline.predict(np.asarray([[x]]))[0]

        ddf = pd.DataFrame({'x': X, 'y': y})
        p0 = ggplot(ddf, aes(x='x', y='y')) \
            + geom_point(size=0.3, color='blue', alpha=0.5, shape='+') \
            + stat_function(fun=get_weight, color='red', size=2, alpha=0.5) \
            + labs(x='Position', y='Weight')
        # Saving fails if the target directory is missing.
        os.makedirs('output/reporting', exist_ok=True)
        p0.save('output/reporting/curve_score.pdf')
        p0.draw()

        return pipeline
示例#10
0
def test_rect_aesthetics():
    """Stack five ``geom_rect`` layers, each shifted up by 2, and compare to 'rect-aesthetics'."""
    base = ggplot(df, aes(xmin='xmin', xmax='xmax',
                          ymin='ymin', ymax='ymax'))
    additions = (
        geom_rect(),
        geom_rect(aes(ymin='ymin+2', ymax='ymax+2', alpha='z'),
                  show_legend=False),
        geom_rect(aes(ymin='ymin+4', ymax='ymax+4', fill='factor(z)')),
        geom_rect(aes(ymin='ymin+6', ymax='ymax+6', color='factor(z+1)'),
                  size=2),
        geom_rect(aes(ymin='ymin+8', ymax='ymax+8', linetype='factor(z+2)'),
                  color='yellow', size=2),
        _theme,
        # for comparison with geom_tile which
        # has labels by default
        labs(x='x', y='y'),
    )

    p = base
    for addition in additions:
        p = p + addition

    assert p == 'rect-aesthetics'
        'aupr_upper':
        lambda x: x.aupr_mean +
        (critical_val * x.aupr_std) / pd.np.sqrt(x.lf_num_len),
        'aupr_lower':
        lambda x: x.aupr_mean -
        (critical_val * x.aupr_std) / pd.np.sqrt(x.lf_num_len)
    }))
dev_set_stats_df

# In[9]:

# Mean AUROC per label-function count, one line per model, with error bars
# spanning auroc_lower..auroc_upper.
(
    p9.ggplot(
        dev_set_stats_df,
        p9.aes(x="factor(lf_num)", y="auroc_mean", color="model"),
    )
    + p9.geom_point()
    + p9.geom_line(p9.aes(group="model"))
    + p9.geom_errorbar(
        p9.aes(ymin="auroc_lower", ymax="auroc_upper", group="model")
    )
    + p9.theme_seaborn()
    + p9.labs(title="DaG Tune Set AUROC", color="Model")
    + p9.scale_color_manual({"disc_model": "blue", "gen_model": "orange"})
    + p9.scale_y_continuous(limits=[0.4, 0.75])
)

# In[10]:

(p9.ggplot(dev_set_stats_df,
           p9.aes(x="factor(lf_num)", y="aupr_mean", color="model")) +
 p9.geom_point() + p9.geom_line(p9.aes(group="model")) + p9.geom_errorbar(
     p9.aes(ymin="aupr_lower", ymax="aupr_upper", group="model")) +
 p9.theme_seaborn() + p9.labs(title="DaG Tune Set AUPR", color="Model") +
 p9.scale_color_manual({
     "disc_model": "blue",
     "gen_model": "orange"
        yintercept=0.4196, linetype="solid", color=color_mapper["2018"]) +
    p9.geom_hline(yintercept=published / posted,
                  linetype="solid",
                  color=color_mapper["2020ML"]) +
    p9.annotate("text", x=8.5, y=0.395, label="overall: 0.4196", size=14) +
    p9.annotate("text",
                x=8.5,
                y=0.48,
                label=f"overall: {published/posted:.4f}",
                size=14) +
    p9.theme_seaborn(
        style="ticks", context="paper", font="Arial", font_scale=2) + p9.theme(
            figure_size=(11, 6.5),
            axis_text_x=p9.element_blank(),
            axis_title_x=p9.element_text(margin={"t": 15}),
        ) + p9.labs(y="Proportion Published", x="Month"))
# Write the figure in both vector and raster form, then display it.
g.save("output/figures/publication_rate.svg")
g.save("output/figures/publication_rate.png", dpi=250)
print(g)

# # Plot Publication Rate

# +
# Make the month column an ordered categorical.
publish_rate_df["pub_month"] = pd.Categorical(
    publish_rate_df["pub_month"].values.tolist(),
    ordered=True,
)

# Total posted count for the '2020 Snapshot+Missing Links' label,
# restricted to months before 2019-01.
posted_recency_adj = (
    publish_rate_df
    .query("label=='2020 Snapshot+Missing Links'")
    .query("pub_month < '2019-01'")
    ["posted"]
    .sum()
)

published_recency_adj = (
示例#13
0
# ``pd.np`` was removed in pandas 2.0, so numpy is imported directly.
import numpy as np

similarity_score_df

# In[10]:

print("Similarity between input vs permuted data is {}".format(permuted_score))

# In[16]:

# Plot
# One row per experiment count, each holding the permuted score, so the
# threshold can be drawn as a dashed line.
threshold = pd.DataFrame(np.tile(permuted_score,
                                 (len(lst_num_experiments), 1)),
                         index=lst_num_experiments,
                         columns=['score'])

g = (ggplot(similarity_score_df, aes(x=lst_num_experiments, y='score'))
     + geom_line()
     + geom_line(aes(x=lst_num_experiments, y='score'), threshold,
                 linetype='dashed')
     + labs(x = "Number of Experiments",
            y = "Similarity score (SVCCA)",
            title = "Similarity after correcting for experiment variation")
     + theme_bw()
     + theme(plot_title=element_text(weight='bold')))

print(g)
ggsave(plot=g, filename=svcca_file, dpi=300)

# In[17]:

# Plot - black
threshold = pd.DataFrame(np.tile(permuted_score,
                                 (len(lst_num_experiments), 1)),
                         index=lst_num_experiments,
                         columns=['score'])
    cv_results_df = cv_results_df.append(df)
    
# Best mean test score for every (alpha, feature set) combination.
cv_results_summary = (cv_results_df
    .groupby(['classify__alpha', 'feature_set'])['mean_test_score']
    .max()
    .reset_index())


# In[17]:

(gg.ggplot(cv_results_summary, gg.aes(x='classify__alpha',
                                      y='mean_test_score',
                                      color='feature_set'))
 + gg.geom_jitter(size=4, alpha=0.8, height=0, width=0.05)
 + gg.scale_x_log10()
 + gg.labs(x='Regularization strength multiplier (log alpha)',
           y='CV AUROC')
 # The feature set is mapped to color, so the legend title has to be set
 # on the color guide; a fill guide has no legend to act on here.
 + gg.guides(color=gg.guide_legend(title="Feature Set"))
 + gg.aes(ymin=min(0.5, cv_results_summary['mean_test_score'].min()), ymax=1)
 + theme_cognoma()
)


# ## Use optimal hyperparameters to output ROC curve

# In[18]:

y_pred_dict = {
    model: {
        'train': pipeline.decision_function(X_train),
        'test':  pipeline.decision_function(X_test)
    } for model, pipeline in cv_pipelines.items()
示例#15
0
print(rpkm_data.shape)

# In[6]:

# 0-1 normalize per gene, then transpose the scaled frame
rnaseq_scaled_df = pd.DataFrame(
    preprocessing.MinMaxScaler().fit_transform(rpkm_data),
    columns=rpkm_data.columns,
    index=rpkm_data.index,
).T

rnaseq_scaled_df.head()

# In[7]:

# UMAP embedding of original input data
model = umap.UMAP(random_state=randomState).fit(rnaseq_scaled_df.T)

input_data_UMAPencoded = model.transform(rnaseq_scaled_df.T)
input_data_UMAPencoded_df = pd.DataFrame(
    data=input_data_UMAPencoded,
    index=rnaseq_scaled_df.T.index,
    columns=['1', '2'],
)

g_input = (
    ggplot(input_data_UMAPencoded_df, aes(x='1', y='2'))
    + geom_point(alpha=0.3)
    + labs(x="UMAP 1", y="UMAP 2", title="Input data")
)
print(g_input)

# In[8]:

# Save scaled data
rnaseq_scaled_df.to_csv(out_data_file, sep='\t', compression='xz')
示例#16
0
    def show_prediction(
        self,
        samples,
        plot_samples: bool = True,
        plot_fitted: bool = False,
        percent_kept: float = 0.95,
        side_cut_from: str = "both",
        show_community: bool = False,
        num_samples: int = 1000,
        **kwargs,
    ):
        """
        Plot a prediction on the true question scale, optionally next to a
        sample of community predictions.

        :param samples: samples from a distribution answering the prediction
            question (true scale). Either a 1-d collection holding one model's
            predictions, or a pandas DataFrame with one column per model
        :param plot_samples: whether to plot the raw samples
        :param plot_fitted: whether to fit a submission to the samples and
            plot samples drawn from it. Only supported for 1-d samples
        :param percent_kept: percentage of the sample distribution to keep
        :param side_cut_from: which side to cut tails from,
            either 'both', 'lower', or 'upper'
        :param show_community: whether to add community predictions
            for comparison
        :param num_samples: number of samples to draw for the fitted and
            community columns; replaced by the number of rows in ``samples``
            when ``plot_samples`` is True
        :param kwargs: additional plotting parameters
        :raises ValueError: if nothing was requested to plot, if ``samples``
            has an unsupported type, or if ``plot_fitted`` is combined with a
            multi-column DataFrame
        """
        if not (plot_fitted or plot_samples):
            raise ValueError(
                "Nothing to plot. Niether plot_fitted nor plot_samples was True"
            )

        df = pd.DataFrame()

        if plot_samples:
            if isinstance(samples, list):
                samples = pd.Series(samples)
            if type(samples) not in ArrayLikes:
                raise ValueError(
                    "Samples should be a list, numpy array or pandas series"
                )
            num_samples = samples.shape[0]

            if type(samples) == pd.DataFrame:
                if plot_fitted and samples.shape[1] > 1:
                    raise ValueError(
                        "For multiple predictions comparisons, only samples can be compared (plot_fitted must be False)"
                    )
                per_source = {col: samples[col] for col in samples}
            else:
                per_source = {"samples": samples}

            for source, values in per_source.items():
                # use numpy array to ensure df doesn't become read-only
                df[source] = onp.array(self.scale.normalize_points(values))

        if plot_fitted:
            prediction = self.get_submission_from_samples(samples)
            fitted_draws = [prediction.sample() for _ in range(num_samples)]
            df["fitted"] = pd.Series(fitted_draws)

        if show_community:
            df["community"] = [  # type: ignore
                self.sample_normalized_community() for _ in range(num_samples)
            ]

        # get domain for graph given the percentage of distribution kept
        central = self.get_central_quantiles(
            df, percent_kept=percent_kept, side_cut_from=side_cut_from,
        )
        xmin, xmax = self.scale.denormalize_points(central)

        for col in df:
            df[col] = self.scale.denormalize_points(df[col])

        df = pd.melt(df, var_name="sources", value_name="samples")  # type: ignore

        title = self.plot_title
        if show_community:
            title = title + "\n\nPrediction vs Community"

        plot = self.comparison_plot(df, xmin, xmax, **kwargs) + labs(
            x="Prediction", y="Density", title=title,
        )
        try:
            plot.draw()  # type: ignore
        except RuntimeError as err:
            print(err)
            print(
                "The plot was unable to automatically determine a bandwidth. You can manually specify one with the keyword 'bw', e.g., show_prediction(..., bw=.1)"
            )
示例#17
0
def pseudotime_lineplot(adata,
                        y,
                        facet=True,
                        alpha=1,
                        smoothness=0.3,
                        size=1,
                        color='black',
                        ncol=2,
                        lab_ypos=2):
    """Plots a line plot of pseudotime vs one or multiple variables

    Parameters
    --------------
    adata: AnnData
        The AnnData object being used for the analysis. Must be previously
        evaluated by `tl.pseudotime`.
    y: str or list
        If y is a str, it must be a column of adata.obs or a name in
        adata.var_names and is used as the y-axis. Otherwise y is treated
        as a collection of such names, which are plotted on a shared
        y-axis with one colored line per variable.
    facet: bool
        Whether to return a facetted plot or all signatures in a single plot.
        Only used if y is a list.
    alpha: float
        A value between 0 and 1 passed to geom_smooth. Only used if y is
        a str.
    smoothness: float
        A value passed to geom_smooth as span. Controls how smooth the
        LOESS regression will be.
    size: float
        Controls the line width of the smooth line.
    color: str
        Only used if y is a str. Either a column of adata.obs, in which
        case the line color is mapped to that column, or a supported
        color name.
    ncol: int
        Number of columns in the facetting, if facet=True. Ignored otherwise.
    lab_ypos: float
        Controls the y-axis position of the cell cycle phase annotation, if present.

    Returns
    -------------
    A plotnine line plot of pseudotime.

    Raises
    -------------
    ValueError
        If y (or every entry of y) is found neither in adata.obs nor in
        adata.var_names.
    """
    if isinstance(y, str):
        #-- Get data
        if y in adata.obs.columns:
            y_values = adata.obs[y]
        elif y in adata.var_names:
            y_values = adata[:, y].X.flatten()
        else:
            raise ValueError('`y` variable not found')
        plot_df = pd.DataFrame({'x': adata.obs['pseudotime'], 'y': y_values})

        #-- Make plot
        if color in adata.obs.columns:
            # The color mapping is looked up in plot_df, so the annotation
            # has to be copied into it; without this the mapping refers to
            # a column that does not exist.
            if color not in plot_df.columns:
                plot_df[color] = adata.obs[color].values
            smooth = geom_smooth(aes(color=color),
                                 method='loess',
                                 size=size,
                                 alpha=alpha,
                                 se=False,
                                 span=smoothness)
        else:
            smooth = geom_smooth(method='loess',
                                 size=size,
                                 alpha=alpha,
                                 color=color,
                                 se=False,
                                 span=smoothness)
        time_line = (ggplot(plot_df, aes(x='x', y='y')) + smooth +
                     labs(x='Pseudotime', y=y) + theme_std)

    else:
        #-- Make multiple color plot
        sannot = pd.DataFrame({'pseudotime': adata.obs['pseudotime']})
        sannot['id'] = range(sannot.shape[0])
        #-- Checks
        # dtype=bool keeps the mask valid even when `y` is empty
        in_var = np.array([var in adata.var_names for var in y], dtype=bool)
        in_obs = np.array([var in adata.obs.columns.values for var in y],
                          dtype=bool)
        found = in_var | in_obs
        y_arr = np.array(y)
        if not np.any(found):
            raise ValueError('No variables in `y` found.')
        if not np.all(found):
            warnings.warn('Variable not found! Dropping: ' +
                          ', '.join(y_arr[~found]))
            y = y_arr[found]
        #-- Get y from obs or matrix:
        for var in y:
            if var in adata.obs.columns:
                sannot[var] = adata.obs[var]
            elif var in adata.var_names:
                sannot[var] = adata[:, var].X.flatten()
        plot_df = pd.melt(sannot,
                          id_vars=['id', 'pseudotime'],
                          var_name='signature',
                          value_name='score')
        # Keep the signatures in the order given by `y`. The result is
        # assigned back because reorder_categories returns a new object
        # (its `inplace` argument no longer exists in pandas 2.0).
        signature = plot_df['signature'].astype('category')
        plot_df['signature'] = signature.cat.reorder_categories(list(y))

        time_line = ggplot(plot_df, aes('pseudotime', 'score'))
        if facet:
            time_line = time_line + facet_wrap(
                'signature', scales='free_y', ncol=ncol)
        time_line = (time_line +
                     geom_smooth(aes(color='signature'),
                                 method='loess',
                                 size=size,
                                 se=False,
                                 span=smoothness) + theme_std)

    if "cell_cycle_division" in adata.uns["scycle"]:
        cc_divs = adata.uns["scycle"]["cell_cycle_division"]
        # -- Cell cycle annotation
        cc_phase = pd.DataFrame(
            dict(
                starts=[
                    None,
                    cc_divs["pr_start"],
                    cc_divs["rep_start"],
                    # cc_divs["m_start"],
                ],
                labels=["G1 PM", "G1 PR", "S/G2/M"],
                # each label sits in the middle of its phase interval
                labpos=[
                    np.mean([0, cc_divs["pr_start"]]),
                    np.mean([cc_divs["pr_start"], cc_divs["rep_start"]]),
                    np.mean([cc_divs["rep_start"], 1]),
                    # np.mean([cc_divs["m_start"], 1]),
                ],
                y=lab_ypos,
            ))
        time_line = (
            time_line + geom_vline(
                aes(xintercept="starts"), linetype="dashed", data=cc_phase) +
            geom_text(aes(x="labpos", y="y", label="labels"), data=cc_phase))

    return time_line
示例#18
0
    def test_theme_seaborn(self):
        """Compare the base plot with the seaborn theme against 'theme_seaborn'."""
        titled = self.g + labs(title='Theme Seaborn')
        themed = titled + theme_seaborn() + _theme
        assert themed == 'theme_seaborn'
示例#19
0
    def test_theme_minimal(self):
        """Compare the base plot with the minimal theme against 'theme_minimal'."""
        titled = self.g + labs(title='Theme Minimal')
        themed = titled + theme_minimal() + _theme
        assert themed == 'theme_minimal'
示例#20
0
    def test_theme_matplotlib(self):
        """Compare the base plot with the matplotlib theme against 'theme_matplotlib'."""
        titled = self.g + labs(title='Theme Matplotlib')
        themed = titled + theme_matplotlib() + _theme
        assert themed == 'theme_matplotlib'
示例#21
0
    def test_theme_538(self):
        """Compare the base plot with the 538 theme against 'theme_538'."""
        titled = self.g + labs(title='Theme 538')
        themed = titled + theme_538() + _theme
        assert themed == 'theme_538'
示例#22
0
    def test_theme_light(self):
        """Compare the base plot with the light theme against 'theme_light'."""
        titled = self.g + labs(title='Theme Light')
        themed = titled + theme_light() + _theme
        assert themed == 'theme_light'
示例#23
0
    def test_theme_gray(self):
        """Compare the base plot with the gray theme against 'theme_gray'."""
        titled = self.g + labs(title='Theme Gray')
        themed = titled + theme_gray() + _theme
        assert themed == 'theme_gray'
示例#24
0
    def test_theme_dark(self):
        """Compare the base plot with the dark theme against 'theme_dark'."""
        titled = self.g + labs(title='Theme Dark')
        themed = titled + theme_dark() + _theme
        assert themed == 'theme_dark'
示例#25
0
    def test_theme_classic(self):
        """Compare the base plot with the classic theme against 'theme_classic'."""
        titled = self.g + labs(title='Theme Classic')
        themed = titled + theme_classic() + _theme
        assert themed == 'theme_classic'
示例#26
0
def labs(x, y):
    """Return ``gg.labs`` axis labels with both texts passed through ``dollars``."""
    x_text = dollars(x)
    y_text = dollars(y)
    return gg.labs(x=x_text, y=y_text)
示例#27
0
    def test_theme_void(self):
        """Compare the base plot with the void theme against 'theme_void'."""
        titled = self.g + labs(title='Theme Void')
        themed = titled + theme_void() + _theme
        assert themed == 'theme_void'
# Mean AUROC per label-function count, one facet per relation, with error
# bars spanning auroc_lower..auroc_upper.
g = (
    p9.ggplot(
        dev_disc_df,
        p9.aes(x="factor(lf_num)", y="auroc_mean",
               linetype="model", color="relation"),
    )
    + p9.geom_point()
    + p9.geom_errorbar(p9.aes(ymin="auroc_lower", ymax="auroc_upper"))
    + p9.geom_line(p9.aes(group="model"))
    + p9.scale_x_discrete(limits=[0, 1, 6, 11, 16, 'All'])
    + p9.scale_color_manual(
        values={
            relation: mcolors.to_hex(color_map[relation])
            for relation in ("DaG", "CtD", "CbG", "GiG")
        },
        guide=False,
    )
    + p9.facet_wrap("relation")
    + p9.labs(
        title="Disc Model Performance (Tune Set)",
        x="Number of Label Functions",
        y="AUROC",
    )
    + p9.theme_bw()
)
print(g)
g.save(filename="../disc_model_dev_auroc.png", dpi=300)


# In[8]:


g = ( 
    p9.ggplot(dev_disc_df, p9.aes(x="factor(lf_num)", y="aupr_mean", linetype="model", color="relation"))
    + p9.geom_point()
                aes(x=lst_num_partitions, y='score', color='Group'),
                size=1.5) \
    + geom_point(aes(x=lst_num_partitions, y='score'),
                 color ='darkgrey',
                size=0.5) \
    + geom_errorbar(all_svcca,
                  aes(x=lst_num_partitions, ymin='ymin', ymax='ymax'),
                   color='darkgrey') \
    + geom_line(threshold,
                aes(x=lst_num_partitions, y='score'),
                linetype='dashed',
                size=1,
                color="darkgrey",
                show_legend=False) \
    + labs(x = "Number of Partitions",
           y = "Similarity score (SVCCA)",
           title = "Similarity across varying numbers of partitions") \
    + theme(plot_title=element_text(weight='bold'),
            plot_background=element_rect(fill="white"),
            panel_background=element_rect(fill="white"),
            panel_grid_major_x=element_line(color="lightgrey"),
            panel_grid_major_y=element_line(color="lightgrey"),
            axis_line=element_line(color="grey"),
            legend_key=element_rect(fill='white', colour='white')
           ) \
    + scale_color_manual(['#1976d2', '#b3e5fc']) \

print(panel_A)
# Each file is requested with the device matching its name; the PNG file
# was previously requested with the "svg" device.
ggsave(plot=panel_A, filename=svcca_file, device="svg", dpi=300)
ggsave(plot=panel_A, filename=svcca_png_file, device="png", dpi=300)
from plotnine.data import economics

from plotnine import ggplot, aes, geom_line, labs

# Line chart of the "uempmed" column against "date".
g = (
    ggplot(economics)
    + aes(x="date", y="uempmed")
    + geom_line()
    + labs(x="date", y="median duration of unemployment")
)

g.save("07.png")
示例#31
0
def gene_profile(genes: list,
                 weights: pd.DataFrame,
                 stddev: pd.DataFrame=None,
                 y_axis_label: str=None,
                 highlight_n: int=None,
                 highlight_anno: list=None,
                 figsize: tuple=None,
                 ylim: tuple=None) -> p9.ggplot:
    """
    Plot the weights of selected genes across annotations, one facet per gene.

    Parameters
    ----------
    genes          : a single str or list of genes to include in plot as
        facets. Matching against the index of `weights` is case-insensitive.
    weights        : DataFrame of ES weights, genes in the index and one
        column per annotation
    stddev         : optional DataFrame laid out like `weights`; when given,
        error bars of weight +/- stddev are drawn
    y_axis_label   : y-axis label (default: "Expression Specificity")
    highlight_n    : number of highest ESw to highlight per gene
        (default: 5 when `highlight_anno` is not given)
    highlight_anno : specific annotations to highlight; takes precedence
        over `highlight_n`
    figsize : (float, float), optional (default: None)
        Specify width and height of plot.
    ylim    : (float, float), optional (default: (-1, 1))
        Limits of the y-axis.

    Returns
    -------
        g    : ggplot

    Raises
    ------
        AssertionError : if none of `genes` is found in `weights`
            (or in `stddev`, when given)

    Todo:
        * find a better way for sorting cell-types along x-axis
        * report if gene in genes is not found in df
        * report if duplicate genes
        * replace hacky x-axis labelling

    """

    ### Reduce dataframe to genes of interest
    if isinstance(genes, str):
        # a bare string would otherwise be iterated character by character
        genes = [genes]
    genes = [str.upper(s) for s in genes]
    idx = np.char.upper(weights.index.values.astype(str))
    df_tidy = weights[np.isin(idx, genes)]
    n_genes = len(df_tidy)

    assert (n_genes >= 1), "No matching genes found in dataframe."

    stddev_tidy = None
    if stddev is not None:
        idx = np.char.upper(stddev.index.values.astype(str))
        stddev_tidy = stddev[np.isin(idx, genes)]
        # check the stddev subset itself, not the weights subset again
        assert (len(stddev_tidy) >= 1), "No matching genes found in stddev dataframe."

    # Constants, height and width of plot.
    if figsize is None:
        H = 5*n_genes
        W = 15
    else:
        W, H = figsize

    if ylim is None:
        ylim = (-1,1)

    if y_axis_label is None:
        y_axis_label = "Expression Specificity"

    ### Convert to tidy / long format if necessary
    # Org:
    #       ABC  ACBG  ACMB
    # POMC  0.0   0.5   0.9
    # AGRP  0.2   0.0   0.0
    # LEPR  0.1   0.1   0.4

    # Tidy:
    #   gene_name annotation    es_weight
    # 1 POMC      ABC           0.0
    # 2 AGRP      ABC           0.6
    # 3 LEPR      ABC           1.0

    # ensure that index name is none, so "index" is used for id_vars
    df_tidy = pd.melt(df_tidy.rename_axis(None).reset_index(),
                      id_vars="index", var_name="annotation", value_name="weight")

    if stddev_tidy is not None:
        stddev_tidy = pd.melt(stddev_tidy.rename_axis(None).reset_index(),
                              id_vars="index", var_name="annotation", value_name="stddev")
        df_tidy = df_tidy.merge(stddev_tidy, on=["index", "annotation"])

    ### Sort values by gene_name and es_weight and add order
    # Sorted:
    #   gene_name annotation   es_weight   x_order
    # 1 AGRP      MOL2         0.0         1
    # 2 AGRP      ACNT1        0.1         2
    # 3 AGRP      MOL1         0.2         3

    df_tidy = df_tidy.sort_values(by=["index", "weight"])
    df_tidy["order"] = np.arange(len(df_tidy)) + 1

    ### Generate highlight
    if highlight_anno is not None:
        # highlight the listed annotations
        df_tidy["highlight"] = df_tidy["annotation"].isin(highlight_anno)
    else:
        # Default: highlight top 5
        if highlight_n is None:
            highlight_n = 5
        df_tidy["highlight"] = df_tidy.groupby("index")["order"].rank("first", ascending=False) <= highlight_n

    df_highlight = df_tidy[df_tidy["highlight"]]

    ### Plot
    # linear function to compute x_axis text-size.
    # Mainly depends on number of genes in df per faceet, i.e. len(df_tidy) / len(genes).
    SIZE_TEXT_X_AXIS = 10.161 - 0.023 * (len(df_tidy) / len(genes))

    # Limits of the order for each index gene / facet, e.g. [0, 266, 531]
    # These limits are necessary to only plot the labels
    order_lims = [0, *(df_tidy.groupby("index")["order"].max().values)]

    def find_nearest(array,value):
        # element of `array` closest to `value`
        array = np.asarray(array)
        idx = (np.abs(array - value)).argmin()
        return array[idx]

    def getbreaks(lims):
        # function defined for use in debugging
        l = find_nearest(order_lims, lims[0])
        r = find_nearest(order_lims, lims[1])
        breaks = np.arange(l, r)
        return breaks

    def getlbls(idx):
        # function defined for use in debugging
        # NOTE(review): "order" starts at 1 while iloc is 0-based, so a break
        # at position b is labelled with the row whose order is b + 1 --
        # confirm whether this shift is intended.
        lbls = df_tidy["annotation"].iloc[idx].values
        return lbls

    p = (
        ### data
        p9.ggplot(data=df_tidy, mapping=p9.aes(x="order", y="weight", label="annotation"))

        ### theming
        + p9.theme_classic()
        + p9.theme(
            figure_size = (W,H),
            axis_ticks_major_x = p9.element_blank(),
            axis_text_x = p9.element_text(rotation=75, hjust=0, size=SIZE_TEXT_X_AXIS), #
            axis_text_y = p9.element_text(size=W),
            panel_spacing = 1,
            strip_background = p9.element_blank()
        )

        + p9.ylim(ylim[0],ylim[1])

        + p9.labs(
            x="", # e.g. "Cell-type"
            y=y_axis_label, # e.g. "ES weight"
        )

        ### viz
        # all
        + p9.geom_segment(mapping=p9.aes(x="order", xend="order", y=0, yend="weight"),
                       color="grey",
                       alpha=0.3,
                       show_legend=False
        )

        + p9.geom_point(mapping=p9.aes(size=2),
                     color="grey",
                    show_legend=False
        )

        # highlight
        + p9.geom_point(data=df_highlight, mapping=p9.aes(size=2),
                     color="dodgerblue",
                    show_legend=False
        )

        + p9.geom_segment(data=df_highlight, mapping=p9.aes(x="order", xend="order", y=0, yend="weight"),
                       color="dodgerblue",
                       alpha=0.3,
                       show_legend=False
        )

        + p9.facet_wrap("index",
                     scales="free",
                     nrow=n_genes
                    )

        + p9.scale_x_continuous(
            # order_scale is continuous across all annotations
            # so the scale will look weird for each facet, e.g.
            # facet 1 may have order 1-7, and facet 2 has order 8-14.
            # therefore we must use a labeller function to get the
            # correct labels for each interval of order.
            breaks = getbreaks,
            labels = getlbls
        )
    )

    if stddev_tidy is not None:
        p = p + p9.geom_errorbar(mapping=p9.aes(ymin="weight-stddev", ymax="weight+stddev"),
                                    color="grey", width=0.1)\
                + p9.geom_errorbar(data=df_highlight, mapping=p9.aes(ymin="weight-stddev", ymax="weight+stddev"),
                                color="dodgerblue", width=0.1)

    # add labels last for them to be on top
    p = p + p9.geom_label(data=df_highlight,
                    color = "dodgerblue",
                    adjust_text = {'expand_points': (2,2)}
        )

    return p
    
示例#32
0
def run_cluster_sim(n_sims = 1000, param = (.1, .5), n = 1000,
                            n_cluster = 50, rho = .5, cluster_robust = False):
    """Repeat ``cluster_sim`` ``n_sims`` times and tabulate the results.

    Every argument except ``n_sims`` is forwarded unchanged to
    ``cluster_sim``.  Each call is expected to yield four values, which
    become the columns ``b1``, ``se_b1``, ``ci95_lower`` and ``ci95_upper``
    (presumably the slope estimate, its standard error and a 95% interval
    -- confirm against ``cluster_sim``).

    Returns a DataFrame with one row per simulation and two derived
    columns: ``param_caught`` (True when ``param[1]`` lies inside the
    interval, bounds included) and ``id`` (a copy of the row index).
    """
    draws = []
    for _ in range(n_sims):
        draws.append(cluster_sim(param=param, n=n, rho=rho,
                                 n_cluster=n_cluster,
                                 cluster_robust=cluster_robust))

    sims = pd.DataFrame(draws)
    sims.columns = ('b1', 'se_b1', 'ci95_lower', 'ci95_upper')

    # The interval "catches" the parameter when lower <= param[1] <= upper.
    target = param[1]
    above_lower = sims['ci95_lower'] <= target
    below_upper = target <= sims['ci95_upper']
    sims['param_caught'] = above_lower & below_upper
    sims['id'] = sims.index
    return sims


# Simulation clustered SE
# Parameters handed to the simulation; beta1 = 0: no effect of x on y
sim_params = [.4, 0]
sim_nocluster = run_cluster_sim(
    n_sims=1000,
    param=sim_params,
    cluster_robust=False,
)


# Draw 100 simulations at random, order them by estimate, and show each
# interval as a point-range coloured by whether it contains sim_params[1]
# (marked by the dashed reference line).
(
    p.ggplot(
        sim_nocluster.sample(100).sort_values('b1'),
        p.aes(x='factor(id)', y='b1',
              ymin='ci95_lower', ymax='ci95_upper',
              color='param_caught'),
    )
    + p.geom_hline(yintercept=sim_params[1], linetype='dashed')
    + p.geom_pointrange()
    + p.labs(x='sim ID', y='b1', title='Randomly Chosen 100 95% CIs')
    + p.scale_color_discrete(name='True param value', labels=('missed', 'hit'))
    + p.coord_flip()
)



示例#33
0
    items = {}
    data = {}
    all_models = set()
    all_tasks = set()
    data = {}
    with gzip.open(options.input) as ifd:
        for row in csv.DictReader(ifd, delimiter="\t"):
            for k, v in row.items():
                data[k] = data.get(k, [])
                data[k].append(v)
    
    for k in data.keys():
        floats = [maybe_float(x) for x in data[k]]
        if all([re.match(r"^\d+$", x) for x in data[k]]):
            data[k] = [int(x) for x in data[k]]
        elif all(floats):
            data[k] = floats

    df = pandas.DataFrame(data)
    #print df
    x = (ggplot(df, aes("factor(%s)" % (options.x), options.y, color="factor(%s)" % (options.color)))) + \
        ggtitle(options.title.strip("'")) + \
        ylab(options.ylabel.strip("'")) + \
        xlab(options.xlabel.strip("'")) + \
        labs(color=options.color_label.strip("'")) + \
        geom_col(show_legend=False) + \
        lims(y=(0.0, 1.0))
    x.save(options.output)

    #theme(legend_title=element_text("")) + \
    ["x_loc", "y_loc", "well", "site_location",
     "site"])["total_cell_count"].mean().reset_index())

plate = cell_count_df["plate"].unique()[0]

os.makedirs(output_figuresdir, exist_ok=True)
by_well_gg = (
    gg.ggplot(cell_count_totalcells_df, gg.aes(x="x_loc", y="y_loc")) +
    gg.geom_point(gg.aes(fill="total_cell_count"), size=10) +
    gg.geom_text(gg.aes(label="site_location"), color="lightgrey") +
    gg.facet_wrap("~well") + gg.coord_fixed() + gg.theme_bw() +
    gg.ggtitle(f"Total Cells/Well\n{plate}") + gg.theme(
        axis_text=gg.element_blank(),
        axis_title=gg.element_blank(),
        strip_background=gg.element_rect(colour="black", fill="#fdfff4"),
    ) + gg.labs(fill="Cells") + gg.scale_fill_cmap(name="magma"))

output_file = pathlib.Path(output_figuresdir,
                           "plate_layout_cells_count_per_well.png")
if check_if_write(output_file, force, throw_warning=True):
    by_well_gg.save(output_file, dpi=300, verbose=False)

# Plot cell category ratios per well
ratio_df = pd.pivot_table(
    cell_count_df,
    values="cell_count",
    index=["site", "plate", "well", "site_location", "x_loc", "y_loc"],
    columns=["Cell_Quality"],
)
ratio_df = ratio_df.assign(Sum=ratio_df.sum(axis=1),
                           Pass_Filter=ratio_df[cell_filter].sum(axis=1))
            "pca2":
            "last",
            "category":
            "last",
            "section":
            "last",
        }).reset_index(drop=True))
biorxiv_pca_method_section_df.head()

# ## Global View of PCA plot

# In[5]:

g = (p9.ggplot(biorxiv_pca_method_section_df) +
     p9.aes(x="pca1", y="pca2", color="category") + p9.geom_point() +
     p9.theme_bw() + p9.labs(title="TSNE Methods Section (300 dim)"))
print(g)

# ## Neuroscience Methods Section

# In[6]:

g = (p9.ggplot(biorxiv_pca_method_section_df.query("category=='neuroscience'"))
     + p9.aes(x="pca1", y="pca2", color="section") +
     p9.geom_point(position=p9.position_dodge(width=0.2)) +
     p9.facet_wrap("section") + p9.theme_bw() +
     p9.theme(subplots_adjust={'wspace': 0.10}) +
     p9.scale_color_manual({
         "has_methods": "#d8b365",
         "no_methods": "#5ab4ac"
     }) + p9.labs(title="Neuroscience Methods Section"))
示例#36
0
                          yend = sol.Ra[k].imag + sol.Aa[k].imag * ACC_SCALE),\
                      colour='red', arrow=arrow()) + # Point A
         geom_segment(aes(x = sol.Rpa[k].real, y = sol.Rpa[k].imag, \
                          xend = sol.Rpa[k].real + sol.Apaa[k].real * ACC_SCALE, \
                          yend = sol.Rpa[k].imag + sol.Apaa[k].imag * ACC_SCALE),\
                      colour='red', arrow=arrow()) + # Point C
          # ACCELERATIONS TEXTS (you may comment if you wish to remove acceleration informations)
          # positions of the accelerations texts may be altered in case the plot gets hard to read
          annotate("text", x = sol.Rba[k].real, y = sol.Rba[k].imag+10, label = f'${np.absolute(sol.Aba[k])/1000:.2f}~m/s^2$', colour='red') +
          annotate("text", x = sol.Ra[k].real, y = sol.Ra[k].imag-20, label = f'${np.absolute(sol.Aa[k])/1000:.2f}~m/s^2$', colour='red') +
          annotate("text", x = sol.Rpa[k].real+10, y = sol.Rpa[k].imag-20, label = f'${np.absolute(sol.Apaa[k])/1000:.2f}~m/s^2$', colour='red') +
         # MECHANISM KINEMATIC PROPERTIES
           annotate("label", x = -50, y = -100, label = f'$\\theta_2={sol.theta2[k] * 180/(2*pi):.2f}^\\circ$') +
                     # Brackets need to be doubled so Python doesn't interpret 3a or 4a as variables
           annotate("label", x = -10, y = -100, label = f'$\\theta_{{3a}}={sol.theta3a[k] * 180/(2*pi):.2f}^\\circ$, $\\theta_{{3c}}={sol.theta3c[k] * 180/(2*pi):.2f}^\\circ$') + 
           annotate("label", x = 45, y = -100, label = f'$\\theta_{{4a}}={sol.theta4a[k] * 180/(2*pi):.2f}^\\circ$, $\\theta_{{4c}}={sol.theta4c[k] * 180/(2*pi):.2f}^\\circ$') +
           
           annotate("label", x = -50, y = -150, label = f'$\\omega_2={sol.omega2[k]:.2f}~rad/s$') +
           annotate("label", x = 0, y = -150, label = f'$\\omega_{{3a}}={sol.omega3a[k]:.2f}~rad/s$, $\\omega_{{3c}}={sol.omega3c[k]:.2f}~rad/s$') +
           annotate("label", x = 70, y = -150, label = f'$\\omega_{{4a}}={sol.omega4a[k]:.2f}~rad/s$, $\\omega_{{4c}}={sol.omega4c[k]:.2f}~rad/s$') +
           
           annotate("label", x = -50, y = -200, label = f'$\\alpha_2={sol.omega2[k]:.2f}~rad/s^2$') +
           annotate("label", x = 0, y = -200, label = f'$\\alpha_{{3a}}={sol.alpha3a[k]:.2f}~rad/s^2$, $\\alpha_{{3c}}={sol.alpha3c[k]:.2f}~rad/s^2$') +
           annotate("label", x = 70, y = -200, label = f'$\\alpha_{{4a}}={sol.alpha4a[k]:.2f}~rad/s^2$, $\\alpha_{{4c}}={sol.alpha4c[k]:.2f}~rad/s^2$') +
         #
         labs(x='$x~[mm]$', y='$y~[mm]$') +
         coord_cartesian(xlim=SCALE_X, ylim=SCALE_Y) + # Scales plot limits, avoiding it to be bigger than necessary. You may comment this out if you wish to do so.
         theme_bw() # Plot is prettier with this theme compared to the default.
         ) 
    
plot.save('SolutionPlot.pdf', dpi = 330, width = 50, height = 30, units = 'cm')
示例#37
0
#Topic ----Plot Nine- Bar Plot
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#pip install plotnine --user
from plotnine import *

#https://datacarpentry.org/python-ecology-lesson/07-visualization-ggplot-python/index.html


from plotnine import ggplot, geom_point, aes, stat_smooth, facet_wrap
from plotnine.data import mtcars
# Bare expression: only has a visible effect in an interactive session,
# where the mtcars data frame is echoed.
mtcars
# Weight vs. mpg scatter coloured by gear, with a linear-model smoother,
# split into one facet per gear value.
(ggplot(mtcars, aes('wt', 'mpg', color='factor(gear)')) + geom_point() + stat_smooth(method='lm') + facet_wrap('~gear'))

# Weight vs. horsepower scatter coloured by cylinder count; point size maps
# to mpg and every point is annotated with the value of the 'name' column.
ggplot(mtcars, aes('wt', 'hp', color='factor(cyl)')) + geom_point(aes(size='mpg')) + labs(title='MT cars', subtitle ='wt vs hp', x='weight', y='horsepower') + geom_text(aes(label='name'))


#%%%
%matplotlib inline
import plotnine as p9
from plotnine.data import mtcars
from adjustText import adjust_text
#https://github.com/Phlya/adjustText/wiki
p9.ggplot(mtcars, aes('wt', 'hp', color='factor(cyl)')) + p9.geom_point(aes(size='mpg')) + p9.labs(title='MT cars', subtitle ='wt vs hp', x='weight', y='horsepower') + p9.geom_text(aes(label='name'), size=11, nudge_y=2)
p9.geom_text?
plt.ioff()# and plt.ion()
plt.close()
%matplotlib
simulated_data.head(10)

# In[9]:

# UMAP embedding of original input data

# Fit the UMAP model on the input data; the same fitted model is reused
# further down to project the simulated data.
model = umap.UMAP(random_state=randomState).fit(normalized_data)

input_data_UMAPencoded = model.transform(normalized_data)
input_data_UMAPencoded_df = pd.DataFrame(
    input_data_UMAPencoded,
    columns=['1', '2'],
    index=normalized_data.index,
)

g_input = (
    ggplot(input_data_UMAPencoded_df, aes(x='1', y='2'))
    + geom_point(alpha=0.5)
    + labs(x="UMAP 1", y="UMAP 2", title="Input data")
)
print(g_input)

# In[10]:

# UMAP embedding of simulated data (projected with the model fitted above)
simulated_data_UMAPencoded = model.transform(simulated_data)
simulated_data_UMAPencoded_df = pd.DataFrame(
    simulated_data_UMAPencoded,
    columns=['1', '2'],
    index=simulated_data.index,
)

g_sim = (
    ggplot(simulated_data_UMAPencoded_df, aes(x='1', y='2'))
    + geom_point(alpha=0.5)
    + labs(x="UMAP 1", y="UMAP 2", title="Simulated data")
)
print(g_sim)

# In[11]:
def scatter_enrich_components(adata, plot_type='panel', palette='Set1'):
    """Plots a scatter plot of trajectory vs component scores for each component
    from the dimensionality reduction

    Parameters
    --------------
    adata: AnnData
        The AnnData object being used for the analysis. Must be previously
        evaluated by `tl.enrich_components` and `tl.principal_circle`.
        The function reads ``adata.obsm['X_dimRed']``,
        ``adata.obs['partition']`` and
        ``adata.uns['scycle']['enrich_components']``.
    plot_type: str
        One of 'all' or 'panel'
    palette: str
        A palette supported by matplotlib.cm.get_cmap

    Returns
    -------------
    A plotnine scatter plot of IC scores vs trajectory. It can be used to
    diagnose whether the cell cycle ICs vary through the trajectory, and if
    others do not.

    Raises
    -------------
    ValueError
        If `plot_type` is neither 'all' nor 'panel'.
    """
    # Validate up front: an unknown plot_type would otherwise skip both
    # plotting branches and fail with an UnboundLocalError at the return.
    if plot_type not in ('all', 'panel'):
        raise ValueError(
            "plot_type must be 'all' or 'panel', got {!r}".format(plot_type))

    #-- Get projection data
    proj = adata.obsm['X_dimRed']
    n_ics = proj.shape[1]
    spart = adata.obs['partition'].values
    comps = adata.uns['scycle']['enrich_components']

    #-- Make IC dataframe
    ic_df = pd.DataFrame(proj)
    ic_df.columns = ['IC' + str(i) for i in range(n_ics)]
    ic_df['partition'] = spart

    #-- Melt for plotting
    # Scores are summed per partition; the partition column is then rebuilt
    # as 0..max(spart).
    # NOTE(review): this assumes every partition id in that range occurs in
    # the data (otherwise the lengths differ) -- confirm with the caller.
    ic_traj = ic_df.groupby('partition').sum()
    ic_traj['partition'] = list(range(np.max(spart) + 1))
    ic_trajm = pd.melt(ic_traj, id_vars='partition', var_name='IC')

    if plot_type == 'all':
        #-- Add variables for mapping plotting
        ic_trajm = _update_ictrajm(ic_trajm, comps, 'G1/S')
        ic_trajm = _update_ictrajm(ic_trajm, comps, 'G2/M+')
        ic_trajm = _update_ictrajm(ic_trajm, comps, 'G2/M-')
        ic_trajm = _update_ictrajm(ic_trajm, comps, 'Histone')

        # Rows whose IC label is not a key of `comps` are tagged 'other'.
        idx = [ic not in comps for ic in ic_trajm['IC']]
        ic_trajm['ccIC'] = 'cell cycle IC'
        ic_trajm.loc[idx, 'ccIC'] = 'other'

        #-- Get colors
        # Four spread-out palette entries come first, followed by all the
        # remaining entries.
        cmap = mpl.cm.get_cmap(palette, n_ics)
        colors = np.array([mpl.colors.rgb2hex(cmap(i)) for i in range(n_ics)])
        jmp = int(np.round(n_ics / 3))
        cidx = np.array([0, 0 + jmp, 0 + 2 * jmp, n_ics - 1])
        oidx = np.array([i not in cidx for i in range(n_ics)])
        cc_cols = np.append(colors[cidx], colors[oidx])

        #-- Plot
        splot = (ggplot(
            ic_trajm,
            aes(x='partition',
                y='value',
                color='IC',
                alpha='ccIC',
                size='ccIC')) + geom_point(size=3) + geom_line() +
                 scale_alpha_manual(values=[1, 0.2], name='IC type') +
                 scale_size_manual(values=[1.5, 1], name='IC type') +
                 scale_color_manual(values=cc_cols) +
                 labs(x='Trajectory', y='IC score') + theme_std)

    else:  # plot_type == 'panel'
        #-- Add variables for mapping plotting
        ic_trajm1 = _multi_ictrajm(ic_trajm, comps, 'G1/S')
        ic_trajm2 = _multi_ictrajm(ic_trajm, comps, 'G2/M+')
        ic_trajm3 = _multi_ictrajm(ic_trajm, comps, 'G2/M-')
        ic_trajm4 = _multi_ictrajm(ic_trajm, comps, 'Histone')
        ic_trajm4plot = pd.concat([ic_trajm1, ic_trajm2, ic_trajm3, ic_trajm4])

        #-- Get mapping colors
        # Four palette colours plus a trailing 'grey'.
        cmap = mpl.cm.get_cmap(palette, 5)
        cc_cols = np.append(
            np.array([mpl.colors.rgb2hex(cmap(i)) for i in range(4)]), 'grey')

        #-- Plot
        splot = (ggplot(
            ic_trajm4plot,
            aes(x='partition', y='value', color='IC', alpha='IC', size='IC')) +
                 facet_wrap(facets='facet') + geom_point(size=3) +
                 geom_line() +
                 scale_size_manual(values=[1.5, 1.5, 1.5, 1.5, 1]) +
                 scale_alpha_manual(values=[1, 1, 1, 1, 0.2]) +
                 scale_color_manual(values=cc_cols) + theme_std +
                 labs(x='Trajectory', y='IC score'))

    return splot
#
#  (C) Copyright 2021  Pavel Tisnovsky
#
#  All rights reserved. This program and the accompanying materials
#  are made available under the terms of the Eclipse Public License v1.0
#  which accompanies this distribution, and is available at
#  http://www.eclipse.org/legal/epl-v10.html
#
#  Contributors:
#      Pavel Tisnovsky
#

from plotnine.data import mpg
from plotnine import ggplot, aes, facet_grid, labs, geom_point

# Scatter plot of `hwy` against `displ` from the bundled mpg data set,
# laid out as a grid with one row per year and one column per class.
print(
    ggplot(mpg)
    + facet_grid(facets="year~class")
    + aes(x="displ", y="hwy")
    + labs(
        title="Miles per Gallon for Each Year and Vehicle Class",
        x="Engine Size",
        y="Miles per Gallon",
    )
    + geom_point()
)
示例#41
0
    def test_theme_bw(self):
        """Apply ``theme_bw`` to the shared plot ``self.g`` and check the result.

        The themed plot, combined with the module-level ``_theme``, is
        compared against the key 'theme_bw' (presumably the name of a
        baseline image used by the test harness -- confirm).
        """
        themed = self.g + labs(title='Theme BW') + theme_bw()

        assert themed + _theme == 'theme_bw'
示例#42
0
final_df.head()

# # Distribution plot

g = (
    p9.ggplot(
        final_df.replace(
            {
                "pre_vs_published": "preprint-published",
                "pre_vs_random": "preprint-random",
            }
        )
    )
    + p9.aes(x="label", y="distance")
    + p9.geom_violin(fill="#a6cee3")
    + p9.labs(x="Document Pair Groups", y="Euclidean Distance")
    + p9.theme_seaborn(context="paper", style="ticks", font="Arial", font_scale=2)
)
g.save("output/figures/biorxiv_article_distance.svg")
g.save("output/figures/biorxiv_article_distance.png")
print(g)

# # Logistic Regression bioRxiv preprints -> published PMC articles

model = LogisticRegressionCV(
    Cs=5,
    cv=10,
    random_state=100,
    penalty="elasticnet",
    solver="saga",
    l1_ratios=[0.1, 0.5, 0.8],
示例#43
0
文件: jmlr.py 项目: Pinafore/qb
def error_comparison():
    """Plot how the DAN, RNN and ElasticSearch guessers agree or disagree.

    Loads every pickled guesser report matching the glob below, keeps the
    top-scoring guess per (guesser, question) at the first-sentence and
    full-question positions, and classifies each (question, position) pair:

    * all three correct        -> model 'All', result 'Correct'
    * none correct             -> model 'All', result 'Wrong'
    * exactly one correct      -> that model, result 'Correct'
    * exactly two correct      -> the one wrong model, result 'Wrong'

    The resulting fractions are drawn as a faceted bar chart and saved to
    'output/plots/guesser_error_comparison.pdf'.
    """
    char_frames = {}
    first_frames = {}
    full_frames = {}
    train_times = {}
    use_wiki = {}
    best_accuracies = {}
    report_paths = glob.glob(f'output/guesser/best/qanta.guesser*/guesser_report_guesstest.pickle', recursive=True)
    for path in report_paths:
        with open(path, 'rb') as handle:
            report = pickle.load(handle)
        name = report['guesser_name']
        params = report['guesser_params']
        train_times[name] = params['training_time']
        use_wiki[name] = params.get('use_wiki', False)
        char_frames[name] = report['char_df']
        first_frames[name] = report['first_df']
        full_frames[name] = report['full_df']
        best_accuracies[name] = (report['first_accuracy'], report['full_accuracy'])

    def top_guess_per_question(frames, position_label):
        # Highest-scoring row per (guesser, question), tagged with the position.
        top = (
            pd.concat(list(frames.values()))
            .sort_values('score', ascending=False)
            .groupby(['guesser', 'qanta_id'])
            .first()
            .reset_index()
        )
        top['position'] = position_label
        return top

    first_df = top_guess_per_question(first_frames, ' Start')
    full_df = top_guess_per_question(full_frames, 'End')
    compare_df = pd.concat([first_df, full_df])
    compare_df = compare_df[compare_df.guesser != 'qanta.guesser.vw.VWGuesser']

    comparisons = ['qanta.guesser.dan.DanGuesser', 'qanta.guesser.rnn.RnnGuesser', 'qanta.guesser.elasticsearch.ElasticSearchGuesser']
    cr_rows = []
    for (qnum, position), group in compare_df.groupby(['qanta_id', 'position']):
        by_guesser = group.set_index('guesser')
        correct_guessers = []
        wrong_guessers = []
        for name in comparisons:
            bucket = correct_guessers if by_guesser.loc[name].correct == 1 else wrong_guessers
            bucket.append(name)

        n_correct = len(correct_guessers)
        if n_correct > 3:
            raise ValueError('this should be unreachable')
        if n_correct == 3:
            model, result = 'All', 'Correct'
        elif n_correct == 0:
            model, result = 'All', 'Wrong'
        elif n_correct == 1:
            model, result = to_shortname(correct_guessers[0]), 'Correct'
        else:
            # Exactly two were right: report the single model that missed.
            model, result = to_shortname(wrong_guessers[0]), 'Wrong'
        cr_rows.append({'qnum': qnum, 'Position': position, 'model': model, 'Result': result})

    cr_df = pd.DataFrame(cr_rows)
    p = (
        ggplot(cr_df)
        + aes(x='model', fill='Result')
        + facet_grid(['Result', 'Position'])
        + geom_bar(aes(y='(..count..) / sum(..count..)'), position='dodge')
        + labs(x='Models', y='Fraction with Corresponding Result')
        + coord_flip()
        + theme_fs()
        + theme(aspect_ratio=.6)
    )
    p.save('output/plots/guesser_error_comparison.pdf')
示例#44
0
final_annotated_df.head()

# In[6]:

# Mean of `final_same_paper` within each distance bin, renamed to
# `frac_correct`; the index is cast to str before being reset into a column.
binned_stats_df = (final_annotated_df.groupby(
    "distance_bin").final_same_paper.mean().to_frame().rename(
        index=str, columns={
            "final_same_paper": "frac_correct"
        }).reset_index())
binned_stats_df

# In[7]:

# labs() names aesthetics, not screen positions: x is mapped to
# `distance_bin` and y to `frac_correct`, and coord_flip() only swaps where
# the axes are drawn. The labels therefore have to follow the mapping.
g = (p9.ggplot(binned_stats_df, p9.aes(x="distance_bin", y="frac_correct")) +
     p9.geom_col(fill="#a6cee3") + p9.coord_flip() +
     p9.labs(x="Euclidean Distance Bins", y="Fraction Correct") +
     p9.theme_seaborn(
         context="paper", style="ticks", font="Arial", font_scale=1.5))
g.save("output/figures/distance_bin_accuracy.svg")
g.save("output/figures/distance_bin_accuracy.png", dpi=250)
print(g)

# # Logistic Regression Performance

# In[8]:

# Load the document table, indexed by the `document` column.
biorxiv_embed_df = (pd.read_csv(Path("../word_vector_experiment/output/") /
                                "word2vec_output/" /
                                "biorxiv_all_articles_300.tsv.xz",
                                sep="\t").set_index("document"))
biorxiv_embed_df.head()
示例#45
0
    def histogram_make(roi, combined_raw_df, list_rois, config, xlimit,
                       save_function, find_xlim_function):
        """Build a faceted histogram of ``voxel_value`` for one ROI and save it.

        Parameters
        ----------
        roi
            Index into ``list_rois`` selecting the ROI name used for output.
        combined_raw_df
            Data frame to plot; must provide a ``voxel_value`` column, the
            facet columns named in ``config`` and, when mean/median lines
            are enabled, ``stat_value`` and ``Statistic`` columns.
        list_rois
            Sequence of ROI names.
        config
            Settings object; only its ``histogram_*``, ``plot_dpi``,
            ``verbose``, ``colorblind_friendly_plot_colours`` and
            ``use_same_axis_limits`` attributes are read here.
        xlimit
            Upper x-axis limit. A falsy value leaves the upper limit open.
        save_function
            Called as ``save_function(figure, roi_name, config, folder,
            'histogram')`` to write the figure.
        find_xlim_function
            Called as ``find_xlim_function(roi_name, figure, 'xaxis')`` when
            shared axis limits are requested and ``xlimit == 0``.

        Returns
        -------
        ``None`` when ``combined_raw_df`` is empty; otherwise the value from
        ``find_xlim_function`` (or 0 if it was not called). When
        ``config.use_same_axis_limits == 'Same limits'`` and ``xlimit == 0``
        the figure is not saved and only the limit is returned.
        """
        if combined_raw_df.empty:
            if config.verbose:
                print(
                    'INFO: Histograms cannot be made for the No ROI category.')
            return

        thisroi = list_rois[roi]

        figure = (
            pltn.ggplot(combined_raw_df, pltn.aes(x="voxel_value")) +
            pltn.theme_538() + pltn.geom_histogram(
                binwidth=config.histogram_binwidth,
                fill=config.histogram_fig_colour,
                boundary=0,
                na_rm=True
            )  # Boundary centers the bars, na_rm cancels error from setting an xlimit
            + pltn.facet_grid(
                f"{config.histogram_fig_y_facet}~{config.histogram_fig_x_facet}",
                drop=True,
                labeller="label_both") +
            pltn.labs(x=config.histogram_fig_label_x,
                      y=config.histogram_fig_label_y) +
            pltn.theme(
                panel_grid_minor_x=pltn.themes.element_line(alpha=0),
                panel_grid_major_x=pltn.themes.element_line(alpha=1),
                panel_grid_major_y=pltn.element_line(alpha=0),
                plot_background=pltn.element_rect(fill="white"),
                panel_background=pltn.element_rect(fill="gray", alpha=0.1),
                axis_title_x=pltn.element_text(
                    weight='bold', color='black', size=20),
                axis_title_y=pltn.element_text(
                    weight='bold', color='black', size=20),
                strip_text_x=pltn.element_text(
                    weight='bold', size=10, color='black'),
                strip_text_y=pltn.element_text(
                    weight='bold', size=10, color='black'),
                axis_text_x=pltn.element_text(size=10, color='black'),
                axis_text_y=pltn.element_text(size=10, color='black'),
                dpi=config.plot_dpi))

        # Display mean or median as vertical lines on plot
        if config.histogram_show_mean or config.histogram_show_median:
            figure += pltn.geom_vline(pltn.aes(xintercept="stat_value",
                                               color="Statistic"),
                                      size=config.histogram_stat_line_size)
            figure += pltn.scale_color_manual(values=[
                config.colorblind_friendly_plot_colours[3],
                config.colorblind_friendly_plot_colours[1]
            ])

        # Hide the legend for mean and median unless it was requested
        if not config.histogram_show_legend:
            figure += pltn.theme(legend_position='none')

        if xlimit:
            # Set x limit of figure (used to make it the same for every histogram)
            figure += pltn.xlim(-1, xlimit)
            thisroi += '_same_xlim'
        else:
            figure += pltn.xlim(-1, None)

        returned_xlim = 0
        if config.use_same_axis_limits in ('Same limits',
                                           'Create both') and xlimit == 0:
            returned_xlim = find_xlim_function(thisroi, figure, 'xaxis')

        if config.use_same_axis_limits == 'Same limits' and xlimit == 0:
            # First pass of 'Same limits': only the limit is wanted, so the
            # figure is not saved.
            return returned_xlim
        elif xlimit != 0:
            folder = 'Same_xaxis'
        else:
            folder = 'Different_xaxis'

        # Suppress Pandas warning about alignment of non-concatenation axis.
        # catch_warnings restores the caller's filter configuration on exit,
        # even if save_function raises, instead of forcing 'default'.
        with warnings.catch_warnings():
            warnings.simplefilter(action='ignore', category=FutureWarning)
            save_function(figure, thisroi, config, folder, 'histogram')

        return returned_xlim
category_sim_df.to_csv("output/category_cossim_95_ci.tsv",
                       sep="\t",
                       index=False)

# In[11]:

g = (p9.ggplot(category_sim_df) + p9.aes(x="category",
                                         y="pca1_cossim",
                                         ymin="pca1_cossim_lower",
                                         ymax="pca1_cossim_upper") +
     p9.geom_pointrange() + p9.coord_flip() + p9.theme_bw() +
     p9.scale_x_discrete(limits=category_sim_df.category.tolist()[::-1]) +
     p9.theme(figure_size=(11, 7),
              text=p9.element_text(size=12),
              panel_grid_major_y=p9.element_blank()) +
     p9.labs(y="PC1 Cosine Similarity"))
g.save("output/pca_plots/figures/category_pca1_95_ci.svg", dpi=500)
g.save("output/pca_plots/figures/category_pca1_95_ci.png", dpi=500)
print(g)

# In[12]:

g = (p9.ggplot(category_sim_df) + p9.aes(x="category",
                                         y="pca2_cossim",
                                         ymax="pca2_cossim_upper",
                                         ymin="pca2_cossim_lower") +
     p9.geom_pointrange() + p9.coord_flip() + p9.theme_bw() +
     p9.scale_x_discrete(limits=category_sim_df.category.tolist()[::-1]) +
     p9.theme(figure_size=(11, 7),
              text=p9.element_text(size=12),
              panel_grid_major_y=p9.element_blank()) +
def merge_ologram_stats(inputfiles=None,
                        pdf_width=None,
                        pdf_height=None,
                        output=None,
                        labels=None):
    """Merge several statistics tables into a single heatmap saved as PDF.

    Each input is read as a tab-separated table, tagged with a dataset
    label, and row-bound with the others. The plot has one tile per
    (dataset, feature) filled by ``summed_bp_overlaps_log2_fold_change``;
    entries whose ``summed_bp_overlaps_pvalue`` is below 0.01 get a diamond
    coloured by -log10(p-value).

    Parameters
    ----------
    inputfiles
        Iterable of open files (objects with a ``.name``) holding the
        tables. Each needs the columns ``feature_type``,
        ``summed_bp_overlaps_pvalue`` and
        ``summed_bp_overlaps_log2_fold_change``.
    pdf_width, pdf_height
        Page size passed to ``save_as_pdf_pages``. When ``None`` they
        default to 0.6 per dataset / per feature, capped at 100.
    output
        Object whose ``.name`` is the destination PDF path.
    labels
        Optional comma-separated string with one label per input file.
        When ``None`` the name of each file's enclosing directory is used.

    Problems with the labels are reported through ``message(...,
    type="ERROR")``.
    """
    # -------------------------------------------------------------------------
    # Check user provided labels
    # -------------------------------------------------------------------------

    if labels is not None:

        labels = labels.split(",")

        for elmt in labels:
            if not re.search("^[A-Za-z0-9_]+$", elmt):
                # The option checked here is --labels (see the messages below).
                message(
                    "Only alphanumeric characters and '_' allowed for --labels",
                    type="ERROR")
        if len(labels) != len(inputfiles):
            message(
                "--labels: the number of labels should be"
                " the same as the number of input files ",
                type="ERROR")

        if len(labels) != len(set(labels)):
            message("Redundant labels not allowed.", type="ERROR")

    # -------------------------------------------------------------------------
    # Loop over input files
    # -------------------------------------------------------------------------

    df_list = list()
    df_label = list()

    for pos, infile in enumerate(inputfiles):
        message("Reading file : " + infile.name)
        # Read the dataset into a temporary dataframe
        df_tmp = pd.read_csv(infile, sep='\t', header=0, index_col=None)
        # Change name of 'feature_type' column.
        df_tmp = df_tmp.rename(index=str, columns={"feature_type": "Feature"})

        # Assign the name of the dataset to a new column
        if labels is None:
            file_short_name = os.path.basename(
                os.path.normpath(os.path.dirname(infile.name)))
        else:
            file_short_name = labels[pos]
        df_label.append(file_short_name)

        df_tmp = df_tmp.assign(dataset=file_short_name)

        # Pval set to 0 or -1 are changed to 1e-320 and NaN respectively
        # (1e-320 keeps -log10 finite for p-values reported as 0).
        df_tmp.loc[df_tmp['summed_bp_overlaps_pvalue'] == 0,
                   'summed_bp_overlaps_pvalue'] = 1e-320
        df_tmp.loc[df_tmp['summed_bp_overlaps_pvalue'] == -1,
                   'summed_bp_overlaps_pvalue'] = np.nan
        # Compute -log10(pval)
        df_tmp = df_tmp.assign(
            **{"-log_10(pval)": -np.log10(df_tmp.summed_bp_overlaps_pvalue)})

        # Which p-values are significant ?
        # TODO Add Benjamini-Hochberg multitesting correction
        df_tmp = df_tmp.assign(
            **{"pval_signif": df_tmp.summed_bp_overlaps_pvalue < 0.01})

        # Add the df to the list to be subsequently merged
        df_list.append(df_tmp)

    if len(set(df_label)) < len(df_label):
        message(
            'Enclosing directories are ambiguous and cannot be used as labels. You may use "--labels".',
            type="ERROR")

    # -------------------------------------------------------------------------
    # Concatenate dataframes (row bind)
    # -------------------------------------------------------------------------

    message("Merging dataframes.")
    df_merged = pd.concat(df_list, axis=0)

    # -------------------------------------------------------------------------
    # Plotting
    # -------------------------------------------------------------------------

    message("Plotting")
    my_plot = ggplot(data=df_merged, mapping=aes(y='Feature', x='dataset'))
    my_plot += geom_tile(aes(fill='summed_bp_overlaps_log2_fold_change'))
    my_plot += scale_fill_gradient2()
    my_plot += labs(fill="log2(fold change) for summed bp overlaps")

    # Points for p-val. Must be after geom_tile()
    my_plot += geom_point(data=df_merged.loc[df_merged['pval_signif']],
                          mapping=aes(x='dataset',
                                      y='Feature',
                                      color='-log_10(pval)'),
                          size=5,
                          shape='D',
                          inherit_aes=False)
    my_plot += scale_color_gradientn(colors=["#160E00", "#FFB025", "#FFE7BD"])
    my_plot += labs(color="-log10(p-value)")

    # Theming
    my_plot += theme_bw()
    my_plot += theme(panel_grid_major=element_blank(),
                     axis_text_x=element_text(rotation=90),
                     panel_border=element_blank(),
                     axis_ticks=element_blank())

    # -------------------------------------------------------------------------
    # Page size
    # -------------------------------------------------------------------------

    message("Saving")
    nb_ft = len(list(df_merged['Feature'].unique()))
    nb_datasets = len(list(df_merged['dataset'].unique()))

    if pdf_width is None:
        panel_width = 0.6
        pdf_width = panel_width * nb_datasets

        if pdf_width > 100:
            pdf_width = 100
            message("Setting --pdf-width to 100 (limit)")

    if pdf_height is None:
        panel_height = 0.6
        pdf_height = panel_height * nb_ft

        if pdf_height > 100:
            pdf_height = 100
            message("Setting --pdf-height to 100 (limit)")

    message("Page width set to " + str(pdf_width))
    message("Page height set to " + str(pdf_height))
    figsize = (pdf_width, pdf_height)

    # -------------------------------------------------------------------------
    # Saving
    #
    # Both pandas and plotnine use warnings for deprecated functions, so all
    # warnings are silenced for the duration of the save only.
    # -------------------------------------------------------------------------

    with warnings.catch_warnings():
        warnings.simplefilter("ignore")

        message("Saving diagram to file : " + output.name)
        message("Be patient. This may be long for large datasets.")

        # NOTE : We must manually specify figure size with save_as_pdf_pages
        save_as_pdf_pages(filename=output.name,
                          plots=[my_plot + theme(figure_size=figsize)],
                          width=pdf_width,
                          height=pdf_height)
                aes(x=lst_num_experiments, y='score', color='Group'),
                size=1.5) \
    + geom_point(aes(x=lst_num_experiments, y='score'),
                 color ='darkgrey',
                size=0.5) \
    + geom_errorbar(all_svcca[all_svcca['Group'] == 'uncorrected'],
                  aes(x=lst_num_experiments, ymin='ymin', ymax='ymax'),
                   color='darkgrey') \
    + geom_line(threshold,
                aes(x=lst_num_experiments, y='score'),
                linetype='dashed',
                size=1,
                color="darkgrey",
                show_legend=False) \
    + labs(x = "Number of Partitions",
           y = "Similarity score (SVCCA)",
           title = "Similarity across varying numbers of partitions") \
    + theme(plot_title=element_text(weight='bold'),
            plot_background=element_rect(fill="white"),
            panel_background=element_rect(fill="white"),
            panel_grid_major_x=element_line(color="lightgrey"),
            panel_grid_major_y=element_line(color="lightgrey"),
            axis_line=element_line(color="grey"),
            legend_key=element_rect(fill='white', colour='white')
           ) \
    + scale_color_manual(['#b3e5fc']) \

print(g)
ggsave(plot=g, filename=svcca_uncorrected_file, dpi=300)

# In[9]:
示例#49
0
def clone_rarefaction(self: Union[AnnData, Dandelion],
                      color: str,
                      clone_key: Union[None, str] = None,
                      palette: Union[None, Sequence] = None,
                      figsize: Tuple[Union[int, float], Union[int,
                                                              float]] = (6, 4),
                      save: Union[None, str] = None) -> ggplot:
    """
    Plots rarefaction curve for cell numbers vs clone size.

    Cells are first restricted to rows whose `contig_QC_pass` is `True`
    (or the string `'True'`). Clone sizes are then counted per group of
    `color`, groups left with no cells are dropped (and reported on stdout),
    and for each remaining group `rarefun` is evaluated at subsample sizes
    1, 11, 21, ... plus the group's total cell count.

    Parameters
    ----------
    self : `AnnData`, `Dandelion`
        `AnnData` or `Dandelion` object.
    color : str
        Column name to split the calculation of clone numbers for a given number of cells for e.g. sample, patient etc.
    clone_key : str, optional
        Column name specifying the clone_id column in metadata/obs. Defaults to `'clone_id'`.
    palette : Sequence, optional
        Color mapping for unique elements in color. Will try to retrieve from AnnData `.uns` slot if present,
        otherwise falls back to the default 20/28/102 colour palettes depending on the number of groups.
    figsize :  Tuple[Union[int,float], Union[int,float]]
        Size of plot.
    save : str, optional
        Save path. The figure is written to `'figures/rarefaction' + save`.

    Returns
    -------
    rarefaction curve plot.

    Raises
    ------
    TypeError
        If `self` is neither an `AnnData` nor a `Dandelion` object.
    """
    if isinstance(self, AnnData):
        metadata = self.obs.copy()
    elif isinstance(self, Dandelion):
        metadata = self.metadata.copy()
    else:
        raise TypeError(
            'clone_rarefaction expects an AnnData or Dandelion object, got '
            + type(self).__name__)
    clonekey = 'clone_id' if clone_key is None else clone_key

    # Groups are collected *before* QC filtering so that groups losing all
    # their cells to the filter show up in the "zero counts" report below.
    groups = list(set(metadata[color]))
    # .copy() so the column assignment below acts on an independent frame.
    metadata = metadata[metadata['contig_QC_pass'].isin([True,
                                                         'True'])].copy()
    # Compare the column's dtype (not the Series type) against 'category';
    # otherwise the clean-up of unused clone categories never runs.
    if metadata[clonekey].dtype.name == 'category':
        metadata[clonekey] = metadata[clonekey].cat.remove_unused_categories()

    res = {
        g: metadata.loc[metadata[color] == g, clonekey].value_counts()
        for g in groups
    }
    # Clones absent from a group come out of from_dict as NaN; they are
    # zero-sized clones for that group.
    res_ = pd.DataFrame.from_dict(res, orient='index').fillna(0)

    # remove those with no counts
    row_totals = res_.sum(axis=1)
    is_empty = (row_totals == 0).values
    print('removing due to zero counts:',
          ', '.join(str(label) for label in res_.index[is_empty]))
    # NOTE(review): presumably a short pause so the message above is shown
    # before the progress bar starts drawing - confirm.
    sleep(0.5)
    res_ = res_[~is_empty]
    row_totals = row_totals[~is_empty]

    # Evaluate the rarefaction function for every remaining group. The same
    # subsample sizes feed both the x axis (pred) and the y values, so the
    # two melted frames below stay aligned row for row.
    sample_sizes = []
    curves = []
    for i in tqdm(range(res_.shape[0]),
                  desc='Calculating rarefaction curve '):
        # positional access: the index holds group labels, not 0..n-1
        total = int(row_totals.iloc[i])
        # np.arange excludes `total`, so it is always appended as last point
        n = np.append(np.arange(1, total, step=10), total)
        counts = np.array(res_.iloc[i])
        sample_sizes.append(n)
        curves.append([rarefun(counts, z) for z in n])
    y = pd.DataFrame(curves).T
    pred = pd.DataFrame(sample_sizes, index=res_.index).T

    y = y.melt()
    pred = pred.melt()
    pred['yhat'] = y['value']

    options.figure_size = figsize

    # Resolve the colour palette: explicit argument first, then colours
    # stored on the AnnData object, then a size-appropriate default.
    pal = palette
    if pal is None and isinstance(self, AnnData):
        try:
            pal = self.uns[str(color) + '_colors']
        except KeyError:
            pal = None
    if pal is None:
        n_groups = len(set(pred.variable))
        if n_groups <= 20:
            pal = palettes.default_20
        elif n_groups <= 28:
            pal = palettes.default_28
        elif n_groups <= 102:
            pal = palettes.default_102
        # more than 102 groups: leave plotnine's own colour scale in place

    p = (ggplot(pred, aes(x="value", y="yhat", color="variable")) +
         theme_classic() + xlab('number of cells') +
         ylab('number of clones') + ggtitle('rarefaction curve') +
         labs(color=color))
    if pal is not None:
        p = p + scale_color_manual(values=(pal))
    p = p + geom_line()

    if save:
        # NOTE(review): matplotlib's 'figure.figsize' is (width, height), so
        # these indices look swapped, and `figsize` is not used here.
        # Left as-is pending confirmation of the intended output size.
        p.save(filename='figures/rarefaction' + str(save),
               height=plt.rcParams['figure.figsize'][0],
               width=plt.rcParams['figure.figsize'][1],
               units='in',
               dpi=plt.rcParams["savefig.dpi"])

    return (p)
    out_i = pandas.DataFrame(sim_res_fwd[i], columns=out.columns[3:])
    out_i['time'] = t
    out_i['signal'] = C3_scan[i]
    out_i['dir'] = 'Low $[S^{**}]$'
    out = pandas.concat([out, out_i[out.columns]])
# Append every reverse-scan simulation to the combined table `out`.
# Each result is wrapped in a frame using out's columns from position 3
# onward; the 'time', 'signal' and 'dir' columns are then filled in here.
for i in range(len(sim_res_rev)):
    out_i = pandas.DataFrame(sim_res_rev[i], columns=out.columns[3:])
    out_i['time'] = t
    # the signal values are read from C3_scan in reversed order
    out_i['signal'] = numpy.flip(C3_scan)[i]
    out_i['dir'] = 'High $[S^{**}]$'
    # reorder to out's column order before stacking
    out = pandas.concat([out, out_i[out.columns]])
# Dump the combined table as a tab-separated file without the row index.
out.to_csv("./num_cont_nuts_sub_1_model/sim.txt", sep="\t", index=False)

###################### plotting ##################################
# One line per 'signal' value, coloured by signal, with one facet per
# value of 'dir'. `response` is a name defined outside this excerpt.
g = (ggplot(out, aes('time', response, group='signal', color='signal')) +
     geom_line(size=0.5) + ylim(0, 250) + labs(x="time", y="$[S^{**}]$") +
     scale_color_distiller(
         palette='RdYlBu', type="diverging", name="$B_{tot}$") +
     facet_wrap('~dir') + theme_bw())
g.save(filename="./num_cont_nuts_sub_1_model/sim_fwd_rev.png",
       format="png",
       width=8,
       height=4,
       units='in',
       verbose=False)

# Keep only the rows at the last time point.
eq = out[out.time == max(out.time)]

print(eq['signal'])
print(eq['s11'])
示例#51
0
文件: protobowl.py 项目: NPSDC/qb
def plot():
    """Draw the per-user Protobowl summary figures and save them as PDFs.

    Loads the records via ``load_protobowl``, averages them per ``uid`` and
    writes four figures into ``output/protobowl/`` (created if missing):
    a scatter of mean buzzing position against accuracy, followed by density
    histograms of log record count, accuracy and buzzing position. The user
    and record counts and a progress message per figure go to stdout.
    """
    outdir = "output/protobowl/"
    pathlib.Path(outdir).mkdir(parents=True, exist_ok=True)

    records = load_protobowl()
    # Anything that is not literally True counts as an incorrect answer.
    records.result = records.result.apply(lambda flag: flag is True)
    records["log_n_records"] = records.user_n_records.apply(np.log)

    user_stat = records.groupby("uid").agg(np.mean)
    print("{} users".format(len(user_stat)))
    print("{} records".format(len(records)))

    # Opacity scales with the user's log record count relative to the maximum.
    max_color = user_stat.log_n_records.max()
    scaled = user_stat.log_n_records.apply(lambda value: value / max_color)
    user_stat["alpha"] = pd.Series(scaled, index=user_stat.index)

    # 2D user plot
    point_mapping = aes(
        x="relative_position",
        y="result",
        size="user_n_records",
        color="log_n_records",
        alpha="alpha",
    )
    hidden_legends = {"color": False, "alpha": False, "size": False}
    scatter = (ggplot(user_stat) +
               geom_point(point_mapping, show_legend=hidden_legends) +
               scale_color_gradient(high="#e31a1c", low="#ffffcc") +
               labs(x="Average buzzing position", y="Accuracy") +
               theme(aspect_ratio=1))
    scatter.save(os.path.join(outdir, "protobowl_users.pdf"))
    print("p0 done")

    # The three density histograms differ only in column, colours, axis
    # label, file name and progress message.
    histogram_specs = (
        # histogram of number of records
        ("log_n_records", "#e6550d", "#fee6ce", "Log number of records",
         "protobowl_hist.pdf", "p1 done"),
        # histogram of accuracy
        ("result", "#31a354", "#e5f5e0", "Accuracy",
         "protobowl_acc.pdf", "p2 done"),
        # histogram of buzzing position
        ("relative_position", "#3182bd", "#deebf7",
         "Average buzzing position", "protobowl_pos.pdf", "p3 done"),
    )
    for column, outline, fill, x_label, pdf_name, done_msg in histogram_specs:
        histogram = (ggplot(user_stat, aes(x=column, y="..density..")) +
                     geom_histogram(color=outline, fill=fill) +
                     geom_density() +
                     labs(x=x_label, y="Density") +
                     theme(aspect_ratio=0.3))
        histogram.save(os.path.join(outdir, pdf_name))
        print(done_msg)
示例#52
0
                                                         max_depth=5,
                                                         min_samples_split=2,
                                                         max_features=5,
                                                         n_jobs=n_threads)

        sklearn_forest.fit(X, y)
        current_timing = (time.time() - start_time)
        if n >= n_burn_in:
            timing_data['implementation'].append('scikit-learn 0.23.1')
            timing_data['threads'].append(n_threads)
            timing_data['timing'].append(current_timing)

# Summarise the collected timings: mean and standard deviation per
# (implementation, threads) pair.
df = pd.DataFrame(data=timing_data)
df = df.groupby(['implementation', 'threads']).agg(['mean',
                                                    'std']).reset_index()
# Replace the two-level column labels produced by agg with flat names.
df.columns = ['Implementation', 'threads', 'mean', 'std']
print(df)

# Error bars span one standard deviation either side of the mean.
df['error_min'] = df['mean'] - df['std']
df['error_max'] = df['mean'] + df['std']
# Mean timing against thread count, one line per implementation.
p = (ggplot(
    df,
    aes(x='threads', y='mean', group='Implementation',
        color='Implementation')) + geom_line() + geom_point() +
     geom_errorbar(aes(ymin='error_min', ymax='error_max'),
                   width=.2,
                   position=position_dodge(0.05)) +
     labs(x="Number of threads", y="timing [s]"))

p.save(filename='benchmark.png')
# Project the numeric data with `model` (defined outside this excerpt;
# presumably an already-fitted UMAP reducer - TODO confirm) and wrap the
# result in a frame that keeps the original sample index, with the two
# output columns named "1" and "2".
normalized_all_data_UMAPencoded = model.transform(normalized_all_data_numeric)
normalized_all_data_UMAPencoded_df = pd.DataFrame(
    data=normalized_all_data_UMAPencoded,
    index=normalized_all_data.index,
    columns=["1", "2"],
)

# Add back label column
normalized_all_data_UMAPencoded_df["sample group"] = normalized_all_data[
    "sample group"]

# Plot
# Scatter of the two embedding columns, coloured by sample group.
fig = pn.ggplot(normalized_all_data_UMAPencoded_df, pn.aes(x="1", y="2"))
fig += pn.geom_point(pn.aes(color="sample group"), alpha=0.4)
fig += pn.labs(x="UMAP 1",
               y="UMAP 2",
               title="Gene expression data in gene space")
fig += pn.theme_bw()
fig += pn.theme(
    legend_title_align="center",
    plot_background=pn.element_rect(fill="white"),
    legend_key=pn.element_rect(fill="white", colour="white"),
    legend_title=pn.element_text(family="sans-serif", size=15),
    legend_text=pn.element_text(family="sans-serif", size=12),
    plot_title=pn.element_text(family="sans-serif", size=15),
    axis_text=pn.element_text(family="sans-serif", size=12),
    axis_title=pn.element_text(family="sans-serif", size=15),
)
# Three manual colours are supplied for the "sample group" colour scale.
fig += pn.scale_color_manual(["#bdbdbd", "red", "blue"])
# Draw legend keys fully opaque even though the points use alpha=0.4.
fig += pn.guides(colour=pn.guide_legend(override_aes={"alpha": 1}))