Exemplo n.º 1
0
def create_length_plot(len_df, legend_position='right', legend_box='vertical'):
    """Build a faceted density histogram of example lengths with mean markers.

    ``len_df`` is grouped by its 'Task' and 'Method' columns and its 'x'
    column is plotted on the x axis (labelled 'Example Length').

    :param len_df: data frame with at least 'Task', 'Method' and 'x' columns
    :param legend_position: forwarded to ``theme(legend_position=...)``
    :param legend_box: forwarded to ``theme(legend_box=...)``
    :return: the assembled ggplot object
    """
    group_means = len_df.groupby(['Task', 'Method']).mean().reset_index()
    # Single-valued column used as the linetype mapping of the mean segments.
    group_means[' '] = 'Mean Length'

    histogram = geom_histogram(binwidth=2, position='identity', alpha=.6)
    # Numeric label of each group mean, placed just above the segment tops.
    mean_labels = geom_text(
        aes(x='x', y=.22, label='x', color='Method'),
        group_means,
        inherit_aes=False,
        format_string='{:.1f}',
        show_legend=False,
    )
    # Vertical line from the axis up to y=.205 at each group mean.
    mean_markers = geom_segment(
        aes(x='x', xend='x', y=0, yend=.205, linetype=' '),
        group_means,
        inherit_aes=False,
        color='black',
    )
    legend_theme = theme(
        aspect_ratio=1,
        legend_title=element_blank(),
        legend_position=legend_position,
        legend_box=legend_box,
    )

    figure = (
        ggplot(len_df)
        + aes(x='x', fill='Method', y='..density..')
        + histogram
        + mean_labels
        + mean_markers
        + scale_linetype_manual(['dashed'])
        + facet_wrap('Task')
        + xlim(0, 20)
        + ylim(0, .23)
        + xlab('Example Length')
        + ylab('Frequency')
        + scale_color_manual(values=COLORS)
        + scale_fill_manual(values=COLORS)
        + theme_fs()
        + legend_theme
    )
    return figure
Exemplo n.º 2
0
    def show_community_prediction(
        self,
        percent_kept: float = 0.95,
        side_cut_from: str = "both",
        num_samples: int = 1000,
    ):
        """
        Plot samples from the community prediction on this question

        :param percent_kept: percentage of sample distribution to keep
        :param side_cut_from: which side to cut tails from, either 'both','lower', or 'upper'
        :param num_samples: number of samples from the community
        :return: ggplot graphics object
        """
        community_samples = pd.DataFrame(
            data={
                "samples": [self.sample_community() for _ in range(num_samples)]
            }  # type: ignore
        )

        # Plot domain after trimming the requested share of the tails.
        (_xmin, _xmax) = self.get_central_quantiles(
            community_samples,
            percent_kept=percent_kept,
            side_cut_from=side_cut_from,
        )

        question_label = (
            f"Q: {self.name}" if self.name else
            "\n".join(textwrap.wrap(self.data["title"], 60))  # type: ignore
        )
        # The blank-line separator is appended outside the conditional so it
        # follows the label in both cases. Previously `+ "\n\n"` bound only to
        # the `else` branch, producing "Q: <name>Community Predictions" for
        # named questions.
        plot_title = question_label + "\n\n" + "Community Predictions"

        return (ggplot(community_samples, aes("samples")) +
                geom_density(fill="#b3cde3", alpha=0.8) + xlim(_xmin, _xmax) +
                self._scale_x() +
                labs(x="Prediction", y="Density", title=plot_title) +
                ergo_theme)
Exemplo n.º 3
0
def test_changing_xlim_in_stat_density():
    """stat_density must build when xlim is narrower than the data range."""
    lower, upper = 5, 10
    sample_count = 100
    # The data extends one unit beyond each limit.
    data = pd.DataFrame(
        {'x': np.linspace(lower - 1, upper + 1, sample_count)})
    plot = ggplot(data, aes('x')) + stat_density() + xlim(lower, upper)
    # The test passes as long as building raises nothing.
    plot._build()
Exemplo n.º 4
0
def test_coord_trans_backtransforms():
    """Line over infinite x endpoints, limited to [1, 2], under a log10 coord_trans."""
    endpoints = pd.DataFrame({'x': [-np.inf, np.inf], 'y': [1, 2]})
    plot = ggplot(endpoints, aes('x', 'y'))
    plot = plot + geom_line(size=2)
    plot = plot + xlim(1, 2)
    plot = plot + coord_trans(x='log10')
    # Comparing the plot with a name presumably triggers the suite's
    # baseline-image comparison - confirm against the test harness.
    assert plot == 'coord_trans_backtransform'
Exemplo n.º 5
0
def test_changing_xlim_in_stat_density():
    """Building stat_density with xlim tighter than the data warns but does not fail."""
    limits = (5, 10)
    points = 100
    frame = pd.DataFrame(
        {'x': np.linspace(limits[0] - 1, limits[1] + 1, points)})
    plot = ggplot(frame, aes('x'))
    plot = plot + stat_density()
    plot = plot + xlim(*limits)
    # No exception is expected; the build warns about removed points.
    with pytest.warns(PlotnineWarning):
        plot._build()
def test_changing_xlim_in_stat_density():
    """Smoke test: stat_density combined with xlim inside the data range builds."""
    x_min, x_max = _xlim = (5, 10)
    size = 100
    xs = np.linspace(x_min - 1, x_max + 1, size)
    layers = (stat_density(), xlim(*_xlim))
    plot = ggplot(pd.DataFrame({'x': xs}), aes('x'))
    for layer in layers:
        plot = plot + layer
    # Only the absence of an exception is checked.
    plot._build()
Exemplo n.º 7
0
    def scatterplot(cls, df):
        """Save faceted scatterplots of 'Mean' with 95% confidence error bars.

        Rows whose 'index' is 'Overall' or 'No ROI' are dropped, the rest are
        sorted by 'Mean' within each (``config.table_cols``,
        ``config.table_rows``) group, and one PNG per selected ordering is
        written to ``Figures/Scatterplots``.

        :param df: data frame with 'index', 'Mean' and 'Conf_Int_95' columns
            plus the columns named by ``config.table_cols`` / ``config.table_rows``
        """
        Utils.check_and_make_dir("Figures/Scatterplots")

        # Remove No ROI and Overall rows
        keep = (df['index'] != 'Overall') & (df['index'] != 'No ROI')
        df = df[keep]

        # Group by parameters and sort, then reset the index to remove grouping
        df = df.groupby([config.table_cols, config.table_rows]).apply(
            lambda x: x.sort_values(['Mean']))
        df = df.reset_index(drop=True)

        scatterplots = ['roi_ordered', 'stat_ordered']
        if config.table_row_order == 'roi':
            # Was remove('stat'), which is not in the list and raised
            # ValueError whenever this branch ran.
            scatterplots.remove('stat_ordered')
        elif config.table_row_order == 'statorder':
            scatterplots.remove('roi_ordered')

        for plot_kind in scatterplots:
            if config.verbose:
                print(f"Saving {plot_kind} scatterplot!")

            if plot_kind == 'roi_ordered':
                # Order rows based on first facet
                roi_ord = pd.Categorical(df['index'],
                                         categories=df['index'].unique())
            else:
                # Order each facet individually
                # NOTE(review): grouping columns are hard-coded to 'MB'/'SENSE'
                # while faceting uses config.table_cols/table_rows - confirm.
                roi_ord = pd.Categorical(
                    df.groupby(['MB', 'SENSE']).cumcount())

            figure_table = (
                pltn.ggplot(df, pltn.aes(x="Mean", y=roi_ord)) +
                pltn.geom_point(na_rm=True, size=1) + pltn.geom_errorbarh(
                    pltn.aes(xmin="Mean-Conf_Int_95", xmax="Mean+Conf_Int_95"),
                    na_rm=True,
                    height=None) + pltn.xlim(0, None) +
                pltn.scale_y_discrete(labels=[]) +
                pltn.ylab(config.table_y_label) +
                pltn.xlab(config.table_x_label) +
                pltn.facet_grid('{rows}~{cols}'.format(rows=config.table_rows,
                                                       cols=config.table_cols),
                                drop=True,
                                labeller="label_both") +
                pltn.theme_538()  # Set theme
                + pltn.theme(
                    panel_grid_major_y=pltn.themes.element_line(alpha=0),
                    panel_grid_major_x=pltn.themes.element_line(alpha=1),
                    panel_background=pltn.element_rect(fill="gray", alpha=0.1),
                    dpi=config.plot_dpi))

            figure_table.save(
                f"Figures/Scatterplots/{plot_kind}_scatterplot.png",
                height=config.plot_scale,
                width=config.plot_scale * 3,
                verbose=False,
                limitsize=False)
def density_plot1(num_matches_per_round: int,
                  match_lengths_from_one_round: list):
    """Save a density plot of match lengths (new rules, no blowouts, 85 matches/round).

    The figure is written to 'figures/match_length_density_plot.png'.
    ``num_matches_per_round`` is accepted but not used by this function.
    """
    lengths_df = pd.DataFrame({'Match length': match_lengths_from_one_round})

    figure = plt.ggplot(lengths_df, plt.aes(x='Match length'))
    figure = figure + plt.geom_density()
    # Thick black vertical reference line at x=50.
    figure = figure + plt.geom_vline(xintercept=50, color='black', size=2)
    figure = figure + plt.theme_classic()
    figure = figure + plt.xlim([0, 55])

    figure.save(filename='figures/match_length_density_plot.png')
Exemplo n.º 9
0
Arquivo: imgviz.py Projeto: orm011/vls
def ggimg(image, mapping=None, data=None, dpi=80):
    """Create a ggplot canvas sized to ``image`` with a black border around it.

    :param image: object whose ``size`` attribute unpacks to (width, height)
    :param mapping: aesthetic mapping passed to ``ggplot``
    :param data: data passed to ``ggplot``
    :param dpi: forwarded to ``theme_image``
    :return: the ggplot object
    """
    width, height = image.size

    canvas = ggplot(mapping=mapping, data=data)
    # Reversed y scale spanning the image height; x spans the image width.
    canvas = canvas + scale_y_reverse(limits=(0, height))
    canvas = canvas + xlim(0, width)
    # removes legend for line color
    canvas = canvas + scale_color_discrete(guide=False)
    canvas = canvas + theme_image(width, height, dpi=dpi)

    # box around image
    border = annotate(
        "rect",
        xmin=0,
        xmax=width,
        ymin=0,
        ymax=height,
        color="black",
        fill=None,
    )
    return canvas + border
Exemplo n.º 10
0
def plot_continuous_distribution(data_table,
                                 continuous_metric_name,
                                 segment_name,
                                 title,
                                 xlim=None):
    """Density plot of a continuous metric, one coloured curve per segment.

    :param data_table: data frame holding the metric and segment columns
    :param continuous_metric_name: column plotted on the x axis; rows where
        it is null are dropped first
    :param segment_name: column mapped to the line colour
    :param title: plot title
    :param xlim: optional x-axis limits passed to ``plot.xlim``; ``None``
        (or a null scalar) leaves the axis unrestricted
    :return: the ggplot object
    """
    # The original tested the metric column for nulls twice; one test is
    # equivalent.
    # NOTE(review): the second test may have been meant for `segment_name` -
    # confirm whether rows with a null segment should be dropped too.
    metric_present = pd.notnull(data_table[continuous_metric_name])
    filtered_data = data_table[metric_present]

    result = (plot.ggplot(data=filtered_data)
              + plot.aes(x=continuous_metric_name, color=segment_name)
              + plot.geom_density()
              + plot.labs(x=continuous_metric_name, title=title,
                          fill=segment_name))

    # `if pd.notnull(xlim)` raised "truth value of an array is ambiguous" for
    # a (min, max) pair, because pd.notnull is element-wise on sequences.
    # Only a missing value (None or a null scalar) means "no limits".
    limits_missing = xlim is None or (
        pd.api.types.is_scalar(xlim) and pd.isnull(xlim))
    if not limits_missing:
        result = result + plot.xlim(xlim)
    return result
 def create(self, file_path: str) -> None:
     """Render the per-variable histograms of ``self._data`` and save them.

     :param file_path: destination passed to the plot's ``save`` method
     """
     histogram = geom_histogram(bins=100, fill="#1e4f79")
     # One panel per 'variable' value, three panels per row, free scales.
     panels = facet_wrap(facets="variable", scales="free", ncol=3)
     base_theme = theme_classic(base_size=32, base_family="Helvetica")
     overrides = theme(
         text=element_text(size=32),
         axis_title_y=element_text(margin={"r": 40}),
         subplots_adjust={"wspace": 0.3, "hspace": 0.5},
     )

     chart = (
         ggplot(self._data, aes("value"))
         + histogram
         + panels
         + xlim(0, 1)
         + scale_y_continuous(labels=comma_format())
         + ggtitle("Intensity of Design Pattern Use")
         + xlab("Percentage of Classes Participating in Design Pattern")
         + ylab("Number of Projects")
         + base_theme
         + overrides
     )
     chart.save(file_path, width=24, height=24)
Exemplo n.º 12
0
def test_inplace_add():
    """``+=`` must update ggplot and theme objects in place, never rebind them."""
    plot = original = ggplot(df)

    # Mapping, layers and scales
    plot += aes('x', 'y')
    assert plot is original

    plot += geom_point()
    assert plot is original

    plot += stat_identity()
    assert plot is original

    plot += scale_x_continuous()
    assert plot is original

    # Limits: xlim replaces the x scale added above, which emits a warning.
    with pytest.warns(PlotnineWarning):
        plot += xlim(0, 10)
        assert plot is original

    plot += lims(y=(0, 10))
    assert plot is original

    # Labels, coordinates, facets, annotations, guides and themes
    plot += labs(x='x')
    assert plot is original

    plot += coord_trans()
    assert plot is original

    plot += facet_null()
    assert plot is original

    plot += annotate('point', 5, 5, color='red', size=5)
    assert plot is original

    plot += guides()
    assert plot is original

    plot += theme_gray()
    assert plot is original

    # The same in-place contract holds for theme objects.
    base_theme = original_theme = theme_gray()
    base_theme += theme(aspect_ratio=1)
    assert base_theme is original_theme
Exemplo n.º 13
0
def gene_log_HR_plot(inFile, pcaFile=None, model=None):
    """Map component-level log hazard ratios back to genes and plot them.

    The rows of ``par["means"]["logHR"]`` are split in half: the first half
    fills the "tumor" columns and the second half the "non-tumor" columns.
    Both halves are pushed through ``pca.inverse_transform`` to obtain
    per-gene values.

    :param inFile: parameter file passed to ``get_params``
    :param pcaFile: pickled PCA object; defaults to ``inFile`` with
        "_params.hdf5" replaced by "_pca.pkl"
    :param model: optional object whose ``counts.index`` labels the genes;
        when omitted genes are numbered "1".."n"
    :return: tuple ``(plot, logHR_df)``
    """
    # get logHRs
    par = get_params(inFile)
    pca_components = par["means"]["logHR"].shape[0] >> 1
    components = range(pca_components)
    tf_components = slice(pca_components, 2 * pca_components)

    t_logHR = par["means"]["logHR"][components, 0]
    tf_logHR = par["means"]["logHR"][tf_components, 0]

    t_logHR_sd = par["stds"]["logHR"][components, 0]
    tf_logHR_sd = par["stds"]["logHR"][tf_components, 0]

    # get pca
    if pcaFile is None:
        pcaFile = inFile.replace("_params.hdf5", "_pca.pkl")
    # NOTE(review): pickle.load can execute arbitrary code from the file;
    # only load PCA files that come from a trusted source.
    with open(pcaFile, "rb") as buff:
        pca = pickle.load(buff)

    # prep dataframe
    n_genes = pca.components_.shape[1]
    if model is None:
        logHR_df = pd.DataFrame(index=[f"{i+1}" for i in range(n_genes)])
    else:
        logHR_df = pd.DataFrame(index=model.counts.index)
    logHR_df["tumor logHR"] = pca.inverse_transform(t_logHR)
    logHR_df["non-tumor logHR"] = pca.inverse_transform(tf_logHR)
    # Per-gene sd: root of the summed squares of each component's loading
    # scaled by that component's sd.
    logHR_df["tumor logHR sd"] = np.sqrt(
        np.sum((pca.components_ * t_logHR_sd[:, None])**2, axis=0))
    logHR_df["non-tumor logHR sd"] = np.sqrt(
        np.sum((pca.components_ * tf_logHR_sd[:, None])**2, axis=0))
    logHR_df["tumor Z"] = logHR_df["tumor logHR"] / logHR_df["tumor logHR sd"]
    # Fixed: the non-tumor Z score was divided by the *tumor* sd.
    logHR_df["non-tumor Z"] = (logHR_df["non-tumor logHR"] /
                               logHR_df["non-tumor logHR sd"])
    # Two-sided p-values from the standard normal survival function.
    logHR_df["tumor p-value"] = norm.sf(abs(logHR_df["tumor Z"])) * 2
    logHR_df["non-tumor p-value"] = norm.sf(abs(logHR_df["non-tumor Z"])) * 2

    # make plot; both axes share one range
    lb = min(logHR_df["non-tumor logHR"].min(), logHR_df["tumor logHR"].min())
    ub = max(logHR_df["non-tumor logHR"].max(), logHR_df["tumor logHR"].max())
    pl = (pn.ggplot(pn.aes("non-tumor logHR", "tumor logHR"), logHR_df) +
          pn.xlim(lb, ub) + pn.ylim(lb, ub) + pn.theme_minimal() +
          pn.geom_point(alpha=0.3, color="red") + pn.geom_abline())
    return pl, logHR_df
Exemplo n.º 14
0
def main(argv: List[str]) -> None:
    """Simulate ability rolls, print summary statistics and save a bar chart.

    :param argv: command-line arguments (without the program name)
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("roll_rule", type=RollRule, choices=list(RollRule))
    parser.add_argument("--num_iterations", type=int, default=10000)
    parser.add_argument("--seed", type=int, default=None)
    parser.add_argument("--plot_file", default="ability_roll_distribution.png")
    options = parser.parse_args(argv)

    # Seed only on request so unseeded runs stay random.
    if options.seed is not None:
        random.seed(options.seed)

    # Run the simulation and process the data
    data = process_data(simulate(options.roll_rule, options.num_iterations))

    # Calculate statistics; "percent" is divided by 100 to act as a weight.
    mean = sum(data["value"] * data["percent"] / 100.0)
    mode = data.iloc[data["count"].idxmax()]["value"]
    variance = sum(data["percent"] / 100.0 * (data["value"] - mean)**2.0)
    stddev = math.sqrt(variance)
    skewness = pearson_first_skewness(mean, mode, stddev)

    # Print out result information
    print(data)
    print()
    summary = (
        ("Mean:", mean),
        ("Mode:", mode),
        ("Standard deviation:", stddev),
        ("Skewness:", skewness),
    )
    for label, value in summary:
        print(label, value)

    # Plot the data
    title = "Ability Score Distribution ({} iterations)".format(
        options.num_iterations)
    plot = plt9.ggplot(data, plt9.aes("value", "percent"))
    plot = plot + plt9.geom_bar(stat="identity")
    plot = plot + plt9.geom_vline(xintercept=mean, color="black")
    plot = plot + plt9.xlim(0, 21)
    plot = plot + plt9.ylab("Chance (%)")
    plot = plot + plt9.xlab("Ability Score")
    plot = plot + plt9.ggtitle(title)

    plot.save(options.plot_file, dpi=300)
    print("Wrote plot image to:", options.plot_file)
def density_plot2(num_matches_per_round: int,
                  match_lengths_from_one_round: list,
                  match_lengths_from_one_round_with_blowouts: list):
    """Save density curves of match lengths, blowouts vs. no blowouts (new rules, 85 matches/round).

    The two length lists are stacked and labelled 'No' / 'Yes' in a
    'Blowouts' column, with ``num_matches_per_round`` labels for each.
    The figure is written to
    'figures/match_length_with_blowout_density_plot.png'.
    """
    lengths = np.concatenate([
        match_lengths_from_one_round,
        match_lengths_from_one_round_with_blowouts,
    ])
    blowout_flags = np.concatenate([
        np.repeat('No', num_matches_per_round),
        np.repeat('Yes', num_matches_per_round),
    ])
    combined = pd.DataFrame({
        'Match length': lengths,
        'Blowouts': blowout_flags,
    })

    figure = plt.ggplot(combined,
                        plt.aes(x='Match length', color='Blowouts'))
    figure = figure + plt.geom_density()
    # Thick black vertical reference line at x=50.
    figure = figure + plt.geom_vline(xintercept=50, color='black', size=2)
    figure = figure + plt.xlim([0, 55])
    figure = figure + plt.theme_classic()

    figure.save(
        filename='figures/match_length_with_blowout_density_plot.png')
Exemplo n.º 16
0
def create_length_plot(len_df, legend_position='right', legend_box='vertical'):
    """Plot per-task length densities by method, marking each group's mean.

    :param len_df: data frame with at least 'Task', 'Method' and 'x' columns
    :param legend_position: value for the theme's ``legend_position``
    :param legend_box: value for the theme's ``legend_box``
    :return: the assembled ggplot object
    """
    means = len_df.groupby(['Task', 'Method']).mean().reset_index()
    # Constant column that the mean segments map to linetype.
    means[' '] = 'Mean Length'

    # Components are added to the plot in exactly this order.
    components = [
        aes(x='x', fill='Method', y='..density..'),
        geom_histogram(binwidth=2, position='identity', alpha=.6),
        # Mean value printed as text at y=.22.
        geom_text(
            aes(x='x', y=.22, label='x', color='Method'),
            means,
            inherit_aes=False,
            format_string='{:.1f}',
            show_legend=False,
        ),
        # Black vertical segment from 0 to .205 at the mean.
        geom_segment(
            aes(x='x', xend='x', y=0, yend=.205, linetype=' '),
            means,
            inherit_aes=False,
            color='black',
        ),
        scale_linetype_manual(['dashed']),
        facet_wrap('Task'),
        xlim(0, 20),
        ylim(0, .23),
        xlab('Example Length'),
        ylab('Frequency'),
        scale_color_manual(values=COLORS),
        scale_fill_manual(values=COLORS),
        theme_fs(),
        theme(
            aspect_ratio=1,
            legend_title=element_blank(),
            legend_position=legend_position,
            legend_box=legend_box,
        ),
    ]

    figure = ggplot(len_df)
    for component in components:
        figure = figure + component
    return figure
Exemplo n.º 17
0
def log_HR_plot(inFile, label_unit=10, log_scale_color=True):
    """Plot tumor vs non-tumor log hazard ratios and their p-values per component.

    The rows of ``par["means"]["logHR"]`` are split in half: the first half
    fills the "tumor" columns and the second half the "non-tumor" columns.

    :param inFile: parameter file passed to ``get_params``
    :param label_unit: components with position ``<= label_unit`` get a text
        label; the rest are left blank
    :param log_scale_color: when true, add log-transformed colour and fill
        colormap scales to the logHR plot
    :return: tuple ``(logHR plot, -log10(p-value) plot, logHR_df)``
    """
    par = get_params(inFile)
    pca_components = par["means"]["logHR"].shape[0] >> 1
    components = range(pca_components)
    tf_components = slice(pca_components, 2 * pca_components)

    logHR_df = pd.DataFrame(index=[f"{i+1}" for i in components])
    logHR_df["tumor logHR"] = par["means"]["logHR"][components, 0]
    logHR_df["non-tumor logHR"] = par["means"]["logHR"][tf_components, 0]
    logHR_df["component"] = components
    logHR_df["label"] = [
        logHR_df.index[i] if i <= label_unit else "" for i in components
    ]
    logHR_df["tumor logHR sd"] = par["stds"]["logHR"][components, 0]
    logHR_df["non-tumor logHR sd"] = par["stds"]["logHR"][tf_components, 0]
    logHR_df["tumor Z"] = logHR_df["tumor logHR"] / logHR_df["tumor logHR sd"]
    # Fixed: the non-tumor Z score was divided by the *tumor* sd.
    logHR_df["non-tumor Z"] = (logHR_df["non-tumor logHR"] /
                               logHR_df["non-tumor logHR sd"])
    # Two-sided p-values from the standard normal survival function.
    logHR_df["tumor p-value"] = norm.sf(abs(logHR_df["tumor Z"])) * 2
    logHR_df["non-tumor p-value"] = norm.sf(abs(logHR_df["non-tumor Z"])) * 2
    logHR_df["tumor -log10(p-value)"] = -np.log10(logHR_df["tumor p-value"])
    logHR_df["non-tumor -log10(p-value)"] = -np.log10(
        logHR_df["non-tumor p-value"])

    # First plot: logHR against logHR, both axes sharing one range.
    lb = min(logHR_df["non-tumor logHR"].min(), logHR_df["tumor logHR"].min())
    ub = max(logHR_df["non-tumor logHR"].max(), logHR_df["tumor logHR"].max())
    pl = (pn.ggplot(
        pn.aes(
            "non-tumor logHR",
            "tumor logHR",
            color="non-tumor p-value",
            fill="tumor p-value",
            label="label",
        ),
        logHR_df,
    ) + pn.xlim(lb, ub) + pn.ylim(lb, ub) + pn.geom_abline() +
          pn.geom_point() + pn.theme_minimal() +
          pn.geom_text(ha="left", va="bottom", color="black"))
    if log_scale_color:
        pl += pn.scale_color_cmap(trans="log")
        pl += pn.scale_fill_cmap(trans="log")

    # Second plot: -log10(p-value) against -log10(p-value), again on a
    # shared range.
    lb = min(
        logHR_df["non-tumor -log10(p-value)"].min(),
        logHR_df["tumor -log10(p-value)"].min(),
    )
    ub = max(
        logHR_df["non-tumor -log10(p-value)"].max(),
        logHR_df["tumor -log10(p-value)"].max(),
    )
    pl_p = (pn.ggplot(
        pn.aes(
            "non-tumor -log10(p-value)",
            "tumor -log10(p-value)",
            color="component",
            label="label",
        ),
        logHR_df,
    ) + pn.geom_point() + pn.xlim(lb, ub) + pn.ylim(lb, ub) +
            pn.theme_minimal() +
            pn.geom_text(ha="left", va="bottom", color="black"))
    return pl, pl_p, logHR_df
Exemplo n.º 18
0
def _render_plotnine(fig) -> None:
    """Pass the result of ``ggplot.draw`` for ``fig`` to ``st.pyplot``."""
    st.pyplot(
        pn.ggplot.draw(fig),
        clear_figure=True,
        width=100,
        height=200,
        dpi=600,
    )


def search_room(dataframe: pd.DataFrame) -> bool:
    """Render the sidebar filters and the plot selected by the user.

    Sidebar widgets filter ``dataframe`` (top-100 toggle, price range, column
    selection); the filtered frame is handed to ``main_room_type`` and then
    to whichever plot button was pressed.

    :param dataframe: listings with a numeric 'price' column; the lat-long
        plot additionally reads 'latitude', 'longitude' and 'room_type'
    :return: ``True`` after the scatterplot or latitude-longitude plot was
        shown, ``False`` otherwise (including after the barplot, histogram
        and density plots)
    """
    # Search top 100
    top100 = st.sidebar.checkbox(
        "Filter top 100 apartments",
        help="filter only the top 100 apartments by price",
    )

    # Search by price
    lowest_price = min(dataframe.price)
    highest_price = max(dataframe.price)
    min_price, max_price = st.sidebar.slider(
        "Search apartments by price",
        lowest_price,
        highest_price,
        (lowest_price, highest_price),
        help="Insert the min and max price",
    )

    # TODO: filters not implemented yet - review_scores_rating, room type,
    # beds, bathrooms, accommodates.

    # Select columns for plot (all columns are selected by default)
    all_columns = list(dataframe.columns)
    to_select = st.sidebar.multiselect(
        "Seleziona le colonne che vuoi visualizzare",
        all_columns,
        list(all_columns),
        help="Seleziona le colonne che vuoi considerare",
    )

    if top100:
        # Fixed: groupby("price").head(100) kept up to 100 rows for *every*
        # distinct price instead of the 100 highest-priced apartments.
        dataframe = dataframe.nlargest(100, "price")

    dataframe_filtered = dataframe[to_select]

    # The mask comes from the unprojected frame so the price filter works
    # even when 'price' is not among the selected columns.
    dataframe_filtered = dataframe_filtered.loc[
        dataframe.price.between(min_price, max_price)
    ]
    # Launch the data visualization
    main_room_type(dataframe_filtered)

    st.sidebar.markdown("Select plot axis")
    selectable_axes = list(dataframe_filtered.columns)
    axis1 = st.sidebar.selectbox("Select first axis", selectable_axes)
    axis2 = st.sidebar.selectbox("Select second axis", selectable_axes)

    scatterplot = st.sidebar.button(
        "Scatterplot", key="bscatterplot", help="Launch the scatterplot"
    )
    if scatterplot:
        fig = px.scatter(dataframe_filtered, x=axis1, y=axis2)
        st.markdown(f"Plot with: {axis1}, {axis2}")
        st.plotly_chart(fig)
        st.markdown("Raw data used")

        st.dataframe(
            dataframe_filtered.style.highlight_max(axis=0)
            .format({axis2: "{:.2%}"})
            .highlight_null(null_color="red")
            .set_caption("Result table with all the data filtered")
        )
        return True

    barplot = st.sidebar.button(
        "Barplot", key="bggplot", help="Launch the ggplot"
    )
    if barplot:
        st.markdown(
            "To launch this plot please remember to select all the columns in the data"
        )

        fig = (
            pn.ggplot(dataframe_filtered)
            + pn.aes(x=axis1, fill=axis2)
            + pn.geom_bar()
            + pn.theme(axis_text_x=pn.element_text(angle=45, hjust=1))
        )

        st.markdown("### Barplot")
        st.markdown(f"Displaying: {axis1} over {axis2}")
        _render_plotnine(fig)

    histogram = st.sidebar.button(
        "Histogram", key="bp9histogram", help="Launch the ggplot histogram"
    )
    if histogram:
        fig = (
            pn.ggplot(dataframe_filtered)
            + pn.aes(x="price")
            + pn.geom_histogram(fill="blue", colour="black", bins=30)
            + pn.xlim(0, 200)
        )

        st.markdown("### Histogram")
        # NOTE(review): the caption names axis1/axis2 but the histogram
        # always plots 'price' - confirm the intended wording.
        st.markdown(f"Displaying: {axis1} over {axis2}")
        _render_plotnine(fig)

    density = st.sidebar.button(
        "Density", key="bp9density", help="Launch the ggplot density"
    )
    if density:
        # Only the first 1000 filtered rows are used for the density plot.
        fig = (
            pn.ggplot(dataframe_filtered.head(1000))
            + pn.aes(x="price")
            + pn.geom_density(fill="blue", colour="black", alpha=0.5)
            + pn.xlim(0, 200)
        )

        st.markdown("### Density Plot")
        _render_plotnine(fig)

    latlong = st.sidebar.button(
        "Latitude-Longitude",
        key="bp9latlon",
        help="Launch the ggplot latitude and longitude categorical comparison",
    )
    if latlong:
        # color categorical variable
        fig = (
            pn.ggplot(
                dataframe_filtered,
                pn.aes(x="latitude", y="longitude", colour="room_type"),
            )
            + pn.geom_point(alpha=0.5)
        )

        st.markdown("### Color categorical variable")
        _render_plotnine(fig)

        return True

    return False
Exemplo n.º 19
0
# Scale every predictor in turn; the names are bound in the same order as the
# calls are made. The 'LDA' predictor is currently left out.
sv, nb, rn, ac, rf, ct = (
    scale_predictors(df, predictor=predictor_name)
    for predictor_name in (
        'SVC',
        'naive_bayes',
        'Random',
        'acg_ip_risk',
        'RandmForest',
        'cheating',
    )
)

# Stack the per-predictor frames into one long table for plotting.
df2 = pd.concat([nb, rn, ac, rf, sv, ct])

print(df2.head(20))
print(df2.describe())

# Step curve per classifier over the full range.
p = (
    pn.ggplot(
        df2,
        pn.aes(x='num_examined', y='num_detected',
               group='classifier', colour='classifier'),
    )
    + pn.geom_step()
    + pn.ggtitle("How Many ppl would we need to intervene on to prevent Y hospitalizations?")
)

p.save(HOME_DIR + 'all_together_d.png', height=8, width=10, units='in', verbose=False)

# Same curves with both axes truncated to [0, 300].
p2 = (
    pn.ggplot(
        df2,
        pn.aes(x='num_examined', y='num_detected',
               group='classifier', colour='classifier'),
    )
    + pn.geom_step()
    + pn.ggtitle("How Many ppl would we need to intervene on to prevent Y hospitalizations?")
    + pn.xlim(0, 300)
    + pn.ylim(0, 300)
)

p2.save(HOME_DIR + 'all_together_trunc.png', height=8, width=10, units='in', verbose=False)


print("Finished!")
Exemplo n.º 20
0
# Titles and axis labels
plot_title = 'SHAP-Based Clusters in T-SNE SHAP Space'
x_axis_label = 'T-SNE Component 1'
y_axis_label = 'T-SNE Component 2'

# The first two columns of the t-SNE frame are plotted as x and y, and the
# axis limits span exactly their observed range.
x_column, y_column = tsne_results_df.columns[0], tsne_results_df.columns[1]
x_values = tsne_results_df.iloc[:, 0]
y_values = tsne_results_df.iloc[:, 1]
xlim = [x_values.min(), x_values.max()]
ylim = [y_values.min(), y_values.max()]

# Points are grouped and coloured by cluster; rug marks and an ellipse per
# group are layered on top. No explicit colour scale is applied.
cluster_mapping = p9.aes(
    x=x_column,
    y=y_column,
    group=clusters_colname,
    color=clusters_colname,
)
plot = (
    p9.ggplot(tsne_results_df, cluster_mapping)
    + p9.geom_point(size=2)
    + p9.geom_rug()
    + p9.stat_ellipse()
    + p9.xlim(*xlim)
    + p9.ylim(*ylim)
    + p9.theme_light(base_size=18)
    + p9.ggtitle(plot_title)
    + p9.labs(x=x_axis_label, y=y_axis_label)
)

# Save the figure, then load the saved file for inline display.
plot_filename = 'shap_clusters.png'
plot.save(plot_filename, width=10, height=10)
from IPython.display import Image
Image(filename=plot_filename)

# + [markdown]
Exemplo n.º 21
0
    def show_prediction(
        self,
        samples,
        percent_kept: float = 0.95,
        side_cut_from: str = "both",
        show_community: bool = False,
        num_samples: int = 1000,
    ):
        """Plot prediction on the true question scale from samples or a submission object. Optionally compare prediction against a sample from the distribution of community predictions

        :param samples: samples from a distribution answering the prediction question (true scale) or a prediction object
        :param percent_kept: percentage of sample distribution to keep
        :param side_cut_from: which side to cut tails from, either 'both','lower', or 'upper'
        :param show_community: boolean indicating whether comparison to community predictions should be made
        :param num_samples: number of samples from the community; when raw
            samples are passed it is replaced by the number of samples given
        :return: ggplot graphics object
        :raises ValueError: if ``samples`` is neither a submission object, a
            list, a numpy array nor a pandas series
        """

        if isinstance(samples, SubmissionMixtureParams):
            # Draw from the submitted mixture, then map the normalized draws
            # back to the question's true scale.
            prediction = samples
            prediction_normed_samples = pd.Series([
                logistic.sample_mixture(prediction)
                for _ in range(num_samples)
            ])
            prediction_true_scale_samples = self.denormalize_samples(
                prediction_normed_samples)
        else:
            if isinstance(samples, list):
                samples = pd.Series(samples)
            # isinstance (rather than an exact type() match) also accepts
            # subclasses of Series / ndarray.
            if not isinstance(samples, (pd.Series, np.ndarray)):
                raise ValueError(
                    "Samples should be a list, numpy arrray or pandas series")
            # Match the community sample count to the supplied samples so the
            # two columns of the comparison frame have equal length.
            num_samples = samples.shape[0]
            prediction_true_scale_samples = samples

        title_name = (
            f"Q: {self.name}" if self.name else "\n".join(
                textwrap.wrap(self.data["title"], 60))  # type: ignore
        )

        if show_community:
            df = pd.DataFrame(
                data={
                    "community": [  # type: ignore
                        self.sample_community() for _ in range(num_samples)
                    ],
                    "prediction":
                    prediction_true_scale_samples,
                })
            # get domain for graph given the percentage of distribution kept
            (_xmin,
             _xmax) = self.get_central_quantiles(df,
                                                 percent_kept=percent_kept,
                                                 side_cut_from=side_cut_from)
            # Long format: one row per sample, tagged with its source column.
            df = pd.melt(df, var_name="sources",
                         value_name="samples")  # type: ignore
            return (ggplot(df, aes("samples", fill="sources")) +
                    scale_fill_brewer(type="qual", palette="Pastel1") +
                    geom_density(alpha=0.8) + xlim(_xmin, _xmax) +
                    self._scale_x() +
                    labs(x="Prediction", y="Density", title=title_name) +
                    ergo_theme +
                    theme(axis_text_x=element_text(rotation=45, hjust=1)))
        else:
            df = pd.DataFrame(
                data={"prediction": prediction_true_scale_samples})
            # get domain for graph given the percentage of distribution kept
            (_xmin,
             _xmax) = self.get_central_quantiles(df,
                                                 percent_kept=percent_kept,
                                                 side_cut_from=side_cut_from)

            # NOTE(review): the fill scale and the second geom_density look
            # like leftovers from the community branch (no fill aesthetic is
            # mapped here); kept as-is so the rendered output is unchanged.
            return (ggplot(df, aes("prediction")) +
                    geom_density(fill="#b3cde3", alpha=0.8) +
                    scale_fill_brewer(type="qual", palette="Pastel1") +
                    geom_density(alpha=0.8) + xlim(_xmin, _xmax) +
                    self._scale_x() +
                    labs(x="Prediction", y="Density", title=title_name) +
                    ergo_theme +
                    theme(axis_text_x=element_text(rotation=45, hjust=1)))
sensitivities.append(0)
especifities_1.append(0)  # so that the plotted curve ends on the diagonal
# Now draw the curve.
import matplotlib.pyplot as plt
# Earlier matplotlib version of the ROC plot, kept for reference:
# %matplotlib inline
# plt.plot(especifities_1,sensitivities, marker="o", linestyle="--", color="r")
# x=[i*0.01 for i in range(100)]
# y=[i*0.01 for i in range(100)]
# plt.plot(x,y)  # draws the diagonal (the worst possible model)
# plt.xlabel("1-Especificidad")
# plt.ylabel("Sensibilidad")
# plt.title("Curva ROC")
# (reminder: the variable selection used for this model was very poor)
#
# The larger the area between the curve and the diagonal, the better the
# predictive model.
from sklearn import metrics
from plotnine import ggplot, aes, geom_line, geom_area, geom_abline, ggtitle, xlim, ylim  # use * to import everything

espec_1, sensit, _ = metrics.roc_curve(Y_test, prob)
df = pd.DataFrame({"x": espec_1, "y": sensit})

auc = metrics.auc(espec_1, sensit)  # area under the curve

print(df.head())
# ROC curve plus the dashed chance diagonal y = x. The dashed layer used to be
# a second geom_line over the same data, which re-drew the ROC curve instead
# of the diagonal shown by the matplotlib version above.
print(
    ggplot(df, aes(x="x", y="y")) + geom_line() +
    geom_abline(intercept=0, slope=1, linetype="dashed") +
    xlim(-0.01, 1.01) + ylim(-0.01, 1.01))
# ROC curve with the area under it shaded; the AUC is shown in the title.
print(
    ggplot(df, aes(x="x", y="y")) + geom_area(alpha=0.25) +
    geom_line(aes(y="y")) + ggtitle("Curva ROC y AUC=%s " % str(auc)))
Exemplo n.º 23
0
# Centre the Democratic vote share on the 0.5 cutoff.
lmb_data['demvoteshare_c'] = lmb_data['demvoteshare'] - 0.5
# Keep only rows where the centred share is available.
lmb_data = lmb_data[~pd.isnull(lmb_data.demvoteshare_c)]
lmb_data['demvoteshare_sq'] = lmb_data['demvoteshare_c'] ** 2

# Aggregate the data: restrict to vote shares in [.45, .55], cut the lagged
# vote share into 100 bins and average the score within each bin.
lmb_data = lmb_data[lmb_data.demvoteshare.between(.45, .55)]
categories = lmb_data.lagdemvoteshare
lmb_data['lagdemvoteshare_100'] = pd.cut(lmb_data.lagdemvoteshare, 100)

agg_lmb_data = (
    lmb_data
    .groupby('lagdemvoteshare_100')['score']
    .mean()
    .reset_index()
)
# 1 on the right of the cutoff, 0 otherwise; used to fit each side separately.
lmb_data['gg_group'] = [int(x > .5) for x in lmb_data.lagdemvoteshare]
# Give the bins x positions 0.01, 0.02, ..., 1.00.
agg_lmb_data['lagdemvoteshare'] = np.arange(0.01, 1.01, .01)

# Plotting: binned means plus a per-side smoother and the cutoff line.
# Quadratic fit on each side of the cutoff
(
    p.ggplot(lmb_data, p.aes('lagdemvoteshare', 'score'))
    + p.geom_point(p.aes(x='lagdemvoteshare', y='score'), data=agg_lmb_data)
    + p.stat_smooth(
        p.aes('lagdemvoteshare', 'score', group='gg_group'),
        data=lmb_data,
        method="lm",
        formula='y ~ x + I(x**2)',
    )
    + p.xlim(0, 1)
    + p.ylim(0, 100)
    + p.geom_vline(xintercept=0.5)
)

# Lowess fit on each side of the cutoff
(
    p.ggplot(lmb_data, p.aes('lagdemvoteshare', 'score'))
    + p.geom_point(p.aes(x='lagdemvoteshare', y='score'), data=agg_lmb_data)
    + p.stat_smooth(
        p.aes('lagdemvoteshare', 'score', group='gg_group'),
        data=lmb_data,
        method="lowess",
    )
    + p.xlim(0, 1)
    + p.ylim(0, 100)
    + p.geom_vline(xintercept=0.5)
)

# Linear fit on each side of the cutoff
(
    p.ggplot(lmb_data, p.aes('lagdemvoteshare', 'score'))
    + p.geom_point(p.aes(x='lagdemvoteshare', y='score'), data=agg_lmb_data)
    + p.stat_smooth(
        p.aes('lagdemvoteshare', 'score', group='gg_group'),
        data=lmb_data,
        method="lm",
    )
    + p.xlim(0, 1)
    + p.ylim(0, 100)
    + p.geom_vline(xintercept=0.5)
)
Exemplo n.º 24
0
            ["Metadata_cell_line", "Metadata_gene_name", "replicate_type"]
        ),
        gg.aes(x="correlation_guide")) + \
        gg.geom_density(gg.aes(fill="Metadata_cell_line"),
                        alpha=0.4) + \
        gg.geom_rug(gg.aes(color="Metadata_cell_line"),
                    show_legend={'color': False}) + \
        gg.theme_bw() + \
    gg.theme(
            subplots_adjust={"wspace": 0.2},
            axis_text=gg.element_text(size=7),
            axis_title=gg.element_text(size=9),
            strip_text=gg.element_text(size=6, color="black"),
            strip_background=gg.element_rect(colour="black", fill="#fdfff4"),
        ) + \
        gg.xlim([-0.5, 1]) + \
        gg.xlab("Median Correlation of All Guides Across Genes") + \
        gg.ylab("Density") + \
        gg.facet_wrap("~replicate_type", nrow=2, scales="free") + \
        gg.scale_fill_manual(name="Cell Line",
                             values=["#1b9e77", "#d95f02", "#7570b3"]) + \
        gg.scale_color_manual(name="Cell Line",
                              values=["#1b9e77", "#d95f02", "#7570b3"])
)

file = os.path.join("figures", "median-guide-correlation-density")
for extension in ['.png', '.pdf']:
    gg.ggsave(cor_density_gg,
              filename='{}{}'.format(file, extension),
              dpi=500,
              height=2,
Exemplo n.º 25
0

def read_data(file):
    """Read a Stata file from the mixtape GitHub repository.

    ``file`` is appended verbatim to the repository base URL and the result
    of ``pd.read_stata`` on that URL is returned.
    """
    base_url = "https://raw.github.com/scunning1975/mixtape/master/"
    return pd.read_stata(base_url + file)


# Simulate 2,500 rows with independently drawn standard-normal 'beauty' and
# 'talent' values (beauty is drawn first, then talent).
start_is_born = pd.DataFrame({
    'beauty': np.random.normal(size=2500),
    'talent': np.random.normal(size=2500),
})

# The score is the plain sum of the two traits.
start_is_born['score'] = start_is_born.beauty + start_is_born.talent
# 85th percentile of the score, stored as a constant column.
start_is_born['c85'] = np.percentile(start_is_born.score, q=85)
# 'star' flags rows whose score is strictly above that percentile.
start_is_born['star'] = 0
above_cutoff = start_is_born.score > start_is_born.c85
start_is_born.loc[above_cutoff, 'star'] = 1
start_is_born.head()

# Regress beauty on talent over the whole simulated sample.
lm = sm.OLS.from_formula('beauty ~ talent', data=start_is_born).fit()

# Scatter of talent against beauty for every row.
(p.ggplot(start_is_born, p.aes(x='talent', y='beauty'))
 + p.geom_point(size=0.5)
 + p.xlim(-4, 4)
 + p.ylim(-4, 4))

# The same scatter restricted to rows flagged star == 1.
(p.ggplot(start_is_born[start_is_born.star == 1],
          p.aes(x='talent', y='beauty'))
 + p.geom_point(size=0.5)
 + p.xlim(-4, 4)
 + p.ylim(-4, 4))

# The same scatter restricted to rows flagged star == 0.
(p.ggplot(start_is_born[start_is_born.star == 0],
          p.aes(x='talent', y='beauty'))
 + p.geom_point(size=0.5)
 + p.xlim(-4, 4)
 + p.ylim(-4, 4))
Exemplo n.º 26
0
# For each pair of consecutive commits, record how many lines
# `git diff --shortstat` reports as inserted and deleted.
sizes = []
for left_sha, right_sha in zip(commits, commits[1:]):
    completed = subprocess.run(
        ['git', 'diff', '--shortstat', left_sha, right_sha],
        stdout=subprocess.PIPE)
    tokens = completed.stdout.decode().split()
    counts = {'insertions': 0, 'deletions': 0}
    # Each count is the token immediately before the word that names it.
    for pos, token in enumerate(tokens):
        if 'insertion' in token:
            counts['insertions'] = int(tokens[pos - 1])
        if 'deletion' in token:
            counts['deletions'] = int(tokens[pos - 1])
    sizes.append(counts)

df = pandas.DataFrame(sizes)
df['newlines'] = df['insertions'] - df['deletions']
df.describe()

# show some basic stat
# Share of diffs whose net line change is below / above each threshold,
# printed as a percentage rounded to two decimals.
total_cells = df.size
for n in (-500, -100):
    share = df[df.newlines < n].size / total_cells
    print('<', n, round(share * 100, 2), '%')
for n in (0, 100, 500, 1000, 2000):
    share = df[df.newlines > n].size / total_cells
    print('>', n, round(share * 100, 2), '%')

# draw charts
# One density of the net line change, shown over the negative range and then
# over the positive range.
newlines_density = gg.ggplot(df, gg.aes(x='newlines')) + gg.geom_density()

(newlines_density + gg.xlim(-2000, 0))

(newlines_density + gg.xlim(0, 2000))
# Rows of `output` whose status_sign is -1.
targene_geo_wt = output[output['status_sign'] == -1]

# Output t-test results
# Two-sample t-test on the 'weight' columns without assuming equal variances
# (presumably scipy.stats.ttest_ind; the import is not visible here).
t_results_geo_targene = ttest_ind(
    a=targene_geo_mutant['weight'],
    b=targene_geo_wt['weight'],
    equal_var=False,
)
t_statistic = t_results_geo_targene[0]
p_value = t_results_geo_targene[1]
print('Statistic = {:.2f}, p = {:.2E}'.format(t_statistic, Decimal(p_value)))

# graphical output for predictions
p = (gg.ggplot(output,
               gg.aes(x='weight', y='dummy_y', color='factor(status_sign)')) +
     gg.geom_hline(gg.aes(yintercept=0), linetype='solid') +
     gg.geom_point(size=4) +
     gg.scale_color_manual(values=["#377eb8", "#ff7f00"], labels=['WT', 'Mutant']) +
     gg.ylim([-0.1, 0.1]) +
     gg.xlim([-0.001, 1.001]) +
     gg.theme_seaborn(style='whitegrid') +
     gg.xlab('Targene Classifier Score') +
     gg.ylab('') +
     gg.labs(color='Sample_status') +
     gg.ggtitle('Mutant vs WT \n') +
     gg.theme(
        plot_title=gg.element_text(size=22),
        axis_title_x=gg.element_text(size=16),
        axis_text_x=gg.element_text(size=16),
        axis_text_y=gg.element_blank(),
        axis_ticks_length=4,
        axis_ticks_major_y=gg.element_blank(),
        axis_ticks_minor_y=gg.element_blank(),
        axis_ticks_minor_x=gg.element_blank(),
        legend_position=(1.02, 0.8),
Exemplo n.º 28
0
    def histogram_make(roi, combined_raw_df, list_rois, config, xlimit,
                       save_function, find_xlim_function):
        """Build a faceted histogram of voxel values for one ROI and save it.

        Args:
            roi: Index into ``list_rois`` selecting the ROI name for this figure.
            combined_raw_df: DataFrame to plot. Its ``voxel_value`` column is
                binned on the x axis; when mean/median lines are enabled the
                ``stat_value`` and ``Statistic`` columns are read as well. The
                facet columns are named by ``config``.
            list_rois: Sequence of ROI names.
            config: Settings object; the ``histogram_*``, ``verbose``,
                ``plot_dpi``, ``colorblind_friendly_plot_colours`` and
                ``use_same_axis_limits`` attributes are read.
            xlimit: Upper x-axis limit. A falsy value leaves the upper limit
                open; a truthy value fixes it and appends ``_same_xlim`` to
                the ROI name.
            save_function: Called as
                ``save_function(figure, roi_name, config, folder, 'histogram')``.
            find_xlim_function: Called as
                ``find_xlim_function(roi_name, figure, 'xaxis')`` when a shared
                axis limit still has to be determined.

        Returns:
            ``None`` when ``combined_raw_df`` is empty. Otherwise the value
            returned by ``find_xlim_function`` when it was called, else ``0``.
            When ``config.use_same_axis_limits == 'Same limits'`` and
            ``xlimit == 0`` the figure is not saved on this call.
        """
        # Guard clause: nothing to plot for an empty frame.
        if combined_raw_df.empty:
            if config.verbose:
                print(
                    'INFO: Histograms cannot be made for the No ROI category.')
            return

        thisroi = list_rois[roi]

        figure = (
            pltn.ggplot(combined_raw_df, pltn.aes(x="voxel_value")) +
            pltn.theme_538() + pltn.geom_histogram(
                binwidth=config.histogram_binwidth,
                fill=config.histogram_fig_colour,
                boundary=0,
                na_rm=True
            )  # Boundary centers the bars, na_rm cancels error from setting an xlimit
            + pltn.facet_grid(
                f"{config.histogram_fig_y_facet}~{config.histogram_fig_x_facet}",
                drop=True,
                labeller="label_both") +
            pltn.labs(x=config.histogram_fig_label_x,
                      y=config.histogram_fig_label_y) +
            pltn.theme(
                panel_grid_minor_x=pltn.themes.element_line(alpha=0),
                panel_grid_major_x=pltn.themes.element_line(alpha=1),
                panel_grid_major_y=pltn.element_line(alpha=0),
                plot_background=pltn.element_rect(fill="white"),
                panel_background=pltn.element_rect(fill="gray", alpha=0.1),
                axis_title_x=pltn.element_text(
                    weight='bold', color='black', size=20),
                axis_title_y=pltn.element_text(
                    weight='bold', color='black', size=20),
                strip_text_x=pltn.element_text(
                    weight='bold', size=10, color='black'),
                strip_text_y=pltn.element_text(
                    weight='bold', size=10, color='black'),
                axis_text_x=pltn.element_text(size=10, color='black'),
                axis_text_y=pltn.element_text(size=10, color='black'),
                dpi=config.plot_dpi))

        # Display mean or median as vertical lines on plot
        if config.histogram_show_mean or config.histogram_show_median:
            figure += pltn.geom_vline(pltn.aes(xintercept="stat_value",
                                               color="Statistic"),
                                      size=config.histogram_stat_line_size)
            figure += pltn.scale_color_manual(values=[
                config.colorblind_friendly_plot_colours[3],
                config.colorblind_friendly_plot_colours[1]
            ])

        # Hide the legend for mean and median unless it was requested
        if not config.histogram_show_legend:
            figure += pltn.theme(legend_position='none')

        if xlimit:
            # Set x limit of figure (used to make it the same for every histogram)
            figure += pltn.xlim(-1, xlimit)
            thisroi += '_same_xlim'
        else:
            figure += pltn.xlim(-1, None)

        # A shared limit is only looked up while none has been supplied yet.
        returned_xlim = 0
        if config.use_same_axis_limits in ('Same limits',
                                           'Create both') and xlimit == 0:
            returned_xlim = find_xlim_function(thisroi, figure, 'xaxis')

        if config.use_same_axis_limits == 'Same limits' and xlimit == 0:
            # Only the limit is wanted on this call; the figure is not saved.
            return returned_xlim
        elif xlimit != 0:
            folder = 'Same_xaxis'
        else:
            folder = 'Different_xaxis'

        # Suppress Pandas warning about alignment of non-concatenation axis.
        # catch_warnings restores the caller's warning filters afterwards,
        # even if save_function raises, instead of leaving the global filter
        # list modified.
        with warnings.catch_warnings():
            warnings.simplefilter(action='ignore', category=FutureWarning)
            save_function(figure, thisroi, config, folder, 'histogram')

        return returned_xlim
Exemplo n.º 29
0
def concurrent_agents_plot(experiment_name='graph_indep_concurrent',
                           data_path=_DEFAULT_DATA_PATH,
                           paper_version=True):
    """Plot mean instantaneous regret per action and per time period.

    Passing ``paper_version=True`` should be used to reproduce Fig. 14 of the
    paper for K = 1, 10, 20, 50, 100. In this case, the labels in the legend
    are manually ordered by the values of K. Otherwise, the labels are ordered
    alphabetically.

    Args:
      experiment_name: Experiment name forwarded to ``load_data``.
      data_path: Data location forwarded to ``load_data``.
      paper_version: Selects the manually labelled and coloured legend (True)
        or a brewer palette keyed on the raw ``agent`` column (False).

    Returns:
      A dict with the keys ``'per_action_plot'`` and ``'per_period_plot'``,
      each holding the corresponding plot object.
    """
    df = load_data(data_path, experiment_name)

    # The string 'mean' is used instead of np.mean: recent pandas versions
    # deprecate passing the NumPy callable to groupby.agg and document the
    # string as the way to keep the current behaviour.
    plt_df_per_action = (df.groupby(['agent', 't', 'agent_id',
                                     'action_id']).agg({
                                         'instant_regret': 'mean'
                                     }).reset_index())

    plt_df_per_period = (df.groupby(['agent', 't']).agg({
        'instant_regret': 'mean'
    }).reset_index())

    # Upper x limit of the per-action plot: 2.5 times the number of distinct
    # time periods. Identical in both branches, so computed once.
    per_action_x_max = 2.5 * len(plt_df_per_period.groupby('t'))

    # NOTE(review): each plot adds geom_line twice (default, then thicker and
    # translucent); both layers are kept so the rendered output is unchanged.
    if not paper_version:
        p_per_action = (
            gg.ggplot(plt_df_per_action) +
            gg.aes('action_id', 'instant_regret', colour='agent') +
            gg.geom_line() + gg.geom_line(size=1.25, alpha=0.75) +
            gg.xlim(0, per_action_x_max) +
            gg.scale_colour_brewer(name='agent', type='qual', palette='Set1') +
            gg.labels.xlab('number of actions') +
            gg.labels.ylab('per-period regret'))

        p_per_period = (
            gg.ggplot(plt_df_per_period) +
            gg.aes('t', 'instant_regret', colour='agent') + gg.geom_line() +
            gg.geom_line(size=1.25, alpha=0.75) +
            gg.scale_colour_brewer(name='agent', type='qual', palette='Set1') +
            gg.labels.xlab('time period (t)') +
            gg.labels.ylab('per-period regret'))
    else:
        # Replace/add an 'agent_id' column derived from the agent name; it is
        # the colour key that the manual labels and colours are matched to.
        plt_df_per_action['agent_id'] = plt_df_per_action.agent.apply(
            get_agent_id)
        plt_df_per_period['agent_id'] = plt_df_per_period.agent.apply(
            get_agent_id)

        custom_labels = ['K = 1', 'K = 10', 'K = 20', 'K = 50', 'K = 100']
        custom_colors = ["#E41A1C", "#377EB8", "#4DAF4A", "#984EA3", "#FF7F00"]

        p_per_action = (
            gg.ggplot(plt_df_per_action) +
            gg.aes('action_id', 'instant_regret', colour='agent_id') +
            gg.geom_line() + gg.geom_line(size=1.25, alpha=0.75) +
            gg.xlim(0, per_action_x_max) +
            gg.scale_color_manual(
                name='agent', labels=custom_labels, values=custom_colors) +
            gg.labels.xlab('number of actions') +
            gg.labels.ylab('per-action regret'))

        p_per_period = (
            gg.ggplot(plt_df_per_period) +
            gg.aes('t', 'instant_regret', colour='agent_id') + gg.geom_line() +
            gg.geom_line(size=1.25, alpha=0.75) + gg.scale_color_manual(
                name='agent', labels=custom_labels, values=custom_colors) +
            gg.labels.xlab('time period (t)') +
            gg.labels.ylab('per-period regret'))

    return {
        'per_action_plot': p_per_action,
        'per_period_plot': p_per_period,
    }
Exemplo n.º 30
0
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
from itertools import combinations
import plotnine as p

# read data
import ssl

# SECURITY NOTE(review): this swaps the ssl module's default HTTPS context
# factory for the unverified one, so HTTPS requests that rely on the default
# context no longer validate server certificates. It affects the whole
# process, not only the downloads below; confirm this is acceptable outside
# of a throwaway example.
ssl._create_default_https_context = ssl._create_unverified_context


def read_data(file):
    """Read a Stata file from the mixtape GitHub repository.

    ``file`` is appended verbatim to the repository base URL and the result
    of ``pd.read_stata`` on that URL is returned.
    """
    base_url = "https://raw.github.com/scunning1975/mixtape/master/"
    return pd.read_stata(base_url + file)


# Outcomes for the 20 rows with d == 0 ...
_y_d0 = (0.22, -0.87, -2.39, -1.79, 0.37, -1.54, 1.28, -0.31, -0.74, 1.72,
         0.38, -0.17, -0.62, -1.10, 0.30, 0.15, 2.30, 0.19, -0.50, -0.9)
# ... and for the 20 rows with d == 1.
_y_d1 = (-5.13, -2.19, 2.43, -3.83, 0.5, -3.25, 4.32, 1.63, 5.18, -0.43,
         7.11, 4.87, -3.10, -5.81, 3.76, 6.31, 2.58, 0.07, 5.76, 3.50)

# 'd' is twenty 0s followed by twenty 1s; 'y' holds the matching outcomes.
tb = pd.DataFrame({
    'd': np.repeat([0, 1], 20),
    'y': _y_d0 + _y_d1,
})

# Density of y drawn separately for each value of d, with the two legend
# entries labelled "Control" and "Treatment".
(p.ggplot()
 + p.geom_density(tb, p.aes(x='y', color='factor(d)'))
 + p.xlim(-7, 8)
 + p.labs(title="Kolmogorov-Smirnov Test")
 + p.scale_color_discrete(labels=("Control", "Treatment")))