示例#1
0
    def hist(self, bins=None, stacked=None, orientation="vertical", **kwargs):
        """Build a histogram of every column of the preprocessed data.

        All columns are folded into ``column``/``value`` pairs, the values
        are binned, and the per-bin counts are drawn as bars coloured by
        the originating column.

        Args:
            bins: an int is used as ``alt.Bin(maxbins=...)``; ``None`` means
                default binning (``True``); anything else is passed through
                to the ``bin`` argument unchanged.
            stacked: forwarded as the ``stack`` option of the count axis.
            orientation: ``"vertical"`` or ``"horizontal"``.
            **kwargs: handed to ``self._get_mark_def``; ``subplots`` and
                ``layout`` are additionally read here to facet the chart.

        Returns:
            The encoded chart, faceted by column when ``subplots`` is truthy.

        Raises:
            ValueError: if ``orientation`` is not one of the two values.
        """
        frame = self._preprocess_data(with_index=False)

        if bins is None:
            bin_spec = True
        elif isinstance(bins, int):
            bin_spec = alt.Bin(maxbins=bins)
        else:
            bin_spec = bins

        # The binned value goes on one axis and the count on the other;
        # the orientation only decides which is which.
        if orientation == "vertical":
            value_axis, count_axis = alt.X, alt.Y
        elif orientation == "horizontal":
            value_axis, count_axis = alt.Y, alt.X
        else:
            raise ValueError("orientation must be 'horizontal' or 'vertical'.")

        mark_def = self._get_mark_def(
            {"type": "bar", "orient": orientation}, kwargs)

        folded = alt.Chart(frame, mark=mark_def).transform_fold(
            list(frame.columns), as_=["column", "value"])
        chart = folded.encode(
            value_axis("value:Q", title=None, bin=bin_spec),
            count_axis("count()", title="Frequency", stack=stacked),
            color="column:N",
        )

        if kwargs.get("subplots"):
            # Only the column count of the layout is used for wrapping.
            _, ncols = _get_layout(frame.shape[1],
                                   kwargs.get("layout", (-1, 1)))
            per_column = chart.encode(facet=alt.Facet("column:N", title=None))
            chart = per_column.properties(columns=ncols)

        return chart
示例#2
0
def altPlotNewDeaths(df: pd.DataFrame, save_chart=False):
    """Draw small-multiple line charts of new deaths, one panel per region.

    Rows whose ``denominazione_regione`` equals 'Molise' are excluded. The
    plotted value is ``rolling_mean``, a window mean of ``nuovi_decessi``
    over the frame [-1, 1], computed separately for each region.

    Args:
        df: input data with the columns referenced above plus ``data``.
        save_chart: when truthy, the chart is also written to
            'newDeaths.png' with a scale factor of 2.0.

    Returns:
        The configured Altair chart.
    """
    regions_shown = df[df['denominazione_regione'] != 'Molise']

    region_color = alt.Color('denominazione_regione:N',
                             legend=None,
                             scale=alt.Scale(scheme='dark2'))
    region_panels = alt.Facet('denominazione_regione:N', columns=4, title=None)
    deaths_tooltip = [alt.Tooltip('nuovi_decessi:Q', title='Nuovi decessi')]

    lines = alt.Chart(regions_shown).mark_line().encode(
        alt.X('data:T', title=None),
        alt.Y('rolling_mean:Q', title=None),
        color=region_color,
        facet=region_panels,
        tooltip=deaths_tooltip,
    )
    sized = lines.properties(width=160, height=90, title='Nuovi decessi')
    themed = sized.configure_view(strokeWidth=0)
    themed = themed.configure_axis(grid=False)
    themed = themed.configure_title(color='gray', fontSize=24)
    themed = themed.configure_line(size=4)

    dChart = themed.transform_window(
        rolling_mean='mean(nuovi_decessi)',
        frame=[-1, 1],
        groupby=['denominazione_regione'],
    )

    if save_chart:
        dChart.save('newDeaths.png', scale_factor=2.0)

    return dChart
def visualise_tsne(tsne_df, save=True, fig_num=15):
    """Visualise a t-SNE projection as faceted, interactive scatter plots.

    Points are positioned by ``x``/``y`` (axes without ticks or labels),
    coloured by ``org_type``, sized by ``activity`` and outlined according
    to ``top``. One panel is drawn per value of ``size``, two per row,
    each with independent x and y scales.

    Args:
        tsne_df: data frame with the columns named above plus ``index``,
            which is shown as the tooltip.
        save: the figure is saved only when this is exactly ``True``.
        fig_num: figure number used in the saved file name
            (``fig_<fig_num>_tsne``). It used to be ignored in favour of a
            hard-coded 15; the default keeps the old file name.

    Returns:
        The faceted chart.
    """
    tsne_base = alt.Chart(tsne_df).encode(
        x=alt.X("x:Q", title="", axis=alt.Axis(ticks=False, labels=False)),
        y=alt.Y("y:Q", title="", axis=alt.Axis(ticks=False, labels=False)),
    )

    tsne_points = ((
        tsne_base.mark_point(
            filled=True, opacity=0.5, stroke="black",
            strokeOpacity=0.5).encode(
                color=alt.Color("org_type", title="Organisation type"),
                # NOTE(review): the strokeWidth channel is fed an alt.Stroke
                # definition; alt.StrokeWidth looks like the intended class
                # -- confirm before changing, as it may alter the spec.
                strokeWidth=alt.Stroke("top",
                                       scale=alt.Scale(range=[0, 1]),
                                       legend=None),
                size=alt.Size("activity:Q", title="Number of papers"),
                facet=alt.Facet("size",
                                columns=2,
                                title="Number of organisations in plot"),
                tooltip=["index"],
            )).interactive().resolve_scale(
                y="independent", x="independent").properties(width=250,
                                                             height=250))

    if save is True:
        # Honour the caller-supplied figure number.
        save_altair(tsne_points, f"fig_{fig_num}_tsne", driv)

    return tsne_points
示例#4
0
def altPlotNewICU(df: pd.DataFrame, save_chart=False):
    """Draw small-multiple line charts of new ICU admissions per region.

    The plotted value is ``rolling_mean``, a window mean of
    ``ingressi_terapia_intensiva`` over the frame [-1, 1], computed
    separately for each ``denominazione_regione``.

    Args:
        df: input data with the columns referenced above plus ``data``.
        save_chart: when truthy, the chart is also written to 'newTI.png'
            with a scale factor of 2.0.

    Returns:
        The configured Altair chart.
    """
    region_color = alt.Color('denominazione_regione:N',
                             legend=None,
                             scale=alt.Scale(scheme='dark2'))
    region_panels = alt.Facet('denominazione_regione:N', columns=4, title=None)
    icu_tooltip = [
        alt.Tooltip('ingressi_terapia_intensiva:Q', title='Ingressi TI')
    ]

    lines = alt.Chart(df).mark_line().encode(
        alt.X('data:T', title=None),
        alt.Y('rolling_mean:Q', title=None),
        color=region_color,
        facet=region_panels,
        tooltip=icu_tooltip,
    )
    sized = lines.properties(width=160,
                             height=90,
                             title='Terapie intensive: nuovi ingressi')
    themed = sized.configure_view(strokeWidth=0)
    themed = themed.configure_axis(grid=False)
    themed = themed.configure_title(color='gray', fontSize=24)
    themed = themed.configure_line(size=4)

    tiChart = themed.transform_window(
        rolling_mean='mean(ingressi_terapia_intensiva)',
        frame=[-1, 1],
        groupby=['denominazione_regione'],
    )

    if save_chart:
        tiChart.save('newTI.png', scale_factor=2.0)

    return tiChart
示例#5
0
    def plot_covariate_effects(self):
        """Plot covariate effects, one stacked panel group per parameter.

        ``self.covariate_effects`` is converted to a percentage change
        (``(effect - 1) * 100``) and joined with the melted
        ``self.covariate_statistics``. For each parameter a layered chart
        (error bar from ``p5`` to ``p95``, dashed rule at zero, point at
        ``mean`` and a red text label of ``value``) is faceted by
        covariate; the per-parameter charts are stacked vertically with a
        shared x scale.

        Returns:
            The vertically concatenated Altair chart.
        """
        effects_pct = (self.covariate_effects - 1) * 100

        stats_long = pd.melt(self.covariate_statistics.reset_index(),
                             var_name='condition',
                             id_vars=['covariate'],
                             value_vars=['p5', 'p95', 'other'])
        stats_long = stats_long.replace({'p5': '5th', 'p95': '95th'})
        stats_long = stats_long.set_index(['covariate', 'condition'])

        effects_pct = effects_pct.join(stats_long, how='inner')

        # The join reorders the index levels (pandas bug #34133), so put
        # them back in the expected order.
        effects_pct = effects_pct.reorder_levels(
            ['parameter', 'covariate', 'condition'])

        parameters = list(
            effects_pct.index.get_level_values('parameter').unique())
        per_parameter = []

        for parameter in parameters:
            rows = effects_pct.xs(parameter, level=0).reset_index()

            interval = alt.Chart(rows).mark_errorbar(ticks=True).encode(
                x=alt.X('p5:Q',
                        title='Effect size in percent',
                        scale=alt.Scale(zero=False)),
                x2=alt.X2('p95:Q'),
                y=alt.Y('condition:N', title=None),
            )

            zero_line = alt.Chart(rows).mark_rule(strokeDash=[10, 4],
                                                  color='gray')
            zero_line = zero_line.encode(x=alt.X('xzero:Q'))
            zero_line = zero_line.transform_calculate(xzero="0")

            mean_marker = alt.Chart(rows).mark_point(filled=True,
                                                     color='black')
            mean_marker = mean_marker.encode(
                x=alt.X('mean:Q'),
                y=alt.Y('condition:N'),
            )

            value_label = alt.Chart(rows).mark_text(dy=-15, color="red")
            value_label = value_label.encode(
                x=alt.X("mean:Q"),
                y=alt.Y("condition:N"),
                text=alt.Text("value:Q"),
            )

            layered = alt.layer(interval,
                                zero_line,
                                mean_marker,
                                value_label,
                                data=rows,
                                width=800,
                                height=100)
            faceted = layered.facet(columns=1.0,
                                    row=alt.Facet('covariate:N', title=None),
                                    title=f'{parameter}')
            per_parameter.append(faceted.resolve_scale(y='independent'))

        return alt.vconcat(*per_parameter).resolve_scale(x='shared')
示例#6
0
def display_the_plot():
    """Render a faceted bar chart of connection counts for the session's picks.

    ``esb_data`` is narrowed to the rows whose ``Type``, ``Council`` and
    ``Year`` appear in ``session["selection_city"]``,
    ``session["selection_council"]`` and ``session["selection_year"]``
    (years are converted with ``int``). The chart is saved to
    ``templates/plot.html`` and that template is rendered.

    The filter uses ``isin`` instead of concatenating one slice per
    selected value: an empty selection now yields an empty chart rather
    than a ``ValueError`` from ``pd.concat``, and a value selected twice
    no longer duplicates its rows. The debug prints of the intermediate
    frames were dropped.

    Returns:
        The result of ``render_template("plot.html")``.
    """
    selected_years = [int(year) for year in session["selection_year"]]

    matches = (
        esb_data["Type"].isin(session["selection_city"])
        & esb_data["Council"].isin(session["selection_council"])
        & esb_data["Year"].isin(selected_years)
    )
    selection = esb_data[matches]

    plot = alt.Chart(selection).mark_bar().encode(
        alt.X(
            'No_Of_Connections:Q',
            sort=alt.SortField(field='No_Of_Connections', order='ascending'),
            scale=alt.Scale(domain=(0, 1000)),
            axis=alt.Axis(title="Connection Count", tickCount=20),
        ),
        alt.Y('Month:O'),
        alt.Color('Month:N'),
        alt.Facet('Year:O'),
    ).properties(width=200)
    plot.save("templates/plot.html")
    return render_template("plot.html")
def make_chart_topic_spec(
    topic_rca,
    topic_mix,
    arxiv_cat_lookup,
    topic_thres=0.05,
    topic_n=150,
    save=False,
    fig_n="extra_1",
):
    """Visualises prevalence of topics in a category
    Args:
        topic_rca: relative specialisation of topics in categories
        topic_mix: table of topic weights; every column except the first is
            treated as a topic and counted where its weight exceeds
            topic_thres
        arxiv_cat_lookup: lookup between category ids and names
        topic_thres: threshold for topic
        topic_n: number of topics to consider
        save: if we want to save the figure (only when exactly True)
        fig_n: figure id

    Returns:
        The faceted bar chart, one panel per arXiv category.
    """
    logging.info("Extracting topic counts")
    # Visualise topic distributions
    topic_counts_long = topic_rca.reset_index(drop=False).melt(id_vars="index")

    # Extract top topics: count, per topic, the rows above the threshold and
    # keep the topic_n most frequent. A vectorised comparison replaces the
    # element-wise applymap, which is deprecated in recent pandas.
    above_threshold = topic_mix.iloc[:, 1:] > topic_thres
    top_topics = list(
        above_threshold.sum(axis=0)
        .sort_values(ascending=False)[:topic_n]
        .index
    )

    # Focus on those for the long topic. Copy so that the column added
    # below is written to an independent frame, not to a slice of
    # topic_counts_long (avoids SettingWithCopyWarning).
    topic_counts_long_ = topic_counts_long.loc[
        topic_counts_long["variable"].isin(top_topics)
    ].copy()

    # Add nice names for category; each name is split into a list of words
    topic_counts_long_["arx_cat"] = [
        x.split(" ") for x in topic_counts_long_["index"].map(arxiv_cat_lookup)
    ]

    topic_spec = (
        alt.Chart(topic_counts_long_)
        .mark_bar(color="red")
        .encode(
            y=alt.Y(
                "variable", sort=top_topics, axis=alt.Axis(labels=False, ticks=False)
            ),
            x="value",
            facet=alt.Facet("arx_cat", columns=5),
            tooltip=["variable", "value"],
        )
    ).properties(width=100, height=100)

    if save is True:
        save_altair(topic_spec, f"fig_{fig_n}_topic_specialisations", driv)

    return topic_spec
示例#8
0
def visualize_line_facet(df):
    """Write a faceted line chart of ``£mn`` over ``Year`` to Streamlit.

    Lines are coloured by ``Type`` and drawn in one panel per ``Region``
    (three panels per row). The y axis is logarithmic and clamped.
    """
    log_y = alt.Y('£mn:Q', scale=alt.Scale(type='log', clamp=True))
    by_region = alt.Facet('Region:O', columns=3)

    lines = alt.Chart(df).mark_line().encode(
        x='Year:T',
        y=log_y,
        color='Type',
        facet=by_region,
    )
    graph = lines.properties(width=175, height=150)
    st.write(graph)
def plot_median_ci_width_static():
    """Plot the median confidence-interval size against time-series length.

    The module-level ``results`` frame is grouped by ``methodName``,
    ``timeSeriesLength`` and ``trueRho`` and the median ``ci_size`` of each
    group is drawn as points joined by lines on log-log axes, one panel per
    ``trueRho``. Each layer carries its own legend-bound selection on
    ``methodName`` that dims unselected series.

    Returns:
        The faceted chart with header and legend configuration applied.
    """
    medians = pd.DataFrame(
        results.groupby(['methodName', 'timeSeriesLength',
                         'trueRho'])['ci_size'].median()).reset_index()

    # Shared base: x is jittered in proportion to the series length.
    base = alt.Chart(medians).transform_calculate(
        x_jittered='0.05*random()*datum.timeSeriesLength+datum.timeSeriesLength',
        ymin="datum.confIntLow",
        ymax="datum.confIntHigh",
    )

    def legend_toggle():
        """Return a fresh legend-bound selection and its opacity rule."""
        picked = alt.selection_single(fields=['methodName'],
                                      empty='all',
                                      bind='legend')
        return picked, alt.condition(picked, alt.value(1.0), alt.value(0.5))

    # Scatter layer.
    point_pick, point_opacity = legend_toggle()
    points = base.mark_point(filled=True).add_selection(point_pick).encode(
        x=alt.X('x_jittered:Q',
                scale=alt.Scale(type='log'),
                title='Length of Timeseries'),
        y=alt.Y('ci_size:Q',
                scale=alt.Scale(type='log'),
                title='Median size of the CI'),
        size=alt.value(80),
        color=alt.condition(point_pick, col, alt.value('lightgrey')),
        opacity=point_opacity)

    # Connecting-line layer, with a selection of its own.
    line_pick, line_opacity = legend_toggle()
    line = base.mark_line().add_selection(line_pick).encode(
        x=alt.X('x_jittered:Q'),
        y=alt.Y('ci_size:Q'),
        color=alt.condition(line_pick, col, alt.value('lightgrey')),
        opacity=line_opacity)

    layered = alt.layer(points, line).properties(width=250, height=200)
    chart = layered.facet(
        facet=alt.Facet('trueRho:N', title='Autocorrelation parameter (ρ)'),
        columns=3)

    chart = chart.configure_header(titleColor='darkred',
                                   titleFontSize=16,
                                   labelColor='darkred',
                                   labelFontSize=14)

    return chart.configure_legend(strokeColor='gray',
                                  fillColor='#EEEEEE',
                                  padding=10,
                                  cornerRadius=10,
                                  orient='top')
示例#10
0
def scoring_confusion_matrix(
    data: pd.DataFrame,
    xvar: str,
    target_var: str,
    threshold: float = 0.5,
    bin_width: float = 0.1,
    width: int = 200,
    height: int = 200,
) -> alt.Chart:
    """Draw score histograms, one panel per confusion category.

    ``compute_confusion_categories`` labels the rows, ``xvar`` is binned
    with a fixed step of ``bin_width``, and the number of rows per bin and
    category is plotted as bars in a two-column facet grid.

    Args:
        data: input frame, passed to ``compute_confusion_categories``.
        xvar: name of the score column to bin.
        target_var: passed to ``compute_confusion_categories``.
        threshold: passed to ``compute_confusion_categories`` and shown in
            the subtitle.
        bin_width: bin step for the score.
        width: width of each panel.
        height: height of each panel.

    Returns:
        The faceted histogram with title and subtitle set.
    """
    labelled = compute_confusion_categories(data, xvar, target_var, threshold)
    facet_order = labelled[CONFUSION_CATEGORIES_COL_NAME].unique()

    bin_start = f"binned_{xvar}"
    bin_end = f"binned_{xvar}_end"
    score_bins = alt.Bin(step=bin_width)

    embed_options = {
        "scaleFactor": 5,
        "downloadFileName": "scoring_confusion_matrix",
    }
    base = alt.Chart(
        labelled,
        width=width,
        height=height,
        usermeta={"embedOptions": embed_options},
    )

    # Binning and counting are done with explicit transforms so that the
    # faceted chart is sorted as intended.
    # More info: https://github.com/altair-viz/altair/issues/2303.
    bars = base.mark_bar(tooltip=True).encode(
        x=alt.X(
            f"{bin_start}:Q",
            bin="binned",
            axis=alt.Axis(format="~", title="Score"),
        ),
        x2=f"{bin_end}:Q",
        y=alt.Y("y_count:Q", axis=alt.Axis(title="Count")),
        facet=alt.Facet(
            f"{CONFUSION_CATEGORIES_COL_NAME}:O",
            sort=facet_order,
            title=None,
            columns=2,
        ),
    )
    binned = bars.transform_bin(bin_start, xvar, bin=score_bins)
    counted = binned.transform_joinaggregate(
        y_count="count()",
        groupby=[bin_start, bin_end, CONFUSION_CATEGORIES_COL_NAME],
    )

    chart_title = {
        "text": "Scoring confusion matrix",
        "subtitle": f"Threshold: {threshold}",
    }
    return counted.properties(title=chart_title)
def boxplot(df: pd.DataFrame, xaxis: str, x_title: str, yaxis: str,
            y_title: str, color_col: str, color_title: str, facet_column: str,
            facet_title: str, title: str) -> alt.FacetChart:
    """Create an interactive, faceted boxplot with a logarithmic y axis.

    Args:
        df: data to plot.
        xaxis: column for the x axis (encoded as ordinal).
        x_title: x axis title.
        yaxis: column (or shorthand) for the y axis.
        y_title: y axis title.
        color_col: column that colours the boxes (encoded as ordinal).
        color_title: legend title for the colour.
        facet_column: column (or shorthand) to facet by.
        facet_title: header title of the facets.
        title: overall chart title.

    Returns:
        The faceted chart produced by ``.facet(...)``.
    """
    # The annotations are evaluated when the function is defined: the
    # DataFrame class is named rather than instantiated, and the return
    # type uses the version-independent top-level FacetChart alias.
    return alt.Chart(df).mark_boxplot().encode(
        x=alt.X(xaxis + ':O', title=x_title),
        y=alt.Y(yaxis, scale=alt.Scale(type="log"), title=y_title),
        color=alt.Color(color_col + ':O', title=color_title),
    ).facet(alt.Facet(facet_column, title=facet_title)).properties(
        title=title, ).interactive()
示例#12
0
def non_ducs_per_formation(df, step, facet):
    """Show a faceted histogram of 'Days of Uncompleted Status' in Streamlit.

    Args:
        df: input data; it is passed through ``sanitize_dataframe`` first.
        step: bin width for the day counts (bins span 0 to 500).
        facet: field (or shorthand) to facet by, three panels per row.

    Returns:
        Whatever ``st.altair_chart`` returns for the chart.
    """
    clean = sanitize_dataframe(df)
    day_bins = alt.Bin(extent=[0, 500], step=step)

    bars = alt.Chart(clean).mark_bar().encode(
        alt.Y('Days of Uncompleted Status', bin=day_bins),
        alt.X('count()', title='Number Wells'),
        facet=alt.Facet(facet, columns=3),
        color=alt.value('#e68805'),
        opacity=alt.value(0.7),
    )
    chart = bars.properties(width=200).interactive()
    return st.altair_chart(chart)
def plot_results_static():
    """Plot the rate of correct SEM against time-series length per rho.

    The module-level ``data`` is drawn as four layers sharing a jittered
    log x axis: error bars from ``confIntLow`` to ``confIntHigh``, points
    and a connecting line at ``rate``, and a horizontal rule at 0.95. The
    layers are faceted by ``trueRho`` (three panels per row). Colour comes
    from the module-level ``col``.

    Returns:
        The faceted chart with header and legend configuration applied.
    """
    # Shared base with the derived fields used by the layers below.
    base = alt.Chart(data).transform_calculate(
        x_jittered='0.15*(random()-0.5)*datum.timeSeriesLength+datum.timeSeriesLength',
        ymin="datum.confIntLow",
        ymax="datum.confIntHigh",
        goal='0.95',
    )

    # Scatter points.
    points = base.mark_point(filled=True).encode(
        x=alt.X('x_jittered:Q',
                scale=alt.Scale(type='log'),
                title='Length of Timeseries'),
        y=alt.Y('rate:Q',
                scale=alt.Scale(domain=[0, 1.04]),
                title='Rate of correct SEM'),
        size=alt.value(80),
        color=col,
    )

    # Line joining the points.
    line = base.mark_line().encode(
        x=alt.X('x_jittered:Q'),
        y=alt.Y('rate:Q'),
        color=col,
    )

    # Horizontal rule at the 95% goal.
    rule = base.mark_rule(color='black').encode(alt.Y('goal:Q'))

    # Confidence-interval bars.
    errorbars = base.mark_rule(strokeWidth=3).encode(
        alt.X("x_jittered:Q"),
        alt.Y("ymin:Q", title=''),
        alt.Y2("ymax:Q"),
        color=col,
    )

    layered = alt.layer(errorbars, points, line, rule)
    layered = layered.properties(width=250, height=200)
    chart = layered.facet(
        facet=alt.Facet('trueRho:N', title='Autocorrelation parameter (ρ)'),
        columns=3)

    chart = chart.configure_header(titleColor='darkred',
                                   titleFontSize=16,
                                   labelColor='darkred',
                                   labelFontSize=14)

    return chart.configure_legend(strokeColor='gray',
                                  fillColor='#EEEEEE',
                                  padding=10,
                                  cornerRadius=10,
                                  orient='top')
 def make_chart(self, df):
     """Return small-multiple area charts of new confirmed cases.

     One panel is drawn per ``Municipio`` (three per row), each with its
     own x axis. Dates are labelled as day/month.
     """
     date_axis = alt.X('yearmonthdate(bulletin_date):T',
                       title=None,
                       axis=alt.Axis(format='%d/%m'))
     cases_axis = alt.Y('new_confirmed_cases',
                        title=None,
                        scale=alt.Scale(type='linear'))

     area = alt.Chart(df).mark_area().encode(x=date_axis, y=cases_axis)
     panel = area.properties(width=175, height=75)
     grid = panel.facet(columns=3,
                        facet=alt.Facet('Municipio', title=None))
     return grid.resolve_axis(x='independent')
def display_the_facetplot():
    """Save the per-council ESB connections trend and render its template.

    The module-level ``tidy_df`` is drawn as area charts of ``sum(esb)``
    by ``Year``, one panel per ``County Councils`` (four per row). The
    chart is written to ``templates/facetplot.html``.

    Returns:
        The result of ``render_template("facetplot.html")``.
    """
    connections_axis = alt.Y('sum(esb):Q',
                             title='ESB Connections',
                             axis=alt.Axis(format='~s'))
    council_panels = alt.Facet('County Councils:O', columns=4)

    areas = alt.Chart(tidy_df).mark_area().encode(
        x='Year:O',
        y=connections_axis,
        facet=council_panels,
        color='County Councils',
    )
    facetplot = areas.properties(
        title='ESB Connections trend for each County')

    facetplot.save("templates/facetplot.html")

    return render_template("facetplot.html")
示例#16
0
def daily_reported_deaths(df, labels):
    """Chart reported deaths by reporting lag, one panel per publication date.

    Args:
        df: data with ``lag``, ``n_diff`` and ``publication_date`` columns.
        labels: sort order for the ``lag`` axis.

    Returns:
        A faceted chart (seven panels per row); each panel layers the bar
        histogram with a text mark showing ``sum(n_diff)``.
    """
    weekday_color = alt.Color("day(publication_date):N",
                              title="Publication Day",
                              sort=["Mon"])
    bars = alt.Chart(df, height=100, width=100).mark_bar().encode(
        x=alt.X("lag:O", title="Reporting Lag", sort=labels),
        y=alt.Y("sum(n_diff):Q", title="Reported Deaths"),
        color=weekday_color,
    )

    # Total for the panel, pinned to a fixed pixel position.
    total_label = alt.Chart(df).mark_text(align="right",
                                          x=95,
                                          y=28,
                                          fontSize=20)
    total_label = total_label.encode(alt.Text("sum(n_diff)"))

    layered = bars + total_label
    by_date = alt.Facet("publication_date:T", title="Reported Deaths per Day")
    return layered.facet(facet=by_date, columns=7)
def get_peak_perf_bar_chart(csv_file) -> alt.FacetChart:
    """
    Method that creates a simple grouped bar chart from the csv file:

    Parameters
    ----------
    csv_file: str
        csv file from which the bar chart will be created.

    Returns
    -------
    alt.FacetChart
        Layered bars and value labels, faceted by hardware platform
        (five panels per row).

    """
    # The return annotation is evaluated when the function is defined; the
    # top-level alias does not depend on the ``alt.vegalite.v4`` module and
    # names the type that ``.facet(...)`` actually returns.

    df = pd.read_csv(csv_file)
    # Drop the row labelled 0 before reshaping.
    df = df.drop(0)
    df = pd.melt(df,
                 id_vars=['Hardware Platforms'],
                 value_vars=[
                     'INT2', 'INT4', 'INT8', 'FP16', 'FP32', 'Memory Bandwidth'
                 ],
                 var_name='Datatypes and MB')
    df = df.dropna()

    # The layers carry no data of their own; it is supplied to alt.layer.
    bars = alt.Chart().mark_bar().encode(
        x=alt.X('Datatypes and MB:O', title=''),
        y=alt.Y('value:Q',
                scale=alt.Scale(type='log'),
                title='Peak Performance [TOP/sec] and MB [GBps]'),
        color='Datatypes and MB:N',
    )
    text = bars.mark_text(
        dy=-5  # Nudges text upwards so it doesn't appear on top of the bar
    ).encode(text='value:Q')
    return alt.layer(bars, text, data=df).facet(
        columns=5,
        facet=alt.Facet('Hardware Platforms:N', title='Hardware Platforms')
    ).properties(
        title='Peak Performance and Memory Bandwidth for All Hardware Platforms'
    )
示例#18
0
    def _xy(self,
            mark,
            x=None,
            y=None,
            stacked=False,
            subplots=False,
            **kwargs):
        """Build an interactive x/y chart of one or more value columns.

        The value columns are folded into ``column``/``value`` pairs and
        drawn with the given mark, coloured by column.

        Args:
            mark: mark definition, combined with ``kwargs`` by
                ``self._get_mark_def``.
            x: column for the x axis; defaults to the first column of the
                preprocessed data.
            y: single column to plot; defaults to every column after the
                first.
            stacked: forwarded as the ``stack`` option of the y axis.
            subplots: when truthy, facet the chart by column.
            **kwargs: handed to ``self._get_mark_def``; ``layout`` is also
                read here when ``subplots`` is set.

        Returns:
            The encoded, interactive chart.
        """
        frame = self._preprocess_data(with_index=True)

        if x is not None:
            x = _valid_column(x)
            assert x in frame.columns
        else:
            x = frame.columns[0]

        if y is not None:
            y = _valid_column(y)
            assert y in frame.columns
            value_columns = [y]
        else:
            value_columns = list(frame.columns[1:])

        mark_def = self._get_mark_def(mark, kwargs)
        folded = alt.Chart(frame, mark=mark_def).transform_fold(
            value_columns, as_=["column", "value"])
        encoded = folded.encode(
            x=x,
            y=alt.Y("value:Q", title=None, stack=stacked),
            color=alt.Color("column:N", title=None),
            tooltip=[x] + value_columns,
        )
        chart = encoded.interactive()

        if subplots:
            # Only the column count of the layout is used for wrapping.
            _, ncols = _get_layout(len(value_columns),
                                   kwargs.get("layout", (-1, 1)))
            per_column = chart.encode(facet=alt.Facet("column:N", title=None))
            chart = per_column.properties(columns=ncols)

        return chart
示例#19
0
def html_detail_company():
    """Write the per-company service breakdown chart to an HTML file.

    Rows from ``create_df_service_amount`` are tagged 'Refused' when
    ``ttc_amount`` is null and 'Accepted' otherwise, service and company
    names are title-cased, and the number of contracts per service is
    drawn as bars coloured by payment status, one panel per company (two
    per row). The chart is saved to
    ``app/templates/plot/detail_company.html``; nothing is returned.
    """
    df = create_df_service_amount()

    no_amount = df.ttc_amount.isnull()
    df.loc[no_amount, 'payment_status'] = 'Refused'
    df.loc[~no_amount, 'payment_status'] = 'Accepted'

    df.label_services = df.label_services.str.title()
    df.compagnie = df.compagnie.str.title()

    hover_fields = [
        alt.Tooltip('label_services', title='Prestation'),
        alt.Tooltip('max(ht_amount)', title='Max amount'),
        alt.Tooltip('min(ht_amount)', title='Min amount'),
        alt.Tooltip('count()', title='Devis count'),
    ]
    bars = alt.Chart(df).mark_bar().encode(
        y=alt.Y('label_services', title='Types prestations'),
        x=alt.X('count(label_services)', title='Nombre de contrats'),
        color=alt.Color('payment_status', title='Status paiement'),
        tooltip=hover_fields,
    )
    chart = bars.facet(
        facet=alt.Facet('compagnie:N', title='Compagnies'),
        columns=2,
    )
    chart.save("app/templates/plot/detail_company.html")
示例#20
0
def make_cat_trend(linech, save=True, fig_n=2):
    """Makes chart 2: rolling paper counts per category and source.

    A window mean of ``n`` over the frame [-10, 0] is computed per
    ``category_clean`` and ``type`` and drawn as lines coloured by
    ``type``, one panel per ``category_clean`` (two per row).

    Args:
        linech: data with ``index``, ``n``, ``category_clean`` and
            ``type`` columns.
        save: the figure is saved only when this is exactly ``True``.
        fig_n: figure number used in the saved file name.

    Returns:
        The faceted chart.
    """

    ai_subtrends_chart = (
        alt.Chart(linech)
        .transform_window(
            roll="mean(n)", frame=[-10, 0], groupby=["category_clean", "type"]
        )
        .mark_line()
        .encode(
            x=alt.X("index:T", title=""),
            # The y channel must be built with alt.Y; it was previously
            # given an alt.X definition.
            y=alt.Y("roll:Q", title="Number of papers"),
            color=alt.Color("type", title="Source"),
        )
        .properties(width=200, height=100)
    ).facet(alt.Facet("category_clean", title="Category"), columns=2)

    if save is True:
        save_altair(ai_subtrends_chart, f"fig_{fig_n}_ai_subtrends", driver=driv)

    return ai_subtrends_chart
    def make_chart(self, df):
        """Return labelled horizontal bars of ``value``, faceted by bulletin.

        Bars are drawn per ``variable`` in a fixed order, with the value
        printed in white inside each bar. One panel is produced per
        ``bulletin_date`` (two per row, newest first).
        """
        series_order = ['Confirmados', 'Probables', 'Muertes']

        series_color = alt.Color('variable',
                                 sort=series_order,
                                 legend=alt.Legend(orient='bottom',
                                                   title=None))
        hover_fields = [
            'variable', 'bulletin_date',
            alt.Tooltip(field='value', type='quantitative', format=".1f")
        ]
        bars = alt.Chart(df).mark_bar().encode(
            x=alt.X('value', title="Rezago estimado (días)"),
            y=alt.Y('variable', title=None, sort=series_order, axis=None),
            color=series_color,
            tooltip=hover_fields,
        )

        # Value label, right-aligned just inside the end of each bar.
        labels = bars.mark_text(align='right',
                                baseline='middle',
                                size=12,
                                dx=-5)
        labels = labels.encode(text=alt.Text('value:Q', format='.1f'),
                               color=alt.value('white'))

        layered = (bars + labels).properties(width=300)
        by_bulletin = alt.Facet("bulletin_date",
                                sort="descending",
                                title="Fecha del boletín")
        return layered.facet(columns=2, facet=by_bulletin)
示例#22
0
def altPosRate(df: pd.DataFrame):
    """Draw small-multiple line charts of the swab positivity rate per region.

    ``positivity_rate`` is plotted as-is (no rolling mean is applied) on a
    percent-formatted axis, one panel per ``denominazione_regione`` (four
    per row).

    Args:
        df: input data with ``data``, ``positivity_rate`` and
            ``denominazione_regione`` columns.

    Returns:
        The configured Altair chart.
    """
    region_color = alt.Color('denominazione_regione:N',
                             legend=None,
                             scale=alt.Scale(scheme='dark2'))
    region_panels = alt.Facet('denominazione_regione:N', columns=4, title=None)
    rate_tooltip = [
        alt.Tooltip('positivity_rate:Q', title='Tasso positivi al tampone')
    ]

    lines = alt.Chart(df).mark_line().encode(
        alt.X('data:T', title=None),
        alt.Y('positivity_rate:Q', axis=alt.Axis(format='%'), title=None),
        color=region_color,
        facet=region_panels,
        tooltip=rate_tooltip,
    )
    sized = lines.properties(width=160,
                             height=90,
                             title='Tasso di positivi al tampone')
    themed = sized.configure_view(strokeWidth=0)
    themed = themed.configure_axis(grid=False)
    themed = themed.configure_title(color='gray', fontSize=24)
    prChart = themed.configure_line(size=4)

    return prChart
示例#23
0
# Project the embeddings in `embedding_dict` with compute_tsne_2d_components.
embeddings_to_compare = compute_tsne_2d_components(embedding_dict)

# Build one labelled frame per concept list ('t2dm', 'breast_cancer') and
# stack them row-wise.
t2dm_embeddings = concat_embeddings(embeddings_to_compare, t2dm_concept_list, 't2dm')
breast_cancer_embeddings = concat_embeddings(embeddings_to_compare, breast_cancer, 'breast_cancer')
union_embeddings = pd.concat([t2dm_embeddings, breast_cancer_embeddings], axis=0)

# Join with `concept` to attach concept_name and domain_id, keeping the
# original columns plus those two.
columns = union_embeddings.columns.to_list() + ['concept_name', 'domain_id']
union_embeddings = union_embeddings.merge(concept, left_on='standard_concept_id', right_on='concept_id')[columns]

# Lift Altair's row limit before charting the full frame.
alt.data_transformers.disable_max_rows()

# Scatter of the two t-SNE components, coloured by phenotype and faceted by
# `name` (two panels per row). The bare expression is presumably meant to be
# displayed by a notebook cell.
alt.Chart(union_embeddings, title='embeddings').mark_point().encode(
    x='tsne-2d-one:Q',
    y='tsne-2d-two:Q',
    color='phenotype',
    facet=alt.Facet('name:O', columns=2),
    tooltip=['concept_name']
).interactive()

# ### Measure the average distances for breast cancer
# NOTE(review): this heading originally said "cosine distances", but the
# objects below are EuclideanDistance instances reading the pairwise
# Euclidean distance output -- confirm which metric is intended.

cumc_visit_pairwise_dist = EuclideanDistance(name='cumc_visit', path=get_pairwise_euclidean_distance_output(create_file_path(cumc_embeddings_folder, 'visit')))
ccae_visit_pairwise_dist = EuclideanDistance(name='ccae_visit', path=get_pairwise_euclidean_distance_output(create_file_path(ccae_embeddings_folder, 'visit')))

# Side-by-side table: average distance for the breast cancer concepts and a
# random baseline, for each of the two sources.
pd.concat([cumc_visit_pairwise_dist.compute_average_dist(breast_cancer), 
           cumc_visit_pairwise_dist.compute_random_average_dist(205),
           ccae_visit_pairwise_dist.compute_average_dist(breast_cancer),
           ccae_visit_pairwise_dist.compute_random_average_dist(392)], axis=1)

# NOTE(review): rebinds the name to an EuclideanDistance built from the
# cosine-similarity output path -- confirm the class/path pairing.
cumc_visit_pairwise_dist = EuclideanDistance(name='cumc_visit', path=get_pairwise_cosine_similarity_output(create_file_path(cumc_embeddings_folder, 'visit')))
示例#24
0
"""
Anscombe's Quartet
------------------

This example shows how to use a wrapped facet (the facet channel with ``columns``) to make a trellis plot. Anscombe's Quartet is a famous dataset constructed by Francis Anscombe. Common summary statistics are identical for each subset of the data, despite the subsets having vastly different characteristics.
"""
# category: case studies
import altair as alt
from vega_datasets import data

source = data.anscombe()

# One circle per observation, with X and Y on non-zero-based scales; the four
# series are laid out as a wrapped facet, two panels per row.
alt.Chart(source).mark_circle().encode(
    x=alt.X('X', scale=alt.Scale(zero=False)),
    y=alt.Y('Y', scale=alt.Scale(zero=False)),
    facet=alt.Facet('Series', columns=2),
).properties(width=180, height=180)
示例#25
0
# Layer the acetate uncertainty band, best fit and data points, and title the
# result with the fitted slope and its error taken from yield_params.
ac_layer = (ac_uncertainty + ac_best + ac_points).properties(
    title=
    f"acetate secretion = {yield_params['acetate']['slope']:0.2f} ± {yield_params['acetate']['err']:0.2f} mM / OD"
)

# Save the glucose and acetate panels side by side, as PDF and as PNG.
save(glu_layer | ac_layer, './output/2021-04-04_REL606_glucose_turnover.pdf')
save(glu_layer | ac_layer, './output/2021-04-04_REL606_glucose_turnover.png')

#%%
# Measured concentrations against optical density, coloured by replicate and
# faceted by compound.
points = alt.Chart(samp_data, width=350,
                   height=300).mark_point(size=80).encode(
                       x=alt.X('od_600nm:Q', title='optical density [a.u.]'),
                       y=alt.Y('conc_mM:Q', title='concentration [mM]'),
                       color=alt.Color('replicate:N',
                                       title='biological replicate'),
                       facet=alt.Facet('compound:N',
                                       header=alt.Header(labelFontSize=15)))

# Fitted line from fit_df, with the same axes and facet as the points.
fit = alt.Chart(fit_df, width=350,
                height=300).mark_line(color=colors['black']).encode(
                    x=alt.X('od_600nm:Q', title='optical density [a.u.]'),
                    y=alt.Y('conc_mM:Q', title='concentration [mM]'),
                    facet=alt.Facet('compound:N',
                                    header=alt.Header(labelFontSize=15)))

# NOTE(review): both charts carry a facet encoding; some Altair versions
# refuse to layer faceted charts -- confirm this cell still runs, or layer
# first and facet the result.
points + fit
# %%

# Load the calibration data
def make_chart_topic_comparison(
    topic_mix,
    arxiv_cat_lookup,
    comparison_ids,
    selected_categories,
    comparison_names,
    topic_list,
    topic_category_map,
    highlights=False,
    highlight_topics=None,
    highlight_class_table="Company",
    save=True,
    fig_num=15,
):
    """Creates a chart that compares the topic specialisations
    of different groups of organisations
    Args:
        topic_mix: topic mix
        arxiv_cat_lookup: lookup between category ids and names
        comparison_ids: ids we want to compare
        selected_categories: arXiv categories to focus on
        comparison_names: names for the categories we are comparing
        topic_list: passed through to topic_rep
        topic_category_map: lookup from topic to arXiv category
        highlights: if we want to highlight particular topics
        highlight_topics: which ones
        highlight_class_table: topics to highlight in the table
        save: the figure is saved only when this is exactly True
        fig_num: figure number for the saved file when highlights is False

    Returns:
        The faceted chart when highlights is False; otherwise a tuple of
        the faceted chart and the filtered comparison table.
    """

    # Extract the representations of categories
    comp_topic_rel = pd.DataFrame([
        topic_rep(
            ids,
            topic_mix,
            selected_categories,
            topic_list=topic_list,
            topic_category_map=topic_category_map,
        )[1].loc[True] for ids in comparison_ids
    ])
    comparison_df = comp_topic_rel.T
    comparison_df.columns = comparison_names

    comparison_df_long = comparison_df.reset_index(drop=False).melt(
        id_vars="index")
    comparison_df_long["cat"] = comparison_df_long["index"].map(
        topic_category_map)

    # Topic order on the x axis: by category, then by descending total share
    order = (comparison_df_long.groupby(
        ["index", "cat"])["value"].sum().reset_index(drop=False).sort_values(
            by=["cat", "value"], ascending=[True, False])["index"].tolist())

    # Copy so the columns added below are written to an independent frame,
    # not to a slice of comparison_df_long (avoids SettingWithCopyWarning).
    comparison_df_filter = comparison_df_long.loc[
        comparison_df_long["cat"].isin(selected_categories)].copy()

    comparison_df_filter["cat_clean"] = [
        arxiv_cat_lookup[x][:35] + "..." for x in comparison_df_filter["cat"]
    ]

    # Sort categories by biggest differences?
    # NOTE(review): the "company" and "academia" column names are hard-coded
    # here, so comparison_names must contain both -- confirm with callers.
    diff_comp = (comparison_df_filter.pivot_table(
        index=["index", "cat_clean"], columns="variable",
        values="value").assign(
            diff=lambda x: x["company"] - x["academia"]).reset_index(
                drop=False).groupby("cat_clean")["diff"].mean().sort_values(
                    ascending=False).index.tolist())

    # Plot
    comp_ch = (alt.Chart(comparison_df_filter).mark_point(
        filled=True, opacity=0.5, stroke="black", strokeWidth=0.5).encode(
            x=alt.X("index",
                    title="",
                    sort=order,
                    axis=alt.Axis(labels=False, ticks=False)),
            y=alt.Y("value", title=["Share of papers", "with topic"]),
            color=alt.Color("variable", title="Organisation type"),
            tooltip=["index"],
        ))

    comp_lines = (alt.Chart(comparison_df_filter).mark_line(
        strokeWidth=1, strokeDash=[1, 1], stroke="grey").encode(
            x=alt.X("index",
                    sort=order,
                    axis=alt.Axis(labels=False, ticks=False)),
            y="value",
            detail="index",
        ))

    def _facet_by_category(layered):
        # Shared layout for both return paths: wrapped facets, three per
        # row, each with its own x scale. The chart used to be built once
        # before the branch and then rebuilt and discarded in each branch.
        return (layered.properties(width=200, height=150).facet(
            alt.Facet("cat_clean", sort=diff_comp, title="arXiv category"),
            columns=3,
        ).resolve_scale(x="independent"))

    if highlights is False:
        topic_comp_type = _facet_by_category(comp_ch + comp_lines)

        if save is True:
            save_altair(topic_comp_type, f"fig_{fig_num}_topic_comp", driv)

        return topic_comp_type

    # Lookup for the selected categories: topic -> 1-based label
    code_topic_lookup = {
        v: str(n + 1)
        for n, v in enumerate(highlight_topics)
    }

    # Add a label per topic for the selected topics
    comparison_df_filter["code"] = [
        code_topic_lookup.get(x, "no_label")
        for x in comparison_df_filter["index"]
    ]

    # Need to find a way to remove the bottom one
    max_val = comparison_df_filter.groupby("index")["value"].max().to_dict()
    comparison_df_filter["max"] = comparison_df_filter["index"].map(max_val)

    # Red numeric labels placed at each highlighted topic's largest value
    comp_text = (alt.Chart(comparison_df_filter).transform_filter(
        alt.datum.code != "no_label").mark_text(
            yOffset=-10, color="red").encode(
                text=alt.Text("code"),
                x=alt.X("index",
                        sort=order,
                        axis=alt.Axis(labels=False, ticks=False)),
                y=alt.Y("max", title=""),
                detail="index",
            ))

    topic_comp_type = _facet_by_category(comp_ch + comp_lines + comp_text)

    if save is True:
        # NOTE(review): this branch saves under a fixed name and ignores
        # fig_num; left unchanged because the file name may be relied on.
        save_altair(topic_comp_type, "fig_9_topic_comp", driv)
        save_highlights_table(
            comparison_df_filter,
            highlight_topics,
            highlight_class_table,
            topic_category_map,
        )

    return topic_comp_type, comparison_df_filter
"""
US Population: Wrapped Facet
============================
This chart visualizes the age distribution of the US population over time,
using a wrapped faceting of the data by decade.
"""
# category: case studies
import altair as alt
from vega_datasets import data

source = data.population.url

# Population totals are summed per age group and shown with SI-prefixed ticks.
population_axis = alt.Y(
    'sum(people):Q',
    title='Population',
    axis=alt.Axis(format='~s'),
)
# One small panel per year, wrapped after every fifth panel.
year_panels = alt.Facet('year:O', columns=5)

age_areas = alt.Chart(source).mark_area()
age_areas = age_areas.encode(x='age:O', y=population_axis, facet=year_panels)
age_areas.properties(title='US Age Distribution By Year', width=90, height=80)
示例#28
0
)

# Dashed horizontal rule; its y position is read from the calculated field 'a'.
hline = alt.Chart().mark_rule(size=1, strokeDash=[10, 10]).encode(y=alt.Y('a:Q'))

# Layer the rule underneath the error bars, lines and points on shared data,
# and add the constant field 'a' that the rule is drawn at.
layered = alt.layer(hline, errorbars, lines, points, data=results_df)
layered = layered.transform_calculate(a="0.003")
layered = layered.properties(width=350, height=250)

# One panel per fingerprint, wrapped into two columns.
fingerprint_panels = alt.Facet(
    'Fingerprint',
    header=alt.Header(labelFontSize=15),
)
faceted = layered.facet(facet=fingerprint_panels, columns=2)

ch = faceted.configure_axis().configure_header(titleFontSize=15)

ch.save('../../figures/trainingSetSize.html')
示例#29
0
def _clean_table(df):
    """Normalise one table extracted from the report PDF.

    If the frame has no 'Total' column, its first row is treated as the real
    header: every non-NaN cell in that row replaces the matching column name
    and the row itself is dropped. Rows missing any of Total/Open/Closed are
    then discarded, the three count columns are cast to int, 'Setting' is
    stripped of surrounding whitespace and the summary row whose setting is
    'Total' is removed.

    Returns a new, independent DataFrame that is safe to add columns to.
    """
    if 'Total' not in df.columns:
        firstrow = df.iloc[0]
        # Iterate values rather than indexing with integers: positional
        # Series[int] lookups on a labelled index are deprecated in pandas.
        newcols = [
            col if isinstance(val, float) and math.isnan(val) else val
            for col, val in zip(df.columns, firstrow)
        ]
        df.columns = newcols
        df = df[1:]
    # Copy so the assignments below never write into a slice of another frame.
    df = df.dropna(axis='index', subset=['Total', 'Open', 'Closed']).copy()
    df['Setting'] = df['Setting'].str.strip()
    for col in ('Total', 'Open', 'Closed'):
        df[col] = df[col].astype(int)
    return df[df['Setting'] != 'Total'].copy()


def _chart_footer(coverage_note):
    """Build the bottom-right attribution footer shared by every plot.

    coverage_note is the middle footer line describing the reporting window.
    """
    return altair.TitleParams(
        ['Data from Public Health Agency, does not include education or home settings',
        coverage_note,
        'https://twitter.com/ni_covid19_data on %s'  %datetime.datetime.now().date().strftime('%A %-d %B %Y')],
        baseline='bottom',
        orient='bottom',
        anchor='end',
        fontWeight='normal',
        fontSize=10,
        dy=10
    )


def _chart_to_png(chart, kind, driver):
    """Render chart to an in-memory PNG via selenium.

    Returns a dict with the plot 'name' and its rewound BytesIO 'store'.
    """
    # NOTE(review): '%Y-%d-%m' is year-day-month, which looks unintentional;
    # kept unchanged so existing plot names do not change.
    plotname = 'pha-outbreaks-%s-%s.png' %(kind, datetime.datetime.now().date().strftime('%Y-%d-%m'))
    plotstore = io.BytesIO()
    chart.save(fp=plotstore, format='png', method='selenium', webdriver=driver)
    plotstore.seek(0)
    return {'name': plotname, 'store': plotstore}


def lambda_handler(event, context):
    """Process changed contact-tracing report PDFs and tweet a summary of each.

    For every change in event the report PDF is downloaded from S3, its
    reporting period and tables are extracted, the CSV datastore in S3 is
    updated for that week, three plots are rendered (when a chrome driver is
    available) and the summary is tweeted, sent as a test DM or only printed.

    Args:
        event: iterable of change dicts. Each is read for 'keyname', 'url'
            and 'filedate', plus the optional flags 'notweet' and 'testtweet'.
        context: Lambda context object; not used.

    Returns:
        A dict with statusCode 200 and a JSON body listing one message per
        processed change, or a dict with statusCode 404 as soon as a report
        is found whose reporting period cannot be located.
    """
    # Get the secret
    sm = boto3.client('secretsmanager')
    secretobj = sm.get_secret_value(SecretId='ni-covid-tweets')
    secret = json.loads(secretobj['SecretString'])

    s3 = boto3.client('s3')

    # Matches e.g. "1st March 2021 – 28th March 2021"; compiled once for all reports
    regex = re.compile(r'(\d{1,2})(?:st|nd|rd|th)\s+([A-Z][a-z]+)\s+(\d{4})\s+\–+\s+(\d{1,2})(?:st|nd|rd|th)\s+([A-Z][a-z]+)\s+(\d{4})')

    messages = []
    # Download the most recently updated PDF file
    for change in event:
        start_date = None
        end_date = None
        # The context manager guarantees the temporary PDF is removed, including on early return
        with tempfile.NamedTemporaryFile(suffix='.pdf') as tmp:
            with open(tmp.name, 'wb') as fp:
                s3.download_fileobj(secret['bucketname'],change['keyname'],fp)
            # Get the date range covered by the report
            text = textract.process(tmp.name, method='pdfminer').decode('utf-8')
            for line in text.split('\n'):
                m = regex.search(line)
                if m:
                    start_date = pandas.to_datetime(' '.join(m.group(1, 2, 3)), format='%d %B %Y').date()
                    end_date = pandas.to_datetime(' '.join(m.group(4, 5, 6)), format='%d %B %Y').date()
                    break
            if start_date is None:
                logging.error('Unable to find start date in report')
                return {
                    "statusCode": 404,
                    "body": 'Unable to find start date in report %s' %change['url'],
                }
            # Get the tables from the report - note that it was not possible to get data from 4th April or earlier due to
            # tables that will not parse properly in the PDF
            tables = tabula.read_pdf(tmp.name, pages = "all", multiple_tables = True)

        frames = []
        for tablecount, raw_table in enumerate(tables):
            df = _clean_table(raw_table)
            if tablecount == 0:
                df['Type'] = 'Probable Outbreak'
            elif tablecount == 1:
                df['Type'] = 'Cluster'
            else:
                # Still included in the dataset below, just without a Type
                logging.warning('Unexpected table: %s', df)
            frames.append(df)
        # Concatenate once instead of growing a frame from an empty seed
        dataset = pandas.concat(frames) if frames else pandas.DataFrame()
        dataset['Start Date'] = pandas.to_datetime(start_date)
        dataset['End Date'] = pandas.to_datetime(end_date)
        week = int((end_date - pandas.to_datetime('1 January 2020', format='%d %B %Y').date()).days / 7)
        dataset['Week'] = week

        # Create a simple summary and the tweet text
        # Only the count columns are summed: the frame also holds datetime
        # columns, which recent pandas versions refuse to sum.
        summary = dataset.groupby('Type')[['Total', 'Open', 'Closed']].sum()
        period_start = start_date.strftime('%-d %B %Y')
        period_end = end_date.strftime('%-d %B %Y')
        tweet = 'NI Contact Tracing reports from %s to %s:\n' %(period_start, period_end)
        for report_type, counts in summary.to_dict('index').items():
            tweet += '\u2022 %d %ss (%d open, %d closed)\n' %(counts['Total'], report_type.lower(), counts['Open'], counts['Closed'])
        tweet += '\n%s' %change['url']

        # Pull current data from s3
        try:
            obj = s3.get_object(Bucket=secret['bucketname'],Key=secret['pha-clusters-datastore'])['Body']
        except s3.exceptions.NoSuchKey:
            print("The object %s does not exist in bucket %s." %(secret['pha-clusters-datastore'], secret['bucketname']))
            datastore = pandas.DataFrame(columns=['Week'])
        else:
            stream = io.BytesIO(obj.read())
            datastore = pandas.read_csv(stream)
        # Clean out any data with matching dates
        datastore = datastore[datastore['Week'] != week]
        # Append the new data
        datastore = pandas.concat([datastore, dataset])
        datastore['Start Date'] = pandas.to_datetime(datastore['Start Date'])
        datastore['End Date'] = pandas.to_datetime(datastore['End Date'])
        # Replace any known duplicates
        datastore['Setting'] = datastore['Setting'].replace({
            'Cinema/ Theatre / Entertainment': 'Cinema / Theatre / Entertainment Venue',
            'Cinema/ Theatre / Entertainment Venue': 'Cinema / Theatre / Entertainment Venue',
            'Funeral / Wakes': 'Funeral / Wake',
            'Restaurant / Cafe': 'Restaurant / Café'
        })
        # Push the data to s3
        stream = io.BytesIO()
        datastore.to_csv(stream, index=False)
        stream.seek(0)
        s3.upload_fileobj(stream, secret['bucketname'], secret['pha-clusters-datastore'])

        # Set up chromedriver so we can save altair plots
        driver = get_chrome_driver()
        plots = []
        if driver is None:
            logging.error('Failed to start chrome')
        else:
            # Plot 1: this report's totals per setting, stacked by type
            p = altair.vconcat(
                altair.Chart(
                    dataset
                ).mark_bar().encode(
                    x = altair.X('Total:Q', axis=altair.Axis(title='Total reported')),
                    y = altair.Y('Setting:O'),
                    color='Type',
                    order=altair.Order(
                        'Type',
                        sort='ascending'
                    ),
                ).properties(
                    height=450,
                    width=800,
                    title='NI COVID-19 Contact Tracing reports from %s to %s' %(period_start, period_end)
                ),
            ).properties(
                title=_chart_footer('Covers the preceding four weeks'),
            )
            plots.append(_chart_to_png(p, 'week', driver))

            # The two history plots share the full span of the datastore in their titles
            history_start = datastore['Start Date'].min().strftime('%-d %B %Y')
            history_end = datastore['End Date'].max().strftime('%-d %B %Y')

            # Plot 2: totals over time, stacked by type
            p = altair.vconcat(
                altair.Chart(
                    datastore.groupby(['End Date','Type'])['Total'].sum().reset_index()
                ).mark_area().encode(
                    x = altair.X('End Date:T', axis=altair.Axis(title='Date reported (for preceding four weeks)')),
                    y = altair.Y('Total:Q', axis=altair.Axis(title='Total reported', orient="right")),
                    color='Type',
                    order=altair.Order(
                        'Type',
                        sort='ascending'
                    ),
                ).properties(
                    height=450,
                    width=800,
                    title='NI COVID-19 Contact Tracing reports from %s to %s' %(history_start, history_end)
                ),
            ).properties(
                title=_chart_footer('Reported weekly for the preceding four weeks'),
            )
            plots.append(_chart_to_png(p, 'time', driver))

            # Plot 3: small multiples of totals over time, one panel per setting
            p = altair.vconcat(
                altair.Chart(
                    datastore.groupby(['End Date','Setting','Type'])['Total'].sum().reset_index()
                ).mark_area().encode(
                    x = altair.X('End Date:T', axis=altair.Axis(title='')),
                    y = altair.Y('Total:Q', axis=altair.Axis(title='', orient="right")),
                    color='Type',
                    facet=altair.Facet('Setting:O', columns=5, title=None, spacing=0),
                    order=altair.Order(
                        'Type',
                        sort='ascending'
                    ),
                ).properties(
                    height=90,
                    width=160,
                    title=altair.TitleParams(
                        'NI COVID-19 Contact Tracing reports by setting from %s to %s' %(history_start, history_end),
                        anchor='middle',
                    ),
                ),
            ).properties(
                title=_chart_footer('Reported weekly for the preceding four weeks'),
            )
            plots.append(_chart_to_png(p, 'small', driver))

        # Convert to dates to ensure correct output to CSV
        # NOTE(review): the CSV has already been uploaded above, so this only
        # changes the in-memory frame - confirm whether it should move earlier.
        datastore['Start Date'] = datastore['Start Date'].dt.date
        datastore['End Date'] = datastore['End Date'].dt.date

        # Tweet out the text and images
        if change.get('notweet') is not True:
            api = TwitterAPI(secret['twitter_apikey'], secret['twitter_apisecretkey'], secret['twitter_accesstoken'], secret['twitter_accesstokensecret'])
            upload_ids = api.upload_multiple(plots)
            if change.get('testtweet') is True:
                if len(upload_ids) > 0:
                    # First DM carries the tweet text; up to two more carry the remaining plots
                    resp = api.dm(secret['twitter_dmaccount'], tweet, upload_ids[0])
                    for n, media_id in enumerate(upload_ids[1:3], start=1):
                        resp = api.dm(secret['twitter_dmaccount'], 'Test %d' %n, media_id)
                else:
                    resp = api.dm(secret['twitter_dmaccount'], tweet)
                messages.append('Tweeted DM ID %s' %(resp.id))
            else:
                if len(upload_ids) > 0:
                    resp = api.tweet(tweet, media_ids=upload_ids)
                else:
                    resp = api.tweet(tweet)
                # Download and update the index
                status = S3_scraper_index(s3, secret['bucketname'], secret['pha-clusters-index'])
                index = status.get_dict()
                # Accessed by position because the container type returned by
                # get_dict() is not visible from this function
                for i in range(len(index)):
                    if index[i]['filedate'] == change['filedate']:
                        index[i]['tweet'] = resp.id
                        break
                status.put_dict(index)
                messages.append('Tweeted ID %s and updated %s' %(resp.id, secret['pha-clusters-index']))
        else:
            print(tweet)
            messages.append('Did not tweet')

    return {
        "statusCode": 200,
        "body": json.dumps({
            "message": messages,
        }),
    }
示例#30
0
"""
US Income by State: Wrapped Facet
---------------------------------
This example shows how to create a map of income in the US by state,
faceted over income brackets.
"""
# category: maps

import altair as alt
from vega_datasets import data

# Income data, plus the US state shapes it is joined to by the lookup below.
source = data.income.url
states = alt.topo_feature(data.us_10m.url, 'states')

# Each record picks up its state's geometry through the shared 'id' key.
state_shapes = alt.LookupData(data=states, key='id')

income_map = alt.Chart(source).mark_geoshape()
income_map = income_map.encode(
    shape='geo:G',
    color='pct:Q',
    tooltip=['name:N', 'pct:Q'],
    facet=alt.Facet('group:N', columns=2),
)
income_map = income_map.transform_lookup(
    lookup='id',
    from_=state_shapes,
    as_='geo',
)
income_map = income_map.properties(width=300, height=175)
income_map.project(type='albersUsa')