def _init_glyph(self, plot, mapping, properties):
    """
    Create a Bokeh ``Slope`` annotation and attach it to ``plot``.

    The annotation is built from the keyword arguments in ``mapping``; its
    render level is taken from ``properties['level']`` and falls back to
    ``'glyph'`` when that key is absent.

    Returns a ``(None, slope)`` tuple: no renderer is produced, only the
    annotation that was added to the plot layout.
    """
    render_level = properties.get('level', 'glyph')
    annotation = Slope(level=render_level, **mapping)
    plot.add_layout(annotation)
    return None, annotation
def plot_observed_vs_expected_correlations(self, value, counts):
    """Plot log10 observed counts against log10 expected counts.

    The expected counts come from ``counts.to_dict()['expected_count']``;
    the observed count for each name is looked up in
    ``value['observed_references']`` (0 when the name is missing).  Both
    are transformed with ``log10(count + 1)``, a linear regression of
    observed on expected is fitted, and the fitted line is overlaid on
    the scatter plot.  The Spearman/Pearson values stored in ``value``
    are shown as the plot title text.

    Returns the styled plot.
    """
    def log10_plus_one(column):
        # The +1 offset keeps zero counts finite.
        return [math.log(count + 1, 10) for count in column]

    expected_by_name = counts.to_dict()['expected_count']
    records = [
        (name, value['observed_references'].get(name, 0), expected)
        for name, expected in expected_by_name.items()
    ]
    data = pd.DataFrame(records, columns=['Name', 'Observed', 'Expected'])
    data['log_obs'] = log10_plus_one(data['Observed'])
    data['log_exp'] = log10_plus_one(data['Expected'])

    # scikit-learn expects a 2-D feature matrix, hence the column reshape.
    features = np.array(data['log_exp'].values).reshape(-1, 1)
    model = LinearRegression().fit(features, data['log_obs'])
    regression_line = Slope(
        gradient=model.coef_[0],
        y_intercept=model.intercept_,
        line_color=Colors.light_cornflower_blue,
    )

    plot = points.points(
        [data['log_exp'].tolist()],
        [data['log_obs'].tolist()],
        height=350,
        tools="",
        toolbar_location=None,
        output_backend="webgl",
        colors=[Colors.light_cornflower_blue],
        x_axis_label='log10(Expected Count)',
        y_axis_label='log10(Observed Count)',
        title='Expected vs Observed',
    )

    template = (
        "Spearmans: {:.2f}, (p = {:.2f}) "
        "Pearsons: {:.2f}, (p = {:.2f})"
    )
    corrs = template.format(
        value['spearmans_rho'],
        value['spearmans_rho_pval'],
        value['pearson'],
        value['pearson_pval'],
    )

    plot.add_layout(regression_line)
    self.add_plot_title(plot, {}, corrs)
    self.style_plot(plot)
    return plot
def test_Slope() -> None:
    """Verify the defaults and the declared properties of ``Slope``."""
    annotation = Slope()

    # A freshly constructed slope has no line parameters set.
    assert annotation.gradient is None
    assert annotation.y_intercept is None

    # Range names and render level fall back to their defaults.
    assert annotation.x_range_name == 'default'
    assert annotation.y_range_name == 'default'
    assert annotation.level == 'annotation'

    check_line_properties(annotation, "", 'black', 1.0)

    slope_specific = ["gradient", "y_intercept"]
    check_properties_existence(annotation, ANNOTATION + slope_specific, LINE)
def joins_sides(stats, colorblind=False):
    """
    Shows the join distribution using a scatter plot.

    Every join operator is drawn as one point. The x coordinate is the
    amount of data read from the right-side table and the y coordinate is
    the amount read from the left-side table. Replicated joins use a
    different color (and marker) than Partitioned joins.

    For optimal performance, keep the right-side smaller than the
    left-side (the points should be above the y=x line). Replicated joins
    should be used as long as the right-side table is not too large, to
    prevent out-of-memory errors. If you are using CBO, ensure the correct
    statistics are estimated for all tables being joined using the ANALYZE
    command.

    Optimization Tips -
    1. Queries to the left of the black dashed line and above the orange
       dashed line should all use the REPLICATED join distribution type.
    2. Queries to the right of the orange dashed line perform joins with
       an incorrect table order. Ensure statistics are used, or rewrite
       the queries to flip the table sides, to boost performance and save
       cluster resources.

    Returns the figure, or ``None`` when ``stats`` contains no joins.
    """
    joins = list(iter_joins(stats))
    if not joins:
        return

    p = figure(
        title="Joins distribution",
        x_axis_label="Right-side data read [bytes]",
        x_axis_type="log",
        y_axis_label="Left-side data read [bytes]",
        y_axis_type="log",
        sizing_mode="scale_width",
        tools=TOOLS,
    )

    # Transpose the (stat, node, probe, build) tuples into per-field columns.
    stat_items, node_items, probe_items, build_items = zip(*joins)
    distributions = [node["distributionType"] for node in node_items]
    data = {
        "x": [build["input_size"] for build in build_items],  # right-side
        "y": [probe["input_size"] for probe in probe_items],  # left-side
        "dist": distributions,
        "copy_on_tap": [stat["query_id"] for stat in stat_items],
    }

    shape_size = _get_size(colorblind)
    color_map = {"PARTITIONED": "red", "REPLICATED": "blue"}
    marker_map = {"PARTITIONED": "circle", "REPLICATED": "square"}
    data["color"] = [color_map[dist] for dist in distributions]
    data["marker"] = [marker_map[dist] for dist in distributions]

    source = ColumnDataSource(data)
    p.scatter(
        "x",
        "y",
        marker="marker",
        color="color",
        legend_group="dist",
        alpha=0.5,
        size=shape_size,
        source=source,
    )
    p.select(type=TapTool).callback = CustomJS(args=dict(source=source), code=COPY_JS)
    p.legend.title = "Join distribution"

    decade_ticks = [1, 1e3, 1e6, 1e9, 1e12]
    p.xaxis.ticker = list(decade_ticks)
    p.yaxis.ticker = list(decade_ticks)

    add_constant_line(p, 'height', 1e6)
    # y = x reference line: points above it read less data on the right side.
    identity = Slope(
        gradient=1,
        y_intercept=0,
        line_color='orange',
        line_dash='dashed',
        line_width=2,
    )
    p.add_layout(identity)
    return p
def plot_games(data, model, features, **plot_kwargs):
    """Scatter two features of ``data`` and overlay the model's decision line.

    ``features[0]`` is plotted on x and ``features[1]`` (with normal
    jitter) on y.  Points are colored by ``data.ksdj`` and drawn as
    circles where ``model.predict`` agrees with ``data.ksdj`` and as
    squares where it does not.  The dashed line is derived from the first
    row of ``model.coef_`` and the first entry of ``model.intercept_``.

    Note that ``data`` is modified in place: ``color`` and ``marker``
    columns are added.  Extra keyword arguments are forwarded to
    ``figure``; axis labels, tools and tooltips are only defaulted when
    the caller did not supply them.
    """
    x_feature = features[0]
    y_feature = features[1]

    fallback_kwargs = {
        "x_axis_label": x_feature,
        "y_axis_label": y_feature,
        "tools": TOOLS,
        "tooltips": [
            ("name", "@name"),
            ("year", "@year"),
            ("complexity", "@complexity"),
            ("time", "@min_time–@max_time minutes"),
            ("age", "@min_age+"),
        ],
    }
    for option, fallback in fallback_kwargs.items():
        plot_kwargs.setdefault(option, fallback)

    plot = figure(**plot_kwargs)

    data["color"] = ["#193F4A" if flag else "#E30613" for flag in data.ksdj]
    prediction_matches = model.predict(data[features]) == data.ksdj
    data["marker"] = np.where(prediction_matches, "circle", "square")

    plot.scatter(
        source=data,
        x=x_feature,
        y=jitter(y_feature, width=0.25, distribution="normal"),
        color="color",
        marker="marker",
        size=8,
    )

    # Decision boundary w1*x + w2*y + b = 0, rewritten as y = -(w1/w2)*x - b/w2.
    w1 = model.coef_[0, 0]
    w2 = model.coef_[0, 1]
    b = model.intercept_[0]
    boundary = Slope(
        gradient=-w1 / w2,
        y_intercept=-b / w2,
        line_color="black",
        line_dash="dashed",
        line_width=2,
    )
    plot.add_layout(boundary)
    return plot
def figures_chisq_detailed(init_group, df_chisq):
    """Build the detailed chi-squared dashboard grid.

    The first three rows of the grid are time-series figures filtered to a
    single ``CountryProv`` group (initially ``init_group``); they are
    followed by one scatter plot per ``Continent`` showing the latest row
    of every ``CountryProv``, laid out three per row.

    Parameters
    ----------
    init_group :
        Value of the ``CountryProv`` column that the time-series view is
        initially filtered on.
    df_chisq : pandas.DataFrame
        Input data.  ``Date`` must be a datetime column (``.dt`` is used).
        The frame is modified in place: a ``dt_str`` column is added.

    Returns
    -------
    tuple
        ``(source_hist, c_a1a, g)``: the ``ColumnDataSource`` built from
        ``df_chisq``, the renderer of the daily-confirmed circles in the
        first figure, and the assembled grid plot.
    """
    from bokeh.models import LabelSet

    df_chisq["dt_str"] = df_chisq.Date.dt.strftime("%Y-%m-%d")
    df_latest = (
        df_chisq.groupby("CountryProv")
        .apply(lambda g: g.tail(1))
        .reset_index(drop=True)
    )
    df_latest["color"] = "#73b2ff"

    source_hist = ColumnDataSource(df_chisq)

    # since cannot use View with LabelSet, creating a different source per continent
    srcLatest_continent = df_latest.groupby("Continent").apply(lambda g: ColumnDataSource(g))
    srcLatest_continent = srcLatest_continent.reset_index().rename(columns={0: "src"})

    gf = GroupFilter(column_name='CountryProv', group=init_group)
    view1 = CDSView(source=source_hist, filters=[gf])

    plot_size_and_tools = {
        'plot_height': 300,
        'plot_width': 600,
        'tools': ['box_select', 'reset', 'help', 'box_zoom'],
        'x_axis_type': 'datetime',
        'tooltips': [
            ("Date", "@dt_str"),
        ],
    }

    # FIXME couldnt do p_a1.line below, so using hack of varea
    p_a1 = figure(
        title="Confirmed and thresholds. Below threshold: good, above: bad, within: ok",
        **plot_size_and_tools
    )
    p_a1.varea(x='Date', y1='case_ma07_lower', y2='case_ma07_upper', source=source_hist,
               color='pink', view=view1, fill_alpha=.7, legend_label="mean +/- std band")
    p_a1.varea(x='Date', y1='case_ma07', y2='case_ma07_eps', source=source_hist,
               color='red', view=view1, legend_label="7-day moving avg")
    p_a1.varea(x='Date', y1='threshold_min_eps', y2='threshold_max_eps', source=source_hist,
               color='green', view=view1, fill_alpha=.7,
               legend_label="chi-squared thresholds band")
    # band: view= is not supported by Band, so the varea calls above are used instead
    c_a1a = p_a1.circle(x='Date', y='daily_conf', source=source_hist, color='black',
                        view=view1)

    # https://stackoverflow.com/a/51540955/4126114
    # https://docs.bokeh.org/en/latest/docs/user_guide/styling.html#inside-the-plot-area
    p_a1.legend.label_text_font_size = '6pt'
    p_a1.legend.location = "top_left"

    p_a2 = figure(title="Total tests (daily vs 7-day moving avg)", **plot_size_and_tools)
    p_a2.varea(x='Date', y1='tests_ma07_lower', y2='tests_ma07_upper', source=source_hist,
               color='pink', view=view1)
    p_a2.varea(x='Date', y1='tests_ma07', y2="tests_ma07_eps", source=source_hist,
               color='red', view=view1)
    p_a2.circle(x='Date', y='daily_tests', source=source_hist, color='black', view=view1)
    # lock in the x axis so that zoom works simultaneously on all
    p_a2.x_range = p_a1.x_range

    p_b1 = figure(title="Detrended cases. Negative: good, positive: bad", **plot_size_and_tools)
    p_b1.varea(x='Date', y1='thresMinMinusMid', y2='thresMaxMinusMid', source=source_hist,
               color='green', view=view1, legend_label="thresholds band", fill_alpha=0.7)
    p_b1.varea(x='Date', y1='caseMa07Lower_minusMid', y2='caseMa07Upper_minusMid',
               source=source_hist, color='pink', view=view1,
               legend_label="cases ma7 - threshold mid +/- std", fill_alpha=0.7)
    p_b1.varea(x='Date', y1='case_detrended', y2='caseDet_eps', source=source_hist,
               color='red', view=view1, legend_label="cases detrended")
    p_b1.circle(x='Date', y='case_detrended', source=source_hist, color='red', view=view1)
    p_b1.x_range = p_a1.x_range
    p_b1.legend.label_text_font_size = '6pt'
    p_b1.legend.location = "top_left"

    p_b2 = figure(title="Detrended cases percentage of raw cases", **plot_size_and_tools)
    p_b2.circle(x='Date', y='caseDet_pct', source=source_hist, color='red', view=view1)
    p_b2.x_range = p_a1.x_range

    p_c1 = figure(title="Ratio case/total (daily)", **plot_size_and_tools)
    p_c1.circle(x='Date', y='ratio_daily', source=source_hist, color='blue', view=view1)

    p_c2 = figure(title="Ratio case/total (7-day ma)", **plot_size_and_tools)
    p_c2.circle(x='Date', y='ratio_ma07', source=source_hist, color='blue', view=view1)

    # general-use lines
    slope_y0 = Slope(gradient=0, y_intercept=0, line_color='orange', line_width=50)
    # np.inf rather than np.Inf: the capitalised alias was removed in NumPy 2.0.
    slope_x0 = Slope(gradient=np.inf, y_intercept=0, line_color='orange', line_width=50)

    # scatter plots, one per continent
    TOOLTIPS = [
        ("Country/Region", "@CountryProv"),
    ]
    p_cont = []
    for _, continent_row in srcLatest_continent.iterrows():
        p_d1 = figure(plot_width=600, plot_height=400, tooltips=TOOLTIPS,
                      title=continent_row.Continent)
        p_d1.scatter('case_detrended', 'case_det_diff07', source=continent_row.src,
                     size=12, color='color')
        p_d1.xaxis.axis_label = 'Cases detrended: values'
        p_d1.yaxis.axis_label = 'Cases detrended: diff07'

        labels = LabelSet(x='case_detrended', y='case_det_diff07', text='cp_code',
                          level='glyph', x_offset=5, y_offset=5,
                          source=continent_row.src, render_mode='canvas')
        p_d1.add_layout(labels)
        p_d1.add_layout(slope_y0)
        p_d1.add_layout(slope_x0)
        p_cont.append(p_d1)

    # group plots into 3 per row (the last row may be shorter)
    per_row = 3
    p_cont = [p_cont[start:start + per_row] for start in range(0, len(p_cont), per_row)]

    g = gridplot([[p_a1, p_a2], [p_b1, p_b2], [p_c1, p_c2]] + p_cont)
    return source_hist, c_a1a, g
y_axis_label='Winning Percentage', y_range=(0.25, 0.725), title='Winning Percentage vs. Run Differential', tools='hover', tooltips=tooltip, toolbar_location=None) winrun_fig.circle(x='RunDiff', y='WinPCT', radius=2, alpha=0.5, color='#d71d1d', source=rt_cds) bf_line = Slope(gradient=coeffs[-2], y_intercept=coeffs[-1], line_color='#400987', line_width=2) winrun_fig.add_layout(bf_line) error_fig = figure(x_axis_label='Run Differential', x_range=(-350, 310), y_axis_label='Residual', y_range=(-0.075, 0.09), title='Residual Errors in Linear Regression', tools='hover', tooltips=tooltip, toolbar_location=None) error_fig.circle(x='RunDiff', y='Error', radius=2,
def scatter(x, y, label=None, group=None, title="Scatter Plot", xlabel="x", ylabel="y", width=600, height=600, legend=True, size=4, shape="circle", font_size="16pt", label_font_size="13pt", col_palette=None, hover_xy=True, gradient=False, hline=False, vline=False, xrange=None, yrange=None):
    """Creates a scatterplot using Bokeh.

    Required Parameters
    -------------------
    x : array-like, shape = [n_samples]
        Input data for x-axis.

    y : array-like, shape = [n_samples]
        Input data for y-axis.

    Other Parameters
    ----------------
    label : object with ``copy()`` and ``to_dict("series")`` (or ``name`` and
        ``values``), optional
        Extra per-point columns shown in the hover tool. When ``None`` an
        ``Idx`` column holding the input order is used.

    group : array-like, optional
        Per-point group values; each unique value gets its own palette colour.

    shape : {"circle", "triangle"}
        Marker used for the points.

    gradient : number or False
        When not ``False``, draws a line through the origin with this gradient
        plus a dashed perpendicular line through the origin.

    hline, vline : bool
        When not ``False``, draws a horizontal / vertical line at 0.

    Returns
    -------
    fig : the Bokeh figure.

    Raises
    ------
    ValueError
        If ``x`` and ``y`` differ in length, or ``shape`` is not supported.
    """
    # Error check
    if len(x) != len(y):
        raise ValueError("length of X does not match length of Y.")

    # If label is None, give an index based on input order
    if label is None:
        label_copy = {"Idx": list(range(len(x)))}
    else:
        # Work on a copy so the caller's label object is never modified.
        label2 = label.copy()
        try:
            label_copy = label2.to_dict("series")
        except TypeError:
            # Fallback for objects whose to_dict rejects "series":
            # use the single named column.
            label_copy = {label2.name: label2.values.tolist()}

    # If colour palette is None (default):
    if col_palette is None:
        col_palette = [
            "red", "blue", "green", "orange", "blueviolet", "gold",
            "peru", "pink", "darkblue", "olive", "teal", "slategray"
        ]

    # Group is None or allow for multiple classes
    if group is None:
        group_copy = [None] * len(x)
        col = [col_palette[2]] * len(x)
    else:
        group_copy = group.copy()
        group_unique = np.sort(np.unique(group_copy))
        col = []
        for i in range(len(group_copy)):
            value = group_copy[i]
            for j, candidate in enumerate(group_unique):
                if value == candidate:
                    col.append(col_palette[j])

    # Bokeh data source with data labels
    data = {"x": x, "y": y, "group": group_copy, "col": col}
    data_label = dict(label_copy)
    data.update(data_label)
    source = ColumnDataSource(data=data)

    # Tool-tip (add everything in label_copy)
    TOOLTIPS = []
    if hover_xy is True:
        TOOLTIPS = [("x", "@x{1.111}"), ("y", "@y{1.111}")]
    for name in data_label:
        TOOLTIPS.append((str(name), "@" + str(name)))

    # Base figure
    fig = figure(title=title, x_axis_label=xlabel, y_axis_label=ylabel,
                 plot_width=width, plot_height=height,
                 x_range=xrange, y_range=yrange)

    # Add to plot. Compare by value: identity ("is") on string literals only
    # works by accident of interning and is a SyntaxWarning on Python 3.8+.
    if shape == "circle":
        renderer = fig.circle("x", "y", size=size, alpha=0.6, color="col",
                              legend="group", source=source)
    elif shape == "triangle":
        renderer = fig.triangle("x", "y", size=size, alpha=0.6, color="col",
                                legend="group", source=source)
    else:
        raise ValueError("shape has to be either 'circle' or 'triangle'.")

    shape_hover = HoverTool(renderers=[renderer], tooltips=TOOLTIPS)
    fig.add_tools(shape_hover)

    if gradient is not False:
        slope = Slope(gradient=gradient, y_intercept=0, line_color="black",
                      line_width=2, line_alpha=0.3)
        fig.add_layout(slope)

        if gradient == 0:
            # The perpendicular of a horizontal line is vertical; -(1 / 0)
            # would raise ZeroDivisionError, so draw it as a Span instead.
            perpendicular = Span(location=0, dimension="height",
                                 line_color="black", line_dash="dashed",
                                 line_width=2, line_alpha=0.10)
        else:
            perpendicular = Slope(gradient=-(1 / gradient), y_intercept=0,
                                  line_color="black", line_dash="dashed",
                                  line_width=2, line_alpha=0.10)
        fig.add_layout(perpendicular)

    if hline is not False:
        h = Span(location=0, dimension="width", line_color="black",
                 line_width=3, line_alpha=0.15)
        fig.add_layout(h)

    if vline is not False:
        v = Span(location=0, dimension="height", line_color="black",
                 line_width=3, line_alpha=0.15)
        fig.add_layout(v)

    # Font-sizes
    fig.title.text_font_size = font_size
    fig.xaxis.axis_label_text_font_size = label_font_size
    fig.yaxis.axis_label_text_font_size = label_font_size

    # Extra padding
    fig.min_border_left = 20
    fig.min_border_right = 20
    fig.min_border_top = 20
    fig.min_border_bottom = 20

    # Remove legend
    if legend is False:
        fig.legend.visible = False

    return fig
def scatterplot_comparison(
    controls_df: pd.DataFrame,
    result_df: pd.DataFrame,
    data_label: str,
    *,
    ref_label: Union[str, List[str]] = None,
    category_labels: Dict = None,
    controls_name: str = 'controls',
    result_name: str = 'model',
    size: float = 7.5,
    fill_alpha: float = 0.2,
    facet_col: str = None,
    facet_col_wrap: int = 2,
    facet_sort_order: bool = True,
    facet_sync_axes: str = 'both',
    hover_col: Union[str, List[str]] = None,
    glyph_col: str = None,
    glyph_legend: bool = True,
    glyph_legend_location: str = 'bottom_right',
    glyph_legend_label_text_font_size: str = '11px',
    figure_title: str = None,
    plot_height: int = None,
    identity_line: bool = True,
    identity_colour: str = 'red',
    identity_width: int = 2,
    color_palette: Dict[int, Any] = Category20,
    calc_pct_diff: bool = True,
    totals_in_titles: bool = True,
    filter_zero_rows: bool = True
) -> Tuple[pd.DataFrame, Union[Column, Figure, GridBox]]:
    """Creates an interactive Bokeh-based scatter plot to compare data.

    Args:
        controls_df (pd.DataFrame): A DataFrame containing control values. Must be in wide-format where rows
            represent a reference (e.g. count station, TAZ, geography, etc.) and columns represent the data
            categories.
        result_df (pd.DataFrame): A DataFrame containing modelled values. Uses the same format as `controls_df`.
        data_label (str): The name to use for the data represented by the `controls_df` and `result_df` columns.
        ref_label (Union[str, List[str]], optional): Defaults to ``None``. The name(s) corresponding to the
            ``controls_df`` and ``result_df`` indices. The function will try to infer the name(s) from indices of
            the source DataFrames. If the indices of the DataFrames are not set, then values must be set for this
            parameter, otherwise an error will be raised. If providing a value to this parameter and the indices
            of the source DataFrames are MultiIndex objects, then the provided value must be a list of strings.
        category_labels (Dict, optional): Defaults to ``None``. Category labels used to rename the `controls_df`
            and `result_df` columns.
        controls_name (str, optional): Defaults to ``'controls'``. The name for the controls.
        result_name (str, optional): Defaults to ``'model'``. The name for the results.
        size (float, optional): Defaults to ``7.5``. The size of the scatter plot points.
        fill_alpha (float, optional): Defaults to ``0.2``. The opacity of the point fill.
        facet_col (str, optional): Defaults to ``None``. The name of the column to use for creating a facet plot.
        facet_col_wrap (int, optional): Defaults to ``2``. The number of columns to wrap subplots in the facet
            plot.
        facet_sort_order (bool, optional): Defaults to ``True``. A flag to render facet subplots in ascending
            order sorted by unique ``facet_col`` values.
        facet_sync_axes (str, optional): Defaults to ``'both'``. Option to sync/link facet axes. Accepts one of
            ``['both', 'x', 'y']``. Set to None to disable linked facet plot axes.
        hover_col (Union[str, List[str]], optional): Defaults to ``None``. The column names to display in the
            plot tooltips.
        glyph_col (str, optional): Defaults to ``None``. The name of the column to use for glyph coloring. A
            standard color palette will be mapped to unique ``glyph_col`` values.
        glyph_legend (bool, optional): Defaults to ``True``. A flag to enable/disable the legend if ``glyph_col``
            is set. The legend will be included in each plot/facet subplot.
        glyph_legend_location (str, optional): Defaults to ``'bottom_right'``. The location of the glyph legend
            in each plot/facet subplot. Please refer to the Bokeh ``Legend`` documentation for acceptable values.
        glyph_legend_label_text_font_size (str, optional): Defaults to ``'11px'``. The text size of the legend
            labels.
        figure_title (str, optional): Defaults to ``None``. The chart title to use.
        plot_height (int, optional): Defaults to ``None``. The desired plot height. For facet plots, this value
            will be set for each subplot.
        identity_line (bool, optional): Defaults to ``True``. A flag to include an identity (1:1) line in the
            scatter plot.
        identity_colour (str, optional): Defaults to ``'red'``. The colour to use for the identity line. Accepts
            html colour names.
        identity_width (int, optional): Defaults to ``2``. The line width to use for the identity line.
        color_palette (Dict[int, Any], optional): Defaults to ``Category20``. The Bokeh color palette to use. It
            is indexed by the number of unique ``glyph_col`` values (at least 3).
        calc_pct_diff (bool, optional): Defaults to ``True``. Include percent difference calculation in DataFrame
            output.
        totals_in_titles (bool, optional): Defaults to ``True``. Include the control and result totals in plot
            title.
        filter_zero_rows (bool, optional): Defaults to ``True``. Filter out comparisons where controls and
            results are both zeros.

    Returns:
        Tuple[pd.DataFrame, Union[Column, Figure, GridBox]]: The long-format comparison table and the figure.

    Raises:
        RuntimeError: If ``ref_label`` or ``hover_col`` is neither hashable nor a list.
        AssertionError: If ``ref_label`` is ``None`` and the index names cannot be inferred.
    """
    if not controls_df.index.equals(result_df.index):
        warnings.warn(
            'Indices for `controls_df` and `result_df` are not identical; function may not produce desired '
            'results')
    if not controls_df.columns.equals(result_df.columns):
        warnings.warn(
            'Columns for `controls_df` and `result_df` are not identical; function may not produce desired '
            'results')

    if ref_label is None:
        assert np.all(controls_df.index.names == result_df.index.names), \
            'Unable to resolve different index names, please specify values for `ref_label` instead'
        assert None not in controls_df.index.names, 'Some index levels in `controls_df` do not have names'
        assert None not in result_df.index.names, 'Some index levels in `result_df` do not have names'
        ref_label = list(controls_df.index.names)
    elif isinstance(ref_label, Hashable):
        ref_label = [ref_label]
    elif isinstance(ref_label, List):
        pass
    else:
        raise RuntimeError('Invalid data type provided for `ref_label`')

    if hover_col is None:
        hover_col = []
    if isinstance(hover_col, Hashable):
        hover_col = [hover_col]
    elif isinstance(hover_col, List):
        pass
    else:
        # Report the parameter that is actually at fault.
        raise RuntimeError('Invalid data type provided for `hover_col`')

    # Prepare data for plotting
    df = controls_df.stack()
    df.index.names = [*ref_label, data_label]
    df = df.to_frame(name=controls_name)

    df[result_name] = result_df.stack()
    # Assign the filled column back explicitly: a chained in-place fillna acts
    # on a temporary and is a no-op under pandas Copy-on-Write.
    df[result_name] = df[result_name].fillna(0)

    if filter_zero_rows:
        df = df[df.sum(axis=1) > 0].copy()
    df.reset_index(inplace=True)

    if category_labels is not None:
        df[data_label] = df[data_label].map(category_labels)

    fig_df = df.copy()
    if totals_in_titles:
        label_totals = fig_df.groupby(data_label)[[controls_name, result_name]].sum()
        controls_text = label_totals[controls_name].map('{:,.0f}'.format)
        result_text = label_totals[result_name].map('{:,.0f}'.format)
        label_totals['label'] = (
            label_totals.index + f' ({controls_name}=' + controls_text
            + f', {result_name}=' + result_text + ')'
        )
        fig_df[data_label] = fig_df[data_label].map(label_totals['label'])

    if glyph_col is not None:
        # Palettes are keyed by size; the smallest available size is 3.
        n_colors = max(len(fig_df[glyph_col].unique()), 3)
        color_palette = color_palette[n_colors]

    # Prepare figure formatting values
    source = ColumnDataSource(fig_df)
    tooltips = [(c, '@{%s}' % c) for c in hover_col]
    tooltips += [(controls_name, '@{%s}{0,0.0}' % controls_name),
                 (result_name, '@{%s}{0,0.0}' % result_name)]
    figure_params = _prep_figure_params(controls_name, result_name, tooltips, plot_height)
    glyph_params = {
        'source': source,
        'x': controls_name,
        'y': result_name,
        'size': size,
        'fill_alpha': fill_alpha,
        'hover_color': 'red'
    }

    slope = Slope(gradient=1, y_intercept=0, line_color=identity_colour,
                  line_dash='dashed', line_width=identity_width)

    def apply_legend_settings(p_: Figure):
        p_.legend.visible = glyph_legend
        p_.legend.title = glyph_col
        p_.legend.location = glyph_legend_location
        p_.legend.label_text_font_size = glyph_legend_label_text_font_size
        p_.legend.click_policy = 'hide'

    # Plot figure
    if facet_col is None:  # Basic plot
        p = figure(sizing_mode='stretch_both', **figure_params)
        if glyph_col is None:  # Single glyphs
            p.circle(**glyph_params)
        else:
            # Iterate through unique `glyph_col` values to use interactive legend feature
            for i, gc in enumerate(sorted(fig_df[glyph_col].unique())):
                source_view = CDSView(
                    source=source,
                    filters=[GroupFilter(column_name=glyph_col, group=gc)])
                p.circle(view=source_view, legend_label=gc, color=color_palette[i], **glyph_params)
            apply_legend_settings(p)
        if identity_line:
            p.add_layout(slope)
        fig = p
    else:  # Facet plot
        fig = []
        facet_column_items = fig_df[facet_col].unique().tolist()
        if facet_sort_order:
            facet_column_items = sorted(facet_column_items)
        linked_axes = {}
        for i, fc in enumerate(facet_column_items):
            p = figure(title=fc, **figure_params, **linked_axes)
            filters = [GroupFilter(column_name=facet_col, group=fc)]
            if glyph_col is None:  # Single glyphs
                source_view = CDSView(source=source, filters=filters)
                p.circle(view=source_view, **glyph_params)
            else:
                # Iterate through unique `glyph_col` values to use interactive legend feature
                for j, gc in enumerate(sorted(fig_df[glyph_col].unique())):
                    filters_ = filters + [GroupFilter(column_name=glyph_col, group=gc)]
                    source_view = CDSView(source=source, filters=filters_)
                    p.circle(view=source_view, legend_label=gc, color=color_palette[j], **glyph_params)
                apply_legend_settings(p)

            # The first subplot's ranges are reused by every later subplot.
            if (i == 0) and (facet_sync_axes is not None):
                if facet_sync_axes.lower() in ['x', 'both']:
                    linked_axes['x_range'] = p.x_range
                if facet_sync_axes.lower() in ['y', 'both']:
                    linked_axes['y_range'] = p.y_range

            if identity_line:
                p.add_layout(slope)
            fig.append(p)
        fig = gridplot(fig, ncols=facet_col_wrap, sizing_mode='stretch_both', merge_tools=True)

    if figure_title is not None:
        fig = _wrap_figure_title(fig, figure_title)

    if calc_pct_diff:
        df['pct_diff'] = (df[result_name] - df[controls_name]) / df[controls_name] * 100
        # Division by a zero control value yields +/-inf; report it as missing.
        df['pct_diff'] = df['pct_diff'].replace([np.inf, -np.inf], np.nan)

    return df, fig
source=source, selection_color="red", alpha=0.6, nonselection_alpha=0.1, selection_alpha=0.4) orig_data_linreg = corr.line('StockFwdRets', 'LineRegressTotal', source=source, color='purple') filtered_data_linreg = corr.line('StockFwdRets', 'FilteredLineRegressTotal', source=source, color='orange') modified_slope_obj = Slope( gradient=linreg_data_source.to_df().iloc[0]['modified_total_gradient'], y_intercept=linreg_data_source.to_df().iloc[0]['modified_total_yint'], line_color='orange', line_dash='dashed', line_width=3.5) # corr.add_layout(modified_slope_obj) # first time series chart ts1 = figure(plot_width=500, plot_height=400, tools=tools, x_axis_type='datetime', active_drag="xbox_select") ts1.line('date', 'StockAdjClose', source=source_static) ts1.circle('date', 'StockAdjClose', size=2, source=source,
def plot(self, metric="r2q2", hide_pval=True, grid_line=False, legend=True):
    """Build a two-panel figure from ``self.stats_original`` and ``self.stats_perm``.

    The left panel plots the "full" and "cv" values of the chosen metric
    against the third (correlation) value of every stats row, each with a
    line through the original point and the mean of the remaining rows.
    The right panel shows kernel density estimates of the non-original
    values, with a lollipop marking each original value; the t-test
    p-value is attached as hover data and, when ``hide_pval`` is False,
    also drawn as text.

    Side effect: ``self.stats`` is set to the assembled
    ``[full, cv, corr]`` rows (row 0 is the original entry).

    Returns the Bokeh grid holding both panels.
    """
    # Choose metric to plot: translate the short key into the stats-dict key.
    metric_title = np.array([
        "ACCURACY", "AIC", "AUC", "BIC", "F1-SCORE", "PRECISION", "R²",
        "SENSITIVITY", "SPECIFICITY", "SSE"
    ])
    metric_list = np.array([
        "acc", "aic", "auc", "bic", "f1score", "prec", "r2q2", "sens",
        "spec", "sse"
    ])
    mname = metric_title[np.where(metric_list == metric)[0][0]]

    # Row 0 is the original entry, whose correlation value is fixed to 1.
    stats = [[self.stats_original[0][mname], self.stats_original[1][mname], 1]]
    stats.extend(
        [entry[0][mname], entry[1][mname], entry[2]] for entry in self.stats_perm
    )
    self.stats = stats

    if metric == "r2q2":
        full_text, cv_text = "R²", "Q²"
    else:
        full_text, cv_text = mname + "full", mname + "cv"

    # Split data for plotting (r2, q2, corr)
    stats_r2 = [row[0] for row in stats]
    stats_q2 = [row[1] for row in stats]
    stats_corr = [row[2] for row in stats]

    # Gradient and y-intercept of the lines through (1, original value)
    # and (mean correlation, mean of the remaining values).
    corr_gap = 1 - np.mean(stats_corr[1:])
    r2gradient = (stats_r2[0] - np.mean(stats_r2[1:])) / corr_gap
    q2gradient = (stats_q2[0] - np.mean(stats_q2[1:])) / corr_gap
    r2yintercept = stats_r2[0] - r2gradient
    q2yintercept = stats_q2[0] - q2gradient

    max_vals = max(np.max(stats_r2), np.max(stats_q2))
    min_vals = min(np.min(stats_r2), np.min(stats_q2))
    lower_bound = min_vals - abs(0.2 * min_vals)
    # NOTE(review): the upper padding is derived from min_vals here, whereas
    # the second figure uses max_vals -- kept as-is, confirm it is intended.
    y_range_share = (lower_bound, max_vals + abs(0.1 * min_vals))

    # Figure 1
    source = ColumnDataSource(
        data={"corr": stats_corr, "r2": stats_r2, "q2": stats_q2})
    fig1 = figure(plot_width=470, plot_height=410, x_range=(-0.15, 1.15),
                  x_axis_label="Correlation", y_range=y_range_share,
                  y_axis_label=full_text + " & " + cv_text)

    # Lines
    r2slope = Slope(gradient=r2gradient, y_intercept=r2yintercept,
                    line_color="black", line_width=2, line_alpha=0.3)
    q2slope = Slope(gradient=q2gradient, y_intercept=q2yintercept,
                    line_color="black", line_width=2, line_alpha=0.3)
    for trend_line in (r2slope, q2slope):
        fig1.add_layout(trend_line)

    # Points
    r2_square = fig1.square("corr", "r2", size=6, alpha=0.5, color="red",
                            legend=full_text, source=source)
    q2_square = fig1.square("corr", "q2", size=6, alpha=0.5, color="blue",
                            legend=cv_text, source=source)

    # Add Hovertool
    fig1.add_tools(HoverTool(renderers=[r2_square],
                             tooltips=[(full_text + " Value", "@r2")]))
    fig1.add_tools(HoverTool(renderers=[q2_square],
                             tooltips=[(cv_text + " Value", "@q2")]))

    # Extra padding
    for side in ("left", "right", "top", "bottom"):
        setattr(fig1, "min_border_" + side, 20)

    def density_curve(sample):
        # Evaluate a Scott-bandwidth KDE on a 50-point grid padded by 60%
        # of the sample range on either side.
        lo, hi = sample.min(), sample.max()
        padding = (hi - lo) * 0.6
        grid = np.linspace(lo - padding, hi + padding, 50)
        return grid, scipy.stats.gaussian_kde(sample, "scott")(grid)

    # Density curves for Figure 2; the second one is mirrored below the axis.
    x1_grid, x1_pdf_grid = density_curve(np.array(stats_r2[1:]))
    x2_grid, x2_pdf_grid = density_curve(np.array(stats_q2[1:]))
    x2_pdf_grid = [-x for x in x2_pdf_grid]

    # Figure 2
    if hide_pval == True:
        y_range_share2 = (lower_bound, max_vals + abs(0.1 * max_vals))
        ymin = min(x2_pdf_grid) - 1
        xmin = max(x1_pdf_grid) + 1
        yy_range = (ymin - abs(0.1 * ymin), xmin + abs(0.1 * xmin))
    else:
        # Extra room on the right for the p-value text.
        upper_bound = max_vals + 0.8
        if metric == "auc" and upper_bound > 1.5:
            upper_bound = 1.5
        y_range_share2 = (lower_bound, upper_bound)
        ymin = min(x2_pdf_grid) - 1.2
        xmin = max(x1_pdf_grid) + 1.2
        yy_range = (ymin - 1, xmin + 1)

    fig2 = figure(plot_width=470, plot_height=410,
                  x_axis_label=full_text + " & " + cv_text,
                  y_axis_label="p.d.f.", x_range=y_range_share2,
                  y_range=yy_range)
    slope_0 = Span(location=0, dimension="width", line_color="black",
                   line_width=2, line_alpha=0.3)
    fig2.add_layout(slope_0)

    # Plot distribution
    fig2.patch(x1_grid, x1_pdf_grid, alpha=0.35, color="red",
               line_color="grey", line_width=1)
    fig2.patch(x2_grid, x2_pdf_grid, alpha=0.35, color="blue",
               line_color="grey", line_width=1)

    # Extra padding
    fig2.min_border_left = 60
    for side in ("right", "top", "bottom"):
        setattr(fig2, "min_border_" + side, 20)

    def pvalue_text(other_values, original_value):
        # t-test of the remaining values against the original value;
        # small p-values switch to scientific notation.
        pval = ttest_ind(other_values, [original_value], alternative='smaller')[1]
        if pval > 0.005:
            return "%0.2f" % pval
        return "%0.2e" % pval

    # Lollipop R2
    data2_manu = pvalue_text(stats_r2[1:], stats_r2[0])
    r2_top = max(x1_pdf_grid) + 1
    source2 = ColumnDataSource(data={
        "x": [stats_r2[0]],
        "y": [r2_top],
        "hover": [data2_manu],
    })
    source2_line = ColumnDataSource(data={
        "x": [stats_r2[0], stats_r2[0]],
        "y": [r2_top, 0],
        "hover": [str(data2_manu), str(data2_manu)],
    })
    fig2.line("x", "y", line_width=2.25, line_color="red", alpha=0.5,
              source=source2_line)
    fig2.circle("x", "y", fill_color="red", line_color="grey", alpha=0.75,
                size=7, legend=full_text, source=source2)

    # Lollipop Q2
    data3_manu = pvalue_text(stats_q2[1:], stats_q2[0])
    q2_bottom = min(x2_pdf_grid) - 1
    source3 = ColumnDataSource(data={
        "x": [stats_q2[0]],
        "y": [q2_bottom],
        "hover": [data3_manu],
    })
    source3_line = ColumnDataSource(data={
        "x": [stats_q2[0], stats_q2[0]],
        "y": [q2_bottom, 0],
        "hover": [data3_manu, data3_manu],
    })
    fig2.line("x", "y", line_width=2.25, line_color="blue", alpha=0.5,
              source=source3_line)
    fig2.circle("x", "y", fill_color="blue", line_color="grey", alpha=0.75,
                size=7, legend=cv_text, source=source3)

    if hide_pval == False:
        # Add text
        textr2 = "True " + full_text + "\nP-Value: {}".format(data2_manu)
        textq2 = "True " + cv_text + "\nP-Value: {}".format(data3_manu)
        fig2.text(x=[stats_r2[0] + 0.05, stats_q2[0] + 0.05],
                  y=[(max(x1_pdf_grid) + 0.5), (min(x2_pdf_grid) - 1.5)],
                  text=[textr2, textq2], angle=0, text_font_size="8pt")

    # Font-sizes
    fig1.xaxis.axis_label_text_font_size = "13pt"
    fig1.yaxis.axis_label_text_font_size = "13pt"
    fig2.xaxis.axis_label_text_font_size = "12pt"
    fig2.yaxis.axis_label_text_font_size = "12pt"

    fig1.legend.location = "bottom_right"
    fig2.legend.location = "top_left"
    fig1.legend.visible = True
    fig2.legend.visible = True

    if grid_line == False:
        for panel in (fig1, fig2):
            panel.xgrid.visible = False
            panel.ygrid.visible = False

    if legend == False:
        fig1.legend.visible = False
        fig2.legend.visible = False

    return gridplot([[fig1, fig2]])
def scatter_matrix(
        df,
        *,
        xs: Sequence[str] = None,
        ys: Sequence[str] = None,
        width=None,
        height=None,
        regression=True,
        **kwargs,
):
    """
    Build a grid of pairwise scatter plots for the columns of ``df``.

    One cell is created for every (x, y) column pair. A cell whose two columns
    are both numeric gets a scatter plot; any other cell only gets a red
    'Not numeric' note. When ``regression`` is true, an OLS fit (line, R2 and
    slope) is overlaid on every cell that compares two different columns.

    :param df: non-empty dataframe holding the data
    :param xs: columns used horizontally (default: all columns of ``df``)
    :param ys: columns used vertically (default: all columns of ``df``);
        they are reversed and, like ``xs``, sorted so numeric columns come first
    :param width: total grid width, split evenly between the grid columns
    :param height: total grid height, split evenly between the grid rows
    :param regression: overlay the regression line and its stats when true
    :param kwargs: accepted for compatibility, currently unused
    :return: the object returned by ``gridplot`` for the assembled grid
    """
    assert len(df) > 0, 'TODO handle this'  # FIXME handle empty df

    source = CDS(df)
    # TODO what about non-numeric stuff?

    xs = df.columns if xs is None else xs
    ys = df.columns if ys is None else ys
    ys = list(reversed(ys))  # reorder to move meaningful stuff to the top left corner

    isnum = lambda c: is_numeric_dtype(df.dtypes[c])
    # reorder so non-numeric is in the back
    # todo mode to drop non-numeric? not sure.. definitely can drop 'error' and datetimish?
    xs = list(sorted(xs, key=isnum, reverse=True))
    ys = list(sorted(ys, key=isnum, reverse=True))

    # TODO not sure I wanna reuse axis?
    def make(xc: str, yc: str):
        p = figure(df=df)
        # TODO not sure if I even want them... move to the very end?
        if isnum(xc) and isnum(yc):
            p.scatter(x=xc, y=yc, source=source, size=3)
        else:
            # TODO ugh, doesn't want to show the label without any points??
            # FIXME how to make sure text fits into the plot??
            add_text(
                p,
                x=0.0, y=0.0,
                text='Not numeric',
                text_color='red',
            )
        p.xaxis.axis_label = xc
        p.yaxis.axis_label = yc
        return p

    # one row of cells per y column, one cell per x column within a row
    grid = [[make(xc=x, yc=y) for x in xs] for y in ys]

    from bokeh.layouts import gridplot
    # The grid has len(xs) columns and len(ys) rows, so the requested total
    # width is shared between the columns and the total height between the rows.
    w1 = None if width is None else width // max(len(xs), 1)
    h1 = None if height is None else height // max(len(ys), 1)
    grid_res = gridplot(grid, plot_width=w1, plot_height=h1)

    # TODO might be useful to include/exclude specific cols (e.g. datetime) while keeping them in annotations
    # TODO add the presence of the grid to the 'visual tests'
    # also maybe add extra axis under each plot in the grid? easier for a huge matrix of plots

    if not regression:
        return grid_res

    # todo this would be need for plotly as well?
    import statsmodels.formula.api as smf  # type: ignore
    from bokeh.models import Slope

    infinities = [float('inf'), float('-inf')]

    for plot in chain.from_iterable(grid):
        gs = plot.renderers
        if len(gs) == 0:
            # must be non-numeric? meh though
            continue
        [g] = gs
        xx = g.glyph.x
        yy = g.glyph.y

        if xx == yy:
            # diagonal thing, e.g. histogram. compute some stats??
            continue

        # Infinite values are treated like missing ones: turn them into NaN
        # explicitly (instead of relying on a global pandas option) and drop
        # them, otherwise the fit fails.
        # FIXME proper error handling, display number of dropped items?
        dd = df[[xx, yy]].replace(infinities, float('nan')).dropna()

        # todo would be nice to display stats on the number of points dropped
        udd = dd.drop_duplicates()
        if len(udd) <= 1:
            # can't perform a reasonable regression then
            add_text(
                plot,
                x=0.0, y=0.0,
                text='ERROR: no points to correlate',
                text_color='red',
            )
            continue

        res = smf.ols(f"{yy} ~ {xx}", data=dd).fit()
        intercept = res.params['Intercept']
        slope = res.params[xx]
        r2 = res.rsquared

        # TODO is it really the best way to figure out relative position??
        relx = 0.01
        rely = 0.1

        # todo highlight high enough R2?
        minx, maxx = min(dd[xx]), max(dd[xx])
        miny, maxy = min(dd[yy]), max(dd[yy])
        # todo font size dependent on width?? ugh.
        txt = f'R2 = {r2:.4f}\nY ~ {slope:.3f} X'

        # todo need to add various regression properties, like intercept, etc
        # TODO hopefuly this overlays correctly?? not sure about nans, again
        sl = Slope(gradient=slope, y_intercept=intercept, line_color='green', line_width=3)
        plot.add_layout(sl)
        # stats text goes near the lower-left corner of the data range
        add_text(
            plot,
            text=txt,
            x=minx + (maxx - minx) * relx,
            y=miny + (maxy - miny) * rely,
            text_color=g.glyph.line_color,
        )

    # TODO dynamic resizing would be nice
    return grid_res
def triadEffortPlot(args):
    """
    Plot concatenated pickled data from triadEffortData

    Reads every pickled object from stdin (via ``unpickleAll``), draws one line
    per object using its ``x``/``y`` values and its layout name, adds a dashed
    y = x reference line plus a radio-button group that toggles preset
    visibility/ranges, and dumps the result as a Bokeh JSON item to stdout.

    ``args`` is accepted for the command interface but not used here.
    Returns 0.
    """

    from itertools import cycle

    from .stats import unpickleAll

    # Initializing bokeh is an expensive operation and this module is imported
    # alot, so only do it when necessary.
    from bokeh.palettes import Set3
    from bokeh.plotting import figure
    from bokeh.models import RadioButtonGroup, CustomJS, Slope
    from bokeh.embed import json_item
    from bokeh.layouts import column

    p = figure(
        plot_width=1000,
        plot_height=500,
        sizing_mode='scale_both',
        x_range=(0, 1),
        y_range=(0, 1),
        output_backend="webgl",
    )

    data = list(unpickleAll(sys.stdin.buffer))

    # Set3 only provides palettes with 3 to 12 colours. Clamp the requested
    # size into that range and cycle through the palette, so fewer than three
    # or more than twelve layouts no longer fail with a KeyError. For 3..12
    # layouts the colours are the same as with a direct lookup.
    palette = Set3[min(max(len(data), 3), 12)]

    lines = dict()
    for o, color in zip(data, cycle(palette)):
        name = o['layout'].name
        assert name not in lines
        lines[name] = p.line(o['x'], o['y'], line_width=1, color=color,
                             legend_label=name, name=name)

    # color: base1
    slope = Slope(gradient=1, y_intercept=0, line_color='#93a1a1',
                  line_dash='dashed', line_width=1)
    p.add_layout(slope)

    setPlotStyle(p)
    for axis in (p.xaxis, p.yaxis):
        axis.major_label_text_font_size = '1em'
        axis.major_label_text_font = 'IBM Plex Sans'

    # Presets, keyed by the index of the active radio button.
    LABELS = ["All", "Standard", "Usable"]
    visible = {
        0: list(lines.keys()),
        1: ['ar-asmo663', 'ar-linux', 'ar-osx'],
        2: ['ar-lulua', 'ar-ergoarabic', 'ar-malas', 'ar-linux', 'ar-osx'],
    }
    # [(x start, x end), (y start, y end)] per preset
    ranges = {
        0: [(0, 1), (0, 1)],
        1: [(0, 0.5), (0, 0.4)],
        2: [(0, 0.5), (0, 0.4)],
    }

    presets = RadioButtonGroup(labels=LABELS, active=0)
    # Set visibility and x/yranges on click. Not sure if there’s a more pythonic way.
    presets.js_on_click(
        CustomJS(
            args=dict(lines=lines, plot=p, visible=visible, ranges=ranges),
            code="""
        for (const [k, line] of Object.entries (lines)) {
            line.visible = visible[this.active].includes (k);
        }
        const xrange = plot.x_range;
        xrange.start = ranges[this.active][0][0];
        xrange.end = ranges[this.active][0][1];
        const yrange = plot.y_range;
        yrange.start = ranges[this.active][1][0];
        yrange.end = ranges[this.active][1][1];
        """))

    json.dump(json_item(column(p, presets)), sys.stdout)

    return 0
def figures_slopes(df_slopes, df_pop):
    """
    Scatter the weekly rate of change of tests against that of positive cases.

    The per-country rates come from ``determineSlope`` (7-day start offset,
    no end offset, 7-day rolling window). Rows containing infinite or missing
    values are dropped. Countries whose test rate is at least their case rate
    are drawn in blue, the others in red, with a dashed y = x reference line.

    Returns a tuple ``(source, figure)`` where ``source`` is the
    ColumnDataSource built from the cleaned rates.
    """
    nbStart = 7
    nbEnd = 0
    rolling = 7

    df_countrySlopes = determineSlope(df_slopes, df_pop, nbStart, nbEnd, rolling)
    df_countrySlopes = df_countrySlopes.replace([np.inf, -np.inf], np.nan)
    df_countrySlopes = df_countrySlopes.dropna()

    # "temp" is the group flag used by the two views below:
    # "1" when the tests rate is >= the cases rate, "0" otherwise.
    df_countrySlopes["temp"] = "0"
    df_countrySlopes.loc[
        df_countrySlopes.testsWeeklyPerc >= df_countrySlopes.casesWeeklyPerc,
        ['temp']] = "1"

    source = ColumnDataSource(df_countrySlopes)

    view1 = CDSView(source=source,
                    filters=[GroupFilter(column_name='temp', group="1")])
    view2 = CDSView(source=source,
                    filters=[GroupFilter(column_name='temp', group="0")])

    TOOLTIPS = [
        ("Country/Region", "@CountryProv"),
        ("Cases Rate (%)", "@casesWeeklyPerc"),
        ("Tests Rate (%)", "@testsWeeklyPerc"),
    ]

    p1 = figure(tooltips=TOOLTIPS,
                tools=",pan,tap,box_zoom,reset",
                title="Generated on the basis of " + str(rolling) +
                " day moving average")

    p1.scatter('casesWeeklyPerc', 'testsWeeklyPerc',
               source=source, size=12, color='#73b2ff',
               legend_label='Tests Rate > Cases Rate', view=view1)
    p1.scatter('casesWeeklyPerc', 'testsWeeklyPerc',
               source=source, size=12, color='#ff7f7f',
               legend_label='Tests Rate < Cases Rate', view=view2)

    p1.xaxis.axis_label = 'Weekly Rate of Change for Positive Cases(%)'
    p1.yaxis.axis_label = 'Weekly Rate of Change for Nb. Tests(%)'

    # Four white rays from the origin (length 0 means "unbounded" for a ray
    # glyph) act as the x and y axes crossing at (0, 0).
    p1.ray([0], [0], length=0, angle=np.pi, color='white')
    p1.ray([0], [0], length=0, angle=0, color='white')
    p1.ray([0], [0], length=0, angle=np.pi / 2, color='white')
    p1.ray([0], [0], length=0, angle=3 * np.pi / 2, color='white')

    editplotcolors(p1)

    # y = x: points above the line have tests growing faster than cases
    slope = Slope(gradient=1, y_intercept=0, line_color='white',
                  line_dash='dashed', line_width=2)
    p1.add_layout(slope)

    p1.legend.background_fill_alpha = 0.8
    p1.legend.background_fill_color = "#262626"
    p1.legend.border_line_alpha = 0
    p1.legend.label_text_color = "whitesmoke"
    p1.legend.location = 'top_right'
    p1.toolbar_location = "right"

    return source, p1
def generate_figure(self,
                    columns,
                    category_select=None,
                    genome_features=None,
                    reaction_scores=None,
                    reaction_percentiles=None):
    """
    Build one row of Bokeh figures for every pair of columns.

    For each unordered pair (first, second) of ``columns`` the row holds, in
    this order:

    * a "genome features" scatterplot, only when BOTH ``genome_features`` and
      ``reaction_scores`` are given (both are needed to draw it);
    * a reaction-percentile scatterplot followed by a subsystem ``Select``
      widget, only when ``reaction_percentiles`` is given.

    Parameters
    ----------
    columns : sequence of column names to compare pairwise
    category_select : iterable of subsystem names offered in the dropdown;
        ``None`` is treated as "no subsystems"
    genome_features : mapping of column name -> values (background points)
    reaction_scores : mapping of column name -> values (red points)
    reaction_percentiles : mapping of subsystem name -> column data; must
        contain an ``'All'`` entry, and each entry is expected to provide
        ``color``, ``size``, ``fill_alpha`` and ``tooltip`` columns
        (NOTE(review): schema inferred from the glyph arguments — confirm)

    Returns
    -------
    list of rows, each row being a list of figures/widgets
    """
    # To include with figure object
    TOOLTIPS = [("reaction", "@tooltip"), ("(x,y)", "($x, $y)")]

    ##################################################################
    # The output figure will be saved using the 'grid' function
    # Each row in the figure will be from a pair of columns in the matrix
    # The first scatterplot will be the general "genome-features background"
    # The second scatterplot will be the reaction percentiles and subsystems
    # The third "column" in a row will contain the subsystem select
    figure_grid = list()

    for first_column in range(len(columns)):
        # only pairs with first < second, so each pair is plotted once
        for second_column in range(first_column + 1, len(columns)):

            # Row of figures for the pair of conditions
            figure_row = list()

            ##################################################################
            # For the first scatterplot, it is optional.
            # Both data sets are used below, so both must be present.
            if (genome_features is not None and reaction_scores is not None):

                # Find range for axes
                x_max = math.ceil(
                    max(genome_features[columns[first_column]]))
                y_max = math.ceil(
                    max(genome_features[columns[second_column]]))
                plot_max = max([x_max, y_max])

                bokeh_fig = figure(x_range=(0.0, plot_max),
                                   y_range=(0.0, plot_max))
                bokeh_fig.xaxis.axis_label = columns[first_column]
                bokeh_fig.yaxis.axis_label = columns[second_column]
                bokeh_fig.title.text = "Genome Features Expression Abundances"

                genome_source = ColumnDataSource(
                    data=dict(genome_features))

                # Plot as black and visible
                bokeh_fig.circle(x=columns[first_column],
                                 y=columns[second_column],
                                 source=genome_source,
                                 color='black', size=4, visible=True)

                reaction_source = ColumnDataSource(
                    data=dict(reaction_scores))

                # Plot as red
                bokeh_fig.circle(x=columns[first_column],
                                 y=columns[second_column],
                                 source=reaction_source,
                                 color='red', size=6, visible=True)

                slope_line = Slope(gradient=1, y_intercept=0,
                                   line_color="red")
                bokeh_fig.add_layout(slope_line)

                figure_row.append(bokeh_fig)

            ##################################################################
            # For the second scatterplot
            if (reaction_percentiles is not None):

                ##################################################################
                # Set up parent figure object
                bokeh_fig = figure(tooltips=TOOLTIPS,
                                   x_range=(0.0, 1.0), y_range=(0.0, 1.0))
                bokeh_fig.xaxis.axis_label = columns[first_column]
                bokeh_fig.yaxis.axis_label = columns[second_column]
                bokeh_fig.xaxis.formatter = NumeralTickFormatter(
                    format="0.0")
                bokeh_fig.yaxis.formatter = NumeralTickFormatter(
                    format="0.0")
                bokeh_fig.title.text = "Model Reactions Percentile Rank (p<0.01)"

                ##################################################################
                # The data is transformed into ColumnDataSource object to allow for CustomJS to work
                # The source_dict stores the data after it's been transformed into ColumnDataSource
                # The scatter_dict stores the individual bokeh scatterplots for rendering in CustomJS
                source_dict = dict()
                scatter_dict = dict()

                ##################################################################
                # For the background data, all the data is captured under a single 'All' key
                # It is added first, so that it will always be in the background
                # It is intentionally made visible and won't be changed in the CustomJS

                # Transform
                source = ColumnDataSource(
                    data=dict(reaction_percentiles['All']))

                # Store transformation
                source_dict['All'] = source

                # Plot as black and visible
                scatter_fig = bokeh_fig.circle(x=columns[first_column],
                                               y=columns[second_column],
                                               source=source,
                                               color='color', size='size',
                                               fill_alpha='fill_alpha',
                                               visible=True)

                # Store plot
                scatter_dict['All'] = scatter_fig

                ##################################################################
                # For the foreground data, the scatter plot for each subsystem is created
                # separately, but made invisible, to be used with the Select dropdown
                for scatter in reaction_percentiles.keys():

                    # Not using the 'All' background data
                    if (scatter == 'All'):
                        continue

                    # Transform
                    source = ColumnDataSource(
                        data=dict(reaction_percentiles[scatter]))

                    # Store transformation
                    source_dict[scatter] = source

                    # Plot as red but not visible
                    scatter_fig = bokeh_fig.circle(
                        x=columns[first_column],
                        y=columns[second_column],
                        source=source,
                        color='color', size='size',
                        fill_alpha='fill_alpha',
                        visible=False)

                    # Store plot
                    scatter_dict[scatter] = scatter_fig

                # Add red central slope
                slope_line = Slope(gradient=1, y_intercept=0,
                                   line_color="red")
                bokeh_fig.add_layout(slope_line)

                # Add parent figure to row of figures
                figure_row.append(bokeh_fig)

                # Add subsystem selector
                # Starts with default value of "None" and allows user to pick one
                # whereupon, according to CustomJS code below, it'll become visible
                # (category_select may be left at its default of None)
                subsystem_select = Select(
                    title="Select Subsystem:",
                    value="None",
                    options=['None'] + sorted(category_select or []))

                # Add JS callback
                callback = CustomJS(args=dict(
                    source=source_dict,
                    figs=scatter_dict,
                    subsystem_select=subsystem_select),
                    code="""
                    console.log("Updating")
                    for (let scatter in source){
                        // Only choose subsystem
                        if(scatter == 'All'){
                            continue
                        }
                        // Chosen subsystem
                        if(scatter == subsystem_select.value){
                            figs[scatter].visible=true
                            // Iterate through datapoints to make sure they are red and of a larger size
                            for (let i = 0; i < source[scatter].data['color'].length; i++) {
                                // This is where I would scale with p-value
                                source[scatter].data['color'][i] = 'red'
                                source[scatter].data['size'][i] = 8
                            }
                        } else {
                            // Here we have to make sure that the non-chosen subsystems are not visible
                            figs[scatter].visible=false
                            // default values, but this is really un-necessary
                            for (let i = 0; i < source[scatter].data['color'].length; i++) {
                                source[scatter].data['color'][i] = 'black'
                                source[scatter].data['size'][i] = 6
                            }
                        }
                        // Actually show change in plot
                        source[scatter].change.emit()
                    }
                    """)
                subsystem_select.js_on_change('value', callback)

                # Add subsystem selector to row of figures
                figure_row.append(subsystem_select)

            # Add row of figures to grid
            figure_grid.append(figure_row)

    return figure_grid
def figure_scatter_values(df_chisq):
    """
    Plot detrended cases against the 7-day moving average of cases.

    The input frame is extended IN PLACE with day-to-day difference columns,
    the matching "start" columns (value minus its difference) and a ``dt_str``
    date label. Starting from the latest date and stepping back 14 days at a
    time, the last 14 rows of every ``CountryProv`` are collected per window.
    One figure is produced per (Continent, window end date), with an arrow
    from each point's "start" position to its value, a country-code label on
    the row dated at the window end, and two wide orange reference lines.

    :param df_chisq: dataframe; the code reads the columns ``case_ma07``,
        ``tests_ma07``, ``case_detrended``, ``caseDet_pct``, ``Date``
        (datetime), ``CountryProv``, ``Continent`` and ``cp_code``
    :return: a ``column`` layout wrapping a grid with 3 figures per row
    :raises Exception: when no 14-day window could be collected
    """
    import datetime as dt
    from itertools import zip_longest

    from bokeh.models import Arrow, LabelSet, VeeHead

    df_chisq["casema07_diff07"] = df_chisq.case_ma07.diff(periods=1)
    df_chisq["testsma07_diff07"] = df_chisq.tests_ma07.diff(periods=1)
    df_chisq["casedet_diff07"] = df_chisq.case_detrended.diff(periods=1)
    df_chisq["casedetpct_diff07"] = df_chisq.caseDet_pct.diff(periods=1)

    df_chisq["angle"] = (df_chisq.testsma07_diff07
                         / df_chisq.casema07_diff07 * 3.14)

    # "start" = value at the previous row, i.e. where each arrow begins
    df_chisq["casema07_start"] = df_chisq.case_ma07 - df_chisq.casema07_diff07
    df_chisq["testsma07_start"] = (df_chisq.tests_ma07
                                   - df_chisq.testsma07_diff07)
    df_chisq["casedet_start"] = (df_chisq.case_detrended
                                 - df_chisq.casedet_diff07)
    df_chisq["casedetpct_start"] = (df_chisq.caseDet_pct
                                    - df_chisq.casedetpct_diff07)
    df_chisq["dt_str"] = df_chisq.Date.dt.strftime("%Y-%m-%d")

    # FIXME
    print("gathering moving 14-day windows")
    df_sub = df_chisq
    df_latest = []
    dtmax_n = df_sub.Date.unique().max()
    dtmin_n = df_sub.Date.unique().min()
    # window end dates: latest date first, then every 14 days backwards
    dt_range = np.arange(dtmax_n, dtmin_n, dt.timedelta(days=-14))
    # http://stackoverflow.com/questions/28327101/ddg#28327650
    for dt_i in dt_range:
        dt_delta = (dt_i - dtmin_n).astype('timedelta64[D]').astype(int)
        if dt_delta < 14:
            # fewer than 14 days of history before this end date
            continue
        print(dt_i, dt_delta)
        df_i = df_sub[df_sub.Date <= dt_i]
        df_i = df_i.groupby("CountryProv").apply(
            lambda g: g.tail(14)).reset_index(drop=True)
        df_i["color"] = "#73b2ff"
        df_i["dtLast"] = dt_i
        df_latest.append(df_i)

    if len(df_latest) == 0:
        raise Exception("No data in moving window")

    df_latest = pd.concat(df_latest, axis=0)
    # only the row dated at the window end carries a visible label
    df_latest["display_cpcode"] = df_latest.apply(
        lambda g: "" if g.dtLast != g.Date else g.cp_code, axis=1)
    print("done")

    # since cannot use View with LabelSet, creating a different source per continent
    # Couldn't figure out how to filter the datasource in add_layout or Arrow,
    # so just grouping on both continent and dtLast
    srcLatest_continent = df_latest.groupby(
        ["Continent", "dtLast"]).apply(lambda g: ColumnDataSource(g))
    srcLatest_continent = srcLatest_continent.reset_index().rename(
        columns={0: "src"})

    # general-use lines
    slope_y0 = Slope(gradient=0, y_intercept=0,
                     line_color='orange', line_width=50)
    # np.inf instead of the np.Inf alias, which NumPy 2.0 removed
    slope_x0 = Slope(gradient=np.inf, y_intercept=0,
                     line_color='orange', line_width=50)

    # scatter plot
    TOOLTIPS = [
        ("Country/Region", "@CountryProv"),
        ("Date", "@dt_str"),
    ]

    # Each entry: (group key, x value col, y value col, x start col,
    #              y start col, x axis label, y axis label).
    # Only the 'values' group is populated at the moment; 'diffs' stays empty.
    params = (
        ('values', 'case_detrended', 'case_ma07', 'casedet_start',
         'casema07_start', 'detrended(cases)', 'ma07(Cases)'),
    )

    p_all = {'values': [], 'diffs': []}

    for k, fdxv, fdyv, fdxs, fdys, labx, laby in params:
        p_cont = []
        for srcCont_i in srcLatest_continent.iterrows():
            srcCont_i = srcCont_i[1]
            print("Adding plot for %s, %s" % (srcCont_i.Continent,
                                              srcCont_i.dtLast))

            p_d1 = figure(plot_width=600, plot_height=400,
                          tooltips=TOOLTIPS,
                          title="%s %s" % (srcCont_i.Continent,
                                           srcCont_i.dtLast))

            p_d1.scatter(fdxv, fdyv, source=srcCont_i.src,
                         size=3, color='red')

            # arrow from the previous position to the current one
            # (a view is not supported here, hence one source per figure)
            p_d1.add_layout(
                Arrow(end=VeeHead(size=6),
                      x_start=fdxs, y_start=fdys,
                      x_end=fdxv, y_end=fdyv,
                      line_color='blue',
                      source=srcCont_i.src))

            p_d1.xaxis.axis_label = labx
            p_d1.yaxis.axis_label = laby

            labels = LabelSet(x=fdxv, y=fdyv, text='display_cpcode',
                              level='glyph', x_offset=5, y_offset=5,
                              source=srcCont_i.src, render_mode='canvas')
            p_d1.add_layout(labels)
            p_d1.add_layout(slope_y0)
            p_d1.add_layout(slope_x0)

            p_cont.append(p_d1)

        p_all[k] = p_cont

    # group plots into 3 per row
    # https://stackoverflow.com/a/1625013/4126114
    for k in ['values', 'diffs']:
        p_cont = p_all[k]
        p_cont = list(zip_longest(*(iter(p_cont), ) * 3))
        # zip_longest pads the last row with None; drop the padding
        p_cont = [[e for e in t if e is not None] for t in p_cont]
        p_all[k] = p_cont

    g = gridplot(p_all['values'] + p_all['diffs'])
    layout = column(g)
    return layout
def _add_ci95_ellipses(fig, x_score, y_score, group_label, size, mean_dash,
                       pop_dash=None, legend=False):
    """Draw the mean/population 95% ellipses of every group on ``fig``.

    Returns a list with, per group, the largest absolute coordinate of the
    population ellipse (used by the caller to size the axes).
    """
    unique_group = np.sort(np.unique(group_label))

    # Set colour per group
    list_color = [
        "red", "blue", "green", "black", "orange", "yellow", "brown", "cyan"
    ]
    # Repeat the colours when there are more groups than colours
    while len(list_color) < len(unique_group):
        list_color += list_color

    pop_kwargs = {} if pop_dash is None else {"line_dash": pop_dash}

    max_val = []
    for i, grp in enumerate(unique_group):
        # Get scores for the corresponding group
        members = [j for j in range(len(group_label))
                   if group_label[j] == grp]
        data_circ_group = pd.DataFrame({
            "0": [x_score[j] for j in members],
            "1": [y_score[j] for j in members],
        })

        # Calculate ci95 ellipse for each group
        m, outside_m = ci95_ellipse(data_circ_group, type="mean")
        p, outside_p = ci95_ellipse(data_circ_group, type="pop")

        # Plot ci95 ellipse outer line
        mean_kwargs = {"legend": "{}".format(grp)} if legend else {}
        fig.line(m[:, 0], m[:, 1], color=list_color[i], line_width=2,
                 alpha=0.8, line_dash=mean_dash, **mean_kwargs)
        fig.line(p[:, 0], p[:, 1], color=list_color[i], alpha=0.4,
                 **pop_kwargs)

        # Plot ci95 ellipse shade
        fig.patch(m[:, 0], m[:, 1], color=list_color[i], alpha=0.07)
        fig.patch(p[:, 0], p[:, 1], color=list_color[i], alpha=0.01)

        # Mark the centre of the mean ellipse
        fig.x(np.median(m[:, 0]), np.median(m[:, 1]), size=size, alpha=0.6,
              color=list_color[i], line_width=2)

        max_val.append(max(np.abs(p).flatten()))
    return max_val


def scatter(x, y, label=None, group=None, title="Scatter Plot", xlabel="x",
            ylabel="y", width=600, height=600, legend=True, size=4,
            shape="circle", font_size="16pt", label_font_size="13pt",
            col_palette=None, hover_xy=True, gradient=False,
            gradient_alt=False, hline=False, vline=False, xrange=None,
            yrange=None, ci95=False, scatterplot=True, extraci95_x=False,
            extraci95_y=False, extraci95=False):
    """Creates a scatterplot using Bokeh.

    Required Parameters
    -------------------
    x : array-like, shape = [n_samples]
        Input data for x-axis.

    y : array-like, shape = [n_samples]
        Input data for y-axis.

    Optional Parameters
    -------------------
    label : DataFrame or Series, or None
        Extra columns shown in the hover tool-tip. When None, an ``Idx``
        column with the input order is used.

    group : array-like or None
        Group of each sample. The first two sorted unique values get the first
        two palette colours; every other value (and all points when group is
        None) gets the third palette colour.

    shape : "circle" or "triangle"
        Marker used when ``scatterplot`` is True.

    col_palette : list of three colours (default red, blue, green).

    gradient : number or False
        When given, a line with this gradient and a dashed perpendicular line
        are added; through the origin, or through (0.5, 0.5) when
        ``gradient_alt`` is not False.

    hline, vline : add a horizontal / vertical line at 0 when not False.

    ci95 : add 95% confidence ellipses per group for (x, y).

    extraci95, extraci95_x, extraci95_y : add dashed 95% confidence ellipses
        per group for the extra scores, and reset both axes to a symmetric
        range that fits all drawn population ellipses.

    Returns
    -------
    The Bokeh figure.

    Raises
    ------
    ValueError
        If x and y differ in length, or ``shape`` is not supported.
    """

    # Error check
    if len(x) != len(y):
        raise ValueError("length of X does not match length of Y.")

    # If label is None, give an index based on input order
    if label is None:
        label_copy = {"Idx": list(range(len(x)))}
    else:
        try:
            # DataFrame: one tool-tip entry per column.
            # Copy so label is not overwritten (when plot_groupmean=True)
            label_copy = label.copy().to_dict("series")
        except TypeError:
            # Series: to_dict does not accept an orient argument
            label2 = label.copy()
            label_copy = {label2.name: label2.values.tolist()}

    # If colour palette is None (default):
    if col_palette is None:
        col_palette = ["red", "blue", "green"]

    # Group is None or allow for multiple classes (can add more in the Future)
    if group is None:
        group_copy = [None] * len(x)
        col = [col_palette[2]] * len(x)
    else:
        group_copy = group.copy()
        group_unique = np.sort(np.unique(group_copy))

        def _colour_for(value):
            if value == group_unique[0]:
                return col_palette[0]
            if value == group_unique[1]:
                return col_palette[1]
            return col_palette[2]

        col = [_colour_for(group_copy[i]) for i in range(len(group_copy))]

    # Bokeh data source with data labels
    data = {"x": x, "y": y, "group": group_copy, "col": col}
    data_label = dict(label_copy)
    data.update(data_label)
    source = ColumnDataSource(data=data)

    # Tool-tip (add everything in label_copy)
    TOOLTIPS = []
    if hover_xy is True:
        TOOLTIPS = [("x", "@x{1.111}"), ("y", "@y{1.111}")]
    for name in data_label:
        TOOLTIPS.append((str(name), "@" + str(name)))

    # Base figure
    fig = figure(title=title, x_axis_label=xlabel, y_axis_label=ylabel,
                 plot_width=width, plot_height=height,
                 x_range=xrange, y_range=yrange)

    # Add to plot
    if scatterplot is True:
        # compare by value: identity checks against string literals are
        # not reliable
        if shape == "circle":
            renderer = fig.circle("x", "y", size=size, alpha=0.6,
                                  color="col", source=source)
        elif shape == "triangle":
            renderer = fig.triangle("x", "y", size=size, alpha=0.6,
                                    color="col", source=source)
        else:
            raise ValueError("shape has to be either 'circle' or 'triangle'.")

        shape_hover = HoverTool(renderers=[renderer], tooltips=TOOLTIPS)
        fig.add_tools(shape_hover)

    if gradient is not False:
        # gradient of the perpendicular line
        new_gradient = -(1 / gradient)
        if gradient_alt is False:
            # both lines pass through the origin
            c = 0
            new_c = 0
        else:
            # both lines pass through (0.5, 0.5)
            c = 0.5 - gradient * 0.5
            new_c = 0.5 - new_gradient * 0.5
        slope = Slope(gradient=gradient, y_intercept=c, line_color="black",
                      line_width=2, line_alpha=0.3)
        fig.add_layout(slope)
        slope2 = Slope(gradient=new_gradient, y_intercept=new_c,
                       line_color="black", line_dash="dashed", line_width=2,
                       line_alpha=0.10)
        fig.add_layout(slope2)

    if hline is not False:
        h = Span(location=0, dimension="width", line_color="black",
                 line_width=3, line_alpha=0.15)
        fig.add_layout(h)

    if vline is not False:
        v = Span(location=0, dimension="height", line_color="black",
                 line_width=3, line_alpha=0.15)
        fig.add_layout(v)

    # Largest extent of the population ellipses drawn so far; initialised here
    # so that extraci95 also works when ci95 is off.
    max_val = []

    # if ci95 is true
    if ci95 is True:
        # without groups every sample belongs to one common group
        group_label = [0] * len(x) if group is None else group_copy
        # Score plot extra: 95% confidence ellipse using PCA
        max_val += _add_ci95_ellipses(fig, x, y, group_label, size,
                                      "solid", legend=True)

    if extraci95 is True:
        group_label = [0] * len(x) if group is None else group_copy
        max_val += _add_ci95_ellipses(fig, extraci95_x, extraci95_y,
                                      group_label, size, "dashed",
                                      pop_dash="dashed")

        # symmetric range with 5% padding around the largest ellipse
        max_range = max(max_val)
        new_range_min = -max_range - 0.05 * max_range
        new_range_max = max_range + 0.05 * max_range
        fig.y_range = Range1d(new_range_min, new_range_max)
        fig.x_range = Range1d(new_range_min, new_range_max)

    # Font-sizes
    fig.title.text_font_size = font_size
    fig.xaxis.axis_label_text_font_size = label_font_size
    fig.yaxis.axis_label_text_font_size = label_font_size

    # Extra padding
    fig.min_border_left = 20
    fig.min_border_right = 20
    fig.min_border_top = 20
    fig.min_border_bottom = 20

    # Show or remove legend
    if legend is True:
        fig.legend.visible = True
        fig.legend.location = "bottom_right"
    else:
        fig.legend.visible = False

    return fig
# Coefficients of the fitted line, taken from a simple linear regression
# that was run beforehand.
gradient, y_intercept = 0.94, 0.1966

# Create the figure; the y-axis gets 10% of headroom above the largest point.
p = figure(
    title="Scatterplot with fitted regression line",
    plot_width=647,
    plot_height=480,
    y_range=(0, 1.1 * max(ypts)),
)

# Observations first ...
p.circle(xpts, ypts, color="#aeb3b7", size=10)

# ... then the regression line on top of them.
slope = Slope(
    gradient=gradient,
    y_intercept=y_intercept,
    line_color='#3a6587',
    line_dash='dotted',
    line_width=2,
)
p.add_layout(slope)

# A width of 0 does not hide the outline, so make it as thin as 0.1 to
# render it nearly invisible.
p.outline_line_width = 0.1

# No gridlines, to keep the chart free of clutter.
p.xgrid.grid_line_color = None
p.ygrid.grid_line_color = None

# Tweaks that only concern the x-axis.
p.xaxis.axis_label = "Independent Variable"
def _cv_predictions(model, X, Y, splits):
    """Refit ``model`` on every training split and gather held-out predictions."""
    held_out = [None] * len(Y)
    for train_rows, test_rows in splits:
        X_train = X[train_rows, :]
        Y_train = Y[train_rows]
        X_test = X[test_rows, :]
        model.train(X_train, Y_train)
        fold_pred = model.test(X_test)
        for row, value in zip(test_rows, fold_pred):
            held_out[row] = value.tolist()
    return held_out


def _add_lollipop(fig, true_value, permuted, tip_y, colour, legend_text):
    """Draw a vertical 'lollipop' at ``true_value`` and return its p-value text."""
    # One-sided t-test of the permuted values against the true value
    pval = ttest_ind(permuted, [true_value], alternative="smaller")[1]
    pval_text = "%0.2f" % pval if pval > 0.005 else "%0.2e" % pval

    head = ColumnDataSource(data={
        "x": [true_value],
        "y": [tip_y],
        "hover": [pval_text],
    })
    stem = ColumnDataSource(data={
        "x": [true_value, true_value],
        "y": [tip_y, 0],
        "hover": [str(pval_text), str(pval_text)],
    })
    fig.line("x", "y", line_width=2, line_color=colour, source=stem)
    fig.circle("x", "y", fill_color=colour, size=6, legend=legend_text,
               source=head)
    return pval_text


def permutation_test(model, X, Y, nperm=100, folds=8, grid_line=True):
    """Creates permutation test plots using Bokeh.

    The model is copied, then R² (full data) and Q² (stratified k-fold
    cross-validation) are computed for the true Y and for ``nperm`` shuffled
    copies of Y. Figure 1 plots R²/Q² against the absolute correlation between
    shuffled and true Y; figure 2 shows the density of the permuted R²/Q²
    values with the true values marked and annotated with a p-value.

    Required Parameters
    -------------------
    model : object
        Must provide ``train(X, Y)`` and ``test(X)``; a deep copy is used.

    X : array-like, shape = [n_samples, n_features]
        Predictor variables, where n_samples is the number of samples and n_features is the number of predictors.

    Y : array-like, shape = [n_samples, 1]
        Response variables, where n_samples is the number of samples.

    Optional Parameters
    -------------------
    nperm : number of permutations.
    folds : number of cross-validation folds.
    grid_line : set to False to hide the grid lines of both figures.

    Returns
    -------
    A Bokeh gridplot holding both figures side by side.
    """

    model = deepcopy(model)

    # Stratified k-fold train/test indices for the true Y
    true_splits = list(StratifiedKFold(n_splits=folds).split(X, Y))

    # Metrics of the model on the full data and under cross-validation
    stats_full = binary_metrics(Y, model.test(X))
    stats_cv = binary_metrics(Y, _cv_predictions(model, X, Y, true_splits))

    # Rows are [R2, Q2, correlation with true Y]; row 0 is the true model
    stats = [[stats_full["R²"], stats_cv["R²"], 1]]

    # For each permutation, shuffle Y and calculate R2, Q2 and append to stats
    for _ in tqdm(range(nperm), desc="Permutation Resample"):
        Y_shuff = Y.copy()
        np.random.shuffle(Y_shuff)

        # Full-data metrics for the shuffled response
        model.train(X, Y_shuff)
        stats_full = binary_metrics(Y_shuff, model.test(X))

        # Cross-validated metrics, with folds stratified on the shuffled Y
        perm_splits = list(StratifiedKFold(n_splits=folds).split(X, Y_shuff))
        stats_cv = binary_metrics(
            Y_shuff, _cv_predictions(model, X, Y_shuff, perm_splits))

        # Pearson product-moment correlation between shuffled and true Y
        corr = abs(np.corrcoef(Y_shuff, Y)[0, 1])
        stats.append([stats_full["R²"], stats_cv["R²"], corr])

    # Split data for plotting (corr, r2, q2)
    stats_r2 = [row[0] for row in stats]
    stats_q2 = [row[1] for row in stats]
    stats_corr = [row[2] for row in stats]

    # Lines through (1, true value) and (mean correlation, mean permuted value)
    corr_gap = 1 - np.mean(stats_corr[1:])
    r2gradient = (stats_r2[0] - np.mean(stats_r2[1:])) / corr_gap
    q2gradient = (stats_q2[0] - np.mean(stats_q2[1:])) / corr_gap
    r2yintercept = stats_r2[0] - r2gradient
    q2yintercept = stats_q2[0] - q2gradient

    # Figure 1
    source = ColumnDataSource(
        data={"corr": stats_corr, "r2": stats_r2, "q2": stats_q2})
    fig1 = figure(plot_width=470, plot_height=410, x_range=(-0.15, 1.15),
                  x_axis_label="Correlation", y_axis_label="R² & Q²")

    # Lines
    for grad, icept in ((r2gradient, r2yintercept),
                        (q2gradient, q2yintercept)):
        fig1.add_layout(Slope(gradient=grad, y_intercept=icept,
                              line_color="black", line_width=2,
                              line_alpha=0.3))

    # Points
    r2_square = fig1.square("corr", "r2", size=6, alpha=0.5, color="red",
                            legend="R²", source=source)
    q2_square = fig1.square("corr", "q2", size=6, alpha=0.5, color="blue",
                            legend="Q²", source=source)

    # Add Hovertool
    fig1.add_tools(
        HoverTool(renderers=[r2_square], tooltips=[("R² Value", "@r2")]))
    fig1.add_tools(
        HoverTool(renderers=[q2_square], tooltips=[("Q² Value", "@q2")]))

    # Extra padding
    fig1.min_border_left = 20
    fig1.min_border_right = 20
    fig1.min_border_top = 20
    fig1.min_border_bottom = 20
    fig1.legend.location = "bottom_right"

    # Density curve of the permuted R2 values (Figure 2, upper half)
    X1 = np.array(stats_r2[1:])
    x1_min, x1_max = X1.min(), X1.max()
    x1_padding = (x1_max - x1_min) * 0.6
    x1_grid = np.linspace(x1_min - x1_padding, x1_max + x1_padding, 50)
    x1_pdf_grid = scipy.stats.gaussian_kde(X1, "scott")(x1_grid)

    # Density curve of the permuted Q2 values, mirrored below the axis
    X2 = np.array(stats_q2[1:])
    x2_min, x2_max = X2.min(), X2.max()
    x2_padding = (x2_max - x2_min) * 0.6
    x2_grid = np.linspace(x2_min - x2_padding, x2_max + x2_padding, 50)
    x2_pdf_grid = [-d for d in scipy.stats.gaussian_kde(X2, "scott")(x2_grid)]

    # Figure 2
    fig2 = figure(plot_width=470, plot_height=410,
                  x_range=(min(x2_grid) * 1.1,
                           max(stats_r2[0], max(x1_grid)) + 0.65),
                  y_range=((min(x2_pdf_grid) - 1) * 1.2,
                           (max(x1_pdf_grid) + 1) * 1.1),
                  x_axis_label="R² & Q²", y_axis_label="p.d.f.")
    slope_0 = Span(location=0, dimension="width", line_color="black",
                   line_width=2, line_alpha=0.3)
    fig2.add_layout(slope_0)

    # Plot distribution
    fig2.patch(x1_grid, x1_pdf_grid, alpha=0.35, color="red",
               line_color="grey", line_width=1)
    fig2.patch(x2_grid, x2_pdf_grid, alpha=0.35, color="blue",
               line_color="grey", line_width=1)

    # Extra padding
    fig2.min_border_left = 60
    fig2.min_border_right = 20
    fig2.min_border_top = 20
    fig2.min_border_bottom = 20

    # Lollipops: true R2 above the axis, true Q2 below it
    data2_manu = _add_lollipop(fig2, stats_r2[0], stats_r2[1:],
                               max(x1_pdf_grid) + 1, "red", "R²")
    data3_manu = _add_lollipop(fig2, stats_q2[0], stats_q2[1:],
                               min(x2_pdf_grid) - 1, "blue", "Q²")

    # Add text
    textr2 = "True R²\nP-Value: {}".format(data2_manu)
    textq2 = "True Q²\nP-Value: {}".format(data3_manu)
    fig2.text(x=[stats_r2[0] + 0.05, stats_q2[0] + 0.05],
              y=[(max(x1_pdf_grid) + 0.5), (min(x2_pdf_grid) - 1.5)],
              text=[textr2, textq2], angle=0, text_font_size="8pt")

    # Font-sizes
    fig1.xaxis.axis_label_text_font_size = "13pt"
    fig1.yaxis.axis_label_text_font_size = "13pt"
    fig2.xaxis.axis_label_text_font_size = "12pt"
    fig2.yaxis.axis_label_text_font_size = "12pt"
    fig2.legend.location = "top_left"

    # Remove grid lines
    if grid_line == False:
        for one_fig in (fig1, fig2):
            one_fig.xgrid.visible = False
            one_fig.ygrid.visible = False

    fig = gridplot([[fig1, fig2]])
    return fig
import numpy as np

from bokeh.models import Slope
from bokeh.plotting import figure, output_file, show

# Write the rendered document to a standalone HTML file.
output_file("slope.html", title="slope.py example")

# Coefficients of the reference line  y = gradient * x + y_intercept.
gradient = 2
y_intercept = 10

# Sample the line at x = 0..19 and perturb every sample with Gaussian noise
# (mean 0, standard deviation 4).
xpts = np.arange(20)
noise = np.random.normal(loc=0, scale=4, size=20)
ypts = gradient * xpts + y_intercept + noise

# The y-range starts at zero and leaves 10% headroom above the highest sample.
p = figure(
    plot_width=450,
    plot_height=450,
    y_range=(0, 1.1 * max(ypts)),
)
p.xaxis.axis_label = 'x'
p.yaxis.axis_label = 'y'

# Noisy samples as small circles.
p.circle(xpts, ypts, size=5, color="skyblue")

# The noise-free line, drawn as a dashed annotation on top of the samples.
slope = Slope(
    gradient=gradient,
    y_intercept=y_intercept,
    line_color='orange',
    line_dash='dashed',
    line_width=3.5,
)
p.add_layout(slope)

show(p)
y_axis_label='Slugging Percentage', y_axis_type='linear', y_range=(0.28, 0.75), title='Hall of Fame OPS Components', tools='hover', tooltips=tooltip, toolbar_location=None) obpslg_fig.circle(x='OBP', y='SLG', radius=0.0025, alpha=0.5, color='blue', source=hofbat_cds) slope7 = Slope(gradient=-1, y_intercept=0.7, line_color='orange', line_width=1) slope8 = Slope(gradient=-1, y_intercept=0.8, line_color='red', line_width=1) slope9 = Slope(gradient=-1, y_intercept=0.9, line_color='white', line_width=1) slope10 = Slope(gradient=-1, y_intercept=1.0, line_color='green', line_width=1) obpslg_fig.add_layout(slope7) obpslg_fig.add_layout(slope8) obpslg_fig.add_layout(slope9) obpslg_fig.add_layout(slope10) erahrr_fig = figure(x_axis_label='Home Run Rate', x_axis_type='linear', x_range=(0, 0.09), y_axis_label='Era', y_range=labels, title='Home Run Rates by Era',
def bokeh_scatter_plot(self, tool1, tool2, **kwargs):
    """Return (and optionally show) an interactive bokeh scatter plot of 2 tools.

    Needs the bokeh library; colorcet is needed only when the default
    palette is used. The `bokeh.plotting` figure holding the plot is always
    returned so that it can be tuned further.

    `tool1` is plotted on the `x` axis and `tool2` on the `y` axis.

    Possible kwargs
    ===============
    `show` : Bool
        if `True` (default), set up notebook output and show the plot
    `col` : String
        name of the ltlcross metric to plot, `states` by default
    `merge_same` : Bool
        if `True` (default), merge same instances and add a colorbar
        for the count, see `add_count` of `self.get_plot_data`
    `include_equal` : Bool
        if `False` (default), do not include formulas with the same
        values for both tools

    And there are 4 arguments that control the appearance of the plot
    `palette` : color palette used if `merge_same` is `True`
        default : `bgy` from `colorcet`
    `marker_color` : color used if `merge_same` is `False`
        default : "navy"
    `alpha` : alpha of marks
        default `1` if `merge_same` and `.3` otherwise
    `marker_size` : int
        default `10`

    `palette` and `marker_color` are always consumed here, even when the
    chosen mode ignores them. All remaining kwargs are passed on to the
    figure's `scatter` call.
    """
    from bokeh.models import (ColumnDataSource, CustomJS, ColorBar,
                              TapTool, HoverTool, Slope)
    from bokeh.transform import linear_cmap
    import bokeh.plotting as bplt

    # Get the arguments
    merge_same = kwargs.pop("merge_same", True)
    alpha = kwargs.pop("alpha", 1 if merge_same else .3)
    marker_size = kwargs.pop("marker_size", 10)
    show = kwargs.pop("show", True)
    include_equal = kwargs.pop("include_equal", False)
    col = kwargs.pop("col", "states")

    # Pop both appearance options unconditionally: whichever one the current
    # mode does not use must not leak into `p.scatter(**kwargs)`, where it
    # would be rejected as an unknown glyph property.
    palette = kwargs.pop("palette", None)
    marker_color = kwargs.pop("marker_color", "navy")

    # colorcet is only needed for the default palette
    if merge_same and palette is None:
        import colorcet as cc
        palette = cc.bgy

    # Make the graph render in notebooks
    if show:
        bplt.output_notebook()

    # Create the basic plot object
    p = bplt.figure(title=f"Numbers of {col}")
    p.xaxis.axis_label = f"{tool1}"
    p.yaxis.axis_label = f"{tool2}"

    # Prepare the data
    data = self.get_plot_data(tool1, tool2,
                              add_count=merge_same,
                              include_equal=include_equal,
                              col=col)
    if not merge_same:
        # We want to have the form_id and formula fields available for tooltip
        data = data.reset_index()
    source = ColumnDataSource(data)

    # Tooltips (the braces allow tool names that contain spaces)
    tooltips = [
        (tool1, f"@{{{tool1}}}"),
        (tool2, f"@{{{tool2}}}"),
    ]

    if merge_same:
        # Map count of cases to color
        mapper = linear_cmap(palette=palette, field_name="count",
                             low=1, high=data["count"].max())
        color = mapper
        # Add count to tooltip
        tooltips.append(("count", "@count"))

        # Print command to display selected formulas
        callback = CustomJS(args=dict(source=source), code=f"""
            // Select the data
            var inds = source.selected.indices;
            var data = source.data;
            var x = data['{tool1}'][inds];
            var y = data['{tool2}'][inds];

            // Create the two commands
            var fst_row = "data = a.get_plot_data('{tool1}','{tool2}',add_count=False)";
            var snd_row = "data[(data['{tool1}'] == " + x + ") & (data['{tool2}'] == " + y + ")]";

            // Instructions
            var instructions = "Use the following code to list the formulas.\\n";
            instructions += "Replace `a` with the ResAnalyzer` object:\\n\\n"
            alert(instructions + fst_row + "\\n" + snd_row);
        """)
    else:
        color = marker_color
        tooltips.append(("formula id", "@form_id"))

        # Print formula on selection (currently only works for 1)
        callback = CustomJS(args=dict(source=source), code=f"""
            // Select the data
            var inds = source.selected.indices;
            var data = source.data;

            // Print formulas ids
            var output = data['form_id'][inds[0]];
            for (var i = 1; i < inds.length; i++) {{
                var f = data['form_id'][inds[i]];
                output += ', ' + f;
            }}
            output += '\\n'

            // Print formulas (1 per line)
            for (var i = 0; i < inds.length; i++) {{
                var f = data['formula'][inds[i]];
                output += f + '\\n';
            }}
            alert(output);
        """)

    # Plot data and add `y=x`
    slope = Slope(gradient=1, y_intercept=0,
                  line_color="orange", line_width=2, line_dash="dashed")
    p.add_layout(slope)
    p.scatter(x=tool1, y=tool2, source=source,
              color=color, alpha=alpha, size=marker_size, **kwargs)

    # Add the hover & selecting tool
    p.add_tools(TapTool(callback=callback))
    p.add_tools(HoverTool(tooltips=tooltips, mode="mouse"))

    if merge_same:
        # The colorbar reuses the transform created by `linear_cmap`
        color_bar = ColorBar(color_mapper=mapper['transform'],
                             width=16, location=(0, 0))
        p.add_layout(color_bar, 'right')

    if show:
        bplt.show(p)
    return p
from sklearn import linear_model

# Fit a one-feature linear regression: price as a function of horsepower.
model = linear_model.LinearRegression()
# scikit-learn expects a 2-D feature matrix, hence the reshape to one column.
training_x = np.array(train['horsepower']).reshape(-1, 1)
training_y = np.array(train['price'])
model.fit(training_x, training_y)
# `np.asscalar` was removed from NumPy; `.item()` is the documented
# replacement and yields the same Python scalar.
slope = np.squeeze(model.coef_).item()
intercept = model.intercept_
print('slope:', slope, 'intercept:', intercept)


# In[25]:


# Now let's add the line to our graph
from bokeh.models import Slope

best_line = Slope(gradient=slope, y_intercept=intercept,
                  line_color='red', line_width=3)
p.add_layout(best_line)
show(p)


# In[26]:


from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


def predict_metrics(lr, x, y):
    """Score the predictions of a fitted model against known targets.

    Calls ``lr.predict(x)`` and compares the result with ``y``.

    Returns a tuple ``(mae, mse, r2)``: the mean absolute error, the mean
    squared error and the r-squared score, in that order.
    """
    pred = lr.predict(x)
    mae = mean_absolute_error(y, pred)
    mse = mean_squared_error(y, pred)
    r2 = r2_score(y, pred)
    return mae, mse, r2
ump_fig = figure(x_axis_label='Home Avg. Runs', x_axis_type='linear', x_range=(4, 5.25), y_axis_label='Visitor Avg. Runs', y_axis_type='linear', y_range=(3.75, 5.25), title='Home Plate Umpire Average Runs per game', tools='hover', tooltips=tooltip, toolbar_location=None) ump_fig.circle('AVG_HOME', 'AVG_VIS', size=20, color='#006BB6', source=ump_cds) slope = Slope(gradient=1, y_intercept=0, line_color='#CE1141', line_dash='dashed', line_width=3) ump_fig.add_layout(slope) chuck = Label(x=5, y=5.04, text='Chuck Meriwether') ump_fig.add_layout(chuck) doug = Label(x=4.08, y=3.76, text='Doug Harvey') ump_fig.add_layout(doug) alfonso = Label(x=4.8, y=4.09, text='Alfonso Marquez') ump_fig.add_layout(alfonso) attend_cds = ColumnDataSource(day_attend) attend_fig = figure(x_axis_label='Day of the Week', x_range=days, y_axis_label='Avg. Attendance', y_axis_type='linear',