示例#1
0
def _plot_manual_exercise(df):
    """Plot manual exercise entries: one small figure for errors, then one per exercise kind.

    Rows with a non-null ``error`` are split off and plotted first, in red; each
    of them counts as a single rep and, when it has no ``dt`` of its own, is
    placed at the last non-null ``dt`` found in ``df``. The remaining rows are
    grouped by ``kind``, the most recently performed kind first. Every figure
    reuses the x range of the first one.

    Reads the ``error``, ``dt``, ``kind``, ``reps`` and ``volume`` columns and
    returns ``column(plots)``.
    """
    from bokeh.models import LinearAxis, Range1d

    pallete = palletes[max(palletes)] # get last

    # todo one axis for count, one for seconds? maybe on separate plots?
    has_err = df['error'].notna()
    errs = df[has_err].copy()
    errs['reps'] = 1 # meh
    some_dt = df['dt'].dropna().iloc[-1]
    # assign the result back: an ``inplace`` fillna on a column selection may
    # operate on a temporary (pandas copy-on-write) and leave ``errs`` untouched
    errs['dt'] = errs['dt'].fillna(some_dt)
    # not sure? some errs have reps.. errs['reps'].fillna(-5) # meh
    df = df[~has_err]
    # TODO would be nice to reuse stuff to display errors as a table

    # FIXME handle none carefully here, otherwise they aren't displayed
    plots = []
    # todo hmm, reuse group hints somehow? not sure..

    # TODO helper groupby to check for None (to make sure they are handled)
    # sort by the most recent
    groups = sorted(
        df.groupby('kind'),
        key=lambda kind_d: max(unlocalize(kind_d[1]['dt'])),
        reverse=True,
    )

    kinds = [kind for kind, _ in groups]
    # make colors stable
    colors = dict(zip(kinds, cycle(pallete)))
    colors['errors'] = 'red'

    x_range = None
    for k, edf in chain([('errors', errs)], groups):
        color = colors[k]
        # TODO add some x jitter to declutter?
        # need to preserve the order though? I guess need to group
        p = date_figure(df, height=150, x_range=x_range)
        p.scatter(x='dt', y='reps', source=CDS(edf), legend_label='reps', color=color)

        volume = edf['volume'] * 1.1 # TODO meh
        # np.nanmax raises on empty input (e.g. no error rows at all) and warns
        # on all-NaN input, so only ask for a maximum when there is one
        if volume.notna().any():
            maxy = np.nanmax(volume)
            p.extra_y_ranges = {'volume': Range1d(start=0.0, end=maxy)}
            # add the second axis to the plot.
            p.add_layout(LinearAxis(y_range_name='volume'), 'right')
            p.scatter(x='dt', y='volume', source=CDS(edf), legend_label='volume', color='black', size=2, y_range_name='volume')

        p.title.text = k

        p.y_range.start = 0

        if x_range is None:
            # later figures share the x range of the first one
            x_range = p.x_range
        plots.append(p)
        # TODO not sure if I want sliding averages?
    return column(plots)
示例#2
0
def plot_sleep_intervals(df):
    """Draw every sleep interval as a translucent black bar from sleep start to sleep end.

    ``df`` is first passed through ``_sleep_df``; the ``sleep_start`` and
    ``sleep_end`` values are then reduced to their time of day via ``_mins``
    (null values become ``None``). The y axis is set up by ``set_hhmm_axis``
    with bounds derived from 22:00 and 11:00, and ``add_daysoff`` shades the
    days. Returns the figure.
    """
    def to_minutes(dt):
        # keep missing timestamps missing instead of converting them
        if pd.isnull(dt):
            return None
        return _mins(dt.time())

    sleep = _sleep_df(df)
    intervals = sleep[['sleep_start', 'sleep_end']].applymap(to_minutes)
    # todo maybe instead plot angled lines? then won't need messing with minutes at all? Although still useful to keep

    fig = date_figure()
    axis_min = _mins(time(22, 0))
    axis_max = _mins(time(11, 0))

    # TODO need to handle nans/errors?
    # NOTE(review): x='date' presumably resolves to the index of the frame
    # returned by _sleep_df -- confirm
    bar_style = dict(color='black', alpha=0.1)
    fig.vbar(
        source=CDS(intervals),
        x='date',
        width=timedelta(1),
        bottom='sleep_start',
        top='sleep_end',
        **bar_style,
    )

    from .core.bokeh import set_hhmm_axis
    # TODO also guess mint/maxt?
    set_hhmm_axis(fig.yaxis, mint=axis_min, maxt=axis_max)

    add_daysoff(fig)
    return fig
示例#3
0
def add_text(plot, *, text: str, **kwargs):
    """Put ``text`` on ``plot`` as a ``Text`` glyph.

    A glyph is used instead of a ``Label`` because ``Label`` doesn't support
    multiline text (https://github.com/bokeh/bokeh/issues/7317).
    ``kwargs`` are passed to the ``Text`` glyph (position, colour, ...).
    """
    from bokeh.models import Text
    # glyphs always want a data source, so wrap the string into a one-row source
    # and point the glyph at its 'text' column
    source = CDS({'text': [text]})
    plot.add_glyph(source, Text(text='text', **kwargs))
示例#4
0
def add_daysoff(plot, *, dates=None, bottom=None, top=None):
    """Shade every day on ``plot`` with a translucent bar: blue for workdays, red otherwise.

    Args:
        plot: figure to draw on.
        dates: optional collection of dates; only its min and max are used, as
            the span to shade. When omitted, the span comes from
            ``guess_range(plot, axis='x')``.
        bottom: lower edge of the bars; taken from ``guess_range(plot, axis='y')``
            when ``None``.
        top: upper edge of the bars; taken from ``guess_range(plot, axis='y')``
            when ``None``.

    Returns:
        Whatever ``plot.vbar`` returns, or ``None`` when ``my.calendar.holidays``
        can't be imported (the failure is logged, not raised).
    """
    if bottom is None or top is None:
        b, t = guess_range(plot, axis='y')
        # todo how to extend the range a bit so there is some leeway?
        # explicit None checks: a bound of 0 is falsy but perfectly valid and
        # must not be replaced by the guessed one
        if bottom is None:
            bottom = b
        if top is None:
            top = t

    if dates is None:
        mind, maxd = guess_range(plot, axis='x')
    else:
        mind = min(dates)
        maxd = max(dates)
    day = timedelta(1)

    # todo need to keep boundary (-0.5, 0.5)? .. or it's automatic??
    days = pd.date_range(start=mind, end=maxd, freq='D')
    # the index name is what the bars refer to as x='date'
    days.name = 'date'

    # FIXME make non-defensive, this is only temporary for tests
    try:
        import my.calendar.holidays as holidays
    except Exception as e:
        import logging
        logging.exception(e)
        return

    # todo separate column, abstract away
    col_df = pd.DataFrame(
        {'color': ['blue' if holidays.is_workday(dt) else 'red' for dt in days]},
        index=days,
    )

    # right... nice thing about BoxAnnotation is that it's infinite to the top and bottom...
    # but I have no clue how to make them togglable...
    # https://stackoverflow.com/a/56258632/706389
    # todo box annotation vs vbar??
    return plot.vbar(
        source=CDS(col_df),
        x='date',
        color='color',
        width=day,
        bottom=bottom,
        top=top,
        alpha=0.05,
        legend_label='Days off',
    )
示例#5
0
def plot(day: str, df):
    """Show the points of ``df`` on a tiled map as blue circles and return the figure.

    Each (``lat``, ``lon``) pair of ``df`` goes through ``merc``; the two values
    it returns are stored, in order, as ``mlon`` and ``mlat``. The visible area
    is the bounding box of those points widened by 100 on every side.
    ``day`` is not used by the function body.
    """
    coords = df[['lat', 'lon']]
    projected = pd.DataFrame(
        (merc(lat, lon) for _, (lat, lon) in coords.iterrows()),
        columns=['mlon', 'mlat'],
        index=df.index,
    )
    # todo err... swap lat and lon?

    def padded(values, margin=100):
        # bounding interval of the values, widened on both sides
        # (margin is presumably in web mercator units -- confirm)
        return min(values) - margin, max(values) + margin

    # todo set some reasonable minimum span? otherwise if there is no movement, almost nothing is displayed
    lon_span = padded(projected['mlon'])
    lat_span = padded(projected['mlat'])

    # range bounds are supplied in web mercator coordinates
    # todo hmm, if I fix width, sometimes rendering fails? 20171118
    fig = figure(
        title='map',
        x_axis_type='mercator',
        y_axis_type='mercator',
        x_range=lon_span,
        y_range=lat_span,
    )
    fig.add_tile(tile_provider)

    marker = dict(size=10, fill_color='blue', fill_alpha=0.8)
    fig.circle(x='mlon', y='mlat', source=CDS(projected), **marker)

    # I guess mimicking the same interface as google maps is ok? e.g. sidebar with points and time on the left
    # and highlight them (maybe with different colors depending on whether they are in the future or in the past)
    return fig
示例#6
0
 def plot(p, df):
     """Draw the rows of ``df`` as dark green rectangles on ``p`` and decorate the figure.

     ``df`` supplies the ``left``/``right``/``bottom``/``top`` edges and the
     ``alpha`` of every rectangle. Afterwards the y axis is configured through
     ``set_hhmm_axis`` and ``add_daysoff`` is applied to the figure.
     """
     # upper bound of the y axis; presumably minutes, i.e. 26 hours -- confirm
     # against set_hhmm_axis
     y_upper = 26 * 60

     # todo hmm, they overlap? maybe need to stack
     p.quad(
         source=CDS(df),
         left='left',
         right='right',
         bottom='bottom',
         top='top',
         alpha='alpha',
         color='darkgreen',
     )
     set_hhmm_axis(p.yaxis, mint=0, maxt=y_upper, period=60)
     add_daysoff(p)
示例#7
0
    def __init__(self, prefs=None):
        """Set up the data sources, the preferences and the figure.

        Args:
            prefs: optional mapping of preference overrides, applied on top of
                a deep copy of ``defaults``. ``None`` (the default) means no
                overrides; a ``None`` sentinel is used rather than a shared
                mutable ``{}`` default.
        """
        # dictionary to hold the dataframes to be rendered
        self._gene_data = dict(
            transcripts=CDS(self._placeholder()),
            #coding_exons=CDS(exon_data_frame()),
            #noncoding_exons=CDS(exon_data_frame()),
            exons=CDS(exon_data_frame()),
            introns=CDS(intron_data_frame()),
            labels=CDS(transcript_label_data_frame()))

        # preferences: start from a private copy so ``defaults`` is never modified
        self._prefs = deepcopy(defaults)
        if prefs is not None:
            self._prefs.update(prefs)

        # the transcript data from the database
        self._transcripts = None

        # flag to indicate if the gene data to display has changed
        self._dirty_flag = False

        # create the plot
        self._figure = self._create_plot()
示例#8
0
def plot_multiple(df, *, columns, **kwargs):
    """Plot groups of columns of ``df`` against ``dt``, one figure per group.

    Groups come from ``read_group_hints(df)`` and value ranges from
    ``read_range_hints(df)``. Every column of a group is drawn as scatter plus
    line in its own palette colour, using only the rows where that column is
    not null. Every range hint of a column is drawn as a translucent horizontal
    band spanning that column's ``dt`` extent. All figures reuse the x range of
    the first one.

    Args:
        df: frame with a ``dt`` column and the columns named by the group hints.
        columns: not used by the function body (groups come from the hints).
        **kwargs: forwarded to ``date_figure`` for every figure.

    Returns:
        ``gridplot`` with one figure per row.
    """
    # todo autodiscover columns somehow? basically all except dates?

    # todo use multiindex for groups? not sure if possible
    # https://stackoverflow.com/questions/30791839/is-there-an-easy-way-to-group-columns-in-a-pandas-dataframe

    # todo make configurable
    from bokeh.palettes import Dark2_5 as palette  # type: ignore

    from .pandas import read_group_hints, read_range_hints
    groups = read_group_hints(df)

    range_hints = read_range_hints(df)

    # todo think of a better name?..
    x_range = None

    # todo height??
    # todo for grouping, could simply take soft hints?
    # and if something is unknown, just warn and display on the plot
    plots = []
    for grp in groups:
        # todo add source to annotation?
        # todo rely on kwargs for date x axis?
        p = date_figure(x_range=x_range, **kwargs)

        # todo color rainbow??
        for f, color in zip(grp, cycle(palette)):
            # todo make this behaviour adjustable?
            fdf = df[df[f].notna()]

            # TODO rely on dt index? it can be non-unique so it should be fine...
            p.scatter(x='dt',
                      y=f,
                      source=CDS(data=fdf),
                      color=color,
                      legend_label=f)
            p.line(x='dt',
                   y=f,
                   source=CDS(data=fdf),
                   color=color,
                   legend_label=f)

            # TODO axis labels

            # TODO from bokeh.sampledata.glucose import data -- could use for demo/testing
            rhs = range_hints.get(f, [])
            if not rhs:
                continue
            if len(fdf) == 0:
                # min()/max() of an empty column raise ValueError, and without
                # any points there is no time extent to draw the bands over
                logging.warning("No data points for %s, skipping its range hints", f)
                continue

            # hmm, without left=, it plots at timestamp 0 =/
            # the extent is the same for every hint of this column, compute it once
            left = min(fdf['dt'])
            right = max(fdf['dt'])

            # TODO if no color, just vary color + ???
            for rh in rhs:
                # right. BoxAnnotation works, but wasn't sure how to make it toggable
                extras: Dict[str, Optional[str]] = dict(color=None)
                col = rh.color
                if col is None:
                    # fall back to the colour of the series itself
                    col = color
                    if len(rhs) > 1:
                        logging.warning(
                            "Multiple ranges for %s don't have colour, this will result in ranges overlapping: %s",
                            f, rhs)
                        # at least make the separators visible
                        extras = dict(color='black', line_dash='dotted')

                # todo hide by default?
                p.hbar(
                    left=left,
                    right=right,
                    y=(rh.low + rh.high) / 2,
                    height=rh.high - rh.low,  # eh, this is awkward..
                    fill_color=col,
                    fill_alpha=0.1,
                    legend_label=f'{f} ranges',
                    **extras,
                )

        p.title.text = str(grp)
        if x_range is None:
            # later figures share the x range of the first one
            x_range = p.x_range

        plots.append(p)

    return gridplot([[x] for x in plots])
示例#9
0
def rolling(*,
            x: str,
            y: str,
            df,
            avgs: Sequence[Avg] = ['7D', '30D'],
            legend_label=None,
            context: Optional[RollingResult] = None,
            **kwargs) -> RollingResult:
    """Plot column ``y`` of ``df`` together with its rolling averages, surfacing bad rows.

    Rows are sorted into cases by three masks (index not null, ``y`` not null,
    ``error`` not null):

    * no index value: listed in the errors table only;
    * index but no ``y``: ``error`` is set to ``'<y> is nan/null'`` unless the
      row already has one; the row is listed in the table and drawn as an
      'errors' marker at the 0.8 quantile of the valid ``y`` values;
    * index, ``y`` and ``error``: listed in the table and highlighted with a
      yellow 'warnings' circle; still drawn in the main scatter but excluded
      from the averages;
    * index and ``y`` without ``error``: plotted and averaged.

    Timezone warnings and the error count are appended to the layout as red
    ``Div`` elements, followed by the errors table when there is anything to list.

    Args:
        x: name used as the x field of the data sources; the code treats
            ``df.index`` as the x values (presumably ``x`` is the index name --
            confirm against callers).
        y: name of the column to plot.
        df: input frame. NOTE: its ``error`` column is written to in place.
        avgs: rolling windows passed to ``DataFrame.rolling``. ``None`` entries
            are skipped, and the raw scatter is only drawn when ``avgs``
            contains no ``None``.
        legend_label: legend label for the raw points; defaults to ``y``.
        context: result of an earlier call to draw onto; a fresh figure and
            layout are created when ``None``.
        **kwargs: forwarded to both ``plot.scatter`` and ``plot.line``.

    Returns:
        The ``RollingResult`` that was drawn onto (``context`` or the new one),
        with the created renderers appended to its ``plots``.
    """
    # TODO maybe use a special logging handler, so everything logged with warning level gets displayed?
    errors = []

    # todo ugh. the same HPI check would be nice..
    # distinct tzinfo values found in the index (None for naive / non-datetime values)
    tzs = set(df.index.map(lambda x: getattr(x, 'tzinfo', None)))  # meh
    if len(tzs) > 1:
        errors.append(
            f'WARNING: a mixture of timezones: {tzs}. You might want to unlocalize() them first.'
        )
    elif len(tzs) == 1:
        [_tz] = tzs
        if _tz is not None:
            # todo not really sure about that.. maybe it's okay, although UTC might be wrong as well
            errors.append(
                f'WARNING: detected timezone: {_tz}. You might want to unlocalize() first.'
            )

    # todo should copy df first??
    if legend_label is None:
        legend_label = y

    # meh... don't think I like it
    # TODO def test this
    if context is None:
        ls = []
        plot = date_figure(df=df)
        ls.append(plot)

        ctx = RollingResult(
            layout=column(ls, sizing_mode='stretch_width'),
            plots=[],
            figures=[plot],
        )
    else:
        ctx = context

    plot = ctx.figure
    plots = ctx.plots
    # appending to this list adds widgets to the layout
    layouts = ctx.layout.children

    # todo assert datetime index? test it too
    # todo although in theory it doens't have to be datetimes with the approprivate avgs??

    has_x = df.index.notna()
    has_y = df[y].notna()
    # without an 'error' column, compare the index against a sentinel string to
    # get a mask of the same length (intended to be all False)
    err = df['error'].notna(
    ) if 'error' in df.columns else df.index == 'how_to_make_empty_index?'
    # ^^^ todo a bit ugly... think about this better

    for_table = ~has_x  # case 1 is handled

    # case 2: set proper error for ones that don't have y
    # NOTE(review): this writes into the frame passed in by the caller
    df.loc[has_x & ~has_y & ~err, 'error'] = f'{y} is nan/null'
    # now case 2 and 3 are the same

    # case 3
    case_3 = has_x & ~has_y
    for_table |= case_3
    for_marks = case_3

    # case 4, 5
    ok = has_x & has_y
    case_4 = ok & err
    for_table |= case_4
    for_warn = case_4

    dfm = df.loc[for_marks]
    dfe = df.loc[for_table]
    dfw = df.loc[for_warn]
    # from here on df only holds rows that have both an index value and a y value
    df = df.loc[ok]
    if len(dfm) > 0:
        # todo meh.. how to make the position absolute??
        some_y = df[y].quantile(
            0.8)  # to display kinda on top, but not too high
        if np.isnan(some_y):
            # otherwise fails during JSON serialization
            some_y = 0.0
        plot.scatter(
            source=CDS(dfm),
            x=x,
            y=some_y,
            legend_label='errors',
            line_color='red',
            fill_color='yellow',  # ??
            marker='circle_cross',
            size=10,
        )

    if len(dfe) > 0:
        errors.append(f'Also encountered {len(dfe)} errors:')

    from bokeh.models.widgets.markups import Div
    # first a summary for the most important warnings/errors
    # todo append later stuff as well, there are some warnings during means
    # NOTE(review): 'strong' doesn't look like a valid CSS font-weight value
    # ('bold' was probably meant) -- confirm
    for e in errors:
        layouts.append(
            Div(
                text=html.escape(e),
                style={
                    'color': 'red',
                    'font-weight': 'strong'
                },
            ))

    if len(dfe) > 0:
        # todo could even keep the 'value' erorrs and display below too.. but for now it's ok
        # ok, if it respected newlines, would be perfect
        # for now this is 'fine'...

        # todo maybe should display all points, highlight error ones as red (and it sorts anyway so easy to overview?)
        # todo would be nice to highlight the corresponding points in table/plot
        # NOTE(review): DateFormatter is imported twice; NumberFormatter and
        # HTMLTemplateFormatter are only referenced from commented-out code
        from bokeh.models.widgets import DataTable, DateFormatter, TableColumn
        from bokeh.models.widgets.tables import DateFormatter, NumberFormatter, HTMLTemplateFormatter
        # didn't work at all??
        # from bokeh.models.widgets.tables import ScientificFormatter

        # todo DataCube?? even more elaborate
        dfe = dfe.reset_index(
        )  # todo ugh. otherwise doesn't display the index at all?
        dfe = dfe.sort_values(by=x)

        # todo maybe display 'error' as the first col?
        # NOTE(review): datefmt is not used anywhere below
        datefmt = DateFormatter(format="%Y-%m-%d")
        # todo speed_avg could have less digits (guess by the dispersion or something??)

        # TODO horrible, but js bits of bokeh compute complete nonsense for column widths
        # todo set monospace font??
        one_char = 10  # pixels

        def datatable_columns(df):
            """Yield one ``TableColumn`` per column of ``df``, with width and formatter chosen by dtype."""
            for c, t in df.dtypes.items():
                formatter = None

                # TODO also use col name.. then won't have to handle nans!
                width = 15  # in characters
                # for fixed width types, we can have something kind of reasonable
                if str(t).startswith('float'):
                    # longest string representation among the non-null values
                    l = df[c].dropna().map(str).str.len().max()
                    width = 4 if np.isnan(l) else l

                if str(t).startswith('datetime'):
                    formatter = DateFormatter(format='%Y%m%d %H%M%S %Z',
                                              nan_format='Nan')
                    width = 15
                elif str(t).startswith('timedelta'):
                    # TODO warn if df contains stuff with duration >1D?
                    # without nan_format, it results in NaN:Nan:Nan
                    formatter = DateFormatter(format='%H:%M:%S',
                                              nan_format='Nan')
                    width = 8

                # if c == 'error':
                #     # meh, but the only easy way to limit and ellipsize it I found
                #     # aaand it still computes width in some weird way, ends up taking too much space
                #     formatter = HTMLTemplateFormatter(template='<div style="text-overflow: ellipsis; overflow: hidden; width: 60ch;"><%= value %></div>')

                # only pass formatter= when one was chosen above
                tc = TableColumn(
                    field=c,
                    title=c,
                    **({} if formatter is None else dict(formatter=formatter)),
                    width=width * one_char,
                )
                yield tc

        # TODO hmm, if we reuse the data source, editing & selection might work?
        errors_table = DataTable(
            source=CDS(dfe),
            columns=list(datatable_columns(dfe)),
            # todo ugh. handle this properly, was too narrow on the sleep plots
            editable=True,
            width=2000,

            # default ends up in trimmed table content
            autosize_mode='none',

            # this might overstretch the parent...
            # autosize_mode='fit_viewport',

            # this just makes it respect the parent width
            # width_policy='fit',
        )
        layouts.append(errors_table)

        # todo
        # >>> plot.circle([1,2,3], [4,5,6], name="temp")
        # >>> plot.select(name="temp")
        # [GlyphRenderer(id='399d53f5-73e9-44d9-9527-544b761c7705', ...)]

    if len(dfw) > 0:
        # rows that have a value but also carry an error message
        plot.circle(source=CDS(dfw),
                    x=x,
                    y=y,
                    legend_label='warnings',
                    size=20,
                    color='yellow')

    # todo warn if unsorted?
    df = df.sort_index()

    if len(df) == 0:
        # add a fake point, so at least plotting doesn't fail...
        df = pd.DataFrame([{
            x: datetime(year=2000, month=1, day=1),
            y: 0.0,
        }]).set_index(x)
        # every requested window is replaced by '3D'
        avgs = ['3D' for _ in avgs]
        # FIXME need to add this to errors as well, or at least title..
        # TODO need to add a better placholder, timestamp 0 really messes things up
        # NOTE(review): df is the placeholder frame at this point, so the
        # message shows the fake point rather than the name of the input
        warnings.warn(f'No data points for {df}, empty plot!')

    # NOTE(review): a None entry in avgs suppresses the raw scatter entirely --
    # confirm that this is intended
    if None not in avgs:
        plots.append(
            plot.scatter(x=x,
                         y=y,
                         source=CDS(df),
                         legend_label=legend_label,
                         **kwargs))

    # only stuff without errors/warnings participates in the avg computation
    if 'error' in df.columns:  # meh
        df = df[df['error'].isna()]
    for period in [a for a in avgs if a is not None]:
        dfy = df[[y]]
        if str(dfy.index.dtype) == 'object':
            logging.error(
                f"{dfy.dtypes}: index type is 'object'. You're likely doing something wrong"
            )
        if 'datetime64' in str(dfy.index.dtype):
            # you're probably doing something wrong otherwise..
            # todo warn too?
            # check it's a valid period
            pd.to_timedelta(period)
        # TODO how to fill the missing values??
        # a sequence of consts would be a good test for it
        # todo why would index be na at this point? probably impossible?
        dfa = dfy[dfy.index.notna()].rolling(period).mean()
        # TODO assert x in df?? or rolling wrt to x??

        # somehow plot.line works if 'x' is index? but df[x] doesnt..

        # todo different style by default? thicker line? not sure..
        plots.append(
            plot.line(x=x,
                      y=y,
                      source=CDS(dfa),
                      legend_label=f'{legend_label} ({period} avg)',
                      **kwargs))

    plot.title.text = f'x: {x}, y: {y}'
    # TODO axis labels instead?
    return ctx
示例#10
0
def scatter_matrix(
    df,
    *,
    xs: Sequence[str] = None,
    ys: Sequence[str] = None,
    width=None,
    height=None,
    regression=True,
    **kwargs,
):
    """Build a grid of pairwise scatter plots of the columns of ``df``.

    The grid has one row per entry of ``ys`` and one column per entry of ``xs``.
    Pairs where either column is not numeric get a red 'Not numeric' text
    instead of points. With ``regression`` enabled, every off-diagonal scatter
    additionally gets an OLS fit of ``y ~ x`` (statsmodels) drawn as a green
    line, annotated with R2 and the slope.

    Args:
        df: non-empty frame (enforced by an assertion).
        xs: columns for the horizontal direction; all columns when ``None``.
        ys: columns for the vertical direction; all columns when ``None``.
            They are reversed, and both lists are then reordered so numeric
            columns come first.
        width: passed to ``gridplot`` as the per-plot width after integer
            division by ``min(len(xs), len(ys))``; ``None`` leaves it unset.
        height: same as ``width``, for the per-plot height.
        regression: whether to fit and draw the regressions.
        **kwargs: not used by the function body.

    Returns:
        The ``gridplot`` layout.
    """
    assert len(df) > 0, 'TODO handle this'

    # FIXME handle empty df
    source = CDS(df)
    # TODO what about non-numeric stuff?

    xs = df.columns if xs is None else xs
    ys = df.columns if ys is None else ys
    # reorder to move meaningful stuff to the top left corner
    ys = list(reversed(ys))

    def isnum(c):
        return is_numeric_dtype(df.dtypes[c])

    # reorder so non-numeric is in the back
    # todo mode to drop non-numeric? not sure.. definitely can drop 'error' and datetimish?
    xs = sorted(xs, key=isnum, reverse=True)
    ys = sorted(ys, key=isnum, reverse=True)

    # TODO not sure I wanna reuse axis?
    def make(xc: str, yc: str):
        """Create the plot for a single (x column, y column) cell of the grid."""
        p = figure(df=df)
        # TODO not sure if I even want them... move to the very end?
        if isnum(xc) and isnum(yc):
            p.scatter(x=xc, y=yc, source=source, size=3)
        else:
            # FIXME how to make sure text fits into the plot??
            add_text(
                p,
                x=0.0,
                y=0.0,
                text='Not numeric',
                text_color='red',
            )
        p.xaxis.axis_label = xc
        p.yaxis.axis_label = yc
        return p

    grid = [[make(xc=x, yc=y) for x in xs] for y in ys]
    from bokeh.layouts import gridplot
    # NOTE(review): the grid has len(xs) columns and len(ys) rows, yet both
    # sizes are divided by the smaller of the two -- confirm this is intended
    w1 = None if width is None else width // min(len(xs), len(ys))
    h1 = None if height is None else height // min(len(xs), len(ys))
    grid_res = gridplot(grid, plot_width=w1, plot_height=h1)

    # TODO might be useful to include/exclude specific cols (e.g. datetime) while keeping them in annotations
    # TODO add the presence of the grid to the 'visual tests'
    if not regression:
        return grid_res

    # todo this would be need for plotly as well?
    import statsmodels.formula.api as smf  # type: ignore
    from bokeh.models import Slope

    inf = float('inf')

    for plot in chain.from_iterable(grid):
        gs = plot.renderers
        if len(gs) == 0:
            # must be non-numeric? meh though
            continue
        [g] = gs
        # names of the columns this plot shows
        xx = g.glyph.x
        yy = g.glyph.y

        if xx == yy:
            # diagonal thing, e.g. histogram. compute some stats??
            # (plots that only hold the 'Not numeric' text presumably end up
            # here as well: their glyph has x == y == 0.0)
            continue

        # FIXME proper error handling, display number of dropped items?
        # treat infinities as missing and drop them together with NaNs, otherwise
        # the fit fails; done explicitly because the 'mode.use_inf_as_null' pandas
        # option this used to rely on is deprecated/removed
        dd = df[[xx, yy]].replace([inf, -inf], float('nan')).dropna()
        # todo would be nice to display stats on the number of points dropped

        udd = dd.drop_duplicates()
        if len(udd) <= 1:
            # can't perform a reasonable regression then
            add_text(
                plot,
                x=0.0,
                y=0.0,
                text='ERROR: no points to correlate',
                text_color='red',
            )
            continue

        # NOTE(review): the column names go into the formula verbatim, so names
        # that aren't valid identifiers will presumably fail to parse -- confirm
        res = smf.ols(f"{yy} ~ {xx}", data=dd).fit()
        intercept = res.params['Intercept']
        slope = res.params[xx]
        r2 = res.rsquared

        ## TODO crap. is it really the best way to figure out relative position??
        relx = 0.01
        rely = 0.1

        # todo highlight high enough R2?
        minx, maxx = min(dd[xx]), max(dd[xx])
        miny, maxy = min(dd[yy]), max(dd[yy])
        # todo font size dependent on width?? ugh.
        txt = f'R2 = {r2:.4f}\nY ~ {slope:.3f} X'

        # todo need to add various regression properties, like intercept, etc
        # TODO hopefuly this overlays correctly?? not sure about nans, again
        sl = Slope(gradient=slope,
                   y_intercept=intercept,
                   line_color='green',
                   line_width=3)
        plot.add_layout(sl)
        # place the annotation near the bottom left corner of the data
        add_text(
            plot,
            text=txt,
            x=minx + (maxx - minx) * relx,
            y=miny + (maxy - miny) * rely,
            text_color=g.glyph.line_color,
        )

    # TODO dynamic resizing would be nice
    return grid_res