Exemplo n.º 1
0
def distinct_events(tbl, time_col, user_col, type):
    """Reduce ``tbl`` to one event per user.

    For each group of ``user_col``, keep only the earliest (``type ==
    "first"``) or latest (``type == "last"``) row by ``time_col``.  Any
    other ``type`` value returns ``tbl`` unchanged.
    """
    # NOTE(review): `type` shadows the builtin, but renaming it would
    # change the public signature for keyword callers — left as-is.
    if type not in ("first", "last"):
        return tbl

    # Ascending time puts the earliest row first; negating the column
    # reverses the order so the latest row comes first instead.
    sort_key = _[time_col] if type == "first" else -_[time_col]

    return (
        tbl
        >> group_by(_[user_col])
        >> arrange(sort_key)
        >> filter(row_number(_) == 1)
        >> ungroup()
    )
def plot_large_error_percentage(data, threshold):
    """Plot, per categorical column, the share of rows with large residuals.

    A row counts as a "large error" when its residual is farther than
    ``threshold`` from the mean residual:
        abs(resid_mean - data.final_residuals) > threshold
    (The threshold itself is supplied by the caller; the original docstring's
    "2 std deviations" claim did not match this code.)

    The column percentages are melted to long form, aggregated with siuba
    verbs, filtered to the large-error rows, and drawn on a seaborn
    FacetGrid (one facet per source column) via the module-level helper
    ``_large_error_axes_plot``.

    NOTE(review): this mutates the caller's ``data`` in place by adding a
    boolean 'high_error' column — confirm callers are OK with that.
    Requires ``sns`` (seaborn), ``plt`` (matplotlib.pyplot), and the siuba
    verbs/``_`` to be in scope at module level.
    """
    resid_mean = data.final_residuals.mean()

    # Calculate distance from average and apply a threshold to indicate large errors
    data['high_error'] = (resid_mean - data.final_residuals).abs() > threshold

    # Percentage of large errors in the overall data set
    # (mean of a boolean column == fraction of True values)
    overall_large_error_percent = data.high_error.mean()

    # long form plotting data
    # Use siuba library to help with R/dplyr style aggregations:
    # count rows per (column, value, high_error), then convert counts to
    # within-(column, value) percentages.
    data_melted = (data.loc[:, [
        'jobType', 'industry', 'degree', 'major', 'milesFromMetropolis',
        'yearsExperience', 'high_error'
    ]].melt(id_vars='high_error', var_name='column') >> group_by(
        'column', 'value', 'high_error') >> summarize(total=_.value_counts())
                   >> group_by('column', 'value') >>
                   mutate(percent=_.total / _.total.sum()) >> ungroup())

    # Filter for only large error percentages and apply a grouped sort to prepare for plotting
    data_melted = (
        data_melted.loc[data_melted.high_error, :].groupby('column').apply(
            lambda x: x.sort_values('percent', ascending=False)).reset_index(
                drop=True))

    plot_title = f"Percentage of large errors per category\nLarge error threshold = {round(threshold, 1)}"
    # One facet per original column; sharey=False because each column has
    # its own set of category values on the y axis.
    plotGrid = sns.FacetGrid(data_melted,
                             col='column',
                             sharey=False,
                             col_wrap=3,
                             height=4,
                             aspect=1.2)
    # error_threshold lets the axes-level helper mark the overall rate.
    plotGrid.map(_large_error_axes_plot,
                 'percent',
                 'value',
                 error_threshold=overall_large_error_percent)
    plotGrid.set_xlabels('Percentage of large errors')
    plotGrid.set_ylabels('')
    plotGrid.add_legend(loc='upper center',
                        bbox_to_anchor=(0.5, 0),
                        fancybox=True,
                        frameon=True)
    plt.suptitle(plot_title, y=1.05)
    plt.show()
# Streamlit/jupytext script cells ("# +" / "# -" are jupytext cell markers).
# Relies on names defined earlier in the file: st (presumably streamlit),
# alt (presumably altair), top8, top8_games — TODO confirm.
st.write("Goals by month")

st.write("Top 8 players not in our data")
# NOTE(review): in a plain streamlit script a bare expression is not
# displayed, so this filtered result appears to be discarded — verify intent.
top8 >> filter(_.yr_start < 1979)

# +
from pandas.tseries.offsets import MonthBegin
from siuba.experimental.pd_groups import fast_summarize

# Per player and month: total goals plus the player's age that month,
# then a per-player cumulative goal count across months.
top8_goals = (
    top8_games >> mutate(
        date=_.date.astype("datetime64[D]"),
        # age is stored like "YY-DDD"; take the year part before the dash
        age_years=top8_games.age.str.split('-').str.get(0).astype(int)) >>
    arrange(_.date) >> group_by(_.player, month=_.date - MonthBegin(1)) >>
    fast_summarize(ttl_goals=_.goals.sum(), age_years=_.age_years.min()) >>
    group_by(_.player) >> mutate(cuml_goals=_.ttl_goals.cumsum()) >> ungroup())

# Base chart; the x encoding is added below from the user's selection.
p_goals = alt.Chart(top8_goals).mark_line().encode(y="cuml_goals:Q",
                                                   color="player")

# +
# NOTE(review): `time` shadows the stdlib module name in this script's scope.
time = st.selectbox("Choose a time", ["month", "age_years"])

st.write(p_goals.encode(x=time))

# +
st.write("Goals by seasons")


@st.cache
def data_season_goals():
Exemplo n.º 4
0
def test_group_by_ungroup(df):
    """group_by records the grouping keys; ungroup clears them again."""
    grouped = df >> group_by(_.g)
    assert grouped.group_by == ("g",)

    ungrouped = grouped >> ungroup()
    assert ungrouped.group_by == tuple()
Exemplo n.º 5
0
)

# -

# Script cells: build cumulative games-played per year for the top-8 players.
# Relies on earlier definitions: top8, game_goals, alt (presumably altair).
# NOTE(review): bare expressions are not displayed by plain streamlit scripts
# — confirm these lines are meant for a notebook-style renderer.
top8 >> filter(_.yr_start < 1979)

# +
# get them from game_goals
top8_games = game_goals >> inner_join(_, top8, "player")

# Games per player per calendar year, then a running total per player.
games_per_year = (
    top8_games
    #    >> filter(_.player == "Alex Ovechkin")
    >> count(_.player, year=_.date.astype("datetime64[D]").dt.year.astype(str))
    >> group_by(_.player) >> mutate(cuml_games=_.n.cumsum(), ) >> ungroup())

# 'year:T' tells altair to treat the stringified year as temporal.
alt.Chart(games_per_year).mark_line().encode(x='year:T',
                                             y='cuml_games',
                                             color="player")
# -

# +
from pandas.tseries.offsets import MonthBegin
from siuba.experimental.pd_groups import fast_summarize

top8_goals = (
    top8_games >> mutate(
        date=_.date.astype("datetime64[D]"),
        age_years=top8_games.age.str.split('-').str.get(0).astype(int)) >>
    arrange(_.date) >> group_by(_.player, month=_.date - MonthBegin(1)) >>
Exemplo n.º 6
0
def after_join(
        lhs, rhs,
        by_time, by_user,
        mode = "inner",
        type = "first-firstafter",
        max_gap = None,
        min_gap = None,
        gap_col = None,
        suffix = ("_x", "_y")
        ):
    """Join two event tables on user, pairing lhs events with rhs events
    that happen at the same time or later (an "after join").

    Parameters
    ----------
    lhs, rhs:
        Event tables (presumably siuba-compatible data frames — TODO confirm).
    by_time:
        Time column name, or an (lhs_name, rhs_name) pair; unpacked by the
        module-level helper ``_get_key_tuple``.
    by_user:
        User/key column name, or an (lhs_name, rhs_name) pair.
    mode:
        "inner", "left", "right", "full", "outer", "semi", or "anti".
    type:
        Dash-separated pair "<lhs_type>-<rhs_type>" (e.g. "first-firstafter")
        selecting which events per user are kept on each side via
        ``distinct_events``.  Raises ValueError on unpacking if no dash.
    max_gap, min_gap, gap_col:
        Not implemented; passing any raises NotImplementedError.
    suffix:
        Intended join suffixes; currently unused (see TODO below).

    Raises
    ------
    NotImplementedError
        For max_gap/min_gap/gap_col, or lhs type "firstwithin"/"lastbefore".
    ValueError
        If ``mode`` is not recognized.
    """
    if max_gap is not None or min_gap is not None or gap_col is not None:
        raise NotImplementedError("max_gap, min_gap, gap_col not implemented")

    # Get type of join for both tables, from e.g. "first-firstafter"
    type_lhs, type_rhs = type.split("-")

    # Convert join keys to dictionary form
    by_time_x, by_time_y = _get_key_tuple(by_time)
    by_user_x, by_user_y = _get_key_tuple(by_user)

    # mutate in row_number ----
    # __idx / __idy are stable row ids assigned after sorting by (user, time),
    # used later to recover full rows from the matched pairs.
    lhs_i = (lhs
            >> arrange(_[by_user_x], _[by_time_x])
            >> mutate(__idx = row_number(_))
            >> distinct_events(by_time_x, by_user_x, type_lhs)
            )

    rhs_i = (rhs
            >> arrange(_[by_user_y], _[by_time_y])
            >> mutate(__idy = row_number(_))
            >> distinct_events(by_time_y, by_user_y, type_rhs)
            )

    # Handle when time column is in the other table
    # (same column name on both sides gets suffixed by the inner join below)
    if by_time_x == by_time_y:
        # TODO: don't use implicit join suffix below
        pair_time_x, pair_time_y = by_time_x + "_x", by_time_y + "_y"
    else:
        pair_time_x, pair_time_y = by_time_x, by_time_y

    # Inner join by user, filter by time: keep only pairs where the rhs
    # event is at or after the lhs event.
    pairs = filter(
            inner_join(lhs_i, rhs_i, by_user),
            _[pair_time_x] <= _[pair_time_y]
            )

    # TODO: firstwithin
    if type_lhs in ["firstwithin", "lastbefore"]:
        raise NotImplementedError("Can't currently handle lhs type %s" % type_lhs)

    # Handle firstafter by subsetting: for each lhs row (__idx), keep only
    # the earliest qualifying rhs event.
    if type_rhs == "firstafter":
        pairs = (pairs
                >> arrange(_[pair_time_y])
                >> group_by(_.__idx)
                >> filter(row_number(_) == 1)
                >> ungroup()
                )


    # Only the id pair is needed to reattach full rows from lhs_i / rhs_i.
    distinct_pairs = select(pairs, _.__idx, _.__idy)


    if mode in ["inner", "left", "right", "full", "outer"]:
        # Join keys for reattaching rhs: original user key plus the rhs row id.
        by_dict = dict([(by_user_x, by_user_y), ("__idy", "__idy")])
        res = (lhs_i
                >> join(_, distinct_pairs, on = "__idx", how = mode) 
                # TODO: suffix arg
                >> join(_, rhs_i , on = by_dict, how = mode)#, suffix = suffix)
                >> select(-_["__idx", "__idy"])
                )
    elif mode in ["semi", "anti"]:
        # semi/anti only filter lhs rows by whether a match exists.
        join_func = semi_join if mode == "semi" else anti_join
        res = (lhs_i
                >> join_func(_, distinct_pairs, "__idx")
                >> select(-_["__idx", "__idy"])
                )

    else:
        raise ValueError("mode not recognized: %s" %mode)

    return res