def distinct_events(tbl, time_col, user_col, type):
    """Keep each user's first or last event, ordered by the time column."""
    if type not in ["first", "last"]:
        return tbl

    res = (tbl
        >> group_by(_[user_col])
        >> arrange(_[time_col] if type == "first" else -_[time_col])
        >> filter(row_number(_) == 1)
        >> ungroup()
        )

    return res
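# Usage sketch with hypothetical data. Assumes the siuba verbs used above are
# in scope (e.g. from siuba import _, group_by, arrange, filter, ungroup;
# from siuba.dply.vector import row_number).
import pandas as pd

events = pd.DataFrame({
    "user_id": [1, 1, 2],
    "timestamp": pd.to_datetime(["2020-01-05", "2020-01-01", "2020-01-02"]),
})

# One row per user, keeping each user's earliest event
first_events = distinct_events(events, "timestamp", "user_id", "first")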
def plot_large_error_percentage(data, threshold):
    """
    Calculate the overall large-error percentage based on 2 std deviations
    from the mean: abs(resid_mean - resid) > 2 * resid.std()

    Gather long-form data of the columns, filter to only large errors and
    their percentages, then set up a facet grid and pass in the axes
    plotting function.
    """
    resid_mean = data.final_residuals.mean()

    # Calculate distance from average and apply a threshold to indicate large errors
    data['high_error'] = (resid_mean - data.final_residuals).abs() > threshold

    # Percentage of large errors in the overall data set
    overall_large_error_percent = data.high_error.mean()

    # Long-form plotting data
    # Use the siuba library to help with R/dplyr style aggregations
    data_melted = (data.loc[:, [
        'jobType', 'industry', 'degree', 'major', 'milesFromMetropolis',
        'yearsExperience', 'high_error'
    ]].melt(id_vars='high_error', var_name='column')
        >> group_by('column', 'value', 'high_error')
        >> summarize(total=_.value_counts())
        >> group_by('column', 'value')
        >> mutate(percent=_.total / _.total.sum())
        >> ungroup())

    # Filter for only large error percentages and apply a grouped sort to prepare for plotting
    data_melted = (
        data_melted.loc[data_melted.high_error, :].groupby('column').apply(
            lambda x: x.sort_values('percent', ascending=False)).reset_index(
                drop=True))

    plot_title = f"Percentage of large errors per category\nLarge error threshold = {round(threshold, 1)}"
    plotGrid = sns.FacetGrid(data_melted,
                             col='column',
                             sharey=False,
                             col_wrap=3,
                             height=4,
                             aspect=1.2)
    plotGrid.map(_large_error_axes_plot,
                 'percent',
                 'value',
                 error_threshold=overall_large_error_percent)
    plotGrid.set_xlabels('Percentage of large errors')
    plotGrid.set_ylabels('')
    plotGrid.add_legend(loc='upper center',
                        bbox_to_anchor=(0.5, 0),
                        fancybox=True,
                        frameon=True)
    plt.suptitle(plot_title, y=1.05)
    plt.show()
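# Usage sketch: the docstring's two-standard-deviation rule of thumb, applied
# to a hypothetical `data` frame that has the columns referenced above
# (final_residuals, jobType, industry, ...). The _large_error_axes_plot helper
# is assumed to be defined elsewhere in this module.
plot_large_error_percentage(data, threshold=2 * data.final_residuals.std())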
st.write("Goals by month") st.write("Top 8 players not in our data") top8 >> filter(_.yr_start < 1979) # + from pandas.tseries.offsets import MonthBegin from siuba.experimental.pd_groups import fast_summarize top8_goals = ( top8_games >> mutate( date=_.date.astype("datetime64[D]"), age_years=top8_games.age.str.split('-').str.get(0).astype(int)) >> arrange(_.date) >> group_by(_.player, month=_.date - MonthBegin(1)) >> fast_summarize(ttl_goals=_.goals.sum(), age_years=_.age_years.min()) >> group_by(_.player) >> mutate(cuml_goals=_.ttl_goals.cumsum()) >> ungroup()) p_goals = alt.Chart(top8_goals).mark_line().encode(y="cuml_goals:Q", color="player") # + time = st.selectbox("Choose a time", ["month", "age_years"]) st.write(p_goals.encode(x=time)) # + st.write("Goals by seasons") @st.cache def data_season_goals():
def test_group_by_ungroup(df):
    q1 = df >> group_by(_.g)
    assert q1.group_by == ("g",)

    q2 = q1 >> ungroup()
    assert q2.group_by == tuple()
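# A minimal sketch of a fixture this test could run against (hypothetical --
# the real suite may construct it differently). The `group_by` attribute
# asserted above lives on siuba's LazyTbl, so the fixture wraps a SQL table:
import pandas as pd
import pytest
from sqlalchemy import create_engine
from siuba.sql import LazyTbl

@pytest.fixture
def df():
    engine = create_engine("sqlite:///:memory:")
    pd.DataFrame({"g": ["a", "a", "b"], "x": [1, 2, 3]}).to_sql(
        "tbl", engine, index=False)
    return LazyTbl(engine, "tbl")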
)
# -

top8 >> filter(_.yr_start < 1979)

# +
# get them from game_goals
top8_games = game_goals >> inner_join(_, top8, "player")

games_per_year = (
    top8_games
    # >> filter(_.player == "Alex Ovechkin")
    >> count(_.player,
             year=_.date.astype("datetime64[D]").dt.year.astype(str))
    >> group_by(_.player)
    >> mutate(cuml_games=_.n.cumsum())
    >> ungroup())

alt.Chart(games_per_year).mark_line().encode(x='year:T',
                                             y='cuml_games',
                                             color="player")
# -

# +
from pandas.tseries.offsets import MonthBegin
from siuba.experimental.pd_groups import fast_summarize

top8_goals = (
    top8_games
    >> mutate(
        date=_.date.astype("datetime64[D]"),
        age_years=top8_games.age.str.split('-').str.get(0).astype(int))
    >> arrange(_.date)
    >> group_by(_.player, month=_.date - MonthBegin(1))
    >> fast_summarize(ttl_goals=_.goals.sum(), age_years=_.age_years.min())
    >> group_by(_.player)
    >> mutate(cuml_goals=_.ttl_goals.cumsum())
    >> ungroup())
def after_join(
        lhs, rhs,
        by_time, by_user,
        mode = "inner",
        type = "first-firstafter",
        max_gap = None,
        min_gap = None,
        gap_col = None,
        suffix = ("_x", "_y")
        ):
    """Join lhs and rhs event tables by user, pairing their events by time."""

    if max_gap is not None or min_gap is not None or gap_col is not None:
        raise NotImplementedError("max_gap, min_gap, gap_col not implemented")

    # Get type of join for both tables, from e.g. "first-firstafter"
    type_lhs, type_rhs = type.split("-")

    # Convert join keys to dictionary form
    by_time_x, by_time_y = _get_key_tuple(by_time)
    by_user_x, by_user_y = _get_key_tuple(by_user)

    # mutate in row_number ----
    lhs_i = (lhs
        >> arrange(_[by_user_x], _[by_time_x])
        >> mutate(__idx = row_number(_))
        >> distinct_events(by_time_x, by_user_x, type_lhs)
        )

    rhs_i = (rhs
        >> arrange(_[by_user_y], _[by_time_y])
        >> mutate(__idy = row_number(_))
        >> distinct_events(by_time_y, by_user_y, type_rhs)
        )

    # Handle when time column is in the other table
    if by_time_x == by_time_y:
        # TODO: don't use implicit join suffix below
        pair_time_x, pair_time_y = by_time_x + "_x", by_time_y + "_y"
    else:
        pair_time_x, pair_time_y = by_time_x, by_time_y

    # Inner join by user, filter by time
    pairs = filter(
        inner_join(lhs_i, rhs_i, by_user),
        _[pair_time_x] <= _[pair_time_y]
        )

    # TODO: firstwithin
    if type_lhs in ["firstwithin", "lastbefore"]:
        raise NotImplementedError("Can't currently handle lhs type %s" % type_lhs)

    # Handle firstafter by subsetting
    if type_rhs == "firstafter":
        pairs = (pairs
            >> arrange(_[pair_time_y])
            >> group_by(_.__idx)
            >> filter(row_number(_) == 1)
            >> ungroup()
            )

    distinct_pairs = select(pairs, _.__idx, _.__idy)

    if mode in ["inner", "left", "right", "full", "outer"]:
        by_dict = dict([(by_user_x, by_user_y), ("__idy", "__idy")])

        res = (lhs_i
            >> join(_, distinct_pairs, on = "__idx", how = mode)
            # TODO: suffix arg
            >> join(_, rhs_i, on = by_dict, how = mode)  #, suffix = suffix)
            >> select(-_["__idx", "__idy"])
            )
    elif mode in ["semi", "anti"]:
        join_func = semi_join if mode == "semi" else anti_join
        res = (lhs_i
            >> join_func(_, distinct_pairs, "__idx")
            >> select(-_["__idx", "__idy"])
            )
    else:
        raise ValueError("mode not recognized: %s" % mode)

    return res
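# Usage sketch, modeled on the funneljoin-style API above. Hypothetical event
# tables; assumes distinct_events and the join helpers referenced above
# (_get_key_tuple, join, semi_join, anti_join) are in scope as
# siuba-compatible verbs.
import pandas as pd

landed = pd.DataFrame({
    "user_id": [1, 1, 2],
    "timestamp": pd.to_datetime(["2020-01-01", "2020-01-03", "2020-01-02"]),
})
registered = pd.DataFrame({
    "user_id": [1, 2],
    "timestamp": pd.to_datetime(["2020-01-02", "2020-01-01"]),
})

# "first-firstafter": each user's first landing, paired with their first
# registration at or after that landing. User 2 registered before landing,
# so they drop out of the inner join.
res = after_join(
    landed, registered,
    by_time="timestamp", by_user="user_id",
    mode="inner", type="first-firstafter",
)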