Exemplo n.º 1
0
def test_arrange_grouped(backend, df):
    """Check that arrange applied after group_by sorts by x, alone and with mutate."""
    grouped_sort = group_by(_.y) >> arrange(_.x)
    expected_sorted = DATA.sort_values(['x'])
    assert_equal_query(df, grouped_sort, expected_sorted)

    # arrange w/ mutate is the same, whether used before or after group_by
    numbered_query = grouped_sort >> mutate(res=row_number(_))
    expected_numbered = mutate(
        DATA.sort_values(['x']).groupby('y'),
        res=row_number(_),
    )
    assert_equal_query(df, numbered_query, expected_numbered)
Exemplo n.º 2
0
def test_mutate_window_funcs(backend):
    """Check that row_number inside a grouped mutate restarts for each group of g."""
    local_frame = data_frame(x=range(1, 5), g=[1, 1, 2, 2])
    loaded = backend.load_df(local_frame)

    # Number rows within each g group, cast to float to match the target column
    query = group_by(_.g) >> mutate(row_num=row_number(_).astype(float))
    expected = local_frame.assign(row_num=[1.0, 2, 1, 2])

    assert_equal_query(loaded, query, expected)
Exemplo n.º 3
0
def test_filter_via_group_by(backend):
    """Check that a grouped filter on row_number keeps the first two rows per group."""
    local_frame = data_frame(x=range(1, 11), g=[1] * 5 + [2] * 5)
    loaded = backend.load_df(local_frame)

    # row_number(_) < 3 selects rows 1 and 2 within each g group
    first_two_per_group = group_by(_.g) >> filter(row_number(_) < 3)
    expected = data_frame(x=[1, 2, 6, 7], g=[1, 1, 2, 2])

    assert_equal_query(loaded, first_two_per_group, expected)
Exemplo n.º 4
0
def distinct_events(tbl, time_col, user_col, type):
    """Reduce ``tbl`` to a single row per ``user_col`` group, chosen by time.

    Args:
        tbl: table to pipe through the verbs below.
        time_col: name of the column used for ordering.
        user_col: name of the column used for grouping.
        type: ``"first"`` orders by ``time_col`` ascending, ``"last"`` orders
            by its negation; any other value returns ``tbl`` untouched.

    Returns:
        ``tbl`` itself for an unrecognized ``type``; otherwise the ungrouped
        result of keeping the row numbered 1 in each group.
    """
    if type == "first":
        ordering = _[time_col]
    elif type == "last":
        # Negating the time column turns the ascending arrange into descending
        ordering = -_[time_col]
    else:
        return tbl

    grouped = tbl >> group_by(_[user_col])
    ordered = grouped >> arrange(ordering)
    top_rows = ordered >> filter(row_number(_) == 1)
    return top_rows >> ungroup()
Exemplo n.º 5
0
# Specification of the data frames used as test input: a column 'x' of unique
# values drawn from 32-bit-width floats or integers, a string column 'g' whose
# text elements have at most one character, and an index of floats or integers
# with max_size=10.
# NOTE(review): data_frames/column/indexes/floats/integers/text look like
# hypothesis strategies -- confirm against this file's imports.
DATA_SPEC = data_frames([
    column('x', elements=floats(width=32) | integers(), unique=True),
    column('g', dtype=str, elements=text(max_size=1))
],
                        index=indexes(elements=floats() | integers(),
                                      max_size=10))

# Expressions built from the functions in `v`, each applied to the column
# expression `_.x`. Entries that are commented out are not part of the list.
# NOTE(review): presumably `v` is the vector-function module under test and
# these are lazy expressions evaluated later by the tests -- confirm.
OMNIBUS_VECTOR_FUNCS = [
    #cumall, cumany, cummean,
    #desc,
    v.dense_rank(_.x, na_option="keep"),
    #v.percent_rank(_.x),
    v.min_rank(_.x, na_option="keep"),
    v.cume_dist(_.x, na_option="keep"),
    v.row_number(_.x),
    #ntile,
    v.between(_.x, 2, 5, default=False),
    v.coalesce(_.x, 2),
    v.lead(_.x),
    v.lag(_.x),
    v.n(_.x),
    v.na_if(_.x, 2),
    #near,
    v.nth(_.x, 2),
    v.first(_.x),
    v.last(_.x,
           order_by=_.x),  # TODO: in SQL getting FROM LAST requires order by
]

VECTOR_AGG_FUNCS = [
Exemplo n.º 6
0
def after_join(
        lhs, rhs,
        by_time, by_user,
        mode = "inner",
        type = "first-firstafter",
        max_gap = None,
        min_gap = None,
        gap_col = None,
        suffix = ("_x", "_y")
        ):
    """Join ``lhs`` events to ``rhs`` events of the same user that do not precede them.

    Rows of both tables are paired by user, and only pairs where the lhs time
    is less than or equal to the rhs time are kept. The pairing is then used
    to join (or filter) ``lhs`` according to ``mode``.

    Args:
        lhs: left table, piped through the verbs used below.
        rhs: right table, piped through the verbs used below.
        by_time: time key spec; ``_get_key_tuple`` converts it to an
            (lhs column, rhs column) pair.
        by_user: user key spec; converted the same way, and also passed
            unchanged to ``inner_join``.
        mode: one of ``"inner"``, ``"left"``, ``"right"``, ``"full"``,
            ``"outer"`` (passed as ``how`` to ``join``), or ``"semi"`` /
            ``"anti"`` (filter ``lhs`` with ``semi_join`` / ``anti_join``).
        type: ``"<lhs type>-<rhs type>"``. Each part is handed to
            ``distinct_events`` for its table. An rhs type of
            ``"firstafter"`` additionally keeps, for each lhs row, only the
            first pair after ordering by the rhs time column.
        max_gap: not implemented; must be ``None``.
        min_gap: not implemented; must be ``None``.
        gap_col: not implemented; must be ``None``.
        suffix: currently unused (see the TODO on the second join).

    Returns:
        The joined or filtered table, with the temporary ``__idx`` / ``__idy``
        columns removed by the final ``select``.

    Raises:
        NotImplementedError: if ``max_gap``, ``min_gap`` or ``gap_col`` is
            given, or the lhs type is ``"firstwithin"`` or ``"lastbefore"``.
        ValueError: if ``mode`` is not one of the recognized values (or
            ``type`` does not split into exactly two parts on ``"-"``).
    """
    if max_gap is not None or min_gap is not None or gap_col is not None:
        raise NotImplementedError("max_gap, min_gap, gap_col not implemented")

    # Get type of join for both tables, from e.g. "first-firstafter"
    type_lhs, type_rhs = type.split("-")

    # Validate every option before touching the tables, so that a bad
    # argument fails immediately instead of after the sort and join work.
    # TODO: firstwithin
    if type_lhs in ["firstwithin", "lastbefore"]:
        raise NotImplementedError("Can't currently handle lhs type %s" % type_lhs)

    # Tuples rather than sets: membership must not require mode to be hashable
    merge_modes = ("inner", "left", "right", "full", "outer")
    filter_modes = ("semi", "anti")
    if mode not in merge_modes and mode not in filter_modes:
        # Wrap in a 1-tuple so a tuple-valued mode cannot break the formatting
        raise ValueError("mode not recognized: %s" % (mode,))

    # Convert join keys to (lhs name, rhs name) pairs
    by_time_x, by_time_y = _get_key_tuple(by_time)
    by_user_x, by_user_y = _get_key_tuple(by_user)

    # mutate in row_number ----
    # __idx / __idy tag each row of the sorted tables so pairs can be
    # tracked through the joins below
    lhs_i = (lhs
            >> arrange(_[by_user_x], _[by_time_x])
            >> mutate(__idx = row_number(_))
            >> distinct_events(by_time_x, by_user_x, type_lhs)
            )

    rhs_i = (rhs
            >> arrange(_[by_user_y], _[by_time_y])
            >> mutate(__idy = row_number(_))
            >> distinct_events(by_time_y, by_user_y, type_rhs)
            )

    # Handle when time column is in the other table
    if by_time_x == by_time_y:
        # TODO: don't use implicit join suffix below
        pair_time_x, pair_time_y = by_time_x + "_x", by_time_y + "_y"
    else:
        pair_time_x, pair_time_y = by_time_x, by_time_y

    # Inner join by user, filter by time
    pairs = filter(
            inner_join(lhs_i, rhs_i, by_user),
            _[pair_time_x] <= _[pair_time_y]
            )

    # Handle firstafter by subsetting
    if type_rhs == "firstafter":
        pairs = (pairs
                >> arrange(_[pair_time_y])
                >> group_by(_.__idx)
                >> filter(row_number(_) == 1)
                >> ungroup()
                )

    distinct_pairs = select(pairs, _.__idx, _.__idy)

    if mode in merge_modes:
        by_dict = {by_user_x: by_user_y, "__idy": "__idy"}
        res = (lhs_i
                >> join(_, distinct_pairs, on = "__idx", how = mode)
                # TODO: suffix arg
                >> join(_, rhs_i , on = by_dict, how = mode)#, suffix = suffix)
                >> select(-_["__idx", "__idy"])
                )
    else:
        # mode was validated above, so it is "semi" or "anti" here
        join_func = semi_join if mode == "semi" else anti_join
        res = (lhs_i
                >> join_func(_, distinct_pairs, "__idx")
                >> select(-_["__idx", "__idy"])
                )

    return res