Exemplo n.º 1
0
def test_safe_group_by_apply():
    """Check ``PD_UTILS.safe_groupby_apply`` with string, empty and double keys.

    The mapper writes the row count of the frame it receives into a ``ct``
    column, so the expected values reveal how the rows were partitioned.
    The expectations below require rows with a null key to be kept and
    grouped together rather than dropped.
    """

    def _m1(part):
        PD_UTILS.ensure_compatible(part)
        part["ct"] = part.shape[0]
        return part

    source = DF([["a", 1], ["a", 2], [None, 3]], "b:str,c:long", True)
    scenarios = (
        # (grouping keys, expected rows including the ``ct`` column)
        (["b"], [["a", 1, 2], ["a", 2, 2], [None, 3, 1]]),
        ([], [["a", 1, 3], ["a", 2, 3], [None, 3, 3]]),
    )
    for keys, expected in scenarios:
        res = PD_UTILS.safe_groupby_apply(source.native, keys, _m1)
        PD_UTILS.ensure_compatible(res)
        assert res.shape[0] == 3
        assert res.shape[1] == 3
        assert expected == res.values.tolist()

    source = DF(
        [[1.0, "a"], [1.0, "b"], [None, "c"], [None, "d"]],
        "b:double,c:str",
        True,
    )
    res = PD_UTILS.safe_groupby_apply(source.native, ["b"], _m1)
    expected = [
        [1.0, "a", 2],
        [1.0, "b", 2],
        [float("nan"), "c", 2],
        [float("nan"), "d", 2],
    ]
    # NaN never compares equal to itself, so compare the textual
    # representations of the two nested lists instead of their values.
    assert repr(expected) == repr(res.values.tolist())
Exemplo n.º 2
0
 def _apply_schema(self, pdf: pd.DataFrame,
                   schema: Optional[Schema]) -> Tuple[pd.DataFrame, Schema]:
     """Reconcile ``pdf`` with ``schema`` and return both.

     ``pdf`` is first passed through ``PD_UTILS.ensure_compatible``. Two
     cases follow, chosen by the dtype of ``pdf.columns``:

     * dtype is ``"object"`` (columns are named): a schema is derived from
       ``pdf`` with ``_input_schema``. If ``schema`` is ``None`` or equal
       to the derived one, ``pdf`` is returned as is together with the
       derived schema. Otherwise ``pdf`` is narrowed to ``schema.names``
       and passed through ``_enforce_type``.
     * any other dtype (columns are not named): ``schema`` is built with
       ``_input_schema``, the column count is checked against it, the
       names are assigned to ``pdf.columns`` (this modifies the frame
       that was passed in), and the frame is passed through
       ``_enforce_type``.

     :param pdf: the pandas DataFrame to process
     :param schema: the requested schema, may be ``None``
     :return: the resulting DataFrame and the schema that describes it
     """
     PD_UTILS.ensure_compatible(pdf)
     has_named_columns = pdf.columns.dtype == "object"

     if not has_named_columns:
         # Columns carry no names, so they are matched to the schema by
         # position; the widths therefore have to agree.
         schema = _input_schema(schema).assert_not_empty()
         width_matches = pdf.shape[1] == len(schema)
         assert_or_throw(
             width_matches,
             ValueError(
                 f"Pandas datafame column count doesn't match {schema}"),
         )
         pdf.columns = schema.names
         return _enforce_type(pdf, schema), schema

     inferred = _input_schema(pdf)
     if schema is None or inferred == schema:
         # Nothing to convert: the frame already matches what was asked for.
         return pdf, inferred.assert_not_empty()

     selected = pdf[schema.assert_not_empty().names]
     return _enforce_type(selected, schema), schema
Exemplo n.º 3
0
def test_safe_group_by_apply_special_types():
    """Check ``PD_UTILS.safe_groupby_apply`` on double, datetime and date keys.

    Every scenario groups four rows by both columns ``a`` and ``b``. The
    mapper stores the row count of the frame it receives in a ``ct``
    column, and the expected sizes are always ``1, 2, 2, 1``, including
    for rows whose key columns contain nulls.
    """

    def _m1(part):
        PD_UTILS.ensure_compatible(part)
        part["ct"] = part.shape[0]
        return part

    def _check(rows, schema):
        # Build the expected rows first so they are independent lists.
        expected_rows = [[*row, size] for row, size in zip(rows, (1, 2, 2, 1))]
        source = DF(rows, schema, True)
        res = PD_UTILS.safe_groupby_apply(source.native, ["a", "b"], _m1)
        PD_UTILS.ensure_compatible(res)
        assert res.shape[0] == 4
        assert res.shape[1] == 3
        DF(expected_rows, schema + ",ct:int", True).assert_eq(res)

    # Double values, with nulls in either or both key columns.
    _check(
        [["a", 1.0], [None, 3.0], [None, 3.0], [None, None]],
        "a:str,b:double",
    )

    # Datetime values.
    dt = datetime.now()
    _check(
        [["a", dt], [None, dt], [None, dt], [None, None]],
        "a:str,b:datetime",
    )

    # Date values.
    dt = date(2020, 1, 1)
    _check(
        [["a", dt], [None, dt], [None, dt], [None, None]],
        "a:str,b:date",
    )

    # Date values where only the second key column contains a null.
    dt = date(2020, 1, 1)
    _check(
        [["a", dt], ["b", dt], ["b", dt], ["b", None]],
        "a:str,b:date",
    )
Exemplo n.º 4
0
 def _m1(df):
     """Pass ``df`` through ``PD_UTILS.ensure_compatible`` and add its row count.

     The count is written to a ``ct`` column on ``df`` itself, and that
     same object is returned.
     """
     PD_UTILS.ensure_compatible(df)
     row_count = df.shape[0]
     df["ct"] = row_count
     return df