示例#1
0
def test_rowfirst():
    """Check that the ``.rowfirst()`` method mirrors ``dt.rowfirst()``."""
    # Both spellings must stringify to the same expression, for a single
    # column selector as well as for the "all columns" selector.
    for selector in (f.A, f[:]):
        assert str(dt.rowfirst(selector)) == str(selector.rowfirst())

    frame = dt.Frame({'A': [1, None, None, None],
                      'B': [None, 3, 4, None],
                      'C': [2, None, 5, None]})

    # Evaluating either spelling against a frame must give equal results.
    via_method = frame[:, f[:].rowfirst()]
    via_function = frame[:, dt.rowfirst(f[:])]
    assert_equals(via_method, via_function)
示例#2
0
def test_rowfirstlast_strs(st):
    """Check rowfirst()/rowlast() on string data of stype ``st``.

    ``None`` entries are expected to be skipped, while empty strings count
    as regular values (see the third input tuple).
    """
    source_rows = [
        ("a", None, "b", None),
        (None, None, "x", None),
        ("", "", "AHA!", "last"),
    ]
    frame = dt.Frame(source_rows, stype=st)

    actual = frame[:, [rowfirst(f[:]), rowlast(f[:])]]

    # One list per result column: first values, then last values.
    expected = dt.Frame([["a", "x", ""], ["b", "x", "last"]], stype=st)
    assert_equals(actual, expected)
示例#3
0
def test_rowfirstlast_floats(st):
    """Check rowfirst()/rowlast() on floating-point data of stype ``st``.

    The expected values show that both ``None`` and ``math.nan`` are skipped,
    whereas ``math.inf`` is returned as an ordinary value.
    """
    source_rows = [
        (3.0, 7.0, math.nan),
        (math.inf, None, None),
        (math.nan, 2.5, -111),
    ]
    frame = dt.Frame(source_rows, stype=st)

    actual = frame[:, [rowfirst(f[:]), rowlast(f[:])]]

    expected_first = [3.0, math.inf, 2.5]
    expected_last = [7.0, math.inf, -111.0]
    expected = dt.Frame([expected_first, expected_last], stype=st)
    assert_equals(actual, expected)
示例#4
0
def test_reprs():
    """Check that every row-expression can be repr'd without errors."""
    row_functions = (
        rowall,
        rowany,
        rowsum,
        rowcount,
        rowmin,
        rowmax,
        rowfirst,
        rowlast,
        rowmean,
        rowsd,
    )
    # Each function is called with no arguments; the resulting expression
    # must produce a non-empty repr.
    for make_expr in row_functions:
        assert repr(make_expr())
示例#5
0
def test_rowfirstlast_incompatible_types():
    """Check rowfirst() across a string column and an integer column.

    The expected frame holds strings only, i.e. the integer ``3`` is
    expected to come back as ``"3"``.
    """
    string_column = ["a", None, "c", None]
    int_column = [1, 3, 4, None]
    frame = dt.Frame([string_column, int_column])

    actual = frame[:, rowfirst(f[:])]

    expected = dt.Frame(["a", "3", "c", None])
    assert_equals(actual, expected)
示例#6
0
def test_rowfirstlast_ints(st):
    """Check rowfirst()/rowlast() on integer data of stype ``st``.

    Covers a tuple without missing values, one with missing values at both
    ends, and one that is entirely ``None`` (expected result: ``None``).
    """
    source_rows = [
        (7, 5, 19, 22),
        (None, 1, 2, None),
        (None, None, None, None),
    ]
    frame = dt.Frame(source_rows, stype=st)

    actual = frame[:, [rowfirst(f[:]), rowlast(f[:])]]

    expected = dt.Frame([[7, 1, None], [22, 2, None]], stype=st)
    assert_equals(actual, expected)
示例#7
0
def test_rowfirstlast_bools():
    """Check rowfirst()/rowlast() on boolean data with missing values."""
    source_rows = [
        (None, True, False),
        (False, None, None),
        (None, None, None),
    ]
    frame = dt.Frame(source_rows)

    actual = frame[:, [rowfirst(f[:]), rowlast(f[:])]]

    # First result column holds the first non-missing values, the second
    # holds the last ones; the all-None tuple yields None in both.
    expected_first = [True, False, None]
    expected_last = [False, False, None]
    assert_equals(actual, dt.Frame([expected_first, expected_last]))
示例#8
0
    # NOTE(review): this excerpt starts in the middle of a function body;
    # all_col_groups, black_listed_columns, min_col_group_size and X are
    # defined outside the visible lines.
    #
    # Filter the detected column groups. The stated intent is to remove
    # black-listed groups and groups smaller than the minimal size.
    # NOTE(review): the `or` below keeps a group when EITHER test passes, so a
    # non-black-listed group is kept whatever its size -- confirm whether
    # `and` was intended.
    col_groups = {
        key: val
        for key, val in all_col_groups.items()
        if not key in black_listed_columns or len(val) >= min_col_group_size
    }

    # list of column prefixes
    columns = list(col_groups.keys())
    # list of (lowest index, highest index) pairs, one per prefix
    ranges = [(min(idx), max(idx)) for idx in col_groups.values()]

# Build a (first column name, last column name) pair per prefix by appending
# the lowest and the highest index to the prefix.
col_slices = [((col + "%d") % (desde), (col + "%d") % (hasta))
              for (col, (desde, hasta)) in zip(columns, ranges)]

# For every prefix, compute row-wise summaries over the column slice
# f[first:last] and store them under "<prefix>_<statistic>" names.
for c, r, s in zip(columns, ranges, col_slices):
    update_map = {
        c + "_sum": rowsum(f[s[0]:s[1]]),
        c + "_mean": rowmean(f[s[0]:s[1]]),
        c + "_sd": rowsd(f[s[0]:s[1]]),
        c + "_max": rowmax(f[s[0]:s[1]]),
        c + "_min": rowmin(f[s[0]:s[1]]),
        c + "_range": rowmax(f[s[0]:s[1]]) - rowmin(f[s[0]:s[1]]),
        c + "_first": rowfirst(f[s[0]:s[1]]),
        c + "_last": rowlast(f[s[0]:s[1]]),
        # Width of the index range minus the row count.
        # NOTE(review): assumes the indices lowest..highest are contiguous and
        # that the slice covers exactly those columns -- TODO confirm.
        c + "_missing": (r[1] - r[0] + 1) - rowcount(f[s[0]:s[1]])
    }
    # presumably datatable's update(), which writes the new columns into X
    # in place -- verify against the imports of the full file
    X[:, update(**update_map)]

# NOTE(review): a column-0 `return` is only valid inside a function; the
# indentation of this excerpt appears to have been lost.
return {"CreditCard-train-aug.csv": X}
示例#9
0
def test_rowfirstlast_incompatible():
    """Check that rowfirst() over a string and an int column raises TypeError."""
    mixed = dt.Frame(A=["a", "b", "c"], B=[1, 3, 4])
    expect_type_error = pytest.raises(TypeError,
                                      match="Incompatible column types")
    with expect_type_error:
        assert mixed[:, rowfirst(f[:])]
    def create_data(
        X: dt.Frame = None
    ) -> Union[str, List[str], dt.Frame, List[dt.Frame], np.ndarray,
               List[np.ndarray], pd.DataFrame, List[pd.DataFrame], Dict[
                   str, str],  # {data set names : paths}
               Dict[str, dt.Frame],  # {data set names : dt frames}
               Dict[str, np.ndarray],  # {data set names : np arrays}
               Dict[str, pd.DataFrame],  # {data set names : pd frames}
               ]:
        """Add row-wise summary columns for groups of index-suffixed columns.

        Column names of the form ``<prefix><integer>`` (prefix made of letters
        and underscores) are grouped by prefix. For every retained group the
        columns ``<prefix>_sum``, ``_mean``, ``_sd``, ``_max``, ``_min``,
        ``_range``, ``_first``, ``_last`` and ``_missing`` are computed over
        the column slice running from the lowest-indexed to the
        highest-indexed column of the group and written to ``X``.

        A group is retained only if its prefix is not black-listed AND it
        has at least ``min_col_group_size`` columns.

        :param X: input frame; ``None`` means "no data".
        :return: ``[]`` when ``X`` is ``None``, otherwise ``X`` itself.
        """
        if X is None:
            return []

        columns = None  # columns = ["PAY_AMT", "BILL_AMT", "PAY_"]
        ranges = None  # [(1, 6), (1, 6), (2, 6)]
        black_listed_columns = []
        min_col_group_size = 2

        # Detect the column groups from the frame unless both the prefixes
        # and their index ranges were given explicitly above.
        if not columns or not ranges:
            # alpha/underscore prefix followed by an integer index suffix
            indexed_name = re.compile(r"^([a-zA-Z_]+)(\d+)$")
            all_col_groups = defaultdict(list)
            for name in X.names:
                m = indexed_name.match(name)
                if m is not None:
                    all_col_groups[m.group(1)].append(int(m.group(2)))

            # Drop black-listed groups and groups below the minimal size.
            # Both conditions must hold to keep a group; with `or` the size
            # limit was never enforced for non-black-listed prefixes.
            col_groups = {
                key: val
                for key, val in all_col_groups.items()
                if key not in black_listed_columns
                and len(val) >= min_col_group_size
            }

            # column prefixes and their (lowest, highest) index
            columns = list(col_groups)
            ranges = [(min(idx), max(idx)) for idx in col_groups.values()]

        for prefix, (lowest, highest) in zip(columns, ranges):
            # Slice from the first to the last column name of the group.
            first_name = (prefix + "%d") % lowest
            last_name = (prefix + "%d") % highest
            group = f[first_name:last_name]

            update_map = {
                prefix + "_sum": rowsum(group),
                prefix + "_mean": rowmean(group),
                prefix + "_sd": rowsd(group),
                prefix + "_max": rowmax(group),
                prefix + "_min": rowmin(group),
                prefix + "_range": rowmax(group) - rowmin(group),
                prefix + "_first": rowfirst(group),
                prefix + "_last": rowlast(group),
                # Width of the index range minus the row count.
                # NOTE(review): assumes the group's indices are contiguous
                # and the slice holds exactly those columns -- TODO confirm.
                prefix + "_missing": (highest - lowest + 1) - rowcount(group),
            }
            X[:, update(**update_map)]

        return X