def test_rowfirst():
    """dt.rowfirst(expr) and expr.rowfirst() must build the same expression
    and evaluate to the same result."""
    assert str(dt.rowfirst(f.A)) == str(f.A.rowfirst())
    assert str(dt.rowfirst(f[:])) == str(f[:].rowfirst())
    frame = dt.Frame({'A': [1, None, None, None],
                      'B': [None, 3, 4, None],
                      'C': [2, None, 5, None]})
    assert_equals(frame[:, f[:].rowfirst()], frame[:, dt.rowfirst(f[:])])
def test_rowfirstlast_strs(st):
    """First/last non-missing string per row; empty strings count as values."""
    src = dt.Frame([("a", None, "b", None),
                    (None, None, "x", None),
                    ("", "", "AHA!", "last")], stype=st)
    out = src[:, [rowfirst(f[:]), rowlast(f[:])]]
    assert_equals(out, dt.Frame([["a", "x", ""],
                                 ["b", "x", "last"]], stype=st))
def test_rowfirstlast_floats(st):
    """NaN and None are skipped; infinities are treated as valid values."""
    src = dt.Frame([(3.0, 7.0, math.nan),
                    (math.inf, None, None),
                    (math.nan, 2.5, -111)], stype=st)
    out = src[:, [rowfirst(f[:]), rowlast(f[:])]]
    assert_equals(out, dt.Frame([[3.0, math.inf, 2.5],
                                 [7.0, math.inf, -111.0]], stype=st))
def test_reprs():
    """Each row-expression must produce a non-empty repr without raising."""
    for make_expr in (rowall, rowany, rowsum, rowcount, rowmin,
                      rowmax, rowfirst, rowlast, rowmean, rowsd):
        assert repr(make_expr())
def test_rowfirstlast_incompatible_types():
    """Mixed str/int columns: per the expected frame, the int values are
    rendered as strings ("3") when picked by rowfirst."""
    src = dt.Frame([["a", None, "c", None],
                    [1, 3, 4, None]])
    expected = dt.Frame(["a", "3", "c", None])
    assert_equals(src[:, rowfirst(f[:])], expected)
def test_rowfirstlast_ints(st):
    """First/last non-missing int per row; an all-None row yields None."""
    src = dt.Frame([(7, 5, 19, 22),
                    (None, 1, 2, None),
                    (None, None, None, None)], stype=st)
    out = src[:, [rowfirst(f[:]), rowlast(f[:])]]
    assert_equals(out, dt.Frame([[7, 1, None],
                                 [22, 2, None]], stype=st))
def test_rowfirstlast_bools():
    """Boolean rows: False is a valid (non-missing) value, unlike None."""
    src = dt.Frame([(None, True, False),
                    (False, None, None),
                    (None, None, None)])
    out = src[:, [rowfirst(f[:]), rowlast(f[:])]]
    assert_equals(out, dt.Frame([[True, False, None],
                                 [False, False, None]]))
# NOTE(review): the line below is a whitespace-mangled fragment duplicating the
# tail of create_data() (it differs only in returning
# {"CreditCard-train-aug.csv": X} instead of X).  Because the mangled line
# begins with '#', the entire fragment is a comment as written; its enclosing
# function is not visible here, so it is kept verbatim.
# NOTE(review): the filter `not key in black_listed_columns or len(val) >= min_col_group_size`
# looks inverted — the preceding comment implies `key not in ... and len(val) >= ...`;
# confirm against the full function (see create_data below).
# remove black listed columns or column groups that smaller than minimal size col_groups = { key: val for key, val in all_col_groups.items() if not key in black_listed_columns or len(val) >= min_col_group_size } # list of column prefixes columns = list(col_groups.keys()) # list of column ranges ranges = [(min(idx), max(idx)) for idx in col_groups.values()] # produce tuple for column slices col_slices = [((col + "%d") % (desde), (col + "%d") % (hasta)) for (col, (desde, hasta)) in zip(columns, ranges)] for c, r, s in zip(columns, ranges, col_slices): update_map = { c + "_sum": rowsum(f[s[0]:s[1]]), c + "_mean": rowmean(f[s[0]:s[1]]), c + "_sd": rowsd(f[s[0]:s[1]]), c + "_max": rowmax(f[s[0]:s[1]]), c + "_min": rowmin(f[s[0]:s[1]]), c + "_range": rowmax(f[s[0]:s[1]]) - rowmin(f[s[0]:s[1]]), c + "_first": rowfirst(f[s[0]:s[1]]), c + "_last": rowlast(f[s[0]:s[1]]), c + "_missing": (r[1] - r[0] + 1) - rowcount(f[s[0]:s[1]]) } X[:, update(**update_map)] return {"CreditCard-train-aug.csv": X}
def test_rowfirstlast_incompatible():
    """Combining str and int columns must raise TypeError."""
    src = dt.Frame(A=["a", "b", "c"], B=[1, 3, 4])
    with pytest.raises(TypeError, match="Incompatible column types"):
        assert src[:, rowfirst(f[:])]
def create_data(
        X: dt.Frame = None
) -> Union[str, List[str],
           dt.Frame, List[dt.Frame],
           np.ndarray, List[np.ndarray],
           pd.DataFrame, List[pd.DataFrame],
           Dict[str, str],  # {data set names : paths}
           Dict[str, dt.Frame],  # {data set names : dt frames}
           Dict[str, np.ndarray],  # {data set names : np arrays}
           Dict[str, pd.DataFrame],  # {data set names : pd frames}
           ]:
    """Augment a frame with per-row aggregates over time-series column groups.

    Column groups are auto-detected as columns sharing an alphabetic prefix
    followed by an integer suffix (e.g. PAY_AMT1..PAY_AMT6).  For each group
    the frame gains row-wise sum/mean/sd/max/min/range/first/last columns and
    a missing-value count.

    :param X: input frame; when None (e.g. at recipe registration) an empty
        list is returned and nothing is computed.
    :return: the augmented frame X.
    """
    if X is None:
        return []

    columns = None  # e.g. ["PAY_AMT", "BILL_AMT", "PAY_"]
    ranges = None  # e.g. [(1, 6), (1, 6), (2, 6)]
    black_listed_columns = []
    min_col_group_size = 2

    # parse column names for time series column groups
    if columns is None or columns == [] or \
            ranges is None or ranges == []:
        # match any column name that consists of an alpha prefix followed by
        # an integer index suffix
        p = re.compile(r"^([a-zA-Z_]+)(\d+)$")
        matches = [p.match(c) for c in X.names]
        all_col_groups = defaultdict(list)
        for m in matches:
            if m is not None:
                all_col_groups[m.group(1)].append(int(m.group(2)))

        # Keep a group only if its prefix is not black-listed AND it has at
        # least min_col_group_size members.
        # BUGFIX: the original condition used `or`
        # (`not key in black_listed_columns or len(val) >= min_col_group_size`),
        # which with the default empty blacklist is always True, so the
        # minimum-size filter never applied and single-column "groups" were
        # aggregated too.
        col_groups = {
            key: val for key, val in all_col_groups.items()
            if key not in black_listed_columns and len(val) >= min_col_group_size
        }

        # list of column prefixes
        columns = list(col_groups.keys())
        # list of column ranges (min/max integer suffix per group)
        ranges = [(min(idx), max(idx)) for idx in col_groups.values()]

    # produce (first, last) column-name tuples delimiting each group slice
    col_slices = [((col + "%d") % (lo), (col + "%d") % (hi))
                  for (col, (lo, hi)) in zip(columns, ranges)]

    for c, r, s in zip(columns, ranges, col_slices):
        update_map = {
            c + "_sum": rowsum(f[s[0]:s[1]]),
            c + "_mean": rowmean(f[s[0]:s[1]]),
            c + "_sd": rowsd(f[s[0]:s[1]]),
            c + "_max": rowmax(f[s[0]:s[1]]),
            c + "_min": rowmin(f[s[0]:s[1]]),
            c + "_range": rowmax(f[s[0]:s[1]]) - rowmin(f[s[0]:s[1]]),
            c + "_first": rowfirst(f[s[0]:s[1]]),
            c + "_last": rowlast(f[s[0]:s[1]]),
            # rowcount counts non-missing values; group width minus that is
            # the number of missing cells in the row's slice
            c + "_missing": (r[1] - r[0] + 1) - rowcount(f[s[0]:s[1]])
        }
        # update() mutates X in place, adding the aggregate columns
        X[:, update(**update_map)]

    return X