def test_j_dict(dt0, tbl0):
    """Select two columns through a dict in ``j``; dict keys become names."""
    selector = {"x": f[0], "y": f["D"]}
    picked = dt0[:, selector]
    picked.internal.check()
    assert picked.shape == (6, 2)
    assert same_iterables(picked.names, ("x", "y"))
    assert not picked.internal.isview
    # The selected data must match columns 0 and 3 of the source table.
    expected_columns = [tbl0[0], tbl0[3]]
    assert same_iterables(picked.to_list(), expected_columns)
def test_create_from_kwargs1():
    """Build a Frame from keyword arguments, one column per keyword."""
    frame = dt.Frame(
        A=[1, 2, 3],
        B=[True, None, False],
        C=["a", "b", "c"],
    )
    frame_integrity_check(frame)

    expected_names = ("A", "B", "C")
    expected_data = [[1, 2, 3], [True, None, False], ["a", "b", "c"]]
    assert same_iterables(frame.names, expected_names)
    assert same_iterables(frame.to_list(), expected_data)
def test_j_dict(dt0, tbl0):
    """Select two columns through a dict in ``j``; dict keys become names."""
    selector = {"x": f[0], "y": f["D"]}
    picked = dt0[:, selector]
    frame_integrity_check(picked)
    assert picked.shape == (6, 2)
    assert same_iterables(picked.names, ("x", "y"))
    assert not isview(picked)
    # The selected data must match columns 0 and 3 of the source table.
    expected_columns = [tbl0[0], tbl0[3]]
    assert same_iterables(picked.to_list(), expected_columns)
def test_cols_expression(dt0, tbl0):
    """
    Verify that computed columns can be selected, either as a single
    expression, as a list of expressions, or as a dict mixing expressions
    with plain column references.
    """
    rows = range(6)
    col_a, col_b, col_c, col_d = tbl0[0], tbl0[1], tbl0[2], tbl0[3]

    # A single expression produces a one-column frame.
    single = dt0[:, f.A + f.B]
    single.internal.check()
    assert single.shape == (6, 1)
    assert single.ltypes == (ltype.int, )
    expected_single = [[col_a[i] + col_b[i] for i in rows]]
    assert as_list(single) == expected_single

    # A list of expressions produces one column per expression.
    expressions = [f.A + f.B, f.C - f.D, f.A / f.C, f.B * f.D]
    several = dt0[:, expressions]
    several.internal.check()
    assert several.shape == (6, 4)
    assert several.ltypes == (ltype.int, ltype.real, ltype.real, ltype.int)
    expected_several = [
        [col_a[i] + col_b[i] for i in rows],
        [col_c[i] - col_d[i] for i in rows],
        [col_a[i] / col_c[i] for i in rows],
        [col_b[i] * col_d[i] for i in rows],
    ]
    assert as_list(several) == expected_several

    # A dict names the outputs. NOTE(review): the integer values 1 and 2 are
    # presumably column indices -- confirm against the selector semantics.
    named = dt0[:, {"foo": f.A + f.B - f.C * 10, "a": f.A, "b": 1, "c": 2}]
    named.internal.check()
    assert named.shape == (6, 4)
    assert same_iterables(named.names, ("foo", "a", "b", "c"))
    expected_ltypes = (ltype.real, ltype.int, ltype.int, ltype.real)
    assert same_iterables(named.ltypes, expected_ltypes)
    assert not named.internal.isview
    expected_foo = [[col_a[i] + col_b[i] - col_c[i] * 10 for i in rows]]
    assert as_list(named["foo"]) == expected_foo
def test_create_from_kwargs2():
    """Build a Frame from keyword columns with explicit per-column stypes."""
    requested_stypes = [dt.int64, dt.float32]
    frame = dt.Frame(x=range(4), y=[1, 3, 8, 0], stypes=requested_stypes)
    frame_integrity_check(frame)

    assert frame.shape == (4, 2)
    assert same_iterables(frame.names, ("x", "y"))
    assert same_iterables(frame.stypes, (dt.int64, dt.float32))
    expected_data = [[0, 1, 2, 3], [1, 3, 8, 0]]
    assert same_iterables(frame.to_list(), expected_data)
def test_create_from_kwargs1():
    """Build a Frame from keyword arguments, one column per keyword."""
    frame = dt.Frame(
        A=[1, 2, 3],
        B=[True, None, False],
        C=["a", "b", "c"],
    )
    frame.internal.check()

    expected_names = ("A", "B", "C")
    expected_data = [[1, 2, 3], [True, None, False], ["a", "b", "c"]]
    assert same_iterables(frame.names, expected_names)
    assert same_iterables(frame.topython(), expected_data)
def test_create_from_dict():
    """Build a Frame from a dict of lists; check shape, names and ltypes."""
    source = {
        "A": [1, 5, 10],
        "B": [True, False, None],
        "C": ["alpha", "beta", "gamma"],
    }
    frame = dt.Frame(source)

    assert frame.shape == (3, 3)
    assert same_iterables(frame.names, ("A", "B", "C"))
    expected_ltypes = (ltype.int, ltype.bool, ltype.str)
    assert same_iterables(frame.ltypes, expected_ltypes)
    frame_integrity_check(frame)
def test_create_from_mixed_sources(numpy):
    """Build a Frame from a dict mixing numpy arrays, a range and a list."""
    source = {
        "A": numpy.random.randn(5),
        "B": range(5),
        "C": ["foo", "baw", "garrgh", "yex", "fin"],
        "D": numpy.array([5, 8, 1, 3, 5813], dtype="int32"),
    }
    frame = dt.Frame(source)
    frame_integrity_check(frame)

    assert frame.shape == (5, 4)
    assert same_iterables(frame.names, ("A", "B", "C", "D"))
    expected_stypes = (stype.float64, stype.int32, stype.str32, stype.int32)
    assert same_iterables(frame.stypes, expected_stypes)
def test_groupby_multi_large(seed):
    """
    Group a randomly sized frame by three key columns, sum the fourth, and
    compare against totals computed in plain Python.
    """
    random.seed(seed)
    letters = "abcdefghijklmn"
    n = 100 + int(random.expovariate(0.0001))

    # The columns are generated one after another so that the sequence of
    # draws from the seeded generator stays fixed.
    col0 = [random.choice([True, False]) for _ in range(n)]
    col1 = [random.randint(-10, 10) for _ in range(n)]
    col2 = [random.choice(letters) for _ in range(n)]
    col3 = [random.random() for _ in range(n)]

    # Sorting puts rows with equal (A, B, C) keys next to each other.
    rows = sorted(zip(col0, col1, col2, col3))

    # NOTE(review): `sum` is applied to an f-expression below, so it is
    # presumably not the builtin; the reference totals are accumulated by
    # hand for that reason.
    grouped = []
    current_key = rows[0][:3]
    running_total = 0
    for row in rows:
        key = row[:3]
        if key != current_key:
            grouped.append(current_key + (running_total, ))
            current_key = key
            running_total = 0
        running_total += row[3]
    grouped.append(current_key + (running_total, ))

    source = dt.Frame([col0, col1, col2, col3], names=["A", "B", "C", "D"])
    actual = source[:, sum(f.D), by(f.A, f.B, f.C)]
    expected = dt.Frame(grouped)
    assert same_iterables(actual.to_list(), expected.to_list())
def test_cols_dict(dt0, tbl0):
    """
    Verify column selection through a dictionary whose keys become the
    names of the resulting columns.
    """
    renamed = dt0(select={"x": 0, "y": "D"})
    renamed.internal.check()
    assert renamed.shape == (6, 2)
    assert same_iterables(renamed.names, ("x", "y"))
    assert not renamed.internal.isview
    expected_columns = [tbl0[0], tbl0[3]]
    assert same_iterables(as_list(renamed), expected_columns)

    # One key mapped to a full slice: all four columns come back, and the
    # names are expected to be "_", "_1", "_2", "_3".
    everything = dt0[{"_": slice(None)}]
    everything.internal.check()
    assert everything.shape == (6, 4)
    assert everything.names == ("_", "_1", "_2", "_3")
    assert not everything.internal.isview
    assert as_list(everything) == tbl0
def test_create_from_dict_of_numpy_arrays(numpy):
    """Build a Frame from a dict whose values are all numpy arrays."""
    source = {
        "A": numpy.random.randn(67),
        "B": numpy.random.randn(67),
        "C": numpy.random.randn(67),
    }
    frame = dt.Frame(source)
    frame_integrity_check(frame)

    assert frame.shape == (67, 3)
    assert frame.stypes == (stype.float64,) * 3
    assert same_iterables(frame.names, ("A", "B", "C"))
def test_topandas():
    """Convert a small Frame with ``to_pandas()`` and check every column."""
    frame = dt.Frame({"A": [1, 5],
                      "B": ["hello", "you"],
                      "C": [True, False]})
    converted = frame.to_pandas()

    assert converted.shape == (2, 3)
    assert same_iterables(converted.columns.tolist(), ["A", "B", "C"])

    # Column-by-column comparison, in the order A, B, C.
    expected_columns = [
        ("A", [1, 5]),
        ("B", ["hello", "you"]),
        ("C", [True, False]),
    ]
    for name, values in expected_columns:
        assert converted[name].values.tolist() == values
def test_tonumpy1(numpy):
    """Convert a mixed-type Frame to numpy via ``to_numpy()`` and ``array``."""
    frame = dt.Frame({"A": [1, 5],
                      "B": ["helo", "you"],
                      "C": [True, False],
                      "D": [3.4, None]})

    via_method = frame.to_numpy()
    assert via_method.shape == frame.shape
    assert via_method.dtype == numpy.dtype("object")
    # Transposed, the array should hold the same data as the column lists.
    transposed = via_method.T.tolist()
    assert same_iterables(transposed, frame.to_list())

    # Both conversion routes must agree element by element.
    via_array = numpy.array(frame)
    assert (via_method == via_array).all()
def test_0rows_frame():
    """Filter and compute on a Frame that has two columns but no rows."""
    empty = dt.Frame(A=[], B=[], stype=int)
    assert empty.shape == (0, 2)

    # Row filter on the empty frame keeps both columns.
    filtered = empty[f.A == 0, :]
    frame_integrity_check(filtered)
    assert filtered.shape == (0, 2)
    assert same_iterables(filtered.names, ("A", "B"))

    # A computed column on the empty frame is still typed as int.
    difference = empty[:, f.A - f.B]
    frame_integrity_check(difference)
    assert difference.shape == (0, 1)
    assert difference.ltypes == (ltype.int, )
def test_j_expression(dt0, tbl0):
    """
    Verify computed columns in ``j``: a single expression, a list of
    expressions, and a dict mixing expressions with column references.
    """
    rows = range(6)
    col_a, col_b, col_c, col_d = tbl0[0], tbl0[1], tbl0[2], tbl0[3]

    # A single expression produces a one-column frame.
    single = dt0[:, f.A + f.B]
    frame_integrity_check(single)
    assert single.shape == (6, 1)
    assert single.ltypes == (ltype.int, )
    expected_single = [[col_a[i] + col_b[i] for i in rows]]
    assert single.to_list() == expected_single

    # A list of expressions produces one column per expression.
    expressions = [f.A + f.B, f.C - f.D, f.A / f.C, f.B * f.D]
    several = dt0[:, expressions]
    frame_integrity_check(several)
    assert several.shape == (6, 4)
    assert several.ltypes == (ltype.int, ltype.real, ltype.real, ltype.int)
    expected_several = [
        [col_a[i] + col_b[i] for i in rows],
        [col_c[i] - col_d[i] for i in rows],
        [col_a[i] / col_c[i] for i in rows],
        [col_b[i] * col_d[i] for i in rows],
    ]
    assert several.to_list() == expected_several

    # A dict names the outputs; f[1] and f[2] refer to columns by position.
    named = dt0[:, {"foo": f.A + f.B - f.C * 10,
                    "a": f.A,
                    "b": f[1],
                    "c": f[2]}]
    frame_integrity_check(named)
    assert named.shape == (6, 4)
    assert same_iterables(named.names, ("foo", "a", "b", "c"))
    expected_ltypes = (ltype.real, ltype.int, ltype.int, ltype.real)
    assert same_iterables(named.ltypes, expected_ltypes)
    expected_foo = [[col_a[i] + col_b[i] - col_c[i] * 10 for i in rows]]
    assert named[:, "foo"].to_list() == expected_foo
def test_issue998():
    """Read a large CSV single-threaded and check shape, types and sums."""
    src = find_file("h2o-3", "bigdata", "laptop", "higgs_head_2M.csv")
    # The input file is 1.46GB; no smaller file reproducing the problem was
    # found. The issue showed up only in single-threaded mode, hence the
    # slow read with nthreads=1 (about 8s on the original author's laptop).
    frame = dt.fread(src, nthreads=1, fill=True, na_strings=["-999"])

    ncols = frame.ncols
    assert frame.shape == (2000000, 29)
    assert frame.names == tuple("C%d" % i for i in range(ncols))
    assert frame.stypes == (dt.stype.float64, ) * ncols

    expected_sums = [
        [1058818.0],
        [1981919.6107614636],
        [701.7858121241807],
        [-195.48500674014213],
        [1996390.3476011853],
        [-1759.5364254778178],
        [1980743.446578741],
        [-1108.7512905876065],
        [1712.947751407064],
        [2003064.4534490108],
        [1985100.3810670376],
        [1190.8404791812281],
        [384.00605312064],
        [1998592.0739881992],
        [1984490.1900614202],
        [2033.9754767678387],
        [-1028.0810855487362],
        [2001341.0813384056],
        [1971311.3271338642],
        [-943.92552991907],
        [-1079.3848229270661],
        [1996588.295421958],
        [2068619.2163415626],
        [2049516.5437491536],
        [2100795.4839400873],
        [2019540.6562294513],
        [1946283.046177674],
        [2066298.020782411],
        [1919714.12131235],
    ]
    assert same_iterables(frame.sum().to_list(), expected_sums)
def test_cols_colselector(dt0, tbl0):
    """
    Verify that selecting through a lambda "column selector" returns the
    referenced columns, for a single column, a list, and a dict.
    """
    # Single column returned from the lambda.
    one = dt0(select=lambda f: f.B)
    one.internal.check()
    assert one.shape == (6, 1)
    assert one.names == ("B", )
    assert not one.internal.isview
    assert as_list(one) == [tbl0[1]]

    # List of columns returned from the lambda.
    two = dt0(select=lambda f: [f.A, f.C])
    two.internal.check()
    assert two.shape == (6, 2)
    assert two.names == ("A", "C")
    assert not two.internal.isview
    assert as_list(two) == [tbl0[0], tbl0[2]]

    # Dict returned from the lambda: keys become the column names.
    renamed = dt0[lambda f: {"x": f.A, "y": f.D}]
    renamed.internal.check()
    assert renamed.shape == (6, 2)
    assert same_iterables(renamed.names, ("x", "y"))
    assert not renamed.internal.isview
def test_j_colselector3(dt0, tbl0):
    """Select columns A and D through a dict in ``j``, renamed to x and y."""
    selector = {"x": f.A, "y": f.D}
    renamed = dt0[:, selector]
    frame_integrity_check(renamed)
    assert renamed.shape == (6, 2)
    assert same_iterables(renamed.names, ("x", "y"))
    assert not isview(renamed)
def test_create_from_pandas(pandas):
    """Build a Frame from a pandas DataFrame; check shape and names."""
    source = pandas.DataFrame({"A": [2, 5, 8], "B": ["e", "r", "qq"]})
    frame = dt.Frame(source)
    frame.internal.check()
    assert frame.shape == (3, 2)
    assert same_iterables(frame.names, ("A", "B"))
def test_create_from_pandas_with_names(pandas):
    """Build a Frame from a pandas DataFrame, overriding the column names."""
    source = pandas.DataFrame({"A": [2, 5, 8], "B": ["e", "r", "qq"]})
    new_names = ["miniature", "miniscule"]
    frame = dt.Frame(source, names=new_names)
    frame_integrity_check(frame)
    assert frame.shape == (3, 2)
    assert same_iterables(frame.names, ("miniature", "miniscule"))
def test_j_colselector3(dt0, tbl0):
    """Select columns A and D through a dict in ``j``, renamed to x and y."""
    selector = {"x": f.A, "y": f.D}
    renamed = dt0[:, selector]
    renamed.internal.check()
    assert renamed.shape == (6, 2)
    assert same_iterables(renamed.names, ("x", "y"))
    assert not renamed.internal.isview