Пример #1
0
def test_cut_small():
    """Check `cut()` with per-column `nbins` on a small mixed-type frame.

    The input has boolean, integer, float and infinity-containing columns.
    `nbins` is supplied both as a list and as a tuple; the default call is
    compared against the "right" reference frame and the call made with
    `right_closed=False` against the "left" reference frame.
    """
    names = [
        "bool", "int_pos", "int_neg", "int",
        "float", "inf_max", "inf_min", "inf",
    ]
    bin_counts = [4, 2, 5, 4, 10, 3, 2, 5]

    source = dt.Frame(
        [
            [True, None, False, False, True, None],
            [3, None, 4, 1, 5, 4],
            [-5, -1, -1, -1, None, 0],
            [None, -5, -314, 0, 5, 314],
            [None, 1.4, 4.1, 1.5, 5.9, 1.4],
            [math.inf, 1.4, 4.1, 1.5, 5.9, 1.4],
            [-math.inf, 1.4, 4.1, 1.5, 5.9, 1.4],
            [-math.inf, 1.4, 4.1, math.inf, 5.9, 1.4],
        ],
        names=names,
    )

    # In both references the three infinity-containing columns are expected
    # to consist of missing values only.
    expected_right = dt.Frame(
        [
            [3, None, 0, 0, 3, None],
            [0, None, 1, 0, 1, 1],
            [0, 3, 3, 3, None, 4],
            [None, 1, 0, 1, 2, 3],
            [None, 0, 5, 0, 9, 0],
            [None] * source.nrows,
            [None] * source.nrows,
            [None] * source.nrows,
        ],
        names=names,
        stypes=[stype.int32] * source.ncols,
    )
    expected_left = dt.Frame(
        [
            [3, None, 0, 0, 3, None],
            [1, None, 1, 0, 1, 1],
            [0, 4, 4, 4, None, 4],
            [None, 1, 0, 2, 2, 3],
            [None, 0, 6, 0, 9, 0],
            [None] * source.nrows,
            [None] * source.nrows,
            [None] * source.nrows,
        ],
        names=names,
        stypes=[stype.int32] * source.ncols,
    )

    from_list = source[:, cut(source, nbins=bin_counts)]
    from_tuple = source[:, cut(source, nbins=tuple(bin_counts))]
    from_list_left = source[
        :, cut(source, nbins=bin_counts, right_closed=False)
    ]

    assert_equals(expected_right, from_list)
    assert_equals(expected_right, from_tuple)
    assert_equals(expected_left, from_list_left)
Пример #2
0
def test_cut_one_row():
    """Check `cut()` with per-column `nbins` on a frame that has one row."""
    bin_counts = [1, 2, 3, 4]
    single_row = dt.Frame([[True], [404], [3.1415926], [None]])

    right_expr = cut(single_row, nbins=bin_counts)
    right_result = single_row[:, right_expr]
    left_expr = cut(single_row, nbins=bin_counts, right_closed=False)
    left_result = single_row[:, left_expr]

    assert right_result.to_list() == [[0], [0], [1], [None]]
    assert left_result.to_list() == [[0], [1], [1], [None]]
Пример #3
0
def test_cut_error_inconsistent_bins():
    """Expect ValueError when a one-element `nbins` list meets a two-column frame."""
    two_columns = dt.Frame([[3, 1, 4], [1, 5, 9]])
    expected_message = (
        "When nbins is a list or a tuple, its length must be the same as "
        "the number of columns in the frame/expression, i.e. 2, instead got: 1"
    )
    # Here the error is expected from the `cut()` call itself, without
    # selecting from the frame.
    with pytest.raises(ValueError, match=expected_message):
        cut(two_columns, nbins=[10])
Пример #4
0
def test_cut_trivial_bins():
    """Check that binning 0..9 with edges -1..9 reproduces the input frame.

    The same pre-built cut expression is evaluated five times, and on every
    pass a freshly built `cut()` call is checked to be an `FExpr`.
    """
    values = dt.Frame({"data": range(10)})
    edges = dt.Frame({"bins": range(-1, 10)})
    reusable_expr = cut(f[:], bins=[edges])

    for _ in range(5):
        binned = values[:, reusable_expr]
        fresh_expr = cut(values, bins=[edges])
        assert isinstance(fresh_expr, FExpr)
        assert_equals(values, binned)
Пример #5
0
def test_cut_small_bins():
    """Check `cut()` with explicit per-column bin edges on a small frame.

    One single-column frame of edges is supplied for each input column, both
    as a list and as a tuple. The default call is compared against the
    "right" reference frame and the call made with `right_closed=False`
    against the "left" reference frame.
    """
    names = [
        "bool", "int_pos", "int_neg", "int",
        "float", "inf_max", "inf_min", "inf",
    ]
    edge_frames = [
        dt.Frame([-1, 0, 1, 2]),
        dt.Frame(range(10)),
        dt.Frame(range(-10, 0)),
        dt.Frame([-1000, 0, 314]),
        dt.Frame(range(10)),
        dt.Frame([0, 1.4, 2.8, 4.2, 5.6]),
        dt.Frame([0, 1.4, 2.8, 4.2, 5.6, 7.0]),
        dt.Frame([-5, 0, 15]),
    ]

    source = dt.Frame(
        [
            [True, None, False, False, True, None],
            [3, None, 4, 1, 5, 4],
            [-5, -1, -1, -1, None, 0],
            [None, -5, -314, 0, 5, 314],
            [None, 1.4, 4.1, 1.5, 5.9, 1.4],
            [math.inf, 1.4, 4.1, 1.5, 5.9, 1.4],
            [-math.inf, 1.4, -4.1, 1.5, 5.9, 1.4],
            [-math.inf, 1.4, 4.1, math.inf, 5.9, 1.4],
        ],
        names=names,
    )

    expected_right = dt.Frame(
        [
            [1, None, 0, 0, 1, None],
            [2, None, 3, 0, 4, 3],
            [4, 8, 8, 8, None, None],
            [None, 0, 0, 0, 1, 1],
            [None, 1, 4, 1, 5, 1],
            [None, 0, 2, 1, None, 0],
            [None, 0, None, 1, 4, 0],
            [None, 1, 1, None, 1, 1],
        ],
        names=names,
        stypes=[stype.int32] * source.ncols,
    )
    expected_left = dt.Frame(
        [
            [2, None, 1, 1, 2, None],
            [3, None, 4, 1, 5, 4],
            [5, None, None, None, None, None],
            [None, 0, 0, 1, 1, None],
            [None, 1, 4, 1, 5, 1],
            [None, 1, 2, 1, None, 1],
            [None, 1, None, 1, 4, 1],
            [None, 1, 1, None, 1, 1],
        ],
        names=names,
        stypes=[stype.int32] * source.ncols,
    )

    from_list = source[:, cut(source, bins=edge_frames)]
    from_tuple = source[:, cut(source, bins=tuple(edge_frames))]
    from_list_left = source[
        :, cut(source, bins=edge_frames, right_closed=False)
    ]

    assert_equals(expected_right, from_list)
    assert_equals(expected_right, from_tuple)
    assert_equals(expected_left, from_list_left)
Пример #6
0
def test_cut_one_row_bins():
    """Check `cut()` with explicit bin edges on a frame that has one row."""
    single_row = dt.Frame([[True], [404], [3.1415926], [None]])
    edge_frames = [
        dt.Frame([0, 1]),
        dt.Frame(range(1000)),
        dt.Frame([-100, 3.1415926, 100]),
        dt.Frame(range(5)),
    ]

    right_result = single_row[:, cut(single_row, bins=edge_frames)]
    left_result = single_row[
        :, cut(single_row, bins=edge_frames, right_closed=False)
    ]

    assert right_result.to_list() == [[0], [403], [0], [None]]
    assert left_result.to_list() == [[None], [404], [1], [None]]
Пример #7
0
def test_cut_pandas_issue_35126(pandas):
    """Pin `cut()` results for the symmetric data [-97, 0, 97] with 42 bins.

    The same input is then passed to `pandas.cut`, and its result is pinned
    as well to document how it relates to the datatable results.
    """
    values = [-97, 0, 97]
    bin_count = 42
    frame = dt.Frame(values)

    right_result = frame[:, cut(frame, nbins=bin_count)]
    left_result = frame[:, cut(frame, nbins=bin_count, right_closed=False)]
    assert right_result.to_list() == [[0, 20, 41]]
    assert left_result.to_list() == [[0, 21, 41]]

    # Pandas, called without `right`, is expected to place 0 into bin 21,
    # i.e. it disagrees with the default datatable result checked above.
    pandas_codes = pandas.cut(values, bin_count, labels=False)
    assert list(pandas_codes) == [0, 21, 41]
Пример #8
0
def test_cut_vs_pandas_random_bins(pandas, seed):
    """Compare `cut()` with explicit bin edges against `pandas.cut`.

    A seeded random scenario is generated: three sorted lists of distinct
    integer edges, and a boolean, an integer and a float column of random
    length. Both libraries bin each column with its own edges and the same
    randomly chosen closed side.
    """
    random.seed(seed)
    edge_count_limit = 20
    row_count_limit = 20
    value_limit = 100

    # The sequence of `random` calls below is kept in a fixed order so that
    # a given seed always yields the same scenario.
    n_rows = random.randint(1, row_count_limit)
    right_closed = bool(random.getrandbits(1))

    edge_lists = []
    edge_frames = []
    for _ in range(3):
        n_edges = random.randint(2, edge_count_limit)
        edges = sorted(
            random.sample(range(-value_limit, value_limit), n_edges)
        )
        edge_lists.append(edges)
        edge_frames.append(dt.Frame(edges))

    bool_col, int_col, float_col = [], [], []
    for _ in range(n_rows):
        bool_col.append(random.randint(0, 1))
        int_col.append(random.randint(-value_limit, value_limit))
        float_col.append(random.random() * 2 * value_limit - value_limit)
    columns = [bool_col, int_col, float_col]

    frame = dt.Frame(
        columns, stypes=[stype.bool8, stype.int32, stype.float64]
    )
    frame_binned = frame[
        :, cut(frame, bins=edge_frames, right_closed=right_closed)
    ]

    expected = []
    for values, edges in zip(columns, edge_lists):
        codes = pandas.cut(values, edges, labels=False, right=right_closed)
        # Replace `nan`s in the pandas output with `None` before comparing
        expected.append(
            [None if math.isnan(code) else code for code in list(codes)]
        )

    assert expected == frame_binned.to_list()
Пример #9
0
def test_cut_vs_pandas_random(pandas, seed):
    """Compare `cut()` with per-column `nbins` against `pandas.cut`.

    A seeded random scenario is generated: a boolean, an integer and a float
    column of random length, a random number of bins for each of them, and a
    randomly chosen closed side. Both libraries must yield the same bin ids.
    """
    random.seed(seed)
    size_limit = 20
    value_limit = 100

    # The sequence of `random` calls below is kept in a fixed order so that
    # a given seed always yields the same scenario.
    n_rows = random.randint(1, size_limit)
    bin_counts = [random.randint(1, size_limit) for _ in range(3)]
    right_closed = bool(random.getrandbits(1))

    bool_col, int_col, float_col = [], [], []
    for _ in range(n_rows):
        bool_col.append(random.randint(0, 1))
        int_col.append(random.randint(-value_limit, value_limit))
        float_col.append(random.random() * 2 * value_limit - value_limit)
    columns = [bool_col, int_col, float_col]

    frame = dt.Frame(
        columns, stypes=[stype.bool8, stype.int32, stype.float64]
    )
    frame_binned = frame[
        :, cut(frame, nbins=bin_counts, right_closed=right_closed)
    ]

    pandas_binned = []
    for values, bin_count in zip(columns, bin_counts):
        codes = pandas.cut(
            values, bin_count, labels=False, right=right_closed
        )
        pandas_binned.append(codes)

    assert [list(codes) for codes in pandas_binned] == frame_binned.to_list()
Пример #10
0
def test_cut_error_inconsistent_nbins():
    """Expect ValueError when three `nbins` values meet a two-column frame."""
    two_columns = dt.Frame([[3, 1, 4], [1, 5, 9]])
    expected_message = (
        "When nbins has more than one element, its length must be the same as "
        "the number of columns in the frame/expression, i.e. 2, instead got: 3"
    )
    with pytest.raises(ValueError, match=expected_message):
        two_columns[:, cut(two_columns, nbins=[10, 11, 12])]
Пример #11
0
def test_cut_error_noniterable_bins():
    """Expect TypeError when `bins` is given as a float."""
    frame = dt.Frame(range(10))
    expected_message = (
        "bins parameter must be a list or a tuple, instead got <class 'float'>"
    )
    with pytest.raises(TypeError, match=expected_message):
        frame[:, cut(frame, bins=1.5)]
Пример #12
0
def test_cut_expr():
    """Check that nested `cut()` calls can be applied to a column expression.

    The difference of the two columns is 0..9, and cutting it twice is
    expected to give back 0..9.
    """
    frame = dt.Frame([range(0, 30, 3), range(0, 20, 2)])
    difference = f[0] - f[1]
    binned_twice = frame[:, cut(cut(difference))]
    assert_equals(dt.Frame(range(10)), binned_twice)
Пример #13
0
def test_cut_trivial():
    """Check that `cut()` with default arguments reproduces a 0..9 column.

    Also checks that calling `cut()` on a frame returns an `FExpr`.
    """
    frame = dt.Frame({"trivial": range(10)})
    binned = frame[:, cut(f[:])]
    frame_expr = cut(frame)

    assert isinstance(frame_expr, FExpr)
    assert_equals(frame, binned)
Пример #14
0
def test_cut_empty_frame():
    """Check that `cut()` applied to an empty frame returns an `FExpr`."""
    empty = dt.Frame()
    empty_expr = cut(empty)
    assert isinstance(empty_expr, FExpr)
    # NOTE(review): the comparison below selects `f[:]`, not the cut
    # expression, so `cut()` itself is never evaluated here — confirm whether
    # selecting with the cut expression was intended.
    selected = empty[:, f[:]]
    assert_equals(selected, empty)
Пример #15
0
def test_cut_error_wrong_right():
    """Expect TypeError when `right_closed` is an integer rather than a bool."""
    frame = dt.Frame(range(10))
    expected_message = "Expected a boolean, instead got <class 'int'>"
    # Here the error is expected from the `cut()` call itself, without
    # selecting from the frame.
    with pytest.raises(TypeError, match=expected_message):
        cut(frame, right_closed=1492)
Пример #16
0
def test_cut_error_negative_nbins_list():
    """Expect ValueError naming `nbins[0]` when `nbins` is `[0, -1]`."""
    two_columns = dt.Frame([[3, 1, 4], [1, 5, 9]])
    expected_pattern = (
        r"All elements in nbins must be positive, got nbins\[0\]: 0"
    )
    with pytest.raises(ValueError, match=expected_pattern):
        two_columns[:, cut(two_columns, nbins=[0, -1])]
Пример #17
0
def test_cut_error_float_nbins():
    """Expect TypeError when `nbins` is given as a float."""
    frame = dt.Frame(range(10))
    expected_message = "Expected an integer, instead got <class 'float'>"
    with pytest.raises(TypeError, match=expected_message):
        frame[:, cut(frame, nbins=1.5)]
Пример #18
0
def test_cut_error_wrong_column_type():
    """Expect TypeError pointing at column 1 when that column holds strings."""
    expected_pattern = (
        r"cut\(\) can only be applied to numeric columns, instead column 1 "
        "has an stype: str32"
    )
    mixed = dt.Frame([[1, 0], ["1", "0"]])
    with pytest.raises(TypeError, match=expected_pattern):
        mixed[:, cut(mixed)]
Пример #19
0
def test_cut_empty_frame():
    """Check that the result of `cut()` on an empty frame equals an empty frame."""
    result = cut(dt.Frame())
    expected = dt.Frame()
    assert_equals(result, expected)
Пример #20
0
def test_cut_error_inconsistent_bins():
    """Expect ValueError when one `bins` frame meets a two-column frame."""
    two_columns = dt.Frame([[3, 1, 4], [1, 5, 9]])
    expected_message = (
        "Number of elements in bins must be equal to the number of columns "
        "in the frame/expression, i.e. 2, instead got: 1"
    )
    with pytest.raises(ValueError, match=expected_message):
        two_columns[:, cut(two_columns, bins=[dt.Frame([1, 2])])]
Пример #21
0
def test_cut_error_string_bins():
    """Expect TypeError when `bins` is given as a string."""
    frame = dt.Frame(range(10))
    expected_message = (
        "bins parameter must be a list or a tuple, instead got <class 'str'>"
    )
    with pytest.raises(TypeError, match=expected_message):
        frame[:, cut(frame, bins="bin1")]
Пример #22
0
def test_cut_error_one_bin_edge():
    """Expect ValueError when a `bins` frame contains a single edge."""
    frame = dt.Frame(range(10))
    expected_message = (
        "To bin data at least two edges are required, "
        "instead for the frame 0 got: 1"
    )
    with pytest.raises(ValueError, match=expected_message):
        frame[:, cut(frame, bins=[dt.Frame([1])])]
Пример #23
0
def test_cut_error_noargs():
    """Expect TypeError when `cut()` is called without any arguments."""
    expected_pattern = (
        r"Function datatable\.cut\(\) requires exactly 1 positional "
        r"argument, but none were given"
    )
    with pytest.raises(TypeError, match=expected_pattern):
        cut()
Пример #24
0
def test_cut_error_none_bin_edge():
    """Expect ValueError naming row 2 when a bin edge there is `None`."""
    frame = dt.Frame(range(10))
    expected_message = (
        "Bin edges must be numeric values only, "
        "instead for the frame 0 got None at row 2"
    )
    with pytest.raises(ValueError, match=expected_message):
        frame[:, cut(frame, bins=[dt.Frame([1, 2, None, 3])])]
Пример #25
0
def test_cut_error_wrong_column_type_zero_rows():
    """Expect TypeError for a str32 column even when the frame has no rows."""
    expected_pattern = (
        r"cut\(\) can only be applied to numeric columns, instead column 0 "
        "has an stype: str32"
    )
    no_rows = dt.Frame(str=[] / dt.str32)
    with pytest.raises(TypeError, match=expected_pattern):
        no_rows[:, cut(no_rows)]
Пример #26
0
def test_cut_error_bin_edges_not_increasing():
    """Expect ValueError naming rows 2 and 3 when the edges decrease there."""
    frame = dt.Frame(range(10))
    expected_message = (
        "Bin edges must be strictly increasing, instead for the frame 0 "
        "at rows 2 and 3 the values are 4 and 3.99"
    )
    with pytest.raises(ValueError, match=expected_message):
        frame[:, cut(frame, bins=[dt.Frame([1, 2, 4.0, 3.99])])]
Пример #27
0
def test_cut_error_negative_nbins():
    """Expect ValueError when `nbins` is a negative integer."""
    frame = dt.Frame(range(10))
    expected_message = "Number of bins must be positive, instead got: -10"
    with pytest.raises(ValueError, match=expected_message):
        frame[:, cut(frame, nbins=-10)]
Пример #28
0
def test_cut_error_groupby():
    """Expect NotImplementedError when `cut()` is combined with a third selector.

    The expected message states that `cut()` cannot be used in a groupby
    context.
    """
    frame = dt.Frame(range(10))
    expected_pattern = r"cut\(\) cannot be used in a groupby context"
    with pytest.raises(NotImplementedError, match=expected_pattern):
        frame[:, cut(f[0]), f[0]]
Пример #29
0
def test_cut_error_wrong_right():
    """Expect TypeError when `right_closed` is an integer rather than a bool."""
    frame = dt.Frame(range(10))
    expected_pattern = (
        r"Argument right_closed in function datatable\.cut\(\) should "
        r"be a boolean, instead got <class 'int'>"
    )
    with pytest.raises(TypeError, match=expected_pattern):
        frame[:, cut(frame, right_closed=1492)]
Пример #30
0
def test_cut_error_zero_bins():
    """Expect ValueError when `nbins` is zero."""
    frame = dt.Frame(range(10))
    expected_message = "Number of bins must be positive, instead got: 0"
    # Here the error is expected from the `cut()` call itself, without
    # selecting from the frame.
    with pytest.raises(ValueError, match=expected_message):
        cut(frame, nbins=0)