Exemplo n.º 1
0
def test_label_encode_drop_one():
    """Check label encoding when one observed value is left out of ``cats``.

    A column of ten random int32 values drawn from ``[0, 7)`` is encoded
    against its own distinct values with one randomly chosen entry removed.
    Every encoded element must equal the position of its source value in
    the category list, or ``-1`` when the source value is the removed one.
    The frame-level call must yield ``"cats"`` and ``"cats_labels"`` as its
    first two columns.
    """
    np.random.seed(0)
    random.seed(0)

    frame = DataFrame()
    frame["cats"] = np.random.randint(7, size=10, dtype=np.int32)

    # Candidate categories: every distinct value minus one picked at random.
    categories = list(frame["cats"].unique())
    categories.pop(random.randrange(len(categories)))

    expected_label = {cat: pos for pos, cat in enumerate(categories)}

    # Series-level encoding.
    encoded = frame["cats"].label_encoding(cats=categories, dtype="float32")
    labels = encoded.to_array()

    # The dropped value is expected to be encoded as -1 (assumed sentinel
    # for values that are not among the supplied categories).
    for idx, label in enumerate(labels):
        np.testing.assert_equal(
            label, expected_label.get(frame.cats[idx], -1)
        )

    # Frame-level encoding: original column first, label column second.
    encoded_frame = frame.label_encoding(
        column="cats", prefix="cats", cats=categories, dtype="float32"
    )
    out_columns = encoded_frame.columns

    assert out_columns[0] == "cats"
    assert out_columns[1] == "cats_labels"
Exemplo n.º 2
0
def test_label_encode_float_output():
    """Check frame-level label encoding with a float dtype and NaN sentinel.

    Ten random int32 values drawn from ``[0, 7)`` are encoded against the
    fixed categories ``[1, 2, 3, 4]`` with ``dtype=np.float32`` and
    ``na_sentinel=np.nan``. The resulting ``"cats_labels"`` column must
    match a hand-built array holding each value's position in the category
    list, or NaN for values outside it.
    """
    np.random.seed(0)
    random.seed(0)

    source = np.random.randint(7, size=10, dtype=np.int32)
    frame = DataFrame()
    frame["cats"] = source

    categories = [1, 2, 3, 4]
    code_of = dict(zip(categories, range(len(categories))))

    encoded_frame = frame.label_encoding(
        column="cats",
        prefix="cats",
        cats=categories,
        dtype=np.float32,
        na_sentinel=np.nan,
    )
    actual = encoded_frame["cats_labels"].to_array(fillna="pandas")

    # Reference result computed in plain Python from the same source values.
    expected = np.array([code_of.get(value, np.nan) for value in source])
    np.testing.assert_equal(actual, expected)
Exemplo n.º 3
0
def test_label_encode(nelem, dtype):
    """Check label encoding when every observed value is a category.

    The column is filled by ``_random(nelem, dtype)`` (helper defined
    outside this block; presumably ``nelem`` values of ``dtype`` -- confirm)
    and encoded against its own distinct values. Every encoded element
    must equal the position of its source value among those distinct
    values. The frame-level call must yield ``"cats"`` and
    ``"cats_labels"`` as its first two columns.

    ``nelem`` and ``dtype`` are supplied by the caller (presumably a pytest
    parametrization that is not visible here).
    """
    frame = DataFrame()
    np.random.seed(0)

    frame["cats"] = _random(nelem, dtype)
    categories = frame["cats"].unique()
    expected_label = {cat: pos for pos, cat in enumerate(categories)}

    # Series-level encoding.
    encoded = frame["cats"].label_encoding(cats=categories)
    labels = encoded.to_array()

    # Each element must carry the index of its value in `categories`.
    for idx, label in enumerate(labels):
        np.testing.assert_equal(label, expected_label.get(frame.cats[idx]))

    # Frame-level encoding: original column first, label column second.
    encoded_frame = frame.label_encoding(
        column="cats", prefix="cats", cats=categories
    )
    out_columns = encoded_frame.columns

    assert out_columns[0] == "cats"
    assert out_columns[1] == "cats_labels"