示例#1
0
def test_label_encode_drop_one():
    """Label-encode a column after removing one category at random.

    Rows whose value is not among the remaining categories are expected
    to come back with the label -1.
    """
    random.seed(0)
    np.random.seed(0)

    # single-column frame of random int32 values in [0, 7)
    df = DataFrame()
    df['cats'] = np.random.randint(7, size=10, dtype=np.int32)

    # every distinct value of the column, minus one picked at random
    kept = list(df['cats'].unique())
    kept.pop(random.randrange(len(kept)))

    # expected label of a kept category is its position in `kept`
    expected = {cat: pos for pos, cat in enumerate(kept)}

    # encode through the series API and fetch the labels as an array
    encoded = df['cats'].label_encoding(cats=kept, dtype='float32')
    labels = encoded.to_array()

    # check each row; -1 is assumed to be the missing-value label
    for row, label in enumerate(labels):
        np.testing.assert_equal(label, expected.get(df.cats[row], -1))

    # encode through the data frame API
    encoded_df = df.label_encoding(
        column='cats',
        prefix='cats',
        cats=kept,
        dtype='float32',
    )

    # the source column comes first, followed by the new label column
    assert encoded_df.columns[0] == 'cats'
    assert encoded_df.columns[1] == 'cats_labels'
示例#2
0
def test_label_encode_float_output():
    """Label-encode with a float32 output dtype and NaN as the sentinel.

    The encoded column is compared against a reference array built on the
    host from the same input values.
    """
    random.seed(0)
    np.random.seed(0)

    df = DataFrame()

    # keep the host array so the reference encoding can be built from it
    arr = np.random.randint(7, size=10, dtype=np.int32)
    df['cats'] = arr

    # only these values get a label; the label is the position in the list
    cats = [1, 2, 3, 4]
    encoder = dict(zip(cats, range(len(cats))))

    encoded_df = df.label_encoding(
        column='cats',
        prefix='cats',
        cats=cats,
        dtype=np.float32,
        na_sentinel=np.nan,
    )
    got = encoded_df['cats_labels'].to_array()

    # reference result: NaN wherever the value is not one of `cats`
    reference = [encoder.get(value, np.nan) for value in arr]
    handcoded = np.array(reference)
    np.testing.assert_equal(got, handcoded)
示例#3
0
def test_label_encode(nelem, dtype):
    """Label-encode a column using all of its own distinct values.

    Parameters
    ----------
    nelem, dtype
        Forwarded unchanged to ``_random`` to generate the input column.
    """
    df = DataFrame()
    np.random.seed(0)

    # populate the frame with generated data
    df['cats'] = _random(nelem, dtype)

    # expected label of each distinct value is its position in `unique()`
    categories = df['cats'].unique()
    expected = {cat: code for code, cat in enumerate(categories)}

    # encode through the series API and fetch the labels as an array
    encoded = df['cats'].label_encoding(cats=categories)
    labels = encoded.to_array()

    # each row's label must match the expected mapping
    for row, label in enumerate(labels):
        np.testing.assert_equal(label, expected.get(df.cats[row]))

    # encode through the data frame API
    encoded_df = df.label_encoding(
        column='cats',
        prefix='cats',
        cats=categories,
    )

    # the source column comes first, followed by the new label column
    assert encoded_df.columns[0] == 'cats'
    assert encoded_df.columns[1] == 'cats_labels'