示例#1
0
def test_reading_arrow_sparse_data():
    schema, darr = read_data()
    gar = GpuArrowReader(schema, darr)

    df = DataFrame(gar.to_dict().items())

    # preprocessing
    num_cols = set()
    cat_cols = set()
    response_set = set(['INCEARN '])
    feature_names = set(df.columns) - response_set

    # Determine cat and numeric columns
    uniques = {}
    for k in feature_names:
        try:
            uniquevals = df[k].unique()
            uniques[k] = uniquevals
        except ValueError:
            num_cols.add(k)
        else:
            nunique = len(uniquevals)
            if nunique < 2:
                del df[k]
            elif 1 < nunique < 1000:
                cat_cols.add(k)
            else:
                num_cols.add(k)

    # Fix numeric columns
    for k in (num_cols - response_set):
        df[k] = df[k].fillna(df[k].mean())
        assert df[k].null_count == 0
        std = df[k].std()
        # drop near constant columns
        if not np.isfinite(std) or std < 1e-4:
            del df[k]
            print('drop near constant', k)
        else:
            df[k] = df[k].scale()

    # Expand categorical columns
    for k in cat_cols:
        cats = uniques[k][1:]  # drop first
        df = df.one_hot_encoding(k, prefix=k, cats=cats)
        del df[k]

    # Print dtypes
    assert {df[k].dtype for k in df.columns} == {np.dtype('float64')}

    mat = df.as_matrix()

    assert mat.max() == 1
    assert mat.min() == 0
示例#2
0
def test_dataframe_basic():
    np.random.seed(0)
    df = DataFrame()

    # Populate with cuda memory
    df['keys'] = rmm.to_device(np.arange(10, dtype=np.float64))
    np.testing.assert_equal(df['keys'].to_array(), np.arange(10))
    assert len(df) == 10

    # Populate with numpy array
    rnd_vals = np.random.random(10)
    df['vals'] = rnd_vals
    np.testing.assert_equal(df['vals'].to_array(), rnd_vals)
    assert len(df) == 10
    assert tuple(df.columns) == ('keys', 'vals')

    # Make another dataframe
    df2 = DataFrame()
    df2['keys'] = np.array([123], dtype=np.float64)
    df2['vals'] = np.array([321], dtype=np.float64)

    # Concat
    df = gd.concat([df, df2])
    assert len(df) == 11

    hkeys = np.asarray(np.arange(10, dtype=np.float64).tolist() + [123])
    hvals = np.asarray(rnd_vals.tolist() + [321])

    np.testing.assert_equal(df['keys'].to_array(), hkeys)
    np.testing.assert_equal(df['vals'].to_array(), hvals)

    # As matrix
    mat = df.as_matrix()

    expect = np.vstack([hkeys, hvals]).T

    print(expect)
    print(mat)
    np.testing.assert_equal(mat, expect)