예제 #1
0
def test_dataframe_join_cats():
    """Join two frames indexed by a categorical column; compare with pandas.

    Both sides are built with the categories ``'a'``, ``'b'``, ``'c'`` as
    their index.  The joined result is compared against the pandas join of
    the converted frames, followed by a few coarse sanity checks on the
    result itself.
    """
    lhs = DataFrame()
    lhs['a'] = pd.Categorical(list('aababcabbc'), categories=list('abc'))
    lhs['b'] = bb = np.arange(len(lhs))
    lhs = lhs.set_index('a')

    rhs = DataFrame()
    rhs['a'] = pd.Categorical(list('abcac'), categories=list('abc'))
    rhs['c'] = cc = np.arange(len(rhs))
    rhs = rhs.set_index('a')

    got = lhs.join(rhs)
    expect = lhs.to_pandas().join(rhs.to_pandas())

    # Note: pandas makes an object Index after joining, so the index is
    # dropped on both sides and only the column data is compared.
    # ``pd.testing`` is the public location of ``assert_frame_equal``;
    # ``pd.util.testing`` is deprecated.
    pd.testing.assert_frame_equal(
        got.sort_values(by='b')
        .to_pandas()
        .sort_index()
        .reset_index(drop=True),
        expect.reset_index(drop=True))

    # Just do some rough checking here.
    assert list(got.columns) == ['b', 'c']
    assert len(got) > 0
    assert set(got.index.values) & set('abc')
    assert set(got['b']) & set(bb)
    assert set(got['c']) & set(cc)
예제 #2
0
def test_dataframe_join_suffix():
    """Check suffix handling of a sorted join with overlapping columns.

    The frame is joined with itself using two different index columns, so
    column ``'b'`` appears on both sides.  Without suffixes the join must
    raise ``ValueError``; with suffixes the result must match pandas.
    """
    np.random.seed(0)

    gdf = DataFrame()
    for name in 'abc':
        gdf[name] = np.random.randint(0, 5, 5)

    lhs = gdf.set_index('a')
    rhs = gdf.set_index('c')

    # No suffixes given: the shared column name must be rejected.
    with pytest.raises(ValueError) as excinfo:
        lhs.join(rhs)
    excinfo.match("there are overlapping columns but lsuffix"
                  " and rsuffix are not defined")

    suffixes = dict(lsuffix='_left', rsuffix='_right')
    got = lhs.join(rhs, sort=True, **suffixes)

    # Reference result from pandas
    pdf = gdf.to_pandas()
    expect = pdf.set_index('a').join(pdf.set_index('c'), **suffixes)

    # Columns, index and every column's data must agree
    assert list(expect.columns) == list(got.columns)
    assert np.all(expect.index.values == got.index.values)
    for name in expect.columns:
        _check_series(expect[name], got[name])
예제 #3
0
def test_dataframe_join_how(aa, bb, how, method):
    """Join a frame with itself on two different indexes; compare to pandas.

    Parameters
    ----------
    aa, bb
        Values stored in columns ``'a'`` and ``'b'`` of the frame under test.
    how
        Join type, passed as ``how=`` to both the pandas and the GPU join.
    method
        Passed as ``method=`` to the GPU join only.

    The parameters are presumably supplied by a parametrize decorator or
    fixtures outside this block -- confirm against the surrounding file.
    """
    df = DataFrame()
    df['a'] = aa
    df['b'] = bb

    def work_pandas(df):
        # Time and run the reference join on the pandas frame.
        ts = timer()
        df1 = df.set_index('a')
        df2 = df.set_index('b')
        joined = df1.join(df2, how=how, sort=True)
        te = timer()
        print('timing', type(df), te - ts)
        return joined

    def work_gdf(df):
        # Same join on the GPU frame, additionally forwarding ``method``.
        ts = timer()
        df1 = df.set_index('a')
        df2 = df.set_index('b')
        joined = df1.join(df2, how=how, sort=True, method=method)
        te = timer()
        print('timing', type(df), te - ts)
        return joined

    expect = work_pandas(df.to_pandas())
    got = work_gdf(df)
    # Copies taken before the type conversion; used by the 'outer' branch.
    expecto = expect.copy()
    goto = got.copy()

    # Type conversion to handle NoneType: cast both value columns to float64
    # with NaN fill on each side.  Processing 'b' then 'a' keeps the GPU
    # frame's column order at ['b', 'a'] after the drop/add round trips.
    # The pandas columns are reassigned in place; the previous
    # ``expect.drop(...)`` calls discarded their result and did nothing.
    for name in ('b', 'a'):
        converted = got[name].astype(np.float64).fillna(np.nan)
        got.drop_column(name)
        got.add_column(name, converted)
        expect[name] = expect[name].astype(np.float64).fillna(np.nan)

    assert list(expect.columns) == list(got.columns)
    assert np.all(expect.index.values == got.index.values)
    if how != 'outer':
        # Row order is normalised by sorting on both columns before comparing.
        pd.testing.assert_frame_equal(
            got.to_pandas().sort_values(['b', 'a']).reset_index(drop=True),
            expect.sort_values(['b', 'a']).reset_index(drop=True))
    else:
        # Outer joins are checked column by column on the unconverted copies.
        _check_series(expecto['b'], goto['b'])
        _check_series(expecto['a'], goto['a'])
예제 #4
0
def test_dataframe_append_to_empty():
    """Add an empty column, then a three-element column; compare to pandas."""

    def populate(frame):
        # Same two assignments for either frame type: empty first, data next.
        frame['a'] = []
        frame['b'] = [1, 2, 3]
        return frame

    expected = populate(pd.DataFrame())
    actual = populate(DataFrame())

    pd.testing.assert_frame_equal(actual.to_pandas(), expected)
예제 #5
0
def test_dataframe_multi_column_join():
    """Left-merge two random frames on two key columns; compare to pandas.

    Row order of the GPU result is not checked: it is sorted by every
    column before the comparison.
    """
    np.random.seed(0)
    nelem = 500

    # Make GDF
    df_left = DataFrame()
    df_left['key1'] = np.random.randint(0, 30, nelem)
    df_left['key2'] = np.random.randint(0, 50, nelem)
    df_left['val1'] = np.arange(nelem)

    df_right = DataFrame()
    df_right['key1'] = np.random.randint(0, 30, nelem)
    df_right['key2'] = np.random.randint(0, 50, nelem)
    df_right['val1'] = np.arange(nelem)

    # Make pandas DF
    pddf_left = df_left.to_pandas()
    pddf_right = df_right.to_pandas()

    # Expected result
    pddf_joined = pddf_left.merge(pddf_right, on=['key1', 'key2'], how='left',
                                  sort=True)

    # Test (doesn't check for ordering)
    join_result = df_left.merge(df_right, on=['key1', 'key2'], how='left')

    # Columns whose name contains '_y' are cast to float64 with NaN fill so
    # their dtype lines up with the pandas result (presumably these are the
    # right-hand columns, which can hold missing values after a left merge).
    for col in pddf_joined.columns:
        if '_y' in col:
            join_result[col] = (join_result[col]
                                .astype(np.float64)
                                .fillna(np.nan))

    # ``pd.testing`` is the public home of ``assert_frame_equal``;
    # ``pd.util.testing`` is deprecated.
    pd.testing.assert_frame_equal(
        join_result
        .to_pandas()
        .sort_values(list(pddf_joined.columns))
        .reset_index(drop=True),
        pddf_joined)
예제 #6
0
def test_df_cat_sort_index():
    """Sort a frame by its categorical index and compare with pandas."""
    gdf = DataFrame()
    gdf['a'] = pd.Categorical(list('aababcabbc'), categories=list('abc'))
    gdf['b'] = np.arange(len(gdf))

    got = gdf.set_index('a').sort_index()
    expect = gdf.to_pandas().set_index('a').sort_index()

    expect_index = expect.index.values
    got_index = got.index.values

    # Column names first, then the index (checked both as plain lists and
    # as arrays), then the payload column.
    assert list(expect.columns) == list(got.columns)
    assert list(expect_index) == list(got_index)
    np.testing.assert_array_equal(expect_index, got_index)
    np.testing.assert_array_equal(expect['b'].values, got['b'].to_array())
예제 #7
0
def test_to_pandas():
    """Converting to pandas keeps column names, dtypes and lengths."""
    gdf = DataFrame()
    gdf['a'] = np.arange(10, dtype=np.int32)
    gdf['b'] = np.arange(10, 20, dtype=np.float64)

    converted = gdf.to_pandas()

    assert tuple(gdf.columns) == tuple(converted.columns)

    names = ('a', 'b')

    # dtypes are checked for every column before any length is checked
    for name in names:
        assert gdf[name].dtype == converted[name].dtype

    for name in names:
        assert len(gdf[name]) == len(converted[name])
예제 #8
0
def test_dataframe_masked_slicing(nelem, slice_start, slice_end):
    """Slicing a masked frame must match slicing its pandas conversion.

    Both columns receive a random bitmask; the same ``slice_start:slice_end``
    window is then applied before and after conversion to pandas.
    """
    gdf = DataFrame()
    gdf['a'] = list(range(nelem))
    gdf['b'] = list(range(nelem, 2 * nelem))
    for name in ('a', 'b'):
        gdf[name] = gdf[name].set_mask(utils.random_bitmask(nelem))

    window = slice(slice_start, slice_end)

    expect = gdf.to_pandas()[window]
    got = gdf[window].to_pandas()

    pd.testing.assert_frame_equal(expect, got)
예제 #9
0
def test_to_pandas():
    """Check that ``to_pandas`` preserves column names, dtypes and sizes."""
    source = DataFrame()
    source['a'] = np.arange(10, dtype=np.int32)
    source['b'] = np.arange(10, 20, dtype=np.float64)

    result = source.to_pandas()

    assert tuple(source.columns) == tuple(result.columns)

    # All dtype checks run before the length checks
    for col in ('a', 'b'):
        assert source[col].dtype == result[col].dtype

    for col in ('a', 'b'):
        assert len(source[col]) == len(result[col])
예제 #10
0
def test_groupby_apply():
    """Apply a per-group function after a two-key groupby; compare to pandas.

    The applied function adds an ``'out'`` column holding ``val1 + val2``.
    The pandas result is sorted by the keys and re-indexed before the
    comparison; the GPU result is compared as returned.
    """
    np.random.seed(0)
    df = DataFrame()
    nelem = 20
    df['key1'] = np.random.randint(0, 3, nelem)
    df['key2'] = np.random.randint(0, 2, nelem)
    df['val1'] = np.random.random(nelem)
    df['val2'] = np.random.random(nelem)

    expect_grpby = df.to_pandas().groupby(['key1', 'key2'], as_index=False)
    got_grpby = df.groupby(['key1', 'key2'])

    def foo(df):
        # Runs once per group on either library's group frame.
        df['out'] = df['val1'] + df['val2']
        return df

    expect = expect_grpby.apply(foo)
    expect = expect.sort_values(['key1', 'key2']).reset_index(drop=True)

    got = got_grpby.apply(foo).to_pandas()
    # ``pd.testing`` is the public home of ``assert_frame_equal``;
    # ``pd.util.testing`` is deprecated.
    pd.testing.assert_frame_equal(expect, got)
예제 #11
0
def test_groupby_apply_grouped():
    """Run a CUDA function through ``apply_grouped``; compare to pandas.

    The device function writes two output columns: ``com1`` as
    ``key1 * 10000 + val1`` and ``com2`` as the row position ``i``.  The
    expected frame is built by emulating the same computation per group in
    pandas.
    """
    from numba import cuda

    np.random.seed(0)
    df = DataFrame()
    nelem = 20
    df['key1'] = np.random.randint(0, 3, nelem)
    df['key2'] = np.random.randint(0, 2, nelem)
    df['val1'] = np.random.random(nelem)
    df['val2'] = np.random.random(nelem)

    expect_grpby = df.to_pandas().groupby(['key1', 'key2'], as_index=False)
    got_grpby = df.groupby(['key1', 'key2'])

    def foo(key1, val1, com1, com2):
        # Each thread starts at its own thread index and advances by the
        # block width until the end of ``key1`` is reached.
        for i in range(cuda.threadIdx.x, len(key1), cuda.blockDim.x):
            com1[i] = key1[i] * 10000 + val1[i]
            com2[i] = i

    got = got_grpby.apply_grouped(foo,
                                  incols=['key1', 'val1'],
                                  outcols={
                                      'com1': np.float64,
                                      'com2': np.int32
                                  },
                                  tpb=8)

    got = got.to_pandas()

    # Get expected result by emulating the operation in pandas
    def emulate(df):
        df['com1'] = df.key1 * 10000 + df.val1
        df['com2'] = np.arange(len(df), dtype=np.int32)
        return df

    expect = expect_grpby.apply(emulate)
    expect = expect.sort_values(['key1', 'key2']).reset_index(drop=True)

    # ``pd.testing`` is the public home of ``assert_frame_equal``;
    # ``pd.util.testing`` is deprecated.
    pd.testing.assert_frame_equal(expect, got)
예제 #12
0
def test_dataframe_join_how(aa, bb, how):
    """Join a frame with itself on two indexes and compare with pandas.

    ``aa`` and ``bb`` fill columns ``'a'`` and ``'b'``; ``how`` is the join
    type handed to both libraries.
    """
    frame = DataFrame()
    frame['a'] = aa
    frame['b'] = bb

    def timed_self_join(data):
        # Index once by 'a' and once by 'b', join the two, report the time.
        start = timer()
        by_a = data.set_index('a')
        by_b = data.set_index('b')
        result = by_a.join(by_b, how=how, sort=True)
        stop = timer()
        print('timing', type(data), stop - start)
        return result

    expect = timed_self_join(frame.to_pandas())
    got = timed_self_join(frame)

    assert list(expect.columns) == list(got.columns)
    assert np.all(expect.index.values == got.index.values)
    for name in ('b', 'a'):
        _check_series(expect[name], got[name])
예제 #13
0
def test_dataframe_join_suffix():
    """Check suffix handling when joined frames share a column name.

    The frame is joined with itself using two different index columns, so
    column ``'b'`` exists on both sides.  The join must raise ``ValueError``
    when no suffixes are given and must match pandas when they are.
    """
    np.random.seed(0)

    gdf = DataFrame()
    for name in 'abc':
        gdf[name] = np.random.randint(0, 5, 5)

    lhs = gdf.set_index('a')
    rhs = gdf.set_index('c')

    # Without suffixes the overlapping column must be rejected.
    with pytest.raises(ValueError) as excinfo:
        lhs.join(rhs)
    excinfo.match("there are overlapping columns but lsuffix"
                  " and rsuffix are not defined")

    suffixes = dict(lsuffix='_left', rsuffix='_right')
    got = lhs.join(rhs, **suffixes)

    # Reference result from pandas
    pdf = gdf.to_pandas()
    expect = pdf.set_index('a').join(pdf.set_index('c'), **suffixes)

    # Columns, index and every column's data must agree
    assert list(expect.columns) == list(got.columns)
    assert np.all(expect.index.values == got.index.values)
    for name in expect.columns:
        _check_series(expect[name], got[name])
예제 #14
0
def test_dataframe_join_how(aa, bb, how):
    """Self-join on two different index columns; result must match pandas.

    Columns ``'a'`` and ``'b'`` are filled from ``aa`` and ``bb``.  The same
    helper runs against the pandas conversion and the original frame, using
    ``how`` as the join type.
    """
    gdf = DataFrame()
    gdf['a'] = aa
    gdf['b'] = bb

    def run_join(data):
        # Times the index construction plus the join and prints the result.
        t0 = timer()
        left = data.set_index('a')
        right = data.set_index('b')
        out = left.join(right, how=how, sort=True)
        t1 = timer()
        print('timing', type(data), t1 - t0)
        return out

    expect = run_join(gdf.to_pandas())
    got = run_join(gdf)

    expect_cols = list(expect.columns)
    got_cols = list(got.columns)
    assert expect_cols == got_cols
    assert np.all(expect.index.values == got.index.values)
    _check_series(expect['b'], got['b'])
    _check_series(expect['a'], got['a'])