Example #1
def concat(objs):
    """Concantenate dask gdf objects

    Parameters
    ----------

    objs : sequence of DataFrame, Series, Index
        A sequence of objects to be concatenated.
    """
    objs = [_daskify(x) for x in objs]
    meta = gd.concat(_extract_meta(objs))

    name = "concat-" + uuid4().hex
    dsk = {}
    divisions = [0]
    base = 0
    lastdiv = 0
    for obj in objs:
        # Remap each partition's key into the new collection's namespace.
        for k, i in obj.__dask_keys__():
            dsk[name, base + i] = k, i
        base += obj.npartitions
        # Shift incoming divisions so they continue from the previous frame.
        divisions.extend([d + lastdiv for d in obj.divisions[1:]])
        lastdiv = obj.divisions[-1]

    dasks = [o.dask for o in objs]
    dsk = merge(dsk, *dasks)
    return new_dd_object(dsk, name, meta, divisions)
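A minimal usage sketch of this graph-level concat, assuming the same `gd`/`dgd`/`np` imports and the `dgd.from_pygdf` constructor used in the tests below; the frame names here are illustrative only:

left = gd.DataFrame()
left['x'] = np.arange(5)
right = gd.DataFrame()
right['x'] = np.arange(5, 10)

# Wrap both frames as dask_gdf collections, then stitch their graphs.
dleft = dgd.from_pygdf(left, npartitions=2)
dright = dgd.from_pygdf(right, npartitions=2)
combined = concat([dleft, dright])

# Nothing runs until compute(); the result is a single pygdf DataFrame.
print(combined.compute())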
Example #2
def create_12_mon_features(joined_df, **kwargs):
    testdfs = []
    n_months = 12
    for y in range(1, n_months + 1):
        tmpdf = joined_df[[
            'loan_id', 'timestamp_year', 'timestamp_month', 'delinquency_12',
            'upb_12'
        ]]
        tmpdf['josh_months'] = (tmpdf['timestamp_year'] * 12
                                + tmpdf['timestamp_month'])
        tmpdf['josh_mody_n'] = ((tmpdf['josh_months'].astype('float64')
                                 - 24000 - y) / 12).floor()
        tmpdf = tmpdf.groupby(['loan_id', 'josh_mody_n'], method='hash').agg(
            {'delinquency_12': 'max', 'upb_12': 'min'})
        tmpdf['delinquency_12'] = (tmpdf['max_delinquency_12'] >
                                   3).astype('int32')
        tmpdf['delinquency_12'] += (tmpdf['min_upb_12'] == 0).astype('int32')
        tmpdf.drop_column('max_delinquency_12')
        tmpdf['upb_12'] = tmpdf['min_upb_12']
        tmpdf.drop_column('min_upb_12')
        tmpdf['timestamp_year'] = (((tmpdf['josh_mody_n'] * n_months) + 24000 +
                                    (y - 1)) / 12).floor().astype('int16')
        tmpdf['timestamp_month'] = np.int8(y)
        tmpdf.drop_column('josh_mody_n')
        testdfs.append(tmpdf)
        del tmpdf
    del joined_df

    return pygdf.concat(testdfs)
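The `josh_mody_n` arithmetic above buckets each observation into 12-month windows offset by `y`, and `timestamp_year` reconstructs the window's calendar year. A pure-NumPy sanity check of that round trip (reading 24000 as 2000 * 12 is an inference, not stated in the source):

import numpy as np

year, month, y = 2003, 7, 5
josh_months = year * 12 + month                            # 24043
josh_mody_n = np.floor((josh_months - 24000.0 - y) / 12)   # -> 3.0

# Reconstruct the year exactly as the kernel above does.
timestamp_year = int((josh_mody_n * 12 + 24000 + (y - 1)) // 12)
assert timestamp_year == 2003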
Example #3
def test_dataframe_to_delayed():
    nelem = 100

    df = gd.DataFrame()
    df['x'] = np.arange(nelem)
    df['y'] = np.random.randint(nelem, size=nelem)

    ddf = dgd.from_pygdf(df, npartitions=5)

    delays = ddf.to_delayed()

    assert len(delays) == 5

    # Concat the delayed partitions
    got = gd.concat([d.compute() for d in delays])
    assert_frame_equal(got.to_pandas(), df.to_pandas())

    # Check individual partitions
    divs = ddf.divisions
    assert len(divs) == len(delays) + 1

    for i, part in enumerate(delays):
        s = divs[i]
        # The last partition has no upper bound on its slice.
        e = None if i + 1 == len(delays) else divs[i + 1]
        expect = df[s:e].to_pandas()
        got = part.compute().to_pandas()
        assert_frame_equal(got, expect)
Example #4
def test_series_to_delayed():
    nelem = 100

    sr = gd.Series(np.random.randint(nelem, size=nelem))

    dsr = dgd.from_pygdf(sr, npartitions=5)

    delays = dsr.to_delayed()

    assert len(delays) == 5

    # Concat the delayed partitions
    got = gd.concat([d.compute() for d in delays])
    assert isinstance(got, gd.Series)
    np.testing.assert_array_equal(got.to_pandas(), sr.to_pandas())

    # Check individual partitions
    divs = dsr.divisions
    assert len(divs) == len(delays) + 1

    for i, part in enumerate(delays):
        s = divs[i]
        # The last partition has no upper bound on its slice.
        e = None if i + 1 == len(delays) else divs[i + 1]
        expect = sr[s:e].to_pandas()
        got = part.compute().to_pandas()
        np.testing.assert_array_equal(got, expect)
Example #5
def concat(*frames):
    frames = list(filter(len, frames))
    if len(frames) > 1:
        return gd.concat(frames)
    elif len(frames) == 1:
        return frames[0]
    else:
        return None
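A quick illustration of the helper's three branches, assuming `gd.DataFrame` construction as in the tests on this page: empty frames are filtered out before `gd.concat` is called, a single survivor is returned untouched, and all-empty input yields None:

df1 = gd.DataFrame()
df1['x'] = np.arange(3)
empty = gd.DataFrame()
empty['x'] = np.arange(0)

assert concat(df1, empty) is df1   # single survivor returned as-is
assert concat(empty) is None       # nothing left after filtering
assert len(concat(df1, df1)) == 6  # two survivors go through gd.concat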
Example #6
def test_series_from_delayed():
    delays = [get_combined_column(load_data(10 * i, i)) for i in range(1, 3)]
    out = dgd.from_delayed(delays)
    res = out.compute()
    assert isinstance(res, gd.Series)

    expected = gd.concat([d.compute() for d in delays])
    np.testing.assert_array_equal(res.to_pandas(), expected.to_pandas())
Example #7
def test_dataframe_from_delayed():
    delays = [load_data(10 * i, i) for i in range(1, 3)]
    out = dgd.from_delayed(delays)
    res = out.compute()
    assert isinstance(res, gd.DataFrame)

    expected = gd.concat([d.compute() for d in delays])
    assert_frame_equal(res.to_pandas(), expected.to_pandas())
Example #8
def test_concat(index):
    df, df2, gdf, gdf2 = make_frames(index)

    # DataFrame
    res = gd.concat([gdf, gdf2, gdf]).to_pandas()
    sol = pd.concat([df, df2, df])
    pd.util.testing.assert_frame_equal(res, sol, check_names=False)

    # Series
    for c in [i for i in ('x', 'y', 'z') if i != index]:
        res = gd.concat([gdf[c], gdf2[c], gdf[c]]).to_pandas()
        sol = pd.concat([df[c], df2[c], df[c]])
        pd.util.testing.assert_series_equal(res, sol, check_names=False)

    # Index
    res = gd.concat([gdf.index, gdf2.index]).to_pandas()
    sol = df.index.append(df2.index)
    pd.util.testing.assert_index_equal(res, sol, check_names=False)
Example #10
def test_concat_misordered_columns():
    df, df2, gdf, gdf2 = make_frames(False)
    gdf2 = gdf2[['z', 'x', 'y']]
    df2 = df2[['z', 'x', 'y']]

    res = gd.concat([gdf, gdf2]).to_pandas()
    sol = pd.concat([df, df2])

    pd.util.testing.assert_frame_equal(res, sol, check_names=False)
Example #11
def test_dataframe_empty_concat():
    gdf1 = DataFrame()
    gdf1['a'] = []
    gdf1['b'] = []

    gdf2 = gdf1.copy()

    gdf3 = gd.concat([gdf1, gdf2])
    assert len(gdf3) == 0
    assert len(gdf3.columns) == 2
Example #12
def take(indices, depends):
    first = min(indices)
    last = max(indices)
    others = []
    for d in depends:
        # TODO: this can be replaced with searchsorted
        # Clamp the requested range to the values present in this partition.
        firstindex = d.index[0]
        lastindex = d.index[-1]
        s = max(first, firstindex)
        e = min(last, lastindex)
        others.append(d.loc[s:e])
    return gd.concat(others)
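A hedged sketch of the searchsorted idea mentioned in the TODO, finding each partition's overlapping slice positionally instead of clamping by hand; `d.index.values` as a host array is an assumption here, not a documented pygdf accessor:

import numpy as np

def take_searchsorted(indices, depends):
    first, last = min(indices), max(indices)
    others = []
    for d in depends:
        idx = np.asarray(d.index.values)  # hypothetical host view of the index
        lo = np.searchsorted(idx, first, side='left')
        hi = np.searchsorted(idx, last, side='right')
        if lo < hi:                       # skip partitions with no overlap
            others.append(d.loc[idx[lo]:idx[hi - 1]])
    return gd.concat(others)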
Example #13
def _compare_frame(a, b, max_part_size, by):
    if a is not None and b is not None:
        joint = pygdf.concat([a, b])
        sorten = joint.sort_values(by=by)
        # Split the sorted frame using the *max_part_size*
        lhs, rhs = sorten[:max_part_size], sorten[max_part_size:]
        # Replace empty frame with None
        return lhs or None, rhs or None
    elif a is None and b is None:
        return None, None
    elif a is None:
        return b.sort_values(by=by), None
    else:
        return a.sort_values(by=by), None
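This is the compare-split primitive of a distributed merge sort: two sorted partitions are pooled, re-sorted, and cut so the left output holds the smaller keys. A tiny pandas analogue, purely for illustration:

import pandas as pd

a = pd.DataFrame({'k': [5, 1, 9]})
b = pd.DataFrame({'k': [4, 8, 2]})
joint = pd.concat([a, b]).sort_values(by='k')
lhs, rhs = joint[:3], joint[3:]
print(lhs['k'].tolist())   # [1, 2, 4] -- the smaller half moves left
print(rhs['k'].tolist())   # [5, 8, 9]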
Example #14
def shuffle(sr, prefixes, divs, *deps):
    idxs = sr.to_array()
    # Map every index value to its destination partition.
    parts = np.asarray(get_parts(idxs, divs))

    partdfs = []
    for p, df in zip(sorted(frozenset(parts)), deps):
        cond = parts == p
        valididxs = idxs[cond]
        # Remember original positions so the output order can be restored.
        ordering = np.arange(len(idxs))[cond]
        # Convert global index values into partition-local offsets.
        selected = valididxs - prefixes[p]
        partdfs.append(df.take(selected).set_index(ordering))

    # Reassemble the selected rows in their original order.
    return gd.concat(partdfs).sort_index()
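`get_parts` is not shown on this page; a plausible stand-in that maps each index value to its destination partition from the divisions via `np.searchsorted` (purely illustrative, not the project's implementation):

import numpy as np

def get_parts(idxs, divs):
    # Divisions are partition boundaries: partition p covers
    # divs[p] <= value <= divs[p + 1] (the last bound is inclusive).
    divs = np.asarray(divs)
    parts = np.searchsorted(divs, np.asarray(idxs), side='right') - 1
    # Values equal to the final division belong to the last partition.
    return np.minimum(parts, len(divs) - 2)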
Example #15
def test_query_splitted_combine():
    np.random.seed(0)
    df = pd.DataFrame({'x': np.random.randint(0, 5, size=10),
                       'y': np.random.normal(size=10)})
    gdf = DataFrame.from_pandas(df)

    # Split the GDF
    s1 = gdf[:5]
    s2 = gdf[5:]

    # Do the query
    expr = 'x > 2'
    q1 = s1.query(expr)
    q2 = s2.query(expr)
    # Combine
    got = pygdf.concat([q1, q2]).to_pandas()

    # Should equal to just querying the original GDF
    expect = gdf.query(expr).to_pandas()
    assert_frame_equal(got, expect)
Example #16
def test_dataframe_basic():
    np.random.seed(0)
    df = DataFrame()

    # Populate with cuda memory
    df['keys'] = cuda.to_device(np.arange(10, dtype=np.float64))
    np.testing.assert_equal(df['keys'].to_array(), np.arange(10))
    assert len(df) == 10

    # Populate with numpy array
    rnd_vals = np.random.random(10)
    df['vals'] = rnd_vals
    np.testing.assert_equal(df['vals'].to_array(), rnd_vals)
    assert len(df) == 10
    assert df.columns == ('keys', 'vals')

    # Make another dataframe
    df2 = DataFrame()
    df2['keys'] = np.array([123], dtype=np.float64)
    df2['vals'] = np.array([321], dtype=np.float64)

    # Concat
    df = gd.concat([df, df2])
    assert len(df) == 11

    hkeys = np.asarray(np.arange(10, dtype=np.float64).tolist() + [123])
    hvals = np.asarray(rnd_vals.tolist() + [321])

    np.testing.assert_equal(df['keys'].to_array(), hkeys)
    np.testing.assert_equal(df['vals'].to_array(), hvals)

    # As matrix
    mat = df.as_matrix()

    expect = np.vstack([hkeys, hvals]).T

    print(expect)
    print(mat)
    np.testing.assert_equal(mat, expect)
Example #17
File: core.py Project: sklam/dask_gdf
def nlargest_agg(x, **kwargs):
    return gd.concat(x).nlargest(**kwargs)
Example #18
def do_combine(dfs):
    return combine(pygdf.concat(dfs).groupby(by=by))
Example #19
def join(df, other, keys):
    others = [other.query('{by}==@k'.format(by=by)) for k in sorted(keys)]
    return gd.concat([df] + others)
Example #20
def test_concat_errors():
    df, df2, gdf, gdf2 = make_frames()

    # No objs
    with pytest.raises(ValueError):
        gd.concat([])

    # Mismatched types
    with pytest.raises(ValueError):
        gd.concat([gdf, gdf.x])

    # Unknown type
    with pytest.raises(ValueError):
        gd.concat(['bar', 'foo'])

    # Mismatched column dtypes
    with pytest.raises(ValueError):
        gd.concat([gdf.x, gdf.y])
    with pytest.raises(ValueError):
        gd.concat([gdf.x, gdf.z])

    # Mismatched index dtypes
    gdf3 = gdf2.set_index('z')
    gdf2.drop_column('z')
    with pytest.raises(ValueError):
        gd.concat([gdf2, gdf3])

    # Mismatched columns
    with pytest.raises(ValueError):
        gd.concat([gdf, gdf2])
Example #21
File: core.py Project: sklam/dask_gdf
def finalize(results):
    return gd.concat(results)
Example #22
File: core.py Project: sklam/dask_gdf
def unique_k_agg(x, **kwargs):
    return gd.concat(x).unique_k(**kwargs)
Example #23
File: core.py Project: sklam/dask_gdf
def nsmallest_agg(x, **kwargs):
    return gd.concat(x).nsmallest(**kwargs)