Example #1
def test_apply_infer_columns():
    df = pd.DataFrame({"x": [1, 2, 3, 4], "y": [10, 20, 30, 40]})
    ddf = dd.from_pandas(df, npartitions=2)

    def return_df(x):
        # each row maps to a Series indexed by ['sum', 'mean'], so the
        # result is a new DataFrame with those columns
        return pd.Series([x.sum(), x.mean()], index=["sum", "mean"])

    # DataFrame to completely different DataFrame
    result = ddf.apply(return_df, axis=1)
    assert isinstance(result, dd.DataFrame)
    tm.assert_index_equal(result.columns, pd.Index(["sum", "mean"]))
    assert eq(result, df.apply(return_df, axis=1))

    # DataFrame to Series
    result = ddf.apply(lambda x: 1, axis=1)
    assert isinstance(result, dd.Series)
    assert result.name is None
    assert eq(result, df.apply(lambda x: 1, axis=1))

    def return_df2(x):
        return pd.Series([x * 2, x * 3], index=["x2", "x3"])

    # Series to completely different DataFrame
    result = ddf.x.apply(return_df2)
    assert isinstance(result, dd.DataFrame)
    tm.assert_index_equal(result.columns, pd.Index(["x2", "x3"]))
    assert eq(result, df.x.apply(return_df2))

    # Series to Series
    result = ddf.x.apply(lambda x: 1)
    assert isinstance(result, dd.Series)
    assert result.name == "x"
    assert eq(result, df.x.apply(lambda x: 1))
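
Note: newer dask releases expect an explicit meta argument instead of inferring the output schema from a sample (and warn when it is missing). A minimal sketch of the same apply under that assumption:

import pandas as pd
import dask.dataframe as dd

df = pd.DataFrame({"x": [1, 2, 3, 4], "y": [10, 20, 30, 40]})
ddf = dd.from_pandas(df, npartitions=2)

# meta declares the output up front: a DataFrame with float columns
# 'sum' and 'mean', so no sampling-based inference is needed
result = ddf.apply(
    lambda row: pd.Series([row.sum(), row.mean()], index=["sum", "mean"]),
    axis=1,
    meta={"sum": "f8", "mean": "f8"},
)
print(result.compute())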
Example #2
def test_join_indexed_dataframe_to_indexed_dataframe():
    A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6]}, index=[1, 2, 3, 4, 6, 7])
    a = dd.repartition(A, [1, 4, 7])

    B = pd.DataFrame({"y": list("abcdef")}, index=[1, 2, 4, 5, 6, 8])
    b = dd.repartition(B, [1, 2, 5, 8])

    c = join_indexed_dataframes(a, b, how="left")
    assert c.divisions[0] == a.divisions[0]
    assert c.divisions[-1] == max(a.divisions + b.divisions)
    assert eq(c, A.join(B))

    c = join_indexed_dataframes(a, b, how="right")
    assert c.divisions[0] == b.divisions[0]
    assert c.divisions[-1] == b.divisions[-1]
    assert eq(c, A.join(B, how="right"))

    c = join_indexed_dataframes(a, b, how="inner")
    assert c.divisions[0] == 1
    assert c.divisions[-1] == max(a.divisions + b.divisions)
    assert eq(c.compute(), A.join(B, how="inner"))

    c = join_indexed_dataframes(a, b, how="outer")
    assert c.divisions[0] == 1
    assert c.divisions[-1] == 8
    assert eq(c.compute(), A.join(B, how="outer"))

    assert sorted(join_indexed_dataframes(a, b, how="inner").dask) == sorted(
        join_indexed_dataframes(a, b, how="inner").dask
    )
    assert sorted(join_indexed_dataframes(a, b, how="inner").dask) != sorted(
        join_indexed_dataframes(a, b, how="outer").dask
    )
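
join_indexed_dataframes is an internal helper; the same aligned join is available through the public .join method whenever both frames have known divisions. A minimal sketch using only public API:

import pandas as pd
import dask.dataframe as dd

A = pd.DataFrame({"x": [1, 2, 3, 4, 5, 6]}, index=[1, 2, 3, 4, 6, 7])
B = pd.DataFrame({"y": list("abcdef")}, index=[1, 2, 4, 5, 6, 8])

a = dd.from_pandas(A, npartitions=2)
b = dd.from_pandas(B, npartitions=2)

# both sides have known, sorted divisions, so the join runs
# partition-by-partition without a shuffle
joined = a.join(b, how="outer")
print(joined.compute())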
Example #3
def test_concat(join):
    pdf1 = pd.DataFrame({'x': [1, 2, 3, 4, 6, 7],
                         'y': list('abcdef')},
                        index=[1, 2, 3, 4, 6, 7])
    ddf1 = dd.from_pandas(pdf1, 2)
    pdf2 = pd.DataFrame({'x': [1, 2, 3, 4, 6, 7],
                         'y': list('abcdef')},
                        index=[8, 9, 10, 11, 12, 13])
    ddf2 = dd.from_pandas(pdf2, 2)

    # different columns
    pdf3 = pd.DataFrame({'x': [1, 2, 3, 4, 6, 7],
                         'z': list('abcdef')},
                        index=[8, 9, 10, 11, 12, 13])
    ddf3 = dd.from_pandas(pdf3, 2)

    for (dd1, dd2, pd1, pd2) in [(ddf1, ddf2, pdf1, pdf2),
                                 (ddf1, ddf3, pdf1, pdf3)]:
        result = dd.concat([dd1, dd2], join=join)
        expected = pd.concat([pd1, pd2], join=join)
        assert eq(result, expected)

    # test outer join only; inner has a known problem on the pandas side
    for (dd1, dd2, pd1, pd2) in [(ddf1, ddf2, pdf1, pdf2),
                                 (ddf1, ddf3, pdf1, pdf3),
                                 (ddf1.x, ddf2.x, pdf1.x, pdf2.x),
                                 (ddf1.x, ddf3.z, pdf1.x, pdf3.z),
                                 (ddf1.x, ddf2.x, pdf1.x, pdf2.x),
                                 (ddf1.x, ddf3.z, pdf1.x, pdf3.z)]:
        result = dd.concat([dd1, dd2])
        expected = pd.concat([pd1, pd2])
        assert eq(result, expected)
Example #4
def test_concat4_interleave_partitions():
    pdf1 = pd.DataFrame(np.random.randn(10, 5), columns=list("ABCDE"), index=list("abcdefghij"))
    pdf2 = pd.DataFrame(np.random.randn(13, 5), columns=list("ABCDE"), index=list("fghijklmnopqr"))
    pdf3 = pd.DataFrame(np.random.randn(13, 6), columns=list("CDEXYZ"), index=list("fghijklmnopqr"))

    ddf1 = dd.from_pandas(pdf1, 2)
    ddf2 = dd.from_pandas(pdf2, 3)
    ddf3 = dd.from_pandas(pdf3, 2)

    msg = (
        "All inputs have known divisions which cannnot be "
        "concatenated in order. Specify "
        "interleave_partitions=True to ignore order"
    )

    cases = [[ddf1, ddf1], [ddf1, ddf2], [ddf1, ddf3], [ddf2, ddf1], [ddf2, ddf3], [ddf3, ddf1], [ddf3, ddf2]]
    for case in cases:
        pdcase = [c.compute() for c in case]

        with tm.assertRaisesRegexp(ValueError, msg):
            dd.concat(case)

        assert eq(dd.concat(case, interleave_partitions=True), pd.concat(pdcase))
        assert eq(dd.concat(case, join="inner", interleave_partitions=True), pd.concat(pdcase, join="inner"))

    msg = "'join' must be 'inner' or 'outer'"
    with tm.assertRaisesRegexp(ValueError, msg):
        dd.concat([ddf1, ddf1], join="invalid", interleave_partitions=True)
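
The ValueError above is raised because the inputs' index ranges overlap, so no ordered concatenation exists. A minimal sketch of the workaround the message suggests:

import numpy as np
import pandas as pd
import dask.dataframe as dd

pdf1 = pd.DataFrame(np.random.randn(5, 2), columns=list("AB"), index=list("abcde"))
pdf2 = pd.DataFrame(np.random.randn(5, 2), columns=list("AB"), index=list("cdefg"))

ddf1 = dd.from_pandas(pdf1, npartitions=2)
ddf2 = dd.from_pandas(pdf2, npartitions=2)

# the index ranges overlap ('c'..'e'), so dd.concat([ddf1, ddf2]) would
# raise; interleave_partitions=True repartitions both inputs onto a
# common set of divisions instead of failing
out = dd.concat([ddf1, ddf2], interleave_partitions=True)
print(out.compute())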
Example #5
def _check_split_data(orig, d):
    """Check data is split properly"""
    keys = [k for k in d.dask if k[0].startswith("repartition-split")]
    keys = sorted(keys)
    sp = pd.concat([d._get(d.dask, k) for k in keys])
    assert eq(orig, sp)
    assert eq(orig, d)
Example #6
def test_header_None():
    with filetexts({'.tmp.1.csv': '1,2',
                    '.tmp.2.csv': '',
                    '.tmp.3.csv': '3,4'}):
        df = read_csv('.tmp.*.csv', header=None)
        expected = pd.DataFrame({0: [1, 3], 1: [2, 4]})
        eq(df.compute().reset_index(drop=True), expected)
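
filetexts here (and filetext in other examples) come from dask's test utilities. A hypothetical minimal stand-in, in case you want to run these snippets outside the dask test suite:

import os
from contextlib import contextmanager

@contextmanager
def filetexts(d, mode="t"):
    """Write each filename -> contents pair to disk, then clean up."""
    for fn, contents in d.items():
        with open(fn, "wb" if mode == "b" else "w") as f:
            f.write(contents)
    try:
        yield
    finally:
        for fn in d:
            if os.path.exists(fn):
                os.remove(fn)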
Example #7
def test_join_indexed_dataframe_to_indexed_dataframe():
    A = pd.DataFrame({'x': [1, 2, 3, 4, 5, 6]},
                     index=[1, 2, 3, 4, 6, 7])
    a = dd.repartition(A, [1, 4, 7])

    B = pd.DataFrame({'y': list('abcdef')},
                     index=[1, 2, 4, 5, 6, 8])
    b = dd.repartition(B, [1, 2, 5, 8])

    c = join_indexed_dataframes(a, b, how='left')
    assert c.divisions[0] == a.divisions[0]
    assert c.divisions[-1] == max(a.divisions + b.divisions)
    assert eq(c, A.join(B))

    c = join_indexed_dataframes(a, b, how='right')
    assert c.divisions[0] == b.divisions[0]
    assert c.divisions[-1] == b.divisions[-1]
    assert eq(c, A.join(B, how='right'))

    c = join_indexed_dataframes(a, b, how='inner')
    assert c.divisions[0] == 1
    assert c.divisions[-1] == max(a.divisions + b.divisions)
    assert eq(c.compute(), A.join(B, how='inner'))

    c = join_indexed_dataframes(a, b, how='outer')
    assert c.divisions[0] == 1
    assert c.divisions[-1] == 8
    assert eq(c.compute(), A.join(B, how='outer'))

    assert sorted(join_indexed_dataframes(a, b, how='inner').dask) == \
           sorted(join_indexed_dataframes(a, b, how='inner').dask)
    assert sorted(join_indexed_dataframes(a, b, how='inner').dask) != \
           sorted(join_indexed_dataframes(a, b, how='outer').dask)
Example #8
def test_groupby_on_index():
    dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]},
                                  index=[0, 1, 3]),
           ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 2, 1]},
                                  index=[5, 6, 8]),
           ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [0, 0, 0]},
                                  index=[9, 9, 9])}
    d = dd.DataFrame(dsk, 'x', ['a', 'b'], [0, 4, 9, 9])
    full = d.compute()

    e = d.set_index('a')
    efull = full.set_index('a')
    assert eq(d.groupby('a').b.mean(), e.groupby(e.index).b.mean())

    def func(df):
        df.loc[:, 'b'] = df.b - df.b.mean()
        return df

    assert eq(d.groupby('a').apply(func).set_index('a'),
              e.groupby(e.index).apply(func))
    assert eq(d.groupby('a').apply(func), full.groupby('a').apply(func))
    assert eq(d.groupby('a').apply(func).set_index('a'),
              full.groupby('a').apply(func).set_index('a'))
    assert eq(efull.groupby(efull.index).apply(func),
              e.groupby(e.index).apply(func))
Example #9
def test_to_hdf_lock_delays():
    pytest.importorskip('tables')
    df16 = pd.DataFrame({'x': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p'],
                       'y': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]},
                            index=[1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11., 12., 13., 14., 15., 16.])
    a = dd.from_pandas(df16, 16)

    # add artificial delays so that earlier partitions finish after later
    # ones; this checks that output order does not depend on completion order
    def delayed_nop(i):
        if i[1] < 10:
            sleep(0.1*(10-i[1]))
        return i

    # saving to multiple hdf nodes
    with tmpfile() as fn:
        a = a.apply(delayed_nop, axis=1, columns=a.columns)
        a.to_hdf(fn, '/data*')
        out = dd.read_hdf(fn, '/data*')
        eq(df16, out)

    # saving to multiple hdf files
    # adding artificial delays to make sure the last tasks finish first
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data*')
        a = a.apply(delayed_nop, axis=1, columns=a.columns)
        a.to_hdf(fn, '/data')
        out = dd.read_hdf(fn, '/data')
        eq(df16, out)
Example #10
def test_unique():
    pdf = pd.DataFrame(
        {"x": [1, 2, 1, 3, 3, 1, 4, 2, 3, 1], "y": ["a", "c", "b", np.nan, "c", "b", "a", "d", np.nan, "a"]}
    )
    ddf = dd.from_pandas(pdf, npartitions=3)
    assert eq(ddf.x.unique(), pd.Series(pdf.x.unique(), name="x"))
    assert eq(ddf.y.unique(), pd.Series(pdf.y.unique(), name="y"))
Example #11
def test_read_csv_with_datetime_index_partitions_n():
    with filetext(timeseries) as fn:
        df = pd.read_csv(fn, index_col=0, header=0, usecols=[0, 4],
                         parse_dates=['Date'])
        # fn is small, so set chunkbytes low to force more than one chunk
        ddf = dd.read_csv(fn, index='Date', header=0, usecols=[0, 4],
                          parse_dates=['Date'], chunkbytes=400)
        eq(df, ddf)
Example #12
def test_merge_index_without_divisions(shuffle):
    a = pd.DataFrame({"x": [1, 2, 3, 4, 5]}, index=[1, 2, 3, 4, 5])
    b = pd.DataFrame({"y": [1, 2, 3, 4, 5]}, index=[5, 4, 3, 2, 1])

    aa = dd.from_pandas(a, npartitions=3, sort=False)
    bb = dd.from_pandas(b, npartitions=2)

    eq(aa.join(bb, how="inner", shuffle=shuffle), a.join(b, how="inner"))
Example #13
def test_from_pandas_with_datetime_index():
    with filetext(timeseries) as fn:
        df = pd.read_csv(fn, index_col=0, header=0, usecols=[0, 4],
                         parse_dates=['Date'])
        ddf = dd.from_pandas(df, 2)
        eq(df, ddf)
        ddf = dd.from_pandas(df, chunksize=2)
        eq(df, ddf)
Example #14
def test_read_csv_header_issue_823():
    text = '''a b c-d\n1 2 3\n4 5 6'''.replace(' ', '\t')
    with filetext(text) as fn:
        df = dd.read_csv(fn, sep='\t')
        eq(df, pd.read_csv(fn, sep='\t'))

        df = dd.read_csv(fn, delimiter='\t')
        eq(df, pd.read_csv(fn, delimiter='\t'))
Example #15
def test_rolling_partition_size():
    df = pd.DataFrame(np.random.randn(50, 2))
    ddf = dd.from_pandas(df, npartitions=5)

    for obj, dobj in [(df, ddf), (df[0], ddf[0])]:
        eq(obj.rolling(10).mean(), dobj.rolling(10).mean())
        eq(obj.rolling(11).mean(), dobj.rolling(11).mean())
        raises(NotImplementedError, lambda: dobj.rolling(12).mean())
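
The NotImplementedError above comes from dask's rolling implementation, which can only borrow up to one partition's worth of rows from a neighbour, so the window must fit inside a partition. Repartitioning into larger partitions is one way around the limit (a sketch, assuming a dask version with this constraint):

import numpy as np
import pandas as pd
import dask.dataframe as dd

df = pd.DataFrame(np.random.randn(50, 2))
ddf = dd.from_pandas(df, npartitions=5)  # 10 rows per partition

# a window of 12 exceeds the 10-row partitions, so merge partitions
# until the window fits
wide = ddf.repartition(npartitions=2)
print(wide.rolling(12).mean().compute().tail())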
Example #16
def test_set_partition_tasks_3(shuffle):
    df = pd.DataFrame(np.random.random((10, 2)), columns=['x', 'y'])
    ddf = dd.from_pandas(df, npartitions=5)

    ddf2 = ddf.set_index('x', shuffle=shuffle, max_branch=2)
    df2 = df.set_index('x')
    eq(df2, ddf2)
    assert ddf2.npartitions == ddf.npartitions
Example #17
def test_shuffle_sort(shuffle):
    df = pd.DataFrame({'x': [1, 2, 3, 2, 1], 'y': [9, 8, 7, 1, 5]})
    ddf = dd.from_pandas(df, npartitions=3)

    df2 = df.set_index('x').sort_index()
    ddf2 = ddf.set_index('x', shuffle=shuffle)

    eq(ddf2.loc[2:3], df2.loc[2:3])
Example #18
def test_numeric_column_names():
    # df.groupby(0)[df.columns] fails when all column names are numbers (pandas bug);
    # this test ensures dask casts all column iterables to lists beforehand
    df = pd.DataFrame({0: [0, 1, 0, 1],
                       1: [1, 2, 3, 4]})
    ddf = dd.from_pandas(df, npartitions=2)
    eq(ddf.groupby(0).sum(), df.groupby(0).sum())
    eq(ddf.groupby(0).apply(lambda x: x), df.groupby(0).apply(lambda x: x))
Example #19
def test_from_dask_array_struct_dtype():
    x = np.array([(1, 'a'), (2, 'b')], dtype=[('a', 'i4'), ('b', 'object')])
    y = da.from_array(x, chunks=(1,))
    df = dd.from_dask_array(y)
    tm.assert_index_equal(df.columns, pd.Index(['a', 'b']))
    assert eq(df, pd.DataFrame(x))

    assert eq(dd.from_dask_array(y, columns=['b', 'a']),
              pd.DataFrame(x, columns=['b', 'a']))
Example #20
def test_read_csv_files():
    with filetexts(files, mode='b'):
        df = read_csv('2014-01-*.csv')
        eq(df, expected, check_dtype=False)

        fn = '2014-01-01.csv'
        df = read_csv(fn)
        expected2 = pd.read_csv(BytesIO(files[fn]))
        eq(df, expected2, check_dtype=False)
Example #21
def test_map():
    assert eq(d.a.map(lambda x: x + 1), full.a.map(lambda x: x + 1))
    lk = dict((v, v + 1) for v in full.a.values)
    assert eq(d.a.map(lk), full.a.map(lk))
    assert eq(d.b.map(lk), full.b.map(lk))
    lk = pd.Series(lk)
    assert eq(d.a.map(lk), full.a.map(lk))
    assert eq(d.b.map(lk), full.b.map(lk))
    assert raises(TypeError, lambda: d.a.map(d.b))
Example #22
def test_from_dask_array_struct_dtype():
    x = np.array([(1, 'a'), (2, 'b')], dtype=[('a', 'i4'), ('b', 'object')])
    y = da.from_array(x, chunks=(1,))
    df = dd.from_dask_array(y)
    assert tuple(df.columns) == y.dtype.names
    eq(df, pd.DataFrame(x))

    eq(dd.from_dask_array(y, columns=['b', 'a']),
       pd.DataFrame(x, columns=['b', 'a']))
Example #23
def test_from_pandas_non_sorted():
    df = pd.DataFrame({'x': [1, 2, 3]}, index=[3, 1, 2])
    ddf = dd.from_pandas(df, npartitions=2, sort=False)
    assert not ddf.known_divisions
    eq(df, ddf)
    
    ddf = dd.from_pandas(df, chunksize=2, sort=False)
    assert not ddf.known_divisions
    eq(df, ddf)
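
With sort=False dask preserves the original row order but records no divisions. A sketch of what that costs and how set_index rebuilds them:

import pandas as pd
import dask.dataframe as dd

df = pd.DataFrame({"x": [1, 2, 3]}, index=[3, 1, 2])
ddf = dd.from_pandas(df, npartitions=2, sort=False)
print(ddf.known_divisions)  # False: .loc slicing must scan every partition

# sorting on the index column restores the divisions metadata
ddf2 = ddf.reset_index().set_index("index")
print(ddf2.known_divisions)  # True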
Example #24
def test_getitem_slice():
    df = pd.DataFrame({'A': [1, 2, 3, 4, 5, 6, 7, 8, 9],
                       'B': [9, 8, 7, 6, 5, 4, 3, 2, 1],
                       'C': [True, False, True] * 3},
                      index=list('abcdefghi'))
    ddf = dd.from_pandas(df, 3)
    assert eq(ddf['a':'e'], df['a':'e'])
    assert eq(ddf['a':'b'], df['a':'b'])
    assert eq(ddf['f':], df['f':])
Example #25
def test_nlargest():
    from string import ascii_lowercase

    df = pd.DataFrame({"a": np.random.permutation(10), "b": list(ascii_lowercase[:10])})
    ddf = dd.from_pandas(df, npartitions=2)

    res = ddf.nlargest(5, "a")
    exp = df.nlargest(5, "a")
    eq(res, exp)
Example #26
def test_getitem_slice():
    df = pd.DataFrame(
        {"A": [1, 2, 3, 4, 5, 6, 7, 8, 9], "B": [9, 8, 7, 6, 5, 4, 3, 2, 1], "C": [True, False, True] * 3},
        index=list("abcdefghi"),
    )
    ddf = dd.from_pandas(df, 3)
    assert eq(ddf["a":"e"], df["a":"e"])
    assert eq(ddf["a":"b"], df["a":"b"])
    assert eq(ddf["f":], df["f":])
Example #27
def test_set_index_self_index(shuffle):
    df = pd.DataFrame({'x': np.random.random(100),
                       'y': np.random.random(100) // 0.2},
                      index=np.random.random(100))

    a = dd.from_pandas(df, npartitions=4)
    b = a.set_index(a.index, shuffle=shuffle)
    assert a is b

    eq(b, df.set_index(df.index))
Example #28
def _check(a, b, aa, bb):
    # Note: `divisions` and `L` are free variables from the enclosing
    # test scope; this helper is excerpted from a closure.
    assert isinstance(a, dd.DataFrame)
    assert isinstance(b, dd.DataFrame)
    assert isinstance(aa, dd.DataFrame)
    assert isinstance(bb, dd.DataFrame)
    assert eq(a, aa)
    assert eq(b, bb)
    assert divisions == (10, 30, 40, 60, 80, 100)
    assert isinstance(L, list)
    assert len(divisions) == 1 + len(L)
Example #29
def test_from_pandas_small():
    for sort in [True, False]:
        for i in [0, 2]:
            df = pd.DataFrame({'x': [0] * i})
            ddf = dd.from_pandas(df, npartitions=5, sort=sort)
            eq(df, ddf)

            s = pd.Series([0] * i, name='x')
            ds = dd.from_pandas(s, npartitions=5, sort=sort)
            eq(s, ds)
Example #30
def test_apply_shuffle():
    pdf = pd.DataFrame({'A': [1, 2, 3, 4] * 5,
                        'B': np.random.randn(20),
                        'C': np.random.randn(20),
                        'D': np.random.randn(20)})
    ddf = dd.from_pandas(pdf, 3)

    assert eq(ddf.groupby('A').apply(lambda x: x.sum()),
              pdf.groupby('A').apply(lambda x: x.sum()))

    assert eq(ddf.groupby(ddf['A']).apply(lambda x: x.sum()),
              pdf.groupby(pdf['A']).apply(lambda x: x.sum()))

    assert eq(ddf.groupby(ddf['A'] + 1).apply(lambda x: x.sum()),
              pdf.groupby(pdf['A'] + 1).apply(lambda x: x.sum()))

    # SeriesGroupBy
    assert eq(ddf.groupby('A')['B'].apply(lambda x: x.sum()),
              pdf.groupby('A')['B'].apply(lambda x: x.sum()))

    assert eq(ddf.groupby(ddf['A'])['B'].apply(lambda x: x.sum()),
              pdf.groupby(pdf['A'])['B'].apply(lambda x: x.sum()))

    assert eq(ddf.groupby(ddf['A'] + 1)['B'].apply(lambda x: x.sum()),
              pdf.groupby(pdf['A'] + 1)['B'].apply(lambda x: x.sum()))

    # DataFrameGroupBy with column slice
    assert eq(ddf.groupby('A')[['B', 'C']].apply(lambda x: x.sum()),
              pdf.groupby('A')[['B', 'C']].apply(lambda x: x.sum()))

    assert eq(ddf.groupby(ddf['A'])[['B', 'C']].apply(lambda x: x.sum()),
              pdf.groupby(pdf['A'])[['B', 'C']].apply(lambda x: x.sum()))

    assert eq(ddf.groupby(ddf['A'] + 1)[['B', 'C']].apply(lambda x: x.sum()),
              pdf.groupby(pdf['A'] + 1)[['B', 'C']].apply(lambda x: x.sum()))
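
groupby-apply cannot be computed one partition at a time, so dask shuffles all rows of each group onto a single partition first; newer versions also expect an explicit meta. A minimal sketch under those assumptions:

import numpy as np
import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({"A": [1, 2, 3, 4] * 5, "B": np.random.randn(20)})
ddf = dd.from_pandas(pdf, npartitions=3)

# unlike the built-in aggregations (sum, mean, ...), apply sees whole
# groups, which forces a shuffle on the grouping key
demeaned = ddf.groupby("A")["B"].apply(
    lambda s: s - s.mean(),
    meta=("B", "f8"),
)
print(demeaned.compute().head())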
Example #31
File: test_io.py Project: ifzz/dask
def test_read_csv_with_datetime_index_partitions_one():
    with filetext(timeseries) as fn:
        df = pd.read_csv(fn,
                         index_col=0,
                         header=0,
                         usecols=[0, 4],
                         parse_dates=['Date'])
        # chunkbytes explicitly set large enough to produce a single chunk
        ddf = dd.read_csv(fn,
                          index='Date',
                          header=0,
                          usecols=[0, 4],
                          parse_dates=['Date'],
                          chunkbytes=10000000)
        eq(df, ddf)

        # fn is so small that, by default, this will be a single chunk
        ddf = dd.read_csv(fn,
                          index='Date',
                          header=0,
                          usecols=[0, 4],
                          parse_dates=['Date'])
        eq(df, ddf)
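
chunkbytes was later renamed blocksize, and read_csv's index= keyword was dropped in favour of an explicit set_index. A self-contained sketch under current dask, with those assumptions:

import os
import pandas as pd
import dask.dataframe as dd
from tempfile import TemporaryDirectory

with TemporaryDirectory() as dn:
    fn = os.path.join(dn, "timeseries.csv")
    # write a small CSV so the sketch is self-contained
    pd.DataFrame({"Date": pd.date_range("2011-01-01", periods=100),
                  "Close": range(100)}).to_csv(fn, index=False)

    # blocksize plays the role of the older chunkbytes: bytes per partition
    ddf = dd.read_csv(fn, blocksize=400, parse_dates=["Date"]).set_index("Date")
    print(ddf.npartitions)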
Example #32
def test_read_csv(open_comp_pair, infer):
    myopen, compression = open_comp_pair
    text_ = text if compression is None else text.encode()
    ext = dict((v, k) for (k, v) in compressions.items()).get(compression, '')
    with filetext(text_, open=myopen, extension=ext) as fn:
        compression = 'infer' if infer else compression
        f = dd.read_csv(fn, chunkbytes=30, compression=compression,
                lineterminator='\n')
        assert list(f.columns) == ['name', 'amount']
        assert f.npartitions > 1
        result = f.compute(get=dask.get)
        # index may be different
        assert eq(result.reset_index(drop=True),
                  pd.read_csv(fn, compression=compression, lineterminator='\n'))
Example #33
def test_reductions_frame_dtypes():
    df = pd.DataFrame({
        'int': [1, 2, 3, 4, 5, 6, 7, 8],
        'float': [1., 2., 3., 4., np.nan, 6., 7., 8.],
        'dt': [pd.NaT] + [datetime(2011, i, 1) for i in range(1, 8)],
        'str':
        list('abcdefgh')
    })
    ddf = dd.from_pandas(df, 3)
    assert eq(df.sum(), ddf.sum())
    assert eq(df.min(), ddf.min())
    assert eq(df.max(), ddf.max())
    assert eq(df.count(), ddf.count())
    assert eq(df.std(), ddf.std())
    assert eq(df.var(), ddf.var())
    assert eq(df.std(ddof=0), ddf.std(ddof=0))
    assert eq(df.var(ddof=0), ddf.var(ddof=0))
    assert eq(df.mean(), ddf.mean())

    assert eq(df._get_numeric_data(), ddf._get_numeric_data())

    numerics = ddf[['int', 'float']]
    assert numerics._get_numeric_data().dask == numerics.dask
Example #34
def test_map_partitions_column_info():
    df = pd.DataFrame({'x': [1, 2, 3, 4], 'y': [5, 6, 7, 8]})
    a = dd.from_pandas(df, npartitions=2)

    b = dd.map_partitions(lambda x: x, a.columns, a)
    assert b.columns == a.columns
    assert eq(df, b)

    b = dd.map_partitions(lambda x: x, a.x.name, a.x)
    assert b.name == a.x.name
    assert eq(df.x, b)

    b = dd.map_partitions(lambda x: x, a.x.name, a.x)
    assert b.name == a.x.name
    assert eq(df.x, b)

    b = dd.map_partitions(lambda df: df.x + df.y, None, a)
    assert b.name is None
    assert isinstance(b, dd.Series)

    b = dd.map_partitions(lambda df: df.x + 1, 'x', a)
    assert isinstance(b, dd.Series)
    assert b.name == 'x'
Example #35
def test_from_castra_with_selection():
    """ Optimizations fuse getitems with load_partitions

    We used to use getitem for both column access and selections
    """
    pytest.importorskip('castra')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'],
                       'y': [2, 3, 4, 5]},
                       index=pd.Index([1., 2., 3., 4.], name='ind'))
    a = dd.from_pandas(df, 2)

    b = dd.from_castra(a.to_castra())

    assert eq(b[b.y > 3].x, df[df.y > 3].x)
Example #36
File: test_io.py Project: ifzz/dask
def test_Series_from_dask_array():
    x = da.ones(10, chunks=4)

    ser = from_dask_array(x, 'a')
    assert ser.name == 'a'
    assert list(ser.divisions) == [0, 4, 8, 9]
    assert (ser.compute(get=get_sync).values == x.compute(get=get_sync)).all()

    ser = from_dask_array(x)
    assert ser.name is None

    # dd.from_array should re-route to from_dask_array
    ser2 = dd.from_array(x)
    assert eq(ser, ser2)
Example #37
File: test_io.py Project: ifzz/dask
def test_read_csv(myopen, compression):
    text_ = text if compression is None else text.encode()
    with filetext(text_, open=myopen) as fn:
        f = dd.read_csv(fn,
                        chunkbytes=30,
                        compression=compression,
                        lineterminator='\n')
        assert list(f.columns) == ['name', 'amount']
        assert f.npartitions > 1
        result = f.compute(get=dask.get)
        # index may be different
        assert eq(
            result.reset_index(drop=True),
            pd.read_csv(fn, compression=compression, lineterminator='\n'))
Example #38
def test_scalar_arithmetics_with_dask_instances():
    s = dd.core.Scalar({('s', 0): 10}, 's')
    e = 10

    pds = pd.Series([1, 2, 3, 4, 5, 6, 7])
    dds = dd.from_pandas(pds, 2)

    pdf = pd.DataFrame({'a': [1, 2, 3, 4, 5, 6, 7],
                        'b': [7, 6, 5, 4, 3, 2, 1]})
    ddf = dd.from_pandas(pdf, 2)

    # pandas Series
    result = pds + s   # result is a pd.Series (computed automatically)
    assert isinstance(result, pd.Series)
    assert eq(result, pds + e)

    result = s + pds   # result is a dd.Series
    assert isinstance(result, dd.Series)
    assert eq(result, pds + e)

    # dask Series
    result = dds + s   # result is a dd.Series
    assert isinstance(result, dd.Series)
    assert eq(result, pds + e)

    result = s + dds   # result is a dd.Series
    assert isinstance(result, dd.Series)
    assert eq(result, pds + e)


    # pandas DataFrame
    result = pdf + s   # result is a pd.DataFrame (computed automatically)
    assert isinstance(result, pd.DataFrame)
    assert eq(result, pdf + e)

    result = s + pdf   # result is a dd.DataFrame
    assert isinstance(result, dd.DataFrame)
    assert eq(result, pdf + e)

    # dask DataFrame
    result = ddf + s   # result is a dd.DataFrame
    assert isinstance(result, dd.DataFrame)
    assert eq(result, pdf + e)

    result = s + ddf   # result is a dd.DataFrame
    assert isinstance(result, dd.DataFrame)
    assert eq(result, pdf + e)
Example #39
def test_read_csv_index():
    with filetext(text) as fn:
        f = dd.read_csv(fn, blocksize=20).set_index('amount')
        result = f.compute(get=get_sync)
        assert result.index.name == 'amount'

        blocks = dd.DataFrame._get(f.dask, f._keys(), get=get_sync)
        for i, block in enumerate(blocks):
            if i < len(f.divisions) - 2:
                assert (block.index < f.divisions[i + 1]).all()
            if i > 0:
                assert (block.index >= f.divisions[i]).all()

        expected = pd.read_csv(fn).set_index('amount')
        assert eq(result, expected)
Example #40
def test_datetime_accessor():
    df = pd.DataFrame({'x': [1, 2, 3, 4]})
    df['x'] = df.x.astype('M8[us]')

    a = dd.from_pandas(df, 2)

    assert 'date' in dir(a.x.dt)

    # pandas loses Series.name via datetime accessor
    # see https://github.com/pydata/pandas/issues/10712
    assert eq(a.x.dt.date, df.x.dt.date, check_names=False)
    assert (a.x.dt.to_pydatetime().compute() == df.x.dt.to_pydatetime()).all()

    assert a.x.dt.date.dask == a.x.dt.date.dask
    assert a.x.dt.to_pydatetime().dask == a.x.dt.to_pydatetime().dask
Example #41
def test_quantile():
    # series / multiple
    result = d.b.quantile([.3, .7])
    exp = full.b.quantile([.3, .7])  # result may differ
    assert len(result) == 2
    assert result.divisions == (.3, .7)
    assert eq(result.index, exp.index)
    assert isinstance(result, dd.Series)

    result = result.compute()
    assert isinstance(result, pd.Series)
    assert result.iloc[0] == 0
    assert 5 < result.iloc[1] < 6

    # index
    s = pd.Series(np.arange(10), index=np.arange(10))
    ds = dd.from_pandas(s, 2)

    result = ds.index.quantile([.3, .7])
    exp = s.quantile([.3, .7])
    assert len(result) == 2
    assert result.divisions == (.3, .7)
    assert eq(result.index, exp.index)
    assert isinstance(result, dd.Series)

    result = result.compute()
    assert isinstance(result, pd.Series)
    assert 1 < result.iloc[0] < 2
    assert 7 < result.iloc[1] < 8

    # series / single
    result = d.b.quantile(.5)
    exp = full.b.quantile(.5)  # result may differ
    assert isinstance(result, dd.core.Scalar)
    result = result.compute()
    assert 4 < result < 6
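
The bracketing assertions above (4 < result < 6, etc.) exist because dask assembles quantiles from per-partition summaries rather than a global sort, so results are approximate. A minimal sketch:

import pandas as pd
import dask.dataframe as dd

s = pd.Series(range(100))
ds = dd.from_pandas(s, npartitions=4)

# approximate quantile from partition summaries vs the exact pandas value
print(ds.quantile(0.5).compute())
print(s.quantile(0.5))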
Example #42
def test_get_dummies_kwargs():
    s = pd.Series([1, 1, 1, 2, 2, 1, 3, 4], dtype='category')
    exp = pd.get_dummies(s, prefix='X', prefix_sep='-')

    ds = dd.from_pandas(s, 2)
    res = dd.get_dummies(ds, prefix='X', prefix_sep='-')
    assert eq(res, exp)
    tm.assert_index_equal(res.columns, pd.Index(['X-1', 'X-2', 'X-3', 'X-4']))

    exp = pd.get_dummies(s, drop_first=True)

    ds = dd.from_pandas(s, 2)
    res = dd.get_dummies(ds, drop_first=True)
    assert eq(res, exp)
    tm.assert_index_equal(res.columns, exp.columns)

    # nan
    s = pd.Series([1, 1, 1, 2, np.nan, 3, np.nan, 5], dtype='category')
    exp = pd.get_dummies(s)

    ds = dd.from_pandas(s, 2)
    res = dd.get_dummies(ds)
    assert eq(res, exp)
    tm.assert_index_equal(res.columns, exp.columns)

    # dummy_na
    exp = pd.get_dummies(s, dummy_na=True)

    ds = dd.from_pandas(s, 2)
    res = dd.get_dummies(ds, dummy_na=True)
    assert eq(res, exp)
    tm.assert_index_equal(res.columns, pd.Index([1, 2, 3, 5, np.nan]))

    msg = 'sparse=True is not supported'
    with tm.assertRaisesRegexp(NotImplementedError, msg):
        dd.get_dummies(ds, sparse=True)
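
dd.get_dummies needs the set of categories to be known without computing, which is why every input above is built with dtype='category'. A sketch of the failure mode and the fix:

import pandas as pd
import dask.dataframe as dd

s = pd.Series(["a", "b", "a", "c"])

# object dtype: the output columns are unknowable without computing,
# so dask refuses with NotImplementedError
try:
    dd.get_dummies(dd.from_pandas(s, npartitions=2))
except NotImplementedError as e:
    print("expected:", e)

# with a categorical dtype the columns are known up front
ds = dd.from_pandas(s.astype("category"), npartitions=2)
print(dd.get_dummies(ds).compute())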
Example #43
def test_encoding_gh601(encoding):
    ar = pd.Series(range(0, 100))
    br = ar % 7
    cr = br * 3.3
    dr = br / 1.9836
    test_df = pd.DataFrame({'a': ar, 'b': br, 'c': cr, 'd': dr})

    with tmpfile('.csv') as fn:
        test_df.to_csv(fn, encoding=encoding, index=False)

        a = pd.read_csv(fn, encoding=encoding)
        d = dd.read_csv(fn, encoding=encoding, chunkbytes=1000)
        d = d.compute()
        d.index = range(len(d.index))
        assert eq(d, a)
Example #44
def test_groupby_multilevel_getitem():
    df = pd.DataFrame({
        'a': [1, 2, 3, 1, 2, 3],
        'b': [1, 2, 1, 4, 2, 1],
        'c': [1, 3, 2, 1, 1, 2],
        'd': [1, 2, 1, 1, 2, 2]
    })
    ddf = dd.from_pandas(df, 2)

    cases = [(ddf.groupby('a')['b'], df.groupby('a')['b']),
             (ddf.groupby(['a', 'b']), df.groupby(['a', 'b'])),
             (ddf.groupby(['a', 'b'])['c'], df.groupby(['a', 'b'])['c']),
             (ddf.groupby('a')[['b', 'c']], df.groupby('a')[['b', 'c']]),
             (ddf.groupby('a')[['b']], df.groupby('a')[['b']]),
             (ddf.groupby(['a', 'b', 'c']), df.groupby(['a', 'b', 'c']))]

    for d, p in cases:
        assert isinstance(d, dd.groupby._GroupBy)
        assert isinstance(p, pd.core.groupby.GroupBy)
        assert eq(d.sum(), p.sum())
        assert eq(d.min(), p.min())
        assert eq(d.max(), p.max())
        assert eq(d.count(), p.count())
        assert eq(d.mean(), p.mean().astype(float))
Example #45
def test_to_hdf_multiple_files():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'],
                       'y': [1, 2, 3, 4]}, index=[1., 2., 3., 4.])
    a = dd.from_pandas(df, 2)
    df16 = pd.DataFrame({'x': ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i',
                               'j', 'k', 'l', 'm', 'n', 'o', 'p'],
                         'y': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
                               15, 16]},
                        index=[1., 2., 3., 4., 5., 6., 7., 8., 9., 10., 11.,
                               12., 13., 14., 15., 16.])
    b = dd.from_pandas(df16, 16)

    # saving to multiple files
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_*.h5')
        a.to_hdf(fn, '/data')
        out = dd.read_hdf(fn, '/data')
        eq(df, out)

    # saving to multiple files making sure order is kept
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_*.h5')
        b.to_hdf(fn, '/data')
        out = dd.read_hdf(fn, '/data')
        eq(df16, out)

    # saving to multiple files with custom name_function
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_*.h5')
        a.to_hdf(fn, '/data', name_function=lambda i: 'a' * (i + 1))
        out = dd.read_hdf(fn, '/data')
        eq(df, out)

        out = pd.read_hdf(os.path.join(dn, 'data_a.h5'), '/data')
        tm.assert_frame_equal(out, df.iloc[:2])
        out = pd.read_hdf(os.path.join(dn, 'data_aa.h5'), '/data')
        tm.assert_frame_equal(out, df.iloc[2:])

    # test hdf object
    with tmpfile('h5') as fn:
        with pd.HDFStore(fn) as hdf:
            a.to_hdf(hdf, '/data*')
            out = dd.read_hdf(fn, '/data*')
            eq(df, out)
Example #46
def test_indexed_concat(join):
    A = pd.DataFrame({'x': [1, 2, 3, 4, 6, 7], 'y': list('abcdef')},
                     index=[1, 2, 3, 4, 6, 7])
    a = dd.repartition(A, [1, 4, 7])

    B = pd.DataFrame({'x': [10, 20, 40, 50, 60, 80]},
                     index=[1, 2, 4, 5, 6, 8])
    b = dd.repartition(B, [1, 2, 5, 8])

    result = concat_indexed_dataframes([a, b], join=join)
    expected = pd.concat([A, B], axis=0, join=join)
    assert eq(result, expected)

    assert sorted(concat_indexed_dataframes([a, b], join=join).dask) == \
           sorted(concat_indexed_dataframes([a, b], join=join).dask)
    assert sorted(concat_indexed_dataframes([a, b], join='inner').dask) != \
           sorted(concat_indexed_dataframes([a, b], join='outer').dask)
Example #47
def test_to_hdf_modes_multiple_files():
    pytest.importorskip('tables')
    df = pd.DataFrame({
        'x': ['a', 'b', 'c', 'd'],
        'y': [1, 2, 3, 4]
    },
                      index=[1., 2., 3., 4.])

    # appending a single partition to existing data
    a = dd.from_pandas(df, 1)
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data*')
        a.to_hdf(os.path.join(dn, 'data2'), '/data')
        a.to_hdf(fn, '/data', mode='a')
        out = dd.read_hdf(fn, '/data*')
        eq(df.append(df), out)

    # appending two partitions to existing data
    a = dd.from_pandas(df, 2)
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data*')
        a.to_hdf(os.path.join(dn, 'data2'), '/data')
        a.to_hdf(fn, '/data', mode='a')
        out = dd.read_hdf(fn, '/data')
        eq(df.append(df), out)

    # overwriting a file with two partitions
    a = dd.from_pandas(df, 2)
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data*')
        a.to_hdf(os.path.join(dn, 'data1'), '/data')
        a.to_hdf(fn, '/data', mode='w')
        out = dd.read_hdf(fn, '/data')
        eq(df, out)

    # overwriting a single partition, keeping other partitions
    a = dd.from_pandas(df, 2)
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data*')
        a.to_hdf(os.path.join(dn, 'data1'), '/data')
        a.to_hdf(fn, '/data', mode='a', append=False)
        out = dd.read_hdf(fn, '/data')
        eq(df.append(df), out)
Example #48
def test_full_groupby():
    dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]},
                                  index=[0, 1, 3]),
           ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 2, 1]},
                                  index=[5, 6, 8]),
           ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [0, 0, 0]},
                                  index=[9, 9, 9])}
    d = dd.DataFrame(dsk, 'x', ['a', 'b'], [0, 4, 9, 9])
    full = d.compute()

    assert raises(Exception, lambda: d.groupby('does_not_exist'))
    assert raises(Exception, lambda: d.groupby('a').does_not_exist)
    assert 'b' in dir(d.groupby('a'))
    def func(df):
        df['b'] = df.b - df.b.mean()
        return df

    assert eq(d.groupby('a').apply(func), full.groupby('a').apply(func))
Example #49
def test_full_groupby():
    df = pd.DataFrame(
        {
            'a': [1, 2, 3, 4, 5, 6, 7, 8, 9],
            'b': [4, 5, 6, 3, 2, 1, 0, 0, 0]
        },
        index=[0, 1, 3, 5, 6, 8, 9, 9, 9])
    ddf = dd.from_pandas(df, npartitions=3)

    assert raises(Exception, lambda: ddf.groupby('does_not_exist'))
    assert raises(Exception, lambda: ddf.groupby('a').does_not_exist)
    assert 'b' in dir(ddf.groupby('a'))

    def func(df):
        df['b'] = df.b - df.b.mean()
        return df

    assert eq(df.groupby('a').apply(func), ddf.groupby('a').apply(func))
Example #50
def test_dropna():
    df = pd.DataFrame(
        {
            'x': [np.nan, 2, 3, 4, np.nan, 6],
            'y': [1, 2, np.nan, 4, np.nan, np.nan],
            'z': [1, 2, 3, 4, np.nan, np.nan]
        },
        index=[10, 20, 30, 40, 50, 60])
    ddf = dd.from_pandas(df, 3)

    assert eq(ddf.x.dropna(), df.x.dropna())
    assert eq(ddf.y.dropna(), df.y.dropna())
    assert eq(ddf.z.dropna(), df.z.dropna())

    assert eq(ddf.dropna(), df.dropna())
    assert eq(ddf.dropna(how='all'), df.dropna(how='all'))
    assert eq(ddf.dropna(subset=['x']), df.dropna(subset=['x']))
    assert eq(ddf.dropna(subset=['y', 'z']), df.dropna(subset=['y', 'z']))
    assert eq(ddf.dropna(subset=['y', 'z'], how='all'),
              df.dropna(subset=['y', 'z'], how='all'))
Example #51
def test_dataframe_quantile():

    # column X exists to test column order and the resulting divisions
    df = pd.DataFrame(
        {
            'A': np.arange(20),
            'X': np.arange(20, 40),
            'B': np.arange(10, 30),
            'C': ['a', 'b', 'c', 'd'] * 5
        },
        columns=['A', 'X', 'B', 'C'])
    ddf = dd.from_pandas(df, 3)

    result = ddf.quantile()
    assert result.npartitions == 1
    assert result.divisions == ('A', 'X')

    result = result.compute()
    assert isinstance(result, pd.Series)
    tm.assert_index_equal(result.index, pd.Index(['A', 'X', 'B']))
    assert (result > pd.Series([16, 36, 26], index=['A', 'X', 'B'])).all()
    assert (result < pd.Series([17, 37, 27], index=['A', 'X', 'B'])).all()

    result = ddf.quantile([0.25, 0.75])
    assert result.npartitions == 1
    assert result.divisions == (0.25, 0.75)

    result = result.compute()
    assert isinstance(result, pd.DataFrame)
    tm.assert_index_equal(result.index, pd.Index([0.25, 0.75]))
    tm.assert_index_equal(result.columns, pd.Index(['A', 'X', 'B']))
    minexp = pd.DataFrame([[1, 21, 11], [17, 37, 27]],
                          index=[0.25, 0.75],
                          columns=['A', 'X', 'B'])
    assert (result > minexp).all().all()
    maxexp = pd.DataFrame([[2, 22, 12], [18, 38, 28]],
                          index=[0.25, 0.75],
                          columns=['A', 'X', 'B'])
    assert (result < maxexp).all().all()

    assert eq(ddf.quantile(axis=1), df.quantile(axis=1))
    assert raises(ValueError, lambda: ddf.quantile([0.25, 0.75], axis=1))
Example #52
def test_from_castra_with_selection():
    """ Optimizations fuse getitems with load_partitions

    We used to use getitem for both column access and selections
    """
    castra = pytest.importorskip('castra')
    blosc = pytest.importorskip('blosc')
    if (LooseVersion(blosc.__version__) == '1.3.0'
            or LooseVersion(castra.__version__) < '0.1.8'):
        pytest.skip()
    df = pd.DataFrame({
        'x': ['a', 'b', 'c', 'd'],
        'y': [2, 3, 4, 5]
    },
                      index=pd.Index([1., 2., 3., 4.], name='ind'))
    a = dd.from_pandas(df, 2)

    b = dd.from_castra(a.to_castra())

    assert eq(b[b.y > 3].x, df[df.y > 3].x)
Example #53
def test_apply():
    df = pd.DataFrame({'x': [1, 2, 3, 4], 'y': [10, 20, 30, 40]})
    a = dd.from_pandas(df, npartitions=2)

    func = lambda row: row['x'] + row['y']
    eq(a.x.apply(lambda x: x + 1), df.x.apply(lambda x: x + 1))

    eq(a.apply(lambda xy: xy[0] + xy[1], axis=1, columns=None),
       df.apply(lambda xy: xy[0] + xy[1], axis=1))

    assert raises(NotImplementedError, lambda: a.apply(lambda xy: xy, axis=0))
    assert raises(ValueError, lambda: a.apply(lambda xy: xy, axis=1))

    func = lambda x: pd.Series([x, x])
    eq(a.x.apply(func, name=[0, 1]), df.x.apply(func))
Example #54
def test_to_csv_multiple_files_cornercases():
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'], 'y': [1, 2, 3, 4]})
    a = dd.from_pandas(df, 2)
    with tmpdir() as dn:
        with pytest.raises(ValueError):
            fn = os.path.join(dn, "data_*_*.csv")
            a.to_csv(fn)

    df16 = pd.DataFrame({
        'x': [
            'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
            'n', 'o', 'p'
        ],
        'y': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16]
    })
    a = dd.from_pandas(df16, 16)
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_*.csv')
        a.to_csv(fn, index=False)
        result = dd.read_csv(fn).compute().reset_index(drop=True)
        eq(result, df16)

    # test handling existing files when links are optimized out
    a = dd.from_pandas(df, 2)
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_1.csv')
        a.to_csv(fn, index=False)
        fn = os.path.join(dn, 'data_*.csv')
        a.to_csv(fn, mode='w', index=False)
        result = dd.read_csv(fn).compute().reset_index(drop=True)
        eq(result, df)

    # test handling existing files when links are optimized out
    a = dd.from_pandas(df16, 16)
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_01.csv')
        a.to_csv(fn, index=False)
        fn = os.path.join(dn, 'data_*.csv')
        a.to_csv(fn, mode='w', index=False)
        result = dd.read_csv(fn).compute().reset_index(drop=True)
        eq(result, df16)

    # test handling existing files when mode isn't 'w'
    a = dd.from_pandas(df, 2)
    with tmpdir() as dn:
        fn = os.path.join(dn, 'data_*.csv')
        with pytest.raises(ValueError):
            a.to_csv(fn, mode='a')
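
The '*' in these paths is replaced per partition, by default with the partition number, or with name_function(i) when one is given. A minimal sketch on the public to_csv API:

import os
import pandas as pd
import dask.dataframe as dd
from tempfile import TemporaryDirectory

df = pd.DataFrame({"x": list("abcd"), "y": [1, 2, 3, 4]})
a = dd.from_pandas(df, npartitions=2)

with TemporaryDirectory() as dn:
    # partition i goes to the path with '*' replaced by name_function(i);
    # zero-padding keeps lexicographic file order equal to partition order
    a.to_csv(os.path.join(dn, "data_*.csv"), index=False,
             name_function=lambda i: "%02d" % i)
    print(sorted(os.listdir(dn)))  # ['data_00.csv', 'data_01.csv']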
Example #55
def test_set_partition_compute():
    d2 = d.set_partition('b', [0, 2, 9])
    d3 = d.set_partition('b', [0, 2, 9], compute=True)

    assert eq(d2, d3)
    assert eq(d2, full.set_index('b'))
    assert eq(d3, full.set_index('b'))
    assert len(d2.dask) > len(d3.dask)

    d4 = d.set_partition(d.b, [0, 2, 9])
    d5 = d.set_partition(d.b, [0, 2, 9], compute=True)
    exp = full.copy()
    exp.index = exp.b
    assert eq(d4, d5)
    assert eq(d4, exp)
    assert eq(d5, exp)
    assert len(d4.dask) > len(d5.dask)
Example #56
def test_embarrassingly_parallel_operations():
    df = pd.DataFrame({
        'x': [1, 2, 3, 4, None, 6],
        'y': list('abdabd')
    },
                      index=[10, 20, 30, 40, 50, 60])
    a = dd.from_pandas(df, 2)

    assert eq(a.x.astype('float32'), df.x.astype('float32'))
    assert a.x.astype('float32').compute().dtype == 'float32'

    assert eq(a.x.dropna(), df.x.dropna())

    assert eq(a.x.fillna(100), df.x.fillna(100))
    assert eq(a.fillna(100), df.fillna(100))

    assert eq(a.x.between(2, 4), df.x.between(2, 4))

    assert eq(a.x.clip(2, 4), df.x.clip(2, 4))

    assert eq(a.x.notnull(), df.x.notnull())

    assert len(a.sample(0.5).compute()) < len(df)
Example #57
def test_to_csv():
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'], 'y': [1, 2, 3, 4]})

    for npartitions in [1, 2]:
        a = dd.from_pandas(df, npartitions)
        with tmpfile('csv') as fn:
            a.to_csv(fn, index=False)
            result = dd.read_csv(fn).compute().reset_index(drop=True)
            eq(result, df)

        with tmpfile('csv') as fn:
            r = a.to_csv(fn, index=False, compute=False)
            r.compute()
            result = dd.read_csv(fn).compute().reset_index(drop=True)
            eq(result, df)

        with tmpdir() as dn:
            fn = os.path.join(dn, 'data_*.csv')
            a.to_csv(fn, index=False)
            result = dd.read_csv(fn).compute().reset_index(drop=True)
            eq(result, df)
Example #58
def test_getitem_timestamp_str():

    df = pd.DataFrame({
        'A': np.random.randn(100),
        'B': np.random.randn(100)
    },
                      index=pd.date_range('2011-01-01', freq='H', periods=100))
    ddf = dd.from_pandas(df, 10)

    # partial string slice
    assert eq(df['2011-01-02'], ddf['2011-01-02'])
    assert eq(df['2011-01-02':'2011-01-10'], ddf['2011-01-02':'2011-01-10'])

    df = pd.DataFrame({
        'A': np.random.randn(100),
        'B': np.random.randn(100)
    },
                      index=pd.date_range('2011-01-01', freq='D', periods=100))
    ddf = dd.from_pandas(df, 50)
    assert eq(df['2011-01'], ddf['2011-01'])
    assert eq(df['2011'], ddf['2011'])

    assert eq(df['2011-01':'2012-05'], ddf['2011-01':'2012-05'])
    assert eq(df['2011':'2015'], ddf['2011':'2015'])
Example #59
def test_read_csv_with_nrows():
    with filetext(text) as fn:
        f = dd.read_csv(fn, nrows=3)
        assert list(f.columns) == ['name', 'amount']
        assert f.npartitions == 1
        assert eq(dd.read_csv(fn, nrows=3), pd.read_csv(fn, nrows=3))
Example #60
def test_from_pandas_single_row():
    df = pd.DataFrame({'x': [1]}, index=[1])
    ddf = dd.from_pandas(df, npartitions=1)
    assert ddf.divisions == (1, 1)
    assert eq(ddf, df)
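
divisions record the index value at each partition boundary, with the final boundary inclusive, which is why a single-row frame gets the degenerate divisions (1, 1). A minimal sketch of how divisions map to partitions:

import pandas as pd
import dask.dataframe as dd

df = pd.DataFrame({"x": range(9)}, index=range(9))
ddf = dd.from_pandas(df, npartitions=3)

# n partitions need n + 1 division values; partition i holds rows with
# divisions[i] <= index < divisions[i+1], except the last partition,
# which also includes its right boundary
print(ddf.divisions)  # (0, 3, 6, 8)
print(ddf.get_partition(1).compute())  # rows with index 3, 4, 5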