Example #1
def test_series_groupby_errors():
    s = pd.Series([1, 2, 2, 1, 1])

    ss = dd.from_pandas(s, npartitions=2)

    msg = "Grouper for '1' not 1-dimensional"
    with tm.assertRaisesRegexp(ValueError, msg):
        s.groupby([1, 2])  # pandas
    with tm.assertRaisesRegexp(ValueError, msg):
        ss.groupby([1, 2]) # dask should raise the same error
    msg = "Grouper for '2' not 1-dimensional"
    with tm.assertRaisesRegexp(ValueError, msg):
        s.groupby([2])  # pandas
    with tm.assertRaisesRegexp(ValueError, msg):
        ss.groupby([2]) # dask should raise the same error

    msg = "No group keys passed!"
    with tm.assertRaisesRegexp(ValueError, msg):
        s.groupby([])  # pandas
    with tm.assertRaisesRegexp(ValueError, msg):
        ss.groupby([]) # dask should raise the same error

    sss = dd.from_pandas(s, npartitions=3)
    assert raises(NotImplementedError, lambda: ss.groupby(sss))

    with tm.assertRaises(KeyError):
        s.groupby('x')  # pandas
    with tm.assertRaises(KeyError):
        ss.groupby('x') # dask should raise the same error
Example #2
    def test_to_dask_dataframe(self):
        # Test conversion of Datasets to dask DataFrames
        x = da.from_array(np.random.randn(10), chunks=4)
        y = np.arange(10, dtype='uint8')
        t = list('abcdefghij')

        ds = Dataset(OrderedDict([('a', ('t', x)),
                                  ('b', ('t', y)),
                                  ('t', ('t', t))]))

        expected_pd = pd.DataFrame({'a': x,
                                    'b': y},
                                   index=pd.Index(t, name='t'))

        # test if 1-D index is correctly set up
        expected = dd.from_pandas(expected_pd, chunksize=4)
        actual = ds.to_dask_dataframe(set_index=True)
        # test if we have dask dataframes
        self.assertIsInstance(actual, dd.DataFrame)

        # use the .equals from pandas to check dataframes are equivalent
        assert_frame_equal(expected.compute(), actual.compute())

        # test if no index is given
        expected = dd.from_pandas(expected_pd.reset_index(drop=False),
                                  chunksize=4)

        actual = ds.to_dask_dataframe(set_index=False)

        self.assertIsInstance(actual, dd.DataFrame)
        assert_frame_equal(expected.compute(), actual.compute())
Example #3
def test_concat4_interleave_partitions():
    pdf1 = pd.DataFrame(np.random.randn(10, 5),
                        columns=list('ABCDE'), index=list('abcdefghij'))
    pdf2 = pd.DataFrame(np.random.randn(13, 5),
                        columns=list('ABCDE'), index=list('fghijklmnopqr'))
    pdf3 = pd.DataFrame(np.random.randn(13, 6),
                        columns=list('CDEXYZ'), index=list('fghijklmnopqr'))

    ddf1 = dd.from_pandas(pdf1, 2)
    ddf2 = dd.from_pandas(pdf2, 3)
    ddf3 = dd.from_pandas(pdf3, 2)

    msg = ('All inputs have known divisions which cannot be '
           'concatenated in order. Specify '
           'interleave_partitions=True to ignore order')

    cases = [[ddf1, ddf1], [ddf1, ddf2], [ddf1, ddf3], [ddf2, ddf1],
             [ddf2, ddf3], [ddf3, ddf1], [ddf3, ddf2]]
    for case in cases:
        pdcase = [c.compute() for c in case]

        with pytest.raises(ValueError) as err:
            dd.concat(case)
        assert msg in str(err.value)

        assert_eq(dd.concat(case, interleave_partitions=True),
                  pd.concat(pdcase))
        assert_eq(dd.concat(case, join='inner', interleave_partitions=True),
                  pd.concat(pdcase, join='inner'))

    msg = "'join' must be 'inner' or 'outer'"
    with pytest.raises(ValueError) as err:
        dd.concat([ddf1, ddf1], join='invalid', interleave_partitions=True)
    assert msg in str(err.value)
Example #4
def test_concat3():
    pdf1 = pd.DataFrame(np.random.randn(6, 5),
                        columns=list('ABCDE'), index=list('abcdef'))
    pdf2 = pd.DataFrame(np.random.randn(6, 5),
                        columns=list('ABCFG'), index=list('ghijkl'))
    pdf3 = pd.DataFrame(np.random.randn(6, 5),
                        columns=list('ABCHI'), index=list('mnopqr'))
    ddf1 = dd.from_pandas(pdf1, 2)
    ddf2 = dd.from_pandas(pdf2, 3)
    ddf3 = dd.from_pandas(pdf3, 2)

    result = dd.concat([ddf1, ddf2])
    assert result.divisions == ddf1.divisions[:-1] + ddf2.divisions
    assert result.npartitions == ddf1.npartitions + ddf2.npartitions
    assert_eq(result, pd.concat([pdf1, pdf2]))

    assert_eq(dd.concat([ddf1, ddf2], interleave_partitions=True),
              pd.concat([pdf1, pdf2]))

    result = dd.concat([ddf1, ddf2, ddf3])
    assert result.divisions == (ddf1.divisions[:-1] + ddf2.divisions[:-1] +
                                ddf3.divisions)
    assert result.npartitions == (ddf1.npartitions + ddf2.npartitions +
                                  ddf3.npartitions)
    assert_eq(result, pd.concat([pdf1, pdf2, pdf3]))

    assert_eq(dd.concat([ddf1, ddf2, ddf3], interleave_partitions=True),
              pd.concat([pdf1, pdf2, pdf3]))
Example #5
def test_clip():

    # clip internally calls dd.Series.clip

    s = pd.Series(np.random.randint(1, 100, size=20))
    ds = dd.from_pandas(s, 3)

    # applying Dask ufunc doesn't trigger computation
    assert isinstance(da.clip(ds, 5, 50), dd.Series)
    assert_eq(da.clip(ds, 5, 50), np.clip(s, 5, 50))

    # applying Dask ufunc doesn't trigger computation
    assert isinstance(np.clip(ds, 5, 50), dd.Series)
    assert_eq(np.clip(ds, 5, 50), np.clip(s, 5, 50))

    # applying Dask ufunc to normal Series triggers computation
    assert isinstance(da.clip(s, 5, 50), pd.Series)
    assert_eq(da.clip(s, 5, 50), np.clip(s, 5, 50))

    df = pd.DataFrame(np.random.randint(1, 100, size=(20, 2)),
                      columns=['A', 'B'])
    ddf = dd.from_pandas(df, 3)

    # applying Dask ufunc doesn't trigger computation
    assert isinstance(da.clip(ddf, 5.5, 40.5), dd.DataFrame)
    assert_eq(da.clip(ddf, 5.5, 40.5), np.clip(df, 5.5, 40.5))

    # applying Dask ufunc doesn't trigger computation
    assert isinstance(np.clip(ddf, 5.5, 40.5), dd.DataFrame)
    assert_eq(np.clip(ddf, 5.5, 40.5), np.clip(df, 5.5, 40.5))

    # applying Dask ufunc to normal DataFrame triggers computation
    assert isinstance(da.clip(df, 5.5, 40.5), pd.DataFrame)
    assert_eq(da.clip(df, 5.5, 40.5), np.clip(df, 5.5, 40.5))
Example #6
def test_pivot_table_errors():
    df = pd.DataFrame({'A': np.random.choice(list('abc'), size=10),
                       'B': np.random.randn(10),
                       'C': pd.Categorical(np.random.choice(list('abc'), size=10))})
    ddf = dd.from_pandas(df, 2)

    msg = "'index' must be the name of an existing column"
    with tm.assertRaisesRegexp(ValueError, msg):
        dd.pivot_table(ddf, index=['A'], columns='C', values='B')
    msg = "'columns' must be the name of an existing column"
    with tm.assertRaisesRegexp(ValueError, msg):
        dd.pivot_table(ddf, index='A', columns=['C'], values='B')
    msg = "'values' must be the name of an existing column"
    with tm.assertRaisesRegexp(ValueError, msg):
        dd.pivot_table(ddf, index='A', columns='C', values=['B'])

    msg = "aggfunc must be either 'mean', 'sum' or 'count'"
    with tm.assertRaisesRegexp(ValueError, msg):
        dd.pivot_table(ddf, index='A', columns='C', values='B', aggfunc=['sum'])

    with tm.assertRaisesRegexp(ValueError, msg):
        dd.pivot_table(ddf, index='A', columns='C', values='B', aggfunc='xx')

    df = pd.DataFrame({'A': np.random.choice(list('abc'), size=10),
                       'B': np.random.randn(10),
                       'C': np.random.choice(list('abc'), size=10)})
    ddf = dd.from_pandas(df, 2)
    msg = "'columns' must be category dtype"
    with tm.assertRaisesRegexp(ValueError, msg):
        dd.pivot_table(ddf, index='A', columns='C', values='B')
Example #7
def test_series_format():
    s = pd.Series([1, 2, 3, 4, 5, 6, 7, 8],
                  index=list('ABCDEFGH'))
    ds = dd.from_pandas(s, 3)
    exp = """Dask Series Structure:
npartitions=3
A    int64
D      ...
G      ...
H      ...
dtype: int64
Dask Name: from_pandas, 3 tasks"""
    assert repr(ds) == exp
    assert str(ds) == exp

    exp = """npartitions=3
A    int64
D      ...
G      ...
H      ..."""
    assert ds.to_string() == exp

    s = pd.Series([1, 2, 3, 4, 5, 6, 7, 8],
                  index=list('ABCDEFGH'), name='XXX')
    ds = dd.from_pandas(s, 3)
    exp = """Dask Series Structure:
npartitions=3
A    int64
D      ...
G      ...
H      ...
Name: XXX, dtype: int64
Dask Name: from_pandas, 3 tasks"""
    assert repr(ds) == exp
    assert str(ds) == exp
Example #8
def test_frame_2ufunc_out():
    input_matrix = np.random.randint(1, 100, size=(20, 2))

    df = pd.DataFrame(input_matrix, columns=['A', 'B'])
    ddf = dd.from_pandas(df, 3)

    # column number mismatch
    df_out = pd.DataFrame(np.random.randint(1, 100, size=(20, 3)),
                          columns=['X', 'Y', 'Z'])
    ddf_out = dd.from_pandas(df_out, 3)

    with pytest.raises(ValueError):
        np.sin(ddf, out=ddf_out)

    # types mismatch
    ddf_out = dd.from_pandas(pd.Series([0]), 1)
    with pytest.raises(TypeError):
        np.sin(ddf, out=ddf_out)

    df_out = pd.DataFrame(np.random.randint(1, 100, size=(20, 2)),
                          columns=['X', 'Y'])
    ddf_out = dd.from_pandas(df_out, 3)

    np.sin(ddf, out=ddf_out)
    np.add(ddf_out, 10, out=ddf_out)

    expected = pd.DataFrame(np.sin(input_matrix) + 10, columns=['A', 'B'])

    assert_eq(ddf_out, expected)
Example #9
def test_concat(join):
    pdf1 = pd.DataFrame({'x': [1, 2, 3, 4, 6, 7],
                         'y': list('abcdef')},
                        index=[1, 2, 3, 4, 6, 7])
    ddf1 = dd.from_pandas(pdf1, 2)
    pdf2 = pd.DataFrame({'x': [1, 2, 3, 4, 6, 7],
                         'y': list('abcdef')},
                        index=[8, 9, 10, 11, 12, 13])
    ddf2 = dd.from_pandas(pdf2, 2)

    # different columns
    pdf3 = pd.DataFrame({'x': [1, 2, 3, 4, 6, 7],
                         'z': list('abcdef')},
                        index=[8, 9, 10, 11, 12, 13])
    ddf3 = dd.from_pandas(pdf3, 2)

    for (dd1, dd2, pd1, pd2) in [(ddf1, ddf2, pdf1, pdf2),
                                 (ddf1, ddf3, pdf1, pdf3)]:
        result = dd.concat([dd1, dd2], join=join)
        expected = pd.concat([pd1, pd2], join=join)
        assert eq(result, expected)

    # test outer only, inner has a problem on pandas side
    for (dd1, dd2, pd1, pd2) in [(ddf1, ddf2, pdf1, pdf2),
                                 (ddf1, ddf3, pdf1, pdf3),
                                 (ddf1.x, ddf2.x, pdf1.x, pdf2.x),
                                 (ddf1.x, ddf3.z, pdf1.x, pdf3.z),
                                 (ddf1.x, ddf2.x, pdf1.x, pdf2.x),
                                 (ddf1.x, ddf3.z, pdf1.x, pdf3.z)]:
        result = dd.concat([dd1, dd2])
        expected = pd.concat([pd1, pd2])
        assert eq(result, expected)
Example #10
def test_groupy_non_aligned_index():
    pdf = pd.DataFrame({'a': [1, 2, 6, 4, 4, 6, 4, 3, 7] * 10,
                        'b': [4, 2, 7, 3, 3, 1, 1, 1, 2] * 10,
                        'c': [0, 1, 2, 3, 4, 5, 6, 7, 8] * 10},
                       columns=['c', 'b', 'a'])

    ddf3 = dd.from_pandas(pdf, npartitions=3)
    ddf7 = dd.from_pandas(pdf, npartitions=7)

    # working examples
    ddf3.groupby(['a', 'b'])
    ddf3.groupby([ddf3['a'], ddf3['b']])

    # misaligned divisions
    with pytest.raises(NotImplementedError):
        ddf3.groupby(ddf7['a'])

    with pytest.raises(NotImplementedError):
        ddf3.groupby([ddf7['a'], ddf7['b']])

    with pytest.raises(NotImplementedError):
        ddf3.groupby([ddf7['a'], ddf3['b']])

    with pytest.raises(NotImplementedError):
        ddf3.groupby([ddf3['a'], ddf7['b']])

    with pytest.raises(NotImplementedError):
        ddf3.groupby([ddf7['a'], 'b'])
Example #11
def test_to_hdf():
    pytest.importorskip('tables')
    df = pd.DataFrame({'x': ['a', 'b', 'c', 'd'],
                       'y': [1, 2, 3, 4]}, index=[1., 2., 3., 4.])
    a = dd.from_pandas(df, 2)

    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data')
        out = pd.read_hdf(fn, '/data')
        tm.assert_frame_equal(df, out[:])

    with tmpfile('h5') as fn:
        a.x.to_hdf(fn, '/data')
        out = pd.read_hdf(fn, '/data')
        tm.assert_series_equal(df.x, out[:])

    a = dd.from_pandas(df, 1)
    with tmpfile('h5') as fn:
        a.to_hdf(fn, '/data')
        out = pd.read_hdf(fn, '/data')
        tm.assert_frame_equal(df, out[:])

    # test compute = False
    with tmpfile('h5') as fn:
        r = a.to_hdf(fn, '/data', compute=False)
        r.compute()
        out = pd.read_hdf(fn, '/data')
        tm.assert_frame_equal(df, out[:])
Example #12
def test_get_dummies_kwargs():
    s = pd.Series([1, 1, 1, 2, 2, 1, 3, 4], dtype='category')
    exp = pd.get_dummies(s, prefix='X', prefix_sep='-')

    ds = dd.from_pandas(s, 2)
    res = dd.get_dummies(ds, prefix='X', prefix_sep='-')
    assert_eq(res, exp)
    tm.assert_index_equal(res.columns, pd.Index(['X-1', 'X-2', 'X-3', 'X-4']))

    exp = pd.get_dummies(s, drop_first=True)

    ds = dd.from_pandas(s, 2)
    res = dd.get_dummies(ds, drop_first=True)
    assert_eq(res, exp)
    tm.assert_index_equal(res.columns, exp.columns)

    # nan
    s = pd.Series([1, 1, 1, 2, np.nan, 3, np.nan, 5], dtype='category')
    exp = pd.get_dummies(s)

    ds = dd.from_pandas(s, 2)
    res = dd.get_dummies(ds)
    assert_eq(res, exp)
    tm.assert_index_equal(res.columns, exp.columns)

    # dummy_na
    exp = pd.get_dummies(s, dummy_na=True)

    ds = dd.from_pandas(s, 2)
    res = dd.get_dummies(ds, dummy_na=True)
    assert_eq(res, exp)
    tm.assert_index_equal(res.columns, pd.Index([1, 2, 3, 5, np.nan]))
Example #13
def test_getitem():
    df = pd.DataFrame({'A': [1, 2, 3, 4, 5, 6, 7, 8, 9],
                       'B': [9, 8, 7, 6, 5, 4, 3, 2, 1],
                       'C': [True, False, True] * 3},
                      columns=list('ABC'))
    ddf = dd.from_pandas(df, 2)
    assert_eq(ddf['A'], df['A'])
    # check cache consistency
    tm.assert_series_equal(ddf['A']._meta, ddf._meta['A'])

    assert_eq(ddf[['A', 'B']], df[['A', 'B']])
    tm.assert_frame_equal(ddf[['A', 'B']]._meta, ddf._meta[['A', 'B']])

    assert_eq(ddf[ddf.C], df[df.C])
    tm.assert_series_equal(ddf.C._meta, ddf._meta.C)

    assert_eq(ddf[ddf.C.repartition([0, 2, 5, 8])], df[df.C])

    pytest.raises(KeyError, lambda: df['X'])
    pytest.raises(KeyError, lambda: df[['A', 'X']])
    pytest.raises(AttributeError, lambda: df.X)

    # not str/unicode
    df = pd.DataFrame(np.random.randn(10, 5))
    ddf = dd.from_pandas(df, 2)
    assert_eq(ddf[0], df[0])
    assert_eq(ddf[[1, 2]], df[[1, 2]])

    pytest.raises(KeyError, lambda: df[8])
    pytest.raises(KeyError, lambda: df[[1, 8]])
Example #14
def test_set_index_drop(drop):
    pdf = pd.DataFrame({'A': list('ABAABBABAA'),
                        'B': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                        'C': [1, 2, 3, 2, 1, 3, 2, 4, 2, 3]})
    ddf = dd.from_pandas(pdf, 3)

    assert_eq(ddf.set_index('A', drop=drop),
              pdf.set_index('A', drop=drop))
    assert_eq(ddf.set_index('B', drop=drop),
              pdf.set_index('B', drop=drop))
    assert_eq(ddf.set_index('C', drop=drop),
              pdf.set_index('C', drop=drop))
    assert_eq(ddf.set_index(ddf.A, drop=drop),
              pdf.set_index(pdf.A, drop=drop))
    assert_eq(ddf.set_index(ddf.B, drop=drop),
              pdf.set_index(pdf.B, drop=drop))
    assert_eq(ddf.set_index(ddf.C, drop=drop),
              pdf.set_index(pdf.C, drop=drop))

    # numeric columns
    pdf = pd.DataFrame({0: list('ABAABBABAA'),
                        1: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                        2: [1, 2, 3, 2, 1, 3, 2, 4, 2, 3]})
    ddf = dd.from_pandas(pdf, 3)
    assert_eq(ddf.set_index(0, drop=drop),
              pdf.set_index(0, drop=drop))
    assert_eq(ddf.set_index(2, drop=drop),
              pdf.set_index(2, drop=drop))
Example #15
def test_groupby_column_and_index_apply(group_args, apply_func):
    df = pd.DataFrame({'idx': [1, 1, 1, 2, 2, 2],
                       'a': [1, 2, 1, 2, 1, 2],
                       'b': np.arange(6)}
                      ).set_index('idx')

    ddf = dd.from_pandas(df, npartitions=df.index.nunique())
    ddf_no_divs = dd.from_pandas(df, npartitions=df.index.nunique(), sort=False)

    # Expected result
    expected = df.groupby(group_args).apply(apply_func)

    # Compute on dask DataFrame with divisions (no shuffling)
    result = ddf.groupby(group_args).apply(apply_func)
    assert_eq(expected, result, check_divisions=False)

    # Check that partitioning is preserved
    assert ddf.divisions == result.divisions

    # Check that no shuffling occurred.
    # The groupby operation should add only 1 task per partition
    assert len(result.dask) == (len(ddf.dask) + ddf.npartitions)

    # Compute on dask DataFrame without divisions (requires shuffling)
    result = ddf_no_divs.groupby(group_args).apply(apply_func)
    assert_eq(expected, result, check_divisions=False)

    # Check that divisions were preserved (all None in this case)
    assert ddf_no_divs.divisions == result.divisions

    # Crude check to see if shuffling was performed.
    # The groupby operation should add more than 1 task per partition
    assert len(result.dask) > (len(ddf_no_divs.dask) + ddf_no_divs.npartitions)
Example #16
def test_append():
    df = pd.DataFrame({"a": [1, 2, 3, 4, 5, 6], "b": [1, 2, 3, 4, 5, 6]})
    df2 = pd.DataFrame({"a": [1, 2, 3, 4, 5, 6], "b": [1, 2, 3, 4, 5, 6]}, index=[6, 7, 8, 9, 10, 11])
    df3 = pd.DataFrame({"b": [1, 2, 3, 4, 5, 6], "c": [1, 2, 3, 4, 5, 6]}, index=[6, 7, 8, 9, 10, 11])

    ddf = dd.from_pandas(df, 2)
    ddf2 = dd.from_pandas(df2, 2)
    ddf3 = dd.from_pandas(df3, 2)
    assert eq(ddf.append(ddf2), df.append(df2))
    assert eq(ddf.a.append(ddf2.a), df.a.append(df2.a))
    # different columns
    assert eq(ddf.append(ddf3), df.append(df3))
    assert eq(ddf.a.append(ddf3.b), df.a.append(df3.b))

    # dask + pandas
    assert eq(ddf.append(df2), df.append(df2))
    assert eq(ddf.a.append(df2.a), df.a.append(df2.a))

    assert eq(ddf.append(df3), df.append(df3))
    assert eq(ddf.a.append(df3.b), df.a.append(df3.b))

    s = pd.Series([7, 8], name=6, index=["a", "b"])
    assert eq(ddf.append(s), df.append(s))

    df4 = pd.DataFrame({"a": [1, 2, 3, 4, 5, 6], "b": [1, 2, 3, 4, 5, 6]}, index=[4, 5, 6, 7, 8, 9])
    ddf4 = dd.from_pandas(df4, 2)
    msg = (
        "Unable to append two dataframes to each other with known "
        "divisions if those divisions are not ordered. "
        "The divisions/index of the second dataframe must be "
        "greater than the divisions/index of the first dataframe."
    )
    with tm.assertRaisesRegexp(ValueError, msg):
        ddf.append(ddf4)
Example #17
def test_concat4_interleave_partitions():
    pdf1 = pd.DataFrame(np.random.randn(10, 5), columns=list("ABCDE"), index=list("abcdefghij"))
    pdf2 = pd.DataFrame(np.random.randn(13, 5), columns=list("ABCDE"), index=list("fghijklmnopqr"))
    pdf3 = pd.DataFrame(np.random.randn(13, 6), columns=list("CDEXYZ"), index=list("fghijklmnopqr"))

    ddf1 = dd.from_pandas(pdf1, 2)
    ddf2 = dd.from_pandas(pdf2, 3)
    ddf3 = dd.from_pandas(pdf3, 2)

    msg = (
        "All inputs have known divisions which cannnot be "
        "concatenated in order. Specify "
        "interleave_partitions=True to ignore order"
    )

    cases = [[ddf1, ddf1], [ddf1, ddf2], [ddf1, ddf3], [ddf2, ddf1], [ddf2, ddf3], [ddf3, ddf1], [ddf3, ddf2]]
    for case in cases:
        pdcase = [c.compute() for c in case]

        with tm.assertRaisesRegexp(ValueError, msg):
            dd.concat(case)

        assert eq(dd.concat(case, interleave_partitions=True), pd.concat(pdcase))
        assert eq(dd.concat(case, join="inner", interleave_partitions=True), pd.concat(pdcase, join="inner"))

    msg = "'join' must be 'inner' or 'outer'"
    with tm.assertRaisesRegexp(ValueError, msg):
        dd.concat([ddf1, ddf1], join="invalid", interleave_partitions=True)
Example #18
def test_getitem():
    df = pd.DataFrame(
        {"A": [1, 2, 3, 4, 5, 6, 7, 8, 9], "B": [9, 8, 7, 6, 5, 4, 3, 2, 1], "C": [True, False, True] * 3},
        columns=list("ABC"),
    )
    ddf = dd.from_pandas(df, 2)
    assert eq(ddf["A"], df["A"])
    tm.assert_series_equal(ddf["A"]._pd, ddf._pd["A"])  # check cache consistency

    assert eq(ddf[["A", "B"]], df[["A", "B"]])
    tm.assert_frame_equal(ddf[["A", "B"]]._pd, ddf._pd[["A", "B"]])

    assert eq(ddf[ddf.C], df[df.C])
    tm.assert_series_equal(ddf.C._pd, ddf._pd.C)

    assert eq(ddf[ddf.C.repartition([0, 2, 5, 8])], df[df.C])

    assert raises(KeyError, lambda: df["X"])
    assert raises(KeyError, lambda: df[["A", "X"]])
    assert raises(AttributeError, lambda: df.X)

    # not str/unicode
    df = pd.DataFrame(np.random.randn(10, 5))
    ddf = dd.from_pandas(df, 2)
    assert eq(ddf[0], df[0])
    assert eq(ddf[[1, 2]], df[[1, 2]])

    assert raises(KeyError, lambda: df[8])
    assert raises(KeyError, lambda: df[[1, 8]])
Example #19
def test_index_format():
    s = pd.Series([1, 2, 3, 4, 5, 6, 7, 8],
                  index=list('ABCDEFGH'))
    ds = dd.from_pandas(s, 3)
    exp = """Dask Index Structure:
npartitions=3
A    object
D       ...
G       ...
H       ...
dtype: object
Dask Name: from_pandas, 6 tasks"""
    assert repr(ds.index) == exp
    assert str(ds.index) == exp

    s = pd.Series([1, 2, 3, 4, 5, 6, 7, 8],
                  index=pd.CategoricalIndex([1, 2, 3, 4, 5, 6, 7, 8], name='YYY'))
    ds = dd.from_pandas(s, 3)
    exp = dedent("""\
    Dask Index Structure:
    npartitions=3
    1    category[known]
    4                ...
    7                ...
    8                ...
    Name: YYY, dtype: category
    Dask Name: from_pandas, 6 tasks""")
    assert repr(ds.index) == exp
    assert str(ds.index) == exp
Example #20
def test_reductions_out(frame, axis, out, redfunc):
    dsk_in = dd.from_pandas(frame, 3)
    dsk_out = dd.from_pandas(pd.Series([0]), 1).sum()

    if out is not None:
        dsk_out = dd.from_pandas(out, 3)

    np_redfunc = getattr(np, redfunc)
    pd_redfunc = getattr(frame.__class__, redfunc)
    dsk_redfunc = getattr(dsk_in.__class__, redfunc)

    if redfunc in ['var', 'std']:
        # numpy has default ddof value 0 while
        # dask and pandas have 1, so ddof should be passed
        # explicitly when calling np.var(dask)
        np_redfunc(dsk_in, axis=axis, ddof=1, out=dsk_out)
    else:
        np_redfunc(dsk_in, axis=axis, out=dsk_out)

    assert_eq(dsk_out, pd_redfunc(frame, axis=axis))

    dsk_redfunc(dsk_in, axis=axis, split_every=False, out=dsk_out)
    assert_eq(dsk_out, pd_redfunc(frame, axis=axis))

    dsk_redfunc(dsk_in, axis=axis, split_every=2, out=dsk_out)
    assert_eq(dsk_out, pd_redfunc(frame, axis=axis))
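The ddof comment in the example above is worth spelling out: numpy's var and std default to the population estimate (ddof=0), while pandas and dask default to the sample estimate (ddof=1). A minimal sketch illustrating the difference, using only numpy and pandas:

import numpy as np
import pandas as pd

s = pd.Series([1.0, 2.0, 3.0, 4.0])

# numpy defaults to the population variance (ddof=0)...
assert np.isclose(np.var(s.to_numpy()), s.var(ddof=0))       # both 1.25
# ...while pandas (and dask) default to the sample variance (ddof=1),
# which is why np.var is called with ddof=1 in the test above
assert np.isclose(np.var(s.to_numpy(), ddof=1), s.var())     # both 5/3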
Example #21
def test_concat_unknown_divisions_errors():
    a = pd.Series([1, 2, 3, 4, 5, 6])
    b = pd.Series([4, 3, 2, 1])
    aa = dd.from_pandas(a, npartitions=2, sort=False)
    bb = dd.from_pandas(b, npartitions=2, sort=False)

    with pytest.raises(ValueError):
        dd.concat([aa, bb], axis=1).compute()
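For contrast with the failure above, the same axis=1 concatenation works when the inputs keep known divisions (the default sort=True in dd.from_pandas); a minimal sketch:

import pandas as pd
import dask.dataframe as dd
from dask.dataframe.utils import assert_eq

a = pd.Series([1, 2, 3, 4, 5, 6], name='a')
b = pd.Series([4, 3, 2, 1, 0, -1], name='b')
# sorting keeps divisions known, so dask can align partitions along axis=1
aa = dd.from_pandas(a, npartitions=2)
bb = dd.from_pandas(b, npartitions=2)
assert_eq(dd.concat([aa, bb], axis=1), pd.concat([a, b], axis=1))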
Example #22
def test_from_pandas_with_datetime_index():
    with filetext(timeseries) as fn:
        df = pd.read_csv(fn, index_col=0, header=0, usecols=[0, 4],
                         parse_dates=['Date'])
        ddf = dd.from_pandas(df, 2)
        eq(df, ddf)
        ddf = dd.from_pandas(df, chunksize=2)
        eq(df, ddf)
Example #23
def test_gh_517():
    arr = np.random.randn(100, 2)
    df = pd.DataFrame(arr, columns=["a", "b"])
    ddf = dd.from_pandas(df, 2)
    assert ddf.index.nunique().compute() == 100

    ddf2 = dd.from_pandas(pd.concat([df, df]), 5)
    assert ddf2.index.nunique().compute() == 100
Example #24
def test_merge_index_without_divisions(shuffle):
    a = pd.DataFrame({"x": [1, 2, 3, 4, 5]}, index=[1, 2, 3, 4, 5])
    b = pd.DataFrame({"y": [1, 2, 3, 4, 5]}, index=[5, 4, 3, 2, 1])

    aa = dd.from_pandas(a, npartitions=3, sort=False)
    bb = dd.from_pandas(b, npartitions=2)

    eq(aa.join(bb, how="inner", shuffle=shuffle), a.join(b, how="inner"))
Example #25
def test_concat5():
    pdf1 = pd.DataFrame(np.random.randn(7, 5),
                        columns=list('ABCDE'), index=list('abcdefg'))
    pdf2 = pd.DataFrame(np.random.randn(7, 6),
                        columns=list('FGHIJK'), index=list('abcdefg'))
    pdf3 = pd.DataFrame(np.random.randn(7, 6),
                        columns=list('FGHIJK'), index=list('cdefghi'))
    pdf4 = pd.DataFrame(np.random.randn(7, 5),
                        columns=list('FGHAB'), index=list('cdefghi'))
    pdf5 = pd.DataFrame(np.random.randn(7, 5),
                        columns=list('FGHAB'), index=list('fklmnop'))

    ddf1 = dd.from_pandas(pdf1, 2)
    ddf2 = dd.from_pandas(pdf2, 3)
    ddf3 = dd.from_pandas(pdf3, 2)
    ddf4 = dd.from_pandas(pdf4, 2)
    ddf5 = dd.from_pandas(pdf5, 3)

    cases = [[ddf1, ddf2], [ddf1, ddf3], [ddf1, ddf4], [ddf1, ddf5],
             [ddf3, ddf4], [ddf3, ddf5], [ddf5, ddf1, ddf4], [ddf5, ddf3],
             [ddf1.A, ddf4.A], [ddf2.F, ddf3.F], [ddf4.A, ddf5.A],
             [ddf1.A, ddf4.F], [ddf2.F, ddf3.H], [ddf4.A, ddf5.B],
             [ddf1, ddf4.A], [ddf3.F, ddf2], [ddf5, ddf1.A, ddf2]]

    for case in cases:
        pdcase = [c.compute() for c in case]

        with pytest.warns(None):
            # some cases will raise a warning directly from pandas
            assert_eq(dd.concat(case, interleave_partitions=True),
                      pd.concat(pdcase))

        assert_eq(dd.concat(case, join='inner', interleave_partitions=True),
                  pd.concat(pdcase, join='inner'))

        assert_eq(dd.concat(case, axis=1), pd.concat(pdcase, axis=1))

        assert_eq(dd.concat(case, axis=1, join='inner'),
                  pd.concat(pdcase, axis=1, join='inner'))

    # Dask + pandas
    cases = [[ddf1, pdf2], [ddf1, pdf3], [pdf1, ddf4],
             [pdf1.A, ddf4.A], [ddf2.F, pdf3.F],
             [ddf1, pdf4.A], [ddf3.F, pdf2], [ddf2, pdf1, ddf3.F]]

    for case in cases:
        pdcase = [c.compute() if isinstance(c, _Frame) else c for c in case]

        assert_eq(dd.concat(case, interleave_partitions=True),
                  pd.concat(pdcase))

        assert_eq(dd.concat(case, join='inner', interleave_partitions=True),
                  pd.concat(pdcase, join='inner'))

        assert_eq(dd.concat(case, axis=1), pd.concat(pdcase, axis=1))

        assert_eq(dd.concat(case, axis=1, join='inner'),
                  pd.concat(pdcase, axis=1, join='inner'))
Example #26
def test_merge_by_multiple_columns():

    pdf1l = pd.DataFrame({'a': list('abcdefghij'),
                          'b': list('abcdefghij'),
                          'c': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
                         index=list('abcdefghij'))
    pdf1r = pd.DataFrame({'d': list('abcdefghij'),
                          'e': list('abcdefghij'),
                          'f': [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]},
                         index=list('abcdefghij'))

    pdf2l = pd.DataFrame({'a': list('abcdeabcde'),
                          'b': list('abcabcabca'),
                          'c': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
                         index=list('abcdefghij'))
    pdf2r = pd.DataFrame({'d': list('edcbaedcba'),
                          'e': list('aaabbbcccd'),
                          'f': [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]},
                         index=list('fghijklmno'))

    pdf3l = pd.DataFrame({'a': list('aaaaaaaaaa'),
                          'b': list('aaaaaaaaaa'),
                          'c': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
                         index=list('abcdefghij'))
    pdf3r = pd.DataFrame({'d': list('aaabbbccaa'),
                          'e': list('abbbbbbbbb'),
                          'f': [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]},
                         index=list('ABCDEFGHIJ'))

    for pdl, pdr in [(pdf1l, pdf1r), (pdf2l, pdf2r), (pdf3l, pdf3r)]:

        for lpart, rpart in [(2, 2), (3, 2), (2, 3)]:

            ddl = dd.from_pandas(pdl, lpart)
            ddr = dd.from_pandas(pdr, rpart)

            for how in ['inner', 'outer', 'left', 'right']:
                eq(ddl.join(ddr, how=how), pdl.join(pdr, how=how))
                eq(ddr.join(ddl, how=how), pdr.join(pdl, how=how))

                eq(dd.merge(ddl, ddr, how=how, left_index=True, right_index=True),
                   pd.merge(pdl, pdr, how=how, left_index=True, right_index=True))
                eq(dd.merge(ddr, ddl, how=how, left_index=True, right_index=True),
                   pd.merge(pdr, pdl, how=how, left_index=True, right_index=True))

                # hash join
                list_eq(dd.merge(ddl, ddr, how=how, left_on='a', right_on='d'),
                        pd.merge(pdl, pdr, how=how, left_on='a', right_on='d'))
                list_eq(dd.merge(ddl, ddr, how=how, left_on='b', right_on='e'),
                        pd.merge(pdl, pdr, how=how, left_on='b', right_on='e'))

                list_eq(dd.merge(ddr, ddl, how=how, left_on='d', right_on='a'),
                        pd.merge(pdr, pdl, how=how, left_on='d', right_on='a'))
                list_eq(dd.merge(ddr, ddl, how=how, left_on='e', right_on='b'),
                        pd.merge(pdr, pdl, how=how, left_on='e', right_on='b'))

                list_eq(dd.merge(ddl, ddr, how=how, left_on=['a', 'b'], right_on=['d', 'e']),
                        pd.merge(pdl, pdr, how=how, left_on=['a', 'b'], right_on=['d', 'e']))
Example #27
def test_reductions():
    dsk = {('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]},
                                  index=[0, 1, 3]),
           ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 2, 1]},
                                  index=[5, 6, 8]),
           ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [0, 0, 0]},
                                  index=[9, 9, 9])}
    ddf1 = dd.DataFrame(dsk, 'x', ['a', 'b'], [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    nans1 = pd.Series([1] + [np.nan] * 4 + [2] + [np.nan] * 3)
    nands1 = dd.from_pandas(nans1, 2)
    nans2 = pd.Series([1] + [np.nan] * 8)
    nands2 = dd.from_pandas(nans2, 2)
    nans3 = pd.Series([np.nan] * 9)
    nands3 = dd.from_pandas(nans3, 2)

    bools = pd.Series([True, False, True, False, True], dtype=bool)
    boolds = dd.from_pandas(bools, 2)

    for dds, pds in [(ddf1.b, pdf1.b), (ddf1.a, pdf1.a),
                     (ddf1['a'], pdf1['a']), (ddf1['b'], pdf1['b']),
                     (nands1, nans1), (nands2, nans2), (nands3, nans3),
                     (boolds, bools)]:
        assert isinstance(dds, dd.Series)
        assert isinstance(pds, pd.Series)
        assert eq(dds.sum(), pds.sum())
        assert eq(dds.min(), pds.min())
        assert eq(dds.max(), pds.max())
        assert eq(dds.count(), pds.count())
        assert eq(dds.std(), pds.std())
        assert eq(dds.var(), pds.var())
        assert eq(dds.std(ddof=0), pds.std(ddof=0))
        assert eq(dds.var(ddof=0), pds.var(ddof=0))
        assert eq(dds.mean(), pds.mean())
        assert eq(dds.nunique(), pds.nunique())
        assert eq(dds.nbytes, pds.nbytes)

        assert eq(dds.sum(skipna=False), pds.sum(skipna=False))
        assert eq(dds.min(skipna=False), pds.min(skipna=False))
        assert eq(dds.max(skipna=False), pds.max(skipna=False))
        assert eq(dds.std(skipna=False), pds.std(skipna=False))
        assert eq(dds.var(skipna=False), pds.var(skipna=False))
        assert eq(dds.std(skipna=False, ddof=0), pds.std(skipna=False, ddof=0))
        assert eq(dds.var(skipna=False, ddof=0), pds.var(skipna=False, ddof=0))
        assert eq(dds.mean(skipna=False), pds.mean(skipna=False))

    assert_dask_graph(ddf1.b.sum(), 'series-sum')
    assert_dask_graph(ddf1.b.min(), 'series-min')
    assert_dask_graph(ddf1.b.max(), 'series-max')
    assert_dask_graph(ddf1.b.count(), 'series-count')
    assert_dask_graph(ddf1.b.std(), 'series-std(ddof=1)')
    assert_dask_graph(ddf1.b.var(), 'series-var(ddof=1)')
    assert_dask_graph(ddf1.b.std(ddof=0), 'series-std(ddof=0)')
    assert_dask_graph(ddf1.b.var(ddof=0), 'series-var(ddof=0)')
    assert_dask_graph(ddf1.b.mean(), 'series-mean')
    # nunique is performed using drop-duplicates
    assert_dask_graph(ddf1.b.nunique(), 'drop-duplicates')
Example #28
def test_register_extension_type():
    arr = DecimalArray._from_sequence([Decimal('1.0')] * 10)
    ser = pd.Series(arr)
    dser = dd.from_pandas(ser, 2)
    assert_eq(ser, dser)

    df = pd.DataFrame({"A": ser})
    ddf = dd.from_pandas(df, 2)
    assert_eq(df, ddf)
Example #29
def test_from_pandas_non_sorted():
    df = pd.DataFrame({'x': [1, 2, 3]}, index=[3, 1, 2])
    ddf = dd.from_pandas(df, npartitions=2, sort=False)
    assert not ddf.known_divisions
    assert_eq(df, ddf)

    ddf = dd.from_pandas(df, chunksize=2, sort=False)
    assert not ddf.known_divisions
    assert_eq(df, ddf)
Example #30
def test_merge_index_without_divisions(shuffle):
    a = pd.DataFrame({'x': [1, 2, 3, 4, 5]}, index=[1, 2, 3, 4, 5])
    b = pd.DataFrame({'y': [1, 2, 3, 4, 5]}, index=[5, 4, 3, 2, 1])

    aa = dd.from_pandas(a, npartitions=3, sort=False)
    bb = dd.from_pandas(b, npartitions=2)

    assert_eq(aa.join(bb, how='inner', shuffle=shuffle),
              a.join(b, how='inner'))
Example #31
def GetNumberOfCommentInDB(row):
    import Database_Handler as dh
    from bson import ObjectId
    mongodb = dh.ToMongoDB(*dh.AWS_MongoDB_Information())
    dbname = 'hy_db'
    useDB = dh.Use_Database(mongodb, dbname)
    commentCollection = dh.Use_Collection(useDB, 'comments')
    info = {'site': row['site'],
            'category': row['category'],
            'date': row['date'],
            'rank': row['rank']}
    commentsForNews = commentCollection.find(info)
    realNumCount = commentsForNews.count()
    site = row['site']
    oid = ObjectId(row['id'])
    if site == 'daum':
        newsCollection = dh.Use_Collection(useDB, 'newsDaum')
    else:
        newsCollection = dh.Use_Collection(useDB, 'newsNaver')
    if realNumCount != row['number_of_crawled_comment']:
        newsCollection.update_one({'_id': oid},
                                  {'$set': {'real_number_of_comment': realNumCount}})
    if row.name % 100 == 0:
        print(row.name)


if __name__ == "__main__":
    start = datetime.now()
    ddf = dd.from_pandas(extData, npartitions=cpu_count())
    ddf.apply(GetNumberOfCommentInDB, axis=1, meta=int).compute()
    end = datetime.now()
    print('running time : {}'.format(end - start))
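The __main__ block above shows the general pattern of this script: partition a pandas DataFrame with dd.from_pandas, then run a row-wise function through ddf.apply with an explicit meta. A minimal sketch of that pattern without the MongoDB dependency; the DataFrame contents and the count_chars helper are made up for illustration:

import pandas as pd
import dask.dataframe as dd
from multiprocessing import cpu_count

def count_chars(row):
    # stand-in for GetNumberOfCommentInDB: any per-row work that returns an int
    return len(str(row['site'])) + len(str(row['category']))

data = pd.DataFrame({'site': ['daum', 'naver'], 'category': ['news', 'sports']})
ddf = dd.from_pandas(data, npartitions=cpu_count())
# meta declares the output name and dtype so dask can build the graph lazily
result = ddf.apply(count_chars, axis=1, meta=('count', int)).compute()
print(result)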
Example #32
def time_df1(npartitions):
    pandas_df = pd.DataFrame({
        'time': pd.to_datetime([1, 2, 3, 4]),
        'value': [1.1, 2.2, 3.3, 4.4]
    })
    return dd.from_pandas(pandas_df, npartitions=npartitions)
Example #33
def test_add_last_time_indexes():
    pd_es = EntitySet(id="pd_es")
    dask_es = EntitySet(id="dask_es")

    sessions = pd.DataFrame({
        "id": [0, 1, 2, 3],
        "user": [1, 2, 1, 3],
        "time": [
            pd.to_datetime('2019-01-10'),
            pd.to_datetime('2019-02-03'),
            pd.to_datetime('2019-01-01'),
            pd.to_datetime('2017-08-25')
        ],
        "strings": ["I am a string", "23", "abcdef ghijk", ""]
    })
    sessions_dask = dd.from_pandas(sessions, npartitions=2)
    sessions_logical_types = {
        "id": Integer,
        "user": Integer,
        "time": Datetime,
        "strings": NaturalLanguage
    }

    transactions = pd.DataFrame({
        "id": [0, 1, 2, 3, 4, 5],
        "session_id": [0, 0, 1, 2, 2, 3],
        "amount": [1.23, 5.24, 123.52, 67.93, 40.34, 50.13],
        "time": [
            pd.to_datetime('2019-01-10 03:53'),
            pd.to_datetime('2019-01-10 04:12'),
            pd.to_datetime('2019-02-03 10:34'),
            pd.to_datetime('2019-01-01 12:35'),
            pd.to_datetime('2019-01-01 12:49'),
            pd.to_datetime('2017-08-25 04:53')
        ]
    })
    transactions_dask = dd.from_pandas(transactions, npartitions=2)

    transactions_logical_types = {
        "id": Integer,
        "session_id": Integer,
        "time": Datetime,
        "amount": Double
    }

    pd_es.add_dataframe(dataframe_name="sessions",
                        dataframe=sessions,
                        index="id",
                        time_index="time")
    dask_es.add_dataframe(dataframe_name="sessions",
                          dataframe=sessions_dask,
                          index="id",
                          time_index="time",
                          logical_types=sessions_logical_types)

    pd_es.add_dataframe(dataframe_name="transactions",
                        dataframe=transactions,
                        index="id",
                        time_index="time")
    dask_es.add_dataframe(dataframe_name="transactions",
                          dataframe=transactions_dask,
                          index="id",
                          time_index="time",
                          logical_types=transactions_logical_types)

    pd_es = pd_es.add_relationship("sessions", "id", "transactions",
                                   "session_id")
    dask_es = dask_es.add_relationship("sessions", "id", "transactions",
                                       "session_id")

    assert 'foreign_key' in pd_es['transactions'].ww.semantic_tags[
        'session_id']
    assert 'foreign_key' in dask_es['transactions'].ww.semantic_tags[
        'session_id']

    assert pd_es['sessions'].ww.metadata.get('last_time_index') is None
    assert dask_es['sessions'].ww.metadata.get('last_time_index') is None

    pd_es.add_last_time_indexes()
    dask_es.add_last_time_indexes()

    pd_lti_name = pd_es['sessions'].ww.metadata.get('last_time_index')
    ks_lti_name = dask_es['sessions'].ww.metadata.get('last_time_index')
    assert pd_lti_name == ks_lti_name
    pd.testing.assert_series_equal(
        pd_es['sessions'][pd_lti_name].sort_index(),
        dask_es['sessions'][ks_lti_name].compute().sort_index(),
        check_names=False)
Example #34
    def partition_and_write(df,
                            x,
                            y,
                            filename,
                            p=10,
                            npartitions=None,
                            shuffle=None,
                            compression='default'):
        """
        Perform spatial partitioning on an input dataframe and write the
        result to a parquet file.  The resulting parquet file will contain
        the same columns as the input dataframe, but the dataframe's original
        index will be dropped.

        The resulting parquet file will contain all of the rows from the
        input dataframe, but they will be spatially sorted and partitioned
        along a 2D Hilbert curve (https://en.wikipedia.org/wiki/Hilbert_curve).

        The parquet file will also contain custom metadata that is needed to
        reconstruct the Hilbert curve distances on load.  This parquet file
        may then be used to construct SpatialPointsFrame instances.

        Parameters
        ----------
        df: pd.DataFrame or dd.DataFrame
            The input dataframe to partition
        x, y
            The column labels in df of the x and y coordinates of each row
        filename: str
            The path where the resulting parquet file should be written.
            See dask.dataframe.to_parquet for description of supported path
            specifications.
        p: int (default 10)
            The Hilbert curve order parameter that determines the resolution
            of the 2D grid that data points are rounded to before computing
            their Hilbert distance. Points will be discretized into 2 ** p
            bins in each of the x and y dimensions.

            This parameter should be increased if the partitions of the
            resulting parquet files are significantly unbalanced.

        npartitions: int or None (default None)
            The number of partitions for the resulting parquet file.  If None
            (the default) this is chosen to be the greater of 8 and
            len(df) // 2**23.

            In general, increasing the number of partitions will improve
            performance when processing small subsets of the overall parquet
            data set.  But this comes at the cost of some additional overhead
            when processing the entire data set.

        shuffle: str or None (default None)
            The dask.dataframe.DataFrame.set_index shuffle method. If None,
            a default is chosen based on the current scheduler.

        compression: str or None (default)
            The dask.dataframe.to_parquet compression method.
        """

        _validate_fastparquet()

        # Validate filename
        if (not isinstance(filename, basestring)
                or not (filename.endswith('.parquet')
                        or filename.endswith('.parq'))):
            raise ValueError("""\
'filename' must be a string ending with a .parquet or .parq extension""")

        # Remove any existing directory
        if os.path.exists(filename):
            shutil.rmtree(filename)

        # Normalize to dask dataframe
        if isinstance(df, pd.DataFrame):
            ddf = dd.from_pandas(df, npartitions=4)
        elif isinstance(df, dd.DataFrame):
            ddf = df
        else:
            raise ValueError("""
df must be a pandas or dask DataFrame instance.
Received value of type {typ}""".format(typ=type(df)))

        # Compute npartitions if needed
        if npartitions is None:
            # Make partitions of ~8 million rows with a minimum of 8
            # partitions
            npartitions = max(len(df) // 2**23, 8)

        # Compute data extents
        extents = ddf.map_partitions(_compute_extents, x, y).compute()

        x_range = (float(extents['x_min'].min()),
                   float(extents['x_max'].max()))

        y_range = (float(extents['y_min'].min()),
                   float(extents['y_max'].max()))

        # Compute distance of points along the Hilbert-curve
        ddf = ddf.assign(distance=ddf.map_partitions(_compute_distance,
                                                     x=x,
                                                     y=y,
                                                     p=p,
                                                     x_range=x_range,
                                                     y_range=y_range,
                                                     as_series=True))

        # Set index to distance. This will trigger an expensive shuffle
        # sort operation
        ddf = ddf.set_index('distance',
                            npartitions=npartitions,
                            shuffle=shuffle)

        # Get list of the distance divisions computed by dask
        distance_divisions = [int(d) for d in ddf.divisions]

        # Save properties as custom metadata in the parquet file
        props = dict(
            version='1.0',
            x=x,
            y=y,
            p=p,
            distance_divisions=distance_divisions,
            x_range=x_range,
            y_range=y_range,
        )

        # Drop distance index to save storage space
        ddf = ddf.reset_index(drop=True)

        # Save ddf to parquet
        dd.to_parquet(ddf,
                      filename,
                      engine='fastparquet',
                      compression=compression)

        # Open resulting parquet file
        pf = fp.ParquetFile(filename)

        # Add a new property to the file metadata
        new_fmd = copy.copy(pf.fmd)
        new_kv = fp.parquet_thrift.KeyValue()
        new_kv.key = 'SpatialPointsFrame'
        new_kv.value = json.dumps(props)
        new_fmd.key_value_metadata.append(new_kv)

        # Overwrite file metadata
        fn = os.path.join(filename, '_metadata')
        fp.writer.write_common_metadata(fn, new_fmd, no_row_groups=False)

        fn = os.path.join(filename, '_common_metadata')
        fp.writer.write_common_metadata(fn, new_fmd)
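A usage sketch for partition_and_write as documented above. The point data and output path are made up, and the SpatialPointsFrame import path is an assumption that may differ between datashader versions:

import numpy as np
import pandas as pd

# assumed import path; adjust to the datashader version in use
from datashader.spatial.points import SpatialPointsFrame

# hypothetical point data with x/y coordinate columns
points = pd.DataFrame({
    'easting': np.random.uniform(0, 100, size=1000),
    'northing': np.random.uniform(0, 100, size=1000),
    'value': np.random.randn(1000),
})

SpatialPointsFrame.partition_and_write(
    points, x='easting', y='northing',
    filename='points.parquet',   # must end in .parquet or .parq (validated above)
    p=10, npartitions=8)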
Example #35
import pandas as pd
import pytest
import numpy as np

import dask.dataframe as dd
from dask.dataframe.utils import assert_eq, PANDAS_VERSION

N = 40
df = pd.DataFrame({
    'a': np.random.randn(N).cumsum(),
    'b': np.random.randint(100, size=(N, )),
    'c': np.random.randint(100, size=(N, )),
    'd': np.random.randint(100, size=(N, )),
    'e': np.random.randint(100, size=(N, ))
})
ddf = dd.from_pandas(df, 3)

idx = (pd.date_range('2016-01-01', freq='3s', periods=100)
       | pd.date_range('2016-01-01', freq='5s', periods=100))[:N]

ts = pd.DataFrame(
    {
        'a': np.random.randn(N).cumsum(),
        'b': np.random.randint(100, size=(N, )),
        'c': np.random.randint(100, size=(N, )),
        'd': np.random.randint(100, size=(N, )),
        'e': np.random.randint(100, size=(N, ))
    },
    index=idx)
dts = dd.from_pandas(ts, 3)
Example #36
def test_from_pandas_single_row():
    df = pd.DataFrame({"x": [1]}, index=[1])
    ddf = dd.from_pandas(df, npartitions=1)
    assert ddf.divisions == (1, 1)
    assert_eq(ddf, df)
Example #37
def test_to_bag():
    a = pd.DataFrame(
        {
            "x": ["a", "b", "c", "d"],
            "y": [2, 3, 4, 5]
        },
        index=pd.Index([1.0, 2.0, 3.0, 4.0], name="ind"),
    )
    ddf = dd.from_pandas(a, 2)

    assert ddf.to_bag().compute() == list(a.itertuples(False))
    assert ddf.to_bag(True).compute() == list(a.itertuples(True))
    assert ddf.to_bag(format="dict").compute() == [
        {
            "x": "a",
            "y": 2
        },
        {
            "x": "b",
            "y": 3
        },
        {
            "x": "c",
            "y": 4
        },
        {
            "x": "d",
            "y": 5
        },
    ]
    assert ddf.to_bag(True, format="dict").compute() == [
        {
            "index": 1.0,
            "x": "a",
            "y": 2
        },
        {
            "index": 2.0,
            "x": "b",
            "y": 3
        },
        {
            "index": 3.0,
            "x": "c",
            "y": 4
        },
        {
            "index": 4.0,
            "x": "d",
            "y": 5
        },
    ]
    assert ddf.x.to_bag(True).compute() == list(a.x.items())
    assert ddf.x.to_bag().compute() == list(a.x)

    assert ddf.x.to_bag(True, format="dict").compute() == [
        {
            "x": "a"
        },
        {
            "x": "b"
        },
        {
            "x": "c"
        },
        {
            "x": "d"
        },
    ]
    assert ddf.x.to_bag(format="dict").compute() == [
        {
            "x": "a"
        },
        {
            "x": "b"
        },
        {
            "x": "c"
        },
        {
            "x": "d"
        },
    ]
Example #38
    def __init__(self):
        self.nodes = dd.from_pandas(pd.DataFrame(
            columns=['node_1', 'node_2', 'weight']).set_index(['node_1'],
                                                              drop=False),
                                    chunksize=1e9)
Example #39
def test_set_index_sorted_single_partition():
    df = pd.DataFrame({"x": [1, 2, 3, 4], "y": [1, 0, 1, 0]})
    ddf = dd.from_pandas(df, npartitions=1)
    assert_eq(ddf.set_index("x", sorted=True), df.set_index("x"))
Example #40
    def _fetch_profile_and_discrete(self):
        self._logger.info("Fetching Profiles and Discrete...")
        svdf_arrays = {}
        label_arrays = {}

        for idx, row in self._sources_df.iterrows():
            self._logger.info("------------------------")
            self._logger.info(row['cruise_id'])
            url = row['summary_url']
            self._logger.info(url)
            if url.endswith('.csv'):
                svdf = pd.read_csv(url, na_values=['-9999999'])
            elif url.endswith('.xlsx'):
                svdf = pd.read_excel(url, na_values=['-9999999'])

            if row['array_rd'] not in svdf_arrays:
                svdf_arrays[row['array_rd']] = []

            clean_svdf, discrete_sample_labels = clean_ship_verification(svdf)
            label_arrays[row['array_rd']] = pd.DataFrame(
                discrete_sample_labels
            ).set_index('name')
            if row['array_rd'] == 'CE':
                # Fix some O, 0 weirdness...
                if 'CEO2' in clean_svdf['station'].unique():
                    self._logger.warning('CEO2 found! Fixing to CE02...')
                clean_svdf.loc[:, 'station'] = clean_svdf['station'].apply(
                    lambda r: r.replace('O', '0')
                )
            clean_svdf.loc[:, 'cruise_id'] = row['cruise_id']
            final_svdf = clean_svdf.reset_index(drop=True)
            cleaned_final_svdf = self.check_types_and_replace(final_svdf)
            svdf_arrays[row['array_rd']].append(cleaned_final_svdf)

        svdf_dict = {
            k: pd.concat(v, sort=False) for k, v in svdf_arrays.items()
        }

        # Creates label mapping for Display Names and Units
        self._create_label_map(label_arrays)

        profile_list, discrete_list = [], []
        for k, v in svdf_dict.items():
            sampledf = v.copy()
            profile_df, discrete_df = self.parse_profile_and_discrete(
                sampledf, k
            )
            profile_list.append(profile_df)
            if any(
                discrete_df.columns.isin(['calculated_dic', 'calculated_pco2'])
            ):
                if all(discrete_df['calculated_dic'].isna()):
                    discrete_df.drop('calculated_dic', axis=1, inplace=True)
                if all(discrete_df['calculated_pco2'].isna()):
                    discrete_df.drop('calculated_pco2', axis=1, inplace=True)
            discrete_list.append(discrete_df)

        all_profiles = pd.concat(profile_list, sort=False).reset_index(
            drop=True
        )
        all_discrete = pd.concat(discrete_list, sort=False).reset_index(
            drop=True
        )
        apdd = dd.from_pandas(all_profiles, npartitions=2)
        addd = dd.from_pandas(all_discrete, npartitions=2)

        apdd.to_parquet(
            f"s3://{self._cadai_bucket}/{settings.SHIP_DATA_PROFILES}",
            write_index=False,
        )

        addd.to_parquet(
            f"s3://{self._cadai_bucket}/{settings.SHIP_DATA_DISCRETE}",
            write_index=False,
        )
Example #41
# df_=dd.from_pandas(df_, npartitions=2*multiprocessing.cpu_count())
# df['elmo']=df_.usable_text.map(lambda usable_text: give_paragraph_elmo_vector(usable_text), meta=('usable_text', str)).compute() # df['elmo'] = df.apply(lambda row: give_paragraph_elmo_vector(row['usable_text']) , axis=1)
# print(time.process_time() - start)
# df.to_csv(os.path.join(standards_dir,'iso_final_all_clean_text_w_elmo.csv'))

# the above code used dask to parallelize the operations for calculating elmo vectors. But this still uses the tf_hub on a paragraph basis. We can try one shot:
# -- sent-tokenize the paragraphs and maintain an array to track which sentences belong to which data points.
# -- give all sents at once to the tf_hub and extract the tokens then
# -- implemented the above in the elmo_utils.py

standards_dir = '../standards/data'
df = dd.read_csv(
    os.path.join(standards_dir, 'iso_final_all_clean_text.csv')
)  # df=pd.read_csv(os.path.join(standards_dir,'iso_final_all_clean_text.csv'), index_col=0)
df = df.compute()
df = dd.from_pandas(df, npartitions=2 * multiprocessing.cpu_count())
df = df[df['type'] == 'standard'].reset_index(drop=True)
df = df.fillna('')
df = df.map_partitions(lambda df: df.assign(usable_text=df['description_clean']
                                            + ' ' + df['title'])).compute()
# df=df.head(100)
# df=df.reset_index()
df_splits = np.array_split(df, 31)

for df_split in df_splits:
    start = time.process_time()
    df_split['elmo'] = give_paragraph_elmo_vector_multi(
        list(df_split['usable_text']))
    print(time.process_time() - start)
df = pd.concat(df_splits)
df.to_csv(
Example #42
def _download_prices(date):
    '''
    input: datetime object
    output: pandas dataframe with prices for all available futures for the
            specified date
    '''
    db = DataBase()

    errors = []
    if type(date) == type('str'):
        date = pd.to_datetime(date, format='%Y-%m-%d')
    y = str(date.year)
    if len(str(date.month)) == 2:
        m = str(date.month)
    else:
        m = '0' + str(date.month)
    if len(str(date.day)) == 2:
        d = str(date.day)
    else:
        d = '0' + str(date.day)
    try:
        url = f'https://www.mrci.com/ohlc/{y}/{y[-2:]+m+d}.php'
        soup = db._get_session(url)

        df = pd.read_html(str(soup.find('map').find_next('table')))[0]
        try:
            futures_lookup = pd.read_csv(
                os.path.dirname(__file__) +
                '/futures_lookup.csv').name.tolist()
        except:
            futures_lookup = pd.read_csv(
                os.path.dirname(__file__) +
                '\\futures_lookup.csv').name.tolist()
        indices = [
            i for i, j in enumerate(df.iloc[:, 0]) if j in futures_lookup
        ]
        columns = [
            'month', 'date', 'open', 'high', 'low', 'close', 'change',
            'volume', 'open_interest', 'change_in_oi'
        ]
        if len(df.columns) == 11:
            df = df.iloc[indices[0]:-2, :len(df.columns) - 1]
        else:
            df = df.iloc[indices[0]:-2, :]
        #session.close()
    except:
        errors.append(date)
        #session.close()
        return errors
    df.columns = columns
    #[ i for i in np.unique(df.month).tolist() if i not in futures_lookup ]

    first = True
    for i in range(1, len(indices)):
        temp = df.loc[indices[i - 1] + 1:indices[i] - 2].copy()
        temp['future'] = df.loc[indices[i - 1], 'month']
        if first:
            out = temp.copy()
            first = False
        else:
            out = out.append(temp)
    out = out[out.iloc[:, 1] != 'Total Volume and Open Interest']
    # out.to_csv('futures.csv')
    out.index = [date] * len(
        out
    )  #pd.to_datetime( [ f'{i[-2:]}/{i[2:4]}/{i[:2]}' for i in out.date ] )
    out.replace(r'\+', '', regex=True, inplace=True)
    out.replace('unch', np.nan, inplace=True)

    out = db._col_to_float(out)

    return dd.from_pandas(out, npartitions=1)
Example #43
def test_set_index_interpolate_int():
    L = sorted(list(range(0, 200, 10)) * 2)
    df = pd.DataFrame({'x': 2 * L})
    d = dd.from_pandas(df, 2)
    d1 = d.set_index('x', npartitions=10)
    assert all(np.issubdtype(type(x), np.integer) for x in d1.divisions)
Example #44
def Agriculture():
    """
    Import and format previously calculated agriculture fuel and energy use.
    ag_electricity.py and ag_fuel.py should be refactored for methods, etc.
    """

    # use 2012 for 2010 - 2014; use 2017 for 2015 - 2016
    
    fuel_results_file_12 = \
        '../results/ag_output_fuel_use_by_county_2012_20190812_2238.csv'
    
    fuel_results_file_17 = \
        '../results/ag_output_fuel_use_by_county_2017_20190812_2230.csv'

    elect_results_file_12 = \
        '../results/ag_output_electricity_use_by_county_2012_20190813_1024.csv'
    
    elect_results_file_17 =\
        '../results/ag_output_electricity_use_by_county_2017_20190813_1019.csv'
    

    def import_format(results_filepath):
        """
        
        """

        ag_energy = pd.read_csv(results_filepath, index_col=0)
        
        # Check if index was written to file
        if ag_energy.index.names != [None]:
            
            ag_energy.reset_index(inplace=True)

        ag_energy.replace({'LP GAS': 'LPG_NGL', 'NATURAL_GAS': 'Natural_gas',
                           'DIESEL': 'Diesel', 'LPG': 'LPG_NGL',
                           'GASOLINE': 'Other', 'OTHER': 'Residual_fuel_oil',
                           'ELECTRICITY': 'Net_electricity'}, inplace=True)
    
        
        ag_energy.rename(columns={'fuel_type': 'MECS_FT'}, inplace=True)
        
        return ag_energy
    
    fuel_12 = import_format(fuel_results_file_12)
    
    fuel_17 = import_format(fuel_results_file_17)
    
    elect_12 = import_format(elect_results_file_12)
    
    elect_17 = import_format(elect_results_file_17)

    multiplier_12 = ag.calc_multiplier(base_year=2012, 
                                       calculation_years=range(2010, 2015))

    multiplier_17 = ag.calc_multiplier(base_year=2017,
                                       calculation_years=range(2015, 2018))

    county_fuel = pd.concat(
            [ag.calc_county_fuel(fuel_12, multiplier_12,
                                 calculation_years=range(2010, 2015)), 
             ag.calc_county_fuel(fuel_17, multiplier_17,
                                 calculation_years=range(2015, 2018))],
            axis=1, ignore_index=False
            )

    county_elec = pd.concat(
            [ag.calc_county_fuel(elect_12, multiplier_12,
                                 calculation_years=range(2010, 2015)), 
             ag.calc_county_fuel(elect_17, multiplier_17,
                                 calculation_years=range(2015, 2018))],
            axis=1, ignore_index=False
            )
             
#    county_elec.state.fillna(method='ffill', inplace=True)
#    
#    county_elec.fipstate.fillna(method='ffill', inplace=True)
             
    county_total = pd.DataFrame()
    
    for df in [county_fuel, county_elec]:

        df = df.iloc[:, 2:]

        # Forward-fill state identifiers without chained-assignment warnings
        df['state'] = df.state.ffill()

        df['fipstate'] = df.fipstate.ffill().astype(int)

        df.reset_index(inplace=True)

        # Drop any Alaskan counties missing info
        df = df[df.COUNTY_FIPS != 2]

        df.rename(columns={'fuel_type': 'MECS_FT'}, inplace=True)

        county_total = pd.concat([county_total, df.melt(
                id_vars=['NAICS', 'COUNTY_FIPS', 'fipstate', 'MECS_FT',
                         'state'],
                var_name='year', value_name='MMBtu_TOTAL'
                )])
        
    county_total = dd.from_pandas(
            county_total.set_index('fipstate'),
            npartitions=len(county_total.fipstate.unique())
            )
    
    filename = 'ag_county_energy_' + \
        dt.datetime.now().strftime('%Y%m%d_%H%M')+'.parquet.gzip'
    
    county_total.to_parquet('../results/'+filename, engine='pyarrow',
                            compression='gzip')
    
    return county_total
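# Added sketch (assumption): the gzip parquet written above can be read back
# into a dask DataFrame with the matching engine for downstream use.
import dask.dataframe as dd

def read_ag_county_energy(parquet_path):
    # parquet_path would be the '../results/ag_county_energy_*.parquet.gzip'
    # file produced by Agriculture(); the engine matches the writer above.
    return dd.read_parquet(parquet_path, engine='pyarrow')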
Example #45
0
def test_from_pandas_with_index_nulls(null_value):
    df = pd.DataFrame({"x": [1, 2, 3]}, index=["C", null_value, "A"])
    with pytest.raises(NotImplementedError,
                       match="is non-numeric and contains nulls"):
        dd.from_pandas(df, npartitions=2, sort=False)
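# Added note: a minimal workaround sketch for the error exercised above --
# drop or fill the null index labels (or fall back to a RangeIndex) before
# handing the frame to dd.from_pandas.
import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({"x": [1, 2, 3]}, index=["C", None, "A"])
clean = pdf.reset_index(drop=True)          # or pdf[pdf.index.notnull()]
ddf = dd.from_pandas(clean, npartitions=2)  # no longer raises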
Example #46
0
from collections import namedtuple

import numpy as np
import pandas as pd

import dask.array as da
import dask.dataframe as dd
from dask.array.utils import assert_eq as assert_eq_ar
from dask.dataframe.utils import assert_eq as assert_eq_df

from dask_ml.datasets import make_classification
from dask_ml.utils import (
    _num_samples,
    assert_estimator_equal,
    check_array,
    check_chunks,
    check_matching_blocks,
    check_random_state,
    handle_zeros_in_scale,
    slice_columns,
)

df = dd.from_pandas(pd.DataFrame(5 * [range(42)]).T, npartitions=5)
s = dd.from_pandas(pd.Series([0, 1, 2, 3, 0]), npartitions=5)
a = da.from_array(np.array([0, 1, 2, 3, 0]), chunks=3)
X, y = make_classification(chunks=(2, 20))

Foo = namedtuple("Foo", "a_ b_ c_ d_")
Bar = namedtuple("Bar", "a_ b_ d_ e_")


def test_slice_columns():
    columns = [2, 3]
    df2 = slice_columns(df, columns)
    X2 = slice_columns(X, columns)

    assert list(df2.columns) == columns
    assert_eq_df(df[columns].compute(), df2.compute())
Example #47
0
def test_rolling_repr():
    ddf = dd.from_pandas(pd.DataFrame([10] * 30), npartitions=3)
    assert repr(ddf.rolling(4)) == 'Rolling [window=4,center=False,axis=0]'
Example #48
0
import dask.dataframe as dd
import pandas as pd
import pandas.util.testing as tm
import pytest
import sklearn.preprocessing as spp
from dask import compute
from dask.array.utils import assert_eq as assert_eq_ar
from dask.dataframe.utils import assert_eq as assert_eq_df
from pandas.api.types import is_categorical_dtype, is_object_dtype
from sklearn.exceptions import NotFittedError

import dask_ml.preprocessing as dpp
from dask_ml.datasets import make_classification
from dask_ml.utils import assert_estimator_equal

X, y = make_classification(chunks=50)
df = X.to_dask_dataframe().rename(columns=str)
df2 = dd.from_pandas(pd.DataFrame(5 * [range(42)]).T.rename(columns=str),
                     npartitions=5)
raw = pd.DataFrame(
    {
        "A": ["a", "b", "c", "a"],
        "B": ["a", "b", "c", "a"],
        "C": ["a", "b", "c", "a"],
        "D": [1, 2, 3, 4],
    },
    columns=["A", "B", "C", "D"],
)
dummy = pd.DataFrame(
    {
        "A": pd.Categorical(["a", "b", "c", "a"], ordered=True),
        "B": pd.Categorical(["a", "b", "c", "a"], ordered=False),
        "C": pd.Categorical(["a", "b", "c", "a"],
                            categories=["a", "b", "c", "d"]),
Example #49
0
def test_rolling_names():
    df = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
    a = dd.from_pandas(df, npartitions=2)
    assert sorted(a.rolling(2).sum().dask) == sorted(a.rolling(2).sum().dask)
Example #50
0
    def test_da(self):
        a = dd.from_pandas(dummy, npartitions=2)
        de = dpp.OrdinalEncoder()
        result = de.fit_transform(a)
        assert isinstance(result, dd.DataFrame)
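# Added sketch: a self-contained variant of the test above, since the
# module-level `dummy` fixture it relies on is defined elsewhere in the
# original test file.
import pandas as pd
import dask.dataframe as dd
import dask_ml.preprocessing as dpp

pdf = pd.DataFrame({"A": pd.Categorical(["a", "b", "c", "a"]), "D": [1, 2, 3, 4]})
a = dd.from_pandas(pdf, npartitions=2)
result = dpp.OrdinalEncoder().fit_transform(a)
assert isinstance(result, dd.DataFrame)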
Example #51
0
def test_make_meta():
    df = pd.DataFrame(
        {"a": [1, 2, 3], "b": list("abc"), "c": [1.0, 2.0, 3.0]}, index=[10, 20, 30]
    )

    # Pandas dataframe
    meta = make_meta(df)
    assert len(meta) == 0
    assert (meta.dtypes == df.dtypes).all()
    assert isinstance(meta.index, type(df.index))

    # Pandas series
    meta = make_meta(df.a)
    assert len(meta) == 0
    assert meta.dtype == df.a.dtype
    assert isinstance(meta.index, type(df.index))

    # Pandas index
    meta = make_meta(df.index)
    assert isinstance(meta, type(df.index))
    assert len(meta) == 0

    # Dask object
    ddf = dd.from_pandas(df, npartitions=2)
    assert make_meta(ddf) is ddf._meta

    # Dict
    meta = make_meta({"a": "i8", "b": "O", "c": "f8"})
    assert isinstance(meta, pd.DataFrame)
    assert len(meta) == 0
    assert (meta.dtypes == df.dtypes).all()
    assert isinstance(meta.index, pd.RangeIndex)

    # Iterable
    meta = make_meta([("a", "i8"), ("c", "f8"), ("b", "O")])
    assert (meta.columns == ["a", "c", "b"]).all()
    assert len(meta) == 0
    assert (meta.dtypes == df.dtypes[meta.dtypes.index]).all()
    assert isinstance(meta.index, pd.RangeIndex)

    # Tuple
    meta = make_meta(("a", "i8"))
    assert isinstance(meta, pd.Series)
    assert len(meta) == 0
    assert meta.dtype == "i8"
    assert meta.name == "a"

    # With index
    meta = make_meta({"a": "i8", "b": "i4"}, index=pd.Int64Index([1, 2], name="foo"))
    assert isinstance(meta.index, pd.Int64Index)
    assert len(meta.index) == 0
    meta = make_meta(("a", "i8"), index=pd.Int64Index([1, 2], name="foo"))
    assert isinstance(meta.index, pd.Int64Index)
    assert len(meta.index) == 0

    # Categoricals
    meta = make_meta({"a": "category"})
    assert len(meta.a.cat.categories) == 1
    assert meta.a.cat.categories[0] == UNKNOWN_CATEGORIES
    meta = make_meta(("a", "category"))
    assert len(meta.cat.categories) == 1
    assert meta.cat.categories[0] == UNKNOWN_CATEGORIES

    # Numpy scalar
    meta = make_meta(np.float64(1.0))
    assert isinstance(meta, np.float64)

    # Python scalar
    meta = make_meta(1.0)
    assert isinstance(meta, np.float64)

    # Timestamp
    x = pd.Timestamp(2000, 1, 1)
    meta = make_meta(x)
    assert meta is x

    # Dtype expressions
    meta = make_meta("i8")
    assert isinstance(meta, np.int64)
    meta = make_meta(float)
    assert isinstance(meta, np.dtype(float).type)
    meta = make_meta(np.dtype("bool"))
    assert isinstance(meta, np.bool_)
    assert pytest.raises(TypeError, lambda: make_meta(None))
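# Added sketch: in user code, make_meta-style metadata usually shows up as the
# `meta` argument to map_partitions, where an empty frame with the right
# dtypes describes the output without computing it. The dict form exercised
# above ({"a": "i8", ...}) is one way to build it; here the equivalent empty
# frame is written out by hand.
import pandas as pd
import dask.dataframe as dd

ddf = dd.from_pandas(pd.DataFrame({"a": [1, 2, 3]}), npartitions=2)
meta = pd.DataFrame({"a": pd.Series(dtype="i8"),
                     "doubled": pd.Series(dtype="i8")})
out = ddf.map_partitions(lambda part: part.assign(doubled=part.a * 2), meta=meta)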
Example #52
0
def dask_df(pandas_df):
    return dd.from_pandas(pandas_df, npartitions=5)
Example #53
0
    def to_dc(self, input_item, table_name: str, format: str = None, **kwargs):
        npartitions = kwargs.pop("npartitions", 1)
        return dd.from_pandas(input_item, npartitions=npartitions, **kwargs)
Example #54
0
def test_set_index_reduces_partitions_small(shuffle):
    df = pd.DataFrame({'x': np.random.random(100)})
    ddf = dd.from_pandas(df, npartitions=50)

    ddf2 = ddf.set_index('x', shuffle=shuffle, npartitions='auto')
    assert ddf2.npartitions < 10
Example #55
0
def time_df2(npartitions):
    pandas_df = pd.DataFrame({
        'time': pd.to_datetime([2, 4]),
        'other_value': [1.2, 2.0]
    })
    return dd.from_pandas(pandas_df, npartitions=npartitions)
Example #56
0
def test_set_index_detects_sorted_data(shuffle):
    df = pd.DataFrame({'x': range(100), 'y': range(100)})
    ddf = dd.from_pandas(df, npartitions=10, name='x', sort=False)

    ddf2 = ddf.set_index('x', shuffle=shuffle)
    assert len(ddf2.dask) < ddf.npartitions * 4
Example #57
0
def test_basic(df, npartitions):
    ddf = dd.from_pandas(df, npartitions=npartitions)

    approx = ddf.nunique_approx().compute(scheduler="sync")
    exact = len(df.drop_duplicates())
    assert abs(approx - exact) <= 2 or abs(approx - exact) / exact < 0.05
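# Added note: a small standalone illustration of the approximate-vs-exact
# comparison the test above encodes. nunique_approx is HyperLogLog-based, so
# only closeness -- not equality -- can be asserted.
import pandas as pd
import dask.dataframe as dd

pdf = pd.DataFrame({"x": [1, 2, 2, 3, 3, 3]})
ddf = dd.from_pandas(pdf, npartitions=2)
approx = ddf.nunique_approx().compute()
exact = len(pdf.drop_duplicates())
print(approx, exact)   # approx should land within a few percent of exact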
Example #58
0
train_df['len_word_q2'] = train_df.question2.apply(
    lambda x: len(str(x).split()))

#Test
test_df['len_char_q1'] = test_df.question1.apply(
    lambda x: len(''.join(set(str(x).replace(' ', '')))))
test_df['len_char_q2'] = test_df.question2.apply(
    lambda x: len(''.join(set(str(x).replace(' ', '')))))
test_df['len_word_q1'] = test_df.question1.apply(lambda x: len(str(x).split()))
test_df['len_word_q2'] = test_df.question2.apply(lambda x: len(str(x).split()))

###############################################################################
# Try parallel computation with dask
#Train
print('extra fuzzy features, train....')
train_dd = from_pandas(train_df[['question1', 'question2']], npartitions=8)

start_time = time.time()
train_df['fuzz_qratio'] = train_dd.apply(
    lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])),
    axis=1,
    meta=('a', np.dtype('int64'))).compute(scheduler='processes')
train_df['fuzz_WRatio'] = train_dd.apply(
    lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])),
    axis=1,
    meta=('a', np.dtype('int64'))).compute(scheduler='processes')
train_df['fuzz_token_set_ratio'] = train_dd.apply(
    lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])),
    axis=1,
    meta=('a', np.dtype('int64'))).compute(scheduler='processes')
train_df['fuzz_token_sort_ratio'] = train_dd.apply(
Example #59
0
    def test_da(self, data):
        a = dd.from_pandas(data, npartitions=2)
        ct = DummyEncoder()
        result = ct.fit_transform(a)
        expected = DummyEncoder().fit_transform(data)
        assert_eq(result, expected)
Example #60
0
def test_to_csv_paths():
    df = pd.DataFrame({"A": range(10)})
    ddf = dd.from_pandas(df, npartitions=2)
    assert ddf.to_csv("foo*.csv") == ['foo0.csv', 'foo1.csv']
    os.remove('foo0.csv')
    os.remove('foo1.csv')
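# Added note: a related sketch showing the `name_function` option, which
# controls how the '*' in the path pattern is expanded for each partition.
import os
import pandas as pd
import dask.dataframe as dd

ddf = dd.from_pandas(pd.DataFrame({"A": range(10)}), npartitions=2)
paths = ddf.to_csv("part-*.csv", name_function=lambda i: f"{i:03d}")
assert paths == ["part-000.csv", "part-001.csv"]
for p in paths:
    os.remove(p)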