def test_series_groupby_errors():
    """Check that a dask Series rejects bad groupers the same way pandas does."""
    pser = pd.Series([1, 2, 2, 1, 1])
    dser = dd.from_pandas(pser, npartitions=2)

    # (invalid grouper, error message expected from both libraries)
    bad_groupers = [
        ([1, 2], "Grouper for '1' not 1-dimensional"),
        ([2], "Grouper for '2' not 1-dimensional"),
        ([], "No group keys passed!"),
    ]
    for keys, expected_msg in bad_groupers:
        # pandas first, then dask, which should raise the same error
        for series in (pser, dser):
            with tm.assertRaisesRegexp(ValueError, expected_msg):
                series.groupby(keys)

    # Grouping by a dask Series built with a different partition count
    # is expected to be rejected.
    other = dd.from_pandas(pser, npartitions=3)
    assert raises(NotImplementedError, lambda: dser.groupby(other))

    # Unknown key: pandas first, then dask
    for series in (pser, dser):
        with tm.assertRaises(KeyError):
            series.groupby('x')
def test_to_dask_dataframe(self):
    """Convert a Dataset to a dask DataFrame, with and without an index."""
    a_values = da.from_array(np.random.randn(10), chunks=4)
    b_values = np.arange(10, dtype='uint8')
    labels = list('abcdefghij')

    variables = OrderedDict([('a', ('t', a_values)),
                             ('b', ('t', b_values)),
                             ('t', ('t', labels))])
    ds = Dataset(variables)

    indexed_frame = pd.DataFrame({'a': a_values, 'b': b_values},
                                 index=pd.Index(labels, name='t'))

    # (set_index flag, pandas frame the conversion is compared against)
    scenarios = [
        (True, indexed_frame),                            # 1-D index set up
        (False, indexed_frame.reset_index(drop=False)),   # no index given
    ]
    for set_index, pandas_frame in scenarios:
        expected = dd.from_pandas(pandas_frame, chunksize=4)
        actual = ds.to_dask_dataframe(set_index=set_index)
        # the conversion must hand back a dask dataframe
        self.assertIsInstance(actual, dd.DataFrame)
        # compare the materialised pandas frames
        assert_frame_equal(expected.compute(), actual.compute())
def test_concat4_interleave_partitions():
    """Overlapping divisions must fail unless interleave_partitions=True."""
    early_index = list('abcdefghij')
    late_index = list('fghijklmnopqr')
    pdf1 = pd.DataFrame(np.random.randn(10, 5),
                        columns=list('ABCDE'), index=early_index)
    pdf2 = pd.DataFrame(np.random.randn(13, 5),
                        columns=list('ABCDE'), index=late_index)
    pdf3 = pd.DataFrame(np.random.randn(13, 6),
                        columns=list('CDEXYZ'), index=late_index)

    ddf1 = dd.from_pandas(pdf1, 2)
    ddf2 = dd.from_pandas(pdf2, 3)
    ddf3 = dd.from_pandas(pdf3, 2)

    expected_error = ('All inputs have known divisions which cannot be '
                      'concatenated in order. Specify '
                      'interleave_partitions=True to ignore order')

    pairs = [
        [ddf1, ddf1], [ddf1, ddf2], [ddf1, ddf3],
        [ddf2, ddf1], [ddf2, ddf3],
        [ddf3, ddf1], [ddf3, ddf2],
    ]
    for pair in pairs:
        computed = [part.compute() for part in pair]

        # Plain concat is refused ...
        with pytest.raises(ValueError) as excinfo:
            dd.concat(pair)
        assert expected_error in str(excinfo.value)

        # ... while interleaving matches pandas for both join modes.
        interleaved = dd.concat(pair, interleave_partitions=True)
        assert_eq(interleaved, pd.concat(computed))
        interleaved_inner = dd.concat(pair, join='inner',
                                      interleave_partitions=True)
        assert_eq(interleaved_inner, pd.concat(computed, join='inner'))

    expected_error = "'join' must be 'inner' or 'outer'"
    with pytest.raises(ValueError) as excinfo:
        dd.concat([ddf1, ddf1], join='invalid', interleave_partitions=True)
    assert expected_error in str(excinfo.value)
def test_concat3():
    """Concatenate frames with ordered, non-overlapping indexes."""
    pdf1 = pd.DataFrame(np.random.randn(6, 5),
                        columns=list('ABCDE'), index=list('abcdef'))
    pdf2 = pd.DataFrame(np.random.randn(6, 5),
                        columns=list('ABCFG'), index=list('ghijkl'))
    pdf3 = pd.DataFrame(np.random.randn(6, 5),
                        columns=list('ABCHI'), index=list('mnopqr'))
    ddf1 = dd.from_pandas(pdf1, 2)
    ddf2 = dd.from_pandas(pdf2, 3)
    ddf3 = dd.from_pandas(pdf3, 2)

    # Two inputs
    two_way = dd.concat([ddf1, ddf2])
    expected_two = pd.concat([pdf1, pdf2])
    assert two_way.divisions == ddf1.divisions[:-1] + ddf2.divisions
    assert two_way.npartitions == ddf1.npartitions + ddf2.npartitions
    assert_eq(two_way, expected_two)
    assert_eq(dd.concat([ddf1, ddf2], interleave_partitions=True),
              pd.concat([pdf1, pdf2]))

    # Three inputs
    three_way = dd.concat([ddf1, ddf2, ddf3])
    joined_divisions = (ddf1.divisions[:-1] +
                        ddf2.divisions[:-1] +
                        ddf3.divisions)
    total_partitions = (ddf1.npartitions +
                        ddf2.npartitions +
                        ddf3.npartitions)
    assert three_way.divisions == joined_divisions
    assert three_way.npartitions == total_partitions
    assert_eq(three_way, pd.concat([pdf1, pdf2, pdf3]))
    assert_eq(dd.concat([ddf1, ddf2, ddf3], interleave_partitions=True),
              pd.concat([pdf1, pdf2, pdf3]))
def test_clip():
    """Exercise ``da.clip`` / ``np.clip`` on dask and pandas Series and DataFrames."""
    # clip internally calls dd.Series.clip
    pser = pd.Series(np.random.randint(1, 100, size=20))
    dser = dd.from_pandas(pser, 3)

    # applying Dask ufunc doesn't trigger computation
    lazy = da.clip(dser, 5, 50)
    assert isinstance(lazy, dd.Series)
    assert_eq(lazy, np.clip(pser, 5, 50))

    # applying NumPy ufunc to a dask Series doesn't trigger computation
    lazy = np.clip(dser, 5, 50)
    assert isinstance(lazy, dd.Series)
    assert_eq(lazy, np.clip(pser, 5, 50))

    # applying Dask ufunc to normal Series triggers computation
    eager = da.clip(pser, 5, 50)
    assert isinstance(eager, pd.Series)
    assert_eq(eager, np.clip(pser, 5, 50))

    pdf = pd.DataFrame(np.random.randint(1, 100, size=(20, 2)),
                       columns=['A', 'B'])
    ddf = dd.from_pandas(pdf, 3)

    # applying Dask ufunc doesn't trigger computation
    lazy = da.clip(ddf, 5.5, 40.5)
    assert isinstance(lazy, dd.DataFrame)
    assert_eq(lazy, np.clip(pdf, 5.5, 40.5))

    # applying NumPy ufunc to a dask DataFrame doesn't trigger computation
    lazy = np.clip(ddf, 5.5, 40.5)
    assert isinstance(lazy, dd.DataFrame)
    assert_eq(lazy, np.clip(pdf, 5.5, 40.5))

    # applying Dask ufunc to normal DataFrame triggers computation
    eager = da.clip(pdf, 5.5, 40.5)
    assert isinstance(eager, pd.DataFrame)
    assert_eq(eager, np.clip(pdf, 5.5, 40.5))
def test_pivot_table_errors():
    """Invalid ``dd.pivot_table`` arguments must raise ValueError with a clear message."""
    categorical_frame = pd.DataFrame({
        'A': np.random.choice(list('abc'), size=10),
        'B': np.random.randn(10),
        'C': pd.Categorical(np.random.choice(list('abc'), size=10)),
    })
    ddf = dd.from_pandas(categorical_frame, 2)

    aggfunc_msg = "aggfunc must be either 'mean', 'sum' or 'count'"
    # (expected message, keyword arguments that should trigger it)
    invalid_calls = [
        ("'index' must be the name of an existing column",
         dict(index=['A'], columns='C', values='B')),
        ("'columns' must be the name of an existing column",
         dict(index='A', columns=['C'], values='B')),
        ("'values' must be the name of an existing column",
         dict(index='A', columns='C', values=['B'])),
        (aggfunc_msg,
         dict(index='A', columns='C', values='B', aggfunc=['sum'])),
        (aggfunc_msg,
         dict(index='A', columns='C', values='B', aggfunc='xx')),
    ]
    for message, kwargs in invalid_calls:
        with tm.assertRaisesRegexp(ValueError, message):
            dd.pivot_table(ddf, **kwargs)

    # Same shape of data, but column 'C' is not categorical this time.
    plain_frame = pd.DataFrame({
        'A': np.random.choice(list('abc'), size=10),
        'B': np.random.randn(10),
        'C': np.random.choice(list('abc'), size=10),
    })
    ddf = dd.from_pandas(plain_frame, 2)
    with tm.assertRaisesRegexp(ValueError, "'columns' must be category dtype"):
        dd.pivot_table(ddf, index='A', columns='C', values='B')
# Compares repr(), str() and to_string() of a 3-partition dask Series (without
# and then with a name) against hard-coded expected text.
# NOTE(review): the expected triple-quoted literals are whitespace-sensitive and
# appear here on a single line; their original line breaks/alignment look lost
# and should be restored from the upstream test before relying on this block.
def test_series_format(): s = pd.Series([1, 2, 3, 4, 5, 6, 7, 8], index=list('ABCDEFGH')) ds = dd.from_pandas(s, 3) exp = """Dask Series Structure: npartitions=3 A int64 D ... G ... H ... dtype: int64 Dask Name: from_pandas, 3 tasks""" assert repr(ds) == exp assert str(ds) == exp exp = """npartitions=3 A int64 D ... G ... H ...""" assert ds.to_string() == exp s = pd.Series([1, 2, 3, 4, 5, 6, 7, 8], index=list('ABCDEFGH'), name='XXX') ds = dd.from_pandas(s, 3) exp = """Dask Series Structure: npartitions=3 A int64 D ... G ... H ... Name: XXX, dtype: int64 Dask Name: from_pandas, 3 tasks""" assert repr(ds) == exp assert str(ds) == exp
def test_frame_2ufunc_out():
    """Check the ``out=`` argument of NumPy ufuncs applied to a dask DataFrame."""
    values = np.random.randint(1, 100, size=(20, 2))
    pdf = pd.DataFrame(values, columns=['A', 'B'])
    ddf = dd.from_pandas(pdf, 3)

    # column number mismatch
    wide_pdf = pd.DataFrame(np.random.randint(1, 100, size=(20, 3)),
                            columns=['X', 'Y', 'Z'])
    wide_target = dd.from_pandas(wide_pdf, 3)
    with pytest.raises(ValueError):
        np.sin(ddf, out=wide_target)

    # types mismatch
    series_target = dd.from_pandas(pd.Series([0]), 1)
    with pytest.raises(TypeError):
        np.sin(ddf, out=series_target)

    # Matching target: results are written into it, twice in a row.
    target_pdf = pd.DataFrame(np.random.randint(1, 100, size=(20, 2)),
                              columns=['X', 'Y'])
    target = dd.from_pandas(target_pdf, 3)
    np.sin(ddf, out=target)
    np.add(target, 10, out=target)

    expected = pd.DataFrame(np.sin(values) + 10, columns=['A', 'B'])
    assert_eq(target, expected)
def test_concat(join):
    """Compare ``dd.concat`` with ``pd.concat`` for frames and series.

    Parameters
    ----------
    join : str
        Join mode forwarded to both ``dd.concat`` and ``pd.concat`` in the
        first group of checks.
    """
    pdf1 = pd.DataFrame({'x': [1, 2, 3, 4, 6, 7],
                         'y': list('abcdef')},
                        index=[1, 2, 3, 4, 6, 7])
    ddf1 = dd.from_pandas(pdf1, 2)
    pdf2 = pd.DataFrame({'x': [1, 2, 3, 4, 6, 7],
                         'y': list('abcdef')},
                        index=[8, 9, 10, 11, 12, 13])
    ddf2 = dd.from_pandas(pdf2, 2)

    # different columns
    pdf3 = pd.DataFrame({'x': [1, 2, 3, 4, 6, 7],
                         'z': list('abcdef')},
                        index=[8, 9, 10, 11, 12, 13])
    ddf3 = dd.from_pandas(pdf3, 2)

    frame_cases = [(ddf1, ddf2, pdf1, pdf2),
                   (ddf1, ddf3, pdf1, pdf3)]
    for dd1, dd2, pd1, pd2 in frame_cases:
        result = dd.concat([dd1, dd2], join=join)
        expected = pd.concat([pd1, pd2], join=join)
        assert eq(result, expected)

    # test outer only, inner has a problem on pandas side
    # Each series pairing is listed exactly once so no check runs twice.
    series_cases = [(ddf1.x, ddf2.x, pdf1.x, pdf2.x),
                    (ddf1.x, ddf3.z, pdf1.x, pdf3.z)]
    for dd1, dd2, pd1, pd2 in frame_cases + series_cases:
        result = dd.concat([dd1, dd2])
        expected = pd.concat([pd1, pd2])
        assert eq(result, expected)
def test_groupy_non_aligned_index():
    """Grouping by series from a differently partitioned frame must be rejected."""
    data = {'a': [1, 2, 6, 4, 4, 6, 4, 3, 7] * 10,
            'b': [4, 2, 7, 3, 3, 1, 1, 1, 2] * 10,
            'c': [0, 1, 2, 3, 4, 5, 6, 7, 8] * 10}
    pdf = pd.DataFrame(data, columns=['c', 'b', 'a'])

    ddf3 = dd.from_pandas(pdf, npartitions=3)
    ddf7 = dd.from_pandas(pdf, npartitions=7)

    # working examples
    ddf3.groupby(['a', 'b'])
    ddf3.groupby([ddf3['a'], ddf3['b']])

    # misaligned divisions
    misaligned_groupers = [
        ddf7['a'],
        [ddf7['a'], ddf7['b']],
        [ddf7['a'], ddf3['b']],
        [ddf3['a'], ddf7['b']],
        [ddf7['a'], 'b'],
    ]
    for grouper in misaligned_groupers:
        with pytest.raises(NotImplementedError):
            ddf3.groupby(grouper)
def test_to_hdf():
    """Round-trip dask frames and series through HDF files."""
    pytest.importorskip('tables')
    pdf = pd.DataFrame({'x': ['a', 'b', 'c', 'd'],
                        'y': [1, 2, 3, 4]},
                       index=[1., 2., 3., 4.])

    # Two partitions: whole frame
    two_parts = dd.from_pandas(pdf, 2)
    with tmpfile('h5') as path:
        two_parts.to_hdf(path, '/data')
        stored = pd.read_hdf(path, '/data')
        tm.assert_frame_equal(pdf, stored[:])

    # Two partitions: single column
    with tmpfile('h5') as path:
        two_parts.x.to_hdf(path, '/data')
        stored = pd.read_hdf(path, '/data')
        tm.assert_series_equal(pdf.x, stored[:])

    # One partition: whole frame
    one_part = dd.from_pandas(pdf, 1)
    with tmpfile('h5') as path:
        one_part.to_hdf(path, '/data')
        stored = pd.read_hdf(path, '/data')
        tm.assert_frame_equal(pdf, stored[:])

    # test compute = False
    with tmpfile('h5') as path:
        pending = one_part.to_hdf(path, '/data', compute=False)
        pending.compute()
        stored = pd.read_hdf(path, '/data')
        tm.assert_frame_equal(pdf, stored[:])
def test_get_dummies_kwargs():
    """``dd.get_dummies`` keyword arguments should mirror ``pd.get_dummies``."""
    categories = pd.Series([1, 1, 1, 2, 2, 1, 3, 4], dtype='category')

    # prefix / prefix_sep
    expected = pd.get_dummies(categories, prefix='X', prefix_sep='-')
    dask_categories = dd.from_pandas(categories, 2)
    result = dd.get_dummies(dask_categories, prefix='X', prefix_sep='-')
    assert_eq(result, expected)
    tm.assert_index_equal(result.columns,
                          pd.Index(['X-1', 'X-2', 'X-3', 'X-4']))

    # drop_first
    expected = pd.get_dummies(categories, drop_first=True)
    dask_categories = dd.from_pandas(categories, 2)
    result = dd.get_dummies(dask_categories, drop_first=True)
    assert_eq(result, expected)
    tm.assert_index_equal(result.columns, expected.columns)

    # nan
    with_nan = pd.Series([1, 1, 1, 2, np.nan, 3, np.nan, 5],
                         dtype='category')
    expected = pd.get_dummies(with_nan)
    dask_with_nan = dd.from_pandas(with_nan, 2)
    result = dd.get_dummies(dask_with_nan)
    assert_eq(result, expected)
    tm.assert_index_equal(result.columns, expected.columns)

    # dummy_na
    expected = pd.get_dummies(with_nan, dummy_na=True)
    dask_with_nan = dd.from_pandas(with_nan, 2)
    result = dd.get_dummies(dask_with_nan, dummy_na=True)
    assert_eq(result, expected)
    tm.assert_index_equal(result.columns, pd.Index([1, 2, 3, 5, np.nan]))
def test_getitem():
    """Check ``__getitem__`` on a dask DataFrame against pandas.

    Covers single-column, column-list and boolean-mask selection, verifies
    that the ``_meta`` of a selection matches the same selection on the
    parent's ``_meta``, and checks that invalid keys raise for both the
    pandas frame and the dask frame.
    """
    df = pd.DataFrame({'A': [1, 2, 3, 4, 5, 6, 7, 8, 9],
                       'B': [9, 8, 7, 6, 5, 4, 3, 2, 1],
                       'C': [True, False, True] * 3},
                      columns=list('ABC'))
    ddf = dd.from_pandas(df, 2)
    assert_eq(ddf['A'], df['A'])
    # check cache consistency
    tm.assert_series_equal(ddf['A']._meta, ddf._meta['A'])

    assert_eq(ddf[['A', 'B']], df[['A', 'B']])
    tm.assert_frame_equal(ddf[['A', 'B']]._meta, ddf._meta[['A', 'B']])

    assert_eq(ddf[ddf.C], df[df.C])
    tm.assert_series_equal(ddf.C._meta, ddf._meta.C)

    assert_eq(ddf[ddf.C.repartition([0, 2, 5, 8])], df[df.C])

    # Invalid keys must fail on the dask frame as well as on pandas; checking
    # only the pandas frame would leave dask's behaviour untested.
    for frame in (df, ddf):
        pytest.raises(KeyError, lambda: frame['X'])
        pytest.raises(KeyError, lambda: frame[['A', 'X']])
        pytest.raises(AttributeError, lambda: frame.X)

    # not str/unicode
    df = pd.DataFrame(np.random.randn(10, 5))
    ddf = dd.from_pandas(df, 2)
    assert_eq(ddf[0], df[0])
    assert_eq(ddf[[1, 2]], df[[1, 2]])

    for frame in (df, ddf):
        pytest.raises(KeyError, lambda: frame[8])
        pytest.raises(KeyError, lambda: frame[[1, 8]])
def test_set_index_drop(drop):
    """``set_index(..., drop=drop)`` must agree between dask and pandas."""
    pdf = pd.DataFrame({'A': list('ABAABBABAA'),
                        'B': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                        'C': [1, 2, 3, 2, 1, 3, 2, 4, 2, 3]})
    ddf = dd.from_pandas(pdf, 3)

    # Index given as a column label
    for label in ('A', 'B', 'C'):
        assert_eq(ddf.set_index(label, drop=drop),
                  pdf.set_index(label, drop=drop))

    # Index given as a column object
    for label in ('A', 'B', 'C'):
        dask_column = getattr(ddf, label)
        pandas_column = getattr(pdf, label)
        assert_eq(ddf.set_index(dask_column, drop=drop),
                  pdf.set_index(pandas_column, drop=drop))

    # numeric columns
    pdf = pd.DataFrame({0: list('ABAABBABAA'),
                        1: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10],
                        2: [1, 2, 3, 2, 1, 3, 2, 4, 2, 3]})
    ddf = dd.from_pandas(pdf, 3)
    for label in (0, 2):
        assert_eq(ddf.set_index(label, drop=drop),
                  pdf.set_index(label, drop=drop))
def test_groupby_column_and_index_apply(group_args, apply_func):
    """groupby-apply with and without known divisions must match pandas."""
    pdf = pd.DataFrame({'idx': [1, 1, 1, 2, 2, 2],
                        'a': [1, 2, 1, 2, 1, 2],
                        'b': np.arange(6)}).set_index('idx')

    n_groups = pdf.index.nunique()
    with_divs = dd.from_pandas(pdf, npartitions=n_groups)
    without_divs = dd.from_pandas(pdf, npartitions=n_groups, sort=False)

    # Reference result from pandas
    expected = pdf.groupby(group_args).apply(apply_func)

    # --- frame with divisions (no shuffling) ---
    result = with_divs.groupby(group_args).apply(apply_func)
    assert_eq(expected, result, check_divisions=False)

    # Partitioning is preserved
    assert with_divs.divisions == result.divisions

    # No shuffling occurred: the groupby operation should add only
    # 1 task per partition
    tasks_without_shuffle = len(with_divs.dask) + with_divs.npartitions
    assert len(result.dask) == tasks_without_shuffle

    # --- frame without divisions (requires shuffling) ---
    result = without_divs.groupby(group_args).apply(apply_func)
    assert_eq(expected, result, check_divisions=False)

    # Divisions were preserved (all None in this case)
    assert without_divs.divisions == result.divisions

    # Crude check that shuffling was performed: the groupby operation
    # should add more than 1 task per partition
    tasks_without_shuffle = len(without_divs.dask) + without_divs.npartitions
    assert len(result.dask) > tasks_without_shuffle
def test_append():
    """``append`` on dask frames/series must match pandas and reject unordered divisions."""
    values = [1, 2, 3, 4, 5, 6]
    later_index = [6, 7, 8, 9, 10, 11]
    df = pd.DataFrame({"a": [1, 2, 3, 4, 5, 6], "b": [1, 2, 3, 4, 5, 6]})
    df2 = pd.DataFrame({"a": list(values), "b": list(values)},
                       index=later_index)
    df3 = pd.DataFrame({"b": list(values), "c": list(values)},
                       index=later_index)

    ddf = dd.from_pandas(df, 2)
    ddf2 = dd.from_pandas(df2, 2)
    ddf3 = dd.from_pandas(df3, 2)

    # (frame to append, column to append to ``a``, pandas frame, pandas column)
    operands = [
        (ddf2, ddf2.a, df2, df2.a),
        (ddf3, ddf3.b, df3, df3.b),   # different columns
        (df2, df2.a, df2, df2.a),     # dask + pandas
        (df3, df3.b, df3, df3.b),
    ]
    for other, other_column, pd_other, pd_other_column in operands:
        assert eq(ddf.append(other), df.append(pd_other))
        assert eq(ddf.a.append(other_column), df.a.append(pd_other_column))

    # Appending a Series as a row
    row = pd.Series([7, 8], name=6, index=["a", "b"])
    assert eq(ddf.append(row), df.append(row))

    # Overlapping index: divisions are no longer ordered
    df4 = pd.DataFrame({"a": list(values), "b": list(values)},
                       index=[4, 5, 6, 7, 8, 9])
    ddf4 = dd.from_pandas(df4, 2)
    expected_error = (
        "Unable to append two dataframes to each other with known "
        "divisions if those divisions are not ordered. "
        "The divisions/index of the second dataframe must be "
        "greater than the divisions/index of the first dataframe."
    )
    with tm.assertRaisesRegexp(ValueError, expected_error):
        ddf.append(ddf4)
def test_concat4_interleave_partitions():
    """Overlapping divisions must fail unless interleave_partitions=True."""
    early_index = list("abcdefghij")
    late_index = list("fghijklmnopqr")
    pdf1 = pd.DataFrame(np.random.randn(10, 5),
                        columns=list("ABCDE"), index=early_index)
    pdf2 = pd.DataFrame(np.random.randn(13, 5),
                        columns=list("ABCDE"), index=late_index)
    pdf3 = pd.DataFrame(np.random.randn(13, 6),
                        columns=list("CDEXYZ"), index=late_index)

    ddf1 = dd.from_pandas(pdf1, 2)
    ddf2 = dd.from_pandas(pdf2, 3)
    ddf3 = dd.from_pandas(pdf3, 2)

    # The spelling below is the message text this test matches against.
    expected_error = (
        "All inputs have known divisions which cannnot be "
        "concatenated in order. Specify "
        "interleave_partitions=True to ignore order"
    )

    pairs = [
        [ddf1, ddf1], [ddf1, ddf2], [ddf1, ddf3],
        [ddf2, ddf1], [ddf2, ddf3],
        [ddf3, ddf1], [ddf3, ddf2],
    ]
    for pair in pairs:
        computed = [part.compute() for part in pair]

        # Plain concat is refused ...
        with tm.assertRaisesRegexp(ValueError, expected_error):
            dd.concat(pair)

        # ... while interleaving matches pandas for both join modes.
        interleaved = dd.concat(pair, interleave_partitions=True)
        assert eq(interleaved, pd.concat(computed))
        interleaved_inner = dd.concat(pair, join="inner",
                                      interleave_partitions=True)
        assert eq(interleaved_inner, pd.concat(computed, join="inner"))

    expected_error = "'join' must be 'inner' or 'outer'"
    with tm.assertRaisesRegexp(ValueError, expected_error):
        dd.concat([ddf1, ddf1], join="invalid", interleave_partitions=True)
def test_getitem():
    """Check ``__getitem__`` on a dask DataFrame against pandas."""
    data = {
        "A": [1, 2, 3, 4, 5, 6, 7, 8, 9],
        "B": [9, 8, 7, 6, 5, 4, 3, 2, 1],
        "C": [True, False, True] * 3,
    }
    df = pd.DataFrame(data, columns=list("ABC"))
    ddf = dd.from_pandas(df, 2)

    # Single column
    assert eq(ddf["A"], df["A"])
    # check cache consistency
    tm.assert_series_equal(ddf["A"]._pd, ddf._pd["A"])

    # List of columns
    pair = ["A", "B"]
    assert eq(ddf[pair], df[pair])
    tm.assert_frame_equal(ddf[pair]._pd, ddf._pd[pair])

    # Boolean mask
    assert eq(ddf[ddf.C], df[df.C])
    tm.assert_series_equal(ddf.C._pd, ddf._pd.C)

    # Boolean mask with different divisions
    repartitioned_mask = ddf.C.repartition([0, 2, 5, 8])
    assert eq(ddf[repartitioned_mask], df[df.C])

    # Invalid keys (checked on the pandas frame)
    assert raises(KeyError, lambda: df["X"])
    assert raises(KeyError, lambda: df[["A", "X"]])
    assert raises(AttributeError, lambda: df.X)

    # not str/unicode
    df = pd.DataFrame(np.random.randn(10, 5))
    ddf = dd.from_pandas(df, 2)
    assert eq(ddf[0], df[0])
    assert eq(ddf[[1, 2]], df[[1, 2]])

    assert raises(KeyError, lambda: df[8])
    assert raises(KeyError, lambda: df[[1, 8]])
# Compares repr() and str() of the index of a 3-partition dask Series against
# hard-coded expected text, first for a string index and then for a named
# CategoricalIndex.
# NOTE(review): the expected triple-quoted literals are whitespace-sensitive and
# appear here on a single line (including the `"""\` continuation); their
# original line breaks/alignment look lost and should be restored from the
# upstream test before relying on this block.
def test_index_format(): s = pd.Series([1, 2, 3, 4, 5, 6, 7, 8], index=list('ABCDEFGH')) ds = dd.from_pandas(s, 3) exp = """Dask Index Structure: npartitions=3 A object D ... G ... H ... dtype: object Dask Name: from_pandas, 6 tasks""" assert repr(ds.index) == exp assert str(ds.index) == exp s = pd.Series([1, 2, 3, 4, 5, 6, 7, 8], index=pd.CategoricalIndex([1, 2, 3, 4, 5, 6, 7, 8], name='YYY')) ds = dd.from_pandas(s, 3) exp = dedent("""\ Dask Index Structure: npartitions=3 1 category[known] 4 ... 7 ... 8 ... Name: YYY, dtype: category Dask Name: from_pandas, 6 tasks""") assert repr(ds.index) == exp assert str(ds.index) == exp
def test_reductions_out(frame, axis, out, redfunc):
    """Reductions writing into ``out=`` must match the pandas reduction."""
    dask_input = dd.from_pandas(frame, 3)
    if out is None:
        # Default target: a scalar-like result of a dask reduction
        dask_output = dd.from_pandas(pd.Series([0]), 1).sum()
    else:
        dask_output = dd.from_pandas(out, 3)

    numpy_reduce = getattr(np, redfunc)
    pandas_reduce = getattr(type(frame), redfunc)
    dask_reduce = getattr(type(dask_input), redfunc)

    # numpy has default ddof value 0 while dask and pandas have 1, so ddof
    # should be passed explicitly when calling np.var(dask) / np.std(dask)
    extra = {'ddof': 1} if redfunc in ['var', 'std'] else {}
    numpy_reduce(dask_input, axis=axis, out=dask_output, **extra)
    assert_eq(dask_output, pandas_reduce(frame, axis=axis))

    # Same reduction through the dask method, with both split_every settings
    for split_every in (False, 2):
        dask_reduce(dask_input, axis=axis, split_every=split_every,
                    out=dask_output)
        assert_eq(dask_output, pandas_reduce(frame, axis=axis))
def test_concat_unknown_divisions_errors():
    """Column-wise concat of unsorted series with different lengths must fail."""
    longer = pd.Series([1, 2, 3, 4, 5, 6])
    shorter = pd.Series([4, 3, 2, 1])
    unsorted_inputs = [
        dd.from_pandas(series, npartitions=2, sort=False)
        for series in (longer, shorter)
    ]
    with pytest.raises(ValueError):
        dd.concat(unsorted_inputs, axis=1).compute()
def test_from_pandas_with_datetime_index():
    """``from_pandas`` must round-trip a frame indexed by parsed dates.

    The frame is read from the ``timeseries`` CSV text and converted with
    both the ``npartitions`` and the ``chunksize`` form of ``from_pandas``.
    """
    with filetext(timeseries) as fn:
        df = pd.read_csv(fn, index_col=0, header=0, usecols=[0, 4],
                         parse_dates=['Date'])
        # Assert on the result of eq() instead of discarding it, so a falsy
        # return can never slip through unnoticed.
        ddf = dd.from_pandas(df, 2)
        assert eq(df, ddf)
        ddf = dd.from_pandas(df, chunksize=2)
        assert eq(df, ddf)
def test_gh_517():
    """``index.nunique`` must count 100 labels even when the frame is duplicated."""
    values = np.random.randn(100, 2)
    pdf = pd.DataFrame(values, columns=["a", "b"])

    single = dd.from_pandas(pdf, 2)
    assert single.index.nunique().compute() == 100

    # Concatenating the frame with itself repeats every index label once.
    doubled = dd.from_pandas(pd.concat([pdf, pdf]), 5)
    assert doubled.index.nunique().compute() == 100
def test_merge_index_without_divisions(shuffle):
    """Inner index join must work when the left frame has no known divisions.

    Parameters
    ----------
    shuffle
        Shuffle method forwarded to the dask ``join`` call.
    """
    a = pd.DataFrame({"x": [1, 2, 3, 4, 5]}, index=[1, 2, 3, 4, 5])
    b = pd.DataFrame({"y": [1, 2, 3, 4, 5]}, index=[5, 4, 3, 2, 1])

    # sort=False leaves the left side unsorted; the right side is built with
    # from_pandas defaults.
    aa = dd.from_pandas(a, npartitions=3, sort=False)
    bb = dd.from_pandas(b, npartitions=2)

    # Assert on the result of eq() instead of discarding it.
    assert eq(aa.join(bb, how="inner", shuffle=shuffle),
              a.join(b, how="inner"))
def test_concat5():
    """Compare ``dd.concat`` with ``pd.concat`` over many frame/series mixes.

    Covers dask-only inputs and mixed dask + pandas inputs, along both axes
    and with both join modes.
    """
    import warnings

    pdf1 = pd.DataFrame(np.random.randn(7, 5),
                        columns=list('ABCDE'), index=list('abcdefg'))
    pdf2 = pd.DataFrame(np.random.randn(7, 6),
                        columns=list('FGHIJK'), index=list('abcdefg'))
    pdf3 = pd.DataFrame(np.random.randn(7, 6),
                        columns=list('FGHIJK'), index=list('cdefghi'))
    pdf4 = pd.DataFrame(np.random.randn(7, 5),
                        columns=list('FGHAB'), index=list('cdefghi'))
    pdf5 = pd.DataFrame(np.random.randn(7, 5),
                        columns=list('FGHAB'), index=list('fklmnop'))

    ddf1 = dd.from_pandas(pdf1, 2)
    ddf2 = dd.from_pandas(pdf2, 3)
    ddf3 = dd.from_pandas(pdf3, 2)
    ddf4 = dd.from_pandas(pdf4, 2)
    ddf5 = dd.from_pandas(pdf5, 3)

    cases = [[ddf1, ddf2], [ddf1, ddf3], [ddf1, ddf4], [ddf1, ddf5],
             [ddf3, ddf4], [ddf3, ddf5], [ddf5, ddf1, ddf4], [ddf5, ddf3],
             [ddf1.A, ddf4.A], [ddf2.F, ddf3.F], [ddf4.A, ddf5.A],
             [ddf1.A, ddf4.F], [ddf2.F, ddf3.H], [ddf4.A, ddf5.B],
             [ddf1, ddf4.A], [ddf3.F, ddf2], [ddf5, ddf1.A, ddf2]]

    for case in cases:
        pdcase = [c.compute() for c in case]

        # some cases will raise warning directly from pandas; silence them
        # with the stdlib filter rather than pytest.warns(None), which newer
        # pytest releases no longer accept.
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            assert_eq(dd.concat(case, interleave_partitions=True),
                      pd.concat(pdcase))
            assert_eq(dd.concat(case, join='inner',
                                interleave_partitions=True),
                      pd.concat(pdcase, join='inner'))
            assert_eq(dd.concat(case, axis=1),
                      pd.concat(pdcase, axis=1))
            assert_eq(dd.concat(case, axis=1, join='inner'),
                      pd.concat(pdcase, axis=1, join='inner'))

    # Dask + pandas
    cases = [[ddf1, pdf2], [ddf1, pdf3], [pdf1, ddf4],
             [pdf1.A, ddf4.A], [ddf2.F, pdf3.F],
             [ddf1, pdf4.A], [ddf3.F, pdf2], [ddf2, pdf1, ddf3.F]]

    for case in cases:
        # Only the dask members need computing; pandas members pass through.
        pdcase = [c.compute() if isinstance(c, _Frame) else c for c in case]

        assert_eq(dd.concat(case, interleave_partitions=True),
                  pd.concat(pdcase))
        assert_eq(dd.concat(case, join='inner', interleave_partitions=True),
                  pd.concat(pdcase, join='inner'))
        assert_eq(dd.concat(case, axis=1),
                  pd.concat(pdcase, axis=1))
        assert_eq(dd.concat(case, axis=1, join='inner'),
                  pd.concat(pdcase, axis=1, join='inner'))
def test_merge_by_multiple_columns():
    """Compare dask joins/merges with pandas across data sets and partitionings.

    For every (left, right) data pair, partition-count pair and ``how`` mode
    the test checks index joins via ``join`` and ``merge`` and column merges
    ("hash join") on one and on two key columns.
    """
    pdf1l = pd.DataFrame({'a': list('abcdefghij'),
                          'b': list('abcdefghij'),
                          'c': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
                         index=list('abcdefghij'))
    pdf1r = pd.DataFrame({'d': list('abcdefghij'),
                          'e': list('abcdefghij'),
                          'f': [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]},
                         index=list('abcdefghij'))

    pdf2l = pd.DataFrame({'a': list('abcdeabcde'),
                          'b': list('abcabcabca'),
                          'c': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
                         index=list('abcdefghij'))
    pdf2r = pd.DataFrame({'d': list('edcbaedcba'),
                          'e': list('aaabbbcccd'),
                          'f': [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]},
                         index=list('fghijklmno'))

    pdf3l = pd.DataFrame({'a': list('aaaaaaaaaa'),
                          'b': list('aaaaaaaaaa'),
                          'c': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
                         index=list('abcdefghij'))
    pdf3r = pd.DataFrame({'d': list('aaabbbccaa'),
                          'e': list('abbbbbbbbb'),
                          'f': [10, 9, 8, 7, 6, 5, 4, 3, 2, 1]},
                         index=list('ABCDEFGHIJ'))

    for pdl, pdr in [(pdf1l, pdf1r), (pdf2l, pdf2r), (pdf3l, pdf3r)]:
        for lpart, rpart in [(2, 2), (3, 2), (2, 3)]:
            ddl = dd.from_pandas(pdl, lpart)
            ddr = dd.from_pandas(pdr, rpart)

            for how in ['inner', 'outer', 'left', 'right']:
                # Index joins; assert on eq() rather than discarding its
                # result.
                assert eq(ddl.join(ddr, how=how), pdl.join(pdr, how=how))
                assert eq(ddr.join(ddl, how=how), pdr.join(pdl, how=how))

                assert eq(dd.merge(ddl, ddr, how=how,
                                   left_index=True, right_index=True),
                          pd.merge(pdl, pdr, how=how,
                                   left_index=True, right_index=True))
                assert eq(dd.merge(ddr, ddl, how=how,
                                   left_index=True, right_index=True),
                          pd.merge(pdr, pdl, how=how,
                                   left_index=True, right_index=True))

                # hash join
                # (dask left, dask right, pandas left, pandas right,
                #  left_on, right_on)
                hash_joins = [
                    (ddl, ddr, pdl, pdr, 'a', 'd'),
                    (ddl, ddr, pdl, pdr, 'b', 'e'),
                    (ddr, ddl, pdr, pdl, 'd', 'a'),
                    (ddr, ddl, pdr, pdl, 'e', 'b'),
                    (ddl, ddr, pdl, pdr, ['a', 'b'], ['d', 'e']),
                ]
                for dleft, dright, pleft, pright, left_on, right_on in hash_joins:
                    list_eq(dd.merge(dleft, dright, how=how,
                                     left_on=left_on, right_on=right_on),
                            pd.merge(pleft, pright, how=how,
                                     left_on=left_on, right_on=right_on))
def test_reductions():
    """Series reductions must agree with pandas and use the expected graph names."""
    partitions = {
        ('x', 0): pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]},
                               index=[0, 1, 3]),
        ('x', 1): pd.DataFrame({'a': [4, 5, 6], 'b': [3, 2, 1]},
                               index=[5, 6, 8]),
        ('x', 2): pd.DataFrame({'a': [7, 8, 9], 'b': [0, 0, 0]},
                               index=[9, 9, 9]),
    }
    ddf1 = dd.DataFrame(partitions, 'x', ['a', 'b'], [0, 4, 9, 9])
    pdf1 = ddf1.compute()

    # Series with scattered, trailing and only NaNs, plus a boolean series
    nans1 = pd.Series([1] + [np.nan] * 4 + [2] + [np.nan] * 3)
    nands1 = dd.from_pandas(nans1, 2)
    nans2 = pd.Series([1] + [np.nan] * 8)
    nands2 = dd.from_pandas(nans2, 2)
    nans3 = pd.Series([np.nan] * 9)
    nands3 = dd.from_pandas(nans3, 2)

    bools = pd.Series([True, False, True, False, True], dtype=bool)
    boolds = dd.from_pandas(bools, 2)

    series_pairs = [
        (ddf1.b, pdf1.b), (ddf1.a, pdf1.a),
        (ddf1['a'], pdf1['a']), (ddf1['b'], pdf1['b']),
        (nands1, nans1), (nands2, nans2), (nands3, nans3),
        (boolds, bools),
    ]

    # (method name, keyword arguments), in the order they are checked
    default_reductions = [
        ('sum', {}), ('min', {}), ('max', {}), ('count', {}),
        ('std', {}), ('var', {}),
        ('std', {'ddof': 0}), ('var', {'ddof': 0}),
        ('mean', {}), ('nunique', {}),
    ]
    keep_nan_reductions = [
        ('sum', {'skipna': False}), ('min', {'skipna': False}),
        ('max', {'skipna': False}),
        ('std', {'skipna': False}), ('var', {'skipna': False}),
        ('std', {'skipna': False, 'ddof': 0}),
        ('var', {'skipna': False, 'ddof': 0}),
        ('mean', {'skipna': False}),
    ]

    for dds, pds in series_pairs:
        assert isinstance(dds, dd.Series)
        assert isinstance(pds, pd.Series)

        for method, kwargs in default_reductions:
            assert eq(getattr(dds, method)(**kwargs),
                      getattr(pds, method)(**kwargs))

        assert eq(dds.nbytes, pds.nbytes)

        for method, kwargs in keep_nan_reductions:
            assert eq(getattr(dds, method)(**kwargs),
                      getattr(pds, method)(**kwargs))

    column = ddf1.b
    graph_checks = [
        (column.sum(), 'series-sum'),
        (column.min(), 'series-min'),
        (column.max(), 'series-max'),
        (column.count(), 'series-count'),
        (column.std(), 'series-std(ddof=1)'),
        (column.var(), 'series-var(ddof=1)'),
        (column.std(ddof=0), 'series-std(ddof=0)'),
        (column.var(ddof=0), 'series-var(ddof=0)'),
        (column.mean(), 'series-mean'),
        # nunique is performed using drop-duplicates
        (column.nunique(), 'drop-duplicates'),
    ]
    for reduction, label in graph_checks:
        assert_dask_graph(reduction, label)
def test_register_extension_type():
    """A DecimalArray-backed Series and DataFrame must round-trip through dask."""
    decimals = DecimalArray._from_sequence([Decimal('1.0')] * 10)

    # As a Series
    pser = pd.Series(decimals)
    assert_eq(pser, dd.from_pandas(pser, 2))

    # As a single-column DataFrame
    pdf = pd.DataFrame({"A": pser})
    assert_eq(pdf, dd.from_pandas(pdf, 2))
def test_from_pandas_non_sorted():
    """``from_pandas(..., sort=False)`` keeps the data but has unknown divisions."""
    pdf = pd.DataFrame({'x': [1, 2, 3]}, index=[3, 1, 2])

    # Same expectations whether the split is given by count or by size.
    for split in ({'npartitions': 2}, {'chunksize': 2}):
        ddf = dd.from_pandas(pdf, sort=False, **split)
        assert not ddf.known_divisions
        assert_eq(pdf, ddf)
def test_merge_index_without_divisions(shuffle):
    """Inner index join must work when the left frame has no known divisions."""
    left = pd.DataFrame({'x': [1, 2, 3, 4, 5]}, index=[1, 2, 3, 4, 5])
    right = pd.DataFrame({'y': [1, 2, 3, 4, 5]}, index=[5, 4, 3, 2, 1])

    dask_left = dd.from_pandas(left, npartitions=3, sort=False)
    dask_right = dd.from_pandas(right, npartitions=2)

    joined = dask_left.join(dask_right, how='inner', shuffle=shuffle)
    expected = left.join(right, how='inner')
    assert_eq(joined, expected)
# Script fragment: connects to MongoDB database 'hy_db', counts the documents in
# the 'comments' collection that match the site/category/date/rank of `row`, and
# when that count differs from row['number_of_crawled_comment'] writes it to the
# matching news document as 'real_number_of_comment' (collection 'newsDaum' for
# site 'daum', otherwise 'newsNaver'). Prints row.name every 100 rows. The
# __main__ part partitions `extData` with dask and applies
# GetNumberOfCommentInDB row-wise, timing the run.
# NOTE(review): the statements reading `row` look like the body of
# GetNumberOfCommentInDB(row), whose `def` line is not visible here, and
# `datetime`, `dd`, `cpu_count` and `extData` are not defined in this
# fragment — confirm against the full file.
import Database_Handler as dh from bson import ObjectId mongodb = dh.ToMongoDB(*dh.AWS_MongoDB_Information()) dbname = 'hy_db' useDB = dh.Use_Database(mongodb, dbname) commentCollection = dh.Use_Collection(useDB, 'comments') info = {'site': row['site'], 'category': row['category'], 'date': row['date'], 'rank': row['rank']} commentsForNews = commentCollection.find(info) realNumCount = commentsForNews.count() site = row['site'] oid = ObjectId(row['id']) if site == 'daum': newsCollection = dh.Use_Collection(useDB, 'newsDaum') else: newsCollection = dh.Use_Collection(useDB, 'newsNaver') if realNumCount != row['number_of_crawled_comment']: newsCollection.update_one({'_id': oid}, {'$set': {'real_number_of_comment': realNumCount}}) if row.name % 100 == 0: print(row.name) if __name__ == "__main__": start = datetime.now() ddf = dd.from_pandas(extData, npartitions=cpu_count()) ddf.apply(GetNumberOfCommentInDB, axis=1, meta = int).compute() end = datetime.now() print('running time : {}'.format(end - start))
def time_df1(npartitions):
    """Return a four-row dask DataFrame with ``time`` and ``value`` columns.

    Parameters
    ----------
    npartitions : int
        Number of partitions passed to ``dd.from_pandas``.
    """
    timestamps = pd.to_datetime([1, 2, 3, 4])
    readings = [1.1, 2.2, 3.3, 4.4]
    frame = pd.DataFrame({'time': timestamps, 'value': readings})
    return dd.from_pandas(frame, npartitions=npartitions)
def test_add_last_time_indexes():
    """Last time indexes must match between a pandas and a dask EntitySet."""
    pd_es = EntitySet(id="pd_es")
    dask_es = EntitySet(id="dask_es")

    # --- sessions ---
    session_times = [
        pd.to_datetime('2019-01-10'),
        pd.to_datetime('2019-02-03'),
        pd.to_datetime('2019-01-01'),
        pd.to_datetime('2017-08-25'),
    ]
    sessions = pd.DataFrame({
        "id": [0, 1, 2, 3],
        "user": [1, 2, 1, 3],
        "time": session_times,
        "strings": ["I am a string", "23", "abcdef ghijk", ""],
    })
    sessions_dask = dd.from_pandas(sessions, npartitions=2)
    sessions_logical_types = {
        "id": Integer,
        "user": Integer,
        "time": Datetime,
        "strings": NaturalLanguage,
    }

    # --- transactions ---
    transaction_times = [
        pd.to_datetime('2019-01-10 03:53'),
        pd.to_datetime('2019-01-10 04:12'),
        pd.to_datetime('2019-02-03 10:34'),
        pd.to_datetime('2019-01-01 12:35'),
        pd.to_datetime('2019-01-01 12:49'),
        pd.to_datetime('2017-08-25 04:53'),
    ]
    transactions = pd.DataFrame({
        "id": [0, 1, 2, 3, 4, 5],
        "session_id": [0, 0, 1, 2, 2, 3],
        "amount": [1.23, 5.24, 123.52, 67.93, 40.34, 50.13],
        "time": transaction_times,
    })
    transactions_dask = dd.from_pandas(transactions, npartitions=2)
    transactions_logical_types = {
        "id": Integer,
        "session_id": Integer,
        "time": Datetime,
        "amount": Double,
    }

    # Register both dataframes in both entity sets; only the dask entity set
    # is given explicit logical types.
    pd_es.add_dataframe(dataframe_name="sessions", dataframe=sessions,
                        index="id", time_index="time")
    dask_es.add_dataframe(dataframe_name="sessions", dataframe=sessions_dask,
                          index="id", time_index="time",
                          logical_types=sessions_logical_types)

    pd_es.add_dataframe(dataframe_name="transactions",
                        dataframe=transactions,
                        index="id", time_index="time")
    dask_es.add_dataframe(dataframe_name="transactions",
                          dataframe=transactions_dask,
                          index="id", time_index="time",
                          logical_types=transactions_logical_types)

    pd_es = pd_es.add_relationship("sessions", "id",
                                   "transactions", "session_id")
    dask_es = dask_es.add_relationship("sessions", "id",
                                       "transactions", "session_id")

    # The relationship tags the child column as a foreign key.
    pd_tags = pd_es['transactions'].ww.semantic_tags['session_id']
    assert 'foreign_key' in pd_tags
    dask_tags = dask_es['transactions'].ww.semantic_tags['session_id']
    assert 'foreign_key' in dask_tags

    # No last time index before it is requested.
    assert pd_es['sessions'].ww.metadata.get('last_time_index') is None
    assert dask_es['sessions'].ww.metadata.get('last_time_index') is None

    pd_es.add_last_time_indexes()
    dask_es.add_last_time_indexes()

    pd_lti_name = pd_es['sessions'].ww.metadata.get('last_time_index')
    dask_lti_name = dask_es['sessions'].ww.metadata.get('last_time_index')
    assert pd_lti_name == dask_lti_name

    pandas_lti = pd_es['sessions'][pd_lti_name].sort_index()
    dask_lti = dask_es['sessions'][dask_lti_name].compute().sort_index()
    pd.testing.assert_series_equal(pandas_lti, dask_lti, check_names=False)
def partition_and_write(df, x, y, filename, p=10, npartitions=None,
                        shuffle=None, compression='default'):
    """
    Perform spatial partitioning on an input dataframe and write the
    result to a parquet file.

    The resulting parquet file will contain the same columns as the input
    dataframe, but the dataframe's original index will be dropped.

    The resulting parquet file will contain all of the rows from the input
    dataframe, but they will be spatially sorted and partitioned along a
    2D Hilbert curve (https://en.wikipedia.org/wiki/Hilbert_curve).

    The parquet file will also contain custom metadata (stored under the
    'SpatialPointsFrame' key) that is needed to reconstruct the Hilbert
    curve distances on load. This parquet file may then be used to
    construct SpatialPointsFrame instances.

    Any file or directory already present at ``filename`` is removed first.

    Parameters
    ----------
    df: pd.DataFrame or dd.DataFrame
        The input dataframe to partition
    x, y
        The column labels in df of the x and y coordinates of each row
    filename: str
        The path where the resulting parquet file should be written. Must
        end with '.parquet' or '.parq'. See dask.dataframe.to_parquet for
        description of supported path specifications.
    p: int (default 10)
        The Hilbert curve order parameter that determines the resolution
        of the 2D grid that data points are rounded to before computing
        their Hilbert distance. Points will be discretized into 2 ** p
        bins in each the x and y dimensions. This parameter should be
        increased if the partitions of the resulting parquet files are
        significantly unbalanced.
    npartitions: int or None (default None)
        The number of partitions for the resulting parquet file. If None
        (the default) this is chosen to be the greater of 8 and
        len(df) // 2**23. In general, increasing the number of partitions
        will improve performance when processing small subsets of the
        overall parquet data set. But this comes at the cost of some
        additional overhead when processing the entire data set.
    shuffle: str or None (default None)
        The dask.dataframe.DataFrame.set_index shuffle method. If None, a
        default is chosen based on the current scheduler.
    compression: str or None (default 'default')
        The dask.dataframe.to_parquet compression method.

    Raises
    ------
    ValueError
        If ``filename`` is not a string with a .parquet/.parq extension,
        or if ``df`` is neither a pandas nor a dask DataFrame.
    """
    _validate_fastparquet()

    # Validate filename
    if (not isinstance(filename, basestring) or
            not filename.endswith(('.parquet', '.parq'))):
        raise ValueError("""\
filename must be a string ending with a .parquet or .parq extension""")

    # Remove any existing output. A parquet dataset is normally a
    # directory, but a single file may be left at the same path; rmtree
    # would raise on a regular file, so handle both cases.
    if os.path.isdir(filename):
        shutil.rmtree(filename)
    elif os.path.exists(filename):
        os.remove(filename)

    # Normalize to dask dataframe
    if isinstance(df, pd.DataFrame):
        ddf = dd.from_pandas(df, npartitions=4)
    elif isinstance(df, dd.DataFrame):
        ddf = df
    else:
        raise ValueError("""
df must be a pandas or dask DataFrame instance.
Received value of type {typ}""".format(typ=type(df)))

    # Compute npartitions if needed
    if npartitions is None:
        # Make partitions of ~8 million rows with a minimum of 8
        # partitions
        npartitions = max(len(df) // 2**23, 8)

    # Compute data extents
    extents = ddf.map_partitions(_compute_extents, x, y).compute()
    x_range = (float(extents['x_min'].min()),
               float(extents['x_max'].max()))
    y_range = (float(extents['y_min'].min()),
               float(extents['y_max'].max()))

    # Compute distance of points along the Hilbert-curve
    ddf = ddf.assign(distance=ddf.map_partitions(
        _compute_distance, x=x, y=y, p=p,
        x_range=x_range, y_range=y_range, as_series=True))

    # Set index to distance. This will trigger an expensive shuffle
    # sort operation
    ddf = ddf.set_index('distance', npartitions=npartitions,
                        shuffle=shuffle)

    # Get list of the distance divisions computed by dask
    distance_divisions = [int(d) for d in ddf.divisions]

    # Save properties as custom metadata in the parquet file
    props = dict(
        version='1.0',
        x=x,
        y=y,
        p=p,
        distance_divisions=distance_divisions,
        x_range=x_range,
        y_range=y_range,
    )

    # Drop distance index to save storage space
    ddf = ddf.reset_index(drop=True)

    # Save ddf to parquet
    dd.to_parquet(ddf, filename, engine='fastparquet',
                  compression=compression)

    # Open resulting parquet file
    pf = fp.ParquetFile(filename)

    # Add a new property to the file metadata
    new_fmd = copy.copy(pf.fmd)
    new_kv = fp.parquet_thrift.KeyValue()
    new_kv.key = 'SpatialPointsFrame'
    new_kv.value = json.dumps(props)
    new_fmd.key_value_metadata.append(new_kv)

    # Overwrite file metadata
    fn = os.path.join(filename, '_metadata')
    fp.writer.write_common_metadata(fn, new_fmd, no_row_groups=False)

    fn = os.path.join(filename, '_common_metadata')
    fp.writer.write_common_metadata(fn, new_fmd)
import pandas as pd
import pytest
import numpy as np

import dask.dataframe as dd
from dask.dataframe.utils import assert_eq, PANDAS_VERSION

# Number of rows in each module-level fixture frame.
N = 40

# Integer-indexed fixture: one cumulative-sum float column and four random
# integer columns, plus its three-partition dask counterpart.
df = pd.DataFrame({
    'a': np.random.randn(N).cumsum(),
    'b': np.random.randint(100, size=(N, )),
    'c': np.random.randint(100, size=(N, )),
    'd': np.random.randint(100, size=(N, )),
    'e': np.random.randint(100, size=(N, ))
})
ddf = dd.from_pandas(df, 3)

# Irregular datetime index: the union of a 3-second and a 5-second grid,
# truncated to N entries. ``Index.union`` is used instead of ``|`` because
# the operator's set-operation meaning on indexes is deprecated in pandas.
idx = (pd.date_range('2016-01-01', freq='3s', periods=100)
       .union(pd.date_range('2016-01-01', freq='5s', periods=100)))[:N]

# Datetime-indexed fixture with the same column layout as ``df``.
ts = pd.DataFrame(
    {
        'a': np.random.randn(N).cumsum(),
        'b': np.random.randint(100, size=(N, )),
        'c': np.random.randint(100, size=(N, )),
        'd': np.random.randint(100, size=(N, )),
        'e': np.random.randint(100, size=(N, ))
    },
    index=idx)
dts = dd.from_pandas(ts, 3)
def test_from_pandas_single_row():
    """A one-row frame converts with divisions that repeat its only label."""
    single_row = pd.DataFrame({"x": [1]}, index=[1])
    converted = dd.from_pandas(single_row, npartitions=1)

    assert (1, 1) == converted.divisions
    assert_eq(converted, single_row)
def test_to_bag():
    """Exercise ``to_bag`` on frames and series, in tuple and dict formats."""
    letters = ["a", "b", "c", "d"]
    numbers = [2, 3, 4, 5]
    labels = [1.0, 2.0, 3.0, 4.0]

    a = pd.DataFrame(
        {"x": letters, "y": numbers},
        index=pd.Index(labels, name="ind"),
    )
    ddf = dd.from_pandas(a, 2)

    # Expected dict-format records, built from the same source lists.
    frame_records = [{"x": x, "y": y} for x, y in zip(letters, numbers)]
    frame_records_indexed = [
        {"index": label, "x": x, "y": y}
        for label, x, y in zip(labels, letters, numbers)
    ]
    series_records = [{"x": x} for x in letters]

    # DataFrame, tuple format (without / with index).
    assert ddf.to_bag().compute() == list(a.itertuples(False))
    assert ddf.to_bag(True).compute() == list(a.itertuples(True))

    # DataFrame, dict format (without / with index).
    assert ddf.to_bag(format="dict").compute() == frame_records
    assert ddf.to_bag(True, format="dict").compute() == frame_records_indexed

    # Series, tuple format (with / without index).
    assert ddf.x.to_bag(True).compute() == list(a.x.items())
    assert ddf.x.to_bag().compute() == list(a.x)

    # Series, dict format: the expected records are the same with or
    # without the index flag.
    assert ddf.x.to_bag(True, format="dict").compute() == series_records
    assert ddf.x.to_bag(format="dict").compute() == series_records
def __init__(self):
    """Initialise ``self.nodes`` as an empty Dask DataFrame of edges.

    The frame has the columns ``node_1``, ``node_2`` and ``weight`` and is
    indexed by ``node_1``, which is also kept as a regular column.
    """
    empty_edges = pd.DataFrame(columns=['node_1', 'node_2', 'weight'])
    self.nodes = dd.from_pandas(
        empty_edges.set_index(['node_1'], drop=False),
        # ``from_pandas`` documents ``chunksize`` as an int; the float
        # literal ``1e9`` is not one, so pass the integer value instead.
        chunksize=10 ** 9)
def test_set_index_sorted_single_partition():
    """``set_index(..., sorted=True)`` on one partition matches pandas."""
    frame = pd.DataFrame({"x": [1, 2, 3, 4], "y": [1, 0, 1, 0]})
    single_partition = dd.from_pandas(frame, npartitions=1)

    actual = single_partition.set_index("x", sorted=True)
    expected = frame.set_index("x")
    assert_eq(actual, expected)
def _fetch_profile_and_discrete(self):
    """Fetch ship-verification summaries and publish them as parquet.

    For every row of ``self._sources_df`` the summary sheet at
    ``summary_url`` is read (CSV or XLSX), cleaned, tagged with its
    ``cruise_id`` and grouped by ``array_rd``. The grouped frames are then
    split into profile and discrete tables, concatenated across arrays and
    written to the ``settings.SHIP_DATA_PROFILES`` and
    ``settings.SHIP_DATA_DISCRETE`` locations of the ``self._cadai_bucket``
    S3 bucket.

    Rows whose URL is neither ``.csv`` nor ``.xlsx`` are skipped with a
    warning rather than silently reusing the previous row's data.
    """
    self._logger.info("Fetching Profiles and Discrete...")
    svdf_arrays = {}
    label_arrays = {}
    for _, row in self._sources_df.iterrows():
        self._logger.info("------------------------")
        self._logger.info(row['cruise_id'])
        url = row['summary_url']
        self._logger.info(url)
        if url.endswith('.csv'):
            svdf = pd.read_csv(url, na_values=['-9999999'])
        elif url.endswith('.xlsx'):
            svdf = pd.read_excel(url, na_values=['-9999999'])
        else:
            # Without this branch ``svdf`` would be unbound on the first
            # row, or still hold the previous row's frame.
            self._logger.warning(
                'Unsupported summary file type, skipping: %s', url
            )
            continue

        array_rd = row['array_rd']
        svdf_arrays.setdefault(array_rd, [])

        clean_svdf, discrete_sample_labels = clean_ship_verification(svdf)
        # The latest cruise of an array overwrites earlier label tables.
        label_arrays[array_rd] = pd.DataFrame(
            discrete_sample_labels
        ).set_index('name')

        if array_rd == 'CE':
            # Fix some O, 0 weirdness...
            if 'CEO2' in clean_svdf['station'].unique():
                self._logger.warning('CEO2 found! Fixing to CE02...')
                clean_svdf.loc[:, 'station'] = clean_svdf['station'].apply(
                    lambda r: r.replace('O', '0')
                )

        clean_svdf.loc[:, 'cruise_id'] = row['cruise_id']
        final_svdf = clean_svdf.reset_index(drop=True)
        cleaned_final_svdf = self.check_types_and_replace(final_svdf)
        svdf_arrays[array_rd].append(cleaned_final_svdf)

    svdf_dict = {
        k: pd.concat(v, sort=False) for k, v in svdf_arrays.items()
    }

    # Creates label mapping for Display Names and Units
    self._create_label_map(label_arrays)

    profile_list, discrete_list = [], []
    for k, v in svdf_dict.items():
        sampledf = v.copy()
        profile_df, discrete_df = self.parse_profile_and_discrete(
            sampledf, k
        )
        profile_list.append(profile_df)

        # Drop a calculated column only when it exists and is entirely
        # null. Each column is checked on its own so that a frame holding
        # just one of them does not raise KeyError on the other.
        for column in ('calculated_dic', 'calculated_pco2'):
            if (
                column in discrete_df.columns
                and discrete_df[column].isna().all()
            ):
                discrete_df.drop(column, axis=1, inplace=True)

        discrete_list.append(discrete_df)

    all_profiles = pd.concat(profile_list, sort=False).reset_index(
        drop=True
    )
    all_discrete = pd.concat(discrete_list, sort=False).reset_index(
        drop=True
    )

    apdd = dd.from_pandas(all_profiles, npartitions=2)
    addd = dd.from_pandas(all_discrete, npartitions=2)

    apdd.to_parquet(
        f"s3://{self._cadai_bucket}/{settings.SHIP_DATA_PROFILES}",
        write_index=False,
    )
    addd.to_parquet(
        f"s3://{self._cadai_bucket}/{settings.SHIP_DATA_DISCRETE}",
        write_index=False,
    )
# df_=dd.from_pandas(df_, npartitions=2*multiprocessing.cpu_count()) # df['elmo']=df_.usable_text.map(lambda usable_text: give_paragraph_elmo_vector(usable_text), meta=('usable_text', str)).compute() # df['elmo'] = df.apply(lambda row: give_paragraph_elmo_vector(row['usable_text']) , axis=1) # print(time.process_time() - start) # df.to_csv(os.path.join(standards_dir,'iso_final_all_clean_text_w_elmo.csv')) # the above code used dask to parallelize the operations for calculating elmo vectors. But this still uses the tf_hub on a paragraph basis. We can try one shot: # -- sent-tokenize the paragraphs and maintain an array to track which sentences belong to which data points. # -- give all sents at once to the tf_hub and extract the tokens then # -- implemented the above in the elmo_utils.py standards_dir = '../standards/data' df = dd.read_csv( os.path.join(standards_dir, 'iso_final_all_clean_text.csv') ) # df=pd.read_csv(os.path.join(standards_dir,'iso_final_all_clean_text.csv'), index_col=0) df = df.compute() df = dd.from_pandas(df, npartitions=2 * multiprocessing.cpu_count()) df = df[df['type'] == 'standard'].reset_index(drop=True) df = df.fillna('') df = df.map_partitions(lambda df: df.assign(usable_text=df['description_clean'] + ' ' + df['title'])).compute() # df=df.head(100) # df=df.reset_index() df_splits = np.array_split(df, 31) for df_split in df_splits: start = time.process_time() df_split['elmo'] = give_paragraph_elmo_vector_multi( list(df_split['usable_text'])) print(time.process_time() - start) df = pd.concat(df_splits) df.to_csv(
def _download_prices(date):
    '''
    Download the MRCI OHLC table for one day.

    input: datetime object, or a 'YYYY-MM-DD' string
    output: single-partition dask dataframe with prices for all available
            futures for the specified date, indexed by that date.
            If the page cannot be fetched or parsed, or no complete
            contract section is found, a list containing the requested
            date is returned instead.
    '''
    columns = [
        'month', 'date', 'open', 'high', 'low', 'close', 'change',
        'volume', 'open_interest', 'change_in_oi'
    ]
    db = DataBase()
    errors = []
    if isinstance(date, str):
        date = pd.to_datetime(date, format='%Y-%m-%d')
    y = str(date.year)
    m = f'{date.month:02d}'
    d = f'{date.day:02d}'
    try:
        url = f'https://www.mrci.com/ohlc/{y}/{y[-2:]+m+d}.php'
        soup = db._get_session(url)
        df = pd.read_html(str(soup.find('map').find_next('table')))[0]
        # os.path.join picks the right separator, so no retry with a
        # hand-written backslash path is needed.
        lookup_path = os.path.join(os.path.dirname(__file__),
                                   'futures_lookup.csv')
        futures_lookup = pd.read_csv(lookup_path).name.tolist()
        # Row positions whose first cell is a known future name; these
        # rows act as section headers in the scraped table.
        indices = [
            i for i, j in enumerate(df.iloc[:, 0]) if j in futures_lookup
        ]
        if len(df.columns) == 11:
            # Drop the extra trailing column so the frame has ten columns.
            df = df.iloc[indices[0]:-2, :len(df.columns) - 1]
        else:
            df = df.iloc[indices[0]:-2, :]
    except Exception:
        # Best effort: any fetch/parse failure is reported to the caller
        # through the returned error list.
        errors.append(date)
        return errors
    df.columns = columns

    # Collect the rows between each pair of consecutive header rows and
    # label them with the header's future name.
    sections = []
    for start, stop in zip(indices, indices[1:]):
        temp = df.loc[start + 1:stop - 2].copy()
        temp['future'] = df.loc[start, 'month']
        sections.append(temp)
    if not sections:
        # Fewer than two header rows: nothing to assemble.
        errors.append(date)
        return errors
    out = pd.concat(sections)

    out = out[out.iloc[:, 1] != 'Total Volume and Open Interest']
    out.index = [date] * len(out)
    out.replace(r'\+', '', regex=True, inplace=True)
    out.replace('unch', np.nan, inplace=True)
    out = db._col_to_float(out)
    return dd.from_pandas(out, npartitions=1)
def test_set_index_interpolate_int():
    """Divisions chosen for an integer column must themselves be integers."""
    doubled = sorted(list(range(0, 200, 10)) * 2)
    frame = pd.DataFrame({'x': doubled + doubled})

    reindexed = dd.from_pandas(frame, 2).set_index('x', npartitions=10)

    for division in reindexed.divisions:
        assert np.issubdtype(type(division), np.integer)
def Agriculture():
    """
    Import and format previously calculated agriculture fuel and energy
    use, scale it to each calculation year, and write the combined result
    to a gzip-compressed parquet file under ``../results/``.

    ag_electricity.py and ag_fuel.py should be refactored for methods, etc.

    Returns
    -------
    dask DataFrame indexed by ``fipstate`` with one row per
    (NAICS, COUNTY_FIPS, MECS_FT, state, year) and the energy value in
    ``MMBtu_TOTAL``.
    """

    # use 2012 results for 2010 - 2014; use 2017 results for 2015 - 2017
    fuel_results_file_12 = \
        '../results/ag_output_fuel_use_by_county_2012_20190812_2238.csv'

    fuel_results_file_17 = \
        '../results/ag_output_fuel_use_by_county_2017_20190812_2230.csv'

    elect_results_file_12 = \
        '../results/ag_output_electricity_use_by_county_2012_20190813_1024.csv'

    elect_results_file_17 =\
        '../results/ag_output_electricity_use_by_county_2017_20190813_1019.csv'

    def import_format(results_filepath):
        """Read one results CSV and normalise its fuel-type labels."""
        ag_energy = pd.read_csv(results_filepath, index_col=0)

        # Check if index was written to file
        if ag_energy.index.names != [None]:
            ag_energy.reset_index(inplace=True)

        ag_energy.replace({'LP GAS': 'LPG_NGL', 'NATURAL_GAS': 'Natural_gas',
                           'DIESEL': 'Diesel', 'LPG': 'LPG_NGL',
                           'GASOLINE': 'Other', 'OTHER': 'Residual_fuel_oil',
                           'ELECTRICITY': 'Net_electricity'}, inplace=True)

        # NOTE(review): without ``columns=`` this rename targets index
        # labels, not the 'fuel_type' column. The column is renamed later,
        # after calc_county_fuel, so this call is left untouched here --
        # confirm whether it is intentional.
        ag_energy.rename({'fuel_type': 'MECS_FT'}, inplace=True)

        return ag_energy

    fuel_12 = import_format(fuel_results_file_12)
    fuel_17 = import_format(fuel_results_file_17)
    elect_12 = import_format(elect_results_file_12)
    elect_17 = import_format(elect_results_file_17)

    multiplier_12 = ag.calc_multiplier(base_year=2012,
                                       calculation_years=range(2010, 2015))

    multiplier_17 = ag.calc_multiplier(base_year=2017,
                                       calculation_years=range(2015, 2018))

    county_fuel = pd.concat(
        [ag.calc_county_fuel(fuel_12, multiplier_12,
                             calculation_years=range(2010, 2015)),
         ag.calc_county_fuel(fuel_17, multiplier_17,
                             calculation_years=range(2015, 2018))],
        axis=1, ignore_index=False
        )

    county_elec = pd.concat(
        [ag.calc_county_fuel(elect_12, multiplier_12,
                             calculation_years=range(2010, 2015)),
         ag.calc_county_fuel(elect_17, multiplier_17,
                             calculation_years=range(2015, 2018))],
        axis=1, ignore_index=False
        )

    # DataFrame.append was removed in pandas 2.0; collect the melted
    # frames and concatenate them once instead.
    melted_frames = []

    for df in [county_fuel, county_elec]:

        # Skip the first two columns of the side-by-side concatenation.
        df = df.iloc[:, 2:]

        # ``ffill`` is the non-deprecated spelling of
        # ``fillna(method='ffill')``.
        # NOTE(review): these are in-place fills on a column of a sliced
        # frame; verify they still take effect under copy-on-write.
        df.state.ffill(inplace=True)

        df.fipstate.ffill(inplace=True)

        df['fipstate'] = df.fipstate.astype(int)

        df.reset_index(inplace=True)

        # Drop any Alaskan counties missing info
        df = df[df.COUNTY_FIPS != 2]

        df.rename(columns={'fuel_type': 'MECS_FT'}, inplace=True)

        # Wide (one column per year) -> long (one row per year).
        melted_frames.append(df.melt(
            id_vars=['NAICS', 'COUNTY_FIPS', 'fipstate', 'MECS_FT', 'state'],
            var_name='year', value_name='MMBtu_TOTAL'
            ))

    county_total = pd.concat(melted_frames)

    # One partition is requested per distinct state code.
    county_total = dd.from_pandas(
        county_total.set_index('fipstate'),
        npartitions=len(county_total.fipstate.unique())
        )

    filename = 'ag_county_energy_' + \
        dt.datetime.now().strftime('%Y%m%d_%H%M')+'.parquet.gzip'

    county_total.to_parquet('../results/'+filename, engine='pyarrow',
                            compression='gzip')

    return county_total
def test_from_pandas_with_index_nulls(null_value):
    """``from_pandas`` rejects a non-numeric index that contains a null."""
    labels = ["C", null_value, "A"]
    frame = pd.DataFrame({"x": [1, 2, 3]}, index=labels)

    expected_failure = pytest.raises(
        NotImplementedError, match="is non-numeric and contains nulls"
    )
    with expected_failure:
        dd.from_pandas(frame, npartitions=2, sort=False)
from dask.array.utils import assert_eq as assert_eq_ar
from dask.dataframe.utils import assert_eq as assert_eq_df
from dask_ml.datasets import make_classification
from dask_ml.utils import (
    _num_samples,
    assert_estimator_equal,
    check_array,
    check_chunks,
    check_matching_blocks,
    check_random_state,
    handle_zeros_in_scale,
    slice_columns,
)

# Module-level sample inputs: a dask DataFrame (42 rows x 5 columns, the
# transpose of five copies of range(42)), a dask Series, a dask array and a
# generated classification data set.
df = dd.from_pandas(pd.DataFrame(5 * [range(42)]).T, npartitions=5)
s = dd.from_pandas(pd.Series([0, 1, 2, 3, 0]), npartitions=5)
a = da.from_array(np.array([0, 1, 2, 3, 0]), chunks=3)
X, y = make_classification(chunks=(2, 20))

# Two record types whose field names overlap only partially.
Foo = namedtuple("Foo", "a_ b_ c_ d_")
Bar = namedtuple("Bar", "a_ b_ d_ e_")


def test_slice_columns():
    """``slice_columns`` on a dask DataFrame selects the requested columns."""
    columns = [2, 3]
    df2 = slice_columns(df, columns)
    # NOTE(review): X2 is computed but not asserted on in the code visible
    # here -- confirm whether a check on the array result is missing.
    X2 = slice_columns(X, columns)

    assert list(df2.columns) == columns

    assert_eq_df(df[columns].compute(), df2.compute())
def test_rolling_repr():
    """The repr of a rolling object spells out window, center and axis."""
    frame = pd.DataFrame([10] * 30)
    roller = dd.from_pandas(frame, npartitions=3).rolling(4)

    expected = 'Rolling [window=4,center=False,axis=0]'
    assert repr(roller) == expected
import pandas.util.testing as tm import pytest import sklearn.preprocessing as spp from dask import compute from dask.array.utils import assert_eq as assert_eq_ar from dask.dataframe.utils import assert_eq as assert_eq_df from pandas.api.types import is_categorical_dtype, is_object_dtype from sklearn.exceptions import NotFittedError import dask_ml.preprocessing as dpp from dask_ml.datasets import make_classification from dask_ml.utils import assert_estimator_equal X, y = make_classification(chunks=50) df = X.to_dask_dataframe().rename(columns=str) df2 = dd.from_pandas(pd.DataFrame(5 * [range(42)]).T.rename(columns=str), npartitions=5) raw = pd.DataFrame( { "A": ["a", "b", "c", "a"], "B": ["a", "b", "c", "a"], "C": ["a", "b", "c", "a"], "D": [1, 2, 3, 4], }, columns=["A", "B", "C", "D"], ) dummy = pd.DataFrame( { "A": pd.Categorical(["a", "b", "c", "a"], ordered=True), "B": pd.Categorical(["a", "b", "c", "a"], ordered=False), "C": pd.Categorical(["a", "b", "c", "a"], categories=["a", "b", "c", "d"]),
def test_rolling_names():
    """Building the same rolling sum twice yields the same graph keys."""
    frame = pd.DataFrame({'a': [1, 2, 3], 'b': [4, 5, 6]})
    a = dd.from_pandas(frame, npartitions=2)

    first_keys = sorted(a.rolling(2).sum().dask)
    second_keys = sorted(a.rolling(2).sum().dask)
    assert first_keys == second_keys
def test_da(self):
    """Fitting and transforming a dask DataFrame returns a dask DataFrame."""
    frame = dd.from_pandas(dummy, npartitions=2)
    encoder = dpp.OrdinalEncoder()

    encoded = encoder.fit_transform(frame)

    assert isinstance(encoded, dd.DataFrame)
def test_make_meta():
    """Exercise ``make_meta`` across every kind of input it accepts.

    Covers pandas objects, dask objects, dict / iterable / tuple dtype
    specifications (with and without an explicit index), categoricals,
    numpy and Python scalars, timestamps and dtype expressions.
    """
    df = pd.DataFrame(
        {"a": [1, 2, 3], "b": list("abc"), "c": [1.0, 2.0, 3.0]},
        index=[10, 20, 30]
    )

    # Pandas dataframe
    meta = make_meta(df)
    assert len(meta) == 0
    assert (meta.dtypes == df.dtypes).all()
    assert isinstance(meta.index, type(df.index))

    # Pandas series
    meta = make_meta(df.a)
    assert len(meta) == 0
    assert meta.dtype == df.a.dtype
    assert isinstance(meta.index, type(df.index))

    # Pandas index
    meta = make_meta(df.index)
    assert isinstance(meta, type(df.index))
    assert len(meta) == 0

    # Dask object
    ddf = dd.from_pandas(df, npartitions=2)
    assert make_meta(ddf) is ddf._meta

    # Dict
    meta = make_meta({"a": "i8", "b": "O", "c": "f8"})
    assert isinstance(meta, pd.DataFrame)
    assert len(meta) == 0
    assert (meta.dtypes == df.dtypes).all()
    assert isinstance(meta.index, pd.RangeIndex)

    # Iterable
    meta = make_meta([("a", "i8"), ("c", "f8"), ("b", "O")])
    assert (meta.columns == ["a", "c", "b"]).all()
    assert len(meta) == 0
    assert (meta.dtypes == df.dtypes[meta.dtypes.index]).all()
    assert isinstance(meta.index, pd.RangeIndex)

    # Tuple
    meta = make_meta(("a", "i8"))
    assert isinstance(meta, pd.Series)
    assert len(meta) == 0
    assert meta.dtype == "i8"
    assert meta.name == "a"

    # With index
    # ``pd.Int64Index`` no longer exists in pandas 2.x; a plain ``pd.Index``
    # of Python ints has int64 dtype on every pandas version, so check the
    # dtype instead of the removed class.
    meta = make_meta({"a": "i8", "b": "i4"},
                     index=pd.Index([1, 2], name="foo"))
    assert isinstance(meta.index, pd.Index)
    assert meta.index.dtype == "int64"
    assert len(meta.index) == 0
    meta = make_meta(("a", "i8"), index=pd.Index([1, 2], name="foo"))
    assert isinstance(meta.index, pd.Index)
    assert meta.index.dtype == "int64"
    assert len(meta.index) == 0

    # Categoricals
    meta = make_meta({"a": "category"})
    assert len(meta.a.cat.categories) == 1
    assert meta.a.cat.categories[0] == UNKNOWN_CATEGORIES
    meta = make_meta(("a", "category"))
    assert len(meta.cat.categories) == 1
    assert meta.cat.categories[0] == UNKNOWN_CATEGORIES

    # Numpy scalar
    meta = make_meta(np.float64(1.0))
    assert isinstance(meta, np.float64)

    # Python scalar
    meta = make_meta(1.0)
    assert isinstance(meta, np.float64)

    # Timestamp
    x = pd.Timestamp(2000, 1, 1)
    meta = make_meta(x)
    assert meta is x

    # Dtype expressions
    meta = make_meta("i8")
    assert isinstance(meta, np.int64)
    meta = make_meta(float)
    assert isinstance(meta, np.dtype(float).type)
    meta = make_meta(np.dtype("bool"))
    assert isinstance(meta, np.bool_)

    # Unsupported input must be rejected.
    with pytest.raises(TypeError):
        make_meta(None)
def dask_df(pandas_df):
    """Convert ``pandas_df`` to a Dask DataFrame, requesting five partitions."""
    requested_partitions = 5
    return dd.from_pandas(pandas_df, npartitions=requested_partitions)
def to_dc(self, input_item, table_name: str, format: str = None, **kwargs):
    """Wrap ``input_item`` in a Dask DataFrame.

    ``npartitions`` is taken from the keyword arguments (default 1); all
    remaining keyword arguments are forwarded to ``dd.from_pandas``.
    ``table_name`` and ``format`` are not used here.
    """
    forwarded = dict(kwargs)
    partition_count = forwarded.pop("npartitions", 1)
    return dd.from_pandas(input_item, npartitions=partition_count, **forwarded)
def test_set_index_reduces_partitions_small(shuffle):
    """``npartitions='auto'`` shrinks a small, over-partitioned frame."""
    frame = pd.DataFrame({'x': np.random.random(100)})
    over_partitioned = dd.from_pandas(frame, npartitions=50)

    reindexed = over_partitioned.set_index(
        'x', shuffle=shuffle, npartitions='auto')

    assert 10 > reindexed.npartitions
def time_df2(npartitions):
    """Return a two-row Dask DataFrame with a datetime and a float column."""
    column_data = {
        'time': pd.to_datetime([2, 4]),
        'other_value': [1.2, 2.0],
    }
    return dd.from_pandas(pd.DataFrame(column_data), npartitions=npartitions)
def test_set_index_detects_sorted_data(shuffle):
    """Indexing by an already-sorted column keeps the task graph small."""
    frame = pd.DataFrame({'x': range(100), 'y': range(100)})
    ddf = dd.from_pandas(frame, npartitions=10, name='x', sort=False)

    reindexed = ddf.set_index('x', shuffle=shuffle)

    # The graph must stay below four tasks per input partition.
    task_budget = ddf.npartitions * 4
    assert len(reindexed.dask) < task_budget
def test_basic(df, npartitions):
    """``nunique_approx`` is within 2 rows, or within 5%, of the exact count."""
    ddf = dd.from_pandas(df, npartitions=npartitions)

    approx = ddf.nunique_approx().compute(scheduler="sync")
    exact = len(df.drop_duplicates())

    deviation = abs(approx - exact)
    assert deviation <= 2 or deviation / exact < 0.05
train_df['len_word_q2'] = train_df.question2.apply( lambda x: len(str(x).split())) #Test test_df['len_char_q1'] = test_df.question1.apply( lambda x: len(''.join(set(str(x).replace(' ', ''))))) test_df['len_char_q2'] = test_df.question2.apply( lambda x: len(''.join(set(str(x).replace(' ', ''))))) test_df['len_word_q1'] = test_df.question1.apply(lambda x: len(str(x).split())) test_df['len_word_q2'] = test_df.question2.apply(lambda x: len(str(x).split())) ############################################################################### # Try paralell computation with dask #Train print('extra fuzzy features, train....') train_dd = from_pandas(train_df[['question1', 'question2']], npartitions=8) start_time = time.time() train_df['fuzz_qratio'] = train_dd.apply( lambda x: fuzz.QRatio(str(x['question1']), str(x['question2'])), axis=1, meta=('a', np.dtype('int64'))).compute(get=dask.multiprocessing.get) train_df['fuzz_WRatio'] = train_dd.apply( lambda x: fuzz.WRatio(str(x['question1']), str(x['question2'])), axis=1, meta=('a', np.dtype('int64'))).compute(get=dask.multiprocessing.get) train_df['fuzz_token_set_ratio'] = train_dd.apply( lambda x: fuzz.token_set_ratio(str(x['question1']), str(x['question2'])), axis=1, meta=('a', np.dtype('int64'))).compute(get=dask.multiprocessing.get) train_df['fuzz_token_sort_ratio'] = train_dd.apply(
def test_da(self, data):
    """DummyEncoder gives the same result on dask input as on pandas input."""
    dask_input = dd.from_pandas(data, npartitions=2)
    dask_encoder = DummyEncoder()
    actual = dask_encoder.fit_transform(dask_input)

    pandas_encoder = DummyEncoder()
    reference = pandas_encoder.fit_transform(data)

    assert_eq(actual, reference)
def test_to_csv_paths():
    """``to_csv`` with a ``*`` pattern returns one path per partition.

    The files are written to the current working directory and are removed
    again even when the assertion fails.
    """
    df = pd.DataFrame({"A": range(10)})
    ddf = dd.from_pandas(df, npartitions=2)
    expected_paths = ['foo0.csv', 'foo1.csv']
    try:
        assert ddf.to_csv("foo*.csv") == expected_paths
    finally:
        # Clean up whatever was actually written; guard each removal so a
        # missing file does not mask the original failure.
        for path in expected_paths:
            if os.path.exists(path):
                os.remove(path)