def testJoin(self):
    """Verify the chunk graph that tiling an index ``join`` produces.

    Every keyword combination in the table below is joined and tiled; the
    test then inspects operand types, the ``how`` setting, the number of
    map inputs feeding each side and the columns recorded on each chunk.
    """
    left_raw = pd.DataFrame([[1, 3, 3], [4, 2, 6], [7, 8, 9]], index=['a1', 'a2', 'a3'])
    right_half = pd.DataFrame([[1, 2, 3], [1, 5, 6], [7, 8, 9]], index=['a1', 'b2', 'b3']) + 1
    right_raw = pd.concat([right_half, right_half + 1])

    left_mdf = from_pandas(left_raw, chunk_size=2)
    right_mdf = from_pandas(right_raw, chunk_size=2)

    suffixes = {'lsuffix': 'l_', 'rsuffix': 'r_'}
    # NOTE(review): 'left' is listed twice; possibly a different ``how``
    # was intended for the last case - confirm.
    join_kwargs = [dict(suffixes)]
    join_kwargs.extend(dict(suffixes, how=how) for how in ('left', 'right', 'inner', 'left'))

    for kw in join_kwargs:
        joined = left_mdf.join(right_mdf, **kw).tiles()
        self.assertEqual(joined.chunk_shape, (3, 1))

        for chunk in joined.chunks:
            self.assertIsInstance(chunk.op, DataFrameShuffleMerge)
            # when ``how`` is not passed the op is expected to report 'left'
            self.assertEqual(chunk.op.how, kw.get('how', 'left'))

            left, right = chunk.op.inputs
            sides = (left, right)
            for side in sides:
                self.assertIsInstance(side.op, DataFrameMergeAlignReduce)
            for side, n_maps in zip(sides, (2, 3)):
                self.assertEqual(len(side.inputs[0].inputs), n_maps)
            for side in sides:
                for map_chunk in side.inputs[0].inputs:
                    self.assertIsInstance(map_chunk.op, DataFrameMergeAlignMap)
                    self.assertEqual(map_chunk.op.index_shuffle_size, 3)
                    self.assertEqual(map_chunk.op.shuffle_on, None)

            pd.testing.assert_index_equal(chunk.columns_value.to_pandas(),
                                          joined.columns_value.to_pandas())
def testWithMultiForms(self):
    """Run the binary operator through its different call forms.

    The operator is applied through ``self.func`` (twice), through the
    method named by ``self.func_name`` and through the reflected method
    named by ``self.rfunc_name``; each result is compared with pandas.
    """
    def fetch(tileable):
        # execute and return the single concatenated pandas result
        return self.executor.execute_dataframe(tileable, concat=True)[0]

    left_raw = self.to_boolean_if_needed(pd.DataFrame(np.random.rand(10, 10)))
    left = from_pandas(left_raw, chunk_size=5)
    right_raw = self.to_boolean_if_needed(pd.DataFrame(np.random.rand(10, 10)))
    right = from_pandas(right_raw, chunk_size=6)

    wanted = self.func(left_raw, right_raw)

    # functional form, executed twice
    for _ in range(2):
        got = fetch(self.func(left, right))
        pd.testing.assert_frame_equal(wanted, got)

    # method form
    got = fetch(getattr(left, self.func_name)(right))
    pd.testing.assert_frame_equal(wanted, got)

    # reflected method form: operands are swapped on the pandas side
    got = fetch(getattr(left, self.rfunc_name)(right))
    pd.testing.assert_frame_equal(self.func(right_raw, left_raw), got)
def testWithOneShuffleExecution(self):
    """Execute the operator when only one axis of the operands is monotonic.

    Two scenarios are run: first the index is monotonic and the columns
    are shuffled, then the other way round. Results are compared with
    pandas.
    """
    shuffled_left = [4, 1, 3, 2, 10, 5, 9, 8, 6, 7]
    shuffled_right = [5, 9, 12, 3, 11, 10, 6, 4, 1, 2]

    # each scenario: (index, columns) of the left frame, then of the right frame
    scenarios = [
        # monotonic index, shuffled columns
        ((np.arange(10), shuffled_left), (np.arange(11, 1, -1), shuffled_right)),
        # shuffled index, monotonic columns
        ((shuffled_left, np.arange(10)), (shuffled_right, np.arange(11, 1, -1))),
    ]

    for (l_index, l_columns), (r_index, r_columns) in scenarios:
        left_raw = pd.DataFrame(np.random.rand(10, 10), index=l_index, columns=l_columns)
        left = from_pandas(left_raw, chunk_size=5)
        right_raw = pd.DataFrame(np.random.rand(10, 10), index=r_index, columns=r_columns)
        right = from_pandas(right_raw, chunk_size=6)

        combined = self.func(left, right)

        wanted = self.func(left_raw, right_raw)
        got = self.executor.execute_dataframe(combined, concat=True)[0]
        pd.testing.assert_frame_equal(wanted, got)
def testBothOneChunk(self):
    """Check ``add`` on two single-chunk frames with non-monotonic axes.

    Verifies the metadata recorded on the result before tiling and, after
    tiling, that the single ``DataFrameAdd`` chunk consumes the operands'
    chunks directly.
    """
    # no axis is monotonic, but 1 chunk for all axes
    left_raw = pd.DataFrame(np.random.rand(10, 10),
                            index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                            columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
    left = from_pandas(left_raw, chunk_size=10)
    right_raw = pd.DataFrame(np.random.rand(10, 10),
                             index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                             columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
    right = from_pandas(right_raw, chunk_size=10)

    total = add(left, right)

    # columns of the result
    pd.testing.assert_index_equal(total.columns.to_pandas(), (left_raw + right_raw).columns)
    self.assertTrue(total.columns.should_be_monotonic)

    # index of the result
    self.assertIsInstance(total.index_value.value, IndexValue.Int64Index)
    self.assertTrue(total.index_value.should_be_monotonic)
    pd.testing.assert_index_equal(total.index_value.to_pandas(), pd.Int64Index([]))
    for operand in (left, right):
        self.assertNotEqual(total.index_value.key, operand.index_value.key)

    # columns is recorded, so we can get it
    self.assertEqual(total.shape[1], 12)

    total.tiles()

    self.assertEqual(total.chunk_shape, (1, 1))
    for c in total.chunks:
        self.assertIsInstance(c.op, DataFrameAdd)
        self.assertEqual(len(c.inputs), 2)
        # input 0 is the left operand's chunk, input 1 the right operand's
        for position, operand in enumerate((left, right)):
            self.assertIs(c.inputs[position], operand.chunks[0].data)
def testWithShuffleOnStringIndex(self):
    """Execute the operator on frames with non-monotonic string indexes.

    Bitwise logical operators are skipped. For the others the executed
    result is compared with what pandas computes.
    """
    if self.func_name in ['__and__', '__or__', '__xor__']:
        # FIXME bitwise logical operators behave differently with pandas when index is not aligned.
        return

    # no axis is monotonic, and the index values are strings.
    left_labels = list(map(str, [0, 10, 2, 3, 4, 5, 6, 7, 8, 9]))
    right_labels = list(map(str, [11, 1, 2, 5, 7, 6, 8, 9, 10, 3]))

    left_raw = pd.DataFrame(np.random.rand(10, 10), index=left_labels,
                            columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
    left_raw = self.to_boolean_if_needed(left_raw)
    left = from_pandas(left_raw, chunk_size=5)

    right_raw = pd.DataFrame(np.random.rand(10, 10), index=right_labels,
                             columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
    right_raw = self.to_boolean_if_needed(right_raw)
    right = from_pandas(right_raw, chunk_size=6)

    combined = self.func(left, right)

    wanted = self.func(left_raw, right_raw)
    got = self.executor.execute_dataframe(combined, concat=True)[0]
    pd.testing.assert_frame_equal(wanted, got)
def testAppend(self):
    """Check shape, index type and tiling of ``append``.

    The append is tested once with default arguments and once with
    ``ignore_index=True``.
    """
    top_raw = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
    bottom_raw = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
    wanted_nsplits = ((3, 3, 3, 1, 3, 3, 3, 1), (3, 1))

    # plain append
    appended = from_pandas(top_raw, chunk_size=3).append(from_pandas(bottom_raw, chunk_size=3))

    self.assertEqual(appended.shape, (20, 4))
    self.assertIsInstance(appended.index_value.value, IndexValue.Int64Index)

    tiled = appended.tiles()
    self.assertEqual(tiled.nsplits, wanted_nsplits)
    self.assertEqual(tiled.chunk_shape, (8, 2))
    for position, chunk in enumerate(tiled.chunks):
        # chunks are laid out row-major over two chunk columns
        self.assertEqual(chunk.index, divmod(position, 2))

    # append that discards the original index
    appended = from_pandas(top_raw, chunk_size=3).append(from_pandas(bottom_raw, chunk_size=3),
                                                         ignore_index=True)

    self.assertEqual(appended.shape, (20, 4))
    self.assertIsInstance(appended.index_value.value, IndexValue.RangeIndex)
    pd.testing.assert_index_equal(appended.index_value.to_pandas(), pd.RangeIndex(20))

    tiled = appended.tiles()
    self.assertEqual(tiled.nsplits, wanted_nsplits)
    self.assertEqual(tiled.chunk_shape, (8, 2))
    self.assertIsInstance(tiled.chunks[0].op, ChunkStandardizeRangeIndex)
def test_append():
    """Check shape, index type and tiling of ``append``.

    Covers the default call and the ``ignore_index=True`` call.
    """
    first_raw = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
    second_raw = pd.DataFrame(np.random.rand(10, 4), columns=list('ABCD'))
    row_splits = (3, 3, 3, 1, 3, 3, 3, 1)
    column_splits = (3, 1)

    # default append
    first = from_pandas(first_raw, chunk_size=3)
    second = from_pandas(second_raw, chunk_size=3)
    appended = first.append(second)

    assert appended.shape == (20, 4)
    assert isinstance(appended.index_value.value, IndexValue.Int64Index)

    tiled = tile(appended)
    assert tiled.nsplits == (row_splits, column_splits)
    assert tiled.chunk_shape == (8, 2)
    for position, chunk in enumerate(tiled.chunks):
        # two chunk columns, enumerated row by row
        assert chunk.index == divmod(position, 2)

    # append with ignore_index=True
    first = from_pandas(first_raw, chunk_size=3)
    second = from_pandas(second_raw, chunk_size=3)
    appended = first.append(second, ignore_index=True)

    assert appended.shape == (20, 4)
    assert isinstance(appended.index_value.value, IndexValue.RangeIndex)
    pd.testing.assert_index_equal(appended.index_value.to_pandas(), pd.RangeIndex(20))

    tiled = tile(appended)
    assert tiled.nsplits == (row_splits, column_splits)
    assert tiled.chunk_shape == (8, 2)
    assert isinstance(tiled.chunks[0].op, ChunkStandardizeRangeIndex)
def test_without_shuffle_execution(setup, func_name, func_opts):
    """Execute the operator when every axis of both operands is monotonic.

    Bitwise logical operators are skipped. The executed result is compared
    with the pandas result.
    """
    op_name = func_opts.func_name
    if op_name in ['__and__', '__or__', '__xor__']:
        # FIXME bitwise logical operators behave differently with pandas when index is not aligned.
        return

    # all the axes are monotonic
    # left frame with index split into [0...4], [5...9],
    # columns [3...7], [8...12]
    left_raw = pd.DataFrame(np.random.rand(10, 10),
                            index=np.arange(10), columns=np.arange(3, 13))
    left_raw = to_boolean_if_needed(op_name, left_raw)
    left = from_pandas(left_raw, chunk_size=5)

    # right frame with index split into [6...11], [2, 5],
    # columns [4...9], [10, 13]
    right_raw = pd.DataFrame(np.random.rand(10, 10),
                             index=np.arange(11, 1, -1), columns=np.arange(4, 14))
    right_raw = to_boolean_if_needed(op_name, right_raw)
    right = from_pandas(right_raw, chunk_size=6)

    combined = func_opts.func(left, right)

    wanted = func_opts.func(left_raw, right_raw)
    got = combined.execute().fetch()
    pd.testing.assert_frame_equal(wanted, got)
def testMerge(self):
    """Execute ``merge`` with several key specifications and compare to pandas.

    Each case merges the same two frames with different keyword arguments.
    Where a key column is given, both frames are re-indexed by it before
    the sorted frames are compared (see the note on indexes below).
    """
    left_raw = pd.DataFrame(np.arange(20).reshape((4, 5)) + 1, columns=['a', 'b', 'c', 'd', 'e'])
    right_raw = pd.DataFrame(np.arange(20).reshape((5, 4)) + 1, columns=['a', 'b', 'x', 'y'])

    left = from_pandas(left_raw, chunk_size=2)
    right = from_pandas(right_raw, chunk_size=2)

    # Note [Index of Merge]
    #
    # When `left_index` and `right_index` of `merge` are both false, pandas will generate a RangeIndex for
    # the final result dataframe.
    #
    # We chunked the `left` and `right` dataframe, thus every result chunk will have its own RangeIndex.
    # When they are concatenated we don't generate a new RangeIndex for the result, thus we cannot obtain the
    # same index value with pandas. But we guarantee that the content of dataframe is correct.

    # (merge keyword arguments, column to use as index before comparing or None)
    cases = [
        # default merge, no keys given
        ({}, None),
        # merge on left index and `right_on`
        ({'how': 'left', 'right_on': 'x', 'left_index': True}, 'a_x'),
        # merge on `left_on` and right index
        ({'how': 'right', 'left_on': 'a', 'right_index': True}, 'a'),
        # merge on `left_on` and `right_on`
        ({'how': 'left', 'left_on': 'a', 'right_on': 'x'}, 'a_x'),
        # merge on `on`
        ({'how': 'right', 'on': 'a'}, 'a'),
        # merge on multiple columns
        ({'how': 'inner', 'on': ['a', 'b']}, None),
    ]

    for merge_kw, key_column in cases:
        wanted = left_raw.merge(right_raw, **merge_kw)
        merged = left.merge(right, **merge_kw)
        got = self.executor.execute_dataframe(merged, concat=True)[0]

        if key_column is not None:
            wanted.set_index(key_column, inplace=True)
            got.set_index(key_column, inplace=True)

        pd.testing.assert_frame_equal(sort_dataframe_inplace(wanted, 0),
                                      sort_dataframe_inplace(got, 0))
def test_with_plain_value(setup, func_name, func_opts):
    """Apply the operator between Mars objects and plain sequences or arrays.

    Lists, tuples, NumPy arrays and ``from_array`` results are used as the
    other operand of a DataFrame (with ``axis=0``) and of a Series; every
    result is compared with the pandas equivalent.
    """
    op_name = func_opts.func_name
    if op_name in ['__and__', '__or__', '__xor__']:
        # skip tests for bitwise logical operators on plain value.
        return

    values = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

    raw = pd.DataFrame(np.random.rand(10, 10), index=np.arange(10),
                       columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
    raw = to_boolean_if_needed(op_name, raw)
    mdf = from_pandas(raw, chunk_size=6)
    mseries = mdf[2]

    # DataFrame against a list, then a tuple
    for make in (list, tuple):
        got = getattr(mdf, op_name)(make(values), axis=0).execute().fetch()
        wanted = getattr(raw, op_name)(make(values), axis=0)
        pd.testing.assert_frame_equal(wanted, got)

    # Series against a list, then a tuple
    for make in (list, tuple):
        got = getattr(mseries, op_name)(make(values)).execute().fetch()
        wanted = getattr(raw[2], op_name)(make(values))
        pd.testing.assert_series_equal(wanted, got)

    # specify index, not the default range index
    raw = pd.DataFrame(np.random.rand(10, 7), index=np.arange(5, 15),
                       columns=[4, 1, 3, 2, 5, 6, 7])
    raw = to_boolean_if_needed(op_name, raw)
    mdf = from_pandas(raw, chunk_size=6)
    mseries = mdf[2]

    # ``np.asarray`` hands the ndarray through unchanged; ``from_array`` wraps it.
    # The pandas side always receives the plain ndarray.
    for convert in (np.asarray, from_array):
        got = getattr(mdf, op_name)(convert(np.array(values)), axis=0).execute().fetch()
        wanted = getattr(raw, op_name)(np.array(values), axis=0)
        pd.testing.assert_frame_equal(wanted, got)

    for convert in (np.asarray, from_array):
        got = getattr(mseries, op_name)(convert(np.array(values))).execute().fetch()
        wanted = getattr(raw[2], op_name)(np.array(values))
        pd.testing.assert_series_equal(wanted, got)
def testWithPlainValue(self):
    """Apply the operator between Mars objects and plain sequences or arrays.

    Bitwise logical operators are skipped. Otherwise lists, tuples, NumPy
    arrays and ``from_array`` results are combined with a DataFrame (with
    ``axis=0``) and with a Series, and each result is compared with pandas.
    """
    name = self.func_name
    if name in ['__and__', '__or__', '__xor__']:
        # skip tests for bitwise logical operators on plain value.
        return

    def fetch(tileable):
        # execute and return the single concatenated pandas result
        return self.executor.execute_dataframe(tileable, concat=True)[0]

    numbers = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]

    raw = pd.DataFrame(np.random.rand(10, 10), index=np.arange(10),
                       columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
    raw = self.to_boolean_if_needed(raw)
    mdf = from_pandas(raw, chunk_size=6)
    mseries = mdf[2]

    # DataFrame with list / tuple operand
    for operand in (list(numbers), tuple(numbers)):
        got = fetch(getattr(mdf, name)(operand, axis=0))
        wanted = getattr(raw, name)(operand, axis=0)
        pd.testing.assert_frame_equal(wanted, got)

    # Series with list / tuple operand
    for operand in (list(numbers), tuple(numbers)):
        got = fetch(getattr(mseries, name)(operand))
        wanted = getattr(raw[2], name)(operand)
        pd.testing.assert_series_equal(wanted, got)

    # specify index, not the default range index
    raw = pd.DataFrame(np.random.rand(10, 7), index=np.arange(5, 15),
                       columns=[4, 1, 3, 2, 5, 6, 7])
    raw = self.to_boolean_if_needed(raw)
    mdf = from_pandas(raw, chunk_size=6)
    mseries = mdf[2]

    # DataFrame with ndarray operand, then with the array wrapped by from_array
    for wrap_with_from_array in (False, True):
        bound = getattr(mdf, name)
        operand = np.array(numbers)
        if wrap_with_from_array:
            operand = from_array(operand)
        got = fetch(bound(operand, axis=0))
        wanted = getattr(raw, name)(np.array(numbers), axis=0)
        pd.testing.assert_frame_equal(wanted, got)

    # Series with ndarray operand, then with the array wrapped by from_array
    for wrap_with_from_array in (False, True):
        bound = getattr(mseries, name)
        operand = np.array(numbers)
        if wrap_with_from_array:
            operand = from_array(operand)
        got = fetch(bound(operand))
        wanted = getattr(raw[2], name)(np.array(numbers))
        pd.testing.assert_series_equal(wanted, got)
def testWithPlainValue(self):
    """Apply the operator between Mars objects and plain sequences or arrays.

    A DataFrame (with ``axis=0``) and a Series are each combined with a
    list, a tuple, a NumPy array and a ``from_array`` result; every
    executed result is compared with the pandas equivalent.
    """
    method = self.func_name
    plain = (1, 2, 3, 4, 5, 6, 7, 8, 9, 10)

    def execute(tileable):
        # run the graph and take the single concatenated result
        return self.executor.execute_dataframe(tileable, concat=True)[0]

    def check_frame(mars_frame, pandas_frame, mars_operand, pandas_operand):
        actual = execute(getattr(mars_frame, method)(mars_operand, axis=0))
        reference = getattr(pandas_frame, method)(pandas_operand, axis=0)
        pd.testing.assert_frame_equal(reference, actual)

    def check_series(mars_series, pandas_series, mars_operand, pandas_operand):
        actual = execute(getattr(mars_series, method)(mars_operand))
        reference = getattr(pandas_series, method)(pandas_operand)
        pd.testing.assert_series_equal(reference, actual)

    source = pd.DataFrame(np.random.rand(10, 10), index=np.arange(10),
                          columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
    frame = from_pandas(source, chunk_size=6)
    column = frame[2]

    check_frame(frame, source, list(plain), list(plain))
    check_frame(frame, source, tuple(plain), tuple(plain))
    check_series(column, source[2], list(plain), list(plain))
    check_series(column, source[2], tuple(plain), tuple(plain))

    # specify index, not the default range index
    source = pd.DataFrame(np.random.rand(10, 7), index=np.arange(5, 15),
                          columns=[4, 1, 3, 2, 5, 6, 7])
    frame = from_pandas(source, chunk_size=6)
    column = frame[2]

    check_frame(frame, source, np.array(list(plain)), np.array(list(plain)))
    check_frame(frame, source, from_array(np.array(list(plain))), np.array(list(plain)))
    check_series(column, source[2], np.array(list(plain)), np.array(list(plain)))
    check_series(column, source[2], from_array(np.array(list(plain))), np.array(list(plain)))
def testDataFrame(self):
    """With the ``eager_mode`` option on, compare fetched frames with pandas.

    Two frames are created and fetched, then their ``add`` result is
    fetched and compared with the pandas sum.
    """
    with option_context({'eager_mode': True}):
        from mars.dataframe.arithmetic import add

        left_raw = pd.DataFrame(np.random.rand(10, 10))
        left = from_pandas(left_raw, chunk_size=5)
        pd.testing.assert_frame_equal(left.fetch(), left_raw)

        right_raw = pd.DataFrame(np.random.rand(10, 10))
        right = from_pandas(right_raw, chunk_size=6)
        pd.testing.assert_frame_equal(right.fetch(), right_raw)

        total = add(left, right)
        pd.testing.assert_frame_equal(total.fetch(), left_raw + right_raw)
def testWithAllShuffleExecution(self):
    """Execute the operator when neither axis of either operand is monotonic.

    The executed result is compared with the pandas result.
    """
    # no axis is monotonic
    left_raw = pd.DataFrame(
        np.random.rand(10, 10),
        index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
        columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7],
    )
    left = from_pandas(left_raw, chunk_size=5)

    right_raw = pd.DataFrame(
        np.random.rand(10, 10),
        index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
        columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2],
    )
    right = from_pandas(right_raw, chunk_size=6)

    combined = self.func(left, right)

    wanted = self.func(left_raw, right_raw)
    got = self.executor.execute_dataframe(combined, concat=True)[0]
    pd.testing.assert_frame_equal(wanted, got)
def testWithShuffleOnStringIndex(self):
    """Execute the operator on frames with non-monotonic string indexes.

    The executed result is compared with the pandas result.
    """
    # no axis is monotonic, and the index values are strings.
    left_raw = pd.DataFrame(
        np.random.rand(10, 10),
        index=[str(label) for label in (0, 10, 2, 3, 4, 5, 6, 7, 8, 9)],
        columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7],
    )
    left = from_pandas(left_raw, chunk_size=5)

    right_raw = pd.DataFrame(
        np.random.rand(10, 10),
        index=[str(label) for label in (11, 1, 2, 5, 7, 6, 8, 9, 10, 3)],
        columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2],
    )
    right = from_pandas(right_raw, chunk_size=6)

    combined = self.func(left, right)

    wanted = self.func(left_raw, right_raw)
    got = self.executor.execute_dataframe(combined, concat=True)[0]
    pd.testing.assert_frame_equal(wanted, got)
def test_dataframe_and_scalar(setup, func_name, func_opts):
    """Apply the operator between a DataFrame and the scalar ``1``.

    Bitwise logical operators are skipped. Both operand orders are
    executed, through ``func_opts.func`` and through the methods named by
    ``func_name`` / ``rfunc_name``, and compared with pandas.
    """
    if func_opts.func_name in ['__and__', '__or__', '__xor__']:
        # FIXME bitwise logical operators don't support floating point scalars
        return

    raw = to_boolean_if_needed(func_opts.func_name, pd.DataFrame(np.random.rand(10, 10)))
    mdf = from_pandas(raw, chunk_size=2)

    # test dataframe and scalar
    forward = func_opts.func(raw, 1)
    got = func_opts.func(mdf, 1).execute().fetch()
    pd.testing.assert_frame_equal(forward, got)
    got = func_opts.func(mdf, 1).execute().fetch()
    pd.testing.assert_frame_equal(forward, got)
    got = getattr(mdf, func_opts.func_name)(1).execute().fetch()
    pd.testing.assert_frame_equal(forward, got)

    # test scalar and dataframe
    got = func_opts.func(mdf, 1).execute().fetch()
    pd.testing.assert_frame_equal(forward, got)

    reflected = func_opts.func(1, raw)
    got = func_opts.func(1, mdf).execute().fetch()
    pd.testing.assert_frame_equal(reflected, got)
    got = getattr(mdf, func_opts.rfunc_name)(1).execute().fetch()
    pd.testing.assert_frame_equal(reflected, got)
def test_same_index(setup, func_name, func_opts):
    """Apply the operator to operands that share the same index.

    Covers frame with itself, frame with one of its rows as a Series, and
    (except for bitwise logical operators) frame with one of its columns
    along ``axis=0``. Results are compared with pandas.
    """
    op_name = func_opts.func_name

    raw = pd.DataFrame(np.random.rand(10, 10),
                       index=np.random.randint(0, 2, size=(10,)),
                       columns=['c' + str(i) for i in range(10)])
    raw = to_boolean_if_needed(op_name, raw)
    mdf = from_pandas(raw, chunk_size=3)

    # frame with itself
    same = func_opts.func(mdf, mdf)
    wanted = func_opts.func(raw, raw)
    got = same.execute().fetch()
    pd.testing.assert_frame_equal(wanted, got)

    # frame with its first row
    row = from_pandas_series(raw.iloc[0], chunk_size=3)
    with_row = func_opts.func(mdf, row)
    wanted = func_opts.func(raw, raw.iloc[0])
    got = with_row.execute().fetch()
    pd.testing.assert_frame_equal(wanted, got)

    # frame with its first column, aligned on axis 0
    column = from_pandas_series(raw.iloc[:, 0], chunk_size=3)
    with_column = getattr(mdf, op_name)(column, axis=0)

    # the comparison is only made for the non-bitwise operators
    if op_name not in ['__and__', '__or__', '__xor__']:
        wanted = getattr(raw, op_name)(raw.iloc[:, 0], axis=0)
        got = with_column.execute().fetch()
        pd.testing.assert_frame_equal(wanted, got)
def testDataframeAndScalar(self):
    """Apply the operator between a DataFrame and the scalar ``1``.

    Bitwise logical operators are skipped. Both operand orders are
    executed, through ``self.func`` and through the methods named by
    ``self.func_name`` / ``self.rfunc_name``, and compared with pandas.
    """
    if self.func_name in ['__and__', '__or__', '__xor__']:
        # FIXME bitwise logical operators don't support floating point scalars
        return

    def fetch(tileable):
        # execute and return the single concatenated pandas result
        return self.executor.execute_dataframe(tileable, concat=True)[0]

    raw = self.to_boolean_if_needed(pd.DataFrame(np.random.rand(10, 10)))
    mdf = from_pandas(raw, chunk_size=2)

    # test dataframe and scalar
    forward = self.func(raw, 1)
    got = fetch(self.func(mdf, 1))
    pd.testing.assert_frame_equal(forward, got)
    got = fetch(self.func(mdf, 1))
    pd.testing.assert_frame_equal(forward, got)
    got = fetch(getattr(mdf, self.func_name)(1))
    pd.testing.assert_frame_equal(forward, got)

    # test scalar and dataframe
    got = fetch(self.func(mdf, 1))
    pd.testing.assert_frame_equal(forward, got)

    reflected = self.func(1, raw)
    got = fetch(self.func(1, mdf))
    pd.testing.assert_frame_equal(reflected, got)
    got = fetch(getattr(mdf, self.rfunc_name)(1))
    pd.testing.assert_frame_equal(reflected, got)
def testAddSelf(self):
    """Check ``add`` of a frame with itself, before and after tiling.

    The result is expected to reuse the operand's index and columns keys,
    and after tiling every ``DataFrameAdd`` chunk is expected to take the
    operand's chunk at the same position as both of its inputs.
    """
    raw = pd.DataFrame(np.random.rand(10, 10),
                       index=np.random.randint(-100, 100, size=(10,)),
                       columns=[np.random.bytes(10) for _ in range(10)])
    mdf = from_pandas(raw, chunk_size=3)

    doubled = add(mdf, mdf)

    # columns of the result
    pd.testing.assert_index_equal(doubled.columns.to_pandas(), (raw + raw).columns)
    self.assertTrue(doubled.columns.should_be_monotonic)

    # index of the result
    self.assertIsInstance(doubled.index_value.value, IndexValue.Int64Index)
    self.assertTrue(doubled.index_value.should_be_monotonic)
    pd.testing.assert_index_equal(doubled.index_value.to_pandas(), pd.Int64Index([]))

    # keys are shared with the operand
    self.assertEqual(doubled.index_value.key, mdf.index_value.key)
    self.assertEqual(doubled.columns.key, mdf.columns.key)
    self.assertEqual(doubled.shape[1], 10)

    doubled.tiles()

    self.assertEqual(doubled.chunk_shape, mdf.chunk_shape)
    for c in doubled.chunks:
        self.assertIsInstance(c.op, DataFrameAdd)
        self.assertEqual(len(c.inputs), 2)
        # both the left and the right input are the operand's chunk at this position
        for side in (0, 1):
            self.assertIs(c.inputs[side], mdf.cix[c.index].data)
def testSameIndex(self):
    """Apply the operator to operands that share the same index.

    Covers frame with itself, frame with one of its rows as a Series, and
    frame with one of its columns along ``axis=0``. Results are compared
    with pandas.
    """
    def fetch(tileable):
        # execute and return the single concatenated pandas result
        return self.executor.execute_dataframe(tileable, concat=True)[0]

    raw = pd.DataFrame(np.random.rand(10, 10),
                       index=np.random.randint(0, 2, size=(10, )),
                       columns=['c' + str(i) for i in range(10)])
    raw = self.to_boolean_if_needed(raw)
    mdf = from_pandas(raw, chunk_size=3)

    # frame with itself
    same = self.func(mdf, mdf)
    wanted = self.func(raw, raw)
    pd.testing.assert_frame_equal(wanted, fetch(same))

    # frame with its first row
    row = from_pandas_series(raw.iloc[0], chunk_size=3)
    with_row = self.func(mdf, row)
    wanted = self.func(raw, raw.iloc[0])
    pd.testing.assert_frame_equal(wanted, fetch(with_row))

    # frame with its first column, aligned on axis 0
    column = from_pandas_series(raw.iloc[:, 0], chunk_size=3)
    with_column = getattr(mdf, self.func_name)(column, axis=0)
    wanted = getattr(raw, self.func_name)(raw.iloc[:, 0], axis=0)
    pd.testing.assert_frame_equal(wanted, fetch(with_column))
def testAddScalar(self):
    """Check the metadata of frame-plus-scalar results in every spelling.

    Also checks that adding an object which defines ``__radd__`` yields
    that method's return value.
    """
    raw = pd.DataFrame(np.random.rand(10, 10), index=np.arange(10), columns=np.arange(3, 13))
    mdf = from_pandas(raw, chunk_size=5)

    outcomes = [
        # test add with scalar
        add(mdf, 1),
        mdf.add(1),
        # test radd with scalar
        mdf.radd(1),
        mdf + 1,
        1 + mdf,
    ]
    for outcome in outcomes:
        pd.testing.assert_index_equal(outcome.columns.to_pandas(), raw.columns)
        self.assertIsInstance(outcome.index_value.value, IndexValue.Int64Index)

    # test NotImplemented, use other's radd instead
    class TestRadd:
        def __radd__(self, other):
            return 1

    self.assertEqual(mdf + TestRadd(), 1)
def test_merge_on_duplicate_columns(setup):
    """Merge a frame that has two columns named ``value`` and compare with pandas."""
    left_raw = pd.DataFrame(
        [['foo', 1, 'bar'], ['bar', 2, 'foo'], ['baz', 3, 'foo']],
        columns=['lkey', 'value', 'value'],
        index=['a1', 'a2', 'a3'],
    )
    right_raw = pd.DataFrame(
        {'rkey': ['foo', 'bar', 'baz', 'foo'], 'value': [5, 6, 7, 8]},
        index=['a1', 'a2', 'a3', 'a4'],
    )
    keys = {'left_on': 'lkey', 'right_on': 'rkey'}

    left = from_pandas(left_raw, chunk_size=2)
    right = from_pandas(right_raw, chunk_size=3)

    merged = left.merge(right, **keys)
    got = merged.execute().fetch()

    wanted = left_raw.merge(right_raw, **keys)
    pd.testing.assert_frame_equal(wanted, got)
def test_negative(setup):
    """Check unary negation of a Mars DataFrame against pandas.

    A random integer frame is negated through Mars, executed and fetched,
    and the result is compared with negating the pandas frame directly.
    """
    data1 = pd.DataFrame(np.random.randint(low=0, high=100, size=(10, 10)))
    df1 = from_pandas(data1, chunk_size=5)

    # The parentheses matter: ``-df1.execute().fetch()`` parses as
    # ``-(df1.execute().fetch())``, which negates the fetched result
    # instead of running the negation through Mars.
    result = (-df1).execute().fetch()
    expected = -data1
    pd.testing.assert_frame_equal(expected, result)
def testDataframeAndScalar(self):
    """Apply the operator between a DataFrame and the scalar ``1``.

    Both operand orders are executed, through ``self.func`` and through
    the methods named by ``self.func_name`` / ``self.rfunc_name``, and
    compared with pandas.
    """
    run = self.executor.execute_dataframe

    source = pd.DataFrame(np.random.rand(10, 10))
    frame = from_pandas(source, chunk_size=2)

    # test dataframe and scalar
    reference = self.func(source, 1)
    actual = run(self.func(frame, 1), concat=True)[0]
    pd.testing.assert_frame_equal(reference, actual)
    actual = run(self.func(frame, 1), concat=True)[0]
    pd.testing.assert_frame_equal(reference, actual)
    actual = run(getattr(frame, self.func_name)(1), concat=True)[0]
    pd.testing.assert_frame_equal(reference, actual)

    # test scalar and dataframe
    actual = run(self.func(frame, 1), concat=True)[0]
    pd.testing.assert_frame_equal(reference, actual)

    swapped_reference = self.func(1, source)
    actual = run(self.func(1, frame), concat=True)[0]
    pd.testing.assert_frame_equal(swapped_reference, actual)
    actual = run(getattr(frame, self.rfunc_name)(1), concat=True)[0]
    pd.testing.assert_frame_equal(swapped_reference, actual)
def testNegative(self):
    """Execute unary negation of a DataFrame and compare with pandas."""
    raw = pd.DataFrame(np.random.randint(low=0, high=100, size=(10, 10)))
    mdf = from_pandas(raw, chunk_size=5)

    negated = -mdf
    got = self.executor.execute_dataframe(negated, concat=True)[0]

    wanted = -raw
    pd.testing.assert_frame_equal(wanted, got)
def test_not(setup):
    """Check unary inversion (``~``) of a boolean Mars DataFrame against pandas.

    A random boolean frame is inverted through Mars, executed and fetched,
    and the result is compared with inverting the pandas frame directly.
    """
    data1 = pd.DataFrame(np.random.uniform(low=-1, high=1, size=(10, 10)) > 0)
    df1 = from_pandas(data1, chunk_size=5)

    # The parentheses matter: ``~df1.execute().fetch()`` parses as
    # ``~(df1.execute().fetch())``, which inverts the fetched result
    # instead of running the inversion through Mars.
    result = (~df1).execute().fetch()
    expected = ~data1
    pd.testing.assert_frame_equal(expected, result)
def testRfunc(self):
    """Execute the reflected method with a frame and with a scalar operand.

    The reflected method named by ``self.rfunc_name`` is compared with
    ``self.func`` applied to the pandas operands in swapped order.
    """
    def fetch(tileable):
        # execute and return the single concatenated pandas result
        return self.executor.execute_dataframe(tileable, concat=True)[0]

    # reflected call with another frame
    left_raw = pd.DataFrame(np.random.rand(10, 10))
    left = from_pandas(left_raw, chunk_size=5)
    right_raw = pd.DataFrame(np.random.rand(10, 10))
    right = from_pandas(right_raw, chunk_size=6)

    reflected = getattr(left, self.rfunc_name)(right)
    got = fetch(reflected)
    wanted = self.func(right_raw, left_raw)
    pd.testing.assert_frame_equal(wanted, got)

    # reflected call with a scalar
    other_raw = pd.DataFrame(np.random.rand(10, 10))
    other = from_pandas(other_raw, chunk_size=5)

    reflected = getattr(other, self.rfunc_name)(1)
    got = fetch(reflected)
    wanted = self.func(1, other_raw)
    pd.testing.assert_frame_equal(wanted, got)
def testAbs(self):
    """Execute ``abs`` on a DataFrame and compare with pandas ``abs()``."""
    raw = pd.DataFrame(np.random.uniform(low=-1, high=1, size=(10, 10)))
    mdf = from_pandas(raw, chunk_size=5)

    absolute = abs(mdf)
    got = self.executor.execute_dataframe(absolute, concat=True)[0]

    wanted = raw.abs()
    pd.testing.assert_frame_equal(wanted, got)
def testRadd(self):
    """Execute ``radd`` with a frame and with a scalar operand.

    Each result is compared with the corresponding pandas ``+`` expression.
    """
    def fetch(tileable):
        # execute and return the single concatenated pandas result
        return self.executor.execute_dataframe(tileable, concat=True)[0]

    # radd with another frame
    left_raw = pd.DataFrame(np.random.rand(10, 10))
    left = from_pandas(left_raw, chunk_size=5)
    right_raw = pd.DataFrame(np.random.rand(10, 10))
    right = from_pandas(right_raw, chunk_size=6)

    got = fetch(left.radd(right))
    wanted = left_raw + right_raw
    pd.testing.assert_frame_equal(wanted, got)

    # radd with a scalar
    other_raw = pd.DataFrame(np.random.rand(10, 10))
    other = from_pandas(other_raw, chunk_size=5)

    got = fetch(other.radd(1))
    wanted = other_raw + 1
    pd.testing.assert_frame_equal(wanted, got)
def testWithoutShuffleAndWithOneChunk(self):
    """Execute the operator when one axis is monotonic and the other is one chunk.

    Bitwise logical operators are skipped. Two scenarios are run: first
    the index is monotonic and chunked while the shuffled columns stay in
    one chunk, then the other way round. Results are compared with pandas.
    """
    if self.func_name in ['__and__', '__or__', '__xor__']:
        # FIXME bitwise logical operators behave differently with pandas when index is not aligned.
        return

    shuffled_left = [4, 1, 3, 2, 10, 5, 9, 8, 6, 7]
    shuffled_right = [5, 9, 12, 3, 11, 10, 6, 4, 1, 2]

    # each scenario: (index, columns, chunk_size) of the left frame, then of the right frame
    scenarios = [
        # only the index is monotonic; columns are kept in a single chunk
        ((np.arange(10), shuffled_left, (5, 10)),
         (np.arange(11, 1, -1), shuffled_right, (6, 10))),
        # only the columns are monotonic; the index is kept in a single chunk
        ((shuffled_left, np.arange(10), (10, 5)),
         (shuffled_right, np.arange(11, 1, -1), (10, 6))),
    ]

    for (l_index, l_columns, l_chunks), (r_index, r_columns, r_chunks) in scenarios:
        left_raw = pd.DataFrame(np.random.rand(10, 10), index=l_index, columns=l_columns)
        left_raw = self.to_boolean_if_needed(left_raw)
        left = from_pandas(left_raw, chunk_size=l_chunks)

        right_raw = pd.DataFrame(np.random.rand(10, 10), index=r_index, columns=r_columns)
        right_raw = self.to_boolean_if_needed(right_raw)
        right = from_pandas(right_raw, chunk_size=r_chunks)

        combined = self.func(left, right)

        wanted = self.func(left_raw, right_raw)
        got = self.executor.execute_dataframe(combined, concat=True)[0]
        pd.testing.assert_frame_equal(wanted, got)