def testMainDataFrameWithoutEtcd(self):
    """Exercise DataFrame operations through a session started with etcd=False.

    Processes are launched with the scheduler argument
    ``-Dscheduler.aggressive_assign=true``. Results fetched from the
    session are compared with the values pandas computes locally.
    """
    self.start_processes(
        etcd=False,
        scheduler_args=['-Dscheduler.aggressive_assign=true'])
    sess = new_session(self.session_manager_ref.address)

    def run(tileable):
        # Execute on the session under test, then pull the result back.
        executed = tileable.execute(session=sess, timeout=self.timeout)
        return executed.fetch(session=sess)

    # test binary arithmetics with different indices
    # case 1: default indices, differing chunk sizes
    lhs = pd.DataFrame(np.random.rand(10, 10))
    lhs_md = md.DataFrame(lhs, chunk_size=5)
    rhs = pd.DataFrame(np.random.rand(10, 10))
    rhs_md = md.DataFrame(rhs, chunk_size=6)
    pd.testing.assert_frame_equal(run(lhs_md + rhs_md), lhs + rhs)

    # case 2: reversed row index and partially overlapping column labels
    lhs = pd.DataFrame(
        np.random.rand(10, 10),
        index=np.arange(10),
        columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
    lhs_md = md.DataFrame(lhs, chunk_size=(10, 5))
    rhs = pd.DataFrame(
        np.random.rand(10, 10),
        index=np.arange(11, 1, -1),
        columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
    rhs_md = md.DataFrame(rhs, chunk_size=(10, 6))
    pd.testing.assert_frame_equal(run(lhs_md + rhs_md), lhs + rhs)

    # case 3: unordered row labels on both operands
    lhs = pd.DataFrame(
        np.random.rand(10, 10),
        index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
        columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
    lhs_md = md.DataFrame(lhs, chunk_size=5)
    rhs = pd.DataFrame(
        np.random.rand(10, 10),
        index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
        columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
    rhs_md = md.DataFrame(rhs, chunk_size=6)
    pd.testing.assert_frame_equal(run(lhs_md + rhs_md), lhs + rhs)

    # test sort_values
    # The first column is turned into strings before the MultiIndex
    # column labels are applied; sorting is done on that column.
    frame = pd.DataFrame(np.random.rand(10, 10))
    frame[0] = frame[0].apply(str)
    frame.columns = pd.MultiIndex.from_product([list('AB'), list('CDEFG')])
    frame_md = md.DataFrame(frame, chunk_size=5)
    sort_col = ('A', 'C')
    sorted_md = frame_md.sort_values([sort_col])
    pd.testing.assert_frame_equal(
        run(sorted_md), frame.sort_values([sort_col]))

    # Filter first, then sort on an Arrow-string column.
    rs = np.random.RandomState(0)
    col_a = rs.rand(10)
    col_b = [f's{rs.randint(1000)}' for _ in range(10)]
    str_frame = pd.DataFrame({'a': col_a, 'b': col_b})
    str_frame['b'] = str_frame['b'].astype(md.ArrowStringDtype())
    str_md = md.DataFrame(str_frame, chunk_size=4)
    kept = str_md[str_md['a'] > 0.5]
    kept_sorted = kept.sort_values(by='b')
    fetched = run(kept_sorted)
    wanted = str_frame[str_frame['a'] > 0.5].sort_values(by='b')
    pd.testing.assert_frame_equal(fetched, wanted)

    # A Series with an unordered index must round-trip unchanged.
    local_series = pd.Series(
        np.random.rand(10), index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
    remote_series = md.Series(local_series, chunk_size=6)
    pd.testing.assert_series_equal(run(remote_series), local_series)

    # test reindex
    col_names = ['c1', 'c2', 'c3', 'c4', 'c5']
    data = pd.DataFrame(np.random.rand(10, 5), columns=col_names)
    data_md = md.DataFrame(data, chunk_size=4)
    reindexed = data_md.reindex(index=mt.arange(10, 1, -1, chunk_size=3))
    fetched = run(reindexed)
    wanted = data.reindex(index=np.arange(10, 1, -1))
    pd.testing.assert_frame_equal(fetched, wanted)

    # test rebalance
    balanced = md.DataFrame(data).rebalance()
    pd.testing.assert_frame_equal(run(balanced), data)
    metas = sess.get_tileable_chunk_metas(balanced.key)
    # Collect the distinct workers recorded across all chunk metas.
    distinct_workers = {
        worker for meta in metas.values() for worker in meta.workers}
    self.assertEqual(len(distinct_workers), 2)

    # test nunique
    data = pd.DataFrame(
        np.random.randint(0, 10, (100, 5)), columns=col_names)
    data_md = md.DataFrame(data, chunk_size=4)
    unique_counts = data_md.nunique()
    fetched = run(unique_counts)
    wanted = data.nunique()
    pd.testing.assert_series_equal(fetched, wanted)
def testMainDataFrameWithoutEtcd(self):
    """Exercise DataFrame operations through a session started with etcd=False.

    Processes are launched with the scheduler argument
    ``-Dscheduler.aggressive_assign=true``. Results fetched from the
    session are compared with the values pandas computes locally. The
    sections cover binary arithmetic, sort_values, reindex, rebalance,
    nunique, re-execution of an executed groupby result, and
    groupby().sample().
    """
    self.start_processes(
        etcd=False,
        scheduler_args=['-Dscheduler.aggressive_assign=true'])
    sess = new_session(self.session_manager_ref.address)

    def run(tileable):
        # Execute on the session under test, then pull the result back.
        executed = tileable.execute(session=sess, timeout=self.timeout)
        return executed.fetch(session=sess)

    # test binary arithmetics with different indices
    raw1 = pd.DataFrame(np.random.rand(10, 10))
    df1 = md.DataFrame(raw1, chunk_size=5)
    raw2 = pd.DataFrame(np.random.rand(10, 10))
    df2 = md.DataFrame(raw2, chunk_size=6)
    pd.testing.assert_frame_equal(run(df1 + df2), raw1 + raw2)

    raw1 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(10),
                        columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
    df1 = md.DataFrame(raw1, chunk_size=(10, 5))
    raw2 = pd.DataFrame(np.random.rand(10, 10), index=np.arange(11, 1, -1),
                        columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
    df2 = md.DataFrame(raw2, chunk_size=(10, 6))
    pd.testing.assert_frame_equal(run(df1 + df2), raw1 + raw2)

    raw1 = pd.DataFrame(np.random.rand(10, 10),
                        index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
                        columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
    df1 = md.DataFrame(raw1, chunk_size=5)
    raw2 = pd.DataFrame(np.random.rand(10, 10),
                        index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
                        columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
    df2 = md.DataFrame(raw2, chunk_size=6)
    pd.testing.assert_frame_equal(run(df1 + df2), raw1 + raw2)

    # test sort_values
    raw1 = pd.DataFrame(np.random.rand(10, 10))
    raw1[0] = raw1[0].apply(str)
    raw1.columns = pd.MultiIndex.from_product([list('AB'), list('CDEFG')])
    df1 = md.DataFrame(raw1, chunk_size=5)
    result = run(df1.sort_values([('A', 'C')]))
    pd.testing.assert_frame_equal(result, raw1.sort_values([('A', 'C')]))

    # Filter first, then sort on an Arrow-string column.
    rs = np.random.RandomState(0)
    raw2 = pd.DataFrame({
        'a': rs.rand(10),
        'b': [f's{rs.randint(1000)}' for _ in range(10)],
    })
    raw2['b'] = raw2['b'].astype(md.ArrowStringDtype())
    mdf = md.DataFrame(raw2, chunk_size=4)
    filtered = mdf[mdf['a'] > 0.5]
    result = run(filtered.sort_values(by='b'))
    expected = raw2[raw2['a'] > 0.5].sort_values(by='b')
    pd.testing.assert_frame_equal(result, expected)

    s1 = pd.Series(np.random.rand(10),
                   index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
    series1 = md.Series(s1, chunk_size=6)
    pd.testing.assert_series_equal(run(series1), s1)

    # test reindex
    data = pd.DataFrame(np.random.rand(10, 5),
                        columns=['c1', 'c2', 'c3', 'c4', 'c5'])
    df3 = md.DataFrame(data, chunk_size=4)
    result = run(df3.reindex(index=mt.arange(10, 1, -1, chunk_size=3)))
    expected = data.reindex(index=np.arange(10, 1, -1))
    pd.testing.assert_frame_equal(result, expected)

    # test rebalance
    df4 = md.DataFrame(data)
    r = df4.rebalance()
    pd.testing.assert_frame_equal(run(r), data)
    chunk_metas = sess.get_tileable_chunk_metas(r.key)
    # Distinct workers recorded across all chunk metas.
    workers = {w for m in chunk_metas.values() for w in m.workers}
    self.assertEqual(len(workers), 2)

    # test nunique
    data = pd.DataFrame(np.random.randint(0, 10, (100, 5)),
                        columns=['c1', 'c2', 'c3', 'c4', 'c5'])
    df5 = md.DataFrame(data, chunk_size=4)
    result = run(df5.nunique())
    expected = data.nunique()
    pd.testing.assert_series_equal(result, expected)

    # test re-execute df.groupby().agg().sort_values()
    rs = np.random.RandomState(0)
    data = pd.DataFrame({
        'col1': rs.rand(100),
        'col2': rs.randint(10, size=100),
    })
    df6 = md.DataFrame(data, chunk_size=40)
    # Execute the aggregation on its own first; the follow-up sort/head is
    # then built on the already-executed object and executed again.
    grouped = df6.groupby('col2', as_index=False)['col2'] \
        .agg({"cnt": "count"}) \
        .execute(session=sess, timeout=self.timeout)
    r = grouped.sort_values(by='cnt').head() \
        .execute(session=sess, timeout=self.timeout)
    result = r.fetch(session=sess)
    expected = data.groupby('col2', as_index=False)['col2'] \
        .agg({"cnt": "count"}) \
        .sort_values(by='cnt').head()
    pd.testing.assert_frame_equal(result.reset_index(drop=True),
                                  expected.reset_index(drop=True))

    # Same pipeline built in one go, without the intermediate execute.
    r2 = df6.groupby('col2', as_index=False)['col2'] \
        .agg({"cnt": "count"}) \
        .sort_values(by='cnt').head() \
        .execute(session=sess, timeout=self.timeout)
    result = r2.fetch(session=sess)
    pd.testing.assert_frame_equal(result.reset_index(drop=True),
                                  expected.reset_index(drop=True))

    # test groupby with sample
    src_data_list = []
    sample_count = 10
    for b in range(5):
        # Each group gets between 40 and 99 rows, so every group is
        # larger than sample_count.
        data_count = int(np.random.randint(40, 100))
        src_data_list.append(
            pd.DataFrame({
                'a': np.random.randint(0, 100, size=data_count),
                'b': np.array([b] * data_count),
                'c': np.random.randint(0, 100, size=data_count),
                'd': np.random.randint(0, 100, size=data_count),
            }))
    data = pd.concat(src_data_list)
    # Shuffle the rows so that groups are interleaved across chunks.
    shuffle_idx = np.arange(len(data))
    np.random.shuffle(shuffle_idx)
    data = data.iloc[shuffle_idx].reset_index(drop=True)

    df7 = md.DataFrame(data, chunk_size=40)
    # Use sample_count here so the request and the check cannot drift apart.
    sampled = df7.groupby('b').sample(sample_count)
    r = sampled.execute(session=sess, timeout=self.timeout)
    result = r.fetch(session=sess)
    # Each group must contribute exactly sample_count rows. Reduce over
    # both axes rather than taking `[0]` of the per-column Series: that
    # Series is labelled by column name, so integer indexing relies on
    # deprecated positional fallback and inspects only one column.
    per_group_counts = result.groupby('b').count()
    self.assertFalse((per_group_counts - sample_count).any().any())
def testMainDataFrameWithoutEtcd(self):
    """Exercise DataFrame operations through a session started with etcd=False.

    Processes are launched with the scheduler argument
    ``-Dscheduler.aggressive_assign=true``. Results fetched from the
    session are compared with the values pandas computes locally.
    """
    self.start_processes(
        etcd=False,
        scheduler_args=['-Dscheduler.aggressive_assign=true'])
    sess = new_session(self.session_manager_ref.address)

    def run(tileable):
        # Execute on the session under test, then pull the result back.
        executed = tileable.execute(session=sess, timeout=self.timeout)
        return executed.fetch(session=sess)

    # Addition with default indices and differing chunk sizes.
    lhs = pd.DataFrame(np.random.rand(10, 10))
    lhs_md = md.DataFrame(lhs, chunk_size=5)
    rhs = pd.DataFrame(np.random.rand(10, 10))
    rhs_md = md.DataFrame(rhs, chunk_size=6)
    pd.testing.assert_frame_equal(run(lhs_md + rhs_md), lhs + rhs)

    # Addition with a reversed row index and partially overlapping columns.
    lhs = pd.DataFrame(
        np.random.rand(10, 10),
        index=np.arange(10),
        columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
    lhs_md = md.DataFrame(lhs, chunk_size=(10, 5))
    rhs = pd.DataFrame(
        np.random.rand(10, 10),
        index=np.arange(11, 1, -1),
        columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
    rhs_md = md.DataFrame(rhs, chunk_size=(10, 6))
    pd.testing.assert_frame_equal(run(lhs_md + rhs_md), lhs + rhs)

    # Addition with unordered row labels on both operands.
    lhs = pd.DataFrame(
        np.random.rand(10, 10),
        index=[0, 10, 2, 3, 4, 5, 6, 7, 8, 9],
        columns=[4, 1, 3, 2, 10, 5, 9, 8, 6, 7])
    lhs_md = md.DataFrame(lhs, chunk_size=5)
    rhs = pd.DataFrame(
        np.random.rand(10, 10),
        index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3],
        columns=[5, 9, 12, 3, 11, 10, 6, 4, 1, 2])
    rhs_md = md.DataFrame(rhs, chunk_size=6)
    pd.testing.assert_frame_equal(run(lhs_md + rhs_md), lhs + rhs)

    # Sort on column 0 after its values have been converted to strings.
    frame = pd.DataFrame(np.random.rand(10, 10))
    frame[0] = frame[0].apply(str)
    frame_md = md.DataFrame(frame, chunk_size=5)
    sorted_md = frame_md.sort_values(0)
    pd.testing.assert_frame_equal(run(sorted_md), frame.sort_values(0))

    # Sort on an Arrow-string column.
    rs = np.random.RandomState(0)
    col_a = rs.rand(10)
    col_b = [f's{rs.randint(1000)}' for _ in range(10)]
    str_frame = pd.DataFrame({'a': col_a, 'b': col_b})
    str_frame['b'] = str_frame['b'].astype(md.ArrowStringDtype())
    str_md = md.DataFrame(str_frame, chunk_size=3)
    fetched = run(str_md.sort_values(by='b'))
    wanted = str_frame.sort_values(by='b')
    pd.testing.assert_frame_equal(fetched, wanted)

    # A Series with an unordered index must round-trip unchanged.
    local_series = pd.Series(
        np.random.rand(10), index=[11, 1, 2, 5, 7, 6, 8, 9, 10, 3])
    remote_series = md.Series(local_series, chunk_size=6)
    pd.testing.assert_series_equal(run(remote_series), local_series)

    # Reindex rows using a chunked tensor as the new index.
    data = pd.DataFrame(
        np.random.rand(10, 5), columns=['c1', 'c2', 'c3', 'c4', 'c5'])
    data_md = md.DataFrame(data, chunk_size=4)
    reindexed = data_md.reindex(index=mt.arange(10, 1, -1, chunk_size=3))
    fetched = run(reindexed)
    wanted = data.reindex(index=np.arange(10, 1, -1))
    pd.testing.assert_frame_equal(fetched, wanted)