def test_aggregate(self):
    """Grouped sum of ``x`` via MDataFrame must equal the pandas result.

    The pandas aggregate column is renamed from ``x`` to ``x_sum`` before
    the comparison, so the test passes only if the MDataFrame aggregate
    uses that column name.
    """
    grouped = MDataFrame(self.coll).groupby(['x'])
    actual = grouped.agg({'x': 'sum'})
    # pandas reference, relabelled to the name the comparison expects
    expected = self.df.groupby('x').agg({'x': 'sum'})
    expected = expected.rename(columns={'x': 'x_sum'})
    self.assertTrue(actual.equals(expected))
def test_mdataframe_xlarge(self):
    """Round-trip a 10,001-row, two-column frame through the dataset store.

    The frame is written as ``mydata-xlarge`` (replacing any existing
    data), read back through MDataFrame, and compared with the original
    on both column names and content.
    """
    n_rows = int(1e4 + 1)
    frame = pd.DataFrame({
        'a': list(range(n_rows)),
        'b': list(range(n_rows)),
    })
    datasets = self.om.datasets
    datasets.put(frame, 'mydata-xlarge', append=False)
    collection = datasets.collection('mydata-xlarge')
    fetched = MDataFrame(collection).value
    self.assertEqual(set(MDataFrame(collection).columns), set(frame.columns))
    self.assertTrue(fetched.equals(frame))
def test_verylarge_dataframe(self):
    """Round-trip a 10-million-row, three-column frame through the store.

    Runs only when the ``TEST_LARGE`` environment variable is set to a
    non-empty value; otherwise the test is reported as skipped.

    The frame is written as ``mydata-verylarge`` (replacing any existing
    data), read back through MDataFrame, and compared with the original
    on both column names and content.
    """
    if not os.environ.get('TEST_LARGE'):
        # Report a skip rather than returning silently, which would show
        # up as a pass even though nothing was checked.
        self.skipTest('set TEST_LARGE to run the very large dataframe test')
    values = list(range(0, int(10e6)))
    large = pd.DataFrame({'x': values, 'y': values, 'z': values})
    # Store and read back the large frame itself, so the assertions
    # exercise it and not the small shared fixture.
    store = self.om.datasets
    store.put(large, 'mydata-verylarge', append=False)
    coll = store.collection('mydata-verylarge')
    result = MDataFrame(coll).value
    self.assertEqual(set(MDataFrame(coll).columns), set(large.columns))
    self.assertTrue(result.equals(large))
def test_mdataframe_merge(self):
    """Left merge on ``x`` via MDataFrame must equal pandas' left merge.

    The right-hand side is a 20-row frame with columns x, y, z stored as
    the ``samplez`` dataset (replacing any existing data).
    """
    left_coll = self.coll
    left_df = self.df
    datasets = self.om.datasets
    right_df = pd.DataFrame(
        {name: list(range(20)) for name in ('x', 'y', 'z')}
    )
    datasets.put(right_df, 'samplez', append=False)
    right_coll = datasets.collection('samplez')
    merged = MDataFrame(left_coll).merge(right_coll, on='x', how='left')
    actual = merged.value
    expected = left_df.merge(right_df, on='x', how='left')
    self.assertTrue(actual.equals(expected))
def test_mdataframe_merge_right_cartesian(self):
    """Sorted left merge where the right side holds every row twice.

    A 5-row frame is written to ``samplez`` and then appended once more,
    so each ``x`` key occurs twice on the right-hand side. The stored
    dataset is read back and used as the pandas reference; its columns
    are reordered to match the MDataFrame result before comparing.
    """
    left_coll = self.coll
    left_df = self.df
    datasets = self.om.datasets
    seed = pd.DataFrame(
        {name: list(range(5)) for name in ('x', 'y', 'z')}
    )
    # first write replaces, second write appends the same rows again
    for append in (False, True):
        datasets.put(seed, 'samplez', append=append)
    right_df = datasets.get('samplez')
    right_coll = datasets.collection('samplez')
    merged = MDataFrame(left_coll).merge(
        right_coll, on='x', how='left', sort=True)
    actual = merged.value
    expected = left_df.merge(right_df, on='x', how='left', sort=True)
    expected = expected[actual.columns]
    self.assertTrue(actual.equals(expected))
def test_mdataframe_merge_filtered(self):
    """Sorted left merge with a filter restricting ``x`` to 1 and 2.

    The right side is a 5-row frame written to ``samplez`` and appended
    once more. The pandas reference applies the same restriction to the
    left frame with ``isin([1, 2])`` before merging, and its columns are
    reordered to match the MDataFrame result before comparing.
    """
    left_coll = self.coll
    left_df = self.df
    datasets = self.om.datasets
    seed = pd.DataFrame(
        {name: list(range(5)) for name in ('x', 'y', 'z')}
    )
    # first write replaces, second write appends the same rows again
    for append in (False, True):
        datasets.put(seed, 'samplez', append=append)
    right_df = datasets.get('samplez')
    right_coll = datasets.collection('samplez')
    merged = MDataFrame(left_coll).merge(
        right_coll, on='x', how='left', sort=True,
        filter=dict(x__in=[1, 2]))
    actual = merged.value
    subset = left_df[left_df['x'].isin([1, 2])]
    expected = subset.merge(right_df, on='x', how='left', sort=True)
    expected = expected[actual.columns]
    self.assertTrue(actual.equals(expected))
def test_count_column(self):
    """Per-group count of column ``x`` via MDataFrame must match pandas."""
    actual = MDataFrame(self.coll).groupby(['x']).x.count()
    expected = self.df.groupby('x').x.count()
    self.assertTrue(actual.equals(expected))