def testPandasGroupbyFilter(self):
    """Filter a grouped aggregation built on a pandas-backed DataFrame.

    The filter is checked twice: once on an aggregate that has not been
    executed yet, and once on an aggregate that was executed beforehand
    (its ``_cache_data`` must then be set). A scalar sum is executed twice
    as well.
    """
    import pandas as pd

    rows = [
        [2001, 1],
        [2002, 2],
        [2003, 3],
    ]
    frame = DataFrame(pd.DataFrame(rows, columns=['id', 'fid']))
    expected = [
        [2003, 3],
    ]

    # Filter on top of an aggregate that has not been executed yet.
    grouped = frame.groupby('id').agg(frame.fid.sum())
    picked = grouped[grouped.id == 2003]
    self.assertEqual(picked.execute().values.values.tolist(), expected)

    # Execute the aggregate first, then filter on top of it.
    grouped = frame.groupby('id').agg(frame.fid.sum())
    grouped.execute()
    self.assertIsNotNone(grouped._cache_data)

    picked = grouped[grouped.id == 2003]
    # Executing the same expression twice must give the same result.
    for _ in range(2):
        self.assertEqual(picked.execute().values.values.tolist(), expected)

    total = frame.fid.sum()
    for _ in range(2):
        self.assertEqual(total.execute(), 6)
def testPandasGroupbyFilter(self):
    """Filter a grouped aggregation built on a pandas-backed DataFrame.

    The filter is checked twice: once on an aggregate that has not been
    executed yet, and once on an aggregate that was executed beforehand
    (``context.is_cached`` must then report it as cached). A scalar sum is
    executed twice as well.
    """
    import pandas as pd

    rows = [
        [2001, 1],
        [2002, 2],
        [2003, 3],
    ]
    frame = DataFrame(pd.DataFrame(rows, columns=['id', 'fid']))
    expected = [
        [2003, 3],
    ]

    # Filter on top of an aggregate that has not been executed yet.
    grouped = frame.groupby('id').agg(frame.fid.sum())
    picked = grouped[grouped.id == 2003]
    self.assertEqual(picked.execute().values.values.tolist(), expected)

    # Execute the aggregate first, then filter on top of it.
    grouped = frame.groupby('id').agg(frame.fid.sum())
    grouped.execute()
    self.assertTrue(context.is_cached(grouped))

    picked = grouped[grouped.id == 2003]
    # Executing the same expression twice must give the same result.
    for _ in range(2):
        self.assertEqual(picked.execute().values.values.tolist(), expected)

    total = frame.fid.sum()
    for _ in range(2):
        self.assertEqual(total.execute(), 6)
def test_df_store(self):
    """Persist a grouped, sorted count of one partition into a table.

    The source is the ``part1=1,part2=2`` partition of the two-partition
    ionosphere table; the result is written to
    ``IONOSPHERE_SORTED_TABLE_PART``.
    """
    self.delete_table(IONOSPHERE_SORTED_TABLE_PART)
    self.create_ionosphere_two_parts(IONOSPHERE_TABLE_TWO_PARTS)

    source_table = self.odps.get_table(IONOSPHERE_TABLE_TWO_PARTS)
    partition_df = DataFrame(source_table).filter_partition('part1=1,part2=2')

    # The target table is deleted again through the ODPS entry object
    # right before the result is built and persisted.
    self.odps.delete_table(IONOSPHERE_SORTED_TABLE_PART)

    row_count = partition_df.a01.count().rename('count')
    per_class = partition_df.groupby(partition_df['class']).agg(row_count)
    ordered = per_class.sort('class', ascending=False)
    ordered.persist(IONOSPHERE_SORTED_TABLE_PART)
def testRepeatSetItem(self):
    """Assign the same computed column twice and check the row count.

    The ``rank`` column is set two times in a row from the same
    grouped-and-sorted window expression; executing the frame afterwards
    must still yield 3 rows.
    """
    frame = DataFrame(self.table)

    # Two identical assignments: the second one overwrites a column that
    # already exists on the frame.
    for _ in range(2):
        window = frame.groupby('name').sort('id')
        frame['rank'] = window.id.rank()

    result = frame.execute()
    self.assertEqual(len(result), 3)
def test_df_store(self):
    """Persist a grouped, sorted count of one partition into a table.

    The source is the ``part1=1,part2=2`` partition of the two-partition
    ionosphere table; the result is written to
    ``IONOSPHERE_SORTED_TABLE_PART``.
    """
    self.delete_table(IONOSPHERE_SORTED_TABLE_PART)
    self.create_ionosphere_two_parts(IONOSPHERE_TABLE_TWO_PARTS)
    df = DataFrame(
        self.odps.get_table(IONOSPHERE_TABLE_TWO_PARTS)
    ).filter_partition('part1=1,part2=2')

    # ``async`` is a reserved keyword from Python 3.7 on, so writing it as
    # a literal keyword argument (``async=False``) is a SyntaxError there.
    # Dict unpacking passes exactly the same keyword without tripping the
    # parser.
    # NOTE(review): if ``drop_table`` has been renamed to take ``async_``,
    # switch this call to ``async_=False`` -- confirm against the helper.
    drop_table(self.odps, IONOSPHERE_SORTED_TABLE_PART, **{'async': False})

    sorted_df = df.groupby(df['class']).agg(
        df.a01.count().rename('count')
    ).sort('class', ascending=False)
    sorted_df.persist(IONOSPHERE_SORTED_TABLE_PART)
def chengbenjiaupdatedf(dfsall, cnxxc):
    """Add cost price, cost amount and gross profit columns to sales rows.

    Purchase records are read from the ``jinghuomingxi`` table and
    aggregated into per-product price records. Each price record is then
    applied, in order, to every sales row of the same product whose
    year-month is not earlier than the record's year-month, so a later
    record overwrites an earlier one.

    :param dfsall: sales detail records sorted by date; modified in place
        (presumably a pandas DataFrame -- it is used with ``.loc``,
        ``.shape`` and ``pd.merge``)
    :param cnxxc: database connection, used to query the purchase records
        from which the product price-change records are generated
    :return: the same ``dfsall`` object with the ``成本单价``, ``成本金额``
        and ``毛利`` columns added
    """
    # Read the purchase records (return records are excluded).
    purchases_raw = pd.read_sql_query(
        'select 产品名称, strftime(\'%Y%m\',日期) as 年月, 金额 as 进货金额, 数量 as 进货数量, '
        '单价 as 进货单价 from jinghuomingxi where 金额 >=0 order by 年月, 产品名称',
        cnxxc)
    purchases = DataFrame(purchases_raw)
    zero_amount_rows = purchases[purchases.进货金额 == 0]
    descdb(zero_amount_rows.to_pandas())

    # Summarise per product and month, derive the unit cost price, then
    # group again per product and price to get the price-adjustment records.
    monthly = purchases.groupby(['产品名称', '年月']).agg(
        进货金额=purchases.进货金额.sum(),
        进货数量=purchases.进货数量.sum())
    unit_price = (monthly.进货金额 / monthly.进货数量).round(2).rename('单价')
    monthly = monthly[monthly, unit_price]
    descdb(monthly.to_pandas())

    price_records = monthly.groupby(['产品名称', '单价']).agg(
        年月=monthly.年月.min(),
        进货金额=monthly.进货金额.sum()).sort(['产品名称', '年月'])
    descdb(price_records.to_pandas())

    product_count = price_records.groupby('产品名称').agg(
        price_records.单价.count()).to_pandas().shape[0]
    record_count = price_records.to_pandas().shape[0]
    log.info('共有%d条产品价格记录,共有%d条产品价格记录(含调价)'
             % (product_count, record_count))
    log.info('共有%d条销售明细记录' % dfsall.shape[0])

    # Temporary year-month key on the sales rows, removed before returning.
    dfsall['年月'] = dfsall['日期'].apply(
        lambda day: datetime.datetime.strftime(day, '%Y%m'))
    sales_totals = dfsall.groupby('商品全名', as_index=False)['金额'].sum()
    sales_totals.rename(columns={'商品全名': '产品名称', '金额': '销售金额'},
                        inplace=True)

    # Join the purchased-product list with the sold-product list to see
    # which entries are missing on either side.
    purchase_totals = price_records.groupby(['产品名称']).agg(
        进货金额=price_records.进货金额.sum()).to_pandas()
    combined = pd.merge(purchase_totals, sales_totals, how='outer')
    descdb(combined)
    unsold = combined[combined.销售金额.isnull().values]
    log.info('以下进货产品在本期无销售记录:%s' % list(unsold['产品名称']))

    dfsall['成本单价'] = 0
    price_table = price_records.to_pandas()
    for idx in price_table.index:
        record = price_table.loc[idx]
        applies = ((dfsall.商品全名 == record['产品名称'])
                   & (dfsall.年月 >= record['年月']))
        dfsall.loc[dfsall[applies].index, ['成本单价']] = record['单价']

    dfsall['成本金额'] = dfsall['成本单价'] * dfsall['数量']
    dfsall['毛利'] = dfsall['金额'] - dfsall['成本金额']
    descdb(dfsall)
    del dfsall['年月']  # drop the intermediate working column
    descdb(dfsall)
    return dfsall
def test_df_method(self):
    """Build a grouped, sorted count over the ionosphere table and fetch it.

    The result of ``to_pandas()`` is not inspected; the test only checks
    that the call completes.
    """
    self.create_ionosphere(IONOSPHERE_TABLE)

    source_table = self.odps.get_table(IONOSPHERE_TABLE)
    frame = DataFrame(source_table)

    row_count = frame.a01.count().rename('count')
    per_class = frame.groupby(frame['class']).agg(row_count)
    ordered = per_class.sort('class', ascending=False)
    ordered.to_pandas()
# -*- coding: utf-8 -*-
"""
Created on Sat Sep 2 20:07:27 2017

@author: shuai.qian
"""

import matplotlib.pyplot as plt
from odps.df import DataFrame
from odps import ODPS

# Credentials, project and endpoint are blank placeholders; fill them in
# before running.
o = ODPS('', project='', endpoint='')
t = DataFrame(o.get_table('tmp_ods_mc_testing_dlt'))

print("=================================> START <==================================")
#print(t.dtypes)
#print(t["class"].head(5))

# Per-class row count. The expression is built but its result is neither
# bound to a name nor used afterwards.
t.groupby('class').agg(count=t['class'].count())

# %matplotlib inline
t['class'].value_counts().plot(kind='bar', x='class', xlabel='cnt')

# ``range`` objects have no ``pop`` on Python 3, so materialise a list
# first; on Python 2 ``list(range(...))`` is the same list as before.
tmp = list(range(0, 10, 2))
tmp.pop(1)