class Layout:
    '''Convert regional data between layouts (currently stack -> normal).

    Attributes:
        ad: an ``AdminData`` instance, used to look up region names by
            administrative code.
        _data: the input data in stacked (long) form.  The methods read the
            columns ``year``, ``acode``, ``variable`` and ``value``.
        year: sorted distinct years, as a list of str.
        acode: sorted distinct administrative codes.
        region: region names looked up for every entry of ``acode``.
        variable: sorted distinct variable names.
        tags: dict with the keys ``year``, ``region`` and ``variable`` holding
            the corresponding lists above.
        ndim: dict with the number of distinct years, variables and regions.

    Methods:
        __init__(self, data=None): store the data and derive the attributes.
        _type(self) -> dict: helper that derives the structure of the data.
        stackToNormal(self): convert the data from stack to normal layout.

    Demo (reproduced from the original documentation):
        Convert regional data from stack to normal layout::

            ad = AdminCode()
            rdata = RegionalData()    # build a RegionalData instance
            mdata = rdata.query(region=ad[u'浙江',u'f'],variable=[u'财政支出'],year=2012)    # query
            lout = Layout(mdata)      # convert the layout
            print(lout.stackToNormal())

        The result looks like::

                    region  财政支出
            330100  杭州市  78628
            330200  宁波市  82844
            330300  温州市  38779
            330400  嘉兴市  26070
    '''

    def __init__(self, data=None):
        '''Store ``data`` and derive ``tags`` / ``ndim`` from it.

        NOTE(review): ``data`` defaults to None, but ``_type`` calls
        ``groupby`` on it, so a real frame is required in practice.
        '''
        self.ad = AdminData()
        self._data = data
        self.tags = self._type()
        self.ndim = {
            'year': len(self.tags['year']),
            'variable': len(self.tags['variable']),
            'region': len(self.tags['region']),
        }

    def stackToNormal(self):
        '''Convert the stacked data to the layout matching its dimensions.

        Returns:
            * one variable and one region: a ``pd.Series`` keyed by year;
            * one year: a frame indexed by acode, one column per variable;
            * one variable: a frame indexed by acode, one column per year;
            * otherwise: a ``pd.Panel`` keyed by year (as str).

            Every frame carries a leading ``region`` column.
        '''
        # Time series: a single variable for a single region.
        if self.ndim['variable'] == 1 and self.ndim['region'] == 1:
            _, series_rows = next(iter(self._data.groupby(['acode', 'variable'], sort=True)))
            return pd.Series(dict(zip(self.year, series_rows['value'])))

        # Cross-section (region x variable): a single year.
        if self.ndim['year'] == 1:
            _, year_rows = next(iter(self._data.groupby('year', sort=True)))
            return self._with_region(self._spread(year_rows, 'variable'))

        # Cross-section (region x year): a single variable.
        if self.ndim['variable'] == 1:
            return self._with_region(self._spread(self._data, 'year'))

        # Panel data: one region x variable frame per year.
        frames = {}
        for year, year_rows in self._data.groupby('year', sort=True):
            frames[str(year)] = self._with_region(self._spread(year_rows, 'variable'))
        # NOTE(review): pd.Panel was removed in pandas 0.25; this branch only
        # works on older pandas.  Kept so the return type stays unchanged.
        return pd.Panel(frames)

    @staticmethod
    def _spread(frame, key, how='outer'):
        '''Turn every group of ``frame[key]`` into one column indexed by acode.'''
        wide = None
        # A scalar grouper keeps the group names scalar on every pandas
        # version (a length-1 list yields 1-tuples on pandas >= 2.0).
        for name, rows in frame.groupby(key, sort=True):
            column = pd.DataFrame({name: rows['value'].values}, index=rows['acode'].values)
            if wide is None:
                wide = column
            else:
                wide = pd.merge(wide, column, left_index=True, right_index=True, how=how)
        return pd.DataFrame() if wide is None else wide

    def _with_region(self, wide):
        '''Insert the region name of every acode in the index as first column.'''
        names = [self.ad.get_by_acode(code)[0]['region'] for code in wide.index]
        wide.insert(0, 'region', names)
        return wide

    def _type(self) -> dict:
        '''Derive year / acode / region / variable lists from the data.

        Sets ``self.year``, ``self.acode``, ``self.region`` and
        ``self.variable`` and returns the year, region and variable lists in
        a dict.
        '''
        self.year = [str(name) for name, _ in self._data.groupby('year', sort=True)]
        self.acode = [name for name, _ in self._data.groupby('acode', sort=True)]
        self.region = [self.ad.get_by_acode(code)[0]['region'] for code in self.acode]
        self.variable = [name for name, _ in self._data.groupby('variable', sort=True)]
        return {'year': self.year, 'region': self.region, 'variable': self.variable}
class RegionFormat(Format):
    '''Convert regional data between layouts.

    :param data: regional data in stacked (long) form.  The methods read the
        columns ``year``, ``acode``, ``region``, ``variable`` and ``value``,
        plus an optional ``scale`` column.  Presumably stored as
        ``self._data`` by ``Format.__init__`` -- confirm against ``Format``.
    '''

    def __init__(self, data):
        Format.__init__(self, data)
        self.ad = AdminData()
        self.to_set_type()

    def transform(self, sourcetype='stack', targettype='normal', connect='outer'):
        '''Dispatch a layout conversion.

        :param str sourcetype: source layout; only names starting with
            ``'stack'`` are handled
        :param str targettype: target layout; only names starting with
            ``'normal'`` are handled
        :param str connect: join mode forwarded to :meth:`stack_to_normal`
        :return: the result of :meth:`stack_to_normal`, or None when the
            requested conversion is not supported
        '''
        if sourcetype.startswith('stack') and targettype.startswith('normal'):
            return self.stack_to_normal(connect=connect)
        return None

    def stack_to_normal(self, connect):
        '''Convert regional data from stack layout to normal layout.

        :param str connect: how the per-variable columns of cross-section
            data are joined, ``'inner'`` or ``'outer'``.  Panel data is
            always joined with ``'outer'``.
        :return: dict.  Time-series and cross-section results hold ``tags``
            and ``data``; panel results hold ``data`` (long frame),
            ``pdata`` (the panel) and ``balanceddata`` (long frame after
            ``dropna(axis=1)`` on the panel), plus ``tags`` when a single
            scale is present.
        '''
        scale = self.ndim.get('scale')
        # More than one scale: the data mixes whole-city and urban-district
        # figures, so columns are keyed by (variable, scale).
        multi_scale = scale is not None and scale > 1
        single_scale = scale is not None and not multi_scale
        keys = ['variable', 'scale'] if multi_scale else 'variable'

        # Time series: a single variable for a single region (only
        # considered when there is at most one scale).
        if not multi_scale and self.ndim['variable'] == 1 and self.ndim['region'] == 1:
            tags = {
                'variable': self.variable[0],
                'region': self.ad.get_by_acode(self.acode[0])[0]['region'],
            }
            _, series_rows = next(iter(self._data.groupby(['acode', 'variable'], sort=True)))
            series = pd.Series(dict(zip(self.year, series_rows['value'])))
            if single_scale:
                tags['scale'] = self.scale[0]
            return {'tags': tags, 'data': series}

        # Cross-section (region x variable): a single year.
        if self.ndim['year'] == 1:
            _, year_rows = next(iter(self._data.groupby('year', sort=True)))
            wide = self._spread(year_rows, keys, how=connect)
            tags = {'year': self.year[0]}
            if single_scale:
                tags['scale'] = self.scale[0]
            return {'tags': tags, 'data': wide}

        # Panel data: one region x variable frame per year.
        frames = {}
        for year, year_rows in self._data.groupby('year', sort=True):
            frames[str(year)] = self._spread(year_rows, keys, how='outer')
        # NOTE(review): pd.Panel was removed in pandas 0.25; this branch only
        # works on older pandas.  Kept so the returned types stay unchanged.
        panel = pd.Panel(frames)
        result = {}
        if single_scale:
            # NOTE(review): the whole list is stored here, while the other
            # branches store self.scale[0]; kept as-is for existing callers.
            result['tags'] = {'scale': self.scale}
        result['data'] = self._panel_to_frame(panel)
        result['pdata'] = panel
        result['balanceddata'] = self._panel_to_frame(panel.dropna(axis=1))
        return result

    @staticmethod
    def _spread(frame, keys, how):
        '''Pivot ``frame`` into one column per group of ``keys``, indexed by
        acode, with the region name inserted as first column.'''
        region_of = dict(zip(frame['acode'], frame['region']))
        wide = None
        for name, rows in frame.groupby(keys, sort=True):
            # Multi-key groups are named by a tuple: flatten it to one label.
            label = '_'.join(map(str, name)) if isinstance(name, tuple) else name
            column = pd.DataFrame({label: rows['value'].values}, index=rows['acode'].values)
            if wide is None:
                wide = column
            else:
                wide = pd.merge(wide, column, left_index=True, right_index=True, how=how)
        if wide is None:
            wide = pd.DataFrame()
        wide.insert(0, 'region', [region_of[code] for code in wide.index])
        return wide

    @staticmethod
    def _panel_to_frame(panel):
        '''Flatten a panel into a long frame without filtering observations.'''
        flipped = panel.swapaxes('items', 'minor').swapaxes('major', 'minor')
        return flipped.to_frame(False)

    def to_set_type(self):
        '''Derive the structural attributes of the regional data.

        Sets ``year``, ``acode``, ``number_of_region``, ``variable`` and
        ``ndim``; also ``scale`` and ``ndim['scale']`` when the data has a
        ``scale`` column.

        :return: None
        '''
        # Years, as strings.  Scalar groupers keep the group names scalar on
        # every pandas version (a length-1 list yields 1-tuples on >= 2.0).
        self.year = [str(name) for name, _ in self._data.groupby('year', sort=True)]
        # Administrative codes.
        self.acode = [name for name, _ in self._data.groupby('acode', sort=True)]
        self.number_of_region = len(self.acode)
        # Variables.
        self.variable = [name for name, _ in self._data.groupby('variable', sort=True)]
        # Size of every dimension.
        self.ndim = {
            'year': len(self.year),
            'variable': len(self.variable),
            'region': self.number_of_region,
        }
        # Regional scale: whole city or urban districts.
        if 'scale' in self._data.columns:
            self.scale = [name for name, _ in self._data.groupby('scale', sort=True)]
            self.ndim['scale'] = len(self.scale)
from werkzeug.datastructures import ImmutableMultiDict
import json

year = list(range(1990, 2015))
print(year)

# Demo built on the CEIC data.
# 1. Load the CEIC data and collect its distinct administrative codes.
db = Database()
con = db.connect("regionDB", "CEIC")
ceic_region_code = sorted(con.find().distinct("acode"))
print(len(ceic_region_code))

# 2. Look every code up in the administrative-division database,
#    keeping the first match for each.
admin_data = AdminData()
regions = [admin_data.get_by_acode(acode=acode)[0] for acode in ceic_region_code]

# 3. Build the regional administrative rows: [acode, parent label, region].
region_list = []
for region in regions:
    if region["adminlevel"] < 3:
        # Levels below 3 are listed directly under the country.
        parent = u"中国"
    else:
        # Deeper levels are labelled with the region of their parent record.
        parent = admin_data.database.collection.find_one({"_id": region["parent"]})["region"]
        parent = "/".join(("中国", parent + "属下"))
    region_list.append([region["acode"], parent, region["region"]])
print(region_list)
# Disabled: the list used to be dumped as JSON to
# e:/gitwork/application/testweb/region_ceic.txt

# 4. Query variables