def iter_generic_obs(self, sdmxobj, with_value, with_attributes):
    """Yield one observation tuple per Obs element of a dataset.

    Handles both structure-specific datasets (``self.dsd`` set; keys and
    values are XML attributes of each Obs element) and generic datasets
    (keys/values extracted via precompiled paths in ``self._paths``).

    Args:
        sdmxobj: wrapper whose ``_elem`` is the dataset's XML element.
        with_value: if false, the yielded value slot is None.
        with_attributes: if false, the yielded attrib slot is None.

    Yields:
        ``self._ObsTuple(key, value, attrib)`` per observation.
    """
    # Namedtuple classes for keys/attributes are created lazily on the
    # first observation and reused for the rest of the dataset.
    ObsKeyTuple = ObsAttrTuple = None
    if self.dsd:
        # this is a structure-specific dataset
        for obs in sdmxobj._elem.iterchildren('Obs'):
            # dimensions:
            obs_attrib = obs.attrib  # XML attrib
            if not ObsKeyTuple:
                # Only dimensions actually present on this Obs make up
                # the key type; order follows self.dim_ids.
                obs_key_id = [k for k in self.dim_ids if k in obs_attrib]
                ObsKeyTuple = namedtuple_factory('ObsKey', obs_key_id)
            obs_key_values = [obs_attrib[k]
                              for k in self.dim_ids if k in obs_attrib]
            obs_key = ObsKeyTuple._make(obs_key_values)
            obs_value = obs_attrib['OBS_VALUE'] if with_value else None
            if with_attributes:
                if not ObsAttrTuple:
                    obs_attr_id = [
                        k for k in self.attrib_ids if k in obs_attrib]
                    ObsAttrTuple = namedtuple_factory(
                        'ObsAttributes', obs_attr_id)
                # NOTE: obs_attr_id is the list captured on the first
                # Obs; assumes all Obs carry the same attribute set.
                obs_attr_values = [obs_attrib[k] for k in obs_attr_id]
                obs_attr = ObsAttrTuple(*obs_attr_values)
            else:
                obs_attr = None
            yield self._ObsTuple(obs_key, obs_value, obs_attr)
    else:
        # we have a generic dataset
        for obs in self._paths['generic_obs_path'](sdmxobj._elem):
            # Construct the namedtuple for the ObsKey.
            # The namedtuple class is created on first iteration.
            obs_key_values = self._paths['obs_key_values_path'](obs)
            if not ObsKeyTuple:
                obs_key_id = self._paths['obs_key_id_path'](obs)
                ObsKeyTuple = namedtuple_factory('ObsKey', obs_key_id)
            obs_key = ObsKeyTuple._make(obs_key_values)
            if with_value:
                obs_value = self._paths['obs_value_path'](obs)[0]
            else:
                obs_value = None
            if with_attributes:
                # Attribute IDs may differ per Obs in generic data, so
                # the attribute namedtuple type is rebuilt each time.
                obs_attr_values = self._paths['attr_values_path'](obs)
                obs_attr_id = self._paths['attr_id_path'](obs)
                obs_attr_type = namedtuple_factory(
                    'ObsAttributes', obs_attr_id)
                obs_attr = obs_attr_type(*obs_attr_values)
            else:
                obs_attr = None
            yield self._ObsTuple(obs_key, obs_value, obs_attr)
def iter_generic_series_obs(self, sdmxobj, with_value, with_attributes,
                            reverse_obs=False):
    """Yield SeriesObservation tuples for one series of a JSON dataset.

    Observations are dict entries keyed by the (string) index into the
    single observation-level dimension's value list; they are emitted in
    key order (reversed when ``reverse_obs`` is true).
    """
    obs_l = sorted(sdmxobj._elem['observations'].items(),
                   key=self.getitem0, reverse=reverse_obs)
    for obs in obs_l:
        # Value for the dim at obs, e.g. '2014' for time series.
        # This method is only called when each obs has exactly one
        # dimension, so that dimension is at index 0.
        obs_dim_value = self._obs_dim[0]['values'][int(obs[0])]['id']
        # obs[1] is [value, attr_idx0, attr_idx1, ...]
        obs_value = obs[1][0] if with_value else None
        if with_attributes and len(obs[1]) > 1:
            obs_attr_idx = obs[1][1:]
            # Skip attributes whose index is null; fall back to the
            # human-readable 'name' when a value carries no 'id'.
            obs_attr_raw = [
                (d['id'], d['values'][i].get('id', d['values'][i]['name']))
                for i, d in zip(obs_attr_idx, self._obs_attrib)
                if i is not None
            ]
            if obs_attr_raw:
                obs_attr_id, obs_attr_values = zip(*obs_attr_raw)
                obs_attr_type = namedtuple_factory('ObsAttributes',
                                                   obs_attr_id)
                obs_attr = obs_attr_type(*obs_attr_values)
            else:
                obs_attr = None
        else:
            obs_attr = None
        yield self._SeriesObsTuple(obs_dim_value, obs_value, obs_attr)
def initialize(self, source): tree = json.load(source) # pre-fetch some structures for efficient use in series and obs a = tree['structure'].get('attributes', {}) self._dataset_attrib = a.get('dataSet', []) self._series_attrib = a.get('series', []) self._obs_attrib = a.get('observation', []) d = tree['structure'].get('dimensions', {}) self._dataset_dim = d.get('dataSet', []) self._series_dim = d.get('series', []) self._obs_dim = d.get('observation', []) self._dataset_dim_key = {dim['keyPosition']: dim['id'] for dim in self._dataset_dim} self._dataset_dim_values = {dim['keyPosition']: dim['values'][0]['id'] for dim in self._dataset_dim} if self._series_dim: self._key_len = len(self._dataset_dim) + len(self._series_dim) # Map keyPositions of dimensions at series level to dimension IDs, like with dataset-level dims above. # In case of cross-sectional dataset, the only dimension at series level has no # keyPosition, eg. TIME_PERIOD. Instead, the keyPosition of the dim at observation # is used to fill the gap. self._series_dim_key = {dim.get('keyPosition', self._obs_dim[0].get('keyPosition')): dim['id'] for dim in self._series_dim} self.SeriesKeyTuple = namedtuple_factory('SeriesKeyTuple', (self._dataset_dim_key.get(i) or self._series_dim_key.get(i) for i in range(self._key_len))) else: # Dataset must be flat self._key_len = len(self._dataset_dim) + len(self._obs_dim) self.obs_attr_id = [d['id'] for d in self._obs_attrib] # init message instance cls = model.DataMessage self.message = cls(self, tree) return self.message
def iter_generic_series_obs(self, sdmxobj, with_value, with_attributes,
                            reverse_obs=False):
    """Yield SeriesObservation tuples for one series of a JSON dataset.

    Observations are dict entries keyed by the (string) index into the
    single observation-level dimension's value list; they are emitted in
    key order (reversed when ``reverse_obs`` is true).
    """
    obs_l = sorted(sdmxobj._elem['observations'].items(),
                   key=self.getitem0, reverse=reverse_obs)
    for obs in obs_l:
        # value for dim at obs, e.g. '2014' for time series.
        # As this method is called only when each obs has but one
        # dimension, it is at index 0.
        obs_dim_value = self._obs_dim[0]['values'][int(obs[0])]['id']
        # obs[1] is [value, attr_idx0, attr_idx1, ...]
        obs_value = obs[1][0] if with_value else None
        if with_attributes and len(obs[1]) > 1:
            obs_attr_idx = obs[1][1:]
            # Fall back to the human-readable 'name' when an attribute
            # value has no 'id' (previously this produced None values);
            # matches the behavior of series_attrib and the sibling
            # implementation of this method.
            obs_attr_raw = [(d['id'],
                             d['values'][i].get('id', d['values'][i]['name']))
                            for i, d in zip(obs_attr_idx, self._obs_attrib)
                            if i is not None]
            if obs_attr_raw:
                obs_attr_id, obs_attr_values = zip(*obs_attr_raw)
                obs_attr_type = namedtuple_factory(
                    'ObsAttributes', obs_attr_id)
                obs_attr = obs_attr_type(*obs_attr_values)
            else:
                obs_attr = None
        else:
            obs_attr = None
        yield self._SeriesObsTuple(obs_dim_value, obs_value, obs_attr)
def iter_generic_series_obs(self, sdmxobj, with_value, with_attributes,
                            reverse_obs=False):
    """Yield SeriesObservation tuples for one series (jsonpath variant).

    Observations are dict entries keyed by the index into the single
    observation-level dimension's value list.
    """
    obs_l = sorted(sdmxobj._elem.value['observations'].items(),
                   key=self.getitem0, reverse=reverse_obs)
    # Resolve the observation-level dimension metadata via jsonpath.
    obs_dim_l = parse('$.structure.dimensions.observation[*]').find(
        sdmxobj._elem)
    for obs in obs_l:
        # Translate the numeric key into the dimension value ID.
        obs_dim = obs_dim_l[0].value['values'][int(obs[0])]['id']
        if with_value:
            # obs[1][0] is the observation value.
            obs_value = obs[1][0]
        else:
            obs_value = None
        if with_attributes:
            # NOTE(review): 'attr_values_path'/'attr_id_path' are
            # path expressions for element trees, but ``obs`` here is a
            # (key, list) tuple from a JSON dict — this branch looks
            # broken/untested for JSON data; verify against callers.
            obs_attr_values = self._paths['attr_values_path'](obs)
            obs_attr_id = self._paths['attr_id_path'](obs)
            obs_attr_type = namedtuple_factory('ObsAttributes',
                                               obs_attr_id)
            obs_attr = obs_attr_type(*obs_attr_values)
        else:
            obs_attr = None
        yield self._SeriesObsTuple(obs_dim, obs_value, obs_attr)
def series_attrib(self, sdmxobj):
    """Return series-level attributes as an 'Attrib' namedtuple.

    Returns None when the series carries no attributes, or when every
    attribute index is null.
    """
    value_idx = sdmxobj._elem.get('attributes')
    if value_idx:
        attrib_list = [(a['id'],
                        a['values'][i].get('id', a['values'][i]['name']))
                       for i, a in zip(value_idx, self._series_attrib)
                       if i is not None]
        # Guard against an empty list: unpacking zip(*[]) raises
        # ValueError when every attribute index is None.
        if attrib_list:
            attrib_ids, attrib_values = zip(*attrib_list)
            return namedtuple_factory('Attrib', attrib_ids)(*attrib_values)
def iter_generic_obs(self, sdmxobj, with_value, with_attributes):
    """Yield observation tuples for a flat (series-less) JSON dataset.

    The full observation key merges dataset-level dimension values with
    per-observation dimension values, preserving key-position order.
    """
    # Make type namedtuple for obs_key. It must be
    # merged with any dimension values at dataset level maintaining the
    # key position order.
    # Note that the measure dimension (such as TIME_PERIOD) has no key position.
    # We fill this gap by injecting the highest key position.
    _obs_dim_key = {
        dim.get('keyPosition', self._key_len - 1): dim['id']
        for dim in self._obs_dim
    }
    _GenericObsKey = namedtuple_factory(
        'GenericObservationKey',
        (self._dataset_dim_key.get(d) or _obs_dim_key.get(d)
         for d in range(self._key_len)))
    obs_l = sorted(sdmxobj._elem.value['observations'].items(),
                   key=self.getitem0)
    for dim, value in obs_l:
        # Construct the key for this observation: colon-separated
        # indices into each observation-level dimension's value list.
        key_idx = [int(i) for i in dim.split(':')]
        obs_key_values = [
            d['values'][i]['id'] for i, d in zip(key_idx, self._obs_dim)
        ]
        obs_key = _GenericObsKey._make(
            self._dataset_dim_values.get(d) or obs_key_values.pop(0)
            for d in range(self._key_len))
        # Read the value
        obs_value = value[0] if with_value else None
        # Read any attributes
        if with_attributes and len(value) > 1:
            obs_attr_idx = value[1:]
            # Skip attributes whose index is null: indexing
            # d['values'][None] raised TypeError before; the sibling
            # series-level iterators already apply this guard.
            obs_attr_raw = [(d['id'], d['values'][i].get('id'))
                            for i, d in zip(obs_attr_idx, self._obs_attrib)
                            if i is not None]
            if obs_attr_raw:
                obs_attr_id, obs_attr_values = zip(*obs_attr_raw)
                obs_attr_type = namedtuple_factory('ObsAttributes',
                                                   obs_attr_id)
                obs_attr = obs_attr_type(*obs_attr_values)
            else:
                obs_attr = None
        else:
            obs_attr = None
        yield self._SeriesObsTuple(obs_key, obs_value, obs_attr)
def iter_generic_obs(self, sdmxobj, with_value, with_attributes):
    """Yield observation tuples for a flat (series-less) JSON dataset.

    The full observation key merges dataset-level dimension values with
    per-observation dimension values, preserving key-position order.
    """
    # Make type namedtuple for obs_key. It must be
    # merged with any dimension values at dataset level maintaining the
    # key position order.
    # Note that the measure dimension (such as TIME_PERIOD) has no key position.
    # We fill this gap by injecting the highest key position.
    _obs_dim_key = {dim.get('keyPosition', self._key_len - 1): dim['id']
                    for dim in self._obs_dim}
    _GenericObsKey = namedtuple_factory(
        'GenericObservationKey',
        (self._dataset_dim_key.get(d, _obs_dim_key.get(d))
         for d in range(self._key_len)))
    obs_l = sorted(sdmxobj._elem.value['observations'].items(),
                   key=self.getitem0)
    for dim, value in obs_l:
        # Construct the key for this observation: colon-separated
        # indices into each observation-level dimension's value list.
        key_idx = [int(i) for i in dim.split(':')]
        obs_key_values = (d['values'][i]['id']
                          for i, d in zip(key_idx, self._obs_dim))
        # Interleave fixed dataset-level values with per-obs values.
        # NOTE: relies on dimension value IDs being truthy (non-empty
        # strings) — a falsy dataset-level value would be skipped.
        obs_key = _GenericObsKey._make(
            self._dataset_dim_values.get(d) or next(obs_key_values)
            for d in range(self._key_len))
        # Read the value
        obs_value = value[0] if with_value else None
        # Read any attributes; a null index yields a None value but
        # keeps the attribute slot (unlike the series-level iterator,
        # which drops null-indexed attributes entirely).
        if with_attributes and len(value) > 1:
            obs_attr_idx = value[1:]
            obs_attr_raw = [(d['id'],
                             d['values'][i].get('id') if i is not None else None)
                            for i, d in zip(obs_attr_idx, self._obs_attrib)]
            if obs_attr_raw:
                obs_attr_id, obs_attr_values = zip(*obs_attr_raw)
                obs_attr_type = namedtuple_factory(
                    'ObsAttributes', obs_attr_id)
                obs_attr = obs_attr_type(*obs_attr_values)
            else:
                obs_attr = None
        else:
            obs_attr = None
        yield self._SeriesObsTuple(obs_key, obs_value, obs_attr)
def series_key(self, sdmxobj):
    """Return the series key as a namedtuple, with IDs harmonised to SDMX 2.1."""
    raw_ids = self._paths['series_key_id_path'](sdmxobj._elem)
    # Translate provider-specific dimension IDs to SDMX 2.1 naming.
    renames = {'FREQUENCY': 'FREQ'}
    key_ids = [renames.get(dim_id, dim_id) for dim_id in raw_ids]
    key_values = self._paths['series_key_values_path'](sdmxobj._elem)
    return namedtuple_factory('SeriesKey', key_ids)._make(key_values)
def series_key(self, sdmxobj):
    """Build the SeriesKey namedtuple for this series element."""
    elem = sdmxobj._elem
    ids = []
    for dim_id in self._paths['series_key_id_path'](elem):
        # Translate IDs to match SDMX 2.1 conventions.
        ids.append('FREQ' if dim_id == 'FREQUENCY' else dim_id)
    values = self._paths['series_key_values_path'](elem)
    key_type = namedtuple_factory('SeriesKey', ids)
    return key_type._make(values)
def test_concat_namedtuples():
    """concat_namedtuples chains several namedtuples into one tuple."""
    values = list(range(26))
    labels = [chr(ord('A') + v) for v in values]
    limits = [0, 4, 5, 8, 14, 22, 25]
    pieces = []
    # Slice the alphabet into consecutive chunks and build one
    # namedtuple per chunk.
    for lo, hi in zip(limits, limits[1:]):
        piece_type = namedtuple_factory('Test', labels[lo:hi])
        pieces.append(piece_type(*values[lo:hi]))
    combined = concat_namedtuples(*pieces)
    assert isinstance(combined, tuple)
    assert combined.A == 0
def iter_generic_series_obs(self, sdmxobj, with_value, with_attributes,
                            reverse_obs=False):
    """Yield SeriesObservation tuples for one series of an XML dataset.

    Handles both structure-specific datasets (observation data in XML
    attributes) and generic datasets (data extracted via precompiled
    XPath expressions). Iteration order is reversed when ``reverse_obs``
    is true.
    """
    # The attribute namedtuple type is built lazily on the first Obs and
    # reused (structure-specific branch only).
    ObsAttrTuple = None
    if self.dsd:
        # this is a structure-specific dataset
        for obs in sdmxobj._elem.iterchildren(reversed=reverse_obs):
            obs_attrib = obs.attrib  # XML attributes
            # dim at obs
            obs_dim = obs_attrib[self.message.data.dim_at_obs]
            obs_value = obs_attrib['OBS_VALUE'] if with_value else None
            if with_attributes:
                if not ObsAttrTuple:
                    obs_attr_id = [
                        k for k in self.attrib_ids if k in obs_attrib]
                    ObsAttrTuple = namedtuple_factory(
                        'ObsAttributes', obs_attr_id)
                # NOTE: assumes every Obs carries the same attribute set
                # as the first one.
                obs_attr_values = [obs_attrib[k] for k in obs_attr_id]
                obs_attr = ObsAttrTuple(*obs_attr_values)
            else:
                obs_attr = None
            yield self._SeriesObsTuple(obs_dim, obs_value, obs_attr)
    else:
        # we have a generic dataset
        for obs in sdmxobj._elem.iterchildren(
                '{http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic}Obs',
                reversed=reverse_obs):
            obs_dim = self._paths['generic_series_dim_path'](obs)[0]
            if with_value:
                obs_value = self._paths['obs_value_path'](obs)[0]
            else:
                obs_value = None
            if with_attributes:
                # Attribute IDs may differ per Obs, so the namedtuple
                # type is rebuilt for each observation.
                obs_attr_values = self._paths['attr_values_path'](obs)
                obs_attr_id = self._paths['attr_id_path'](obs)
                obs_attr_type = namedtuple_factory(
                    'ObsAttributes', obs_attr_id)
                obs_attr = obs_attr_type(*obs_attr_values)
            else:
                obs_attr = None
            yield self._SeriesObsTuple(obs_dim, obs_value, obs_attr)
def series_key(self, sdmxobj):
    """Combine dataset-level and series-level dimensions into the full key."""
    # Dimensions fixed at dataset level contribute their single value each.
    key_ids = [dim['id'] for dim in self._dataset_dim]
    key_values = [dim['values'][0]['id'] for dim in self._dataset_dim]
    # '_key' holds colon-separated indices into each series dimension's
    # value list.
    indices = [int(tok) for tok in sdmxobj._elem['_key'].split(':')]
    key_ids += [dim['id'] for dim in self._series_dim]
    key_values += [dim['values'][idx]['id']
                   for idx, dim in zip(indices, self._series_dim)]
    return namedtuple_factory('SeriesKey', key_ids)._make(key_values)
def group_key(self, sdmxobj):
    """Return the group key as a 'GroupKey' namedtuple."""
    if self.dsd:
        # Structure-specific dataset: key values live in the element's
        # XML attributes, ordered by self.dim_ids.
        xml_attrs = sdmxobj._elem.attrib
        pairs = ((dim, xml_attrs[dim])
                 for dim in self.dim_ids if dim in xml_attrs)
        group_key_id, group_key_values = zip(*pairs)
    else:
        # Generic dataset: extract IDs and values via precompiled paths.
        group_key_id = self._paths['group_key_id_path'](sdmxobj._elem)
        group_key_values = self._paths['group_key_values_path'](
            sdmxobj._elem)
    key_type = namedtuple_factory('GroupKey', group_key_id)
    return key_type._make(group_key_values)
def series_key(self, sdmxobj):
    """Return the series key as a 'SeriesKey' namedtuple."""
    if self.dsd:
        # Structure-specific dataset: key values live in the element's
        # XML attributes, ordered by self.dim_ids.
        xml_attrs = sdmxobj._elem.attrib
        pairs = ((dim, xml_attrs[dim])
                 for dim in self.dim_ids if dim in xml_attrs)
        series_key_id, series_key_values = zip(*pairs)
    else:
        # Generic dataset: extract IDs and values via precompiled paths.
        series_key_id = self._paths['series_key_id_path'](sdmxobj._elem)
        series_key_values = self._paths['series_key_values_path'](
            sdmxobj._elem)
    key_type = namedtuple_factory('SeriesKey', series_key_id)
    return key_type._make(series_key_values)
def iter_generic_obs(self, sdmxobj, with_value, with_attributes):
    """Yield GenericObservation tuples for a generic XML dataset.

    The ObsKey namedtuple type is created once, on the first
    observation, and reused for the rest of the dataset.
    """
    # Use an explicit None sentinel for the lazily-created key type
    # instead of catching NameError from an unbound local: that idiom
    # relied on UnboundLocalError, masked genuine NameErrors (e.g. a
    # typo in namedtuple_factory), and paid exception overhead.
    ObsKeyTuple = None
    for obs in self._paths['generic_obs_path'](sdmxobj._elem):
        obs_key_values = self._paths['obs_key_values_path'](obs)
        if ObsKeyTuple is None:
            obs_key_id = self._paths['obs_key_id_path'](obs)
            ObsKeyTuple = namedtuple_factory('ObsKey', obs_key_id)
        obs_key = ObsKeyTuple._make(obs_key_values)
        if with_value:
            obs_value = self._paths['obs_value_path'](obs)[0]
        else:
            obs_value = None
        if with_attributes:
            # Attribute IDs may differ per Obs, so this namedtuple type
            # is rebuilt for each observation.
            obs_attr_values = self._paths['attr_values_path'](obs)
            obs_attr_id = self._paths['attr_id_path'](obs)
            obs_attr_type = namedtuple_factory(
                'ObsAttributes', obs_attr_id)
            obs_attr = obs_attr_type(*obs_attr_values)
        else:
            obs_attr = None
        yield self._ObsTuple(obs_key, obs_value, obs_attr)
def iter_generic_obs(self, sdmxobj, with_value, with_attributes):
    """Yield GenericObservation tuples for a generic XML dataset.

    The ObsKey namedtuple type is created once, on the first
    observation, and reused for the rest of the dataset.
    """
    # Explicit None sentinel replaces the previous try/except NameError
    # around an unbound local, which was fragile (actually relied on
    # UnboundLocalError) and could swallow unrelated NameErrors.
    ObsKeyTuple = None
    for obs in self._paths['generic_obs_path'](sdmxobj._elem):
        obs_key_values = self._paths['obs_key_values_path'](obs)
        if ObsKeyTuple is None:
            obs_key_id = self._paths['obs_key_id_path'](obs)
            ObsKeyTuple = namedtuple_factory('ObsKey', obs_key_id)
        obs_key = ObsKeyTuple._make(obs_key_values)
        if with_value:
            obs_value = self._paths['obs_value_path'](obs)[0]
        else:
            obs_value = None
        if with_attributes:
            # Attribute IDs may differ per Obs, so this namedtuple type
            # is rebuilt for each observation.
            obs_attr_values = self._paths['attr_values_path'](obs)
            obs_attr_id = self._paths['attr_id_path'](obs)
            obs_attr_type = namedtuple_factory('ObsAttributes',
                                               obs_attr_id)
            obs_attr = obs_attr_type(*obs_attr_values)
        else:
            obs_attr = None
        yield self._ObsTuple(obs_key, obs_value, obs_attr)
def series_attrib(self, sdmxobj):
    """Return series-level attributes as an 'Attrib' namedtuple."""
    if self.dsd:
        # Structure-specific dataset: attributes are XML attributes of
        # the series element; keep only those declared in attrib_ids.
        xml_attrs = sdmxobj._elem.attrib
        present = [(a, xml_attrs[a])
                   for a in self.attrib_ids if a in xml_attrs]
        if present:
            attr_id, attr_values = zip(*present)
        else:
            # No declared attribute present: yield an empty namedtuple.
            attr_id = attr_values = []
    else:
        # Generic dataset: extract via precompiled paths.
        attr_id = self._paths['attr_id_path'](sdmxobj._elem)
        attr_values = self._paths['attr_values_path'](sdmxobj._elem)
    attrib_type = namedtuple_factory('Attrib', attr_id)
    return attrib_type(*attr_values)
def series_key(self, sdmxobj):
    """Assemble the full series key from dataset- and series-level dims."""
    elem = sdmxobj._elem
    # Dataset-level dimensions contribute one fixed value each.
    ds_dims = parse('$.structure.dimensions.dataSet[*]').find(elem)
    ids = [m.value['id'] for m in ds_dims]
    values = [m.value['values'][0]['id'] for m in ds_dims]
    # '_key' carries colon-separated indices into the series dimensions.
    indices = [int(tok) for tok in elem.value['_key'].split(':')]
    series_dims = parse('$.structure.dimensions.series').find(elem)[0].value
    ids.extend(dim['id'] for dim in series_dims)
    values.extend(dim['values'][idx]['id']
                  for idx, dim in zip(indices, series_dims))
    return namedtuple_factory('SeriesKey', ids)._make(values)
def series_key(self, sdmxobj):
    """Build the complete SeriesKey (dataset-level + series-level dims)."""
    elem = sdmxobj._elem
    # Fixed dimensions at dataset level: each has exactly one value.
    dataset_matches = parse('$.structure.dimensions.dataSet[*]').find(elem)
    dataset_ids = [m.value['id'] for m in dataset_matches]
    dataset_values = [m.value['values'][0]['id'] for m in dataset_matches]
    # Series-level dimensions are addressed by the colon-separated
    # indices stored under '_key'.
    idx_list = [int(part) for part in elem.value['_key'].split(':')]
    series_struct = parse('$.structure.dimensions.series').find(elem)[0].value
    series_ids = [dim['id'] for dim in series_struct]
    series_values = [dim['values'][i]['id']
                     for i, dim in zip(idx_list, series_struct)]
    key_type = namedtuple_factory('SeriesKey', dataset_ids + series_ids)
    return key_type._make(dataset_values + series_values)
def iter_generic_series_obs(self, sdmxobj, with_value, with_attributes,
                            reverse_obs=False):
    """Yield SeriesObservation tuples for a generic XML series element."""
    obs_tag = ('{http://www.sdmx.org/resources/sdmxml/schemas/v2_1/'
               'data/generic}Obs')
    for obs in sdmxobj._elem.iterchildren(obs_tag, reversed=reverse_obs):
        dim = self._paths['generic_series_dim_path'](obs)[0]
        value = self._paths['obs_value_path'](obs)[0] if with_value else None
        if with_attributes:
            # Attribute IDs may differ per Obs, so the namedtuple type
            # is rebuilt for each observation.
            attr_ids = self._paths['attr_id_path'](obs)
            attr_values = self._paths['attr_values_path'](obs)
            attrib = namedtuple_factory('ObsAttributes',
                                        attr_ids)(*attr_values)
        else:
            attrib = None
        yield self._SeriesObsTuple(dim, value, attrib)
def initialize(self, source):
    """Parse an SDMXJSON message from ``source`` and return the message.

    Pre-computes dimension/attribute lookup structures used by the
    series/observation iterators, then wraps the parsed tree in a
    ``model.DataMessage``.
    """
    tree = json.load(source)
    # pre-fetch some structures for efficient use in series and obs.
    # Use .get with empty defaults throughout: messages may omit the
    # 'attributes' section or any of the per-level lists (direct
    # indexing raised KeyError on such messages).
    a = tree['structure'].get('attributes', {})
    self._dataset_attrib = a.get('dataSet', [])
    self._series_attrib = a.get('series', [])
    self._obs_attrib = a.get('observation', [])
    d = tree['structure'].get('dimensions', {})
    self._dataset_dim = d.get('dataSet', [])
    self._series_dim = d.get('series', [])
    self._obs_dim = d.get('observation', [])
    # Map keyPosition -> dimension ID / single value for dataset-level
    # dims (each has exactly one value).
    self._dataset_dim_key = {
        dim['keyPosition']: dim['id'] for dim in self._dataset_dim}
    self._dataset_dim_values = {
        dim['keyPosition']: dim['values'][0]['id']
        for dim in self._dataset_dim}
    if self._series_dim:
        self._key_len = len(self._dataset_dim) + len(self._series_dim)
        # Map keyPositions of dimensions at series level to dimension IDs, like with dataset-level dims above.
        # In case of cross-sectional dataset, the only dimension at series level has no
        # keyPosition, eg. TIME_PERIOD. Instead, the keyPosition of the dim at observation
        # is used to fill the gap.
        self._series_dim_key = {
            dim.get('keyPosition', self._obs_dim[0].get('keyPosition')):
            dim['id'] for dim in self._series_dim}
        self.SeriesKeyTuple = namedtuple_factory(
            'SeriesKeyTuple',
            (self._dataset_dim_key.get(i) or self._series_dim_key.get(i)
             for i in range(self._key_len)))
    else:
        # Dataset must be flat
        self._key_len = len(self._dataset_dim) + len(self._obs_dim)
    self.obs_attr_id = [d['id'] for d in self._obs_attrib]
    # init message instance
    cls = model.DataMessage
    self.message = cls(self, tree)
    return self.message
def iter_generic_series_obs(self, sdmxobj, with_value, with_attributes,
                            reverse_obs=False):
    """Yield SeriesObservation tuples for one series (jsonpath variant).

    Observations are dict entries keyed by the index into the single
    observation-level dimension's value list.
    """
    obs_l = sorted(sdmxobj._elem.value['observations'].items(),
                   key=self.getitem0, reverse=reverse_obs)
    # Resolve the observation-level dimension metadata via jsonpath.
    obs_dim_l = parse(
        '$.structure.dimensions.observation[*]').find(sdmxobj._elem)
    for obs in obs_l:
        # Translate the numeric key into the dimension value ID.
        obs_dim = obs_dim_l[0].value['values'][int(obs[0])]['id']
        if with_value:
            # obs[1][0] is the observation value.
            obs_value = obs[1][0]
        else:
            obs_value = None
        if with_attributes:
            # NOTE(review): 'attr_values_path'/'attr_id_path' are path
            # expressions for element trees, but ``obs`` here is a
            # (key, list) tuple from a JSON dict — this branch looks
            # broken/untested for JSON data; verify against callers.
            obs_attr_values = self._paths['attr_values_path'](obs)
            obs_attr_id = self._paths['attr_id_path'](obs)
            obs_attr_type = namedtuple_factory(
                'ObsAttributes', obs_attr_id)
            obs_attr = obs_attr_type(*obs_attr_values)
        else:
            obs_attr = None
        yield self._SeriesObsTuple(obs_dim, obs_value, obs_attr)
def iter_generic_series_obs(self, sdmxobj, with_value, with_attributes,
                            reverse_obs=False):
    """Yield SeriesObservation tuples for a generic XML series element."""
    children = sdmxobj._elem.iterchildren(
        '{http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic}Obs',
        reversed=reverse_obs)
    for obs in children:
        obs_dim = self._paths['generic_series_dim_path'](obs)[0]
        if with_value:
            obs_value = self._paths['obs_value_path'](obs)[0]
        else:
            obs_value = None
        if with_attributes:
            # Attribute IDs may differ per Obs, so the namedtuple type
            # is rebuilt for each observation.
            ids = self._paths['attr_id_path'](obs)
            vals = self._paths['attr_values_path'](obs)
            obs_attr = namedtuple_factory('ObsAttributes', ids)(*vals)
        else:
            obs_attr = None
        yield self._SeriesObsTuple(obs_dim, obs_value, obs_attr)
def series_attrib(self, sdmxobj):
    """Return series attributes as an 'Attrib' namedtuple (generic dataset)."""
    elem = sdmxobj._elem
    ids = self._paths['attr_id_path'](elem)
    values = self._paths['attr_values_path'](elem)
    attrib_type = namedtuple_factory('Attrib', ids)
    return attrib_type(*values)
def group_key(self, sdmxobj):
    """Return the group key as a 'GroupKey' namedtuple."""
    elem = sdmxobj._elem
    ids = self._paths['group_key_id_path'](elem)
    values = self._paths['group_key_values_path'](elem)
    return namedtuple_factory('GroupKey', ids)._make(values)
def series_key(self, sdmxobj):
    """Return the series key as a 'SeriesKey' namedtuple."""
    elem = sdmxobj._elem
    key_type = namedtuple_factory(
        'SeriesKey', self._paths['series_key_id_path'](elem))
    return key_type._make(self._paths['series_key_values_path'](elem))
class Reader(BaseReader):

    """
    Read SDMXJSON 2.1 and expose it as instances from pandasdmx.model
    """

    def read_as_str(self, name, sdmxobj, first_only=True):
        """Evaluate the path registered under ``name`` and return its
        string value(s); None when the path matches nothing."""
        result = self._paths[name](sdmxobj._elem)
        if result:
            if first_only:
                return result[0].value
            else:
                return [r.value for r in result]

    def initialize(self, source):
        """Parse an SDMXJSON message from ``source``, pre-compute lookup
        structures for the iterators, and return the DataMessage."""
        tree = json.load(source)
        # pre-fetch some structures for efficient use in series and obs
        a = tree['structure']['attributes']
        self._dataset_attrib = a['dataSet']
        self._series_attrib = a['series']
        self._obs_attrib = a['observation']
        d = tree['structure']['dimensions']
        self._dataset_dim = d.get('dataSet', [])
        self._series_dim = d['series']
        self._obs_dim = d['observation']
        # Map keyPosition -> dimension ID / single value for
        # dataset-level dims (each has exactly one value).
        self._dataset_dim_key = {
            dim['keyPosition']: dim['id'] for dim in self._dataset_dim}
        self._dataset_dim_values = {
            dim['keyPosition']: dim['values'][0]['id']
            for dim in self._dataset_dim}
        if self._series_dim:
            self._key_len = len(self._dataset_dim) + len(self._series_dim)
            # Map keyPositions of dimensions at series level to dimension IDs, like with dataset-level dims above.
            # In case of cross-sectional dataset, the only dimension at series level has no
            # keyPosition, eg. TIME_PERIOD. Instead, the keyPosition of the dim at observation
            # is used to fill the gap.
            self._series_dim_key = {
                dim.get('keyPosition', self._obs_dim[0].get('keyPosition')):
                dim['id'] for dim in self._series_dim}
            self.SeriesKeyTuple = namedtuple_factory(
                'SeriesKeyTuple',
                (self._dataset_dim_key.get(i) or self._series_dim_key.get(i)
                 for i in range(self._key_len)))
        else:
            # Dataset must be flat
            self._key_len = len(self._dataset_dim) + len(self._obs_dim)
        self.obs_attr_id = [d['id'] for d in self._obs_attrib]
        # init message instance
        cls = model.DataMessage
        self.message = cls(self, tree)
        return self.message

    # flag to prevent multiple compiling. See BaseReader.__init__
    _compiled = False

    def write_source(self, filename):
        '''
        Save source to file by calling `write` on the root element.
        '''
        with open(filename, 'w') as fp:
            return json.dump(self.message._elem, fp,
                             indent=4, sort_keys=True)

    # jsonpath expressions keyed by name or model class. Entries that
    # are commented out are inherited from the XML reader and not (yet)
    # supported for SDMXJSON.
    _paths = {
        # 'footer_text': 'com:Text/text()',
        # 'footer_code': '@code',
        # 'footer_severity': '@severity',
        # 'dataflow_from_msg': 'mes:Structures/str:Dataflows',
        # 'constraint_attachment': 'str:ConstraintAttachment',
        # 'include': '@include',
        # 'id': '@id',
        # 'urn': '@urn',
        # 'url': '@url',
        # 'uri': '@uri',
        # 'agencyID': '@agencyID',
        # 'maintainable_parent_id': '@maintainableParentID',
        # 'value': 'com:Value/text()',
        'headerID': '$.header.id',
        'header_prepared': '$.header.prepared',
        'header_sender': '$.header.sender.*',
        # 'header_receiver': 'mes:Receiver/@*',
        # 'assignment_status': '@assignmentStatus',
        # 'error': 'mes:error/@*',
        # 'ref_version': '@version',
        # 'concept_id': 'str:ConceptIdentity',
        # 'position': '@position',
        # 'isfinal': '@isfinal',
        # 'ref_package': '@package',
        # 'ref_class': '@class',
        # 'ref_target': 'str:Target',
        # 'ref_source': 'str:Source',
        # 'ref_structure': 'str:Structure',
        # 'annotationtype': 'com:AnnotationType/text()',
        # 'generic_obs_path': 'gen:Obs',
        # 'obs_key_id_path': 'gen:ObsKey/gen:Value/@id',
        # 'obs_key_values_path': 'gen:ObsKey/gen:Value/@value',
        # 'series_key_values_path': 'gen:SeriesKey/gen:Value/@value',
        # 'series_key_id_path': 'gen:SeriesKey/gen:Value/@id',
        # 'generic_series_dim_path': 'gen:ObsDimension/@value',
        # 'group_key_values_path': 'gen:GroupKey/gen:Value/@value',
        # 'group_key_id_path': 'gen:GroupKey/gen:Value/@id',
        # 'obs_value_path': 'gen:ObsValue/@value',
        # 'attr_id_path': 'gen:Attributes/gen:Value/@id',
        # 'attr_values_path': 'gen:Attributes/gen:Value/@value',
        # model.Code: 'str:Code',
        # model.Categorisation: 'str:Categorisation',
        # model.CategoryScheme: 'mes:Structures/str:CategorySchemes/str:CategoryScheme',
        # model.DataStructureDefinition: 'mes:Structures/str:DataStructures/str:DataStructure',
        # model.DataflowDefinition: 'str:Dataflow',
        # model.ConceptScheme: 'mes:Structures/str:Concepts/str:ConceptScheme',
        # model.ContentConstraint: 'mes:Structures/str:Constraints/str:ContentConstraint',
        # model.Concept: 'str:Concept',
        # model.Codelist: 'mes:Structures/str:Codelists/str:Codelist',
        # model.Categorisations: 'mes:Structures/str:Categorisations',
        model.Footer: 'footer.message',
        # model.Category: 'str:Category',
        # model.DimensionDescriptor: 'str:DataStructureComponents/str:DimensionList',
        # model.Dimension: 'str:Dimension',
        # model.TimeDimension: 'str:TimeDimension',
        # model.MeasureDimension: 'str:MeasureDimension',
        # model.MeasureDescriptor: 'str:DataStructureComponents/str:MeasureList',
        # model.PrimaryMeasure: 'str:PrimaryMeasure',
        # model.AttributeDescriptor: 'str:DataStructureComponents/str:AttributeList',
        # model.DataAttribute: 'str:Attribute',
        # model.CubeRegion: 'str:CubeRegion',
        # model.KeyValue: 'com:KeyValue',
        # model.Ref: 'Ref',
        model.Header: '$.header',
        # model.Annotation: 'com:Annotations/com:Annotation',
        # model.Group: 'gen:Group',
        # model.Series: 'gen:Series',
        model.DataSet: '$.dataSets[0]',
        # 'int_str_names': './*[local-name() = $name]/@xml:lang',
        # model.Representation: 'str:LocalRepresentation',
        # 'int_str_values': './*[local-name() = $name]/text()',
        # 'enumeration': 'str:Enumeration',
        # 'texttype': 'str:TextFormat/@textType',
        # 'maxlength': 'str:TextFormat/@maxLength',
        # # need this? It is just a non-offset Ref
        # 'attr_relationship': '*/Ref/@id',
    }

    @classmethod
    def _compile_paths(cls):
        """Compile each registered path expression once (see _compiled)."""
        for key, path in cls._paths.items():
            cls._paths[key] = XPath(path)

    def international_str(self, name, sdmxobj):
        '''
        return DictLike of xml:lang attributes. If node has no attributes,
        assume that language is 'en'.
        '''
        # Get language tokens like 'en', 'fr'...
        elem_attrib = self._paths['int_str_names'](sdmxobj._elem, name=name)
        values = self._paths['int_str_values'](sdmxobj._elem, name=name)
        # Unilingual strings have no attributes. Assume 'en' instead.
        if not elem_attrib:
            elem_attrib = ['en']
        return DictLike(zip(elem_attrib, values))

    def header_error(self, sdmxobj):
        """Return the header's Error attributes as DictLike, or None."""
        try:
            return DictLike(sdmxobj._elem.Error.attrib)
        except AttributeError:
            return None

    def dim_at_obs(self, sdmxobj):
        """Return the ID of the single dimension at observation level,
        or 'AllDimensions' when more than one dimension sits there."""
        if len(self._obs_dim) > 1:
            return 'AllDimensions'
        else:
            return self._obs_dim[0]['id']

    def structured_by(self, sdmxobj):
        return None  # complete this

    # Types for generic observations
    _ObsTuple = namedtuple_factory(
        'GenericObservation', ('key', 'value', 'attrib'))
    _SeriesObsTuple = namedtuple_factory(
        'SeriesObservation', ('dim', 'value', 'attrib'))

    # Operators
    getitem0 = itemgetter(0)
    getitem_key = itemgetter('_key')

    def iter_generic_obs(self, sdmxobj, with_value, with_attributes):
        """Yield observation tuples for a flat (series-less) dataset,
        merging dataset-level and observation-level dimension values."""
        # Make type namedtuple for obs_key. It must be
        # merged with any dimension values at dataset level maintaining the
        # key position order.
        # Note that the measure dimension (such as TIME_PERIOD) has no key position.
        # We fill this gap by injecting the highest key position.
        _obs_dim_key = {
            dim.get('keyPosition', self._key_len - 1): dim['id']
            for dim in self._obs_dim
        }
        _GenericObsKey = namedtuple_factory(
            'GenericObservationKey',
            (self._dataset_dim_key.get(d) or _obs_dim_key.get(d)
             for d in range(self._key_len)))
        obs_l = sorted(sdmxobj._elem.value['observations'].items(),
                       key=self.getitem0)
        for dim, value in obs_l:
            # Construct the key for this observation: colon-separated
            # indices into each observation-level dimension's value list.
            key_idx = [int(i) for i in dim.split(':')]
            obs_key_values = [
                d['values'][i]['id']
                for i, d in zip(key_idx, self._obs_dim)
            ]
            # Interleave fixed dataset-level values with per-obs values.
            # NOTE: relies on dimension value IDs being truthy.
            obs_key = _GenericObsKey._make(
                self._dataset_dim_values.get(d) or obs_key_values.pop(0)
                for d in range(self._key_len))
            # Read the value
            obs_value = value[0] if with_value else None
            # Read any attributes
            if with_attributes and len(value) > 1:
                obs_attr_idx = value[1:]
                # NOTE(review): unlike iter_generic_series_obs below,
                # this does not skip None attribute indices — verify
                # that flat datasets never carry null attr indices.
                obs_attr_raw = [(d['id'], d['values'][i].get('id'))
                                for i, d in zip(obs_attr_idx,
                                                self._obs_attrib)
                                ]
                if obs_attr_raw:
                    obs_attr_id, obs_attr_values = zip(*obs_attr_raw)
                    obs_attr_type = namedtuple_factory('ObsAttributes',
                                                       obs_attr_id)
                    obs_attr = obs_attr_type(*obs_attr_values)
                else:
                    obs_attr = None
            else:
                obs_attr = None
            yield self._SeriesObsTuple(obs_key, obs_value, obs_attr)

    def generic_series(self, sdmxobj):
        """Yield model.Series instances in '_key' order."""
        # Stash each series' own key on the dict so series_key() can
        # recover it later.
        for key, series in sdmxobj._elem.value['series'].items():
            series['_key'] = key
        for series in sorted(sdmxobj._elem.value['series'].values(),
                             key=self.getitem_key):
            yield model.Series(self, series, dataset=sdmxobj)

    def generic_groups(self, sdmxobj):
        # SDMXJSON has no group support.
        return []

    def series_key(self, sdmxobj):
        """Return the full series key (dataset-level + series-level
        dimension values) as a SeriesKeyTuple."""
        key_idx = [int(i) for i in sdmxobj._elem['_key'].split(':')]
        series_key_values = [
            d['values'][i]['id']
            for i, d in zip(key_idx, self._series_dim)
        ]
        # Interleave fixed dataset-level values with series values.
        # NOTE: relies on dimension value IDs being truthy.
        full_key_values = [
            self._dataset_dim_values.get(d) or series_key_values.pop(0)
            for d in range(self._key_len)
        ]
        return self.SeriesKeyTuple._make(full_key_values)

    def group_key(self, sdmxobj):
        """Return the group key as a 'GroupKey' namedtuple."""
        group_key_id = self._paths['group_key_id_path'](sdmxobj._elem)
        group_key_values = self._paths['group_key_values_path'](
            sdmxobj._elem)
        GroupKeyTuple = namedtuple_factory('GroupKey', group_key_id)
        return GroupKeyTuple._make(group_key_values)

    def dataset_attrib(self, sdmxobj):
        """Return dataset-level attributes as an 'Attrib' namedtuple,
        or None when the dataset carries none."""
        value_idx = sdmxobj._elem.value.get('attributes')
        if value_idx:
            # Fall back to the human-readable 'name' when a value has
            # no 'id'; skip attributes whose index is null.
            attrib_list = [(a['id'],
                            a['values'][i].get('id', a['values'][i]['name']))
                           for i, a in zip(value_idx, self._dataset_attrib)
                           if i is not None]
            attrib_ids, attrib_values = zip(*attrib_list)
            return namedtuple_factory('Attrib', attrib_ids)(*attrib_values)

    def series_attrib(self, sdmxobj):
        """Return series-level attributes as an 'Attrib' namedtuple,
        or None when the series carries none."""
        value_idx = sdmxobj._elem.get('attributes')
        if value_idx:
            attrib_list = [(a['id'],
                            a['values'][i].get('id', a['values'][i]['name']))
                           for i, a in zip(value_idx, self._series_attrib)
                           if i is not None]
            attrib_ids, attrib_values = zip(*attrib_list)
            return namedtuple_factory('Attrib', attrib_ids)(*attrib_values)

    def iter_generic_series_obs(self, sdmxobj, with_value, with_attributes,
                                reverse_obs=False):
        """Yield SeriesObservation tuples for one series, in key order
        (reversed when ``reverse_obs`` is true)."""
        obs_l = sorted(sdmxobj._elem['observations'].items(),
                       key=self.getitem0, reverse=reverse_obs)
        for obs in obs_l:
            # value for dim at obs, e.g. '2014' for time series.
            # As this method is called only when each obs has but one
            # dimension, it is at index 0.
            obs_dim_value = self._obs_dim[0]['values'][int(obs[0])]['id']
            obs_value = obs[1][0] if with_value else None
            if with_attributes and len(obs[1]) > 1:
                obs_attr_idx = obs[1][1:]
                # Skip attributes whose index is null.
                obs_attr_raw = [(d['id'], d['values'][i].get('id'))
                                for i, d in zip(obs_attr_idx,
                                                self._obs_attrib)
                                if i is not None]
                if obs_attr_raw:
                    obs_attr_id, obs_attr_values = zip(*obs_attr_raw)
                    obs_attr_type = namedtuple_factory('ObsAttributes',
                                                       obs_attr_id)
                    obs_attr = obs_attr_type(*obs_attr_values)
                else:
                    obs_attr = None
            else:
                obs_attr = None
            yield self._SeriesObsTuple(obs_dim_value, obs_value, obs_attr)
def group_key(self, sdmxobj):
    """Build the GroupKey namedtuple for a generic group element."""
    key_ids = self._paths['group_key_id_path'](sdmxobj._elem)
    key_vals = self._paths['group_key_values_path'](sdmxobj._elem)
    key_type = namedtuple_factory('GroupKey', key_ids)
    return key_type._make(key_vals)
def series_key(self, sdmxobj): series_key_id = self._paths['series_key_id_path'](sdmxobj._elem) series_key_values = self._paths['series_key_values_path']( sdmxobj._elem) SeriesKeyTuple = namedtuple_factory('SeriesKey', series_key_id) return SeriesKeyTuple._make(series_key_values)
class Reader(BaseReader):

    """
    Read SDMX-ML 2.1 and expose it as instances from pandasdmx.model
    """

    # Namespace prefixes used by the XPath expressions in _paths.
    _nsmap = {
        'com': 'http://www.sdmx.org/resources/sdmxml/schemas/v2_1/common',
        'str': 'http://www.sdmx.org/resources/sdmxml/schemas/v2_1/structure',
        'mes': 'http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message',
        'gen': 'http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic',
        'footer':
            'http://www.sdmx.org/resources/sdmxml/schemas/v2_1/message/footer'
    }

    def initialize(self, source):
        """Parse *source* as XML and wrap it in the matching message class.

        Returns a model.StructureMessage or model.DataMessage; raises
        ValueError for any other root tag.
        """
        root = etree.parse(source).getroot()
        if root.tag.endswith('Structure'):
            cls = model.StructureMessage
        elif root.tag.endswith('Data'):
            cls = model.DataMessage
        else:
            raise ValueError('Unsupported root tag: %s' % root.tag)
        self.message = cls(self, root)
        return self.message

    # flag to prevent multiple compiling. See BaseReader.__init__
    _compiled = False

    def write_source(self, filename):
        '''
        Save XML source to file by calling `write` on the root element.
        '''
        return self.message._elem.getroottree().write(filename,
                                                      encoding='utf8')

    # XPath strings (compiled once by _compile_paths). String keys are
    # ad-hoc lookups; model-class keys locate the elements for that class.
    _paths = {
        'footer_text': 'com:Text/text()',
        'footer_code': '@code',
        'footer_severity': '@severity',
        'dataflow_from_msg': 'mes:Structures/str:Dataflows',
        'constraint_attachment': 'str:ConstraintAttachment',
        'include': '@include',
        'id': '@id',
        'urn': '@urn',
        'url': '@url',
        'uri': '@uri',
        'agencyID': '@agencyID',
        'maintainable_parent_id': '@maintainableParentID',
        'value': 'com:Value/text()',
        'headerID': 'mes:ID/text()',
        'header_prepared': 'mes:Prepared/text()',
        'header_sender': 'mes:Sender/@*',
        'header_receiver': 'mes:Receiver/@*',
        'assignment_status': '@assignmentStatus',
        'error': 'mes:error/@*',
        'ref_version': '@version',
        'concept_identity': 'str:ConceptIdentity',
        'position': '@position',
        # NOTE(review): SDMX-ML usually spells this attribute 'isFinal' —
        # verify against the schema before relying on this lookup.
        'isfinal': '@isfinal',
        'ref_package': '@package',
        'ref_class': '@class',
        'ref_target': 'str:Target',
        'ref_source': 'str:Source',
        'ref_structure': 'str:Structure',
        'annotationtype': 'com:AnnotationType/text()',
        'structured_by': 'mes:Structure/@structureID',
        'dim_at_obs': '//mes:Header/mes:Structure/@dimensionAtObservation',
        'generic_obs_path': 'gen:Obs',
        'obs_key_id_path': 'gen:ObsKey/gen:Value/@id',
        'obs_key_values_path': 'gen:ObsKey/gen:Value/@value',
        'series_key_values_path': 'gen:SeriesKey/gen:Value/@value',
        'series_key_id_path': 'gen:SeriesKey/gen:Value/@id',
        'generic_series_dim_path': 'gen:ObsDimension/@value',
        'group_key_values_path': 'gen:GroupKey/gen:Value/@value',
        'group_key_id_path': 'gen:GroupKey/gen:Value/@id',
        'obs_value_path': 'gen:ObsValue/@value',
        'attr_id_path': 'gen:Attributes/gen:Value/@id',
        'attr_values_path': 'gen:Attributes/gen:Value/@value',
        model.Code: 'str:Code',
        model.Categorisation: 'str:Categorisation',
        model.CategoryScheme:
            'mes:Structures/str:CategorySchemes/str:CategoryScheme',
        model.DataStructureDefinition:
            'mes:Structures/str:DataStructures/str:DataStructure',
        model.DataflowDefinition: 'str:Dataflow',
        model.ConceptScheme: 'mes:Structures/str:Concepts/str:ConceptScheme',
        model.ContentConstraint:
            'mes:Structures/str:Constraints/str:ContentConstraint',
        model.Concept: 'str:Concept',
        model.Codelist: 'mes:Structures/str:Codelists/str:Codelist',
        model.Categorisations: 'mes:Structures/str:Categorisations',
        model.Footer: 'footer:Footer/footer:Message',
        model.Category: 'str:Category',
        model.DimensionDescriptor:
            'str:DataStructureComponents/str:DimensionList',
        model.Dimension: 'str:Dimension',
        model.TimeDimension: 'str:TimeDimension',
        model.MeasureDimension: 'str:MeasureDimension',
        model.MeasureDescriptor:
            'str:DataStructureComponents/str:MeasureList',
        model.PrimaryMeasure: 'str:PrimaryMeasure',
        model.AttributeDescriptor:
            'str:DataStructureComponents/str:AttributeList',
        model.DataAttribute: 'str:Attribute',
        model.CubeRegion: 'str:CubeRegion',
        model.KeyValue: 'com:KeyValue',
        model.Ref: 'Ref',
        model.Header: 'mes:Header',
        model.Annotation: 'com:Annotations/com:Annotation',
        model.Group: 'gen:Group',
        model.Series: 'gen:Series',
        model.DataSet: 'mes:DataSet',
        'int_str_names': './*[local-name() = $name]/@xml:lang',
        model.Representation: 'str:LocalRepresentation',
        'int_str_values': './*[local-name() = $name]/text()',
        'enumeration': 'str:Enumeration',
        'texttype': 'str:TextFormat/@textType',
        'maxlength': 'str:TextFormat/@maxLength',
        # need this? It is just a non-offset Ref
        'attr_relationship': '*/Ref/@id',
        'cat_scheme_id': '../@id'
    }

    @classmethod
    def _compile_paths(cls):
        # Compile every path string in place, exactly once per class.
        for key, path in cls._paths.items():
            cls._paths[key] = XPath(path, namespaces=cls._nsmap,
                                    smart_strings=False)

    def international_str(self, name, sdmxobj):
        '''
        return DictLike of xml:lang attributes. If node has no attributes,
        assume that language is 'en'.
        '''
        # Get language tokens like 'en', 'fr'...
        elem_attrib = self._paths['int_str_names'](sdmxobj._elem, name=name)
        values = self._paths['int_str_values'](sdmxobj._elem, name=name)
        # Unilingual strings have no attributes. Assume 'en' instead.
        if not elem_attrib:
            elem_attrib = ['en']
        return DictLike(zip(elem_attrib, values))

    def header_error(self, sdmxobj):
        # Return the header's Error attributes, or None when absent.
        try:
            return DictLike(sdmxobj._elem.Error.attrib)
        except AttributeError:
            return None

    def dim_at_obs(self, sdmxobj):
        """Return the dimension-at-observation declared in the header."""
        return self.read_as_str('dim_at_obs', sdmxobj)

    def structured_by(self, sdmxobj):
        """Return the structure ID referenced by the header."""
        return self.read_as_str('structured_by', sdmxobj)

    # Types for generic observations
    _ObsTuple = namedtuple_factory(
        'GenericObservation', ('key', 'value', 'attrib'))
    _SeriesObsTuple = namedtuple_factory(
        'SeriesObservation', ('dim', 'value', 'attrib'))

    def iter_generic_obs(self, sdmxobj, with_value, with_attributes):
        """Yield GenericObservation namedtuples for each gen:Obs element."""
        for obs in self._paths['generic_obs_path'](sdmxobj._elem):
            # The ObsKey namedtuple class is created on the first iteration
            # and reused afterwards; NameError (incl. UnboundLocalError)
            # flags the very first pass.
            obs_key_values = self._paths['obs_key_values_path'](obs)
            try:
                obs_key = ObsKeyTuple._make(obs_key_values)
            except NameError:
                obs_key_id = self._paths['obs_key_id_path'](obs)
                ObsKeyTuple = namedtuple_factory('ObsKey', obs_key_id)
                obs_key = ObsKeyTuple._make(obs_key_values)
            if with_value:
                obs_value = self._paths['obs_value_path'](obs)[0]
            else:
                obs_value = None
            if with_attributes:
                obs_attr_values = self._paths['attr_values_path'](obs)
                obs_attr_id = self._paths['attr_id_path'](obs)
                obs_attr_type = namedtuple_factory('ObsAttributes',
                                                   obs_attr_id)
                obs_attr = obs_attr_type(*obs_attr_values)
            else:
                obs_attr = None
            yield self._ObsTuple(obs_key, obs_value, obs_attr)

    def generic_series(self, sdmxobj):
        """Yield model.Series for each gen:Series element."""
        path = self._paths[model.Series]
        for series in path(sdmxobj._elem):
            yield model.Series(self, series, dataset=sdmxobj)

    def generic_groups(self, sdmxobj):
        """Yield model.Group for each gen:Group element."""
        path = self._paths[model.Group]
        for series in path(sdmxobj._elem):
            yield model.Group(self, series)

    def series_key(self, sdmxobj):
        """Return the series key as a namedtuple."""
        series_key_id = self._paths['series_key_id_path'](sdmxobj._elem)
        series_key_values = self._paths['series_key_values_path'](
            sdmxobj._elem)
        SeriesKeyTuple = namedtuple_factory('SeriesKey', series_key_id)
        return SeriesKeyTuple._make(series_key_values)

    def group_key(self, sdmxobj):
        """Return the group key as a namedtuple."""
        group_key_id = self._paths['group_key_id_path'](sdmxobj._elem)
        group_key_values = self._paths['group_key_values_path'](sdmxobj._elem)
        GroupKeyTuple = namedtuple_factory('GroupKey', group_key_id)
        return GroupKeyTuple._make(group_key_values)

    def series_attrib(self, sdmxobj):
        """Return the element's gen:Attributes as a namedtuple."""
        attr_id = self._paths['attr_id_path'](sdmxobj._elem)
        attr_values = self._paths['attr_values_path'](sdmxobj._elem)
        return namedtuple_factory('Attrib', attr_id)(*attr_values)

    # Dataset-level attributes are read with the same XPaths.
    dataset_attrib = series_attrib

    def iter_generic_series_obs(self, sdmxobj, with_value, with_attributes,
                                reverse_obs=False):
        """Yield SeriesObservation namedtuples for each gen:Obs child of a
        series, optionally in reverse document order."""
        for obs in sdmxobj._elem.iterchildren(
                '{http://www.sdmx.org/resources/sdmxml/schemas/v2_1/data/generic}Obs',
                reversed=reverse_obs):
            obs_dim = self._paths['generic_series_dim_path'](obs)[0]
            if with_value:
                obs_value = self._paths['obs_value_path'](obs)[0]
            else:
                obs_value = None
            if with_attributes:
                obs_attr_values = self._paths['attr_values_path'](obs)
                obs_attr_id = self._paths['attr_id_path'](obs)
                obs_attr_type = namedtuple_factory('ObsAttributes',
                                                   obs_attr_id)
                obs_attr = obs_attr_type(*obs_attr_values)
            else:
                obs_attr = None
            yield self._SeriesObsTuple(obs_dim, obs_value, obs_attr)
def preview_data(self, flow_id, key=None, count=True, total=True, dsd=None): ''' Get keys or number of series for a prospective dataset query allowing for keys with multiple values per dimension. It downloads the complete list of series keys for a dataflow rather than using constraints and DSD. This feature is, however, not supported by all data providers. ECB, IMF_SDMXCENTRAL and UNSD are known to work. Args: flow_id(str): dataflow id key(dict): optional key mapping dimension names to values or lists of values. Must have been validated before. It is not checked if key values are actually valid dimension names and values. Default: {} count(bool): if True (default), return the number of series of the dataset designated by flow_id and key. If False, the actual keys are returned as a pandas DataFrame or dict of dataframes, depending on the value of 'total'. total(bool): if True (default), return the aggregate number of series or a single dataframe (depending on the value of 'count'). If False, return a dict mapping keys to dataframes of series keys. E.g., if key={'COUNTRY':'IT+CA+AU'}, the dict will have 3 items describing the series keys for each country respectively. If 'count' is True, dict values will be int rather than PD.DataFrame. ''' all_keys = self.series_keys(flow_id, dsd=dsd) # Handle the special case that no key is provided if not key: if count: return all_keys.shape[0] else: return all_keys # So there is a key specifying at least one dimension value. 
# Wrap single values in 1-elem list for uniform treatment key_l = self.prepare_key(key) # order dim_names that are present in the key dim_names = [k for k in all_keys if k in key] # Drop columns that are not in the key key_df = all_keys.loc[:, dim_names] if total: # DataFrame with matching series keys bool_series = reduce( and_, (key_df.isin(key_l)[col] for col in dim_names)) if count: return bool_series.value_counts()[True] else: return all_keys[bool_series] else: # Dict of value combinations as dict keys key_product = product(*(key_l[k] for k in dim_names)) # Replace key tuples by namedtuples PartialKey = namedtuple_factory('PartialKey', dim_names) matches = {PartialKey(k): reduce(and_, (key_df.isin({k1: [v1] for k1, v1 in zip(dim_names, k)})[col] for col in dim_names)) for k in key_product} if not count: # dict mapping each key to DataFrame with selected key-set return {k: all_keys[v] for k, v in matches.items()} else: # Number of series per key return {k: v.value_counts()[True] for k, v in matches.items()}
class Reader(BaseReader):

    """
    Read SDMXJSON 2.1 and expose it as instances from pandasdmx.model
    """

    def read_as_str(self, name, sdmxobj, first_only=True):
        """Evaluate the compiled path *name* against *sdmxobj* and return the
        matched value (or list of values); None when nothing matches."""
        result = self._paths[name](sdmxobj._elem)
        if result:
            if first_only:
                return result[0].value
            else:
                return [r.value for r in result]

    def initialize(self, source):
        """Load *source* as JSON and wrap it in a model.DataMessage."""
        tree = json.load(source)
        cls = model.DataMessage
        self.message = cls(self, tree)
        return self.message

    # flag to prevent multiple compiling. See BaseReader.__init__
    _compiled = False

    def write_source(self, filename):
        '''
        Serialize the JSON message to *filename*, which may be a path
        string or a writable file-like object.
        '''
        # BUG FIX: the previous ``json.dumps(self.message._elem, filename)``
        # passed *filename* as the ``skipkeys`` flag, returned a string and
        # never wrote a file.
        if hasattr(filename, 'write'):
            return json.dump(self.message._elem, filename)
        with open(filename, 'w') as f:
            return json.dump(self.message._elem, f)

    # Paths into the SDMXJSON message. Most of the SDMX-ML XPaths have no
    # JSON counterpart; only the entries below are defined (a long block of
    # commented-out XML paths was removed).
    _paths = {
        'headerID': 'id',
        'structured_by': '$.structure.links',
        'dim_at_obs': '$.structure.dimensions.observations',
        model.Footer: 'footer.message',
        model.Header: 'header',
        model.DataSet: 'dataSets[0]',
    }

    @classmethod
    def _compile_paths(cls):
        # NOTE(review): the values above are JSONPath-style strings, yet they
        # are compiled with XPath here — confirm this method is actually
        # invoked for the JSON reader.
        for key, path in cls._paths.items():
            cls._paths[key] = XPath(path)

    def international_str(self, name, sdmxobj):
        '''
        return DictLike of xml:lang attributes. If node has no attributes,
        assume that language is 'en'.
        '''
        # Get language tokens like 'en', 'fr'...
        elem_attrib = self._paths['int_str_names'](sdmxobj._elem, name=name)
        values = self._paths['int_str_values'](sdmxobj._elem, name=name)
        # Unilingual strings have no attributes. Assume 'en' instead.
        if not elem_attrib:
            elem_attrib = ['en']
        return DictLike(zip(elem_attrib, values))

    def header_error(self, sdmxobj):
        # Return the header's Error attributes, or None when absent.
        try:
            return DictLike(sdmxobj._elem.Error.attrib)
        except AttributeError:
            return None

    # Types for generic observations
    _ObsTuple = namedtuple_factory(
        'GenericObservation', ('key', 'value', 'attrib'))
    _SeriesObsTuple = namedtuple_factory(
        'SeriesObservation', ('dim', 'value', 'attrib'))

    def iter_generic_obs(self, sdmxobj, with_value, with_attributes):
        """Yield GenericObservation namedtuples for each observation."""
        # NOTE(review): this method relies on XML-reader path names that are
        # not present in this class's _paths dict — it looks like carried-over
        # code; verify it is ever exercised for JSON data.
        for obs in self._paths['generic_obs_path'](sdmxobj._elem):
            # The ObsKey namedtuple class is created on first iteration;
            # NameError flags the very first pass.
            obs_key_values = self._paths['obs_key_values_path'](obs)
            try:
                obs_key = ObsKeyTuple._make(obs_key_values)
            except NameError:
                obs_key_id = self._paths['obs_key_id_path'](obs)
                ObsKeyTuple = namedtuple_factory('ObsKey', obs_key_id)
                obs_key = ObsKeyTuple._make(obs_key_values)
            if with_value:
                obs_value = self._paths['obs_value_path'](obs)[0]
            else:
                obs_value = None
            if with_attributes:
                obs_attr_values = self._paths['attr_values_path'](obs)
                obs_attr_id = self._paths['attr_id_path'](obs)
                obs_attr_type = namedtuple_factory('ObsAttributes',
                                                   obs_attr_id)
                obs_attr = obs_attr_type(*obs_attr_values)
            else:
                obs_attr = None
            yield self._ObsTuple(obs_key, obs_value, obs_attr)

    @staticmethod
    def getitem_key(obj):
        # Sort key for series: the series' own '_key' string.
        return obj.value['_key']

    def generic_series(self, sdmxobj):
        """Yield model.Series for each series in the dataset, sorted by key."""
        # Stash each series' key on its dict so getitem_key can sort on it.
        for key, series in sdmxobj._elem.value['series'].items():
            series['_key'] = key
        for series in sorted(parse('series.*').find(sdmxobj._elem),
                             key=self.getitem_key):
            yield model.Series(self, series, dataset=sdmxobj)

    def generic_groups(self, sdmxobj):
        """SDMXJSON datasets carry no groups; return an empty list."""
        return []

    def series_key(self, sdmxobj):
        """Return the full series key (dataset-level plus series-level
        dimensions) as a namedtuple."""
        # pull down dataset key
        dataset_dim = parse('$.structure.dimensions.dataSet[*]').find(
            sdmxobj._elem)
        full_key_ids = [d.value['id'] for d in dataset_dim]
        full_key_values = [d.value['values'][0]['id'] for d in dataset_dim]
        # '_key' is a colon-separated list of value indices, e.g. '0:3:1'.
        key_idx = [int(i) for i in sdmxobj._elem.value['_key'].split(':')]
        struct_dim = parse('$.structure.dimensions.series').find(
            sdmxobj._elem)[0].value
        series_key_ids = [d['id'] for d in struct_dim]
        series_key_values = [
            d['values'][i]['id'] for i, d in zip(key_idx, struct_dim)
        ]
        full_key_ids.extend(series_key_ids)
        full_key_values.extend(series_key_values)
        SeriesKeyTuple = namedtuple_factory('SeriesKey', full_key_ids)
        return SeriesKeyTuple._make(full_key_values)

    def group_key(self, sdmxobj):
        """Return the group key as a namedtuple."""
        # NOTE(review): the path names below are not defined in this class's
        # _paths — confirm groups are ever requested from JSON data.
        group_key_id = self._paths['group_key_id_path'](sdmxobj._elem)
        group_key_values = self._paths['group_key_values_path'](sdmxobj._elem)
        GroupKeyTuple = namedtuple_factory('GroupKey', group_key_id)
        return GroupKeyTuple._make(group_key_values)

    def dataset_attrib(self, sdmxobj):
        """Return (id, value) pairs for dataset-level attributes, or None
        when the dataset declares none."""
        value_idx = sdmxobj._elem.value.get('attributes')
        if value_idx:
            struct_attrib = parse('$.structure.attributes.dataset').find(
                sdmxobj._elem)[0].value
            # Attribute values may lack an 'id'; fall back to 'name'.
            return [(a['id'],
                     a['values'][i].get('id', a['values'][i]['name']))
                    for i, a in zip(value_idx, struct_attrib)]

    def series_attrib(self, sdmxobj):
        """Return (id, value) pairs for series-level attributes, or None
        when the series declares none."""
        value_idx = sdmxobj._elem.value.get('attributes')
        if value_idx:
            struct_attrib = parse('$.structure.attributes.series').find(
                sdmxobj._elem)[0].value
            return [(a['id'],
                     a['values'][i].get('id', a['values'][i]['name']))
                    for i, a in zip(value_idx, struct_attrib)]

    getitem0 = itemgetter(0)

    def iter_generic_series_obs(self, sdmxobj, with_value, with_attributes,
                                reverse_obs=False):
        """Yield SeriesObservation namedtuples for each observation of a
        series, sorted by the observation dimension index."""
        obs_l = sorted(sdmxobj._elem.value['observations'].items(),
                       key=self.getitem0, reverse=reverse_obs)
        obs_dim_l = parse('$.structure.dimensions.observation[*]').find(
            sdmxobj._elem)
        for obs in obs_l:
            obs_dim = obs_dim_l[0].value['values'][int(obs[0])]['id']
            if with_value:
                obs_value = obs[1][0]
            else:
                obs_value = None
            if with_attributes:
                # NOTE(review): these path lookups mirror the XML reader but
                # are applied to a (key, values) tuple — verify on real data.
                obs_attr_values = self._paths['attr_values_path'](obs)
                obs_attr_id = self._paths['attr_id_path'](obs)
                obs_attr_type = namedtuple_factory('ObsAttributes',
                                                   obs_attr_id)
                obs_attr = obs_attr_type(*obs_attr_values)
            else:
                obs_attr = None
            yield self._SeriesObsTuple(obs_dim, obs_value, obs_attr)
def preview_data(self, flow_id, key=None, count=True, total=True): ''' Get keys or number of series for a prospective dataset query allowing for keys with multiple values per dimension. It downloads the complete list of series keys for a dataflow rather than using constraints and DSD. This feature is, however, not supported by all data providers. ECB and UNSD are known to work. Args: flow_id(str): dataflow id key(dict): optional key mapping dimension names to values or lists of values. Must have been validated before. It is not checked if key values are actually valid dimension names and values. Default: {} count(bool): if True (default), return the number of series of the dataset designated by flow_id and key. If False, the actual keys are returned as a pandas DataFrame or dict of dataframes, depending on the value of 'total'. total(bool): if True (default), return the aggregate number of series or a single dataframe (depending on the value of 'count'). If False, return a dict mapping keys to dataframes of series keys. E.g., if key={'COUNTRY':'IT+CA+AU'}, the dict will have 3 items describing the series keys for each country respectively. If 'count' is True, dict values will be int rather than PD.DataFrame. ''' all_keys = self.series_keys(flow_id) # Handle the special case that no key is provided if not key: if count: return all_keys.shape[0] else: return all_keys # So there is a key specifying at least one dimension value. 
# Wrap single values in 1-elem list for uniform treatment key_l = { k: [v] if isinstance(v, str_type) else v for k, v in key.items() } # order dim_names that are present in the key dim_names = [k for k in all_keys if k in key] # Drop columns that are not in the key key_df = all_keys.loc[:, dim_names] if total: # DataFrame with matching series keys bool_series = reduce(and_, (key_df.isin(key_l)[col] for col in dim_names)) if count: return bool_series.value_counts()[True] else: return all_keys[bool_series] else: # Dict of value combinations as dict keys key_product = product(*(key_l[k] for k in dim_names)) # Replace key tuples by namedtuples PartialKey = namedtuple_factory('PartialKey', dim_names) matches = { PartialKey(k): reduce(and_, (key_df.isin({k1: [v1] for k1, v1 in zip(dim_names, k)})[col] for col in dim_names)) for k in key_product } if not count: # dict mapping each key to DataFrame with selected key-set return {k: all_keys[v] for k, v in matches.items()} else: # Number of series per key return {k: v.value_counts()[True] for k, v in matches.items()}