def _apply_filters(data, meta, filters):
    keep = np.array([True] * len(data))

    # filter by columns and list of values
    for col, values in filters.items():
        if col in meta.columns:
            matches = pattern_match(meta[col], values)
            cat_idx = meta[matches].index
            keep_col = data[META_IDX].set_index(META_IDX).index.isin(cat_idx)
        elif col in ['model', 'scenario', 'region', 'unit']:
            keep_col = pattern_match(data[col], values)
        elif col == 'variable':
            level = filters['level'] if 'level' in filters else None
            keep_col = pattern_match(data[col], values, level)
        elif col == 'year':
            keep_col = years_match(data[col], values)
        elif col == 'level':
            if 'variable' not in filters:
                keep_col = pattern_match(data['variable'], '*', level=values)
            else:
                continue
        else:
            raise ValueError('filter by column ' + col + ' not supported')
        keep &= keep_col

    return keep
def filter_by_meta(data, df, join_meta=False, **kwargs):
    """Filter by and join meta columns from an IamDataFrame to a pd.DataFrame

    Parameters
    ----------
    data: pd.DataFrame instance
        DataFrame to which meta columns are to be joined,
        index or columns must include `['model', 'scenario']`
    df: IamDataFrame instance
        IamDataFrame from which meta columns are filtered and joined (optional)
    join_meta: bool, default False
        join selected columns from `df.meta` on `data`
    kwargs:
        meta columns to be filtered/joined, where `col=...` applies filters
        by the given arguments (using `utils.pattern_match()`) and `col=None`
        joins the column without filtering (setting col to `np.nan`
        if `(model, scenario) not in df.meta.index`)
    """
    if not set(META_IDX).issubset(data.index.names + list(data.columns)):
        raise ValueError('missing required index dimensions or columns!')

    meta = pd.DataFrame(df.meta[list(set(kwargs) - set(META_IDX))].copy())

    # filter meta by columns
    keep = np.array([True] * len(meta))
    apply_filter = False
    for col, values in kwargs.items():
        if col in META_IDX and values is not None:
            _col = meta.index.get_level_values(0 if col == 'model' else 1)
            keep &= pattern_match(_col, values, has_nan=False)
            apply_filter = True
        elif values is not None:
            keep &= pattern_match(meta[col], values)
        apply_filter |= values is not None
    meta = meta[keep]

    # set the data index to META_IDX and apply filtered meta index
    data = data.copy()
    idx = list(data.index.names) if not data.index.names == [None] else None
    data = data.reset_index().set_index(META_IDX)
    meta = meta.loc[meta.index.intersection(data.index)]
    meta.index.names = META_IDX
    if apply_filter:
        data = data.loc[meta.index]
    data.index.names = META_IDX

    # join meta (optional), reset index to format as input arg
    data = data.join(meta) if join_meta else data
    data = data.reset_index().set_index(idx or 'index')
    if idx is None:
        data.index.name = None

    return data
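# Hedged usage sketch (not part of the library): `ts` is assumed to be a
# pd.DataFrame with 'model' and 'scenario' columns, and `df` an IamDataFrame
# whose `meta` table has a 'category' column; the value 'high-mitigation'
# and the helper name are illustrative only.
def _example_filter_by_meta(ts, df):
    # keep only rows whose (model, scenario) is categorized as 'high-mitigation'
    filtered = filter_by_meta(ts, df, category='high-mitigation')
    # join the 'category' meta column onto `ts` without filtering
    with_meta = filter_by_meta(ts, df, join_meta=True, category=None)
    return filtered, with_meta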
def test_pattern_match_dollar():
    data = pd.Series(['foo$bar', 'foo'])
    values = ['foo$bar']

    obs = utils.pattern_match(data, values)
    exp = [True, False]

    assert (obs == exp).all()


def test_pattern_match_brackets():
    data = pd.Series(['foo (bar)', 'foo bar'])
    values = ['foo (bar)']

    obs = utils.pattern_match(data, values)
    exp = [True, False]

    assert (obs == exp).all()


def test_pattern_match_dot():
    data = pd.Series(['foo', 'fo.'])
    values = ['fo.']

    obs = utils.pattern_match(data, values)
    exp = [False, True]

    assert (obs == exp).all()


def test_pattern_match_none():
    data = pd.Series(['foo', 'bar'])
    values = ['baz']

    obs = utils.pattern_match(data, values)
    exp = [False, False]

    assert (obs == exp).all()


def test_pattern_match_plus():
    data = pd.Series(['foo', 'foo+', '+bar', 'b+az'])
    values = ['*+*']

    obs = utils.pattern_match(data, values)
    exp = [False, True, True, True]

    assert (obs == exp).all()


def test_pattern_match_ast_regex():
    data = pd.Series(['foo', 'foo2', 'bar'])
    values = ['foo*']

    obs = utils.pattern_match(data, values)
    exp = [True, True, False]

    assert (obs == exp).all()
def _apply_filters(data, meta, filters):
    """Applies filters to the data and meta tables of an IamDataFrame.

    Parameters
    ----------
    data: pd.DataFrame
        data table of an IamDataFrame
    meta: pd.DataFrame
        meta table of an IamDataFrame
    filters: dict
        dictionary of filters ({col: values}); uses a pseudo-regexp syntax
        by default, but accepts `regexp: True` to use direct regexp
    """
    regexp = filters.pop('regexp', False)
    keep = np.array([True] * len(data))

    # filter by columns and list of values
    for col, values in filters.items():
        if col in meta.columns:
            matches = pattern_match(meta[col], values, regexp=regexp)
            cat_idx = meta[matches].index
            keep_col = data[META_IDX].set_index(META_IDX).index.isin(cat_idx)
        elif col in ['model', 'scenario', 'region', 'unit']:
            keep_col = pattern_match(data[col], values, regexp=regexp)
        elif col == 'variable':
            level = filters['level'] if 'level' in filters else None
            keep_col = pattern_match(data[col], values, level, regexp)
        elif col == 'year':
            keep_col = years_match(data[col], values)
        elif col == 'level':
            if 'variable' not in filters:
                keep_col = pattern_match(data['variable'], '*', values,
                                         regexp=regexp)
            else:
                continue
        else:
            raise ValueError('filter by column ' + col + ' not supported')
        keep &= keep_col

    return keep
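# Hedged sketch of a filter dictionary consumed by `_apply_filters`
# (the values are illustrative only): keys are meta columns, the index
# dimensions ('model', 'scenario', 'region', 'unit'), 'variable', 'year',
# or 'level'; `regexp=True` switches `pattern_match` from the pseudo-regexp
# syntax (with `*` wildcards) to plain regular expressions.
_example_filters = {
    'model': 'MESSAGE*',            # pseudo-regexp match on the model column
    'variable': 'Emissions|CO2|*',  # wildcard match on sub-categories
    'year': [2030, 2050],           # matched via `years_match`
}
# keep = _apply_filters(data, meta, _example_filters)  # boolean mask over `data`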
def test_pattern_regexp():
    data = pd.Series(['foo', 'foa', 'foo$'])
    values = ['fo.$']

    obs = utils.pattern_match(data, values, regexp=True)
    exp = [True, True, False]

    assert (obs == exp).all()
def _match(data, patterns):
    # if `patterns` is empty, return an empty list, which means "everything"
    if not patterns:
        return []

    # otherwise, collect the unique entries matching any of the patterns
    matches = np.array([False] * len(data))
    for p in patterns:
        matches |= pattern_match(data, p)
    return data[matches].unique()
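# Hedged illustration of `_match` (values are made up): an empty pattern list
# yields [] (interpreted by the caller as "everything"); otherwise the unique
# matching entries of `data` are returned.
# _match(pd.Series(['foo', 'foobar', 'baz']), ['foo*'])  # -> ['foo', 'foobar']
# _match(pd.Series(['foo', 'foobar', 'baz']), [])        # -> []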
def check_aggregate(self, variable, components=None, units=None,
                    exclude_on_fail=False, multiplier=1, **kwargs):
    """Check whether the timeseries data match the aggregation
    of components or sub-categories

    Parameters
    ----------
    variable: str
        variable to be checked for matching aggregation of sub-categories
    components: list of str, default None
        list of variables, defaults to all sub-categories of `variable`
    units: str or list of str, default None
        filter variable and components for given unit(s)
    exclude_on_fail: boolean, default False
        flag scenarios failing validation as `exclude: True`
    multiplier: number, default 1
        factor when comparing variable and sum of components
    kwargs:
        passed to `np.isclose()`
    """
    # default components to all variables one level below `variable`
    if components is None:
        var_list = pd.Series(self.data.variable.unique())
        components = var_list[pattern_match(var_list,
                                            '{}|*'.format(variable), 0)]

    if not len(components):
        msg = 'cannot check aggregate for {} because it has no components'
        logger().info(msg.format(variable))
        return

    # filter and groupby data, use `pd.Series.align` for matching index
    df_variable, df_components = (
        _aggregate_by_variables(self.data, variable, units)
        .align(_aggregate_by_variables(self.data, components, units))
    )

    # use `np.isclose` for checking match
    diff = df_variable[~np.isclose(df_variable, multiplier * df_components,
                                   **kwargs)]

    if len(diff):
        msg = '{} - {} of {} data points are not aggregates of components'
        logger().info(msg.format(variable, len(diff), len(df_variable)))

        if exclude_on_fail:
            self._exclude_on_fail(diff.index.droplevel([2, 3]))

        diff = pd.concat([diff], keys=[variable], names=['variable'])

        return diff.unstack().rename_axis(None, axis=1)
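# Hedged usage sketch (not part of the library): `df` is assumed to be an
# IamDataFrame; 'Emissions|CO2' and the tolerance are illustrative only.
# `check_aggregate` returns None when the variable matches the sum of its
# sub-categories and a wide DataFrame of mismatching data points otherwise.
def _example_check_aggregate(df):
    diff = df.check_aggregate('Emissions|CO2', exclude_on_fail=True, rtol=1e-3)
    if diff is not None:
        print(diff)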
def filter_by_meta(data, df, join_meta=False, **kwargs):
    """Filter by and join meta columns from an IamDataFrame to a pd.DataFrame

    Parameters
    ----------
    data: pd.DataFrame instance
        DataFrame to which meta columns are to be joined,
        index or columns must include `['model', 'scenario']`
    df: IamDataFrame instance
        IamDataFrame from which meta columns are filtered and joined (optional)
    join_meta: bool, default False
        join selected columns from `df.meta` on `data`
    kwargs:
        meta columns to be joined, where `col=...` applies filters
        by the given arguments (using `utils.pattern_match()`) and `col=None`
        joins the column without filtering
    """
    if not set(META_IDX).issubset(data.index.names + list(data.columns)):
        raise ValueError('missing required index dimensions or columns!')

    meta = df.meta[list(kwargs)].copy()

    # filter meta by columns
    keep = np.array([True] * len(meta))
    for col, values in kwargs.items():
        if values is not None:
            keep_col = pattern_match(meta[col], values)
            keep &= keep_col
    meta = meta[keep]

    # set the data index to META_IDX and apply filtered meta index
    data = data.copy()
    idx = list(data.index.names) if not data.index.names == [None] else None
    data = data.reset_index().set_index(META_IDX).loc[meta.index]

    # join meta (optional), reset index to format as input arg
    data = data.join(meta) if join_meta else data
    data = data.reset_index().set_index(idx or 'index')
    if idx is None:
        data.index.name = None

    return data
def test_pattern_match_ast2_regex():
    data = pd.Series(['foo|bar', 'foo', 'bar'])
    values = ['*o*b*']

    obs = utils.pattern_match(data, values)
    assert (obs == [True, False, False]).all()


def test_pattern_match_nan():
    data = pd.Series(['foo', np.nan])
    values = ['baz']

    obs = utils.pattern_match(data, values, has_nan=True)
    assert (obs == [False, False]).all()


def test_pattern_match_one():
    data = pd.Series(['foo', 'bar'])
    values = ['foo']

    obs = utils.pattern_match(data, values)
    assert (obs == [True, False]).all()


def test_pattern_match_dollar():
    data = pd.Series(["foo$bar", "foo"])
    values = ["foo$bar"]

    obs = utils.pattern_match(data, values)
    assert (obs == [True, False]).all()


def test_pattern_regexp():
    data = pd.Series(["foo", "foa", "foo$"])
    values = ["fo.$"]

    obs = utils.pattern_match(data, values, regexp=True)
    assert (obs == [True, True, False]).all()


def test_pattern_match_dot():
    data = pd.Series(["foo", "fo."])
    values = ["fo."]

    obs = utils.pattern_match(data, values)
    assert (obs == [False, True]).all()


def test_pattern_match_brackets():
    data = pd.Series(["foo (bar)", "foo bar"])
    values = ["foo (bar)"]

    obs = utils.pattern_match(data, values)
    assert (obs == [True, False]).all()


def test_pattern_match_plus():
    data = pd.Series(["foo", "foo+", "+bar", "b+az"])
    values = ["*+*"]

    obs = utils.pattern_match(data, values)
    assert (obs == [False, True, True, True]).all()


def test_pattern_match_ast2_regex():
    data = pd.Series(["foo|bar", "foo", "bar"])
    values = ["*o*b*"]

    obs = utils.pattern_match(data, values)
    assert (obs == [True, False, False]).all()


def test_pattern_match_none():
    data = pd.Series(["foo", "bar"])
    values = ["baz"]

    obs = utils.pattern_match(data, values)
    assert (obs == [False, False]).all()
def read_unfccc(
    party_code,
    gases=None,
    tier=None,
    mapping=None,
    model="UNFCCC",
    scenario="Data Inventory",
):
    """Read data from the UNFCCC Data Inventory

    This function is a wrapper for the
    :meth:`unfccc_di_api.UNFCCCApiReader.query`.

    The data returned from the UNFCCC Data Inventory is transformed into a
    structure similar to the format used in IPCC reports and IAM model
    comparison projects. For compatibility with the
    `iam-units <https://github.com/IAMconsortium/units>`_ package and the
    :meth:`convert_unit <IamDataFrame.convert_unit>` method, emissions species
    are formatted to standard text ('CO2') instead of subscripts ('CO₂') and
    the unit 'CO₂ equivalent' used by UNFCCC is replaced by 'CO2e'.

    Parameters
    ----------
    party_code : str
        ISO3-style code for UNFCCC party (country)
    gases : str or list of str, optional
        Emission species to be queried from the data inventory, can be stated
        as subscript-format ('CO₂') or simple text ('CO2')
    tier : int or list of int
        Pre-specified groupings of UNFCCC data to a variable naming format
        used in IPCC reports and IAM model comparison projects
    mapping : dict, optional
        Mapping to cast UNFCCC-data columns into IAMC-style variables, e.g.

        .. code-block:: python

            {
                'Emissions|{gas}|Energy': ('1. Energy', '*', '*', '*'),
            }

        where the tuple corresponds to filters for the columns
        `['category', 'classification', 'measure', 'gas']`
        and `{<col>}` tags in the key are replaced by the column value.
    model : str, optional
        Name to be used as model identifier
    scenario : str, optional
        Name to be used as scenario identifier

    Returns
    -------
    :class:`IamDataFrame`
    """
    if not HAS_UNFCCC:  # pragma: no cover
        raise ImportError("Required package `unfccc-di-api` not found!")

    # check that only one of `tier` or `mapping` is provided
    if (tier is None and mapping is None) or (
        tier is not None and mapping is not None
    ):
        raise ValueError("Please specify either `tier` or `mapping`!")

    global _READER
    if _READER is None:
        _READER = unfccc_di_api.UNFCCCApiReader()

    # retrieve data, drop non-numeric data and base year
    data = _READER.query(party_code=party_code, gases=to_list(gases))
    data = data[~np.isnan(data.numberValue)]
    data = data[data.year != "Base year"]

    # create the mapping from the data if `tier` is given
    if tier is not None:
        _category = data.category.unique()
        mapping = {}

        for t in to_list(tier):
            # treatment of tier 1
            if t == 1:
                pattern = re.compile(".\\. ")  # pattern of top-level category
                for i in [i for i in _category if pattern.match(i)]:
                    key = "Emissions|{gas}|" + i[4:]
                    mapping[key] = (
                        i,
                        "Total for category",
                        "Net emissions/removals",
                        "*",
                    )
            else:
                raise ValueError(f"Unknown value for `tier`: {t}")

    # add new `variable` column, iterate over mapping to determine variables
    data["variable"] = None
    for variable, value in mapping.items():
        matches = np.array([True] * len(data))
        for i, col in enumerate(NAME_COLS):
            matches &= pattern_match(data[col], value[i])
        data.loc[matches, "variable"] = data.loc[matches].apply(
            _compile_variable, variable=variable, axis=1
        )

    # drop unspecified rows and columns, rename value column
    cols = ["party", "variable", "unit", "year", "gas", "numberValue"]
    data = data.loc[[isstr(i) for i in data.variable], cols]
    data.rename(columns={"numberValue": "value"}, inplace=True)

    # append `gas` to unit, drop `gas` column
    data.loc[:, "unit"] = data.apply(_compile_unit, axis=1)
    data.drop(columns="gas", inplace=True)

    return IamDataFrame(data, model=model, scenario=scenario, region="party")
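# Hedged usage sketch (illustrative party code and gas): query tier-1 data
# for one party from the UNFCCC Data Inventory; this requires the optional
# `unfccc-di-api` package and network access, so the calls are shown
# commented out only.
# df_unfccc = read_unfccc('DEU', gases=['CO2'], tier=1)
# df_unfccc.timeseries()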
def check_aggregate_regions(self, variable, region='World', components=None,
                            units=None, exclude_on_fail=False, **kwargs):
    """Check whether the region timeseries data match the aggregation
    of components

    Parameters
    ----------
    variable: str
        variable to be checked for matching aggregation of components data
    region: str
        region to be checked for matching aggregation of components data
    components: list of str, default None
        list of regions, defaults to all regions except `region`
    units: str or list of str, default None
        filter variable and components for given unit(s)
    exclude_on_fail: boolean, default False
        flag scenarios failing validation as `exclude: True`
    kwargs:
        passed to `np.isclose()`
    """
    var_df = self.filter(variable=variable, level=0)

    if components is None:
        components = list(set(var_df.data.region) - set([region]))

    if not len(components):
        msg = (
            'cannot check regional aggregate for `{}` because it has no '
            'regional components'
        )
        logger().info(msg.format(variable))
        return None

    # filter and groupby data, use `pd.Series.align` for matching index
    df_region, df_components = (
        _aggregate_by_regions(var_df.data, region, units)
        .align(_aggregate_by_regions(var_df.data, components, units))
    )

    df_components.index = df_components.index.droplevel("variable")

    # Add in variables that are included in region totals but which
    # aren't included in the regional components.
    # For example, if we are looking at World and Emissions|BC, we need
    # to add aviation and shipping to the sum of Emissions|BC for each
    # of World's regional components to do a valid check.
    different_region = components[0]
    var_list = pd.Series(self.data.variable.unique())
    var_components = var_list[pattern_match(var_list,
                                            '{}|*'.format(variable), 0)]
    for var_to_add in var_components:
        var_rows = self.data.variable == var_to_add
        region_rows = self.data.region == different_region
        var_has_regional_info = (var_rows & region_rows).any()
        if not var_has_regional_info:
            df_var_to_add = self.filter(
                region=region, variable=var_to_add
            ).data.groupby(REGION_IDX).sum()['value']
            df_var_to_add.index = df_var_to_add.index.droplevel("variable")

            if len(df_var_to_add):
                df_components = df_components.add(df_var_to_add, fill_value=0)

    df_components = pd.concat([df_components], keys=[variable],
                              names=['variable'])

    # use `np.isclose` for checking match
    diff = df_region[~np.isclose(df_region, df_components, **kwargs)]

    if len(diff):
        msg = (
            '{} - {} of {} data points are not aggregates of regional '
            'components'
        )
        logger().info(msg.format(variable, len(diff), len(df_region)))

        if exclude_on_fail:
            self._exclude_on_fail(diff.index.droplevel([2, 3]))

        diff = pd.concat([diff], keys=[region], names=['region'])

        return diff.unstack().rename_axis(None, axis=1)
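# Hedged usage sketch (illustrative arguments, not part of the library):
# check that the 'World' values of a variable in an IamDataFrame `df` equal
# the sum over its regional components; returns None on success, otherwise
# a DataFrame of mismatching data points.
def _example_check_aggregate_regions(df):
    return df.check_aggregate_regions('Emissions|CO2', region='World',
                                      exclude_on_fail=False)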