Example #1
def _aggregate_by_variables(df, variables, units=None):
    variables = [variables] if isstr(variables) else variables
    df = df[df.variable.isin(variables)]

    if units is not None:
        units = [units] if isstr(units) else units
        df = df[df.unit.isin(units)]

    return df.groupby(YEAR_IDX).sum()['value']
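
Every snippet on this page turns on the same small helper, `isstr` (presumably `pyam.utils.isstr`), used to normalize a single string argument into a one-element list before filtering. As a point of reference, a minimal sketch of such a check (an assumption; the actual pyam implementation may differ) is:

def isstr(x):
    # hedged stand-in for the isstr helper used throughout these examples
    return isinstance(x, str)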
Example #2
def _aggregate_by_regions(df, regions, units=None):
    regions = [regions] if isstr(regions) else regions
    df = df[df.region.isin(regions)]

    if units is not None:
        units = [units] if isstr(units) else units
        df = df[df.unit.isin(units)]

    return df.groupby(REGION_IDX).sum()['value']
Example #3
    def pivot_table(self,
                    index,
                    columns,
                    values='value',
                    aggfunc='count',
                    fill_value=None,
                    style=None):
        """Returns a pivot table

        Parameters
        ----------
        index: str or list of strings
            rows for the pivot table
        columns: str or list of strings
            columns for the pivot table
        values: str, default 'value'
            dataframe column to aggregate or count
        aggfunc: str or function, default 'count'
            function used for aggregation,
            accepts 'count', 'mean', and 'sum'
        fill_value: scalar, default None
            value to replace missing values with
        style: str, default None
            output style for pivot table formatting
            accepts 'highlight_not_max', 'heatmap'
        """
        index = [index] if isstr(index) else index
        columns = [columns] if isstr(columns) else columns

        df = self.data

        # allow 'aggfunc' to be passed as string for easier user interface
        if isstr(aggfunc):
            if aggfunc == 'count':
                df = self.data.groupby(index + columns, as_index=False).count()
                fill_value = 0
            elif aggfunc == 'mean':
                df = self.data.groupby(index + columns, as_index=False).mean()\
                    .round(2)
                aggfunc = np.sum
                fill_value = 0 if style == 'heatmap' else ""
            elif aggfunc == 'sum':
                aggfunc = np.sum
                fill_value = 0 if style == 'heatmap' else ""

        df = df.pivot_table(values=values,
                            index=index,
                            columns=columns,
                            aggfunc=aggfunc,
                            fill_value=fill_value)
        return df
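
A hypothetical call of the method above, assuming `df` is an IamDataFrame-like object whose data has 'region' and 'year' columns (names are illustrative):

# count data points per region and year
df.pivot_table(index='region', columns='year', aggfunc='count')
# mean values; in this snippet style='heatmap' only switches the fill value to 0
df.pivot_table(index='region', columns='year', aggfunc='mean', style='heatmap')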
Example #4
 def _load_yaml(self, obj):
     check_rel_paths = False
     if hasattr(obj, 'read'):  # it's a file
         obj = obj.read()
     if isstr(obj) and not os.path.exists(obj):
         raise IOError('File {} does not exist'.format(obj))
     if isstr(obj) and os.path.exists(obj):
         check_rel_paths = True
         fname = obj
         with open(fname) as f:
             obj = f.read()
     if not isinstance(obj, dict):
         obj = yaml.load(obj)
     return obj
Example #5
def cross_threshold(x, threshold=0, direction=['from above', 'from below']):
    """Returns a list of the years in which a timeseries (indexed over years)
    crosses a given threshold

    Parameters
    ----------
    x: pandas.Series
        a timeseries indexed over years
    threshold: float, default 0
        the threshold that the timeseries is checked against
    direction: str, optional, default `['from above', 'from below']`
        whether to return all years where the threshold is crossed
        or only where threshold is crossed in a specific direction
    """
    prev_yr, prev_val = None, None
    years = []
    direction = [direction] if isstr(direction) else list(direction)
    if not set(direction).issubset(set(['from above', 'from below'])):
        raise ValueError('invalid direction `{}`'.format(direction))

    for yr, val in zip(x.index, x.values):
        if np.isnan(val):  # ignore nans in the timeseries
            continue
        if prev_val is None:
            prev_yr, prev_val = yr, val
            continue
        if not np.sign(prev_val - threshold) == np.sign(val - threshold):
            if ('from above' in direction and prev_val > val) \
                    or ('from below' in direction and prev_val < val):
                change = (val - prev_val) / (yr - prev_yr)
                # add one because int() rounds down
                cross_yr = prev_yr + int((threshold - prev_val) / change) + 1
                years.append(cross_yr)
        prev_yr, prev_val = yr, val
    return years
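
Tracing the loop above on a hand-picked series (illustrative values only):

import pandas as pd

x = pd.Series([2, 1, -1], index=[2000, 2005, 2010])
cross_threshold(x)                          # [2008]: linear crossing at 2007.5, bumped up by the `+ 1`
cross_threshold(x, direction='from below')  # []: the series only crosses zero from above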
Example #6
def mpl_args_to_meta_cols(df, **kwargs):
    """Return the kwargs values (not keys) matching a `df.meta` column name"""
    cols = set()
    for arg, value in kwargs.items():
        if isstr(value) and value in df.meta.columns:
            cols.add(value)
    return list(cols)
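
For example (a hedged sketch, assuming `df.meta` has a column named 'category'):

mpl_args_to_meta_cols(df, color='category', marker='o')  # -> ['category']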
Example #7
def _aggregate_region(df,
                      variable,
                      region,
                      subregions=None,
                      components=False,
                      method='sum',
                      weight=None):
    """Internal implementation for aggregating data over subregions"""
    if not isstr(variable) and components is not False:
        msg = 'aggregating by list of variables with components ' \
              'is not supported'
        raise ValueError(msg)

    if weight is not None and components is not False:
        msg = 'using weights and components in one operation not supported'
        raise ValueError(msg)

    # default subregions to all regions other than `region`
    subregions = subregions or df._all_other_regions(region, variable)

    if not len(subregions):
        msg = 'cannot aggregate variable `{}` to `{}` because it does not'\
              ' exist in any subregion'
        logger.info(msg.format(variable, region))

        return

    # compute aggregate over all subregions
    subregion_df = df.filter(region=subregions)
    rows = subregion_df._apply_filters(variable=variable)
    if weight is None:
        col = 'region'
        _data = _group_and_agg(subregion_df.data[rows], col, method=method)
    else:
        weight_rows = subregion_df._apply_filters(variable=weight)
        _data = _agg_weight(subregion_df.data[rows],
                            subregion_df.data[weight_rows], method)

    # if not `components=False`, add components at the `region` level
    if components is not False:
        with adjust_log_level(logger):
            region_df = df.filter(region=region)

        # if `True`, auto-detect `components` at the `region` level,
        # defaults to variables below `variable` only present in `region`
        if components is True:
            level = dict(level=None)
            r_comps = region_df._variable_components(variable, **level)
            sr_comps = subregion_df._variable_components(variable, **level)
            components = set(r_comps).difference(sr_comps)

        if len(components):
            # rename all components to `variable` and aggregate
            rows = region_df._apply_filters(variable=components)
            _df = region_df.data[rows].copy()
            _df['variable'] = variable
            _data = _data.add(_group_and_agg(_df, 'region'), fill_value=0)

    return _data
Example #8
def _get_method_func(method):
    """Translate a string to a known method"""
    if not isstr(method):
        return method

    if method in KNOWN_FUNCS:
        return KNOWN_FUNCS[method]

    # raise error if `method` is a string but not in dict of known methods
    raise ValueError('method `{}` is not a known aggregator'.format(method))
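
`KNOWN_FUNCS` is not part of the snippet; presumably it maps method names to callables, roughly along these lines (an illustration only, the actual contents in pyam may differ):

import numpy as np

KNOWN_FUNCS = {'sum': np.sum, 'mean': np.mean, 'min': np.min, 'max': np.max}

_get_method_func('sum')      # -> np.sum
_get_method_func(np.median)  # a callable passes through unchanged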
Example #9
def _get_method_func(method):
    """Translate a string to a known method"""
    if not isstr(method):
        return method

    if method in KNOWN_FUNCS:
        return KNOWN_FUNCS[method]

    # raise error if `method` is a string but not in dict of known methods
    raise ValueError(f"'{method}' is not a known method!")
Example #10
def test_interpolate(test_pd_df):
    _df = test_pd_df.copy()
    _df["foo"] = ["bar", "baz", 2]  # add extra_col (check for #351)
    df = IamDataFrame(_df)
    obs = df.interpolate(2007, inplace=False).filter(year=2007)._data.values
    npt.assert_allclose(obs, [3, 1.5, 4])

    # redo the interpolation and check that no duplicates are added
    df.interpolate(2007, inplace=False)
    assert not df._data.index.duplicated().any()

    # assert that extra_col does not have nan's (check for #351)
    assert all([True if isstr(i) else ~np.isnan(i) for i in df.foo])
Example #11
def test_interpolate(test_pd_df):
    _df = test_pd_df.copy()
    _df['foo'] = ['bar', 'baz', 2]  # add extra_col (check for #351)
    df = IamDataFrame(_df)
    df.interpolate(2007)
    obs = df.filter(year=2007).data['value'].reset_index(drop=True)
    exp = pd.Series([3, 1.5, 4], name='value')
    pd.testing.assert_series_equal(obs, exp)

    # redo the interpolation and check that no duplicates are added
    df.interpolate(2007)
    assert not df.filter().data.duplicated().any()

    # assert that extra_col does not have nan's (check for #351)
    assert all([True if isstr(i) else ~np.isnan(i) for i in df.data.foo])
Example #12
def _get_token(creds, base_url):
    """Parse credentials and get token from IIASA authentication service"""
    plaintextcreds = True

    # try reading default config or parse file
    if creds is None:
        creds = _get_config()
        plaintextcreds = False
    elif isinstance(creds, Path) or isstr(creds):
        _creds = _get_config(creds)
        if _creds is None:
            logger.error(f"Could not read credentials from `{creds}`")
        creds = _creds
        plaintextcreds = False

    # if (still) no creds, get anonymous auth and return
    if creds is None:
        url = "/".join([base_url, "anonym"])
        r = requests.get(url)
        _check_response(r, "Could not get anonymous token")
        return r.json(), None

    # parse creds, write warning
    if isinstance(creds, Mapping):
        user, pw = creds["username"], creds["password"]
    else:
        user, pw = creds
    if plaintextcreds:
        logger.warning("You provided credentials in plain text. DO NOT save "
                       "these in a repository or otherwise post them online")
        deprecation_warning(
            "Please use `pyam.iiasa.set_config(<user>, <pwd>)`"
            " to store your credentials in a file!",
            "Providing credentials in plain text",
        )

    # get user token
    headers = {
        "Accept": "application/json",
        "Content-Type": "application/json"
    }
    data = {"username": user, "password": pw}
    url = "/".join([base_url, "login"])
    r = requests.post(url, headers=headers, data=json.dumps(data))
    _check_response(r, "Login failed for user: {}".format(user))
    return r.json(), user
Example #13
def _get_token(creds, base_url):
    """Parse credentials and get token from IIASA authentication service"""
    plaintextcreds = True

    # try reading default config or parse file
    if creds is None:
        creds = _get_config()
        plaintextcreds = False
    elif isinstance(creds, Path) or isstr(creds):
        _creds = _get_config(creds)
        if _creds is None:
            logger.error(f'Could not read credentials from `{creds}`')
        creds = _creds
        plaintextcreds = False

    # if (still) no creds, get anonymous auth and return
    if creds is None:
        url = '/'.join([base_url, 'anonym'])
        r = requests.get(url)
        _check_response(r, 'Could not get anonymous token')
        return r.json(), None

    # parse creds, write warning
    if isinstance(creds, Mapping):
        user, pw = creds['username'], creds['password']
    else:
        user, pw = creds
    if plaintextcreds:
        logger.warning('You provided credentials in plain text. DO NOT save '
                       'these in a repository or otherwise post them online')
        deprecation_warning(
            'Please use `pyam.iiasa.set_config(<user>, <pwd>)`'
            ' to store your credentials in a file!',
            'Providing credentials in plain text')

    # get user token
    headers = {
        'Accept': 'application/json',
        'Content-Type': 'application/json'
    }
    data = {'username': user, 'password': pw}
    url = '/'.join([base_url, 'login'])
    r = requests.post(url, headers=headers, data=json.dumps(data))
    _check_response(r, 'Login failed for user: {}'.format(user))
    return r.json(), user
Example #14
def _aggregate(df, variable, components=None, method=np.sum):
    """Internal implementation of the `aggregate` function"""

    if components is not None:
        # ensure that components is a proper list (not a dictionary)
        if not islistable(components) or isinstance(components, dict):
            raise ValueError(
                f"Value for `components` must be a list, found: {components}"
            )

        # a list of variables requires default components (no manual list)
        if islistable(variable):
            raise NotImplementedError(
                "Aggregating by list of variables does not support `components`."
            )

    mapping = {}
    msg = "Cannot aggregate variable '{}' because it has no components!"
    # if single variable
    if isstr(variable):
        # default components to all variables one level below `variable`
        components = components or df._variable_components(variable)

        if not len(components):
            logger.info(msg.format(variable))
            return

        for c in components:
            mapping[c] = variable

    # else, use all variables one level below `variable` as components
    else:
        for v in variable if islistable(variable) else [variable]:
            _components = df._variable_components(v)
            if not len(_components):
                logger.info(msg.format(v))
                continue

            for c in _components:
                mapping[c] = v

    # rename all components to `variable` and aggregate
    _df = df._data[df._apply_filters(variable=mapping.keys())]
    _df.index = replace_index_values(_df, "variable", mapping)
    return _group_and_agg(_df, [], method)
Example #15
def cross_threshold(x,
                    threshold=0,
                    direction=['from above', 'from below'],
                    return_type=int):
    """Returns a list of the years in which a timeseries crosses a threshold

    Parameters
    ----------
    x : :class:`pandas.Series`
        A timeseries indexed over years (as integers)
    threshold : float, optional
        The threshold that the timeseries is checked against
    direction : str, optional
        Whether to return all years where the threshold is crossed
        or only where threshold is crossed in a specific direction
    return_type : type, optional
        Whether to cast the returned values to integer (years)
    """
    direction = [direction] if isstr(direction) else list(direction)
    if not set(direction).issubset(set(['from above', 'from below'])):
        raise ValueError('invalid direction `{}`'.format(direction))

    # get the values and time-domain index
    x = x.dropna()
    values, index = x.values - threshold, x.index.to_numpy()
    positive, negative = (values >= 0), (values < 0)

    # determine all indices before crossing the threshold
    pre = [False] * (len(x) - 1)
    if 'from above' in direction:
        pre |= positive[:-1] & negative[1:]
    if 'from below' in direction:
        pre |= positive[1:] & negative[:-1]
    pre = np.argwhere(pre)
    # determine all indices after crossing the threshold
    post = pre + 1

    # compute the index value where the threshold is crossed
    change = (values[post] - values[pre]) / (index[post] - index[pre])
    years = index[pre] - values[pre] / change

    # if year (as int) is returned, add one because int() rounds down
    if return_type == int:
        return [y + 1 for y in map(int, years)]
    return years
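
Compared with Example #5, this variant drops NaN values up front and detects crossings with vectorized numpy comparisons instead of an explicit loop; with `return_type=int` it truncates and adds one (the same "int() rounds down" convention as Example #5), while any other `return_type` returns the interpolated crossing points as floats.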
Example #16
def _get_token(creds, base_url):
    """Parse credentials and get token from IIASA authentication service"""

    # try reading default config or parse file
    if creds is None:
        creds = _get_config()
    elif isinstance(creds, Path) or isstr(creds):
        _creds = _get_config(creds)
        if _creds is None:
            logger.error(f"Could not read credentials from `{creds}`")
        creds = _creds
    else:
        msg = (
            "Passing credentials as clear-text is not allowed. "
            "Please use `pyam.iiasa.set_config(<user>, <password>)` instead!")
        raise DeprecationWarning(msg)

    # if (still) no creds, get anonymous auth and return
    if creds is None:
        url = "/".join([base_url, "anonym"])
        r = requests.get(url)
        _check_response(r, "Could not get anonymous token")
        return r.json(), None

    # parse creds, write warning
    if isinstance(creds, Mapping):
        user, pw = creds["username"], creds["password"]
    else:
        user, pw = creds

    # get user token
    headers = {
        "Accept": "application/json",
        "Content-Type": "application/json"
    }
    data = {"username": user, "password": pw}
    url = "/".join([base_url, "login"])
    r = requests.post(url, headers=headers, data=json.dumps(data))
    _check_response(r, "Login failed for user: {}".format(user))
    return r.json(), user
Example #17
def _aggregate(df, variable, components=None, method=np.sum):
    """Internal implementation of the `aggregate` function"""
    # a list of variables requires default components (no manual list)
    if islistable(variable) and components is not None:
        raise ValueError('aggregating by list of variables cannot use '
                         'custom components')

    mapping = {}
    msg = 'cannot aggregate variable `{}` because it has no components'
    # if single variable
    if isstr(variable):
        # default components to all variables one level below `variable`
        components = components or df._variable_components(variable)

        if not len(components):
            logger.info(msg.format(variable))
            return

        for c in components:
            mapping[c] = variable

    # else, use all variables one level below `variable` as components
    else:
        for v in variable if islistable(variable) else [variable]:
            _components = df._variable_components(v)
            if not len(_components):
                logger.info(msg.format(v))
                continue

            for c in _components:
                mapping[c] = v

    # rename all components to `variable` and aggregate
    _df = df.data[df._apply_filters(variable=mapping.keys())].copy()
    _df['variable'].replace(mapping, inplace=True)
    return _group_and_agg(_df, [], method)
Example #18
def _aggregate_region(
    df,
    variable,
    region,
    subregions=None,
    components=False,
    method="sum",
    weight=None,
    drop_negative_weights=True,
):
    """Internal implementation for aggregating data over subregions"""
    if not isstr(variable) and components is not False:
        raise ValueError(
            "Aggregating by list of variables with components is not supported!"
        )

    if weight is not None and components is not False:
        raise ValueError("Using weights and components in one operation not supported!")

    # default subregions to all regions other than `region`
    subregions = subregions or df._all_other_regions(region, variable)

    if not len(subregions):
        logger.info(
            f"Cannot aggregate variable '{variable}' to '{region}' "
            "because it does not exist in any subregion!"
        )
        return

    # compute aggregate over all subregions
    subregion_df = df.filter(region=subregions)
    rows = subregion_df._apply_filters(variable=variable)
    if weight is None:

        if drop_negative_weights is False:
            raise ValueError(
                "Dropping negative weights can only be used with `weights`!"
            )

        _data = _group_and_agg(subregion_df._data[rows], "region", method=method)
    else:
        weight_rows = subregion_df._apply_filters(variable=weight)
        _data = _agg_weight(
            subregion_df._data[rows],
            subregion_df._data[weight_rows],
            method,
            drop_negative_weights,
        )

    # if not `components=False`, add components at the `region` level
    if components:
        with adjust_log_level(logger):
            region_df = df.filter(region=region)

        # if `True`, auto-detect `components` at the `region` level,
        # defaults to variables below `variable` only present in `region`
        if components is True:
            level = dict(level=None)
            r_comps = region_df._variable_components(variable, **level)
            sr_comps = subregion_df._variable_components(variable, **level)
            components = set(r_comps).difference(sr_comps)

        if len(components):
            # rename all components to `variable` and aggregate
            rows = region_df._apply_filters(variable=components)
            _df = region_df._data[rows]
            mapping = {c: variable for c in components}
            _df.index = replace_index_values(_df.index, "variable", mapping)
            _data = _data.add(_group_and_agg(_df, "region"), fill_value=0)

    return _data
Example #19
def read_unfccc(
    party_code,
    gases=None,
    tier=None,
    mapping=None,
    model="UNFCCC",
    scenario="Data Inventory",
):
    """Read data from the UNFCCC Data Inventory

    This function is a wrapper around
    :meth:`unfccc_di_api.UNFCCCApiReader.query`.

    The data returned from the UNFCCC Data Inventory is transformed
    into a structure similar to the format used in IPCC reports and
    IAM model comparison projects. For compatibility with the
    `iam-units <https://github.com/IAMconsortium/units>`_ package
    and the :meth:`convert_unit <IamDataFrame.convert_unit>`,
    emissions species are formatted to standard text ('CO2')
    instead of subscripts ('CO₂') and the unit 'CO₂ equivalent'
    used by UNFCCC is replaced by 'CO2e'.

    Parameters
    ----------
    party_code : str
        ISO3-style code for UNFCCC party (country)
    gases : str or list of str, optional
        Emission species to be queried from the data inventory can be stated
        as subscript-format ('CO₂') or simple text ('CO2')
    tier : int or list of int
        Pre-specified groupings of UNFCCC data to a variable naming format
        used in IPCC reports and IAM model comparison projects
    mapping : dict, optional
        Mapping to cast UNFCCC-data columns into IAMC-style variables, e.g.

        .. code-block:: python

            {
                'Emissions|{gas}|Energy': ('1.  Energy', '*', '*', '*'),
            }

        where the tuple corresponds to filters for the columns
        `['category', 'classification', 'measure', 'gas']`
        and `{<col>}` tags in the key are replaced by the column value.
    model : str, optional
        Name to be used as model identifier
    scenario : str, optional
        Name to be used as scenario identifier

    Returns
    -------
    :class:`IamDataFrame`
    """
    if not HAS_UNFCCC:  # pragma: no cover
        raise ImportError("Required package `unfccc-di-api` not found!")

    # check that only one of `tier` or `mapping` is provided
    if (tier is None and mapping is None) or (tier is not None
                                              and mapping is not None):
        raise ValueError("Please specify either `tier` or `mapping`!")

    global _READER
    if _READER is None:
        _READER = unfccc_di_api.UNFCCCApiReader()

    # retrieve data, drop non-numeric data and base year
    data = _READER.query(party_code=party_code, gases=to_list(gases))
    data = data[~np.isnan(data.numberValue)]
    data = data[data.year != "Base year"]

    # create the mapping from the data if `tier` is given
    if tier is not None:
        _category = data.category.unique()
        mapping = {}

        for t in to_list(tier):
            # treatment of tier 1
            if t == 1:
                pattern = re.compile(".\\.  ")  # pattern of top-level category
                for i in [i for i in _category if pattern.match(i)]:
                    key = "Emissions|{gas}|" + i[4:]
                    mapping[key] = (
                        i,
                        "Total for category",
                        "Net emissions/removals",
                        "*",
                    )
            else:
                raise ValueError(f"Unknown value for `tier`: {t}")

    # add new `variable` column, iterate over mapping to determine variables
    data["variable"] = None
    for variable, value in mapping.items():
        matches = np.array([True] * len(data))
        for i, col in enumerate(NAME_COLS):
            matches &= pattern_match(data[col], value[i])

        data.loc[matches,
                 "variable"] = data.loc[matches].apply(_compile_variable,
                                                       variable=variable,
                                                       axis=1)

    # drop unspecified rows and columns, rename value column
    cols = ["party", "variable", "unit", "year", "gas", "numberValue"]
    data = data.loc[[isstr(i) for i in data.variable], cols]
    data.rename(columns={"numberValue": "value"}, inplace=True)

    # append `gas` to unit, drop `gas` column
    data.loc[:, "unit"] = data.apply(_compile_unit, axis=1)
    data.drop(columns="gas", inplace=True)

    return IamDataFrame(data, model=model, scenario=scenario, region="party")
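
A hypothetical call, following the docstring (party code and gas are illustrative):

# tier-1 emission categories for one party, returned as an IamDataFrame
df = read_unfccc('DEU', gases=['CO2'], tier=1)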
Example #20
 def _get_kwarg(k):
     x = kwargs.pop(k, [])
     return [x] if isstr(x) else x
Example #21
 def __setitem__(self, key, value):
     _key_check = [key] if isstr(key) else key
     if set(_key_check).issubset(self.meta.columns):
         return self.meta.__setitem__(key, value)
     else:
         return self.data.__setitem__(key, value)
Example #22
    def __init__(self,
                 df,
                 groupby=None,
                 filters=None,
                 rows=False,
                 percentiles=[0.25, 0.5, 0.75]):
        self.df = df
        self.idx_depth = None

        # assign `groupby` settings and check that specifications are valid
        self.col = None
        self.groupby = None
        if isstr(groupby):
            self.col = groupby
            self.groupby = {groupby: None}
        elif isinstance(groupby, dict) and len(groupby) == 1:
            self.col = list(groupby.keys())[0]
            self.groupby = groupby
            self.idx_depth = 2
        elif groupby is not None:
            raise ValueError('arg `{}` not valid `groupby`'.format(groupby))
        if self.col is not None and self.col not in df.meta.columns:
            raise ValueError('column `{}` not in `df.meta`'.format(self.col))

        # if neither groupby nor filters is given, use filters to describe all
        # and assume that rows are used
        if groupby is None and filters is None:
            self.filters = [('', {})]
            rows = True
        else:
            self.filters = filters if filters is not None else []

        # set lists to sort index and subindex
        self._idx = [] if self.col is None else [self.col]
        self._sub_idx = self.groupby[self.col] or self.df[self.col].unique() \
            if self.col is not None else []
        self._headers, self._subheaders = ([], [])

        # assign `filters` settings and check that specifications are valid
        for (idx, _filter) in self.filters:
            # check that index in tuple is valid
            if isstr(idx):
                self._add_to_index(idx)
            else:
                if not (isinstance(idx, tuple) and len(idx) == 2
                        and isstr(idx[0]) or not isstr(idx[1])):
                    raise ValueError('`{}` is not a valid index'.format(idx))
                self._add_to_index(idx[0], idx[1])
            # check that filters in tuple are valid
            if not isinstance(_filter, dict):
                raise ValueError('`{}` is not a valid filter'.format(_filter))
            elif not (set(_filter) - set(META_IDX)).issubset(df.meta):
                raise ValueError('column `{}` not in `df.meta`'.format(
                    set(_filter) - set(META_IDX) - set(df.meta)))

        self.stats = None
        self.rows = [] if rows else None

        # percentiles for passing to `pandas.describe()`
        self.percentiles = list(percentiles)
        self._describe_cols = (['count', 'mean', 'std', 'min'] +
                               ['{:.0%}'.format(i)
                                for i in self.percentiles] + ['max'])
Example #23
def line_plot(df, x='year', y='value', ax=None, legend=None, title=True,
              color=None, marker=None, linestyle=None, cmap=None,
              fill_between=None, final_ranges=None,
              rm_legend_label=[], **kwargs):
    """Plot data as lines with or without markers.

    Parameters
    ----------
    df : pd.DataFrame
        Data to plot as a long-form data frame
    x : string, optional
        The column to use for x-axis values
        default: year
    y : string, optional
        The column to use for y-axis values
        default: value
    ax : matplotlib.Axes, optional
    legend : bool or dictionary, optional
        Add a legend. If a dictionary is provided, it will be used as keyword
        arguments in creating the legend.
        default: None (displays legend only if less than 13 entries)
    title : bool or string, optional
        Display a default or custom title.
    color : string, optional
        A valid matplotlib color or column name. If a column name, common
        values will be provided the same color.
        default: None
    marker : string, optional
        A valid matplotlib marker or column name. If a column name, common
        values will be provided the same marker.
        default: None
    linestyle : string, optional
        A valid matplotlib linestyle or column name. If a column name, common
        values will be provided the same linestyle.
        default: None
    cmap : string, optional
        A colormap to use.
        default: None
    fill_between : boolean or dict, optional
        Fill lines between minima/maxima of the 'color' argument. This can only
        be used if also providing a 'color' argument. If this is True, then
        default arguments will be provided to `ax.fill_between()`. If this is a
        dictionary, those arguments will be provided instead of defaults.
        default: None
    final_ranges : boolean or dict, optional
        Add vertical line between minima/maxima of the 'color' argument in the
        last period plotted.  This can only be used if also providing a 'color'
        argument. If this is True, then default arguments will be provided to
        `ax.axvline()`. If this is a dictionary, those arguments will be
        provided instead of defaults.
        default: None
    rm_legend_label : string, list, optional
        Remove the color, marker, or linestyle label in the legend.
        default: []
    kwargs : Additional arguments to pass to the pd.DataFrame.plot() function
    """
    if ax is None:
        fig, ax = plt.subplots()

    # assign styling properties
    props = assign_style_props(df, color=color, marker=marker,
                               linestyle=linestyle, cmap=cmap)

    if fill_between and 'color' not in props:
        raise ValueError('Must use `color` kwarg if using `fill_between`')
    if final_ranges and 'color' not in props:
        raise ValueError('Must use `color` kwarg if using `final_ranges`')

    # reshape data for use in line_plot
    df = reshape_line_plot(df, x, y)  # long form to one column per line

    # determine index of column name in reshaped dataframe
    prop_idx = {}
    for kind, var in [('color', color), ('marker', marker),
                      ('linestyle', linestyle)]:
        if var is not None and var in df.columns.names:
            prop_idx[kind] = df.columns.names.index(var)

    # plot data, keeping track of which legend labels to apply
    no_label = [rm_legend_label] if isstr(rm_legend_label) else rm_legend_label

    for col, data in df.iteritems():
        pargs = {}
        labels = []
        # build plotting args and line legend labels
        for key, kind, var in [('c', 'color', color),
                               ('marker', 'marker', marker),
                               ('linestyle', 'linestyle', linestyle)]:
            if kind in props:
                label = col[prop_idx[kind]]
                pargs[key] = props[kind][label]
                if kind not in no_label:
                    labels.append(repr(label).lstrip("u'").strip("'"))
            else:
                pargs[key] = var
        kwargs.update(pargs)
        data = data.dropna()
        data.plot(ax=ax, **kwargs)
        if labels:
            ax.lines[-1].set_label(' '.join(labels))

    if fill_between:
        _kwargs = {'alpha': 0.25} if fill_between in [True, None] \
            else fill_between
        data = df.T
        columns = data.columns
        # get outer boundary mins and maxes
        allmins = data.groupby(color).min()
        intermins = (
            data.dropna(axis=1).groupby(color).min()  # nonan data
            .reindex(columns=columns)  # refill with nans
            .T.interpolate(method='index').T  # interpolate
        )
        mins = pd.concat([allmins, intermins]).min(level=0)
        allmaxs = data.groupby(color).max()
        intermaxs = (
            data.dropna(axis=1).groupby(color).max()  # nonan data
            .reindex(columns=columns)  # refill with nans
            .T.interpolate(method='index').T  # interpolate
        )
        maxs = pd.concat([allmaxs, intermaxs]).max(level=0)
        # do the fill
        for idx in mins.index:
            ymin = mins.loc[idx]
            ymax = maxs.loc[idx]
            ax.fill_between(ymin.index, ymin, ymax,
                            facecolor=props['color'][idx], **_kwargs)

    # add bars to the end of the plot showing range
    if final_ranges:
        # have to explicitly draw it to get the tick labels (these change once
        # you add the vlines)
        plt.gcf().canvas.draw()
        _kwargs = {'linewidth': 2} if final_ranges in [True, None] \
            else final_ranges
        first = df.index[0]
        final = df.index[-1]
        mins = df.T.groupby(color).min()[final]
        maxs = df.T.groupby(color).max()[final]
        ymin, ymax = ax.get_ylim()
        ydiff = ymax - ymin
        xmin, xmax = ax.get_xlim()
        xdiff = xmax - xmin
        xticks = ax.get_xticks()
        xlabels = ax.get_xticklabels()
        # 1.5% increase seems to be ok per extra line
        extra_space = 0.015
        for i, idx in enumerate(mins.index):
            xpos = final + xdiff * extra_space * (i + 1)
            _ymin = (mins[idx] - ymin) / ydiff
            _ymax = (maxs[idx] - ymin) / ydiff
            ax.axvline(xpos, ymin=_ymin, ymax=_ymax,
                       color=props['color'][idx], **_kwargs)
        # for equal spacing between xmin and first datapoint and xmax and last
        # line
        ax.set_xlim(xmin, xpos + first - xmin)
        ax.set_xticks(xticks)
        ax.set_xticklabels(xlabels)

    # build unique legend handles and labels
    handles, labels = ax.get_legend_handles_labels()
    handles, labels = np.array(handles), np.array(labels)
    _, idx = np.unique(labels, return_index=True)
    handles, labels = handles[idx], labels[idx]
    if legend is not False:
        _add_legend(ax, handles, labels, legend)

    # add default labels if possible
    ax.set_xlabel(x.title())
    units = df.columns.get_level_values('unit').unique()
    units_for_ylabel = len(units) == 1 and x == 'year' and y == 'value'
    ylabel = units[0] if units_for_ylabel else y.title()
    ax.set_ylabel(ylabel)

    # build a default title if possible
    if title:
        default_title = []
        for var in ['model', 'scenario', 'region', 'variable']:
            if var in df.columns.names:
                values = df.columns.get_level_values(var).unique()
                if len(values) == 1:
                    default_title.append('{}: {}'.format(var, values[0]))
        title = ' '.join(default_title) if title is True else title
        ax.set_title(title)

    return ax, handles, labels
Example #24
 def _get_kwarg(k):
     # TODO refactor API to return all models if model-list is empty
     x = kwargs.pop(k, "*" if k == "model" else [])
     return [x] if isstr(x) else x
Example #25
def _group_and_agg(df, by, method=np.sum):
    """Groupby & aggregate `df` by column(s), return indexed `pd.Series`"""
    by = [by] if isstr(by) else by
    cols = [c for c in list(df.columns) if c not in ['value'] + by]
    # pick aggregator func (default: sum)
    return df.groupby(cols)['value'].agg(_get_method_func(method))
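
For illustration (a hedged sketch; `data` is assumed to be a long-format dataframe with a 'value' column, as used throughout these examples):

# sum 'value' across regions, grouping by all remaining columns
regional_total = _group_and_agg(data, 'region', method='sum')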
Example #26
 def _new_meta_column(self, name, value):
     """Add a metadata column, set to `uncategorized` if str else np.nan"""
     if name is None:
         raise ValueError('cannot add a meta column {}'.format(name))
     if name not in self.meta:
         self.meta[name] = 'uncategorized' if isstr(value) else np.nan
Example #27
def line_plot(df,
              x='year',
              y='value',
              ax=None,
              legend=None,
              title=True,
              color=None,
              marker=None,
              linestyle=None,
              cmap=None,
              rm_legend_label=[],
              **kwargs):
    """Plot data as lines with or without markers.

    Parameters
    ----------
    df : pd.DataFrame
        Data to plot as a long-form data frame
    x : string, optional
        The column to use for x-axis values
        default: year
    y : string, optional
        The column to use for y-axis values
        default: value
    ax : matplotlib.Axes, optional
    legend : bool or dictionary, optional
        Add a legend. If a dictionary is provided, it will be used as keyword
        arguments in creating the legend.
        default: None (displays legend only if less than 13 entries)
    title : bool or string, optional
        Display a default or custom title.
    color : string, optional
        A valid matplotlib color or column name. If a column name, common
        values will be provided the same color.
        default: None
    marker : string, optional
        A valid matplotlib marker or column name. If a column name, common
        values will be provided the same marker.
        default: None
    linestyle : string, optional
        A valid matplotlib linestyle or column name. If a column name, common
        values will be provided the same linestyle.
        default: None
    cmap : string, optional
        A colormap to use.
        default: None
    rm_legend_label : string, list, optional
        Remove the color, marker, or linestyle label in the legend.
        default: []
    kwargs : Additional arguments to pass to the pd.DataFrame.plot() function
    """
    if ax is None:
        fig, ax = plt.subplots()

    df = reshape_line_plot(df, x, y)  # long form to one column per line

    # determine color, marker, and linestyle for each line
    defaults = default_props(reset=True,
                             num_colors=len(df.columns),
                             colormap=cmap)
    props = {}
    prop_idx = {}
    rc = run_control()
    for kind, var in [('color', color), ('marker', marker),
                      ('linestyle', linestyle)]:
        rc_has_kind = kind in rc
        if var in df.columns.names:
            rc_has_var = rc_has_kind and var in rc[kind]
            props_for_kind = {}
            for val in df.columns.get_level_values(var).unique():
                if rc_has_var and val in rc[kind][var]:
                    props_for_kind[val] = rc[kind][var][val]
                    # cycle anyway to keep defaults the same
                    next(defaults[kind])
                else:
                    props_for_kind[val] = next(defaults[kind])
            props[kind] = props_for_kind
            prop_idx[kind] = df.columns.names.index(var)

    # plot data, keeping track of which legend labels to apply
    no_label = [rm_legend_label] if isstr(rm_legend_label) else rm_legend_label
    for col, data in df.iteritems():
        pargs = {}
        labels = []
        # build plotting args and line legend labels
        for key, kind, var in [('c', 'color', color),
                               ('marker', 'marker', marker),
                               ('linestyle', 'linestyle', linestyle)]:
            if kind in props:
                label = col[prop_idx[kind]]
                pargs[key] = props[kind][label]
                if kind not in no_label:
                    labels.append(repr(label).lstrip("u'").strip("'"))
            else:
                pargs[key] = var
        kwargs.update(pargs)
        data = data.dropna()
        data.plot(ax=ax, **kwargs)
        if labels:
            ax.lines[-1].set_label(' '.join(labels))

    # build unique legend handles and labels
    handles, labels = ax.get_legend_handles_labels()
    handles, labels = np.array(handles), np.array(labels)
    _, idx = np.unique(labels, return_index=True)
    handles, labels = handles[idx], labels[idx]
    if legend is not False:
        _add_legend(ax, handles, labels, legend)

    # add default labels if possible
    ax.set_xlabel(x.title())
    units = df.columns.get_level_values('unit').unique()
    units_for_ylabel = len(units) == 1 and x == 'year' and y == 'value'
    ylabel = units[0] if units_for_ylabel else y.title()
    ax.set_ylabel(ylabel)

    # build a default title if possible
    _title = []
    for var in ['model', 'scenario', 'region', 'variable']:
        if var in df.columns.names:
            values = df.columns.get_level_values(var).unique()
            if len(values) == 1:
                _title.append('{}: {}'.format(var, values[0]))
    if title and _title:
        ax.set_title(' '.join(_title))

    return ax, handles, labels