Example #1
def test_cumulative_out_of_range():
    # set logger level to exclude warnings in unit test output
    logger().setLevel('ERROR')
    # note that the series is not ordered and the index is defined as float
    y = pd.Series(data=[np.nan, 1, 3, 1], index=[2002., 2005., 2007., 2013.])
    assert cumulative(y, 2008, 2015) is np.nan
    logger().setLevel('NOTSET')
Example #2
    def validate(self, criteria={}, exclude_on_fail=False):
        """Validate scenarios using criteria on timeseries values

        Parameters
        ----------
        criteria: dict
            dictionary with variable keys and check values
            ('up' and 'lo' for respective bounds, 'year' for years)
        exclude_on_fail: bool, default False
            flag scenarios failing validation as `exclude: True`
        """
        df = _apply_criteria(self.data, criteria, in_range=False)

        if exclude_on_fail:
            idx = _meta_idx(df)
            self.meta.loc[idx, 'exclude'] = True

        if not df.empty:
            msg = '{} of {} data points do not satisfy the criteria'
            logger().info(msg.format(len(df), len(self.data)))

            if exclude_on_fail and len(idx) > 0:
                logger().info('{} non-valid scenario{} will be excluded'
                              .format(len(idx), '' if len(idx) == 1 else 's'))

            return df
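
A minimal usage sketch for validate(), assuming `df` is a pyam.IamDataFrame that already holds timeseries data (the variable name and bound below are illustrative, not from the source):

failed = df.validate(
    criteria={'Primary Energy': {'up': 500, 'year': 2030}},  # illustrative criteria
    exclude_on_fail=True,
)
if failed is not None:  # returns None when all data points satisfy the criteria
    print(failed.head())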
Example #3
def _add_legend(ax, handles, labels, legend):
    if legend is None and len(labels) >= MAX_LEGEND_LABELS:
        logger().info(
            '>={} labels, not applying legend'.format(MAX_LEGEND_LABELS))
    else:
        legend = {} if legend in [True, None] else legend
        ax.legend(handles, labels, **legend)
Example #4
    def connect(self, name):
        # TODO: deprecate in next release
        if name == 'iamc15':
            warnings.warn(
                'The name `iamc15` is deprecated and will be removed in the ' +
                'next release. Please use `IXSE_SR15`.')
            name = 'IXSE_SR15'

        valid = self.valid_connections
        if len(valid) == 0:
            raise RuntimeError(
                'No valid connections found for the provided credentials.')

        if name not in valid:
            msg = """
            {} not recognized as a valid connection name.
            Choose from one of the supported connections for your user: {}.
            """
            raise ValueError(msg.format(name, valid))

        url = '/'.join([_BASE_URL, 'applications', name, 'config'])
        headers = {'Authorization': 'Bearer {}'.format(self._token)}
        r = requests.get(url, headers=headers)
        _check_response(r, 'Could not get application information')
        response = r.json()
        idxs = {x['path']: i for i, x in enumerate(response)}

        self._base_url = response[idxs['baseUrl']]['value']
        # TODO: request the full citation to be added to this metadata instead
        # of linking to the about page
        about = '/'.join([response[idxs['uiUrl']]['value'], '#', 'about'])
        logger().info(_CITE_MSG.format(name, about))

        self._connected = name
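
A hedged sketch of calling connect(); the enclosing Connection class and its constructor are not shown in this snippet, so the instantiation below is an assumption:

conn = Connection()        # assumed: a connection object holding valid credentials
conn.connect('IXSE_SR15')  # resolves the application config and logs the citation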
Example #5
def format_data(df):
    """Convert an imported dataframe and check all required columns"""

    # format columns to lower-case and check that all required columns exist
    df.rename(columns={c: str(c).lower() for c in df.columns}, inplace=True)
    if not set(IAMC_IDX).issubset(set(df.columns)):
        missing = list(set(IAMC_IDX) - set(df.columns))
        raise ValueError("missing required columns `{}`!".format(missing))

    if 'notes' in df.columns:
        logger().info('Ignoring notes column in dataframe')
        df.drop(columns='notes', inplace=True)
        df = df[~df.model.str.contains('database', case=False)]

    # check whether data in IAMC style or year/value layout
    if 'value' not in df.columns:
        numcols = sorted(set(df.columns) - set(IAMC_IDX))
        df = pd.melt(df,
                     id_vars=IAMC_IDX,
                     var_name='year',
                     value_vars=numcols,
                     value_name='value')
    df['year'] = pd.to_numeric(df['year'])

    # drop NaN's
    df.dropna(inplace=True)

    # sort data
    df.sort_values(SORT_IDX, inplace=True)

    return df
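
A short sketch of the wide-to-long conversion performed above, assuming IAMC_IDX is the usual ['model', 'scenario', 'region', 'variable', 'unit'] list used by pyam:

import pandas as pd

# wide (IAMC-style) input with one column per year
wide = pd.DataFrame(
    [['model_a', 'scen_a', 'World', 'Primary Energy', 'EJ/yr', 1.0, 2.0]],
    columns=['model', 'scenario', 'region', 'variable', 'unit', 2005, 2010])
data_long = format_data(wide)  # melts the year columns into 'year'/'value'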
Example #6
def format_data(df):
    """Convert an imported dataframe and check all required columns"""
    # all lower case
    df.rename(columns={c: str(c).lower() for c in df.columns}, inplace=True)

    if 'notes' in df.columns:  # this came from the database
        logger().info('Ignoring notes column in dataframe')
        df.drop(columns='notes', inplace=True)
        col = df.columns[0]  # first column has database copyright notice
        df = df[~df[col].str.contains('database', case=False)]
Example #7
    def check_aggregate(self,
                        variable,
                        components=None,
                        units=None,
                        exclude_on_fail=False,
                        multiplier=1,
                        **kwargs):
        """Check whether the timeseries data match the aggregation
        of components or sub-categories

        Parameters
        ----------
        variable: str
            variable to be checked for matching aggregation of sub-categories
        components: list of str, default None
            list of variables, defaults to all sub-categories of `variable`
        units: str or list of str, default None
            filter variable and components for given unit(s)
        exclude_on_fail: boolean, default False
            flag scenarios failing validation as `exclude: True`
        multiplier: number, default 1
            factor when comparing variable and sum of components
        kwargs: passed to `np.isclose()`
        """
        # default components to all variables one level below `variable`
        if components is None:
            components = self.filter(variable='{}|*'.format(variable),
                                     level=0).variables()

        if not len(components):
            msg = '{} - cannot check aggregate because it has no components'
            logger().info(msg.format(variable))

            return

        # filter and groupby data, use `pd.Series.align` for matching index
        df_variable, df_components = (_aggregate_by_variables(
            self.data, variable,
            units).align(_aggregate_by_variables(self.data, components,
                                                 units)))

        # use `np.isclose` for checking match
        diff = df_variable[~np.isclose(df_variable, multiplier *
                                       df_components, **kwargs)]

        if len(diff):
            msg = '{} - {} of {} data points are not aggregates of components'
            logger().info(msg.format(variable, len(diff), len(df_variable)))

            if exclude_on_fail:
                self._exclude_on_fail(diff.index.droplevel([2, 3]))

            diff = pd.concat([diff], keys=[variable], names=['variable'])

            return diff.unstack().rename_axis(None, axis=1)
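
A usage sketch, assuming `df` is a pyam.IamDataFrame that reports a 'Primary Energy' total and its sub-categories (the variable name and tolerance are illustrative):

diff = df.check_aggregate('Primary Energy', rtol=1e-3)  # extra kwargs go to np.isclose
if diff is not None:  # returns None when all data points match the component sum
    print(diff)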
Example #8
def cumulative(x, first_year, last_year):
    """Returns the cumulative sum of a timeseries (indexed over years),
    implements linear interpolation between years, ignores nan's in the range.
    The function includes the last-year value of the series, and
    raises a warning and returns nan if first_year or last_year is
    outside of the timeseries range.

    Parameters
    ----------
    x: pandas.Series
        a timeseries to be summed over time
    first_year: int
        first year of the sum
    last_year: int
        last year of the sum (inclusive)
    """
    # if the timeseries does not cover the range `[first_year, last_year]`,
    # return nan to avoid erroneous aggregation
    if min(x.index) > first_year:
        logger().warning('the timeseries `{}` does not start by {}'.format(
            x.name or x, first_year))
        return np.nan
    if max(x.index) < last_year:
        logger().warning('the timeseries `{}` does not extend until {}'.format(
            x.name or x, last_year))
        return np.nan

    # cast timeseries columns to `int` if necessary
    if not x.index.dtype == 'int64':
        cast_years_to_int(x, index=True)

    x[first_year] = fill_series(x, first_year)
    x[last_year] = fill_series(x, last_year)

    years = [
        i for i in x.index
        if i >= first_year and i <= last_year and ~np.isnan(x[i])
    ]
    years.sort()

    # loop over years
    if not np.isnan(x[first_year]) and not np.isnan(x[last_year]):
        value = 0
        for (i, yr) in enumerate(years[:-1]):
            next_yr = years[i + 1]
            # the summation is shifted to include the first year fully in sum,
            # otherwise, would return a weighted average of `yr` and `next_yr`
            value += ((next_yr - yr - 1) * x[next_yr] +
                      (next_yr - yr + 1) * x[yr]) / 2

        # the loop above does not include the last element in range
        # (`last_year`), therefore added explicitly
        value += x[last_year]

        return value
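
A small sketch of calling cumulative() directly on a year-indexed pandas Series (the values are illustrative):

import numpy as np
import pandas as pd

y = pd.Series([1.0, 2.0, 3.0], index=[2005, 2010, 2015], name='Primary Energy')
total = cumulative(y, first_year=2005, last_year=2015)  # interpolates between years
assert not np.isnan(total)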
Example #9
def _add_legend(ax, handles, labels, legend):
    if legend is None and len(labels) >= MAX_LEGEND_LABELS:
        logger().info('>={} labels, not applying legend'.format(
            MAX_LEGEND_LABELS))
    else:
        legend = {} if legend in [True, None] else legend
        loc = legend.pop('loc', 'best')
        outside = loc.split(' ')[1] if loc.startswith('outside ') else False
        _legend = OUTSIDE_LEGEND[outside] if outside else dict(loc=loc)
        _legend.update(legend)
        ax.legend(handles, labels, **_legend)
Example #10
def read_file(fname, *args, **kwargs):
    """Read data from a file saved in the standard IAMC format
    or a table with year/value columns
    """
    if not isstr(fname):
        raise ValueError('reading multiple files not supported, '
                         'please use `pyam.IamDataFrame.append()`')
    logger().info('Reading `{}`'.format(fname))
    format_kwargs = {}
    # extract kwargs that are intended for `format_data`
    for c in [i for i in IAMC_IDX + ['year', 'time', 'value'] if i in kwargs]:
        format_kwargs[c] = kwargs.pop(c)
    return format_data(read_pandas(fname, *args, **kwargs), **format_kwargs)
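
A usage sketch; the file name is a placeholder:

data = read_file('scenario_data.xlsx')  # returns data in long year/value format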
Example #11
    def categorize(self,
                   name,
                   value,
                   criteria,
                   color=None,
                   marker=None,
                   linestyle=None):
        """Assign scenarios to a category according to specific criteria
        or display the category assignment

        Parameters
        ----------
        name: str
            category column name
        value: str
            category identifier
        criteria: dict
            dictionary with variables mapped to applicable checks
            ('up' and 'lo' for respective bounds, 'year' for years - optional)
        color: str
            assign a color to this category for plotting
        marker: str
            assign a marker to this category for plotting
        linestyle: str
            assign a linestyle to this category for plotting
        """
        # add plotting run control
        for kind, arg in [('color', color), ('marker', marker),
                          ('linestyle', linestyle)]:
            if arg:
                run_control().update({kind: {name: {value: arg}}})

        # find all data that matches categorization
        rows = _apply_criteria(self.data,
                               criteria,
                               in_range=True,
                               return_test='all')
        idx = _meta_idx(rows)

        if len(idx) == 0:
            logger().info("No scenarios satisfy the criteria")
            return  # EXIT FUNCTION

        # update metadata dataframe
        self._new_meta_column(name, value)
        self.meta.loc[idx, name] = value
        msg = '{} scenario{} categorized as `{}: {}`'
        logger().info(
            msg.format(len(idx), '' if len(idx) == 1 else 's', name, value))
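
A usage sketch for categorize(), assuming `df` is a pyam.IamDataFrame (the category name, variable and bound are illustrative):

df.categorize(
    name='warming-category', value='below 2C',
    criteria={'Temperature': {'up': 2.0, 'year': 2100}},  # illustrative criteria
    color='blue',
)
print(df.meta['warming-category'].value_counts())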
Example #12
def read_files(fnames, *args, **kwargs):
    """Read data from a snapshot file saved in the standard IAMC format
    or a table with year/value columns
    """
    if isstr(fnames):
        fnames = [fnames]

    fnames = itertools.chain(*[glob.glob(f) for f in fnames])
    dfs = []
    for fname in fnames:
        logger().info('Reading `{}`'.format(fname))
        df = read_pandas(fname, *args, **kwargs)
        dfs.append(format_data(df))

    return pd.concat(dfs)
Example #13
    def load_metadata(self, path, *args, **kwargs):
        """Load metadata exported from `pyam.IamDataFrame` instance

        Parameters
        ----------
        path: string
            xlsx file with metadata exported from `pyam.IamDataFrame` instance
        """
        if not os.path.exists(path):
            raise ValueError("no metadata file '" + path + "' found!")

        if path.endswith('csv'):
            df = pd.read_csv(path, *args, **kwargs)
        else:
            xl = pd.ExcelFile(path)
            if len(xl.sheet_names) > 1 and 'sheet_name' not in kwargs:
                kwargs['sheet_name'] = 'meta'
            df = pd.read_excel(path, *args, **kwargs)

        req_cols = ['model', 'scenario', 'exclude']
        if not set(req_cols).issubset(set(df.columns)):
            e = 'File `{}` does not have required columns ({})!'
            raise ValueError(e.format(path, req_cols))

        # set index, filter to relevant scenarios from imported metadata file
        df.set_index(META_IDX, inplace=True)
        idx = self.meta.index.intersection(df.index)

        n_invalid = len(df) - len(idx)
        if n_invalid > 0:
            msg = 'Ignoring {} scenario{} from imported metadata'
            logger().info(msg.format(n_invalid, 's' if n_invalid > 1 else ''))

        if idx.empty:
            raise ValueError('No valid scenarios in imported metadata file!')

        df = df.loc[idx]

        # Merge in imported metadata
        msg = 'Importing metadata for {} scenario{} (for total of {})'
        logger().info(msg.format(len(df), 's' if len(df) > 1 else '',
                                 len(self.meta)))

        for col in df.columns:
            self._new_meta_column(col)
            self.meta[col] = df[col].combine_first(self.meta[col])
        # set column `exclude` to bool
        self.meta.exclude = self.meta.exclude.astype('bool')
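
A usage sketch; 'meta.xlsx' stands in for a file previously exported from a pyam.IamDataFrame:

df.load_metadata('meta.xlsx')  # expects 'model', 'scenario' and 'exclude' columns
print(df.meta['exclude'].sum(), 'scenarios flagged as excluded')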
Example #14
    def __init__(self, name):
        """
        Parameters
        ----------
        name : str
            A valid database name. For available options, see
            valid_connection_names().
        """
        valid = valid_connection_names()
        if name not in valid:
            raise ValueError('{} is not a valid name. Choose one of {}'.format(
                name, valid))

        logger().info(
            'You are connected to the {} {}. Please cite as:\n\n{}'.format(
                name, 'scenario explorer', _CITATIONS[name]))

        self.base_url = _URL_TEMPLATE.format(name)
Example #15
    def require_variable(self,
                         variable,
                         unit=None,
                         year=None,
                         exclude_on_fail=False):
        """Check whether all scenarios have a required variable

        Parameters
        ----------
        variable: str
            required variable
        unit: str, default None
            name of unit (optional)
        year: int or list, default None
            years (optional)
        exclude_on_fail: bool, default False
            flag scenarios missing the required variables as `exclude: True`
        """
        criteria = {'variable': variable}
        if unit:
            criteria.update({'unit': unit})
        if year:
            criteria.update({'year': year})

        keep = _apply_filters(self.data, self.meta, criteria)
        idx = self.meta.index.difference(_meta_idx(self.data[keep]))

        n = len(idx)
        if n == 0:
            logger().info(
                'All scenarios have the required variable `{}`'.format(
                    variable))
            return

        msg = '{} scenario does not include required variable `{}`' if n == 1 \
            else '{} scenarios do not include required variable `{}`'

        if exclude_on_fail:
            self.meta.loc[idx, 'exclude'] = True
            msg += ', marked as `exclude: True` in metadata'

        logger().info(msg.format(n, variable))
        return pd.DataFrame(index=idx).reset_index()
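
A usage sketch, assuming `df` is a pyam.IamDataFrame (the variable and year are illustrative):

missing = df.require_variable('Primary Energy', year=2050, exclude_on_fail=True)
if missing is not None:  # returns None when all scenarios report the variable
    print(missing)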
Example #16
    def filter(self, filters=None, keep=True, inplace=False, **kwargs):
        """Return a filtered IamDataFrame (i.e., a subset of current data)

        Parameters
        ----------
        keep: bool, default True
            keep all scenarios satisfying the filters (if True) or the inverse
        inplace: bool, default False
            if True, do operation inplace and return None
        filters by kwargs or dict (deprecated):
            The following columns are available for filtering:
             - metadata columns: filter by category assignment in metadata
             - 'model', 'scenario', 'region', 'variable', 'unit':
               string or list of strings, where ``*`` can be used as a wildcard
             - 'level': the maximum "depth" of IAM variables (number of '|')
               (excluding the strings given in the 'variable' argument)
             - 'year': takes an integer, a list of integers or a range
                note that the last year of a range is not included,
                so ``range(2010,2015)`` is interpreted as ``[2010, ..., 2014]``
             - 'regexp=True' overrides pseudo-regexp syntax in `pattern_match()`
        """
        if filters is not None:
            warnings.warn(
                '`filters` keyword argument in filter() is deprecated and '
                'will be removed in the next release')
            kwargs.update(filters)

        _keep = _apply_filters(self.data, self.meta, kwargs)
        _keep = _keep if keep else ~_keep
        ret = copy.deepcopy(self) if not inplace else self
        ret.data = ret.data[_keep]

        idx = pd.MultiIndex.from_tuples(
            pd.unique(list(zip(ret.data['model'], ret.data['scenario']))),
            names=('model', 'scenario')
        )
        if len(idx) == 0:
            logger().warning('Filtered IamDataFrame is empty!')

        ret.meta = ret.meta.loc[idx]
        if not inplace:
            return ret
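
A usage sketch of the keyword-based filtering, assuming `df` is a pyam.IamDataFrame (variable and region names are illustrative):

world_energy = df.filter(region='World', variable='Primary Energy|*', level=0)
recent = df.filter(year=range(2010, 2015))  # 2010 through 2014, per the docstring
non_world = df.filter(region='World', keep=False)  # inverse selection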
Example #17
    def validate(self, criteria={}, exclude_on_fail=False):
        """Validate scenarios using criteria on timeseries values

        Parameters
        ----------
        criteria: dict
            dictionary with variable keys and check values
            ('up' and 'lo' for respective bounds, 'year' for years)
        exclude_on_fail: bool, default False
            flag scenarios failing validation as `exclude: True`
        """
        df = _apply_criteria(self.data, criteria, in_range=False)

        if not df.empty:
            msg = '{} of {} data points do not satisfy the criteria'
            logger().info(msg.format(len(df), len(self.data)))

            if exclude_on_fail and len(df) > 0:
                self._exclude_on_fail(df)

            return df
Example #18
    def _exclude_on_fail(self, df):
        """Assign a selection of scenarios as `exclude: True` in meta"""
        idx = df if isinstance(df, pd.MultiIndex) else _meta_idx(df)
        self.meta.loc[idx, 'exclude'] = True
        logger().info('{} non-valid scenario{} will be excluded'.format(
            len(idx), '' if len(idx) == 1 else 's'))
Example #19
def test_context_adjust_log_level():
    assert logger().getEffectiveLevel() == 20
    with adjust_log_level():
        assert logger().getEffectiveLevel() == 40
    assert logger().getEffectiveLevel() == 20
Example #20
    def map_regions(self, map_col, agg=None, copy_col=None, fname=None,
                    region_col=None, remove_duplicates=False, inplace=False):
        """Plot regional data for a single model, scenario, variable, and year

        see pyam.plotting.region_plot() for all available options

        Parameters
        ----------
        map_col: string
            The column used to map new regions to. Common examples include
            iso and 5_region.
        agg: string, optional
            Perform a data aggregation. Options include: sum.
        copy_col: string, optional
            Copy the existing region data into a new column for later use.
        fname: string, optional
            Use a non-default region mapping file
        region_col: string, optional
            Use a non-default column name for regions to map from.
        remove_duplicates: bool, optional, default: False
            If there are duplicates in the mapping from one regional level to
            another, then remove these duplicates by counting the most common
            mapped value.
            This option is most useful when mapping from high resolution
            (e.g., model regions) to low resolution (e.g., 5_region).
        inplace : bool, default False
            if True, do operation inplace and return None
        """
        models = self.meta.index.get_level_values('model').unique()
        fname = fname or run_control()['region_mapping']['default']
        mapping = read_pandas(fname).rename(str.lower, axis='columns')
        map_col = map_col.lower()

        ret = copy.deepcopy(self) if not inplace else self
        _df = ret.data
        columns_ordered = _df.columns

        # merge data
        dfs = []
        for model in models:
            df = _df[_df['model'] == model]
            _col = region_col or '{}.REGION'.format(model)
            _map = mapping.rename(columns={_col.lower(): 'region'})
            _map = _map[['region', map_col]].dropna().drop_duplicates()
            _map = _map[_map['region'].isin(_df['region'])]
            if remove_duplicates and _map['region'].duplicated().any():
                # find duplicates
                where_dup = _map['region'].duplicated(keep=False)
                dups = _map[where_dup]
                logger().warning("""
                Duplicate entries found for the following regions.
                Mapping will occur only for the most common instance.
                {}""".format(dups['region'].unique()))
                # get non duplicates
                _map = _map[~where_dup]
                # order duplicates by the count frequency
                dups = (dups
                        .groupby(['region', map_col])
                        .size()
                        .reset_index(name='count')
                        .sort_values(by='count', ascending=False)
                        .drop('count', axis=1))
                # take top occurrence
                dups = dups[~dups['region'].duplicated(keep='first')]
                # combine them back
                _map = pd.concat([_map, dups])
            if copy_col is not None:
                df[copy_col] = df['region']

            df = (df
                  .merge(_map, on='region')
                  .drop('region', axis=1)
                  .rename(columns={map_col: 'region'})
                  )
            dfs.append(df)
        df = pd.concat(dfs)

        # perform aggregations
        if agg == 'sum':
            df = df.groupby(LONG_IDX).sum().reset_index()

        ret.data = (df
                    .reindex(columns=columns_ordered)
                    .sort_values(SORT_IDX)
                    .reset_index(drop=True)
                    )
        if not inplace:
            return ret
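
A usage sketch, assuming `df` is a pyam.IamDataFrame and that the default region-mapping file configured in run_control() provides 'iso' and '5_region' columns:

# map native model regions to ISO codes, keeping the original names in a new column
mapped = df.map_regions('iso', copy_col='model_region', remove_duplicates=True)

# aggregate to five macro-regions, summing values per region
macro = df.map_regions('5_region', agg='sum')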
Example #21
    def check_aggregate_regions(self, variable, region='World',
                                components=None, units=None,
                                exclude_on_fail=False, **kwargs):
        """Check whether the region timeseries data match the aggregation
        of components

        Parameters
        ----------
        variable: str
            variable to be checked for matching aggregation of components data
        region: str
            region to be checked for matching aggregation of components data
        components: list of str, default None
            list of regions, defaults to all regions except region
        units: str or list of str, default None
            filter variable and components for given unit(s)
        exclude_on_fail: boolean, default False
            flag scenarios failing validation as `exclude: True`
        kwargs: passed to `np.isclose()`
        """
        var_df = self.filter(variable=variable, level=0)

        if components is None:
            components = list(set(var_df.data.region) - set([region]))

        if not len(components):
            msg = (
                'cannot check regional aggregate for `{}` because it has no '
                'regional components'
            )
            logger().info(msg.format(variable))

            return None

        # filter and groupby data, use `pd.Series.align` for matching index
        df_region, df_components = (
            _aggregate_by_regions(var_df.data, region, units)
            .align(_aggregate_by_regions(var_df.data, components, units))
        )

        df_components.index = df_components.index.droplevel(
            "variable"
        )

        # Add in variables that are included in region totals but which
        # aren't included in the regional components.
        # For example, if we are looking at World and Emissions|BC, we need
        # to add aviation and shipping to the sum of Emissions|BC for each
        # of World's regional components to do a valid check.
        different_region = components[0]
        var_list = pd.Series(self.data.variable.unique())
        var_components = var_list[pattern_match(var_list,
                                                '{}|*'.format(variable), 0)]
        for var_to_add in var_components:
            var_rows = self.data.variable == var_to_add
            region_rows = self.data.region == different_region
            var_has_regional_info = (var_rows & region_rows).any()
            if not var_has_regional_info:
                df_var_to_add = self.filter(
                    region=region, variable=var_to_add
                ).data.groupby(REGION_IDX).sum()['value']
                df_var_to_add.index = df_var_to_add.index.droplevel("variable")

                if len(df_var_to_add):
                    df_components = df_components.add(df_var_to_add,
                                                      fill_value=0)

        df_components = pd.concat([df_components], keys=[variable],
                                  names=['variable'])

        # use `np.isclose` for checking match
        diff = df_region[~np.isclose(df_region, df_components, **kwargs)]

        if len(diff):
            msg = (
                '{} - {} of {} data points are not aggregates of regional '
                'components'
            )
            logger().info(msg.format(variable, len(diff), len(df_region)))

            if exclude_on_fail:
                self._exclude_on_fail(diff.index.droplevel([2, 3]))

            diff = pd.concat([diff], keys=[region], names=['region'])

            return diff.unstack().rename_axis(None, axis=1)
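
A usage sketch, assuming `df` is a pyam.IamDataFrame reporting 'World' plus regional detail (the variable name is illustrative):

diff = df.check_aggregate_regions('Emissions|BC', region='World', rtol=1e-3)
if diff is not None:  # returns None when the regional components add up
    print(diff)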
Example #22
def format_data(df, **kwargs):
    """Convert a `pd.Dataframe` or `pd.Series` to the required format"""
    if isinstance(df, pd.Series):
        df = df.to_frame()

    # Check for R-style year columns, converting where necessary
    def convert_r_columns(c):
        try:
            first = c[0]
            second = c[1:]
            if first == 'X':
                try:
                    #  bingo! was X2015 R-style, return the integer
                    return int(second)
                except:
                    # nope, not an int, fall down to final return statement
                    pass
        except:
            # not a string/iterable/etc, fall down to final return statement
            pass
        return c
    df.columns = df.columns.map(convert_r_columns)

    # if `value` is given but not `variable`,
    # melt value columns and use column name as `variable`
    if 'value' in kwargs and 'variable' not in kwargs:
        value = kwargs.pop('value')
        value = value if islistable(value) else [value]
        _df = df.set_index(list(set(df.columns) - set(value)))
        dfs = []
        for v in value:
            if v not in df.columns:
                raise ValueError('column `{}` does not exist!'.format(v))
            vdf = _df[v].to_frame().rename(columns={v: 'value'})
            vdf['variable'] = v
            dfs.append(vdf.reset_index())
        df = pd.concat(dfs).reset_index(drop=True)

    # otherwise, rename columns or concat to IAMC-style or do a fill-by-value
    for col, value in kwargs.items():
        if col in df:
            raise ValueError('conflict of kwarg with column `{}` in dataframe!'
                             .format(col))

        if isstr(value) and value in df:
            df.rename(columns={value: col}, inplace=True)
        elif islistable(value) and all([c in df.columns for c in value]):
            df[col] = df.apply(lambda x: concat_with_pipe(x, value), axis=1)
            df.drop(value, axis=1, inplace=True)
        elif isstr(value):
            df[col] = value
        else:
            raise ValueError('invalid argument for casting `{}: {}`'
                             .format(col, value))

    # all lower case
    str_cols = [c for c in df.columns if isstr(c)]
    df.rename(columns={c: str(c).lower() for c in str_cols}, inplace=True)

    if 'notes' in df.columns:  # this came from the database
        logger().info('Ignoring notes column in dataframe')
        df.drop(columns='notes', inplace=True)
        col = df.columns[0]  # first column has database copyright notice
        df = df[~df[col].str.contains('database', case=False)]
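
A sketch of the value/variable handling described above; the column names and metadata below are illustrative, and the snippet assumes the remaining IAMC columns can be passed as keyword arguments:

import pandas as pd

raw = pd.DataFrame({
    'region': ['World', 'World'],
    'year': [2005, 2010],
    'gdp': [50.0, 60.0],  # melted into a 'value' column with variable='gdp'
})
data = format_data(raw, value='gdp', model='model_a', scenario='scen_a',
                   unit='billion USD')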