Пример #1
0
def verify_index_integrity(df):
    """Check that the index of `df` contains no duplicate entries.

    Arguments
    ---------
    df : Union[pd.DataFrame, pd.Series, pd.Index]

    Raises
    ------
    ValueError
        If the index has overlapping (duplicated) values.
    """
    # accept either an index directly or any pandas object carrying one
    if isinstance(df, pd.Index):
        idx = df
    else:
        idx = df.index

    if idx.is_unique:
        return

    # report each overlapping index entry exactly once
    duplicated = idx[idx.duplicated()].unique()
    raise_data_error("Timeseries data has overlapping values",
                     duplicated.to_frame(index=False))
Пример #2
0
def swap_time_for_year(df, inplace, subannual=False):
    """Internal implementation to swap 'time' domain to 'year' (as int)

    Arguments
    ---------
    df : IamDataFrame (presumably -- confirm against callers)
        Object in the datetime domain, i.e. `df.time_col == "time"`.
    inplace : bool
        If True, modify `df` directly; otherwise operate on a copy.
    subannual : bool, str or callable, optional
        If True, add a "subannual" index level formatted as "%m-%d %H:%M%z";
        if a str, use it as `strftime` format for that level;
        if a callable, apply it to each timestamp to derive the level values.

    Raises
    ------
    ValueError
        If the time domain is not datetime, or (via `raise_data_error`)
        if dropping sub-year resolution creates duplicate index entries.
    """
    # only valid when the object is in the datetime ('time') domain
    if not df.time_col == "time":
        raise ValueError("Time domain must be datetime to use this method")

    ret = df.copy() if not inplace else df

    index = ret._data.index

    # extract the datetime level; compute the target level order with
    # 'time' replaced by 'year' at the same position
    time = pd.Series(index.get_level_values("time"))
    order = [v if v != "time" else "year" for v in index.names]

    # drop the datetime level and append an integer 'year' level instead
    index = index.droplevel("time")
    index = append_index_col(index,
                             time.apply(lambda x: x.year),
                             "year",
                             order=order)

    if subannual:
        # if subannual is True, default to simple datetime format without year
        if subannual is True:
            subannual = "%m-%d %H:%M%z"
        if isinstance(subannual, str):
            # a string is interpreted as a strftime format
            _subannual = time.apply(lambda x: x.strftime(subannual))
        else:
            # otherwise assume a callable mapping each timestamp to a value
            _subannual = time.apply(subannual)

        index = append_index_col(index, _subannual, "subannual")
        ret.extra_cols.append("subannual")

    # collapsing datetimes to years can create duplicate rows -> error out
    rows = index.duplicated()
    if any(rows):
        error_msg = "Swapping time for year causes duplicates in `data`"
        raise_data_error(error_msg,
                         index[rows].to_frame().reset_index(drop=True))

    # assign data and other attributes
    ret._data.index = index
    ret.time_col = "year"
    ret._set_attributes()
    delattr(ret, "time")

    if not inplace:
        return ret
Пример #3
0
def reshape_mpl(df, x, y, idx_cols, **kwargs):
    """Reshape data from long form to "bar plot form".

    Matplotlib requires x values as the index with one column for bar grouping.
    Table values come from y values.
    """
    # ensure the x dimension is part of the index columns
    cols = to_list(idx_cols)
    if x not in cols:
        cols += [x]

    # ambiguous (duplicated) rows cannot be plotted -> error out
    duplicated = df[cols].duplicated()
    if duplicated.any():
        raise_data_error("Duplicates in plot data", df.loc[duplicated, cols])

    # pivot to "bar plot form": x becomes the index after transposing
    df = df.set_index(cols)[y].unstack(x).T

    # reorder each dimension given as a keyword argument
    for key, requested in kwargs.items():
        level = None
        if df.columns.name == key:  # single-dimension index
            axis = "columns"
            candidates = df.columns.values
        elif df.index.name == key:  # single-dimension index
            axis = "index"
            candidates = list(df.index)
        elif key in df.columns.names:  # several dimensions -> pd.MultiIndex
            axis = "columns"
            candidates = get_index_levels(df.columns, key)
            level = key
        else:
            raise ValueError(f"No dimension {key} in the data!")

        # if not given, determine order based on run control (if possible)
        if requested is None and key in run_control()["order"]:
            # select relevant items from run control, then add other cols
            preferred = run_control()["order"][key]
            requested = [i for i in preferred if i in candidates]
            requested += [i for i in candidates if i not in requested]

        df = df.reindex(**{axis: requested, "level": level})

    return df
Пример #4
0
            id_vars=index + REQUIRED_COLS + extra_cols,
            var_name=time_col,
            value_vars=melt_cols,
            value_name="value",
        )

    # cast value column to numeric and drop nan
    try:
        df["value"] = pd.to_numeric(df["value"])
    except ValueError as e:
        # get the row number where the error happened
        row_nr_regex = re.compile(r"(?<=at position )\d+")
        row_nr = int(row_nr_regex.search(str(e)).group())
        short_error_regex = re.compile(r".*(?= at position \d*)")
        short_error = short_error_regex.search(str(e)).group()
        raise_data_error(f"{short_error} in `data`", df.iloc[[row_nr]])

    df.dropna(inplace=True, subset=["value"])

    # replace missing units by an empty string for user-friendly filtering
    df.loc[df.unit.isnull(), "unit"] = ""

    # verify that there are no nan's left (in columns)
    null_rows = df.isnull().T.any()
    if null_rows.any():
        cols = ", ".join(df.columns[df.isnull().any().values])
        raise_data_error(f"Empty cells in `data` (columns: '{cols}')",
                         df.loc[null_rows])
    del null_rows

    # cast to pd.Series, check for duplicates