def verify_index_integrity(df):
    """Check that the (time-series) index has no duplicate entries.

    Arguments
    ---------
    df : Union[pd.DataFrame, pd.Series, pd.Index]
        Object whose index (or the index itself) is checked for uniqueness.

    Raises
    ------
    ValueError
        Via :func:`raise_data_error` when duplicate index entries exist.
    """
    # accept either an index directly or any pandas object carrying one
    idx = df if isinstance(df, pd.Index) else df.index
    if idx.is_unique:
        return
    duplicates = idx[idx.duplicated()].unique()
    raise_data_error(
        "Timeseries data has overlapping values", duplicates.to_frame(index=False)
    )
def swap_time_for_year(df, inplace, subannual=False):
    """Internal implementation to swap 'time' domain to 'year' (as int)

    Parameters
    ----------
    df : object
        Timeseries container with `time_col`, `_data`, `extra_cols` and a
        `copy()` method (presumably an IamDataFrame — TODO confirm at caller).
    inplace : bool
        If True, modify `df` directly; otherwise operate on a copy.
    subannual : bool, str or callable, optional
        If truthy, keep sub-annual resolution as an extra 'subannual' level:
        True uses the default format "%m-%d %H:%M%z", a str is passed to
        `strftime`, any other callable is applied to each timestamp.

    Raises
    ------
    ValueError
        If the time domain is not datetime (i.e. `time_col` is not "time").
    """
    # only a datetime-domain object (time_col == "time") can be swapped
    if not df.time_col == "time":
        raise ValueError("Time domain must be datetime to use this method")
    ret = df.copy() if not inplace else df
    index = ret._data.index
    # extract the 'time' level as a Series so `.apply` can be used below
    time = pd.Series(index.get_level_values("time"))
    # target level order: same as before, with 'time' replaced by 'year'
    order = [v if v != "time" else "year" for v in index.names]
    index = index.droplevel("time")
    # insert the integer year level at the position 'time' occupied
    index = append_index_col(index, time.apply(lambda x: x.year), "year", order=order)
    if subannual:
        # if subannual is True, default to simple datetime format without year
        if subannual is True:
            subannual = "%m-%d %H:%M%z"
        if isinstance(subannual, str):
            # treat a string as a strftime format
            _subannual = time.apply(lambda x: x.strftime(subannual))
        else:
            # otherwise assume a callable mapping timestamp -> label
            _subannual = time.apply(subannual)
        index = append_index_col(index, _subannual, "subannual")
        ret.extra_cols.append("subannual")
    # collapsing time to year may merge rows that differed only sub-annually
    rows = index.duplicated()
    if any(rows):
        error_msg = "Swapping time for year causes duplicates in `data`"
        raise_data_error(error_msg, index[rows].to_frame().reset_index(drop=True))
    # assign data and other attributes
    ret._data.index = index
    ret.time_col = "year"
    # NOTE(review): `_set_attributes` presumably refreshes dimension
    # attributes from the new index — confirm against the class definition
    ret._set_attributes()
    # the 'time' attribute no longer applies after the swap
    delattr(ret, "time")
    if not inplace:
        return ret
def reshape_mpl(df, x, y, idx_cols, **kwargs):
    """Reshape data from long form to "bar plot form".

    Matplotlib requires x values as the index with one column for bar grouping.
    Table values come from y values.

    Parameters
    ----------
    df : pd.DataFrame
        Long-form data containing columns `x`, `y` and `idx_cols`.
    x : str
        Column whose values become the columns of the reshaped table.
    y : str
        Column providing the table values.
    idx_cols : str or list of str
        Columns forming the index of the intermediate pivot.
    **kwargs
        Dimension name -> desired ordering (or None to use run control order).

    Raises
    ------
    ValueError
        If a keyword refers to a dimension not present in the reshaped data.
    """
    idx_cols = to_list(idx_cols)
    # `x` must be part of the index so it survives the pivot below
    if x not in idx_cols:
        idx_cols += [x]
    # check for duplicates
    rows = df[idx_cols].duplicated()
    if any(rows):
        raise_data_error("Duplicates in plot data", df.loc[rows, idx_cols])
    # reshape the data: pivot `x` into columns, then transpose so that
    # `x` ends up as the index (matplotlib bar-plot layout)
    df = df.set_index(idx_cols)[y].unstack(x).T
    # reindex to get correct order
    for key, value in kwargs.items():
        level = None
        if df.columns.name == key:  # single-dimension index
            axis, _values = "columns", df.columns.values
        elif df.index.name == key:  # single-dimension index
            axis, _values = "index", list(df.index)
        elif key in df.columns.names:  # several dimensions -> pd.MultiIndex
            axis, _values = "columns", get_index_levels(df.columns, key)
            level = key
        else:
            raise ValueError(f"No dimension {key} in the data!")
        # if not given, determine order based on run control (if possible)
        if value is None and key in run_control()["order"]:
            # select relevant items from run control, then add other cols
            value = [i for i in run_control()["order"][key] if i in _values]
            value += [i for i in _values if i not in value]
        # `level` is None for flat indexes; pandas then reindexes the axis
        df = df.reindex(**{axis: value, "level": level})
    return df
id_vars=index + REQUIRED_COLS + extra_cols, var_name=time_col, value_vars=melt_cols, value_name="value", ) # cast value column to numeric and drop nan try: df["value"] = pd.to_numeric(df["value"]) except ValueError as e: # get the row number where the error happened row_nr_regex = re.compile(r"(?<=at position )\d+") row_nr = int(row_nr_regex.search(str(e)).group()) short_error_regex = re.compile(r".*(?= at position \d*)") short_error = short_error_regex.search(str(e)).group() raise_data_error(f"{short_error} in `data`", df.iloc[[row_nr]]) df.dropna(inplace=True, subset=["value"]) # replace missing units by an empty string for user-friendly filtering df.loc[df.unit.isnull(), "unit"] = "" # verify that there are no nan's left (in columns) null_rows = df.isnull().T.any() if null_rows.any(): cols = ", ".join(df.columns[df.isnull().any().values]) raise_data_error(f"Empty cells in `data` (columns: '{cols}')", df.loc[null_rows]) del null_rows # cast to pd.Series, check for duplicates