def compute_ffill_by_group(df, id_cols, reference_cols, value_col):
    """
    Compute ffill with groupby.

    There is a performance issue with a simple groupby/fillna (2017/07)
    - `id_cols` are the columns id to group,
    - `reference_cols` are the other columns used to order,
    - `value_col` is the name of the column to fill,

    Args:
        df (pd.DataFrame):
        id_cols (list(str)):
        reference_cols (list(str)):
        value_col (str):
    """
    check_params_columns_duplicate(id_cols + reference_cols + [value_col])
    df = df.sort_values(by=id_cols + reference_cols)
    df = df.set_index(id_cols)
    # Flag rows that already carry a value: 1 if non-null, 0 if null.
    df['fill'] = 1 - df[value_col].isnull().astype(int)
    # Running count of non-null values within each group. Rows where this
    # count is 0 have seen no value yet in their group, so the global ffill
    # below must not be allowed to leak a previous group's value into them.
    # FIX: group on *all* id_cols levels — the previous
    # `range(0, len(id_cols) - 1)` dropped the last id column (values could
    # leak between groups differing only in that column) and produced an
    # empty level list for a single id column.
    df['fill'] = df.groupby(level=list(range(len(id_cols))))['fill'].cumsum()
    df[value_col] = df[value_col].ffill()
    # Reset rows that preceded the first non-null value of their group.
    df.loc[df['fill'] == 0, value_col] = None
    del df['fill']
    return df.reset_index()
def compute_cumsum(df, id_cols, reference_cols, value_cols, new_value_cols=None, cols_to_keep=None):
    """
    Compute cumsum for a group of columns.

    - `id_cols` are the columns id to create each group,
    - `reference_cols` are the columns to order the cumsum,
    - `value_cols` are the columns to cumsum,
    - `new_value_cols` are the new columns with the result cumsum
    - `cols_to_keep` are other column to keep in the dataframe. This option can
      be used if there is only one row by group [id_cols + reference_cols]

    Args:
        df (pd.DataFrame):
        id_cols (list(str)):
        reference_cols (list(str)):
        value_cols (list(str)):
        new_value_cols (list(str)):
        cols_to_keep (list(str)):
    """
    cols_to_keep = [] if cols_to_keep is None else cols_to_keep
    new_value_cols = value_cols if new_value_cols is None else new_value_cols
    if len(new_value_cols) != len(value_cols):
        raise ParamsValueError('`value_cols` and `new_value_cols` needs '
                               'to have the same number of elements')
    check_params_columns_duplicate(id_cols + reference_cols + cols_to_keep + value_cols)
    # Collapse to one row per [id + reference + kept] combination, then
    # cumsum per id-group: the id columns are the first index levels.
    grouped = df.groupby(id_cols + reference_cols + cols_to_keep).sum()
    id_levels = list(range(len(id_cols)))
    grouped[new_value_cols] = grouped.groupby(level=id_levels)[value_cols].cumsum()
    return grouped.reset_index()
def __compute_evolution(
    df,
    id_cols,
    value_col,
    date_col=None,
    freq=1,
    compare_to=None,
    method='abs',
    format='column',
    offseted_suffix='_offseted',
    evolution_col_name='evolution_computed',
    how='left',
    fillna=None,
    raise_duplicate_error=True,
):
    """
    Compute an evolution column :
        - against a period distant from a fixed frequency.
        - against a part of the df

    Unfortunately, pandas doesn't allow .change() and .pct_change() to be
    executed with a MultiIndex.

    Args:
        df (pd.DataFrame):
        id_cols (list(str)):
        value_col (str):
        date_col (str/dict): default None
        freq (int/pd.DateOffset/pd.Serie): default 1
        compare_to (str): default None
        method (str): default ``'abs'`` can be also ``'pct'``
        format(str): default 'column' can be also 'df'
        offseted_suffix(str): default '_offseted'
        evolution_col_name(str): default 'evolution_computed'
        how(str): default 'left'
        fillna(str/int): default None
    """
    if date_col is not None:
        # A dict date_col declares a selector + optional strptime format;
        # a plain string-typed column also needs parsing to datetime.
        # FIX: `np.object` was removed in NumPy 1.24 — compare dtype against
        # the builtin `object` instead.
        is_date_to_format = isinstance(date_col, dict) or (df[date_col].dtype == object)
        if is_date_to_format:
            if isinstance(date_col, dict):
                date_format = date_col.get('format', None)
                date_col = date_col['selector']
            else:
                date_format = None
            # Parse into a throwaway copy column so the original is untouched.
            df['_' + date_col + '_copy_'] = pd.to_datetime(df[date_col], format=date_format)
            date_col = '_' + date_col + '_copy_'

        is_freq_dict = isinstance(freq, dict)
        if is_freq_dict:
            freq = pd.DateOffset(**{k: int(v) for k, v in freq.items()})

        check_params_columns_duplicate(id_cols + [value_col, date_col])

        # create df_offseted: shift each row's date by `freq` so the merge
        # pairs every row with the value `freq` earlier.
        group_cols = id_cols + [date_col]
        df_offseted = df[group_cols + [value_col]].copy()
        df_offseted[date_col] += freq

        df_with_offseted_values = apply_merge(df, df_offseted, group_cols, how,
                                              offseted_suffix, raise_duplicate_error)

        if is_date_to_format:
            del df_with_offseted_values[date_col]

    elif compare_to is not None:
        # create df_offseted from the sub-frame selected by the query string.
        check_params_columns_duplicate(id_cols + [value_col])
        group_cols = id_cols
        df_offseted = df.query(compare_to).copy()
        df_offseted = df_offseted[group_cols + [value_col]]

        df_with_offseted_values = apply_merge(df, df_offseted, group_cols, how,
                                              offseted_suffix, raise_duplicate_error)

    # NOTE(review): if both `date_col` and `compare_to` are None,
    # `df_with_offseted_values` is unbound and this raises NameError —
    # presumably callers always pass one of them; confirm upstream validation.
    apply_fillna(df_with_offseted_values, value_col, offseted_suffix, fillna)
    apply_method(df_with_offseted_values, evolution_col_name, value_col,
                 offseted_suffix, method)
    return apply_format(df_with_offseted_values, evolution_col_name, format)
def add_missing_row(
    df: pd.DataFrame,
    id_cols: List[str],
    reference_col: str,
    complete_index: Union[Dict[str, str], Sequence[str]] = None,
    method: str = None,
    cols_to_keep: List[str] = None,
) -> pd.DataFrame:
    """
    Add missing row to a df base on a reference column

    ---

    ### Parameters

    *mandatory :*
    - `id_cols` (*list of str*): names of the columns used to create each group
    - `reference_col` (*str*): name of the column used to identify missing rows

    *optional :*
    - `complete_index` (*list* or *dict*): [A, B, C] a list of values used to
      add missing rows. It can also be a dict to declare a date range. By
      default, use all values of reference_col.
    - `method` (*str*): by default all missing rows are added. The possible
      values are :
        - `"between"` : add missing rows having their value between min and
          max values for each group,
        - `"between_and_after"` : add missing rows having their value bigger
          than min value for each group.
        - `"between_and_before"` : add missing rows having their value smaller
          than max values for each group.
    - `cols_to_keep` (*list of str*): name of other columns to keep, linked to
      the reference_col.
    """
    if cols_to_keep is None:
        cols_for_index = [reference_col]
    else:
        cols_for_index = [reference_col] + cols_to_keep
    check_params_columns_duplicate(id_cols + cols_for_index)

    # FIX: the previous implementation mutated the caller's objects — it added
    # temporary 'start'/'end' columns to the caller's `df` and extended the
    # caller's `id_cols` list in place (`+=`). Work on a copy / rebind instead.
    if method in ('between', 'between_and_after', 'between_and_before'):
        df = df.copy()
    if method == 'between' or method == 'between_and_after':
        df['start'] = df.groupby(id_cols)[reference_col].transform(min)
        id_cols = id_cols + ['start']
    if method == 'between' or method == 'between_and_before':
        df['end'] = df.groupby(id_cols)[reference_col].transform(max)
        id_cols = id_cols + ['end']

    names = id_cols + cols_for_index
    new_df = df.set_index(names)
    # Distinct id-group keys (groupby().sum() is only used to build the index).
    index_values: Union[Any, tuple] = df.groupby(id_cols).sum().index.values

    if complete_index is None:
        complex_index_values: Union[Any, tuple] = df.groupby(
            cols_for_index).sum().index.values
    elif isinstance(complete_index, dict):
        if complete_index['type'] == 'date':
            freq = complete_index['freq']
            date_format = complete_index['format']
            start = complete_index['start']
            end = complete_index['end']
            if isinstance(freq, dict):
                freq = pd.DateOffset(**{k: int(v) for k, v in freq.items()})
            new_index = pd.date_range(start=start, end=end, freq=freq)
            complex_index_values = new_index.strftime(date_format).values
        else:
            raise ParamsValueError(f'Unknown complete index type: '
                                   f'{complete_index["type"]}')
    else:
        complex_index_values = list(complete_index)

    def get_tuple(x: Union[Any, tuple]) -> tuple:
        # Wrap scalar index values so they concatenate like tuples below.
        if not isinstance(x, tuple):
            return (x, )
        return x

    # Cartesian product: every group key x every reference value.
    new_tuples_index: List[tuple] = [
        get_tuple(x) + get_tuple(y)
        for x in index_values
        for y in complex_index_values
    ]
    new_index = pd.MultiIndex.from_tuples(new_tuples_index, names=names)
    new_df = new_df.reindex(new_index).reset_index()

    # Trim the rows falling outside each group's [start, end] bounds.
    if method == 'between' or method == 'between_and_after':
        new_df = new_df[new_df[reference_col] >= new_df['start']]
        del new_df['start']
    if method == 'between' or method == 'between_and_before':
        new_df = new_df[new_df[reference_col] <= new_df['end']]
        del new_df['end']

    return new_df
def add_missing_row(df, id_cols, reference_col, complete_index=None, method=None, cols_to_keep=None):
    """
    Add missing row to a df base on a reference column
    - `id_cols` are the columns id to group,
    - `reference_col` is the column with groups missing values
    - `complete_index` (optional) a set of values used to add missing rows,
      by default use the function `unique` on reference_col. Can be dict for
      date_range
    - `method` (optional) method to choose values to keep. E.g between min
      and max value of the group.
    - `cols_to_keep` (optional) is the columns link to the reference_col to
      keep.

    Args:
        df (pd.DataFrame):
        id_cols (list(str)):
        reference_col (str):
        complete_index (tuple/dict):
        method (str):
        cols_to_keep (list(str)):
    """
    if cols_to_keep is None:
        cols_for_index = [reference_col]
    else:
        cols_for_index = [reference_col] + cols_to_keep
    check_params_columns_duplicate(id_cols + cols_for_index)

    # FIX: previously the caller's `df` gained temporary 'start'/'end'
    # columns and the caller's `id_cols` list was extended in place (`+=`).
    # Copy / rebind so no side effect leaks out.
    if method in ('between', 'between_and_after', 'between_and_before'):
        df = df.copy()
    if method == 'between' or method == 'between_and_after':
        df['start'] = df.groupby(id_cols)[reference_col].transform(min)
        id_cols = id_cols + ['start']
    if method == 'between' or method == 'between_and_before':
        df['end'] = df.groupby(id_cols)[reference_col].transform(max)
        id_cols = id_cols + ['end']

    names = id_cols + cols_for_index
    new_df = df.set_index(names)
    # Distinct id-group keys (groupby().sum() only feeds the index build).
    index_values = df.groupby(id_cols).sum().index.values

    if complete_index is None:
        complete_index = df.groupby(cols_for_index).sum().index.values
    elif isinstance(complete_index, dict):
        if complete_index['type'] == 'date':
            freq = complete_index['freq']
            date_format = complete_index['format']
            start = complete_index['start']
            end = complete_index['end']
            if isinstance(freq, dict):
                freq = pd.DateOffset(**{k: int(v) for k, v in freq.items()})
            complete_index = pd.date_range(start=start, end=end, freq=freq)
            complete_index = complete_index.strftime(date_format)
        else:
            raise ParamsValueError(f'Unknown complete index type: '
                                   f'{complete_index["type"]}')

    # Normalize scalar keys to 1-tuples so they concatenate uniformly.
    if not isinstance(index_values[0], tuple):
        index_values = [(x,) for x in index_values]
    if not isinstance(complete_index[0], tuple):
        complete_index = [(x,) for x in complete_index]
    # Cartesian product: every group key x every reference value.
    new_tuples_index = [x + y for x in index_values for y in complete_index]
    new_index = pd.MultiIndex.from_tuples(new_tuples_index, names=names)
    new_df = new_df.reindex(new_index).reset_index()

    # Trim rows outside each group's [start, end] bounds.
    if method == 'between' or method == 'between_and_after':
        new_df = new_df[new_df[reference_col] >= new_df['start']]
        del new_df['start']
    if method == 'between' or method == 'between_and_before':
        new_df = new_df[new_df[reference_col] <= new_df['end']]
        del new_df['end']
    return new_df
def compute_cumsum(df, id_cols: List[str], reference_cols: List[str], value_cols: List[str],
                   new_value_cols: List[str] = None, cols_to_keep: List[str] = None):
    """
    Compute cumsum for a group of columns.

    ---

    ### Parameters

    *mandatory :*
    - `id_cols` (*list*): the columns id to create each group
    - `reference_cols` (*list*): the columns to order the cumsum
    - `value_cols` (*list*): the columns to cumsum

    *optional :*
    - `new_value_cols` (*list*): the new columns with the result cumsum
    - `cols_to_keep` (*list*): other columns to keep in the dataset. This
      option can be used if there is only one row by group
      [id_cols + reference_cols]
    """
    cols_to_keep = cols_to_keep or []
    if new_value_cols is None:
        new_value_cols = value_cols
    if len(new_value_cols) != len(value_cols):
        raise ParamsValueError('`value_cols` and `new_value_cols` needs '
                               'to have the same number of elements')
    all_group_cols = id_cols + reference_cols + cols_to_keep
    check_params_columns_duplicate(all_group_cols + value_cols)
    # One row per full key, ordered by the index; id columns occupy the
    # first index levels, so the per-group cumsum groups on those levels.
    summed = df.groupby(all_group_cols).sum()
    summed[new_value_cols] = summed.groupby(
        level=list(range(len(id_cols))))[value_cols].cumsum()
    return summed.reset_index()