示例#1
0
def compute_ffill_by_group(df, id_cols, reference_cols, value_col):
    """
    Forward-fill `value_col` within each group defined by `id_cols`,
    ordering rows by `reference_cols`.

    Implemented with a cumulative "seen a value yet" counter instead of a
    plain groupby/fillna, which had a performance issue (2017/07).

    Args:
        df (pd.DataFrame): input data
        id_cols (list(str)): columns identifying each group
        reference_cols (list(str)): columns used to order rows inside a group
        value_col (str): name of the column to fill

    Returns:
        pd.DataFrame: `df` with `value_col` forward-filled per group
    """
    check_params_columns_duplicate(id_cols + reference_cols + [value_col])
    df = df.sort_values(by=id_cols + reference_cols)
    df = df.set_index(id_cols)
    # 1 where a value is present, 0 where it is null.
    df['fill'] = 1 - df[value_col].isnull().astype(int)
    # Running count of non-null values per group; 0 means "no value seen yet
    # in this group". Group on ALL id levels: the previous
    # `range(0, len(id_cols) - 1)` dropped the last id column, which let
    # values leak across groups sharing the other id columns, and produced
    # an empty level list (crash) with a single id column.
    df['fill'] = df.groupby(level=list(range(len(id_cols))))['fill'].cumsum()
    # A global ffill is safe here: rows whose counter is still 0 (leading
    # nulls of a group) are reset to None below, so no cross-group leakage.
    df[value_col] = df[value_col].ffill()
    df.loc[df['fill'] == 0, value_col] = None
    del df['fill']
    return df.reset_index()
示例#2
0
def compute_cumsum(df,
                   id_cols,
                   reference_cols,
                   value_cols,
                   new_value_cols=None,
                   cols_to_keep=None):
    """
    Compute a cumulative sum of `value_cols` within groups.
    - `id_cols` are the columns defining each group,
    - `reference_cols` are the columns ordering the cumulative sum,
    - `value_cols` are the columns to accumulate,
    - `new_value_cols` are the destination columns for the results
      (defaults to overwriting `value_cols`),
    - `cols_to_keep` are other columns to keep in the dataframe. This option
      can be used if there is only one row by group [id_cols + reference_cols]

    For example, with id_cols=['NAME'], reference_cols=['MONTH', 'DAY'],
    value_cols=['VALUE'], cols_to_keep=['X']:

    MONTH  DAY NAME  VALUE  X        NAME  MONTH  DAY  X  VALUE
     1      1    A      1  lo         A     1      1  lo      1
     2      1    A      1  lo   -->   A     2      1  lo      2
     2     15    A      1  la         A     2     15  la      3
     1     15    B      1  la         B     1     15  la      1

    Args:
        df (pd.DataFrame):
        id_cols (list(str)):
        reference_cols (list(str)):
        value_cols (list(str)):
        new_value_cols (list(str)):
        cols_to_keep (list(str)):
    """
    if cols_to_keep is None:
        cols_to_keep = []
    if new_value_cols is None:
        new_value_cols = value_cols

    if len(new_value_cols) != len(value_cols):
        raise ParamsValueError('`value_cols` and `new_value_cols` needs '
                               'to have the same number of elements')

    check_params_columns_duplicate(id_cols + reference_cols + cols_to_keep +
                                   value_cols)

    # Aggregate to one row per full key, indexed so the id columns are the
    # leading index levels, then accumulate within each id group.
    grouped = df.groupby(id_cols + reference_cols + cols_to_keep).sum()
    id_levels = list(range(len(id_cols)))
    grouped[new_value_cols] = grouped.groupby(level=id_levels)[value_cols].cumsum()

    return grouped.reset_index()
示例#3
0
def __compute_evolution(
    df,
    id_cols,
    value_col,
    date_col=None,
    freq=1,
    compare_to=None,
    method='abs',
    format='column',
    offseted_suffix='_offseted',
    evolution_col_name='evolution_computed',
    how='left',
    fillna=None,
    raise_duplicate_error=True,
):
    """
    Compute an evolution column:
        - against a period distant from a fixed frequency (`date_col` mode), or
        - against a part of the df (`compare_to` mode).

    Unfortunately, pandas doesn't allow .change() and .pct_change() to be
    executed with a MultiIndex.

    Args:
        df (pd.DataFrame):
        id_cols (list(str)):
        value_col (str):
        date_col (str/dict): default None
        freq (int/pd.DateOffset/pd.Serie): default 1
        compare_to (str): default None
        method (str): default ``'abs'`` can be also ``'pct'``
        format(str): default 'column' can be also 'df'
        offseted_suffix(str): default '_offseted'
        evolution_col_name(str): default 'evolution_computed'
        how(str): default 'left'
        fillna(str/int): default None

    Raises:
        ValueError: if neither `date_col` nor `compare_to` is provided.
    """
    if date_col is not None:
        # A dict `date_col` carries an explicit parse format; an object-dtype
        # column also needs parsing. Compare to the builtin `object` — the
        # `np.object` alias was removed in NumPy 1.24.
        is_date_to_format = isinstance(date_col, dict) or (df[date_col].dtype
                                                           == object)
        if is_date_to_format:
            if isinstance(date_col, dict):
                date_format = date_col.get('format', None)
                date_col = date_col['selector']
            else:
                date_format = None
            # NOTE(review): this adds a temporary parsed-date column to the
            # caller's df; it is removed from the merged result below, but
            # not from the input df itself — confirm callers tolerate this.
            df['_' + date_col + '_copy_'] = pd.to_datetime(df[date_col],
                                                           format=date_format)
            date_col = '_' + date_col + '_copy_'

        # A dict freq describes a pd.DateOffset (e.g. {'months': 1}).
        is_freq_dict = isinstance(freq, dict)
        if is_freq_dict:
            freq = pd.DateOffset(**{k: int(v) for k, v in freq.items()})

        check_params_columns_duplicate(id_cols + [value_col, date_col])
        # Create df_offseted: shift each row's date forward by `freq` so the
        # merge aligns every row with its value one period earlier.
        group_cols = id_cols + [date_col]
        df_offseted = df[group_cols + [value_col]].copy()
        df_offseted[date_col] += freq

        df_with_offseted_values = apply_merge(df, df_offseted, group_cols, how,
                                              offseted_suffix,
                                              raise_duplicate_error)
        if is_date_to_format:
            del df_with_offseted_values[date_col]

    elif compare_to is not None:
        # Create df_offseted from the reference subset selected by the
        # `compare_to` query string.
        check_params_columns_duplicate(id_cols + [value_col])
        group_cols = id_cols
        df_offseted = df.query(compare_to).copy()
        df_offseted = df_offseted[group_cols + [value_col]]

        df_with_offseted_values = apply_merge(df, df_offseted, group_cols, how,
                                              offseted_suffix,
                                              raise_duplicate_error)

    else:
        # Previously this fell through to an opaque NameError on
        # `df_with_offseted_values`; fail with an explicit message instead.
        raise ValueError('`date_col` or `compare_to` must be provided')

    apply_fillna(df_with_offseted_values, value_col, offseted_suffix, fillna)
    apply_method(df_with_offseted_values, evolution_col_name, value_col,
                 offseted_suffix, method)
    return apply_format(df_with_offseted_values, evolution_col_name, format)
示例#4
0
def add_missing_row(
    df: pd.DataFrame,
    id_cols: List[str],
    reference_col: str,
    complete_index: Union[Dict[str, str], Sequence[str], None] = None,
    method: Union[str, None] = None,
    cols_to_keep: Union[List[str], None] = None,
) -> pd.DataFrame:
    """
    Add missing row to a df base on a reference column

    ---

    ### Parameters

    *mandatory :*
    - `id_cols` (*list of str*): names of the columns used to create each group
    - `reference_col` (*str*): name of the column used to identify missing rows

    *optional :*
    - `complete_index` (*list* or *dict*): [A, B, C] a list of values used to add missing rows.
      It can also be a dict to declare a date range.
      By default, use all values of reference_col.
    - `method` (*str*): by default all missing rows are added. The possible values are :
        - `"between"` : add missing rows having their value between min and max values for each group,
        - `"between_and_after"` : add missing rows having their value bigger than min value for each group.
        - `"between_and_before"` : add missing rows having their value smaller than max values for each group.
    - `cols_to_keep` (*list of str*): name of other columns to keep, linked to the reference_col.

    ---

    ### Example

    **Input**

    YEAR | MONTH | NAME
    :---:|:---:|:--:
    2017|1|A
    2017|2|A
    2017|3|A
    2017|1|B
    2017|3|B

    ```cson
    add_missing_row:
      id_cols: ['NAME']
      reference_col: 'MONTH'
    ```

    **Output**

    YEAR | MONTH | NAME
    :---:|:---:|:--:
    2017|1|A
    2017|2|A
    2017|3|A
    2017|1|B
    2017|2|B
    2017|3|B

    """
    if cols_to_keep is None:
        cols_for_index = [reference_col]
    else:
        cols_for_index = [reference_col] + cols_to_keep
    check_params_columns_duplicate(id_cols + cols_for_index)

    # Work on copies: the previous implementation wrote the temporary
    # 'start'/'end' columns into the caller's DataFrame and appended to the
    # caller's `id_cols` list in place (`id_cols += ...`).
    if method in ('between', 'between_and_after', 'between_and_before'):
        df = df.copy()
    if method == 'between' or method == 'between_and_after':
        df['start'] = df.groupby(id_cols)[reference_col].transform('min')
        id_cols = id_cols + ['start']
    if method == 'between' or method == 'between_and_before':
        df['end'] = df.groupby(id_cols)[reference_col].transform('max')
        id_cols = id_cols + ['end']

    names = id_cols + cols_for_index
    new_df = df.set_index(names)
    # One entry per distinct id-group (the aggregation is only used to get
    # the unique index values).
    index_values: Union[Any, tuple] = df.groupby(id_cols).sum().index.values

    if complete_index is None:
        # Default: use every combination of reference values seen in df.
        complex_index_values: Union[Any, tuple] = df.groupby(
            cols_for_index).sum().index.values
    elif isinstance(complete_index, dict):
        if complete_index['type'] == 'date':
            freq = complete_index['freq']
            date_format = complete_index['format']
            start = complete_index['start']
            end = complete_index['end']
            if isinstance(freq, dict):
                freq = pd.DateOffset(**{k: int(v) for k, v in freq.items()})
            new_index = pd.date_range(start=start, end=end, freq=freq)
            complex_index_values = new_index.strftime(date_format).values
        else:
            raise ParamsValueError(f'Unknown complete index type: '
                                   f'{complete_index["type"]}')
    else:
        complex_index_values = list(complete_index)

    def get_tuple(x: Union[Any, tuple]) -> tuple:
        # Normalize scalar index values to 1-tuples so they concatenate.
        if not isinstance(x, tuple):
            return (x, )
        return x

    # Cartesian product: every id-group crossed with every reference value.
    new_tuples_index: List[tuple] = [
        get_tuple(x) + get_tuple(y) for x in index_values
        for y in complex_index_values
    ]

    new_index = pd.MultiIndex.from_tuples(new_tuples_index, names=names)
    new_df = new_df.reindex(new_index).reset_index()

    # Trim rows outside each group's [start, end] bounds per `method`.
    if method == 'between' or method == 'between_and_after':
        new_df = new_df[new_df[reference_col] >= new_df['start']]
        del new_df['start']
    if method == 'between' or method == 'between_and_before':
        new_df = new_df[new_df[reference_col] <= new_df['end']]
        del new_df['end']

    return new_df
示例#5
0
def add_missing_row(df, id_cols, reference_col, complete_index=None, method=None,
                    cols_to_keep=None):
    """
    Add missing row to a df base on a reference column
    - `id_cols` are the columns id to group,
    - `reference_col` is the column with groups missing values
    - `complete_index` (optional) a set of values used to add missing rows,
       by default use the function `unique` on reference_col. Can be dict for date_range
    - `method` (optional) method to choose values to keep.
       E.g between min and max value of the group.
    - `cols_to_keep` (optional) is the columns link to the reference_col to keep.

    For example :

    YEAR MONTH NAME  VALUE  X
    2017   1     A      1  lo
    2017   2     A      1  lo
    2017   3     A      1  la
    2017   1     B      1  la
    2017   3     B      1  la

    The function `add_missing_row` with the arguments :
            id_cols=['NAME']
            reference_col='MONTH'
    give as a result :


    YEAR MONTH NAME  VALUE  X
    2017   1     A      1  lo
    2017   2     A      1  lo
    2017   3     A      1  la
    2017   1     B      1  la
    2017   2     B      NA NA
    2017   3     B      1  la

    Args:
        df (pd.DataFrame):
        id_cols (list(str)):
        reference_col (str):
        complete_index (tuple/dict):
        method (str):
        cols_to_keep (list(str)):
    """
    if cols_to_keep is None:
        cols_for_index = [reference_col]
    else:
        cols_for_index = [reference_col] + cols_to_keep
    check_params_columns_duplicate(id_cols + cols_for_index)

    # Work on copies: the previous implementation wrote the temporary
    # 'start'/'end' columns into the caller's DataFrame and appended to the
    # caller's `id_cols` list in place (`id_cols += ...`).
    if method in ('between', 'between_and_after', 'between_and_before'):
        df = df.copy()
    if method == 'between' or method == 'between_and_after':
        df['start'] = df.groupby(id_cols)[reference_col].transform('min')
        id_cols = id_cols + ['start']
    if method == 'between' or method == 'between_and_before':
        df['end'] = df.groupby(id_cols)[reference_col].transform('max')
        id_cols = id_cols + ['end']

    names = id_cols + cols_for_index
    new_df = df.set_index(names)
    # One entry per distinct id-group (the aggregation is only used to get
    # the unique index values).
    index_values = df.groupby(id_cols).sum().index.values

    if complete_index is None:
        # Default: use every combination of reference values seen in df.
        complete_index = df.groupby(cols_for_index).sum().index.values
    elif isinstance(complete_index, dict):
        if complete_index['type'] == 'date':
            freq = complete_index['freq']
            date_format = complete_index['format']
            start = complete_index['start']
            end = complete_index['end']
            if isinstance(freq, dict):
                freq = pd.DateOffset(**{k: int(v) for k, v in freq.items()})
            complete_index = pd.date_range(start=start, end=end, freq=freq)
            complete_index = complete_index.strftime(date_format)
        else:
            raise ParamsValueError(f'Unknown complete index type: '
                                   f'{complete_index["type"]}')

    # Normalize scalar index values to 1-tuples so they concatenate.
    if not isinstance(index_values[0], tuple):
        index_values = [(x,) for x in index_values]
    if not isinstance(complete_index[0], tuple):
        complete_index = [(x,) for x in complete_index]
    # Cartesian product: every id-group crossed with every reference value.
    new_tuples_index = [x + y for x in index_values for y in complete_index]

    new_index = pd.MultiIndex.from_tuples(new_tuples_index, names=names)
    new_df = new_df.reindex(new_index).reset_index()

    # Trim rows outside each group's [start, end] bounds per `method`.
    if method == 'between' or method == 'between_and_after':
        new_df = new_df[new_df[reference_col] >= new_df['start']]
        del new_df['start']
    if method == 'between' or method == 'between_and_before':
        new_df = new_df[new_df[reference_col] <= new_df['end']]
        del new_df['end']

    return new_df
def compute_cumsum(df,
                   id_cols: List[str],
                   reference_cols: List[str],
                   value_cols: List[str],
                   new_value_cols: Union[List[str], None] = None,
                   cols_to_keep: Union[List[str], None] = None):
    """
    Compute cumsum for a group of columns.

    ---

    ### Parameters

    *mandatory :*
    - `id_cols` (*list*): the columns id to create each group
    - `reference_cols` (*list*): the columns to order the cumsum
    - `value_cols` (*list*): the columns to cumsum

    *optional :*
    - `new_value_cols` (*list*): the new columns with the result cumsum
    - `cols_to_keep` (*list*): other columns to keep in the dataset.
      This option can be used if there is only one row by group [id_cols + reference_cols]

    ---

    ### Example

    **Input**

    MONTH | DAY | NAME | VALUE | X
    :---:|:---:|:--:|:---:|:---:
     1   |   1 |   A  |  1 | lo
     2   |   1 |   A  |  1 | lo
     2   |  15 |   A  |  1 | la
     1   |  15 |   B  |  1 | la

    ```cson
    compute_cumsum:
      id_cols: ['NAME']
      reference_cols: ['MONTH', 'DAY']
      value_cols: ['VALUE']
      cols_to_keep: ['X']
    ```

    **Output**

    NAME | MONTH | DAY | X | VALUE
    :---:|:---:|:--:|:---:|:---:
     A  |   1  |    1 | lo  |    1
     A  |   2  |    1 | lo  |    2
     A  |   2  |   15 | la  |    3
     B  |   1  |   15 | la  |    1
    """
    if cols_to_keep is None:
        cols_to_keep = []

    if new_value_cols is None:
        new_value_cols = value_cols
    if len(value_cols) != len(new_value_cols):
        raise ParamsValueError('`value_cols` and `new_value_cols` needs '
                               'to have the same number of elements')

    check_params_columns_duplicate(id_cols + reference_cols + cols_to_keep +
                                   value_cols)

    # The id columns are the leading index levels after the groupby, so the
    # cumulative sum is computed within each id group.
    levels = list(range(0, len(id_cols)))

    df = df.groupby(id_cols + reference_cols + cols_to_keep).sum()
    df[new_value_cols] = df.groupby(level=levels)[value_cols].cumsum()

    return df.reset_index()