def fill_missing_dates_by_group(data: dd = None, groupby_columns: List[str] = None,
                                fill_method: str = None, date_range: Tuple[str] = None,
                                date_column: str = None, fill_value=None) -> dd:
    """
    Group the dataframe by the groupby columns and run ``fill_missing_dates``
    on every group, so that each group is reindexed onto the requested date
    range and its gaps are filled according to the fill method.

    :param data: dataframe
    :param groupby_columns: list of columns to groupby
    :param fill_method: method used to fill missing data
    :param date_range: date range to reindex to
    :param date_column: name of date column
    :param fill_value: forwarded unchanged to ``fill_missing_dates``
    :return: modified dataframe
    """
    # Column names and dtypes are captured before the date column becomes the
    # index, so the meta handed to ``apply`` still lists the date column.
    columns = data.columns
    meta = list(dict(data.dtypes).items())

    # NOTE(review): sorted=True presumably requires the input to already be
    # ordered by date_column - confirm with callers.
    indexed = data.set_index(date_column, sorted=True)

    grouped = indexed.groupby(by=groupby_columns)
    filled = grouped.apply(
        lambda group: fill_missing_dates(
            data=group,
            date_column=date_column,
            fill_method=fill_method,
            columns=columns,
            date_range=date_range,
            fill_value=fill_value,
            groupby_columns=groupby_columns,
        ),
        meta=meta,
    )

    # The index produced by the grouped apply is discarded, not kept as a column.
    return filled.reset_index(drop=True)
def rolling_mean_by_date_by_group(data: dd = None, groupby_columns: List[str] = None,
                                  metric_columns: List[str] = None, date_column: str = None,
                                  window: int = None) -> dd:
    """
    Group the dataframe by the groupby columns and run ``rolling_mean_by_date``
    on every group to compute a rolling average of the metric columns.

    :param data: input dataframe
    :param groupby_columns: list of columns to group by
    :param metric_columns: columns to calculate rolling average on
    :param date_column: name of date column
    :param window: window size to be used on rolling average
    :return: modified dask dataframe
    """
    # NOTE(review): sorted=True presumably requires the input to already be
    # ordered by date_column - confirm with callers.
    indexed = data.set_index(date_column, sorted=True)

    # Meta for the grouped apply: the indexed frame's dtypes plus one float32
    # '<metric>_rolling_mean' entry per metric column.
    schema = dict(indexed.dtypes)
    schema.update({f'{name}_rolling_mean': 'float32' for name in metric_columns})
    meta = list(schema.items())

    grouped = indexed.groupby(by=groupby_columns)
    averaged = grouped.apply(
        lambda group: rolling_mean_by_date(
            data=group,
            metric_columns=metric_columns,
            window=window,
        ),
        meta=meta,
    )

    # Move the index back into the columns; a column named 'index' is renamed
    # to the date column name.
    restored = averaged.reset_index()
    return restored.rename(columns={'index': date_column})
def yoy_percent_change_by_group(data: dd = None, groupby_columns: List[str] = None,
                                metric_columns: List[str] = None,
                                date_column: str = None) -> dd:
    """
    Group the dataframe by the groupby columns and run ``yoy_percent_change``
    on every group to compute the year over year percent change of the metric
    columns.

    :param data: input dataframe
    :param groupby_columns: list of columns to group by
    :param metric_columns: columns to calculate year over year percent change on
    :param date_column: name of date column
    :return: modified dataframe
    """
    # NOTE(review): sorted=True presumably requires the input to already be
    # ordered by date_column - confirm with callers.
    indexed = data.set_index(date_column, sorted=True)

    # Meta for the grouped apply: the indexed frame's dtypes plus one float32
    # '<metric>_yoy_pct_change' entry per metric column.
    schema = dict(indexed.dtypes)
    schema.update({f'{name}_yoy_pct_change': 'float32' for name in metric_columns})
    meta = list(schema.items())

    grouped = indexed.groupby(by=groupby_columns)
    changed = grouped.apply(
        lambda group: yoy_percent_change(data=group, metric_columns=metric_columns),
        meta=meta,
    )

    # Move the index back into the columns; a column named 'index' is renamed
    # to the date column name.
    restored = changed.reset_index()
    return restored.rename(columns={'index': date_column})