def assign_feature(df, config):
    """
    ETL feature to add new columns to a given dataframe.

    Column values are collected from the 'col_const' section first and the
    'col_var' section second, so a 'col_var' entry overrides a 'col_const'
    entry with the same column name. Every value is repeated once per row.
    The provided dataframe itself is modified and returned.

    :param df: pd.DataFrame; Provided dataframe
    :param config: dict; Provided feature configuration
    :return: df_target: pd.DataFrame; Resulted dataframe
    """
    if not config:
        return df

    row_count = len(df.index)
    new_columns = {}

    # Static values are gathered before variable ones; later entries win.
    for section_name in ('col_const', 'col_var'):
        section_values = miscu.eval_elem_mapping(config, section_name)
        if section_values and isinstance(section_values, dict):
            new_columns.update(section_values)

    for col_name, col_value in new_columns.items():
        # One copy of the value per existing row.
        df[col_name] = [col_value] * row_count

    return df
def aggregate_feature(df, config):
    """
    ETL feature to aggregate a given dataframe.

    Rows are grouped by the 'group_by' configuration element and aggregated
    with the 'agg' configuration element; the group keys are turned back into
    regular columns afterwards. An empty configuration leaves the dataframe
    untouched.

    :param df: pd.DataFrame; Provided dataframe
    :param config: dict; Provided feature configuration
    :return: df_target: pd.DataFrame; Resulted dataframe
    """
    if not config:
        return df

    group_keys = miscu.eval_elem_mapping(config, "group_by")
    grouped = df.groupby(group_keys)
    agg_spec = miscu.eval_elem_mapping(config, "agg")
    df_target = grouped.agg(agg_spec)
    return df_target.reset_index()
def read_feature(config):
    """
    ETL feature to read a file, as described by the provided configuration section.

    This is a composite feature: when the configuration holds an 'apply_dtype'
    section, the freshly read dataframe is additionally passed through
    apply_dtype_feature.

    :param config: dict; Provided configuration mapping
    :return: pd.DataFrame; Resulted dataframe
    """
    # Collect reader arguments; elements absent from the config fall back to defaults.
    read_kwargs = {
        'description': miscu.eval_elem_mapping(config, 'description'),
        'path': miscu.eval_elem_mapping(config, 'path'),
        'file_type': miscu.eval_elem_mapping(config, 'file_type', default_value='excel'),
        'separator': miscu.eval_elem_mapping(config, 'separator', default_value=','),
        'skip_rows': miscu.eval_elem_mapping(config, 'skip_rows', default_value=0),
        'use_cols': miscu.eval_elem_mapping(config, 'use_cols'),
        'sheet_name': miscu.eval_elem_mapping(config, 'sheet_name', default_value=0),
    }
    frame = fileu.read(**read_kwargs)

    # Remove leading/trailing whitespace from the column headers.
    frame.columns = frame.columns.str.strip()

    dtype_section = miscu.eval_elem_mapping(config, 'apply_dtype')
    if not dtype_section:
        return frame
    return apply_dtype_feature(frame, dtype_section)
def write_feature(df, config):
    """
    ETL feature to write a dataset to a file, as described by the provided
    configuration section. The file is always written with mode "new".

    :param df: pd.DataFrame; Provided dataframe
    :param config: dict; Provided feature configuration
    :return null
    """
    # Collect writer arguments; elements absent from the config fall back to defaults.
    write_kwargs = {
        'description': miscu.eval_elem_mapping(config, 'description'),
        'path': miscu.eval_elem_mapping(config, 'path'),
        'file_type': miscu.eval_elem_mapping(config, 'file_type', default_value='excel'),
        'index': miscu.eval_elem_mapping(config, 'index'),
        'separator': miscu.eval_elem_mapping(config, 'separator', default_value=','),
    }
    fileu.write(df=df, mode="new", **write_kwargs)
def mapping_feature(df, config):
    """
    ETL feature to merge a given dataframe with an extracted mapping dataframe.

    The mapping dataframe is read through read_feature using the 'read'
    section of the configuration, then left-joined onto the provided
    dataframe using the 'left_on' / 'right_on' configuration elements.
    The mapping-side key column(s) are removed from the result.

    :param df: pd.DataFrame; Provided dataframe
    :param config: dict; Provided feature configuration; must contain a 'read' section
    :return: df_target: pd.DataFrame; Resulted dataframe
    :raises KeyError: if the configuration has no 'read' section
    """
    df_mapping = read_feature(config['read'])

    left_on = miscu.eval_elem_mapping(config, 'left_on')
    right_on = miscu.eval_elem_mapping(config, 'right_on')

    # Join column-on-column only. Passing left_index=True together with
    # left_on is contradictory and is rejected by current pandas with a
    # MergeError, so the index flag is not used here.
    df_target = pd.merge(df, df_mapping, how='left', left_on=left_on, right_on=right_on)

    # The mapping-side key duplicates the information of the left-side key.
    df_target.drop(columns=right_on, inplace=True)
    return df_target
def rearrange_feature(df, config):
    """
    ETL feature to rename and reorder columns of a given dataframe.

    Renaming uses the 'col_rename' element (a dict) and is applied to the
    provided dataframe in place. Reordering uses the 'col_reorder' element
    (a list) and produces a new dataframe reindexed to the listed columns.
    Either step is skipped when its element is empty or of another type.

    :param df: pd.DataFrame; Provided dataframe
    :param config: dict; Provided feature configuration
    :return: df_target: pd.DataFrame; Resulted dataframe
    """
    if not config:
        return df

    result = df

    # Step 1: rename columns.
    rename_map = miscu.eval_elem_mapping(config, 'col_rename')
    if rename_map and isinstance(rename_map, dict):
        result.rename(columns=rename_map, inplace=True)

    # Step 2: reorder columns.
    column_order = miscu.eval_elem_mapping(config, 'col_reorder')
    if column_order and isinstance(column_order, list):
        result = result.reindex(columns=column_order)

    return result
def dupl_feature(df, config):
    """
    ETL feature to duplicate every row, with an ability to overwrite
    particular values in one row of each duplicated pair.

    Each row of the provided dataframe appears twice, back to back, in the
    result. For every column listed in the 'col_const' configuration element,
    the first row of each pair (positions 0, 2, 4, ...) is set to the
    configured value. The result is built from the raw values of the provided
    dataframe, so it carries a fresh default index.

    :param df: pd.DataFrame; Provided dataframe
    :param config: dict; Provided feature configuration
    :return: df_target: pd.DataFrame; Resulted dataframe
    """
    if not config:
        return df

    # A scalar repeat count also works for an empty dataframe; a per-row list
    # of counts would be an empty list there, which numpy rejects.
    duplicated_values = np.repeat(df.values, 2, axis=0)
    df_target = pd.DataFrame(duplicated_values, columns=df.columns.values)

    # Same guard as assign_feature: skip a missing or non-dict section
    # instead of failing on .items().
    config_assign = miscu.eval_elem_mapping(config, 'col_const')
    if config_assign and isinstance(config_assign, dict):
        for col_name, col_value in config_assign.items():
            df_target.loc[::2, col_name] = col_value

    return df_target
def run_transformation(args, config):
    """
    Transformation process.

    Runs, in order: read -> input plugin -> aggregate -> assign -> assign plugin
    -> duplicate -> rearrange -> output plugin -> write. Each plugin is optional
    and is only applied when the corresponding config section provides one.

    :param args: dict; Command line arguments mapping
    :param config: dict; Configuration mapping
    :return: null
    """
    # ---------------- Input ----------------
    # Source data is the normalized output of the Extraction process.
    # 'path' and 'description' are injected into the 'read' part of the input section.
    read_overrides = {
        'path': miscu.eval_elem_mapping(args, 'input_path'),
        'description': config['description'],
    }
    input_section = miscu.eval_elem_mapping(config, 'input')
    read_section = miscu.eval_update_mapping(input_section, "read", read_overrides)
    frame = etlu.read_feature(read_section)

    plugin = miscu.eval_func(input_section, "plugin")
    if plugin:
        frame = plugin(frame)

    # ---------------- Aggregate ----------------
    aggregate_section = miscu.eval_elem_mapping(config, 'aggregate')
    frame = etlu.aggregate_feature(frame, aggregate_section)

    # ---------------- Assignment ----------------
    # 'col_var' maps a column name to a key in args; resolve those keys to
    # the actual argument values.
    assign_section = miscu.eval_elem_mapping(config, 'assign')
    var_columns = miscu.eval_elem_mapping(assign_section, 'col_var')
    resolved_vars = {col_name: args[args_key] for col_name, args_key in var_columns.items()}
    # The return value is not needed here: assign_section itself is passed on
    # below (presumably updated in place by eval_update_mapping - confirm in miscu).
    miscu.eval_update_mapping(assign_section, 'col_var', resolved_vars)
    frame = etlu.assign_feature(frame, assign_section)

    plugin = miscu.eval_func(assign_section, "plugin")
    if plugin:
        frame = plugin(frame)

    # ---------------- Duplication ----------------
    dupl_section = miscu.eval_elem_mapping(config, 'dupl')
    frame = etlu.dupl_feature(frame, dupl_section)

    # ---------------- Output ----------------
    # 'path' and 'description' are injected into the 'write' part of the output section.
    write_overrides = {
        'path': miscu.eval_elem_mapping(args, 'output_path'),
        'description': config['description'],
    }
    output_section = miscu.eval_elem_mapping(config, 'output')
    write_section = miscu.eval_update_mapping(output_section, "write", write_overrides)

    # ---------------- Rearrange ----------------
    rearrange_section = miscu.eval_elem_mapping(output_section, 'rearrange')
    frame = etlu.rearrange_feature(frame, rearrange_section)

    plugin = miscu.eval_func(output_section, "plugin")
    if plugin:
        frame = plugin(frame)

    etlu.write_feature(frame, write_section)
def run_extraction(args, config):
    """
    Extraction process.

    Runs, in order: read -> input plugin -> mapping -> mapping plugin -> assign
    -> assign plugin -> rearrange -> output plugin -> write. Each plugin is
    optional and is only applied when the corresponding config section
    provides one.

    :param args: dict; Command line arguments mapping
    :param config: dict; Configuration mapping
    :return: null
    """
    # ---------------- Input ----------------
    # 'path' and 'description' are injected into the 'read' part of the input section.
    read_overrides = {
        'path': miscu.eval_elem_mapping(args, 'input_path'),
        'description': config['description'],
    }
    input_section = miscu.eval_elem_mapping(config, 'input')
    read_section = miscu.eval_update_mapping(input_section, "read", read_overrides)
    frame = etlu.read_feature(read_section)

    plugin = miscu.eval_func(input_section, "plugin")
    if plugin:
        frame = plugin(frame)

    # ---------------- Mapping ----------------
    # 'path' and 'description' are injected into the 'read' part of the mapping section.
    mapping_overrides = {
        'path': miscu.eval_elem_mapping(args, 'mapping_path'),
        'description': config['description'],
    }
    mapping_section = miscu.eval_elem_mapping(config, 'mapping')
    # The return value is not needed here: the whole mapping section is passed on
    # below (presumably updated in place by eval_update_mapping - confirm in miscu).
    miscu.eval_update_mapping(mapping_section, 'read', mapping_overrides)
    frame = etlu.mapping_feature(frame, mapping_section)

    plugin = miscu.eval_func(mapping_section, "plugin")
    if plugin:
        frame = plugin(frame)

    # ---------------- Assignment ----------------
    # 'col_var' maps a column name to a key in args; resolve those keys to
    # the actual argument values.
    assign_section = miscu.eval_elem_mapping(config, 'assign')
    var_columns = miscu.eval_elem_mapping(assign_section, 'col_var')
    resolved_vars = {col_name: args[args_key] for col_name, args_key in var_columns.items()}
    # Return value unused for the same reason as in the mapping step.
    miscu.eval_update_mapping(assign_section, 'col_var', resolved_vars)
    frame = etlu.assign_feature(frame, assign_section)

    plugin = miscu.eval_func(assign_section, "plugin")
    if plugin:
        frame = plugin(frame)

    # ---------------- Output ----------------
    # 'path' and 'description' are injected into the 'write' part of the output section.
    write_overrides = {
        'path': miscu.eval_elem_mapping(args, 'output_path'),
        'description': config['description'],
    }
    output_section = miscu.eval_elem_mapping(config, 'output')
    write_section = miscu.eval_update_mapping(output_section, "write", write_overrides)

    # ---------------- Rearrange ----------------
    rearrange_section = miscu.eval_elem_mapping(output_section, 'rearrange')
    frame = etlu.rearrange_feature(frame, rearrange_section)

    plugin = miscu.eval_func(output_section, "plugin")
    if plugin:
        frame = plugin(frame)

    etlu.write_feature(frame, write_section)