# The features below rely on pandas/numpy plus two project helper modules:
# fileu (file I/O) and miscu (config-mapping utilities). Their exact import
# paths are project-specific, so plain module names are assumed here.
import numpy as np
import pandas as pd

import fileu
import miscu


def assign_feature(df, config):
    """
    ETL feature to assign new columns to a given dataframe
    :param df: pd.DataFrame; Provided dataframe
    :param config: dict; Provided feature configuration
    :return: df_target: pd.DataFrame; Resulting dataframe
    """
    if not config:
        return df
    else:
        df_target = df
        length = len(df_target.index)
        config_assign = dict()

        # Assign new columns, using static values.
        config_assign_const = miscu.eval_elem_mapping(config, 'col_const')
        if config_assign_const and isinstance(config_assign_const, dict):
            config_assign.update(config_assign_const)

        # Assign new columns, using variable values.
        config_assign_var = miscu.eval_elem_mapping(config, 'col_var')
        if config_assign_var and isinstance(config_assign_var, dict):
            config_assign.update(config_assign_var)

        for col_name, col_value in config_assign.items():
            df_target[col_name] = [col_value] * length

        return df_target
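

# --- Usage sketch for assign_feature (illustrative only) ---
# 'col_const' and 'col_var' are the keys the function reads via
# miscu.eval_elem_mapping; the column names and values below are hypothetical.
_example_assign_config = {
    'col_const': {'SOURCE': 'ledger'},       # same static value on every row
    'col_var': {'RUN_DATE': '2021-01-31'},   # value resolved at runtime
}
# df_assigned = assign_feature(pd.DataFrame({'AMOUNT': [1.0, 2.0]}), _example_assign_config)
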
def aggregate_feature(df, config):
    """
    ETL feature to aggregate a given dataframe.
    :param df: pd.DataFrame; Provided dataframe
    :param config: dict; Provided feature configuration
    :return: df_target: pd.DataFrame; Resulting dataframe
    Sample config: {"group_by": [...], "agg": {...}}
    """
    if config:
        return df.groupby(miscu.eval_elem_mapping(config, "group_by")).agg(
            miscu.eval_elem_mapping(config, "agg")).reset_index()
    else:
        return df
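

# --- Usage sketch for aggregate_feature (illustrative only) ---
# 'group_by' and 'agg' are the keys the function reads; the 'agg' mapping uses
# pandas' DataFrame.agg() syntax, and the column names are hypothetical
# (they echo the transformation driver further below).
_example_aggregate_config = {
    'group_by': ['EXT_ACCOUNT', 'MAP_ACCOUNT', 'TYPE'],
    'agg': {'AMOUNT': 'sum'},
}
# df_summed = aggregate_feature(df, _example_aggregate_config)
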
def read_feature(config):
    """
    ETL feature to read a file, based on provided ETL configuration section
    This is a composite feature, since it can call apply_dtype_feature, if appropriate config section exists
    :param config: dict; Provided configuration mapping
    :return: pd.DataFrame; Resulting dataframe
    """
    df_target = fileu.read(
        description=miscu.eval_elem_mapping(config, 'description'),
        path=miscu.eval_elem_mapping(config, 'path'),
        file_type=miscu.eval_elem_mapping(config,
                                          'file_type',
                                          default_value='excel'),
        separator=miscu.eval_elem_mapping(config,
                                          'separator',
                                          default_value=','),
        skip_rows=miscu.eval_elem_mapping(config, 'skip_rows',
                                          default_value=0),
        use_cols=miscu.eval_elem_mapping(config, 'use_cols'),
        sheet_name=miscu.eval_elem_mapping(config,
                                           'sheet_name',
                                           default_value=0))

    df_target.columns = df_target.columns.str.strip()

    # Call apply_dtype_feature, if appropriate config section exists
    apply_dtype_config = miscu.eval_elem_mapping(config, 'apply_dtype')
    if apply_dtype_config:
        df_target = apply_dtype_feature(df_target, apply_dtype_config)

    return df_target
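

# --- Usage sketch for read_feature (illustrative only) ---
# Every key below is one that read_feature looks up; the description, path and
# sheet name are hypothetical, and 'file_type'/'separator'/'skip_rows'/'sheet_name'
# fall back to the defaults shown in the function when omitted.
_example_read_config = {
    'description': 'ledger extract',
    'path': 'data/ledger.xlsx',
    'file_type': 'excel',
    'skip_rows': 0,
    'use_cols': None,
    'sheet_name': 0,
    # 'apply_dtype': {...},  # optional; triggers apply_dtype_feature
}
# df_ledger = read_feature(_example_read_config)
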
def write_feature(df, config):
    """
    ETL feature to write a dataset to a file, based on provided ETL configuration section
    :param df: pd.DataFrame; Provided dataframe
    :param config: dict; Provided feature configuration
    :return: None
    """
    fileu.write(df=df,
                description=miscu.eval_elem_mapping(config, 'description'),
                path=miscu.eval_elem_mapping(config, 'path'),
                file_type=miscu.eval_elem_mapping(config,
                                                  'file_type',
                                                  default_value='excel'),
                index=miscu.eval_elem_mapping(config, 'index'),
                separator=miscu.eval_elem_mapping(config,
                                                  'separator',
                                                  default_value=','),
                mode="new")
def mapping_feature(df, config):
    """
    ETL feature to merge given dataframe with extracted mapping dataframe
    :param df: pd.DataFrame; Provided dataframe
    :param config: dict; Provided feature configuration
    :return: df_target: pd.DataFrame; Resulting dataframe
    """
    df_mapping = read_feature(config['read'])
    # pandas does not allow left_index=True together with left_on, so the merge
    # is keyed on the configured columns only.
    df_target = pd.merge(df,
                         df_mapping,
                         how='left',
                         left_on=miscu.eval_elem_mapping(config, 'left_on'),
                         right_on=miscu.eval_elem_mapping(config, 'right_on'))
    df_target.drop(columns=miscu.eval_elem_mapping(config, 'right_on'),
                   inplace=True)

    return df_target
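

# --- Usage sketch for mapping_feature (illustrative only) ---
# The nested 'read' section is handed to read_feature, and 'left_on'/'right_on'
# are the join keys; the path and column names below are hypothetical.
_example_mapping_config = {
    'read': {'description': 'account mapping', 'path': 'data/mapping.xlsx'},
    'left_on': 'EXT_ACCOUNT',
    'right_on': 'SOURCE_ACCOUNT',
}
# df_mapped = mapping_feature(df, _example_mapping_config)
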
def rearrange_feature(df, config):
    """
    ETL feature to rename and reorder the columns of a given dataframe.
    :param df: pd.DataFrame; Provided dataframe
    :param config: dict; Provided feature configuration
    :return: df_target: pd.DataFrame; Resulting dataframe
    """
    if not config:
        return df
    else:
        df_target = df

        # Rename columns.
        config_to_rename = miscu.eval_elem_mapping(config, 'col_rename')
        if config_to_rename and isinstance(config_to_rename, dict):
            df_target.rename(columns=config_to_rename, inplace=True)

        # Reorder columns.
        config_to_reorder = miscu.eval_elem_mapping(config, 'col_reorder')
        if config_to_reorder and isinstance(config_to_reorder, list):
            df_target = df_target.reindex(columns=config_to_reorder)

    return df_target
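

# --- Usage sketch for rearrange_feature (illustrative only) ---
# 'col_rename' and 'col_reorder' are the keys the function reads; the column
# names below are hypothetical.
_example_rearrange_config = {
    'col_rename': {'AMOUNT': 'Amount', 'TYPE': 'Type'},
    'col_reorder': ['Type', 'Amount'],
}
# df_rearranged = rearrange_feature(df, _example_rearrange_config)
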
def dupl_feature(df, config):
    """
    ETL feature to duplicate every row, with the ability to change particular values on one row of each pair.
    :param df: pd.DataFrame; Provided dataframe
    :param config: dict; Provided feature configuration
    :return: df_target: pd.DataFrame; Resulting dataframe
    """
    if not config:
        return df
    else:
        # Repeat every row twice, preserving the original column order.
        df_target = pd.DataFrame(np.repeat(df.values, 2, axis=0),
                                 columns=df.columns.values)

        # Overwrite the configured columns on the first row of each duplicated
        # pair (even positions), guarding against a missing 'col_const' section.
        config_assign = miscu.eval_elem_mapping(config, 'col_const')
        if config_assign and isinstance(config_assign, dict):
            for col_name, col_value in config_assign.items():
                df_target.loc[::2, col_name] = col_value

        return df_target
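

# --- Usage sketch for dupl_feature (illustrative only) ---
# 'col_const' maps column names to the value written on one row of each
# duplicated pair; the column and value below are hypothetical.
_example_dupl_config = {
    'col_const': {'TYPE': 'OFFSET'},
}
# df_doubled = dupl_feature(df, _example_dupl_config)
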


# Example #8

# This driver relies on the miscu config helper and on an etlu module exposing
# the ETL features shown above; plain module names are assumed for the imports.
import etlu
import miscu

def run_transformation(args, config):
    """
    Transformation process
    :param args: dict; Command line arguments mapping
    :param config: dict; Configuration mapping
    :return: None
    """

    # --------------------------------
    # Input section
    # --------------------------------

    # Extract normalized data source (output of Extraction process)
    # Prepare additional input parameters and update appropriate configuration section.
    # Inject 'path' and 'description' into <input> config section.
    input_update_with = {
        'path': miscu.eval_elem_mapping(args, 'input_path'),
        'description': config['description']
    }
    input_config = miscu.eval_elem_mapping(config, 'input')
    input_read_config = miscu.eval_update_mapping(input_config, "read",
                                                  input_update_with)

    # Run read ETL feature.
    df_target = etlu.read_feature(input_read_config)

    # Engage plugin from <input> config section, if available.
    input_plugin = miscu.eval_func(input_config, "plugin")
    if input_plugin:
        df_target = input_plugin(df_target)

    # --------------------------------
    # Aggregate section
    # --------------------------------

    # Run aggregate ETL feature to sum AMOUNT column, grouping by the combination of EXT_ACCOUNT, MAP_ACCOUNT, TYPE.
    aggregate_config = miscu.eval_elem_mapping(config, 'aggregate')
    df_target = etlu.aggregate_feature(df_target, aggregate_config)

    # --------------------------------
    # Assignment section
    # --------------------------------

    # Prepare assign_var configuration section, by getting values from args configuration.
    assign_config = miscu.eval_elem_mapping(config, 'assign')
    assign_config_var = miscu.eval_elem_mapping(assign_config, 'col_var')
    assign_update_with = dict()
    for col_name, args_key in assign_config_var.items():
        assign_update_with[col_name] = args[args_key]
    assign_config_var = miscu.eval_update_mapping(assign_config, 'col_var',
                                                  assign_update_with)

    # Run assignment ETL feature (as per requirements).
    df_target = etlu.assign_feature(df_target, assign_config)

    # Engage plugin from <assign> config section, if available.
    assign_plugin = miscu.eval_func(assign_config, "plugin")
    if assign_plugin:
        df_target = assign_plugin(df_target)

    # --------------------------------
    # Duplication section
    # --------------------------------

    # Run duplicate ETL feature (as per requirements).
    # Sign of Amount value of duplicated row will be flipped
    dupl_config = miscu.eval_elem_mapping(config, 'dupl')
    df_target = etlu.dupl_feature(df_target, dupl_config)

    # --------------------------------
    # Output section
    # --------------------------------

    # Prepare additional output parameters and update appropriate configuration section.
    # Inject 'path' and 'description' into <output> config section.
    output_update_with = {
        'path': miscu.eval_elem_mapping(args, 'output_path'),
        'description': config['description']
    }
    output_config = miscu.eval_elem_mapping(config, 'output')
    output_write_config = miscu.eval_update_mapping(output_config, "write",
                                                    output_update_with)

    # --------------------------------
    # Rearrange section
    # --------------------------------

    # Run rearrange ETL feature.
    rearrange_config = miscu.eval_elem_mapping(output_config, 'rearrange')
    df_target = etlu.rearrange_feature(df_target, rearrange_config)

    # Engage plugin from <output> config section.
    # Our plugin will add Total Amount value.
    output_plugin = miscu.eval_func(output_config, "plugin")
    if output_plugin:
        df_target = output_plugin(df_target)

    # Run write ETL feature.
    etlu.write_feature(df_target, output_write_config)
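

# --- Configuration skeleton for run_transformation (illustrative only) ---
# The top-level keys match the sections consumed above ('input', 'aggregate',
# 'assign', 'dupl', 'output'); the nested values and args keys are hypothetical.
_example_transformation_config = {
    'description': 'monthly ledger transformation',
    'input': {'read': {'file_type': 'csv', 'separator': ','}},
    'aggregate': {'group_by': ['EXT_ACCOUNT', 'MAP_ACCOUNT', 'TYPE'],
                  'agg': {'AMOUNT': 'sum'}},
    'assign': {'col_const': {}, 'col_var': {'RUN_DATE': 'run_date'}},
    'dupl': {'col_const': {'TYPE': 'OFFSET'}},
    'output': {'write': {'file_type': 'excel', 'index': False},
               'rearrange': {'col_rename': {}, 'col_reorder': []}},
}
# run_transformation({'input_path': 'in.csv', 'output_path': 'out.xlsx',
#                     'run_date': '2021-01-31'}, _example_transformation_config)
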


# Example #9

# This driver uses the same assumed etlu/miscu imports as Example #8.

def run_extraction(args, config):
    """
    Extraction process
    :param args: dict; Command line arguments mapping
    :param config: dict; Configuration mapping
    :return: None
    """

    # --------------------------------
    # Input section
    # --------------------------------

    # Prepare additional input parameters and update appropriate configuration section.
    # Inject 'path' and 'description' into <input> config section.
    input_update_with = {
        'path': miscu.eval_elem_mapping(args, 'input_path'),
        'description': config['description']
    }
    input_config = miscu.eval_elem_mapping(config, 'input')
    input_read_config = miscu.eval_update_mapping(input_config, "read",
                                                  input_update_with)

    # Run read ETL feature.
    df_target = etlu.read_feature(input_read_config)

    # Engage plugin from <input> config section, if available.
    input_plugin = miscu.eval_func(input_config, "plugin")
    if input_plugin:
        df_target = input_plugin(df_target)

    # --------------------------------
    # Mapping section
    # --------------------------------

    # Prepare additional mapping parameters and update appropriate configuration section.
    # Inject 'path' and 'description' into <mapping> config section.
    mapping_update_with = {
        'path': miscu.eval_elem_mapping(args, 'mapping_path'),
        'description': config['description']
    }
    mapping_config = miscu.eval_elem_mapping(config, 'mapping')
    mapping_read_config = miscu.eval_update_mapping(mapping_config, 'read',
                                                    mapping_update_with)

    # Run mapping ETL feature.
    df_target = etlu.mapping_feature(df_target, mapping_config)

    # Engage plugin from <mapping> config section, if available.
    mapping_plugin = miscu.eval_func(mapping_config, "plugin")
    if mapping_plugin:
        df_target = mapping_plugin(df_target)

    # --------------------------------
    # Assignment section
    # --------------------------------

    # Prepare assign_var configuration section, by getting values from args configuration.
    assign_config = miscu.eval_elem_mapping(config, 'assign')
    assign_config_var = miscu.eval_elem_mapping(assign_config, 'col_var')
    assign_update_with = dict()
    for col_name, args_key in assign_config_var.items():
        assign_update_with[col_name] = args[args_key]
    assign_config_var = miscu.eval_update_mapping(assign_config, 'col_var',
                                                  assign_update_with)

    # Run assignment ETL feature.
    df_target = etlu.assign_feature(df_target, assign_config)

    # Engage plugin from <assign> config section, if available.
    assign_plugin = miscu.eval_func(assign_config, "plugin")
    if assign_plugin:
        df_target = assign_plugin(df_target)

    # --------------------------------
    # Output section
    # --------------------------------

    # Prepare additional output parameters and update appropriate configuration section.
    # Inject 'path' and 'description' into <output> config section.
    output_update_with = {
        'path': miscu.eval_elem_mapping(args, 'output_path'),
        'description': config['description']
    }
    output_config = miscu.eval_elem_mapping(config, 'output')
    output_write_config = miscu.eval_update_mapping(output_config, "write",
                                                    output_update_with)

    # --------------------------------
    # Rearrange section
    # --------------------------------

    # Run rearrange ETL feature.
    rearrange_config = miscu.eval_elem_mapping(output_config, 'rearrange')
    df_target = etlu.rearrange_feature(df_target, rearrange_config)

    # Engage plugin from <output> config section, if available.
    output_plugin = miscu.eval_func(output_config, "plugin")
    if output_plugin:
        df_target = output_plugin(df_target)

    # Run write ETL feature.
    etlu.write_feature(df_target, output_write_config)
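

# --- Configuration skeleton for run_extraction (illustrative only) ---
# The top-level keys match the sections consumed above ('input', 'mapping',
# 'assign', 'output'); the nested values and args keys are hypothetical.
_example_extraction_config = {
    'description': 'ledger extraction',
    'input': {'read': {'file_type': 'excel', 'sheet_name': 0}},
    'mapping': {'read': {'file_type': 'excel'},
                'left_on': 'EXT_ACCOUNT',
                'right_on': 'SOURCE_ACCOUNT'},
    'assign': {'col_const': {}, 'col_var': {'RUN_DATE': 'run_date'}},
    'output': {'write': {'file_type': 'excel', 'index': False},
               'rearrange': {'col_rename': {}, 'col_reorder': []}},
}
# run_extraction({'input_path': 'in.xlsx', 'mapping_path': 'map.xlsx',
#                 'output_path': 'out.xlsx', 'run_date': '2021-01-31'},
#                _example_extraction_config)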