Exemplo n.º 1
0
def cos_transform_load(folder_path, sql_query_file_path, table_name,
                       connection_string):
    """
    Restructure the cos report to be compliant with the database table.
    Here are the mains steps:
    1. Read the report, forcing the type of the item number and warehouse code columns.
    2. Drop the unnecessary columns and the mostly-empty rows.
    3. Rename the columns.
    4. Remove the rows with a missing or zero quantity.
    5. Generate a column with the date (see extract_date_from_report_name function).
    6. Generate the genacol item number (see add_genacol_item_number_column function).
    7. Drop the external item number column.
    8. Reorganize the column order based on the sql table order (see pos_table_column_name_order in Airflow config variables).

    :param folder_path: A string indicating the folder that contains the report.
    :param sql_query_file_path: A string indicating the path to the .sql file used to generate the lookup table (see add_genacol_item_number_column function).
    :param table_name: A string indicating the name of the SQL table that should received the data frame.
    :param connection_string: A string indicating the connection information to a database.
        Follow this structure https://docs.sqlalchemy.org/en/13/core/engines.html.
        Recommended to pull connection string from Airflow connections.

    :return: A pandas data frame compliant with the database format.
    """
    file_path = get_file_path(folder_path)
    df = pd.read_csv(file_path,
                     header=1,
                     delimiter=';',
                     dtype={
                         'Item Number': float,
                         'Warehouse Code': object
                     })
    df = df.drop(columns=['Item', 'Venue'])
    # Keep only rows with at least 6 non-null values (drops summary/blank rows).
    df = df.dropna(axis=0, thresh=6)
    df.columns = [
        "CUSTITEMNMBR", "store_city", "store_province", "store_id",
        "store_address", "store_postalCode", "pos_qty"
    ]
    df = df.dropna(axis=0, subset=['pos_qty'])
    df = df[df['pos_qty'] != 0]
    df['pos_qty'] = df['pos_qty'].astype(int)
    df['week_starting'] = extract_date_from_report_name(folder_path)
    df['ref_custnmbr'] = 'COST0000'
    # Read as float above (column may contain NaN); cast through int to drop
    # the decimal part before converting to the string key used by the lookup.
    df['CUSTITEMNMBR'] = df['CUSTITEMNMBR'].astype('int').astype('str')
    add_genacol_item_number_column(data_frame=df,
                                   connection_string=connection_string,
                                   sql_query_file_path=sql_query_file_path)
    df = df.drop(columns=['CUSTITEMNMBR'])
    df = df[Variable.get(
        key='databases',
        deserialize_json=True)['asterix_test']['pos_table_column_name_order']]

    return df, table_name, connection_string
Exemplo n.º 2
0
def init_shp_stores(save_to_folder, update_folder_path):
    """
    Takes a shp stores report from the update_folder_path, restructures it (see shp_stores_transform function)
    and saves it in the save_to_folder. Finally, it deletes the original shp stores report.
    The created report becomes the master file.

    :param save_to_folder: A string indicating to the folder that should receive the final report.
    :param update_folder_path: A string indicating the folder that contains the report to transform.

    :return: A new master file in save_to_folder and no more file in update_folder_path.
    """
    file_path = get_file_path(update_folder_path)
    df = shp_stores_transform(folder_path=update_folder_path)
    # os.path.join handles the separator portably instead of hard-coding '/'.
    df.to_csv(os.path.join(save_to_folder, 'shp_master_store_list.csv'),
              index=False)
    os.remove(file_path)
Exemplo n.º 3
0
def uni_transform_load(folder_path, sql_query_file_path, table_name,
                       connection_string):
    """
    Reshape the uni report into the layout expected by the database table.

    Steps:
    1. Read the Excel report and keep only the item number (column 0) and
       quantity (column 13) columns.
    2. Drop the first three header rows and the final total row.
    3. Rename the columns and truncate the external item number to its
       12-character lookup key.
    4. Add the week_starting date (see extract_date_from_report_name function).
    5. Add the genacol item number (see add_genacol_item_number_column
       function), then drop the external item number.
    6. Add the banner code and fill the store columns this report lacks
       with the literal 'NULL' (province is always 'QC').
    7. Reorder the columns to match the sql table (see
       pos_table_column_name_order in Airflow config variables).

    :param folder_path: A string indicating the folder that contains the report.
    :param sql_query_file_path: A string indicating the path to the .sql file used to generate the lookup table (see add_genacol_item_number_column function).
    :param table_name: A string indicating the name of the SQL table that should received the data frame.
    :param connection_string: A string indicating the connection information to a database.
        Follow this structure https://docs.sqlalchemy.org/en/13/core/engines.html.
        Recommended to pull connection string from Airflow connections.

    :return: A pandas data frame compliant with the database format.
    """
    report_path = get_file_path(folder_path)
    report = pd.read_excel(report_path, header=None)
    # Only the item number (col 0) and quantity (col 13) are of interest.
    report = report[[0, 13]]
    # First three rows are header noise; last row is the grand total.
    report = report.drop([0, 1, 2]).iloc[:-1]
    report.columns = ['CUSTITEMNMBR', 'pos_qty']
    # The lookup table keys on the first 12 characters of the item number.
    report['CUSTITEMNMBR'] = report['CUSTITEMNMBR'].str[:12]
    report['week_starting'] = extract_date_from_report_name(folder_path)
    add_genacol_item_number_column(data_frame=report,
                                   connection_string=connection_string,
                                   sql_query_file_path=sql_query_file_path)
    report = report.drop(columns=['CUSTITEMNMBR'])
    report['ref_custnmbr'] = 'UNIP0000'
    report['store_province'] = 'QC'
    # Store details are absent from this report; use the literal 'NULL'.
    for absent_column in ('store_city', 'store_id', 'store_address',
                          'store_postalCode'):
        report[absent_column] = 'NULL'
    column_order = Variable.get(
        key='databases',
        deserialize_json=True)['asterix_test']['pos_table_column_name_order']
    report = report[column_order]

    return report, table_name, connection_string
Exemplo n.º 4
0
def wal_transform_load(folder_path, sql_query_file_path, table_name,
                       connection_string):
    """
    Restructure the wal report to be compliant with the database table.
    Here are the mains steps:
    1. Select the column of interest.
    2. Rename the columns.
    3. Remove the rows with a quantity equal to 0.
    4. Generate a column with the date (see extract_date_from_report_name function).
    5. Generate the genacol item number (see add_genacol_item_number_column function).
    6. Drop the external item number column.
    7. Generate a column with the banner name (see extract_banner_from_report_name function).
    8. Reorganize the column order based on the sql table order (see pos_table_column_name_order in Airflow config variables).
    9. Change the format of certain column.

    :param folder_path: A string indicating the folder that contains the report.
    :param sql_query_file_path: A string indicating the path to the .sql file used to generate the lookup table (see add_genacol_item_number_column function).
    :param table_name: A string indicating the name of the SQL table that should received the data frame.
    :param connection_string: A string indicating the connection information to a database.
        Follow this structure https://docs.sqlalchemy.org/en/13/core/engines.html.
        Recommended to pull connection string from Airflow connections.

    :return: A pandas data frame compliant with the database format.
    """
    file_path = get_file_path(folder_path)
    df = pd.read_csv(file_path, header=None, sep='\t', dtype={
        5: object
    })  # Force string on column 5 because values starts with '00'.
    df = df[[0, 1, 2, 3, 4, 5, 7]]
    df.columns = [
        'store_id', 'store_address', 'store_city', 'store_province',
        'store_postalCode', 'CUSTITEMNMBR', 'pos_qty'
    ]
    df = df[df['pos_qty'] != 0]
    df['week_starting'] = extract_date_from_report_name(folder_path)
    add_genacol_item_number_column(data_frame=df,
                                   connection_string=connection_string,
                                   sql_query_file_path=sql_query_file_path)
    df = df.drop(columns=['CUSTITEMNMBR'])
    df['ref_custnmbr'] = 'WALM0000'
    df = df[Variable.get(
        key='databases',
        deserialize_json=True)['asterix_test']['pos_table_column_name_order']]
    # Vectorized astype instead of element-wise apply(int)/apply(str); same
    # result, faster, and consistent with the other *_transform_load helpers.
    df['store_id'] = df['store_id'].astype(int).astype(str)
    df['pos_qty'] = df['pos_qty'].astype(int)

    return df, table_name, connection_string
Exemplo n.º 5
0
def jpc_transform_load(folder_path, sql_query_file_path, table_name,
                       connection_string):
    """
    Reshape the jpc report into the layout expected by the database table.

    Steps:
    1. Read the latin1, semicolon-separated report and keep the store,
       item number and quantity columns.
    2. Rename the columns and drop zero-quantity rows.
    3. Add the week_starting date (see extract_date_from_report_name function).
    4. Cast the external item number to string so it matches the lookup
       table, add the genacol item number (see
       add_genacol_item_number_column function) and drop the external one.
    5. Add the banner code and strip the parenthesized suffix from the
       store city.
    6. Reorder the columns to match the sql table (see
       pos_table_column_name_order in Airflow config variables).

    :param folder_path: A string indicating the folder that contains the report.
    :param sql_query_file_path: A string indicating the path to the .sql file used to generate the lookup table (see add_genacol_item_number_column function).
    :param table_name: A string indicating the name of the SQL table that should received the data frame.
    :param connection_string: A string indicating the connection information to a database.
        Follow this structure https://docs.sqlalchemy.org/en/13/core/engines.html.
        Recommended to pull connection string from Airflow connections.

    :return: A pandas data frame compliant with the database format.
    """
    report_file = get_file_path(folder_path)
    sales = pd.read_csv(report_file, header=None, encoding='latin1', sep=';')
    # Positions of the columns of interest in the raw export.
    sales = sales[[0, 2, 5, 6, 7, 11, 12]]
    sales.columns = [
        'store_id', 'store_address', 'store_city', 'store_postalCode',
        'store_province', 'CUSTITEMNMBR', 'pos_qty'
    ]
    # Zero-quantity lines carry no information for the POS table.
    sales = sales[sales['pos_qty'] != 0]
    sales['week_starting'] = extract_date_from_report_name(folder_path)
    # The lookup table keys the external item number as a string.
    sales['CUSTITEMNMBR'] = sales['CUSTITEMNMBR'].astype('str')
    add_genacol_item_number_column(data_frame=sales,
                                   connection_string=connection_string,
                                   sql_query_file_path=sql_query_file_path)
    sales = sales.drop(columns=['CUSTITEMNMBR'])
    sales['ref_custnmbr'] = "JEAN0000"
    # City names arrive as 'City (extra info)'; keep only the city part.
    sales['store_city'] = sales['store_city'].str.split('(').str[0]
    column_order = Variable.get(
        key='databases',
        deserialize_json=True)['asterix_test']['pos_table_column_name_order']
    sales = sales[column_order]

    return sales, table_name, connection_string
Exemplo n.º 6
0
def shp_stores_transform(folder_path):
    """
    Restructure the shp stores report in order to join it eventually with the shp report.

    :param folder_path: A string indicating where the file to transform reside.
        The structure ensure that only one file at a time is present in the folder.
        This is why we can assume that the first file is the right one.

    :return: A pandas data frame object.
    """
    source_path = get_file_path(folder_path)
    # 'Site #' is forced to string so any leading zeros are preserved.
    stores = pd.read_excel(source_path, header=0, dtype={'Site #': object})
    # Keep only the store identification/location columns.
    stores = stores.iloc[:, [1, 7, 8, 9, 10]]
    stores.columns = [
        'store_id', 'store_address', 'store_city', 'store_province',
        'store_postalCode'
    ]
    # Normalize the legacy 'PQ' province code to 'QC'.
    stores['store_province'] = stores['store_province'].replace('PQ', 'QC')

    return stores
Exemplo n.º 7
0
def shp_transform_load(folder_path, sql_query_file_path, table_name,
                       connection_string, store_master_file_path):
    """
    Restructure the shp report to be compliant with the database table.
    Here are the mains steps:
    1. Select the column of interest.
    2. Rename the columns.
    3. Removes the rows where '-' is in the quantity column (refers to a null quantity).
    4. Generate a column with the date (see extract_date_from_report_name function).
    5. Change the external item number format to string to fit with the lookup table.
    6. Import the shp_stores.csv file (if not exist see init_shp_stores function).
    7. Merge the store ids from the report and the store file to check if new store id are present in the report.
        If so, the process stops and it recommends the user to manually call the 'update_shp_stores' function.
    8. Generate the genacol item number (see add_genacol_item_number_column function).
    9. Drop the external item number column.
    10. Generate a column with the banner name (see extract_banner_from_report_name function).
    11. Reorganize the column order based on the sql table order (see pos_table_column_name_order in Airflow config variables).

    :param folder_path: A string indicating the folder that contains the report.
    :param sql_query_file_path: A string indicating the path to the .sql file used to generate the lookup table (see add_genacol_item_number_column function).
    :param table_name: A string indicating the name of the SQL table that should received the data frame.
    :param connection_string: A string indicating the connection information to a database.
        Follow this structure https://docs.sqlalchemy.org/en/13/core/engines.html.
        Recommended to pull connection string from Airflow connections.
    :param store_master_file_path: A string indicating the path to the master store list csv
        (see init_shp_stores function) used to resolve store details from store ids.

    :return: A pandas data frame compliant with the database format.

    :raises AirflowException: If the report contains store ids absent from the master file.
    """
    file_path = get_file_path(folder_path)
    df = pd.read_excel(file_path, header=0, dtype={
        'Site #': object
    })  # Force string on column because values starts with '0'.
    df = df.iloc[:, [1, 3, (len(df.columns) - 1)]]
    df.columns = ['store_id', 'CUSTITEMNMBR', 'pos_qty']
    df = df[df['pos_qty'] != '-']
    df['week_starting'] = extract_date_from_report_name(folder_path)
    df['CUSTITEMNMBR'] = df['CUSTITEMNMBR'].astype('str')
    store_infos = pd.read_csv(store_master_file_path,
                              dtype={'store_id': object})

    # Double verification for the store: every store id in the report must
    # already exist in the master file, otherwise the run is aborted.
    df_merged = df.merge(store_infos,
                         how='left',
                         on='store_id',
                         indicator=True)
    left_only = df_merged[df_merged['_merge'] == 'left_only']

    if not left_only.empty:
        raise AirflowException(
            "These store IDs in the report do not match the master file: %s."
            % left_only.values.tolist())
    df = df_merged.drop(columns=['_merge'])

    add_genacol_item_number_column(data_frame=df,
                                   connection_string=connection_string,
                                   sql_query_file_path=sql_query_file_path)
    df = df.drop(columns=['CUSTITEMNMBR'])
    df['ref_custnmbr'] = 'SHOP0000'
    df = df[Variable.get(
        key='databases',
        deserialize_json=True)['asterix_test']['pos_table_column_name_order']]

    return df, table_name, connection_string