예제 #1
0
def data_acquisition():
    """Retrieve the Section A data sets and attach customer/dealer info to sales.

    Runs the sales, stock, product-db, customer-group and dealers queries (in
    that order) through ``sql_retrieve_df_specified_query``, de-duplicates the
    product-db frame on ``VehicleData_Code``, tries to make ``NLR_Code``
    numeric, applies ``missing_customer_info_treatment`` to the sales frame
    and finally left-joins the dealer/customer-group columns onto sales via
    ``SLR_Account_CHS_Key``.

    Returns:
        tuple: ``(df_sales, df_stock, df_pdb, df_customers, df_dealers)``.
    """
    performance_info_append(time.time(), 'Section_A_Start')
    log_record('Início Secção A...', options_file.project_id)

    queries = (
        options_file.sales_query,
        options_file.stock_query,
        options_file.product_db_query,
        options_file.customer_group_query,
        options_file.dealers_query,
    )
    # One frame per query, unpacked in the same order as `queries`.
    df_sales, df_stock, df_pdb, df_customers, df_dealers = [
        sql_retrieve_df_specified_query(
            options_file.DSN_SRV3_PRD,
            options_file.sql_info['database_source'], options_file, query)
        for query in queries
    ]

    # There are repeated VehicleData_Code inside this union between BI_DTR
    # and BI_DW_History.
    df_pdb = df_pdb.drop_duplicates(subset='VehicleData_Code')

    # Best-effort numeric conversion: when the column cannot be parsed it is
    # left untouched (same outcome as the deprecated errors='ignore').
    try:
        df_sales['NLR_Code'] = pd.to_numeric(df_sales['NLR_Code'])
    except (ValueError, TypeError):
        pass

    # Adding missing information regarding customers. The helper returns a
    # new frame, so the result has to be kept; calling it without assignment
    # silently discards the treatment.
    df_sales = missing_customer_info_treatment(df_sales)

    # Addition of customer information: dealers get the customer-group
    # description first, then the combined frame is joined onto sales.
    customer_groups = df_customers[[
        'Customer_Group_Code', 'Customer_Group_Desc'
    ]].set_index('Customer_Group_Code')
    df_customers_and_dealers = df_join_function(
        df_dealers, customer_groups, on='Customer_Group_Code', how='left')

    dealer_columns = [
        'SLR_Account_CHS_Key', 'NDB_VATGroup_Desc', 'VAT_Number_Display',
        'NDB_Contract_Dealer_Desc', 'NDB_VHE_PerformGroup_Desc',
        'NDB_VHE_Team_Desc', 'Customer_Display', 'Customer_Group_Code',
        'Customer_Group_Desc', 'NDB_Dealer_Code'
    ]
    df_sales = df_join_function(
        df_sales,
        df_customers_and_dealers[dealer_columns].set_index(
            'SLR_Account_CHS_Key'),
        on='SLR_Account_CHS_Key',
        how='left')

    log_record('Fim Secção A.', options_file.project_id)
    performance_info_append(time.time(), 'Section_A_End')
    return df_sales, df_stock, df_pdb, df_customers, df_dealers
예제 #2
0
def missing_customer_info_treatment(df_sales):
    """Complete ``SLR_Account_CHS_Key`` in sales from the vehicles-without-client sheet.

    Reads ``viaturas_sem_cliente_final rb.xlsx`` under ``base_path``/dbs,
    drops rows with any missing value, prefixes the Navision client number
    with ``'702_'`` and left-joins it onto ``df_sales`` by ``Chassis_Number``
    as ``SLR_Account_CHS_Key_new``. ``value_substitution`` is then called with
    the new column as the non-null source and the original key as the null
    target (presumably filling the key only where it is missing - confirm
    against ``value_substitution``), after which the helper column is dropped.

    Args:
        df_sales: sales frame with ``Chassis_Number`` and
            ``SLR_Account_CHS_Key`` columns.

    Returns:
        The treated frame. The result comes from the join helpers, so callers
        must use the return value.
    """
    navision_column = 'conc / nº cliente navision'
    account_key = 'SLR_Account_CHS_Key'
    helper_column = 'SLR_Account_CHS_Key_new'

    vehicles_lookup = pd.read_excel(
        base_path + '/dbs/viaturas_sem_cliente_final rb.xlsx',
        usecols=['Chassis_Number', 'Registration_Number', navision_column],
        dtype={navision_column: str})
    vehicles_lookup = vehicles_lookup.dropna().rename(
        index=str, columns={navision_column: account_key})
    vehicles_lookup[account_key] = '702_' + vehicles_lookup[account_key]

    chassis_to_account = vehicles_lookup[
        ['Chassis_Number', account_key]].set_index('Chassis_Number')

    df_sales = level_1_b_data_processing.df_join_function(
        df_sales,
        chassis_to_account,
        on='Chassis_Number',
        rsuffix='_new',
        how='left')
    df_sales = level_1_b_data_processing.value_substitution(
        df_sales, non_null_column=helper_column, null_column=account_key)

    # The helper column is only needed for the substitution above.
    df_sales.drop(columns=[helper_column], inplace=True)
    return df_sales
예제 #3
0
def first_step(requests, labels):
    """Join requests with their labels on ``Request_Num``.

    ``Request_Num`` is passed through ``coerce_to_unicode`` in both frames
    (the column is overwritten in place on the frames given by the caller)
    before the join.

    Args:
        requests: frame holding a ``Request_Num`` column.
        labels: frame holding ``Request_Num``, ``StemmedDescription``,
            ``Language`` and ``Label`` columns.

    Returns:
        The result of an inner join, so only requests with a matching label
        are kept. The unmatched requests are PBI requests.
    """
    join_key = 'Request_Num'

    # Normalise the key on both sides so the join compares like with like.
    for frame in (requests, labels):
        frame[join_key] = frame[join_key].apply(coerce_to_unicode)

    label_columns = labels[
        [join_key, 'StemmedDescription', 'Language', 'Label']]
    return df_join_function(
        requests,
        label_columns.set_index(join_key),
        on=join_key,
        how='inner')
예제 #4
0
def file_export_preparation(df, ocn_df, sel_brand):
    """Left-join the brand-specific code columns of ``ocn_df`` onto ``df``.

    The join key is the four PDB description columns (model, engine,
    transmission type, version). For brand code 702 both ``Model_Code`` and
    ``OCN`` are brought in; for brand code 706 only ``Model_Code``.

    Args:
        df: frame to be enriched.
        ocn_df: frame holding the description columns plus the code columns.
        sel_brand: key into ``options_file.nlr_code_desc``.

    Returns:
        The joined frame returned by ``df_join_function``.

    Raises:
        ValueError: if the brand code is neither 702 nor 706.
    """
    brand_code = options_file.nlr_code_desc[sel_brand]
    if brand_code == 702:
        code_columns = ['Model_Code', 'OCN']
    elif brand_code == 706:
        code_columns = ['Model_Code']
    else:
        raise ValueError('Unknown Selected Brand - {}'.format(sel_brand))

    description_keys = (
        'PT_PDB_Model_Desc', 'PT_PDB_Engine_Desc',
        'PT_PDB_Transmission_Type_Desc', 'PT_PDB_Version_Desc',
    )
    lookup = ocn_df[code_columns + list(description_keys)].set_index(
        list(description_keys))

    return df_join_function(
        df, lookup, on=list(description_keys), how='left')
예제 #5
0
def data_processing(df_facts, df_facts_duration, df_clients, df_pbi_categories, keywords_df):
    """Run Section B: filter, enrich and text-clean the requests in ``df_facts``.

    The steps run in a fixed order, each helper receiving the frame returned
    by the previous one:

    1. Remove requests whose ``Category_Id`` belongs to a category whose name
       contains 'Power BI'.
    2. Lowercase ``Summary`` and ``Description``.
    3. Join duration and client/assignee information and apply the
       replacement rules from ``options_file``.
    4. Collect customer/assignee names and logins (used later as stop words).
    5. Null handling, duplicate removal on ``Request_Num``, literal removal,
       string/URL replacements, language detection and stop-word removal.
    6. Optional similar-word handling (module-level ``similar_process_flag``),
       text preprocessing and top-word extraction.

    Args:
        df_facts: requests frame.
        df_facts_duration: frame joined on ``Request_Num``.
        df_clients: frame indexed by ``Contact_Id`` for the joins; joined once
            for the customer and once for the assignee.
        df_pbi_categories: categories frame with ``Category_Name`` and
            ``Category_Id``.
        keywords_df: passed to ``similar_words_handling``.

    Returns:
        tuple: ``(df_facts, df_top_words)`` as returned by
        ``top_words_processing``.
    """
    performance_info_append(time.time(), 'Section_B_Start')
    project_id = options_file.project_id
    log_record('Início Secção B...', project_id)

    # Run-together tokens found in Description and their spaced replacements.
    description_fixes = {
        ('Description', 'filesibmcognoscbindatacqertmodelsfdfdeeacebedeabeeabbedrtm'): 'files ibm cognos',
        ('Description', 'cognosapv'): 'cognos apv',
        ('Description', 'caetanoautopt'): 'caetano auto pt',
        ('Description', 'autolinecognos'): 'autoline cognos',
        ('Description', 'realnao'): 'real nao',
        ('Description', 'booksytner'): 'book sytner',
    }

    # --- Removal of the PBI categories' requests ---
    log_record('Contagem inicial de pedidos: {}'.format(df_facts['Request_Num'].nunique()), project_id)

    # Dropping every category that is NOT Power BI leaves the PBI category ids.
    non_pbi_category_rows = df_pbi_categories[
        ~df_pbi_categories['Category_Name'].str.contains('Power BI')].index
    pbi_only = remove_rows(df_pbi_categories.copy(), [non_pbi_category_rows], project_id)
    pbi_categories = pbi_only['Category_Id'].values

    is_pbi_request = df_facts['Category_Id'].isin(pbi_categories)
    log_record('Contagem de pedidos PBI: {}'.format(df_facts[is_pbi_request]['Request_Num'].nunique()), project_id)
    df_facts = remove_rows(df_facts, [df_facts.loc[is_pbi_request].index], project_id)
    log_record('Após o filtro de pedidos PBI, a nova contagem é de: {}'.format(df_facts['Request_Num'].nunique()), project_id)

    # --- Lowercase conversion of Summary and Description ---
    df_facts = lowercase_column_conversion(df_facts, columns=['Summary', 'Description'])

    # --- Client/assignee information and imputation of some missing values ---
    clients_by_id = df_clients.set_index('Contact_Id')
    df_facts = df_join_function(df_facts, df_facts_duration.set_index('Request_Num'), on='Request_Num')
    df_facts = df_join_function(df_facts, clients_by_id, on='Contact_Customer_Id')
    df_facts = value_replacement(df_facts, options_file.assignee_id_replacements)
    df_facts = df_join_function(
        df_facts,
        df_clients.set_index('Contact_Id'),
        on='Contact_Assignee_Id',
        lsuffix='_Customer',
        rsuffix='_Assignee')
    df_facts = value_replacement(df_facts, options_file.sla_resolution_hours_replacements)

    # --- Every name/login a customer or assignee may appear under ---
    customer_names = string_to_list(df_facts, ['Name_Customer'])
    customer_logins = string_to_list(df_facts, ['Login_Name_Customer'])
    assignee_names = string_to_list(df_facts, ['Name_Assignee'])
    assignee_logins = string_to_list(df_facts, ['Login_Name_Assignee'])

    # Missing assignee names get a fixed placeholder.
    df_facts = null_handling(df_facts, {'Name_Assignee': 'Fechados pelo Cliente'})

    # Resolve date takes the close date when the former is null and the latter exists.
    df_facts = value_substitution(df_facts, non_null_column='Close_Date', null_column='Resolve_Date')

    # One row per request number.
    df_facts = duplicate_removal(df_facts, ['Request_Num'])

    # New lines, tabs, etc. are stripped from the description.
    df_facts = literal_removal(df_facts, 'Description')

    # Known string errors are fixed using the dictionary defined above.
    df_facts = string_replacer(df_facts, description_fixes)

    url_rule = options_file.regex_dict['url']
    df_facts = value_replacement(df_facts, {'Description': url_rule})
    df_facts = value_replacement(df_facts, {'Summary': options_file.regex_dict['url']})
    # Description takes the summary when the former is null and the latter exists.
    df_facts = value_substitution(df_facts, non_null_column='Summary', null_column='Description')

    df_facts = language_detection(df_facts, 'Description', 'Language')
    df_facts = string_replacer(df_facts, {('Language', 'ca'): 'es', ('Category_Id', 'pcat:'): ''})

    # Requests with both Summary and Description null are cleaned out.
    df_facts = summary_description_null_checkup(df_facts)

    stop_words_list = (options_file.words_to_remove_from_description
                       + customer_names
                       + customer_logins
                       + assignee_names
                       + assignee_logins)
    df_facts['Description'] = df_facts['Description'].apply(stop_words_removal, args=(stop_words_list,))

    if similar_process_flag:
        df_facts = similar_words_handling(df_facts, keywords_df, options_file.testing_dict)

    people_tokens = customer_names + customer_logins + assignee_names + assignee_logins
    df_facts = text_preprocess(df_facts, people_tokens, options_file)

    df_facts = value_replacement(df_facts, options_file.language_replacements)

    # Checkpoint B.1 - key words data frame creation
    df_facts, df_top_words = top_words_processing(df_facts, description_col='StemmedDescription')

    log_record('Após o processamento a contagem de pedidos é de: {}'.format(df_facts['Request_Num'].nunique()), project_id)
    log_record('Fim Secção B.', project_id)
    performance_info_append(time.time(), 'Section_B_End')

    return df_facts, df_top_words
예제 #6
0
def _log_chassis_and_rows(df_sales, step_description):
    """Log the unique chassis count and the row count after one filtering step."""
    log_record(
        '{} - Contagem de Chassis únicos: {} com o seguinte número de linhas: {}'
        .format(step_description, df_sales['Chassis_Number'].nunique(),
                df_sales.shape[0]), options_file.project_id)


def data_processing(df_sales, df_pdb_dim, configuration_parameters_cols,
                    range_dates, target):
    """Run Section B: clean, filter and enrich the Hyundai sales data.

    If today's three ``dbs/df_hyundai_dataset_*`` CSV files can all be read,
    the ``all_info`` one is returned as-is. Otherwise (``FileNotFoundError``)
    the incoming ``df_sales`` is processed: date conversion, removal of a
    known bad registration plate, sorting, NA filling per chassis and
    registration, join with the product database, a series of row filters
    (each one logged), feature/measure calculation, parameter translation and
    creation of ``ML_VehicleData_Code``.

    NOTE(review): nothing in this function writes those CSV files, so the
    cached path only triggers if another process produces them.

    Args:
        df_sales: raw sales frame. On the processing path its date columns
            are converted in place before the frame is copied.
        df_pdb_dim: product database frame keyed by ``VehicleData_Code``.
        configuration_parameters_cols: list of configuration column names.
        range_dates: list of extra ``df_pdb_dim`` columns to join.
        target: unused here; kept for interface compatibility.

    Returns:
        The processed (or cached) sales frame.
    """
    performance_info_append(time.time(), 'Section_B_Start')
    log_record('Início Secção B...', options_file.project_id)
    current_date, _ = time_tags()

    # Both ML-version files are read with the same categorical dtypes.
    categorical_dtypes = {
        column: 'category'
        for column in (
            'NDB_VATGroup_Desc', 'VAT_Number_Display',
            'NDB_Contract_Dealer_Desc', 'NDB_VHE_PerformGroup_Desc',
            'NDB_VHE_Team_Desc', 'Customer_Display', 'Customer_Group_Desc',
            'SLR_Account_Dealer_Code', 'Product_Code',
            'Sales_Type_Dealer_Code', 'Sales_Type_Code', 'Vehicle_Type_Code',
            'Fuel_Type_Code', 'PT_PDB_Model_Desc', 'PT_PDB_Engine_Desc',
            'PT_PDB_Transmission_Type_Desc', 'PT_PDB_Version_Desc',
            'PT_PDB_Exterior_Color_Desc', 'PT_PDB_Interior_Color_Desc',
            'NDB_Dealer_Code',
        )
    }

    try:
        # NOTE(review): the two ML-version frames are not used afterwards.
        # They are still read because a missing file must raise
        # FileNotFoundError and send execution to the processing path.
        for ml_file in ('dbs/df_hyundai_dataset_ml_version_ohe_{}.csv',
                        'dbs/df_hyundai_dataset_ml_version_{}.csv'):
            read_csv(ml_file.format(current_date),
                     index_col=0,
                     dtype=dict(categorical_dtypes))

        df_sales = read_csv(
            'dbs/df_hyundai_dataset_all_info_{}.csv'.format(current_date),
            index_col=0,
            dtype={
                'SLR_Account_Dealer_Code': object,
                'Immobilized_Number': object
            },
            parse_dates=options_file.date_columns)

        log_record(
            'Dados do dia atual foram encontrados. A passar para a próxima secção...',
            options_file.project_id)
    except FileNotFoundError:
        log_record('Dados do dia atual não foram encontrados. A processar...',
                   options_file.project_id)

        # Step 1 - Dataset cleaning and transforming to 1 line per sale
        date_columns = [
            'Ship_Arrival_Date', 'SLR_Document_Date_CHS',
            'Registration_Request_Date', 'SLR_Document_Date_RGN'
        ]
        for column in date_columns:
            df_sales[column] = pd.to_datetime(df_sales[column])

        # Filtering
        log_record(
            '1 - Contagem Inicial de Chassis únicos: {}'.format(
                df_sales['Chassis_Number'].nunique()), options_file.project_id)
        log_record(
            '1 - Contagem Inicial de Matrículas únicas: {}'.format(
                df_sales['Registration_Number'].nunique()),
            options_file.project_id)

        print(
            'Removal of 49-VG-94 Registration Plate, which presents two Chassis Number'
        )
        df_sales = df_sales[~(
            df_sales['Registration_Number'] == '49-VG-94')].copy()

        # Sorting. sort_values returns a new frame, so the result must be
        # assigned; a bare call leaves df_sales unsorted.
        df_sales = df_sales.sort_values(date_columns)

        for flag_column in ('No_Registration_Number_Flag',
                            'Registration_Number_No_SLR_Document_RGN_Flag',
                            'SLR_Document_RGN_Flag', 'Undefined_VHE_Status'):
            df_sales[flag_column] = 0

        df_sales = na_fill_hyundai(
            df_sales.groupby(['Chassis_Number', 'Registration_Number']))

        # Step 2: BI Processing
        df_sales = df_join_function(
            df_sales,
            df_pdb_dim[['VehicleData_Code'] + configuration_parameters_cols +
                       range_dates].set_index('VehicleData_Code'),
            on='VehicleData_Code',
            how='left')
        df_sales = update_new_gamas(df_sales, df_pdb_dim)

        df_sales = lowercase_column_conversion(df_sales,
                                               configuration_parameters_cols)

        # Filtering rows with no relevant information. The step numbers in
        # the log messages are kept as they were (4 and 8 are not used).
        df_sales = df_sales[df_sales['VehicleData_Code'] != 1]
        _log_chassis_and_rows(
            df_sales, '2 - Remoção de Viaturas não parametrizadas')

        df_sales = df_sales[df_sales['Sales_Type_Dealer_Code'] != 'Demo']
        _log_chassis_and_rows(
            df_sales, '3 - Remoção de Viaturas de Demonstração')

        df_sales = df_sales[~df_sales['Dispatch_Type_Code'].
                            isin(['AMBULÂNCIA', 'TAXI', 'PSP'])]
        _log_chassis_and_rows(
            df_sales,
            '5 - Remoção de Viaturas Especiais (Ambulâncias, Táxis, PSP)')

        # Filters rows where, for some odd reason, the days in stock are
        # negative (rows with a missing value are dropped as well).
        df_sales = df_sales[df_sales['DaysInStock_Global'] >= 0]
        _log_chassis_and_rows(
            df_sales,
            '6 - Remoção de Viaturas com Dias em Stock Global negativos')

        # Filters rows carrying the invalid 'G.FORCE' registration number.
        df_sales = df_sales[df_sales['Registration_Number'] != 'G.FORCE']
        _log_chassis_and_rows(
            df_sales,
            '7 - Remoção de Viaturas com Matrículas Inválidas (G.Force)')

        df_sales = df_sales[df_sales['DaysInStock_Distributor'].notnull()]
        _log_chassis_and_rows(
            df_sales,
            '9 - Remoção de Viaturas sem informação de Dias em Stock - Distribuidor'
        )

        df_sales = df_sales[df_sales['DaysInStock_Dealer'].notnull()]
        _log_chassis_and_rows(
            df_sales,
            '10 - Remoção de Viaturas sem informação de Dias em Stock - Dealer'
        )

        df_sales = df_sales[df_sales['PT_PDB_Model_Desc'] != 'não definido']
        _log_chassis_and_rows(
            df_sales,
            '11 - Remoção de Viaturas sem informação de Modelo na PDB')

        df_sales = new_features(df_sales, configuration_parameters_cols,
                                options_file.project_id)

        # Specific Measures Calculation
        df_sales = measures_calculation_hyundai(df_sales)

        # Fill values: infinities and NaN become 0.
        # Is this correct? They are caused by Total Sales = 0 and
        # Total Net Sales = 0 respectively.
        for ratio_column in ('Total_Discount_%', 'Fixed_Margin_I_%'):
            df_sales[ratio_column] = df_sales[ratio_column].replace(
                [np.inf, np.nan, -np.inf], 0)

        # Lowercases the strings of these columns
        df_sales = lowercase_column_conversion(
            df_sales, configuration_parameters_cols)

        # Parameter Translation (one dictionary per column, same order)
        translation_dictionaries = [
            options_file.transmission_translation,
            options_file.ext_color_translation,
            options_file.int_color_translation
        ]
        df_sales = col_group(df_sales, [
            'PT_PDB_Transmission_Type_Desc', 'PT_PDB_Exterior_Color_Desc',
            'PT_PDB_Interior_Color_Desc'
        ], translation_dictionaries, options_file.project_id)

        df_sales = df_sales[
            df_sales['PT_PDB_Version_Desc'] != 'NÃO_PARAMETRIZADOS']
        # NOTE(review): this message reuses step label '9'; left unchanged
        # because it is runtime log output.
        _log_chassis_and_rows(
            df_sales, '9 - Remoção de Viaturas sem versão parametrizada')
        project_units_count_checkup(df_sales,
                                    'Chassis_Number',
                                    options_file,
                                    sql_check=1)

        # Parameter Grouping is currently not applied.
        print('### NO GROUPING ###')

        log_record(
            'Contagem de VehicleData_Code únicos: {}'.format(
                df_sales['VehicleData_Code'].nunique()),
            options_file.project_id)

        configurations = df_sales.groupby(configuration_parameters_cols)
        log_record(
            'Contagem de Configurações: {}'.format(len(configurations)),
            options_file.project_id)

        # New VehicleData_Code Creation: one integer id per configuration.
        df_sales['ML_VehicleData_Code'] = configurations.ngroup()

    log_record('Fim Secção B.', options_file.project_id)
    performance_info_append(time.time(), 'Section_B_End')
    return df_sales