def data_acquisition():
    performance_info_append(time.time(), 'Section_A_Start')
    log_record('Início Secção A...', options_file.project_id)
    current_date, _ = time_tags()

    dfs = []
    for query in [options_file.sales_query, options_file.stock_query, options_file.product_db_query, options_file.customer_group_query, options_file.dealers_query]:
        df = sql_retrieve_df_specified_query(options_file.DSN_SRV3_PRD, options_file.sql_info['database_source'], options_file, query)
        # df.to_csv(file_name + '.csv')
        dfs.append(df)

    df_sales = dfs[0]
    df_stock = dfs[1]
    df_pdb = dfs[2]
    df_customers = dfs[3]
    df_dealers = dfs[4]

    df_pdb.drop_duplicates(subset='VehicleData_Code', inplace=True)  # There are repeated VehicleData_Code inside this union between BI_DTR and BI_DW_History

    df_sales['NLR_Code'] = pd.to_numeric(df_sales['NLR_Code'], errors='ignore')

    # Adding missing information regarding customers
    df_sales = missing_customer_info_treatment(df_sales)  # missing_customer_info_treatment returns a new DataFrame, so its result must be kept

    # Addition of customer information
    df_customers_and_dealers = df_join_function(df_dealers, df_customers[['Customer_Group_Code', 'Customer_Group_Desc']].set_index('Customer_Group_Code'), on='Customer_Group_Code', how='left')
    df_sales = df_join_function(df_sales, df_customers_and_dealers[['SLR_Account_CHS_Key', 'NDB_VATGroup_Desc', 'VAT_Number_Display', 'NDB_Contract_Dealer_Desc', 'NDB_VHE_PerformGroup_Desc', 'NDB_VHE_Team_Desc', 'Customer_Display', 'Customer_Group_Code', 'Customer_Group_Desc', 'NDB_Dealer_Code']].set_index('SLR_Account_CHS_Key'), on='SLR_Account_CHS_Key', how='left')

    log_record('Fim Secção A.', options_file.project_id)
    performance_info_append(time.time(), 'Section_A_End')
    return df_sales, df_stock, df_pdb, df_customers, df_dealers
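
# The joins above and below rely on the project helper df_join_function (from the
# level_1_b data-processing module). Its implementation is not shown in this section;
# the sketch below is only an illustration, assuming it is a thin wrapper around
# pandas.DataFrame.join that merges a key-indexed frame onto a key column of df.
# The name _df_join_function_sketch is hypothetical and not part of the project code.
def _df_join_function_sketch(df, df_to_join, on, how='left', lsuffix='', rsuffix=''):
    """Illustrative only: join df_to_join (already indexed by the key) onto df[on]."""
    return df.join(df_to_join, on=on, how=how, lsuffix=lsuffix, rsuffix=rsuffix)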

def missing_customer_info_treatment(df_sales):
    df_vehicles_wo_clients = pd.read_excel(base_path + '/dbs/viaturas_sem_cliente_final rb.xlsx',
                                           usecols=['Chassis_Number', 'Registration_Number', 'conc / nº cliente navision'],
                                           dtype={'conc / nº cliente navision': str}).dropna()
    df_vehicles_wo_clients.rename(index=str, columns={'conc / nº cliente navision': 'SLR_Account_CHS_Key'}, inplace=True)
    df_vehicles_wo_clients['SLR_Account_CHS_Key'] = '702_' + df_vehicles_wo_clients['SLR_Account_CHS_Key']

    df_sales = level_1_b_data_processing.df_join_function(df_sales, df_vehicles_wo_clients[['Chassis_Number', 'SLR_Account_CHS_Key']].set_index('Chassis_Number'), on='Chassis_Number', rsuffix='_new', how='left')
    df_sales = level_1_b_data_processing.value_substitution(df_sales, non_null_column='SLR_Account_CHS_Key_new', null_column='SLR_Account_CHS_Key')  # Fills SLR_Account_CHS_Key with SLR_Account_CHS_Key_new when the former is null and the latter exists
    df_sales.drop(['SLR_Account_CHS_Key_new'], axis=1, inplace=True)

    return df_sales
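
# value_substitution (from level_1_b_data_processing) is used above to backfill
# SLR_Account_CHS_Key from the joined '_new' column. Its real implementation is not
# shown in this section; a minimal sketch of the assumed behaviour, based on how it
# is called here, could look like the hypothetical helper below.
def _value_substitution_sketch(df, non_null_column, null_column):
    """Illustrative only: fill null_column with non_null_column wherever null_column is missing."""
    df[null_column] = df[null_column].fillna(df[non_null_column])
    return df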

def first_step(requests, labels):
    # Join requests and labels on Request_Num
    requests['Request_Num'] = requests['Request_Num'].apply(coerce_to_unicode)
    labels['Request_Num'] = labels['Request_Num'].apply(coerce_to_unicode)

    # how='inner' because I only want to keep requests with matching labels. The unmatched requests are PBI requests.
    labeled_requests = df_join_function(requests, labels[['Request_Num', 'StemmedDescription', 'Language', 'Label']].set_index('Request_Num'), on='Request_Num', how='inner')

    return labeled_requests
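
# A small, self-contained illustration of the inner join performed in first_step:
# requests without a matching label are dropped, which is how unlabeled (PBI) requests
# fall out. The toy data and the helper name _first_step_example are made up for this sketch.
def _first_step_example():
    """Illustrative only: request 'R2' has no label and therefore disappears."""
    import pandas as pd  # local import, in case this module does not import pandas at top level
    toy_requests = pd.DataFrame({'Request_Num': ['R1', 'R2'], 'Summary': ['erro cognos', 'pbi refresh']})
    toy_labels = pd.DataFrame({'Request_Num': ['R1'], 'StemmedDescription': ['erro cognos'], 'Language': ['pt'], 'Label': ['cognos']})
    return toy_requests.join(toy_labels.set_index('Request_Num'), on='Request_Num', how='inner')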

def file_export_preparation(df, ocn_df, sel_brand):
    if options_file.nlr_code_desc[sel_brand] == 702:
        df_joined = df_join_function(df,
                                     ocn_df[['Model_Code', 'OCN', 'PT_PDB_Model_Desc', 'PT_PDB_Engine_Desc', 'PT_PDB_Transmission_Type_Desc', 'PT_PDB_Version_Desc']].set_index(['PT_PDB_Model_Desc', 'PT_PDB_Engine_Desc', 'PT_PDB_Transmission_Type_Desc', 'PT_PDB_Version_Desc']),
                                     on=['PT_PDB_Model_Desc', 'PT_PDB_Engine_Desc', 'PT_PDB_Transmission_Type_Desc', 'PT_PDB_Version_Desc'],
                                     how='left')
    elif options_file.nlr_code_desc[sel_brand] == 706:
        df_joined = df_join_function(df,
                                     ocn_df[['Model_Code', 'PT_PDB_Model_Desc', 'PT_PDB_Engine_Desc', 'PT_PDB_Transmission_Type_Desc', 'PT_PDB_Version_Desc']].set_index(['PT_PDB_Model_Desc', 'PT_PDB_Engine_Desc', 'PT_PDB_Transmission_Type_Desc', 'PT_PDB_Version_Desc']),
                                     on=['PT_PDB_Model_Desc', 'PT_PDB_Engine_Desc', 'PT_PDB_Transmission_Type_Desc', 'PT_PDB_Version_Desc'],
                                     how='left')
    else:
        raise ValueError('Unknown Selected Brand - {}'.format(sel_brand))

    return df_joined

def data_processing(df_facts, df_facts_duration, df_clients, df_pbi_categories, keywords_df):
    performance_info_append(time.time(), 'Section_B_Start')
    log_record('Início Secção B...', options_file.project_id)

    dict_strings_to_replace = {('Description', 'filesibmcognoscbindatacqertmodelsfdfdeeacebedeabeeabbedrtm'): 'files ibm cognos',
                               ('Description', 'cognosapv'): 'cognos apv',
                               ('Description', 'caetanoautopt'): 'caetano auto pt',
                               ('Description', 'autolinecognos'): 'autoline cognos',
                               ('Description', 'realnao'): 'real nao',
                               ('Description', 'booksytner'): 'book sytner'}
    # ('Description', 'http://'): 'http://www.', ('Summary', 'http://'): 'http://www.'

    # Remove PBI's categories requests
    log_record('Contagem inicial de pedidos: {}'.format(df_facts['Request_Num'].nunique()), options_file.project_id)
    pbi_categories = remove_rows(df_pbi_categories.copy(), [df_pbi_categories[~df_pbi_categories['Category_Name'].str.contains('Power BI')].index], options_file.project_id)['Category_Id'].values  # Selects the Category Ids which belong to PBI
    log_record('Contagem de pedidos PBI: {}'.format(df_facts[df_facts['Category_Id'].isin(pbi_categories)]['Request_Num'].nunique()), options_file.project_id)
    df_facts = remove_rows(df_facts, [df_facts.loc[df_facts['Category_Id'].isin(pbi_categories)].index], options_file.project_id)  # Removes the rows which belong to PBI
    log_record('Após o filtro de pedidos PBI, a nova contagem é de: {}'.format(df_facts['Request_Num'].nunique()), options_file.project_id)

    # Lowercase conversion of Summary and Description
    df_facts = lowercase_column_conversion(df_facts, columns=['Summary', 'Description'])

    # Addition of Client/Assignee information and imputation of some missing values
    df_facts = df_join_function(df_facts, df_facts_duration.set_index('Request_Num'), on='Request_Num')
    df_facts = df_join_function(df_facts, df_clients.set_index('Contact_Id'), on='Contact_Customer_Id')
    df_facts = value_replacement(df_facts, options_file.assignee_id_replacements)
    df_facts = df_join_function(df_facts, df_clients.set_index('Contact_Id'), on='Contact_Assignee_Id', lsuffix='_Customer', rsuffix='_Assignee')
    df_facts = value_replacement(df_facts, options_file.sla_resolution_hours_replacements)

    # Collection of all Client/Assignee possible names
    unique_clients_names_decoded = string_to_list(df_facts, ['Name_Customer'])
    unique_clients_login_decoded = string_to_list(df_facts, ['Login_Name_Customer'])
    unique_assignee_names_decoded = string_to_list(df_facts, ['Name_Assignee'])
    unique_assignee_login_decoded = string_to_list(df_facts, ['Login_Name_Assignee'])

    # Imputation of missing values for the Name_Assignee column
    df_facts = null_handling(df_facts, {'Name_Assignee': 'Fechados pelo Cliente'})

    # Replaces resolve date by close date when the first is null and the second exists
    df_facts = value_substitution(df_facts, non_null_column='Close_Date', null_column='Resolve_Date')
    # df_facts = df_facts.groupby('Request_Num').apply(close_and_resolve_date_replacements)  # Currently doing nothing, hence why it's commented

    # Removes duplicate request numbers
    df_facts = duplicate_removal(df_facts, ['Request_Num'])

    # Removes new lines, tabs, etc.
    df_facts = literal_removal(df_facts, 'Description')

    # Replaces string errors, specified in the provided dictionary
    df_facts = string_replacer(df_facts, dict_strings_to_replace)
    df_facts = value_replacement(df_facts, {'Description': options_file.regex_dict['url']})
    df_facts = value_replacement(df_facts, {'Summary': options_file.regex_dict['url']})

    # Replaces description by summary when the first is null and the second exists
    df_facts = value_substitution(df_facts, non_null_column='Summary', null_column='Description')

    df_facts = language_detection(df_facts, 'Description', 'Language')
    df_facts = string_replacer(df_facts, {('Language', 'ca'): 'es', ('Category_Id', 'pcat:'): ''})
    df_facts = summary_description_null_checkup(df_facts)  # Cleans requests which have both Summary and Description null

    stop_words_list = options_file.words_to_remove_from_description + unique_clients_names_decoded + unique_clients_login_decoded + unique_assignee_names_decoded + unique_assignee_login_decoded
    df_facts['Description'] = df_facts['Description'].apply(stop_words_removal, args=(stop_words_list,))

    if similar_process_flag:
        df_facts = similar_words_handling(df_facts, keywords_df, options_file.testing_dict)

    df_facts = text_preprocess(df_facts, unique_clients_names_decoded + unique_clients_login_decoded + unique_assignee_names_decoded + unique_assignee_login_decoded, options_file)
    df_facts = value_replacement(df_facts, options_file.language_replacements)

    # Checkpoint B.1 - Key Words data frame creation
    df_facts, df_top_words = top_words_processing(df_facts, description_col='StemmedDescription')

    log_record('Após o processamento a contagem de pedidos é de: {}'.format(df_facts['Request_Num'].nunique()), options_file.project_id)
    log_record('Fim Secção B.', options_file.project_id)
    performance_info_append(time.time(), 'Section_B_End')
    return df_facts, df_top_words
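
# stop_words_removal is applied element-wise to the Description column above. Its
# project implementation is not shown in this section; a minimal sketch of the assumed
# behaviour (drop whole-word matches of the stop-word list, case already lowered
# upstream) is given by the hypothetical helper below.
def _stop_words_removal_sketch(description, stop_words_list):
    """Illustrative only: remove whole-word matches of the stop words from a string."""
    return ' '.join(token for token in str(description).split() if token not in stop_words_list)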

def data_processing(df_sales, df_pdb_dim, configuration_parameters_cols, range_dates, target):
    performance_info_append(time.time(), 'Section_B_Start')
    log_record('Início Secção B...', options_file.project_id)
    current_date, _ = time_tags()

    category_dtypes = {'NDB_VATGroup_Desc': 'category', 'VAT_Number_Display': 'category', 'NDB_Contract_Dealer_Desc': 'category', 'NDB_VHE_PerformGroup_Desc': 'category', 'NDB_VHE_Team_Desc': 'category',
                       'Customer_Display': 'category', 'Customer_Group_Desc': 'category', 'SLR_Account_Dealer_Code': 'category', 'Product_Code': 'category', 'Sales_Type_Dealer_Code': 'category',
                       'Sales_Type_Code': 'category', 'Vehicle_Type_Code': 'category', 'Fuel_Type_Code': 'category', 'PT_PDB_Model_Desc': 'category', 'PT_PDB_Engine_Desc': 'category',
                       'PT_PDB_Transmission_Type_Desc': 'category', 'PT_PDB_Version_Desc': 'category', 'PT_PDB_Exterior_Color_Desc': 'category', 'PT_PDB_Interior_Color_Desc': 'category', 'NDB_Dealer_Code': 'category'}

    try:
        df_ohe = read_csv('dbs/df_hyundai_dataset_ml_version_ohe_{}.csv'.format(current_date), index_col=0, dtype=category_dtypes)
        df_non_ohe = read_csv('dbs/df_hyundai_dataset_ml_version_{}.csv'.format(current_date), index_col=0, dtype=category_dtypes)
        df_sales = read_csv('dbs/df_hyundai_dataset_all_info_{}.csv'.format(current_date), index_col=0, dtype={'SLR_Account_Dealer_Code': object, 'Immobilized_Number': object}, parse_dates=options_file.date_columns)
        log_record('Dados do dia atual foram encontrados. A passar para a próxima secção...', options_file.project_id)
    except FileNotFoundError:
        log_record('Dados do dia atual não foram encontrados. A processar...', options_file.project_id)

        # Step 1 - Dataset cleaning and transforming to 1 line per sale
        columns_to_convert_to_datetime = ['Ship_Arrival_Date', 'SLR_Document_Date_CHS', 'Registration_Request_Date', 'SLR_Document_Date_RGN']
        for column in columns_to_convert_to_datetime:
            df_sales[column] = pd.to_datetime(df_sales[column])

        # Filtering
        log_record('1 - Contagem Inicial de Chassis únicos: {}'.format(df_sales['Chassis_Number'].nunique()), options_file.project_id)
        log_record('1 - Contagem Inicial de Matrículas únicas: {}'.format(df_sales['Registration_Number'].nunique()), options_file.project_id)

        print('Removal of 49-VG-94 Registration Plate, which presents two Chassis Numbers')
        df_sales = df_sales[~(df_sales['Registration_Number'] == '49-VG-94')].copy()

        # Sorting - sort_values returns a new DataFrame, so the sorted result must be kept
        df_sales = df_sales.sort_values(['Ship_Arrival_Date', 'SLR_Document_Date_CHS', 'Registration_Request_Date', 'SLR_Document_Date_RGN'])

        df_sales['No_Registration_Number_Flag'] = 0
        df_sales['Registration_Number_No_SLR_Document_RGN_Flag'] = 0
        df_sales['SLR_Document_RGN_Flag'] = 0
        df_sales['Undefined_VHE_Status'] = 0

        df_sales_grouped_3 = df_sales.groupby(['Chassis_Number', 'Registration_Number'])
        df_sales = na_fill_hyundai(df_sales_grouped_3)

        # New Column Creation
        # df_sales_grouped = df_sales.groupby(['VehicleData_Code'])
        # df_sales['Quantity_Sold'] = df_sales_grouped['Quantity_CHS'].transform('sum')
        # df_sales['Quantity_Sold'] = df_sales['Quantity_Sold'].astype(np.int64, errors='ignore')
        # df_sales_unique_chassis = df_sales.drop_duplicates(subset=['VehicleData_Code', 'Chassis_Number']).copy()
        # df_sales_grouped_2 = df_sales_unique_chassis.groupby(['VehicleData_Code'])
        # df_sales['Average_DaysInStock_Global'] = df_sales_grouped_2['DaysInStock_Global'].transform('mean').round(3)
        # df_sales.to_csv('dbs/df_sales_importador_processed_{}.csv'.format(current_date))

        # Step 2: BI Processing
        # print('Number of unique Chassis: {} and number of rows: {}'.format(df_sales['Chassis_Number'].nunique(), df_sales.shape[0]))
        df_sales = df_join_function(df_sales, df_pdb_dim[['VehicleData_Code'] + configuration_parameters_cols + range_dates].set_index('VehicleData_Code'), on='VehicleData_Code', how='left')
        df_sales = update_new_gamas(df_sales, df_pdb_dim)
        df_sales = lowercase_column_conversion(df_sales, configuration_parameters_cols)

        # Filtering rows with no relevant information
        # print('1 - Number of unique Chassis: {} and number of rows: {}'.format(df_sales['Chassis_Number'].nunique(), df_sales.shape[0]))
        # df_sales = df_sales[df_sales['NLR_Code'] == 702]  # Selection of Hyundai vehicles only
        # log_record('1 - Contagem de Chassis únicos: {} com o seguinte número de linhas: {}'.format(df_sales['Chassis_Number'].nunique(), df_sales.shape[0]), options_file.project_id)
        df_sales = df_sales[df_sales['VehicleData_Code'] != 1]
        log_record('2 - Remoção de Viaturas não parametrizadas - Contagem de Chassis únicos: {} com o seguinte número de linhas: {}'.format(df_sales['Chassis_Number'].nunique(), df_sales.shape[0]), options_file.project_id)
        df_sales = df_sales[df_sales['Sales_Type_Dealer_Code'] != 'Demo']
        log_record('3 - Remoção de Viaturas de Demonstração - Contagem de Chassis únicos: {} com o seguinte número de linhas: {}'.format(df_sales['Chassis_Number'].nunique(), df_sales.shape[0]), options_file.project_id)
        # df_sales = df_sales[df_sales['Sales_Type_Code_DMS'].isin(['RAC', 'STOCK', 'VENDA'])]
        # log_record('4 - Seleção de apenas Viaturas de RAC, Stock e Venda - Contagem de Chassis únicos: {} com o seguinte número de linhas: {}'.format(df_sales['Chassis_Number'].nunique(), df_sales.shape[0]), options_file.project_id)
        df_sales = df_sales[~df_sales['Dispatch_Type_Code'].isin(['AMBULÂNCIA', 'TAXI', 'PSP'])]
        log_record('5 - Remoção de Viaturas Especiais (Ambulâncias, Táxis, PSP) - Contagem de Chassis únicos: {} com o seguinte número de linhas: {}'.format(df_sales['Chassis_Number'].nunique(), df_sales.shape[0]), options_file.project_id)
        df_sales = df_sales[df_sales['DaysInStock_Global'] >= 0]  # Filters rows where, for some odd reason, the days in stock are negative
        log_record('6 - Remoção de Viaturas com Dias em Stock Global negativos - Contagem de Chassis únicos: {} com o seguinte número de linhas: {}'.format(df_sales['Chassis_Number'].nunique(), df_sales.shape[0]), options_file.project_id)
        df_sales = df_sales[df_sales['Registration_Number'] != 'G.FORCE']  # Filters rows with an invalid registration number (G.FORCE)
        log_record('7 - Remoção de Viaturas com Matrículas Inválidas (G.Force) - Contagem de Chassis únicos: {} com o seguinte número de linhas: {}'.format(df_sales['Chassis_Number'].nunique(), df_sales.shape[0]), options_file.project_id)
        # df_sales = df_sales[df_sales['Customer_Group_Code'].notnull()]  # Filters rows where there is no client information
        # log_record('8 - Remoção de Viaturas sem informação de cliente - Contagem de Chassis únicos: {} com o seguinte número de linhas: {}'.format(df_sales['Chassis_Number'].nunique(), df_sales.shape[0]), options_file.project_id)
        df_sales = df_sales[df_sales['DaysInStock_Distributor'].notnull()]
        log_record('9 - Remoção de Viaturas sem informação de Dias em Stock - Distribuidor - Contagem de Chassis únicos: {} com o seguinte número de linhas: {}'.format(df_sales['Chassis_Number'].nunique(), df_sales.shape[0]), options_file.project_id)
        df_sales = df_sales[df_sales['DaysInStock_Dealer'].notnull()]
        log_record('10 - Remoção de Viaturas sem informação de Dias em Stock - Dealer - Contagem de Chassis únicos: {} com o seguinte número de linhas: {}'.format(df_sales['Chassis_Number'].nunique(), df_sales.shape[0]), options_file.project_id)
        df_sales = df_sales[df_sales['PT_PDB_Model_Desc'] != 'não definido']
        log_record('11 - Remoção de Viaturas sem informação de Modelo na PDB - Contagem de Chassis únicos: {} com o seguinte número de linhas: {}'.format(df_sales['Chassis_Number'].nunique(), df_sales.shape[0]), options_file.project_id)

        df_sales = new_features(df_sales, configuration_parameters_cols, options_file.project_id)

        # Specific Measures Calculation
        df_sales = measures_calculation_hyundai(df_sales)

        # Fill values
        df_sales['Total_Discount_%'] = df_sales['Total_Discount_%'].replace([np.inf, np.nan, -np.inf], 0)  # Is this correct? This is caused by Total Sales = 0
        df_sales['Fixed_Margin_I_%'] = df_sales['Fixed_Margin_I_%'].replace([np.inf, np.nan, -np.inf], 0)  # Is this correct? This is caused by Total Net Sales = 0

        df_sales = lowercase_column_conversion(df_sales, configuration_parameters_cols)  # Lowercases the strings of these columns
        # df_sales = parameter_processing_hyundai(df_sales, options_file, configuration_parameters_cols)

        translation_dictionaries = [options_file.transmission_translation, options_file.ext_color_translation, options_file.int_color_translation]
        # grouping_dictionaries = [options_file.motor_grouping, options_file.transmission_grouping, options_file.version_grouping, options_file.ext_color_grouping, options_file.int_color_grouping]

        # Parameter Translation
        # df_sales = col_group(df_sales, [x for x in configuration_parameters_cols if 'Model' not in x], translation_dictionaries, options_file.project_id)
        df_sales = col_group(df_sales, ['PT_PDB_Transmission_Type_Desc', 'PT_PDB_Exterior_Color_Desc', 'PT_PDB_Interior_Color_Desc'], translation_dictionaries, options_file.project_id)

        df_sales = df_sales[df_sales['PT_PDB_Version_Desc'] != 'NÃO_PARAMETRIZADOS']
        log_record('9 - Remoção de Viaturas sem versão parametrizada - Contagem de Chassis únicos: {} com o seguinte número de linhas: {}'.format(df_sales['Chassis_Number'].nunique(), df_sales.shape[0]), options_file.project_id)

        project_units_count_checkup(df_sales, 'Chassis_Number', options_file, sql_check=1)

        # Parameter Grouping
        print('### NO GROUPING ###')
        # df_sales = col_group(df_sales, [x for x in configuration_parameters_cols if 'Model' not in x], grouping_dictionaries, options_file.project_id)

        log_record('Contagem de VehicleData_Code únicos: {}'.format(df_sales['VehicleData_Code'].nunique()), options_file.project_id)
        df_sales_grouped_conf_cols = df_sales.groupby(configuration_parameters_cols)
        log_record('Contagem de Configurações: {}'.format(len(df_sales_grouped_conf_cols)), options_file.project_id)

        # New VehicleData_Code Creation
        df_sales['ML_VehicleData_Code'] = df_sales.groupby(configuration_parameters_cols).ngroup()
        # df_sales.to_csv('dbs/df_hyundai_dataset_all_info_{}.csv'.format(current_date))

    log_record('Fim Secção B.', options_file.project_id)
    performance_info_append(time.time(), 'Section_B_End')
    return df_sales
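
# The ML_VehicleData_Code created above assigns one integer per unique configuration
# via groupby(...).ngroup(). The function below is a self-contained toy example of that
# idea; the column values and the helper name _ml_vehicle_code_example are made up.
def _ml_vehicle_code_example():
    """Illustrative only: each distinct (model, engine) pair receives its own group id."""
    toy = pd.DataFrame({'PT_PDB_Model_Desc': ['i20', 'i20', 'i30'],
                        'PT_PDB_Engine_Desc': ['1.0', '1.0', '1.6']})
    toy['ML_VehicleData_Code'] = toy.groupby(['PT_PDB_Model_Desc', 'PT_PDB_Engine_Desc']).ngroup()
    return toy  # Rows 0 and 1 share code 0; row 2 gets code 1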