Example #1
def solution_saving(df, sel_model, client_lvl_cols_in, client_lvl_sels):
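    # Removes the solution previously stored for this model / client-level
    # selection, then writes the new one. A '-' in client_lvl_sels means "no
    # filter" and is skipped when building the extra WHERE conditions.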
    truncate_query_part_2 = ' '.join([
        'and {} = \'{}\''.format(x, y)
        for x, y in zip(client_lvl_cols_in, client_lvl_sels) if y != '-'
    ])

    df = client_replacement(
        df, client_lvl_cols_in, client_lvl_sels
    )  # Replaces the values of Client's Levels by the actual values selected for this solution

    level_1_e_deployment.sql_truncate(
        options_file.DSN_SRV3_PRD,
        options_file,
        options_file.sql_info['database_source'],
        options_file.sql_info['optimization_solution_table'],
        query=truncate_query.format(sel_model) + truncate_query_part_2)

    level_1_e_deployment.sql_inject(
        df,
        options_file.DSN_SRV3_PRD,
        options_file.sql_info['database_source'],
        options_file.sql_info['optimization_solution_table'],
        options_file,
        configuration_parameters + client_lvl_cols_in +
        ['Quantity', 'Average_Score_Euros', 'ML_VehicleData_Code'],
        check_date=1)

    st.write('Sugestão gravada com sucesso.')
    return
Example #2
def deployment(df, db, view):
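    # Tags the rows with this project's NLR code, renames the columns to
    # their SQL names, deletes the rows previously stored for that NLR_Code
    # and injects the fresh results.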
    performance_info_append(time.time(), 'Section_E_Start')
    log_record('Início Secção E...', project_id)

    if df is not None:
        df['NLR_Code'] = level_2_optionals_cdsu_options.nlr_code
        # df = column_rename(df, list(level_2_optionals_cdsu_options.column_sql_renaming.keys()), list(level_2_optionals_cdsu_options.column_sql_renaming.values()))
        df = df.rename(
            columns=level_2_optionals_cdsu_options.column_sql_renaming)
        control_prints(df, 'before deployment, after renaming', head=1)
        sql_delete(
            level_2_optionals_cdsu_options.DSN_MLG_PRD, db, view,
            level_2_optionals_cdsu_options,
            {'NLR_Code': '{}'.format(level_2_optionals_cdsu_options.nlr_code)})
        sql_inject(df,
                   level_2_optionals_cdsu_options.DSN_MLG_PRD,
                   db,
                   view,
                   level_2_optionals_cdsu_options,
                   list(level_2_optionals_cdsu_options.
                        column_checkpoint_sql_renaming.values()),
                   check_date=1)

    log_record('Fim Secção E.', project_id)
    performance_info_append(time.time(), 'Section_E_End')
    return
Example #3
def sql_upload(df, db, view):
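    # Adds a row-totals column and turns the index into a regular 'Actual'
    # column, then injects every column of the cross-tab (apparently the
    # confusion matrices uploaded in Example #7), truncating the target
    # table first.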
    df['Totals'] = df.sum(axis=1)
    df.index.rename('Actual', inplace=True)
    df.reset_index(inplace=True)

    sql_inject(df, options_file.DSN_MLG_PRD, db, view, options_file, list(df), truncate=1, check_date=1)

    return
Example #4
def update_family(df, new_family_classification, df_product_group):
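    # Stores a manual family reclassification: the previous product group is
    # kept as 'Old_Product_Group_DW' and the newly selected family code is
    # written to 'New_Product_Group_DW'.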
    new_family_classification_code = family_code_convertion(new_family_classification, df_product_group)

    df['New_Product_Group_DW'] = new_family_classification_code
    df.rename(columns={'Product_Group_DW': 'Old_Product_Group_DW'}, inplace=True)
    level_1_e_deployment.sql_inject(df, options_file.DSN_MLG_PRD, options_file.sql_info['database_final'], options_file.sql_info['parts_classification_refs'], options_file, ['Part_Ref', 'Part_Description', 'Part_Cost', 'Part_PVP', 'Client_ID', 'Old_Product_Group_DW', 'New_Product_Group_DW', 'Classification', 'Classification_Prob'], check_date=1)

    st.write('Famílias das referências selecionadas alteradas com sucesso.')

    return
Example #5
def deployment(df):
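    # Converts NaN values to None so they are stored as SQL NULLs, then loads
    # the labelled requests into the final table, truncating it first.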
    performance_info_append(time.time(), 'Section_E_Start')
    log_record('Início Secção E...', options_file.project_id)
    df = df.astype(object).where(pd.notnull(df), None)

    sql_inject(df, options_file.DSN_SRV3_PRD, options_file.sql_info['database_source'], options_file.sql_info['final_table'], options_file, ['Request_Num', 'StemmedDescription', 'Description', 'Language', 'Open_Date', 'Label', 'Classification_Flag'], truncate=1)

    log_record('Fim Secção E.', options_file.project_id)
    performance_info_append(time.time(), 'Section_E_End')

    return
Example #6
def model_performance_saving(df, options_file):
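    # Appends the per-algorithm performance metrics to the central
    # performance-tracking table.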

    level_1_e_deployment.sql_inject(
        df,
        level_0_performance_report.DSN_MLG_PRD,
        level_0_performance_report.performance_sql_info['DB'],
        level_0_performance_report.performance_sql_info['performance_algorithm_results'],
        options_file,
        list(df),
        check_date=1)

    return
Example #7
def deployment(df, main_families_cm, other_families_cm):
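    # Publishes the parts classification results: uploads both confusion
    # matrices, normalizes column names and types, drops rows without a
    # classification, injects the final df into the MLG and BI databases and
    # triggers a stored procedure run.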
    sql_upload(main_families_cm, options_file.sql_info['database_final'], options_file.sql_info['matrix_lvl_1'])
    sql_upload(other_families_cm, options_file.sql_info['database_final'], options_file.sql_info['matrix_lvl_2'])

    df.rename(columns={'Client_Id': 'Client_ID', 'Part_Desc_concat': 'Part_Description', 'Average_Cost_avg': 'Part_Cost', 'PVP_1_avg': 'Part_PVP', 'prediction': 'Classification', 'Max_Prob': 'Classification_Prob'}, inplace=True)
    df['Classification_Flag'] = 0
    df['Classification_Prob'] = df['Classification_Prob'].round(2)
    df['Part_Cost'] = df['Part_Cost'].round(2)
    df['Part_PVP'] = df['Part_PVP'].round(2)
    df = df.astype({'Part_Ref': 'str', 'Client_ID': 'str', 'Part_Cost': 'str', 'Part_PVP': 'str', 'Classification_Prob': 'str'})
    df['Part_Description'] = df['Part_Description'].fillna("")
    df.dropna(subset=['Classification'], axis=0, inplace=True)
    sql_inject(df, options_file.DSN_MLG_PRD, options_file.sql_info['database_final'], options_file.sql_info['parts_classification_table'], options_file, columns=['Part_Ref', 'Part_Description', 'Part_Cost', 'Part_PVP', 'Client_ID', 'Product_Group_DW', 'Classification', 'Classification_Prob', 'Classification_Flag'], truncate=1, check_date=1)
    sql_inject(df, options_file.DSN_SRV3_PRD, options_file.sql_info['database_BI_GSC'], options_file.sql_info['parts_classification_table'], options_file, columns=['Part_Ref', 'Part_Description', 'Part_Cost', 'Part_PVP', 'Client_ID', 'Product_Group_DW', 'Classification', 'Classification_Prob', 'Classification_Flag'], truncate=1, check_date=1)
    sql_sp_run(options_file.DSN_SRV3_PRD, options_file.sql_info['database_BI_GSC'], options_file)
    return
Example #8
def deployment(df, db, view):
    performance_info_append(time.time(), 'Section_E_Start')
    log_record('Início Secção E...', project_id)

    if df is not None:
        # Cast every column to string before the upload.
        for col in list(df):
            df[col] = df[col].astype(str)

        df['NLR_Code'] = level_2_optionals_baviera_options.nlr_code
        df = column_rename(
            df,
            list(level_2_optionals_baviera_options.column_sql_renaming.keys()),
            list(level_2_optionals_baviera_options.column_sql_renaming.values()))

        # The full column set is uploaded when the models were retrained;
        # otherwise only the temporary column set is used.
        if model_training_check:
            sql_columns = level_2_optionals_baviera_options.columns_for_sql
        else:
            sql_columns = level_2_optionals_baviera_options.columns_for_sql_temp

        sql_delete(
            level_2_optionals_baviera_options.DSN_MLG_PRD, db, view,
            level_2_optionals_baviera_options,
            {'NLR_Code': '{}'.format(level_2_optionals_baviera_options.nlr_code)})
        sql_inject(df,
                   level_2_optionals_baviera_options.DSN_MLG_PRD,
                   db,
                   view,
                   level_2_optionals_baviera_options,
                   sql_columns,
                   check_date=1)

    log_record('Fim Secção E.', project_id)
    performance_info_append(time.time(), 'Section_E_End')
    return
Example #9
def deployment(df, db, view):
    performance_info_append(time.time(), 'Section_E_Start')
    log_record('Início Secção E...', options_file.project_id)

    if df is not None:
        sel_df = df.loc[:, options_file.sql_columns_vhe_fact_bi].copy()

        # Replace NaT/NaN in the date columns by None so they are stored as SQL NULLs.
        date_cols = ['NLR_Posting_Date', 'SLR_Document_Date_CHS', 'SLR_Document_Date_RGN',
                     'Ship_Arrival_Date', 'Registration_Request_Date', 'Registration_Date',
                     'PDB_Start_Order_Date', 'PDB_End_Order_Date']
        for col in date_cols:
            sel_df[col] = sel_df[col].astype(object).where(sel_df[col].notnull(), None)

        sel_df['Fixed_Margin_II'] = sel_df['Fixed_Margin_II'].round(2)
        sel_df = sel_df.where(sel_df.notnull(), None)
        sel_df.rename(columns={'prev_sales_check': 'Previous_Sales_Flag',
                               'number_prev_sales': 'Previous_Sales_Count'},
                      inplace=True)

        sql_inject(sel_df,
                   options_file.DSN_SRV3_PRD,
                   db,
                   view,
                   options_file,
                   list(sel_df),
                   truncate=1,
                   check_date=1)

    log_record('Fim Secção E.', options_file.project_id)
    performance_info_append(time.time(), 'Section_E_End')
Example #10
def solution_saving(df_solution, group_name, group_name_original):
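    # Clears the solution previously saved for this group, stores the new one
    # and confirms the save in the Streamlit UI.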

    level_1_e_deployment.sql_truncate(
        options_file.DSN_MLG_PRD,
        options_file,
        options_file.sql_info['database_final'],
        options_file.sql_info['optimization_solution_table'],
        query=truncate_query.format(
            options_file.sql_info['optimization_solution_table'], group_name))

    level_1_e_deployment.sql_inject(
        df_solution,
        options_file.DSN_MLG_PRD,
        options_file.sql_info['database_final'],
        options_file.sql_info['optimization_solution_table'],
        options_file,
        list(df_solution[options_file.columns_sql_solver_solution]),
        check_date=1)

    st.write('Sugestão gravada com sucesso - {}'.format(group_name_original))
    return
Example #11
def deployment(df_solver, df_part_ref_ta, pse_code):
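    # Saves the solver output and the part-reference/TA mapping for a given
    # PSE code: each target table is truncated for that PSE code only before
    # the corresponding df is injected.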
    performance_info_append(time.time(), 'Section_E_Start')
    log_record('Início Secção E...', options_file.project_id)

    df_solver = column_rename(df_solver, list(options_file.column_sql_renaming.keys()), list(options_file.column_sql_renaming.values()))
    df_solver = df_solver.dropna(subset=[options_file.column_sql_renaming['Group']])
    df_solver['Cost'] = pd.to_numeric(df_solver['Cost'], errors='coerce')
    df_solver.dropna(axis=0, subset=['Cost'], inplace=True)

    df_part_ref_ta = column_rename(df_part_ref_ta, ['Group'], [options_file.column_sql_renaming['Group']])

    sql_truncate(options_file.DSN_MLG_DEV, options_file, options_file.sql_info['database_final'], options_file.sql_info['final_table'], query=options_file.truncate_table_query.format(options_file.sql_info['final_table'], pse_code))
    sql_inject(df_solver, options_file.DSN_MLG_DEV, options_file.sql_info['database_final'], options_file.sql_info['final_table'], options_file, columns=list(options_file.column_sql_renaming.values()), check_date=1)

    sql_truncate(options_file.DSN_MLG_DEV, options_file, options_file.sql_info['database_final'], options_file.sql_info['ta_table'], query=options_file.truncate_table_query.format(options_file.sql_info['ta_table'], pse_code))
    df_part_ref_ta.dropna(subset=['Part_Ref_Group_Desc'], inplace=True)
    sql_inject(df_part_ref_ta, options_file.DSN_MLG_DEV, options_file.sql_info['database_final'], options_file.sql_info['ta_table'], options_file, columns=list(df_part_ref_ta), check_date=1)

    log_record('Fim Secção E.', options_file.project_id)
    performance_info_append(time.time(), 'Section_E_End')
    return
Example #12
def save_classification_rule(df_product_group, text, text_option, sel_family_sel_overwrite, sel_cost_max, max_cost, sel_cost_min, min_cost, sel_pvp_max, max_pvp, sel_pvp_min, min_pvp):
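    # Builds a single-row df describing a new classification rule (matching
    # rule, word, target family and optional cost/PVP bounds), stamps it with
    # today's date and appends it to the rules table.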
    family_code = family_code_convertion(sel_family_sel_overwrite, df_product_group)
    time_tag, _ = level_1_e_deployment.time_tags(format_date="%Y%m%d")

    # st.write(text, text_option, family_code, sel_cost_max, max_cost, sel_cost_min, min_cost, sel_pvp_max, max_pvp, sel_pvp_min, min_pvp, time_tag)

    df_rules = pd.DataFrame()
    df_rules['Matching_Rule'] = [text_option]
    df_rules['Word'] = text
    df_rules['Product_Group_DW'] = family_code
    df_rules['Sel_Max_Cost'] = sel_cost_max
    df_rules['Max_Cost'] = max_cost
    df_rules['Sel_Min_Cost'] = sel_cost_min
    df_rules['Min_Cost'] = min_cost
    df_rules['Sel_Max_PVP'] = sel_pvp_max
    df_rules['Max_PVP'] = max_pvp
    df_rules['Sel_Min_PVP'] = sel_pvp_min
    df_rules['Min_PVP'] = min_pvp
    df_rules['Date'] = time_tag

    level_1_e_deployment.sql_inject(df_rules, options_file.DSN_MLG_PRD, options_file.sql_info['database_final'], options_file.sql_info['parts_classification_rules'], options_file, columns=list(df_rules))
    return
Example #13
def model_choice_upload(flag, name, value, options_file):
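    # Records which model was chosen (or why none was) in the model-choices
    # table and returns the user-facing message matching the given flag.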
    # Use a list (not a set) so the column order is deterministic.
    df_model_result = pd.DataFrame(columns=['Model_Choice_Flag', 'Chosen_Model', 'Metric', 'Value', 'Message'])
    message = None

    df_model_result['Model_Choice_Flag'] = [flag]
    df_model_result['Project_Id'] = [options_file.project_id]
    if not flag:
        message = 'Nenhum dos modelos treinados atinge os valores mínimos definidos.'
        df_model_result['Chosen_Model'] = [0]
        df_model_result['Metric'] = [0]
        df_model_result['Value'] = [0]
    elif flag:
        if flag == 1:
            message = 'Modelo anterior com melhor performance do que o atual.'
        elif flag == 2:
            message = 'Modelo anterior substituído pelo atual.'
        elif flag == 3:
            message = 'Modelo anterior substituído pelo atual, com pequenas variações de performance.'
        elif flag == 4:
            message = 'Novo modelo com performance igual ao anterior.'
        df_model_result['Chosen_Model'] = [name]
        df_model_result['Metric'] = [options_file.metric]
        df_model_result['Value'] = [value]
    df_model_result['Message'] = [message]
    level_1_e_deployment.sql_inject(
        df_model_result,
        options_file.DSN_MLG_PRD,
        level_0_performance_report.performance_sql_info['DB'],
        level_0_performance_report.performance_sql_info['model_choices'],
        options_file,
        list(df_model_result),
        check_date=1)

    return message
Example #14
def performance_evaluation_classification(models, best_models, running_times,
                                          datasets, options_file, project_id):
    """
    models -> list of model names
    best_models -> dict mapping each model name to the best classifier found by grid search
    running_times -> dict mapping each model name to its training time
    datasets -> dict with the required datasets: train_x, test_x, train_y, test_y
    """

    results_train, results_test = [], []
    predictions = {}
    feat_importance = pd.DataFrame(index=list(datasets['train_x']), columns=['Importance'])
    for model in models:
        prediction_train = best_models[model].predict(datasets['train_x'])
        prediction_test = best_models[model].predict(datasets['test_x'])
        evaluation_training = ClassificationEvaluation(
            groundtruth=datasets['train_y'], prediction=prediction_train)
        evaluation_test = ClassificationEvaluation(
            groundtruth=datasets['test_y'], prediction=prediction_test)
        predictions[model] = [
            prediction_train.astype(int, copy=False),
            prediction_test.astype(int, copy=False)
        ]

        try:
            feat_importance['Importance'] = best_models[model].feature_importances_
            feat_importance.sort_values(by='Importance', ascending=False, inplace=True)
            feat_importance.to_csv(base_path + '/output/feature_importance_' + str(model) + '.csv')
        except AttributeError:
            pass

        row_train = {
            'Micro_F1': getattr(evaluation_training, 'micro'),
            'Average_F1': getattr(evaluation_training, 'average'),
            'Macro_F1': getattr(evaluation_training, 'macro'),
            'Accuracy': getattr(evaluation_training, 'accuracy'),
            'ROC_Curve': getattr(evaluation_training, 'roc_auc_curve'),
            'Precision_Class_' + str(best_models[model].classes_[0]): getattr(evaluation_training, 'precision')[0],
            'Precision_Class_' + str(best_models[model].classes_[1]): getattr(evaluation_training, 'precision')[1],
            'Recall_Class_' + str(best_models[model].classes_[0]): getattr(evaluation_training, 'recall')[0],
            'Recall_Class_' + str(best_models[model].classes_[1]): getattr(evaluation_training, 'recall')[1],
            'Running_Time': running_times[model]
        }

        row_test = {
            'Micro_F1': getattr(evaluation_test, 'micro'),
            'Average_F1': getattr(evaluation_test, 'average'),
            'Macro_F1': getattr(evaluation_test, 'macro'),
            'Accuracy': getattr(evaluation_test, 'accuracy'),
            'ROC_Curve': getattr(evaluation_test, 'roc_auc_curve'),
            'Precision_Class_' + str(best_models[model].classes_[0]): getattr(evaluation_test, 'precision')[0],
            'Precision_Class_' + str(best_models[model].classes_[1]): getattr(evaluation_test, 'precision')[1],
            'Recall_Class_' + str(best_models[model].classes_[0]): getattr(evaluation_test, 'recall')[0],
            'Recall_Class_' + str(best_models[model].classes_[1]): getattr(evaluation_test, 'recall')[1],
            'Running_Time': running_times[model]
        }

        results_train.append(row_train)
        results_test.append(row_test)

    df_results_train = pd.DataFrame(results_train, index=models)
    df_results_train['Algorithms'] = df_results_train.index
    df_results_train['Dataset'] = ['Train'] * df_results_train.shape[0]
    df_results_train['Project_Id'] = [project_id] * df_results_train.shape[0]
    df_results_test = pd.DataFrame(results_test, index=models)
    df_results_test['Algorithms'] = df_results_test.index
    df_results_test['Dataset'] = ['Test'] * df_results_test.shape[0]
    df_results_test['Project_Id'] = [project_id] * df_results_test.shape[0]

    metric_bar_plot(df_results_train,
                    'project_{}_train_dataset'.format(project_id))
    metric_bar_plot(df_results_test,
                    'project_{}_test_dataset'.format(project_id))

    model_performance_saving(pd.concat([df_results_train, df_results_test]),
                             options_file)
    level_1_e_deployment.sql_inject(
        pd.concat([df_results_train, df_results_test]),
        level_0_performance_report.performance_sql_info['DSN'],
        level_0_performance_report.performance_sql_info['DB'],
        level_0_performance_report.performance_sql_info['performance_algorithm_results'],
        options_file,
        list(df_results_train),
        check_date=1)

    return df_results_train, df_results_test, predictions
Example #15
def feature_contribution(df, configuration_parameters, col_to_group_by,
                         options_file, project_id):
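    # For each model, measures how much each configuration parameter shifts
    # the share of class-1 rows (score_class_gt == 1): for every parameter
    # value it compares P(class 1 | value) with P(class 1 | other values),
    # normalizes the differences to [-1, 1] and uploads the result.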
    configuration_parameters.remove(col_to_group_by)

    boolean_parameters = [
        x for x in configuration_parameters
        if list(df[x].unique()) == [0, 1] or list(df[x].unique()) == [1, 0]
    ]
    non_boolean_parameters = [
        x for x in configuration_parameters if x not in boolean_parameters
    ]
    df_feature_contribution_total = pd.DataFrame()

    for model in df[col_to_group_by].unique():
        df_model = df.loc[df[col_to_group_by] == model, :]

        class_1 = df_model.loc[df_model['score_class_gt'] == 1, :]
        class_0 = df_model.loc[df_model['score_class_gt'] == 0, :]
        differences_boolean, differences_non_boolean, features_boolean, features_non_boolean = [], [], [], []
        differences_feature, features, model_tag = [], [], []

        for feature in configuration_parameters:
            if feature in boolean_parameters:
                c1_f1 = class_1.loc[class_1[feature] == 1, :].shape[0]
                c1_f0 = class_1.loc[class_1[feature] == 0, :].shape[0]
                c0_f1 = class_0.loc[class_0[feature] == 1, :].shape[0]
                c0_f0 = class_0.loc[class_0[feature] == 0, :].shape[0]
                f1 = c1_f1 + c0_f1
                f0 = c1_f0 + c0_f0

                try:
                    p_c1_f1 = c1_f1 / f1 * 1.
                    p_c1_f0 = c1_f0 / f0 * 1.
                    differences_boolean.append(p_c1_f1 - p_c1_f0)
                    features_boolean.append(feature + '_sim')
                except ZeroDivisionError:
                    continue

            elif feature in non_boolean_parameters:
                for value in df_model[feature].unique():
                    if value == 'outros':
                        continue

                    c1_f1 = class_1.loc[class_1[feature] == value, :].shape[0]
                    c1_f0 = class_1.loc[class_1[feature] != value, :].shape[0]
                    c0_f1 = class_0.loc[class_0[feature] == value, :].shape[0]
                    c0_f0 = class_0.loc[class_0[feature] != value, :].shape[0]

                    # ToDo: There might be cases where only one value for a feature is available if the df is too small (only Preto as Cor_Interior, e.g.). I should add a try/except to catch these for the conditions where feature != value

                    f1 = c1_f1 + c0_f1
                    f0 = c1_f0 + c0_f0

                    try:
                        p_c1_f1 = c1_f1 / f1 * 1.
                        p_c1_f0 = c1_f0 / f0 * 1.
                    except ZeroDivisionError:
                        # log_record('Insufficient data for feature ' + str(feature) + ' and value ' + str(value) + '.', project_id, flag=1)
                        level_0_performance_report.log_record(
                            'Dados insuficientes para a feature {} com valor {}.'
                            .format(feature, value),
                            project_id,
                            flag=1)

                        continue

                    differences_non_boolean.append(p_c1_f1 - p_c1_f0)
                    features_non_boolean.append(feature + '_' + value)

        differences_feature.extend(differences_boolean)
        differences_feature.extend(differences_non_boolean)
        features.extend(features_boolean)
        features.extend(features_non_boolean)
        model_tag.extend(
            [model] *
            (len(differences_boolean) + len(differences_non_boolean)))

        df_feature_contribution = pd.DataFrame()
        df_feature_contribution['Features'] = features
        df_feature_contribution['Differences'] = differences_feature
        df_feature_contribution['Model_Code'] = model_tag

        # Normalize the differences into [-1, 1], keeping the scale symmetric around zero.
        if abs(df_feature_contribution['Differences'].min()) > df_feature_contribution['Differences'].max():
            max_range_value = abs(df_feature_contribution['Differences'].min())
            min_range_value = df_feature_contribution['Differences'].min()
        else:
            max_range_value = df_feature_contribution['Differences'].max()
            min_range_value = df_feature_contribution['Differences'].max() * -1
        df_feature_contribution['Differences_Normalized'] = 2 * df_feature_contribution['Differences'] / (max_range_value - min_range_value)

        df_feature_contribution_total = pd.concat(
            [df_feature_contribution_total, df_feature_contribution])

    level_1_e_deployment.sql_inject(
        df_feature_contribution_total,
        options_file.DSN_MLG_PRD,
        options_file.sql_info['database_final'],
        options_file.sql_info['feature_contribution'],
        options_file,
        list(df_feature_contribution_total),
        truncate=1)
Example #16
def performance_evaluation_regression(models, best_models, running_times,
                                      datasets, datasets_non_ohe, options_file,
                                      project_id):
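    # Regression counterpart of performance_evaluation_classification: scores
    # each model on train and test (R2, MSE, RMSE), keeps the predictions and
    # uploads the metrics. 'lgb' is evaluated on the non-one-hot-encoded
    # datasets.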

    results_train, results_test = [], []
    predictions = {}
    feat_importance = pd.DataFrame(index=list(datasets['train_x']), columns=['Importance'])
    for model in models:
        if model == 'lgb':
            train_x, test_x = datasets_non_ohe['train_x'], datasets_non_ohe['test_x']
            train_y, test_y = datasets_non_ohe['train_y'], datasets_non_ohe['test_y']
        else:
            train_x, test_x = datasets['train_x'], datasets['test_x']
            train_y, test_y = datasets['train_y'], datasets['test_y']

        prediction_train = best_models[model].predict(train_x)
        prediction_test = best_models[model].predict(test_x)
        evaluation_training = RegressionEvaluation(groundtruth=train_y,
                                                   prediction=prediction_train)
        evaluation_test = RegressionEvaluation(groundtruth=test_y,
                                               prediction=prediction_test)
        predictions[model] = [
            prediction_train.astype(int, copy=False),
            prediction_test.astype(int, copy=False)
        ]

        # try:
        #     feat_importance['Importance'] = best_models[model].feature_importances_
        #     feat_importance.sort_values(by='Importance', ascending=False, inplace=True)
        #     feat_importance.to_csv(base_path + '/output/' + 'feature_importance_' + str(model) + '.csv')
        # except AttributeError:
        #     pass

        row_train = {
            'R2': getattr(evaluation_training, 'r2_score'),
            'MSE': getattr(evaluation_training, 'mse'),
            'RMSE': np.sqrt(getattr(evaluation_training, 'mse')),
            'Running_Time': running_times[model]
        }

        row_test = {
            'R2': getattr(evaluation_test, 'r2_score'),
            'MSE': getattr(evaluation_test, 'mse'),
            'RMSE': np.sqrt(getattr(evaluation_test, 'mse')),
            'Running_Time': running_times[model]
        }

        results_train.append(row_train)
        results_test.append(row_test)

    df_results_train = pd.DataFrame(results_train, index=models)
    df_results_train['Algorithms'] = df_results_train.index
    df_results_train['Dataset'] = ['Train'] * df_results_train.shape[0]
    df_results_train['Project_Id'] = [project_id] * df_results_train.shape[0]
    df_results_test = pd.DataFrame(results_test, index=models)
    df_results_test['Algorithms'] = df_results_test.index
    df_results_test['Dataset'] = ['Test'] * df_results_test.shape[0]
    df_results_test['Project_Id'] = [project_id] * df_results_test.shape[0]

    # metric_bar_plot(df_results_train, 'project_{}_train_dataset'.format(project_id))
    # metric_bar_plot(df_results_test, 'project_{}_test_dataset'.format(project_id))

    level_1_e_deployment.sql_inject(
        pd.concat([df_results_train, df_results_test]),
        level_0_performance_report.performance_sql_info['DSN'],
        level_0_performance_report.performance_sql_info['DB'],
        level_0_performance_report.performance_sql_info['performance_algorithm_results'],
        options_file,
        list(df_results_train),
        check_date=1)

    return df_results_train, df_results_test, predictions