def train_test_classifier():
		"""Fit a model with ``fdm.train_model_disp_mist`` and score it.

		Takes no arguments: the data splits, ``loss_function``, ``EPS``,
		``cons_params`` and ``sensitive_attrs`` are all resolved from the
		surrounding scope.

		Returns:
			A 3-tuple ``(w, test_score, s_attr_to_fp_fn_test)``: the fitted
			weights followed by the test-side score and the test-side
			``s_attr_to_fp_fn`` statistics from ``fdm.get_clf_stats``.
		"""
		w = fdm.train_model_disp_mist(
			x_train, y_train, x_control_train, loss_function, EPS, cons_params
		)

		clf_stats = fdm.get_clf_stats(
			w, x_train, y_train, x_control_train,
			x_test, y_test, x_control_test, sensitive_attrs,
		)

		# Only the test-side accuracy and FP/FN statistics are handed back,
		# because those are the values needed for plotting.
		_, test_score, _, _, _, s_attr_to_fp_fn_test = clf_stats
		return w, test_score, s_attr_to_fp_fn_test
    def train_test_classifier():
        """Fit a model with ``fdm.train_model_disp_mist`` and score it.

        Takes no arguments: the data splits, ``loss_function``, ``EPS``,
        ``cons_params`` and ``sensitive_attrs`` are all resolved from the
        surrounding scope.

        Returns:
            A 4-tuple ``(w, test_score, s_attr_to_fp_fn_test, cov_all_train)``.
        """
        w = fdm.train_model_disp_mist(
            x_train, y_train, x_control_train, loss_function, EPS, cons_params
        )

        clf_stats = fdm.get_clf_stats(
            w, x_train, y_train, x_control_train,
            x_test, y_test, x_control_test, sensitive_attrs,
        )
        _, test_score, cov_all_train, _, _, s_attr_to_fp_fn_test = clf_stats

        # Accuracy and FP/FN statistics come from the test split because they
        # are needed for plotting; the covariance comes from the train split
        # because it is needed for setting the thresholds.
        return w, test_score, s_attr_to_fp_fn_test, cov_all_train
示例#3
0
def _select_l1_feature_columns(df_train, candidate_cols):
    """Return the candidate columns kept by an L1-penalised logistic fit.

    Fits ``ScaledLogisticRegression(penalty="l1", C=0.1)`` on
    ``candidate_cols`` of ``df_train`` against the module-level ``label_col``
    and returns every candidate whose coefficient is non-zero, followed by
    ``'intercept'`` (appended unconditionally).
    """
    dsapp_lr = ScaledLogisticRegression(penalty="l1", C=0.1)
    dsapp_lr.fit(df_train[candidate_cols].values, df_train[label_col].values)

    keep_cols = [
        col for col, coef in zip(candidate_cols, dsapp_lr.coef_[0])
        if coef != 0
    ]
    return keep_cols + ['intercept']


def _fit_logreg_weights(x, y, train_config):
    """Minimise the mean logistic loss of ``x``/``y`` and return the weights.

    Reads ``max_iters``, ``max_iters_dccp``, ``random_seed``, ``tau``, ``mu``
    and ``EPS`` from ``train_config``. The problem is built with an empty
    constraint list and solved with ``method='dccp'`` on the ECOS solver.

    Side effect: reseeds numpy's global random state with
    ``train_config['random_seed']``.

    Returns:
        The solved weight vector as a flattened numpy array.
    """
    max_iters = train_config['max_iters']
    max_iters_dccp = train_config['max_iters_dccp']

    num_points, num_features = x.shape
    w = cvxpy.Variable(num_features)

    # Seed first so the random starting point is repeatable across runs.
    np.random.seed(train_config['random_seed'])
    w.value = np.random.rand(x.shape[1])

    constraints = []
    loss = cvxpy.sum(cvxpy.logistic(cvxpy.multiply(-y, x * w))) / num_points
    prob = cvxpy.Problem(cvxpy.Minimize(loss), constraints)

    tau = float(train_config['tau'])
    mu = float(train_config['mu'])
    # One tolerance value is used for every solver tolerance below.
    EPS = float(train_config['EPS'])

    prob.solve(method='dccp',
               tau=tau,
               mu=mu,
               tau_max=1e10,
               solver=cvxpy.ECOS,
               verbose=True,
               feastol=EPS,
               abstol=EPS,
               reltol=EPS,
               feastol_inacc=EPS,
               abstol_inacc=EPS,
               reltol_inacc=EPS,
               max_iters=max_iters,
               max_iter=max_iters_dccp)

    return np.array(w.value).flatten()


def run_for_all_train_test_matrices(file_path, train_config,
                                    train_test_matrices, entity_to_attrib,
                                    out_path, test_results_table,
                                    test_results_schema):
    """Train and evaluate one model per train/test matrix pair.

    First truncates ``triage_metadata.zafar_models``,
    ``triage_metadata.zafar_model_groups`` and the test-results table, then
    inserts a single model-group row with id 9999. For every matrix pair whose
    ``<test_matrix_uuid>_Scores`` file does not yet exist under ``out_path``
    it loads both matrices, pre-selects features with an L1 logistic fit,
    solves a logistic regression with cvxpy, saves the test predictions and
    writes the summary statistics.

    Relies on the module-level names ``conn``, ``demo_col`` and ``label_col``.

    Args:
        file_path: Forwarded to ``load_matrix`` for both matrices.
        train_config: Mapping with the keys ``max_iters``, ``max_iters_dccp``,
            ``random_seed``, ``tau``, ``mu`` and ``EPS``. It is also stored
            (wrapped in ``Json``) as the model group's hyperparameters and
            forwarded to ``save_predictions``.
        train_test_matrices: Sequence whose items expose the train matrix
            uuid at ``item[0][0]`` and the test matrix uuid at ``item[0][1]``.
        entity_to_attrib: Forwarded to ``load_matrix``.
        out_path: Directory checked for existing ``*_Scores`` files and
            forwarded to the save helpers.
        test_results_table: Name of the results table to truncate; also
            forwarded to ``save_predictions``.
        test_results_schema: Schema of ``test_results_table``.

    Returns:
        None.
    """
    print("TRUNCATING........")
    truncate_query = """
    truncate table triage_metadata.zafar_models;
    """
    conn.execute(truncate_query)

    truncate_query = """
    truncate table triage_metadata.zafar_model_groups;
    """
    conn.execute(truncate_query)

    # NOTE(review): schema/table names are interpolated straight into the SQL
    # text; this is only safe while both values come from trusted config.
    truncate_query = """
    truncate table %s.%s
    """ % (str(test_results_schema), str(test_results_table))
    conn.execute(truncate_query)

    model_group_id = 9999
    df_model_group_insert = pd.DataFrame({
        'model_group_id': [model_group_id],
        'model_type':
        'zafar_model_group',
        'hyperparameters': [Json(train_config)]
    })

    df_model_group_insert.to_sql('zafar_model_groups',
                                 conn,
                                 schema='triage_metadata',
                                 index=False,
                                 if_exists='append')

    for pair_idx, matrix_pair in enumerate(train_test_matrices):
        print("Doing for " + str(pair_idx))
        print("--" * 20)
        train_matrix_uuid = matrix_pair[0][0]
        test_matrix_uuid = matrix_pair[0][1]

        out_file = os.path.join(out_path, test_matrix_uuid + "_Scores")
        if os.path.exists(out_file):
            # Output for this test matrix is already on disk; skip the pair.
            continue

        df_train = load_matrix(file_path, train_matrix_uuid,
                               entity_to_attrib, demo_col, label_col)
        df_train['as_of_date'] = pd.to_datetime(df_train['as_of_date'])

        df_test = load_matrix(file_path, test_matrix_uuid,
                              entity_to_attrib, demo_col, label_col)
        df_test['as_of_date'] = pd.to_datetime(df_test['as_of_date'])

        # Kept so each saved prediction can be tied to its entity and date.
        test_as_of_dates = df_test['as_of_date'].values
        test_entity_ids = df_test['entity_id'].values

        # Constant column acting as the intercept term.
        df_train['intercept'] = 1
        df_test['intercept'] = 1

        # Identifier and label columns never enter the pre-selection fit.
        old_exclude_cols = ['entity_id', 'as_of_date', label_col]
        candidate_cols = [
            c for c in df_train.columns if c not in old_exclude_cols
        ]
        keep_cols = set(_select_l1_feature_columns(df_train, candidate_cols))

        # NOTE(review): train and test columns are each picked in their own
        # frame's order, so the learned weights line up with the test
        # features only if both matrices share a column order - confirm.
        x_train = df_train[[c for c in df_train.columns
                            if c in keep_cols]].values
        y_train = df_train[label_col].values
        x_control_train = {demo_col: df_train[demo_col].values}

        x_test = df_test[[c for c in df_test.columns
                          if c in keep_cols]].values
        y_test = df_test[label_col].values
        x_control_test = {demo_col: df_test[demo_col].values}

        ret_w = _fit_logreg_weights(x_train, y_train, train_config)
        sensitive_attrs = list(x_control_train.keys())

        # pd.value_counts() is deprecated; Series.value_counts() is the
        # supported spelling.
        print("INPUT TRAIN:" + str(pd.Series(y_train).value_counts()))
        print("INPUT TEST:" + str(pd.Series(y_test).value_counts()))

        train_score, test_score, cov_all_train, cov_all_test, s_attr_to_fp_fn_train, s_attr_to_fp_fn_test = fdm.get_clf_stats(
            ret_w, x_train, y_train, x_control_train, x_test, y_test,
            x_control_test, list(sensitive_attrs))

        distances_boundary_test = fdm.get_distance_boundary(
            ret_w, x_test, x_control_test[sensitive_attrs[0]])

        print("SAVING PREDICTIONS")
        save_predictions(distances_boundary_test, y_test, x_control_test,
                         sensitive_attrs, test_as_of_dates,
                         test_entity_ids, train_matrix_uuid,
                         test_matrix_uuid, train_config,
                         test_results_table, test_results_schema, out_path)

        # NOTE(review): the list size is hard-coded rather than configurable.
        k = 500
        all_class_labels_assigned_test = label_top_k(
            distances_boundary_test, k)

        prec_k = calc_prec(all_class_labels_assigned_test, y_test)
        print('prec@%s_abs: %.5f' % (k, prec_k))

        # Replace the test-side stats from get_clf_stats with ones computed
        # from the top-k labels.
        s_attr_to_fp_fn_test = fdm.get_fpr_fnr_sensitive_features(
            y_test, all_class_labels_assigned_test, x_control_test,
            sensitive_attrs, False)

        for s_attr, stats_by_value in s_attr_to_fp_fn_test.items():
            print("S_Attr=" + str(s_attr))
            for s_val, group_stats in stats_by_value.items():
                print("S_VAL=" + str(s_val))
                group_stats['recall'] = 1.000 - group_stats['fnr']

        # Read once, after every group has its recall filled in.
        # NOTE(review): the attribute name and the 0/1 group values are
        # hard-coded, so this only works when demo_col is 'median_income'.
        recall_under = s_attr_to_fp_fn_test['median_income'][0]['recall']
        recall_over = s_attr_to_fp_fn_test['median_income'][1]['recall']
        print('recall under55k: %.6f' % recall_under)
        print('recall over55k: %.6f' % recall_over)

        save_output_info(out_path, test_matrix_uuid, train_score,
                         test_score, cov_all_train, cov_all_test,
                         s_attr_to_fp_fn_train, s_attr_to_fp_fn_test,
                         recall_under, recall_over)

    return None