from aequitas.group import Group
from aequitas.preprocessing import preprocess_input_df


def aequitas_group(df, score_column, label_column, protected_class):
    # To measure bias towards protected_class, filter the DataFrame down
    # to the score, label (ground truth), and protected class columns.
    data_scored = df[[score_column, label_column, protected_class]]

    # Aequitas expects predictions under 'score' and ground truth under
    # 'label_value'.
    data_scored = data_scored.rename(
        columns={score_column: "score", label_column: "label_value"})

    # Process DataFrame
    data_scored_processed, _ = preprocess_input_df(data_scored)

    # Group metrics
    g = Group()
    xtab, _ = g.get_crosstabs(data_scored_processed)

    # Absolute metrics, such as 'tpr', 'tnr', 'precision', etc.
    absolute_metrics = g.list_absolute_metrics(xtab)

    # DataFrame of calculated absolute metrics for each sample population group
    absolute_metrics_df = xtab[
        ["attribute_name", "attribute_value"] + absolute_metrics].round(2)

    # For example:
    """
      attribute_name attribute_value   tpr   tnr  ...  precision
    0         gender          female  0.60  0.88  ...       0.75
    1         gender            male  0.49  0.90  ...       0.64
    """

    return absolute_metrics_df
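# A minimal usage sketch for aequitas_group above. The column names
# ("model_score", "approved", "gender") and the toy values are hypothetical,
# chosen only to illustrate the expected input shape.
import pandas as pd

toy = pd.DataFrame({
    "model_score": [1, 0, 1, 1, 0, 1],   # binary model predictions
    "approved":    [1, 0, 0, 1, 0, 1],   # ground-truth labels
    "gender":      ["female", "male", "female", "male", "male", "female"],
})
metrics_table = aequitas_group(toy, "model_score", "approved", "gender")
print(metrics_table)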
from aequitas.group import Group


def tabla_metrica_grupo(data):
    # Crosstab the Aequitas-ready DataFrame and keep only the absolute
    # metrics per group, rounded to two decimals.
    g = Group()
    xtab, _ = g.get_crosstabs(data)
    absolute_metrics = g.list_absolute_metrics(xtab)
    tabla_grupo = xtab[
        ['attribute_name', 'attribute_value'] + absolute_metrics].round(2)
    return tabla_grupo
import pandas as pd
from aequitas.bias import Bias
from aequitas.group import Group
from aequitas.preprocessing import preprocess_input_df


def get_bias_metrics(data):
    bias = Bias()
    group = Group()

    # Rename the model output and ground truth to the 'score' and
    # 'label_value' columns Aequitas expects.
    old_columns = ['predictions', 'loan_status', 'forty_plus_indicator']
    new_columns = ['score', 'label_value', 'forty_plus_indicator']
    scored_data = data.loc[:, old_columns]
    renamer = dict(zip(scored_data.columns, new_columns))
    scored_data = scored_data.rename(columns=renamer)

    data_processed, _ = preprocess_input_df(scored_data)
    xtab, _ = group.get_crosstabs(data_processed)

    # Absolute metrics per group.
    attribute_columns = ['attribute_name', 'attribute_value']
    absolute_metrics = group.list_absolute_metrics(xtab)
    absolute_metrics_df = xtab[attribute_columns + absolute_metrics].round(2)

    # Disparities relative to the 'Under Forty' reference group.
    bias_df = bias.get_disparity_predefined_groups(
        xtab,
        original_df=data_processed,
        ref_groups_dict={'forty_plus_indicator': 'Under Forty'},
        alpha=0.05,
        mask_significance=True
    )
    calculated_disparities = bias.list_disparities(bias_df)
    disparity_metrics_df = bias_df[attribute_columns + calculated_disparities]

    # Replace NaN with None so the records serialize cleanly to JSON.
    abs_metrics = absolute_metrics_df.where(
        pd.notnull(absolute_metrics_df), None).to_dict(orient='records')
    disp_metrics = disparity_metrics_df.where(
        pd.notnull(disparity_metrics_df), None).to_dict(orient='records')
    return dict(absolute_metrics=abs_metrics, disparity_metrics=disp_metrics)
import pandas as pd
from aequitas.bias import Bias
from aequitas.group import Group
from aequitas.preprocessing import preprocess_input_df


def metrics(data):
    data = pd.DataFrame(data)

    # To measure bias towards gender, filter the DataFrame to "score",
    # "label_value" (ground truth), and "gender" (protected attribute).
    data_scored = data[["score", "label_value", "gender"]]

    # Process DataFrame
    data_scored_processed, _ = preprocess_input_df(data_scored)

    # Group metrics
    g = Group()
    xtab, _ = g.get_crosstabs(data_scored_processed)

    # Absolute metrics, such as 'tpr', 'tnr', 'precision', etc.
    absolute_metrics = g.list_absolute_metrics(xtab)

    # DataFrame of calculated absolute metrics for each sample population group
    absolute_metrics_df = xtab[
        ['attribute_name', 'attribute_value'] + absolute_metrics].round(2)

    # For example:
    """
      attribute_name attribute_value   tpr   tnr  ...  precision
    0         gender          female  0.60  0.88  ...       0.75
    1         gender            male  0.49  0.90  ...       0.64
    """

    # Bias metrics
    b = Bias()

    # Disparities calculated in relation to gender, with "male" as the
    # reference group.
    bias_df = b.get_disparity_predefined_groups(
        xtab,
        original_df=data_scored_processed,
        ref_groups_dict={'gender': 'male'},
        alpha=0.05,
        mask_significance=True)

    # Disparity metrics added to the bias DataFrame
    calculated_disparities = b.list_disparities(bias_df)
    disparity_metrics_df = bias_df[
        ['attribute_name', 'attribute_value'] + calculated_disparities]

    # For example:
    """
      attribute_name attribute_value  ppr_disparity  precision_disparity
    0         gender          female       0.714286              1.41791
    1         gender            male       1.000000              1.000000
    """

    output_metrics_df = disparity_metrics_df  # or absolute_metrics_df

    # Output a JSON-serializable list of calculated metrics
    yield output_metrics_df.to_dict(orient="records")
from aequitas.group import Group


def group(df):
    """
    Function to print absolute and relative metrics by group.

    :param df: Aequitas-compliant dataframe.
    :return: None
    """
    g = Group()
    print("Group module")
    print("-" * 30)
    xtab, atts = g.get_crosstabs(df, attr_cols=["delegacion"])
    absolute_metrics = g.list_absolute_metrics(xtab)
    print(f"The group to analyze is: {atts}")
    print("Frequency counts by group:")
    print(xtab[[col for col in xtab.columns if col not in absolute_metrics]])
    print()
    print("Absolute metrics by group:")
    print(xtab[['attribute_name', 'attribute_value']
               + [col for col in xtab.columns if col in absolute_metrics]].round(2))
from aequitas.bias import Bias
from aequitas.group import Group
from aequitas.preprocessing import preprocess_input_df


def get_bias_metrics(data):
    # To measure bias towards gender, filter the DataFrame to "score",
    # "label_value" (ground truth), and "gender" (protected attribute).
    data_scored = data[["score", "label_value", "gender"]]

    # Process DataFrame
    data_scored_processed, _ = preprocess_input_df(data_scored)

    # Group metrics
    g = Group()
    xtab, _ = g.get_crosstabs(data_scored_processed)

    # Absolute metrics, such as 'tpr', 'tnr', 'precision', etc.
    absolute_metrics = g.list_absolute_metrics(xtab)

    # DataFrame of calculated absolute metrics for each sample population group
    absolute_metrics_df = xtab[
        ["attribute_name", "attribute_value"] + absolute_metrics].round(2)

    # Bias metrics
    b = Bias()

    # Disparities calculated in relation to gender, with "male" as the
    # reference group.
    bias_df = b.get_disparity_predefined_groups(
        xtab,
        original_df=data_scored_processed,
        ref_groups_dict={"gender": "male"},
        alpha=0.05,
        mask_significance=True,
    )

    # Disparity metrics added to the bias DataFrame
    calculated_disparities = b.list_disparities(bias_df)
    disparity_metrics_df = bias_df[
        ["attribute_name", "attribute_value"] + calculated_disparities]

    output_metrics_df = disparity_metrics_df  # or absolute_metrics_df

    # Output a JSON-serializable list of calculated metrics
    return output_metrics_df.to_dict(orient="records")
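# Hedged usage sketch for get_bias_metrics above: a toy scored DataFrame
# with the three columns the function expects. The values are illustrative
# only, not real data.
import pandas as pd

scored = pd.DataFrame({
    "score":       [1, 0, 1, 0, 1, 1],
    "label_value": [1, 0, 0, 0, 1, 1],
    "gender":      ["male", "female", "male", "female", "female", "male"],
})
records = get_bias_metrics(scored)  # list of dicts, one per group
print(records[0])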
from aequitas.group import Group


def group(df_aeq):
    """
    Args:
        df_aeq (DataFrame): DataFrame with the features over which we want
            to measure bias across the different groups.

    Returns:
        The full crosstab, the group counts, the rounded absolute metrics
        per group, and the list of absolute metric names.
    """
    g = Group()
    xtab, attrbs = g.get_crosstabs(df_aeq)
    absolute_metrics = g.list_absolute_metrics(xtab)
    conteos_grupo = xtab[[
        col for col in xtab.columns if col not in absolute_metrics
    ]]
    metricas_absolutas = xtab[
        ['attribute_name', 'attribute_value']
        + [col for col in xtab.columns if col in absolute_metrics]].round(2)
    return xtab, conteos_grupo, metricas_absolutas, absolute_metrics
from aequitas.bias import Bias
from aequitas.fairness import Fairness
from aequitas.group import Group


def fairness(df):
    """
    Runs the full fairness module.
    """
    print("Fairness module")
    print("-" * 30)
    f = Fairness()
    bias_ = Bias()
    g = Group()
    xtab, atts = g.get_crosstabs(df, attr_cols=["delegacion"])
    absolute_metrics = g.list_absolute_metrics(xtab)
    bdf = bias_.get_disparity_predefined_groups(
        xtab,
        original_df=df,
        ref_groups_dict={'delegacion': 'IZTAPALAPA'},
        alpha=0.05)
    fdf = f.get_group_value_fairness(bdf)
    parity_determinations = f.list_parities(fdf)
    print("Printing metrics table (frequency counts):")
    print(fdf[['attribute_name', 'attribute_value']
              + absolute_metrics
              + bias_.list_disparities(fdf)
              + parity_determinations].round(2))
    print("Printing overall metrics")
    gof = f.get_overall_fairness(fdf)
    print(gof)
    print("Aequitas analysis completed.")
    print("-" * 30)
from aequitas.bias import Bias
from aequitas.group import Group


def bias(df):
    """
    Function to print bias metrics.

    :param df: Aequitas-compliant dataframe.
    """
    print("Bias module")
    print("-" * 30)
    bias_ = Bias()
    g = Group()
    xtab, atts = g.get_crosstabs(df, attr_cols=["delegacion"])
    absolute_metrics = g.list_absolute_metrics(xtab)

    # Disparities relative to a predefined reference group.
    bdf = bias_.get_disparity_predefined_groups(
        xtab,
        original_df=df,
        ref_groups_dict={'delegacion': 'IZTAPALAPA'},
        alpha=0.05)
    print("Disparities:")
    print(bdf[['attribute_name', 'attribute_value']
              + bias_.list_disparities(bdf)].round(2))

    # Disparities relative to the group with the minimum value of each metric.
    print("Minority Analysis:")
    min_bdf = bias_.get_disparity_min_metric(xtab, original_df=df)
    print(min_bdf[['attribute_name', 'attribute_value']
                  + bias_.list_disparities(min_bdf)].round(2))

    # Disparities relative to the largest group.
    print("Majority Analysis:")
    majority_bdf = bias_.get_disparity_major_group(xtab, original_df=df)
    print(majority_bdf[['attribute_name', 'attribute_value']
                       + bias_.list_disparities(majority_bdf)].round(2))
import pandas as pd
from aequitas.bias import Bias
from aequitas.group import Group
from aequitas.preprocessing import preprocess_input_df


def get_bias_metrics(data):
    bias = Bias()
    group = Group()

    # Rename the model output and ground truth to the 'score' and
    # 'label_value' columns Aequitas expects.
    old_columns = ['predictions', 'loan_status', 'forty_plus_indicator']
    new_columns = ['score', 'label_value', 'forty_plus_indicator']
    scored_data = data.loc[:, old_columns]
    renamer = dict(zip(scored_data.columns, new_columns))
    scored_data = scored_data.rename(columns=renamer)

    data_processed, _ = preprocess_input_df(scored_data)
    xtab, _ = group.get_crosstabs(data_processed)

    attribute_columns = ['attribute_name', 'attribute_value']
    absolute_metrics = group.list_absolute_metrics(xtab)
    absolute_metrics_df = xtab[attribute_columns + absolute_metrics].round(2)

    bias_df = bias.get_disparity_predefined_groups(
        xtab,
        original_df=data_processed,
        ref_groups_dict={'forty_plus_indicator': 'Under Forty'},
        alpha=0.05,
        mask_significance=True)
    calculated_disparities = bias.list_disparities(bias_df)
    disparity_metrics_df = bias_df[attribute_columns + calculated_disparities]

    # Replace NaN with None so the records serialize cleanly to JSON.
    abs_metrics = absolute_metrics_df.where(
        pd.notnull(absolute_metrics_df), None).to_dict(orient='records')
    disp_metrics = disparity_metrics_df.where(
        pd.notnull(disparity_metrics_df), None).to_dict(orient='records')

    # NOTE: the pass/fail results below are hard-coded in this report;
    # disp_metrics[0] is assumed to be the 'Over Forty' group's record.
    return {
        "attributeAudited": "forty_plus_indicator",
        "referenceGroup": "Under Forty",
        "fairnessThreshold": "80%",
        "fairnessMeasures": [{
            "label": "Predicted Positive Group Rate Parity",
            "result": "Passed",
            "group": "Over Forty",
            "disparity": disp_metrics[0]['pprev_disparity']
        }, {
            "label": "Predicted Positive Rate Parity",
            "result": "Failed",
            "group": "Over Forty",
            "disparity": disp_metrics[0]['ppr_disparity']
        }, {
            "label": "Proportional Parity",
            "result": "Passed",
            "group": "Over Forty",
            "disparity": disp_metrics[0]['precision_disparity']
        }, {
            "label": "False Positive Rate Parity",
            "result": "Passed",
            "group": "Over Forty",
            "disparity": disp_metrics[0]['fpr_disparity']
        }, {
            "label": "False Discovery Rate Parity",
            "result": "Passed",
            "group": "Over Forty",
            "disparity": disp_metrics[0]['fdr_disparity']
        }, {
            "label": "False Negative Rate Parity",
            "result": "Passed",
            "group": "Over Forty",
            "disparity": disp_metrics[0]['fnr_disparity']
        }, {
            "label": "False Omission Rate Parity",
            "result": "Passed",
            "group": "Over Forty",
            "disparity": disp_metrics[0]['for_disparity']
        }]
    }
import pandas as pd
from aequitas.group import Group


def execute(self):
    model = self.download_model()
    tabla_3 = pd.read_sql_table('centers', self.engine, schema="transformed")
    tabla_4 = pd.read_sql_table('inspections', self.engine, schema="transformed")

    centros = tabla_3.copy()
    centros.rename(columns={"dc_id": "center_id"}, inplace=True)
    inspecciones = tabla_4.copy()

    # Keep only each center's most recent inspection.
    last_inspections = inspecciones.sort_values(
        by="inspectiondate").drop_duplicates(subset=["center_id"], keep="last")

    centros = centros.drop([
        'centername', 'legalname', 'building', 'street', 'zipcode', 'phone',
        'permitnumber', 'permitexp', 'status', 'agerange', 'childcaretype',
        'bin', 'url', 'datepermitted', 'actual', 'violationratepercent',
        'violationavgratepercent', 'publichealthhazardviolationrate',
        'averagepublichealthhazardiolationrate', 'criticalviolationrate',
        'avgcriticalviolationrate'
    ], axis=1)
    centros = centros.reset_index(drop=True)

    tabla_5 = pd.merge(last_inspections, centros)
    tabla_5.sort_values(['inspectiondate'], ascending=[False], inplace=True)
    tabla_5['maximumcapacity'] = tabla_5['maximumcapacity'].astype(int)
    tabla_5['totaleducationalworkers'] = tabla_5[
        'totaleducationalworkers'].astype(int)
    tabla_5['averagetotaleducationalworkers'] = tabla_5[
        'averagetotaleducationalworkers'].astype(float)
    tabla_5 = tabla_5.drop([
        'regulationsummary', 'healthcodesubsection', 'violationstatus',
        'borough', 'reason', 'inspectiondate', 'violationcategory_nan'
    ], axis=1)
    tabla_5 = tabla_5.set_index(['center_id'])
    tabla_5 = tabla_5.fillna(0)
    for col in tabla_5.select_dtypes(object):
        tabla_5[col] = tabla_5[col].astype(float)
    tabla_5 = tabla_5.fillna(0)

    # Score the centers with the trained model.
    prds = model.predict(
        tabla_5.drop(['violationcategory_public_health_hazard'], axis=1))
    probas = model.predict_proba(
        tabla_5.drop(['violationcategory_public_health_hazard'], axis=1))
    res = pd.DataFrame({
        "center": tabla_5.index,
        "etiqueta": prds,
        "proba_0": probas[:, 0],
        "proba_1": probas[:, 1]
    })
    # The score is the probability of the predicted class.
    res.loc[res['proba_0'] > res['proba_1'], 'score'] = res['proba_0']
    res.loc[res['proba_0'] < res['proba_1'], 'score'] = res['proba_1']

    # Recover the categorical attributes from their one-hot columns.
    categorias_1 = [
        "programtype_all_age_camp", "programtype_infant_toddler",
        "programtype_preschool", "programtype_preschool_camp",
        "programtype_school_age_camp"
    ]
    programtype = pd.get_dummies(centros[categorias_1]).idxmax(1)
    categorias_2 = [
        "borough_bronx", "borough_brooklyn", "borough_manhattan",
        "borough_queens", "borough_staten_island"
    ]
    borough = pd.get_dummies(centros[categorias_2]).idxmax(1)
    ambas = pd.concat([borough, programtype], axis=1)
    ambas = ambas.rename(columns={0: 'borough', 1: 'programtype'})
    tabla_1 = pd.concat([centros, ambas], axis=1)
    tabla_2 = pd.merge(res, tabla_1, left_on='center', right_on='center_id')

    for i in list(tabla_2.index):
        if str(tabla_2.iloc[i].borough_bronx) == "1":
            tabla_2.loc[tabla_2.index == i, "borough"] = "bronx"
        elif str(tabla_2.iloc[i].borough_brooklyn) == "1":
            tabla_2.loc[tabla_2.index == i, "borough"] = "brooklyn"
        elif str(tabla_2.iloc[i].borough_manhattan) == "1":
            tabla_2.loc[tabla_2.index == i, "borough"] = "manhattan"
        elif str(tabla_2.iloc[i].borough_queens) == "1":
            tabla_2.loc[tabla_2.index == i, "borough"] = "queens"
        elif str(tabla_2.iloc[i].borough_staten_island) == "1":
            tabla_2.loc[tabla_2.index == i, "borough"] = "staten_island"
    tabla_2.drop(categorias_2, axis=1, inplace=True)

    for i in list(tabla_2.index):
        if str(tabla_2.iloc[i].programtype_all_age_camp) == "1":
            tabla_2.loc[tabla_2.index == i, "programtype"] = "all_age_camp"
        elif str(tabla_2.iloc[i].programtype_infant_toddler) == "1":
            tabla_2.loc[tabla_2.index == i, "programtype"] = "infant_toddler"
        elif str(tabla_2.iloc[i].programtype_preschool) == "1":
            tabla_2.loc[tabla_2.index == i, "programtype"] = "preschool"
        elif str(tabla_2.iloc[i].programtype_preschool_camp) == "1":
            tabla_2.loc[tabla_2.index == i, "programtype"] = "preschool_camp"
        elif str(tabla_2.iloc[i].programtype_school_age_camp) == "1":
            tabla_2.loc[tabla_2.index == i, "programtype"] = "school_age_camp"
    tabla_2.drop(categorias_1, axis=1, inplace=True)

    # Aequitas-ready frame: predicted label, score, and the two attributes.
    tabla_6 = tabla_2.loc[:, [
        'center', 'etiqueta', 'score', 'borough', 'programtype'
    ]]
    tabla_6 = tabla_6.rename(columns={'etiqueta': 'label_value'})
    tabla_6.set_index('center', inplace=True)

    g = Group()
    xtab, _ = g.get_crosstabs(tabla_6)
    absolute_metrics = g.list_absolute_metrics(xtab)
    df_group = xtab[[
        col for col in xtab.columns if col not in absolute_metrics
    ]].copy()
    df_group['model_id'] = self.model_id
    df_group['date'] = self.date_param
    self.output_table = df_group
    return ([tuple(x) for x in df_group.to_numpy()],
            [(c, 'VARCHAR') for c in list(df_group.columns)])
import matplotlib.pyplot as plt
import seaborn as sns
from aequitas.group import Group
from aequitas.plotting import Plot

# `dataset` (an Aequitas-ready DataFrame) and `aq_palette_label` are defined
# earlier in the script. The first countplot's opening call was truncated in
# the source; it is reconstructed here from the parallel "sex" plot below.
label_race = sns.countplot(x="race", hue="label_value",
                           data=dataset[dataset.race.isin(
                               ['Black', 'White', 'Hispanic'])],
                           palette=aq_palette_label)
plt.savefig('./figures/LAW_DATA/label_race_law.png')
plt.clf()
label_sex = sns.countplot(x="sex", hue="label_value", data=dataset,
                          palette=aq_palette_label)
plt.savefig('./figures/LAW_DATA/label_sex_law.png')
plt.clf()

# Compute the group metrics table
g = Group()
xtab, _ = g.get_crosstabs(dataset)
absolute_metrics = g.list_absolute_metrics(xtab)

# Print the table
tabla_grupo = xtab[['attribute_name', 'attribute_value']
                   + absolute_metrics].round(2)
print(tabla_grupo)

aqp = Plot()

# Plot the group metric values for FNR
fnr = aqp.plot_group_metric(xtab, 'fnr')

# Plot the group metric values for FNR, dropping populations below a
# minimum group-size threshold
fnr = aqp.plot_group_metric(xtab, 'fnr', min_group_size=0.05)

# Group metrics for all the chosen metrics
p = aqp.plot_group_metric_all(xtab, metrics=['ppr', 'pprev', 'fnr', 'fpr'],
                              ncols=4)
import pandas as pd
from aequitas.bias import Bias
from aequitas.fairness import Fairness
from aequitas.group import Group


def fun_bias_fair(a_zip, a_type, fea_eng, model):
    # Score the engineered features with the trained model.
    X = fea_eng.drop([
        'aka_name', 'facility_type', 'address', 'inspection_date',
        'inspection_type', 'violations', 'results', 'pass'
    ], axis=1)
    y_pred = model.predict(X)

    # Assemble an Aequitas-ready frame: score, label_value, and the raw
    # attributes ('zip', 'facility_type') used to look up the audit groups.
    xt = pd.DataFrame({
        'zip': fea_eng['zip'].astype(float).values,
        'facility_type': fea_eng['facility_type'].values,
        'label_value': fea_eng['pass'].values,
        'score': y_pred,
    })

    # Map zip codes to zones and facility types to facility groups.
    a_zip['zip'] = a_zip['zip'].astype(float)
    compas = pd.merge(left=xt, right=a_zip, how='left', on='zip')
    compas = pd.merge(left=compas, right=a_type, how='left',
                      on='facility_type')
    compas.pop('zip')
    compas.pop('facility_type')
    compas['zone'] = compas['zone'].astype(str)
    compas['score'] = compas['score'].astype(int)
    compas['label_value'] = compas['label_value'].astype(int)

    # Group
    g = Group()
    xtab, attrbs = g.get_crosstabs(compas)
    absolute_metrics = g.list_absolute_metrics(xtab)
    group_df = xtab[['attribute_name', 'attribute_value']
                    + [col for col in xtab.columns
                       if col in absolute_metrics]].round(4)

    # Bias: disparities against predefined reference groups.
    bias = Bias()
    bdf = bias.get_disparity_predefined_groups(
        xtab,
        original_df=compas,
        ref_groups_dict={'zone': 'West', 'facility_group': 'grocery'},
        alpha=0.05)
    bias_bdf = bdf[['attribute_name', 'attribute_value']
                   + bias.list_disparities(bdf)].round(2)

    # Disparities against the majority group and against the group with
    # the minimum value of each metric.
    majority_bdf = bias.get_disparity_major_group(xtab, original_df=compas)
    bias_maj_bdf = majority_bdf[['attribute_name', 'attribute_value']
                                + bias.list_disparities(majority_bdf)].round(2)
    min_bdf = bias.get_disparity_min_metric(xtab, original_df=compas)
    bias_min_bdf = min_bdf[['attribute_name', 'attribute_value']
                           + bias.list_disparities(min_bdf)].round(2)

    # Fairness
    fair = Fairness()
    fdf = fair.get_group_value_fairness(bdf)
    parity_determinations = fair.list_parities(fdf)
    fair_fdf = fdf[['attribute_name', 'attribute_value'] + absolute_metrics
                   + bias.list_disparities(fdf)
                   + parity_determinations].round(2)
    gaf = fair.get_group_attribute_fairness(fdf)
    gof = fair.get_overall_fairness(fdf)

    # Report the FOR/FNR metrics, disparities, and parity determinations.
    tab_bias_fair = fair_fdf[[
        'attribute_name', 'attribute_value', 'for', 'fnr',
        'for_disparity', 'fnr_disparity', 'FOR Parity', 'FNR Parity'
    ]].copy()
    tab_bias_fair.rename(columns={
        'attribute_value': 'group_name',
        'FOR Parity': 'for_parity',
        'FNR Parity': 'fnr_parity',
        'for': 'for_'
    }, inplace=True)
    print(tab_bias_fair)
    return tab_bias_fair
import os

import pandas as pd
from aequitas.bias import Bias
from aequitas.fairness import Fairness
from aequitas.group import Group
from aequitas.plotting import Plot


def aq_analysis(arguments):
    # Import the result set from the best model.
    result_set = pd.read_csv(arguments.result_set) \
        .drop_duplicates(subset="block_group") \
        .rename(columns={"successful": "label_value"})

    # Drop columns not needed for the analysis (DROP_COLUMN_KEYWORDS is a
    # module-level constant).
    features_to_drop = [
        column for column in result_set.columns
        if column in DROP_COLUMN_KEYWORDS and "count" not in column
    ]
    result_set = result_set.drop(columns=features_to_drop)

    # Initialize the base comparison attributes dictionary.
    base_comparison = {"pct_white": None, "pct_high_income": None}
    base_comparison_label = "_".join(base_comparison.keys())

    # Preprocess outside of Aequitas because preprocess_input_df() doesn't
    # work here: cast score/label and bucket the other columns into quartiles.
    for column in result_set.columns:
        if column == "score":
            result_set[column] = result_set[column].astype(float)
        elif column == "label_value":
            result_set[column] = result_set[column].astype(int)
        else:
            if result_set[column].nunique() > 1:
                result_set[column], bins = pd.qcut(x=result_set[column],
                                                   q=4,
                                                   precision=2,
                                                   duplicates="drop",
                                                   retbins=True)
                # Save the label of the highest quartile for the base
                # comparison attributes.
                if column in base_comparison:
                    lb = str(round(bins[3], 2))
                    ub = str(round(bins[4], 2))
                    base_comparison[column] = "(" + lb + ", " + ub + "]"
            result_set[column] = result_set[column].astype(str)

    # Initialize Aequitas objects and the export directory.
    aqg, aqb, aqf, aqp = Group(), Bias(), Fairness(), Plot()
    directory = "aequitas"
    if not os.path.exists(directory):
        os.makedirs(directory)

    # Calculate crosstabs by distinct group.
    crosstabs, _ = aqg.get_crosstabs(
        df=result_set,
        score_thresholds={"score": [float(arguments.threshold)]})
    absolute_metrics = aqg.list_absolute_metrics(crosstabs)
    crosstabs[["attribute_name", "attribute_value"] + absolute_metrics] \
        .round(2) \
        .to_csv(directory + "/aequitas_crosstabs.csv", index=False)

    # Plot bias and fairness with respect to white, high-income communities
    # (METRICS is a module-level constant).
    disparity_white_hiinc = aqb.get_disparity_predefined_groups(
        crosstabs.loc[crosstabs["attribute_name"].isin(
            base_comparison.keys())],
        result_set,
        base_comparison)
    a = aqp.plot_disparity_all(disparity_white_hiinc,
                               metrics=METRICS,
                               show_figure=False)
    a_filename = "bias_ref_" + base_comparison_label + ".png"
    a.savefig(directory + "/" + a_filename)
    b = aqp.plot_fairness_disparity_all(
        aqf.get_group_value_fairness(disparity_white_hiinc),
        metrics=METRICS,
        show_figure=False)
    b_filename = "fairness_ref_" + base_comparison_label + ".png"
    b.savefig(directory + "/" + b_filename)
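# A sketch of how aq_analysis above might be invoked from the command line.
# The flag names (--result-set, --threshold) mirror the attributes the
# function reads from `arguments`; they are assumptions, not a confirmed CLI.
import argparse

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Aequitas bias audit")
    parser.add_argument("--result-set", dest="result_set", required=True,
                        help="CSV of scored block groups from the best model")
    parser.add_argument("--threshold", default="0.5",
                        help="Score threshold passed to get_crosstabs")
    aq_analysis(parser.parse_args())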