def _load_data(self):
    """Load the COMPAS data and return it as ``(x, s, y)`` float arrays.

    Returns:
        tuple: ``(x, s, y)``.

        * ``x`` -- the four juvenile/prior count columns passed through
          ``whiten``, followed by one 0/1 indicator column per distinct
          ``c_charge_degree`` value (in the order ``unique()`` yields them).
        * ``s`` -- column vector built from ``race``: 0.0 where the value is
          ``'Caucasian'`` and 1.0 everywhere else.
        * ``y`` -- column vector holding the dataset's target column.
    """
    compas_data = COMPASDataset()
    frame = compas_data.df

    # Use race as the sensitive attribute. A direct comparison gives the
    # same 0/1 coding as the former two-pass ``where`` without creating an
    # intermediate Series that mixes strings and integers.
    s = (frame['race'] != 'Caucasian').to_numpy(dtype=float).reshape(-1, 1)

    # Use juvenile felonies, juvenile misdemeanors, juvenile others and
    # prior convictions as the numeric features.
    count_columns = [
        'juv_fel_count', 'juv_misd_count', 'juv_other_count', 'priors_count'
    ]
    x = whiten(data=frame[count_columns].values.astype(float))

    # Charge degree categories in one-hot encoding: one indicator column per
    # distinct value, appended to the features with a single hstack.
    charge_degree = frame['c_charge_degree']
    indicators = [
        (charge_degree == category).to_numpy(dtype=float).reshape(-1, 1)
        for category in charge_degree.unique()
    ]
    x = np.hstack([x, *indicators])

    # Use actual recidivism as the target variable.
    y = frame[compas_data.target].values.reshape(-1, 1)

    return x.astype(float), s, y.astype(float)
def compas():
    """Fetch the COMPAS dataset with race as the sensitive attribute.

    Returns:
        tuple: ``(X, Y, A)`` where ``X`` is the one-hot encoded feature
        frame (missing values filled with 0 and ``length_of_stay`` reduced
        to its ``.dt.days`` component), ``Y`` holds the values of the
        ``is_recid`` column, and ``A`` is the ``race`` column after
        ``LabelEncoder.fit_transform``.
    """
    print('Fetching COMPAS data with sensitive_attribute=race ...')
    from responsibly.dataset import COMPASDataset

    compas_ds = COMPASDataset()
    feature_columns = [
        'sex', 'age', 'c_charge_degree', 'age_cat', 'score_text',
        'priors_count', 'days_b_screening_arrest', 'decile_score',
        'length_of_stay'
    ]
    # Take an explicit copy so the column rewrite below neither writes
    # through to ``compas_ds.df`` nor raises SettingWithCopyWarning.
    X = compas_ds.df[feature_columns].copy()
    # Plain column assignment replaces the column wholesale; ``.loc[:, col]``
    # would first try to keep the existing dtype when storing the day counts.
    X['length_of_stay'] = X['length_of_stay'].dt.days
    X = X.fillna(0)
    X = pd.get_dummies(X)

    Y = compas_ds.df['is_recid'].values
    A = preprocessing.LabelEncoder().fit_transform(compas_ds.df['race'])
    return X, Y, A
def __init__(self, config_file):
    """Build the dataset wrapper described by a JSON configuration file.

    Args:
        config_file: Path to a JSON file providing the keys ``dataset``,
            ``sensitive_attributes``, ``non_numeric_attributes``,
            ``numeric_attributes``, ``date_attributes``, ``ground_truth``
            and ``predictions``. ``numeric_attributes`` and
            ``date_attributes`` are mappings whose keys are column names;
            an empty or null value means "no such columns".

    Raises:
        OSError: If the configuration file cannot be opened.
        json.JSONDecodeError: If the file is not valid JSON.
        KeyError: If one of the required configuration keys is missing.
    """
    # Read the configuration inside a context manager so the file handle
    # is closed even when parsing fails.
    with open(config_file, 'r') as f:
        config = json.load(f)

    self.type = config['dataset']
    # Any dataset name other than 'COMPAS' selects the German dataset.
    _dataset = COMPASDataset() if self.type == 'COMPAS' else GermanDataset()

    self._sensitive_attributes = config["sensitive_attributes"]
    self._non_numeric_attributes = config["non_numeric_attributes"]
    self._n_attributes_dict = config["numeric_attributes"]
    self._numeric_attributes = (
        list(self._n_attributes_dict.keys()) if self._n_attributes_dict else []
    )
    self._date_attributes_dict = config["date_attributes"]
    self._date_attributes = (
        list(self._date_attributes_dict.keys())
        if self._date_attributes_dict else []
    )
    self._target = config['ground_truth']
    self._predictions = config['predictions']

    # Human-readable views of the selected feature columns and the target.
    selected_columns = (
        self._sensitive_attributes
        + self._non_numeric_attributes
        + self._numeric_attributes
        + self._date_attributes
    )
    self._data_x_readable = pd.DataFrame(_dataset.df[selected_columns])
    self._data_y_readable = pd.DataFrame(_dataset.df[self._target])

    # One LabelEncoder per key, created lazily on first access.
    self._encoder_dict_x = defaultdict(LabelEncoder)
    self._encoder_dict_y = defaultdict(LabelEncoder)

    self._data_x, self._data_y = self._preprocess_data()
def compas_ds():
    """Create and return a new ``COMPASDataset`` instance."""
    dataset = COMPASDataset()
    return dataset
def compas():
    """Return the COMPAS data as a trimmed, partly encoded DataFrame.

    Adjustments applied to the frame taken from ``COMPASDataset``:

    - ``c_jail_in`` / ``c_jail_out`` are collapsed into ``c_jail_time``.
    - ``in_custody`` / ``out_custody`` are collapsed into ``custody_length``.
    - Both durations are expressed in fractional days.
    - Binary attributes are encoded as 0/1: ``sex`` maps Male => 0,
      Female => 1 and ``c_charge_degree`` maps M (misdemeanor) => 0,
      F (felony) => 1.
    - ``race`` is passed through the ``one_hot`` helper.
    - Remaining NaN values are replaced by the string ``"other"``.
    - Identifier, date, free-text and otherwise unused columns are dropped.

    Returns:
        pandas.DataFrame: the processed frame.
    """
    # Get the whole dataset, already filtered for us by the library. Work on
    # a copy so the dataset object's own frame is left untouched.
    compas_ds = COMPASDataset()
    cdf = compas_ds.df.copy()

    seconds_per_day = 24 * 60 * 60

    # Turn the jail stay into a single duration column. The previous
    # ``days + seconds / 3600`` added whole days to hours, so one day plus
    # twelve hours scored the same as thirteen days; use fractional days.
    jail_span = pd.to_datetime(cdf['c_jail_out']) - pd.to_datetime(cdf['c_jail_in'])
    cdf["c_jail_time"] = jail_span.dt.total_seconds() / seconds_per_day

    # Turn the time in custody into a single duration column (same unit).
    custody_span = (
        pd.to_datetime(cdf['out_custody']) - pd.to_datetime(cdf['in_custody'])
    )
    cdf["custody_length"] = custody_span.dt.total_seconds() / seconds_per_day

    # Encode sex and charge degree as 0/1.
    cdf = cdf.replace({'sex': {'Male': 0, 'Female': 1}})
    cdf = cdf.replace({'c_charge_degree': {'M': 0, 'F': 1}})

    # One-hot encode race.
    cdf = one_hot(cdf, "race")

    # Replace NaNs with a placeholder label.
    # NOTE(review): this applies to every column, so a numeric column that
    # still holds NaN (e.g. a duration with a missing timestamp) receives the
    # string "other" as well -- confirm that downstream code expects this.
    cdf = cdf.replace({np.nan: "other"})

    # Columns that are not used as features. Comment entries out here if you
    # want to keep them.
    unused_columns = [
        "name", "id", "dob", "first", "last", "compas_screening_date",
        "age_cat", "c_case_number", "r_case_number", "vr_case_number",
        "decile_score.1", "type_of_assessment", "score_text",
        "screening_date", "v_type_of_assessment", "priors_count.1",
        "v_score_text", "v_screening_date", "in_custody", "out_custody",
        "length_of_stay", "c_jail_out", "c_jail_in", "c_charge_desc",
        "c_offense_date", "c_arrest_date", "r_charge_degree",
        "r_days_from_arrest", "r_offense_date", "r_charge_desc", "r_jail_in",
        "r_jail_out", "violent_recid", "vr_charge_degree", "vr_offense_date",
        "score_factor", "vr_charge_desc", "v_decile_score",
        "c_days_from_compas", "start", "end", "event",
        "days_b_screening_arrest"
    ]
    cdf = cdf.drop(unused_columns, axis=1)

    return cdf
def compas_ds():
    """Return a ``COMPASDataset`` restricted to two race groups.

    The dataset's ``df`` attribute is replaced by the subset of rows whose
    ``race`` is either ``'African-American'`` or ``'Caucasian'``.
    """
    dataset = COMPASDataset()
    frame = dataset.df
    kept_races = ['African-American', 'Caucasian']
    is_kept = frame['race'].isin(kept_races)
    dataset.df = frame[is_kept]
    return dataset