Example #1
    def repair(self, data_to_repair):

        # Convert the "feature_to_repair" into a pseudo-categorical feature by
        # applying binning on that column.
        binned_data = [row[:] for row in data_to_repair]
        index_bins = make_histogram_bins(bin_calculator, data_to_repair,
                                         self.feature_to_repair)

        category_medians = {}
        for i, index_bin in enumerate(index_bins):
            # i.e., the "category" used to replace the numeric values.
            bin_name = "BIN_{}".format(i)
            for j in index_bin:
                binned_data[j][self.feature_to_repair] = bin_name
            category_vals = [
                data_to_repair[j][self.feature_to_repair] for j in index_bin
            ]
            category_medians[bin_name] = get_median(category_vals)

        repaired_data = self.categoric_repairer.repair(binned_data)

        # Replace each bin label in the "feature_to_repair" column with that
        # bin's median numeric value (or restore the original value when
        # repair_level is 0).
        for i in xrange(len(repaired_data)):
            if self.repair_level > 0:
                rep_category = repaired_data[i][self.feature_to_repair]
                repaired_data[i][
                    self.feature_to_repair] = category_medians[rep_category]
            else:
                repaired_data[i][self.feature_to_repair] = data_to_repair[i][
                    self.feature_to_repair]
        return repaired_data
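
The repair methods in these listings lean on a get_median helper that is not shown. Below is only a minimal sketch of what such a helper could look like; the kdd flag and the tie-breaking rule for even-length lists are assumptions. The one property the surrounding code genuinely relies on is that the returned median is an element of the input list, since Example #4 later looks the result up via index_lookup[col_id][median].

def get_median(values, kdd=False):
    # Hypothetical sketch: always return an element of `values`, because
    # callers later look the result up by value. The meaning of the `kdd`
    # flag here is an assumption, not taken from the source project.
    if not values:
        raise ValueError("Cannot take the median of an empty list.")
    ordered = sorted(values)
    mid = len(ordered) // 2
    if kdd or len(ordered) % 2 == 1:
        return ordered[mid]
    # For even-length lists, fall back to the lower middle element so that
    # the result still occurs in the data.
    return ordered[mid - 1]
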
Example #2
def test_sample():
    data = [[float(i), float(i) * 2, 1] for i in xrange(0, 150)]
    feature_to_repair = 0
    repairer = Repairer(data, feature_to_repair, 0.5)
    repaired_data = repairer.repair(data)
    print "repaired_data altered?", repaired_data != data

    median = get_median([row[feature_to_repair] for row in data])
    print "median replaces column?", all(row[feature_to_repair] == median
                                         for row in repaired_data)

    repairer = Repairer(data, feature_to_repair, 0.0)
    repaired_data = repairer.repair(data)
    print "repaired_data unaltered for repair level=0?", repaired_data == data
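
Example #2 constructs Repairer(data, feature_to_repair, 0.5), but the constructor itself is not listed. The sketch below is only a guess at the attributes that the repair methods in Examples #1 and #4 read through self (feature_to_repair, repair_level, kdd, features_to_ignore); the real class presumably also sets up helpers such as the categoric_repairer used in Example #1.

class Repairer(object):
    # Hypothetical constructor sketch; argument names beyond the three used
    # in the test above are assumptions.
    def __init__(self, all_data, feature_to_repair, repair_level,
                 kdd=False, features_to_ignore=None):
        self.all_data = all_data
        self.feature_to_repair = feature_to_repair
        self.repair_level = repair_level  # 0.0 = untouched, 1.0 = fully repaired
        self.kdd = kdd
        self.features_to_ignore = features_to_ignore or []
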
Example #3
def get_median_per_category(categories, categories_count_norm):
    return {
        cat: get_median(categories_count_norm[cat], False)
        for cat in categories
    }
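
A small, made-up usage example: categories_count_norm maps each category to its normalized counts across the stratified groups, and get_median_per_category reduces each list to a single median per category (the numbers below are invented for illustration).

categories = ["A", "B"]
categories_count_norm = {
    "A": [0.2, 0.5, 0.3],  # category A's normalized count in each group
    "B": [0.8, 0.5, 0.7],
}
medians = get_median_per_category(categories, categories_count_norm)
# With the get_median sketch shown after Example #1, this yields
# {"A": 0.3, "B": 0.7}.
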
Example #4
    def repair(self, data_to_repair):
        num_cols = len(data_to_repair[0])
        col_ids = range(num_cols)

        # Get column type information
        col_types = ["Y"] * len(col_ids)
        for i, col in enumerate(col_ids):
            if i in self.features_to_ignore:
                col_types[i] = "I"
            elif i == self.feature_to_repair:
                col_types[i] = "X"

        col_type_dict = {
            col_id: col_type
            for col_id, col_type in zip(col_ids, col_types)
        }

        not_I_col_ids = filter(lambda x: col_type_dict[x] != "I", col_ids)

        if self.kdd:
            cols_to_repair = filter(lambda x: col_type_dict[x] == "Y", col_ids)
        else:
            cols_to_repair = filter(lambda x: col_type_dict[x] in "YX",
                                    col_ids)

        # To avoid problems with user-provided column names, stratify on the
        # (safe) column index of the feature to repair.
        safe_stratify_cols = [self.feature_to_repair]

        # Extract column values for each attribute in data
        # Begin by initializing the keys and values of the dictionary.
        data_dict = {col_id: [] for col_id in col_ids}

        # Populate each attribute with its column values
        for row in data_to_repair:
            for i in col_ids:
                data_dict[i].append(row[i])

        repair_types = {}
        for col_id, values in data_dict.items():
            if all(isinstance(value, float) for value in values):
                repair_types[col_id] = float
            elif all(isinstance(value, int) for value in values):
                repair_types[col_id] = int
            else:
                repair_types[col_id] = str
        """
     Create unique value structures: When performing repairs, we choose median values. If repair is partial, then values will be modified to some intermediate value between the original and the median value. However, the partially repaired value will only be chosen out of values that exist in the data set.  This prevents choosing values that might not make any sense in the data's context.  To do this, for each column, we need to sort all unique values and create two data structures: a list of values, and a dict mapping values to their positions in that list. Example: There are unique_col_vals[col] = [1, 2, 5, 7, 10, 14, 20] in the column. A value 2 must be repaired to 14, but the user requests that data only be repaired by 50%. We do this by finding the value at the right index:
       index_lookup[col][2] = 1
       index_lookup[col][14] = 5
       this tells us that unique_col_vals[col][3] = 7 is 50% of the way from 2 to 14.
    """
        unique_col_vals = {}
        index_lookup = {}
        for col_id in not_I_col_ids:
            col_values = data_dict[col_id]
            # extract unique values from column and sort
            col_values = sorted(list(set(col_values)))
            unique_col_vals[col_id] = col_values
            # look up a value, get its position
            index_lookup[col_id] = {
                col_values[i]: i
                for i in range(len(col_values))
            }
        """
     Make a list of unique values per each stratified column.  Then make a list of combinations of stratified groups. Example: race and gender cols are stratified: [(white, female), (white, male), (black, female), (black, male)] The combinations are tuples because they can be hashed and used as dictionary keys.  From these, find the sizes of these groups.
    """
        unique_stratify_values = [
            unique_col_vals[i] for i in safe_stratify_cols
        ]
        all_stratified_groups = list(product(*unique_stratify_values))
        # look up a stratified group, and get a list of indices corresponding to that group in the data
        stratified_group_indices = defaultdict(list)

        # Find the number of unique values for each strat-group, organized per column.
        val_sets = {
            group: {col_id: set()
                    for col_id in cols_to_repair}
            for group in all_stratified_groups
        }
        for i, row in enumerate(data_to_repair):
            group = tuple(row[col] for col in safe_stratify_cols)
            for col_id in cols_to_repair:
                val_sets[group][col_id].add(row[col_id])

            # Also remember that this row pertains to this strat-group.
            stratified_group_indices[group].append(i)
        """
     Separate data by stratified group to perform repair on each Y column's values given that their corresponding protected attribute is a particular stratified group. We need to keep track of each Y column's values corresponding to each particular stratified group, as well as each value's index, so that when we repair the data, we can modify the correct value in the original data. Example: Supposing there is a Y column, "Score1", in which the 3rd and 5th scores, 70 and 90 respectively, belonged to black women, the data structure would look like: {("Black", "Woman"): {Score1: [(70,2),(90,4)]}}
    """
        stratified_group_data = {group: {} for group in all_stratified_groups}
        for group in all_stratified_groups:
            for col_id, col_dict in data_dict.items():
                # Get the indices at which each value occurs.
                indices = {}
                for i in stratified_group_indices[group]:
                    value = col_dict[i]
                    if value not in indices:
                        indices[value] = []
                    indices[value].append(i)

                stratified_col_values = [(occurs, val)
                                         for val, occurs in indices.items()]
                stratified_col_values.sort(key=lambda tup: tup[1])
                stratified_group_data[group][col_id] = stratified_col_values

        mode_feature_to_repair = get_mode(data_dict[self.feature_to_repair])

        # Repair Data and retrieve the results
        for col_id in cols_to_repair:
            # Track how far (as a fraction) into each group's sorted distinct
            # values we have already advanced.
            group_offsets = {group: 0 for group in all_stratified_groups}
            col = data_dict[col_id]

            num_quantiles = min(
                len(val_sets[group][col_id])
                for group in all_stratified_groups)
            quantile_unit = 1.0 / num_quantiles

            if repair_types[col_id] in {int, float}:
                for quantile in range(num_quantiles):
                    median_at_quantiles = []
                    indices_per_group = {}

                    for group in all_stratified_groups:
                        group_data_at_col = stratified_group_data[group][
                            col_id]
                        num_vals = len(group_data_at_col)
                        offset = int(round(group_offsets[group] * num_vals))
                        number_to_get = int(
                            round((group_offsets[group] + quantile_unit) *
                                  num_vals) - offset)
                        group_offsets[group] += quantile_unit

                        if number_to_get > 0:

                            # Get data at this quantile from this Y column such that stratified X = group
                            offset_data = group_data_at_col[offset:offset +
                                                            number_to_get]
                            indices_per_group[group] = [
                                i for val_indices, _ in offset_data
                                for i in val_indices
                            ]
                            values = sorted(
                                [float(val) for _, val in offset_data])

                            # Find this group's median value at this quantile
                            median_at_quantiles.append(
                                get_median(values, self.kdd))

                    # Find the median value of all groups at this quantile (chosen from each group's medians)
                    median = get_median(median_at_quantiles, self.kdd)
                    median_val_pos = index_lookup[col_id][median]

                    # Update values to repair the dataset.
                    for group in all_stratified_groups:
                        for index in indices_per_group[group]:
                            original_value = col[index]

                            current_val_pos = index_lookup[col_id][
                                original_value]
                            distance = median_val_pos - current_val_pos  # distance between indices
                            distance_to_repair = int(
                                round(distance * self.repair_level))
                            index_of_repair_value = current_val_pos + distance_to_repair
                            repaired_value = unique_col_vals[col_id][
                                index_of_repair_value]

                            # Update the data to the repaired value.
                            data_dict[col_id][index] = repaired_value

            # Categorical repair is done below.
            elif repair_types[col_id] in {str}:
                feature = CategoricalFeature(col)
                categories = feature.bin_index_dict.keys()

                group_features = get_group_data(all_stratified_groups,
                                                stratified_group_data, col_id)

                categories_count = get_categories_count(
                    categories, all_stratified_groups, group_features)

                categories_count_norm = get_categories_count_norm(
                    categories, all_stratified_groups, categories_count,
                    group_features)

                median = get_median_per_category(categories,
                                                 categories_count_norm)

                # Partially apply the generator functions to simplify later calls.
                dist_generator = lambda group_index, category: gen_desired_dist(
                    group_index, category, col_id, median, self.repair_level,
                    categories_count_norm, self.feature_to_repair,
                    mode_feature_to_repair)

                count_generator = lambda group_index, group, category: gen_desired_count(
                    group_index, group, category, median, group_features, self.
                    repair_level, categories_count)

                group_features, overflow = flow_on_group_features(
                    all_stratified_groups, group_features, count_generator)

                group_features, assigned_overflow, distribution = assign_overflow(
                    all_stratified_groups, categories, overflow,
                    group_features, dist_generator)

                # Write the repaired feature back into data_dict at the
                # original row indices.
                for group in all_stratified_groups:
                    indices = stratified_group_indices[group]
                    for i, index in enumerate(indices):
                        repaired_value = group_features[group].data[i]
                        data_dict[col_id][index] = repaired_value

        # Reassemble the repaired dataset row by row. (The stratified/protected
        # column itself has been repaired above, pushing it toward its mode
        # value and removing its information.)
        repaired_data = []
        for i, orig_row in enumerate(data_to_repair):
            new_row = [
                orig_row[j] if j not in cols_to_repair else data_dict[j][i]
                for j in col_ids
            ]
            repaired_data.append(new_row)
        return repaired_data
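
The numeric branch above never invents new values: it moves a value's position in the sorted list of unique column values part of the way toward the median's position, exactly as the first docstring describes. Below is a standalone sketch of just that arithmetic, reusing the docstring's numbers (unique values [1, 2, 5, 7, 10, 14, 20], original value 2, target median 14, repair level 0.5).

def partially_repair(original_value, median, unique_vals, repair_level):
    # Positions of the original value and of the median within the sorted
    # unique values that actually occur in this column.
    index_lookup = {val: i for i, val in enumerate(unique_vals)}
    current_pos = index_lookup[original_value]
    median_pos = index_lookup[median]
    # Move repair_level of the way (in index space) toward the median, then
    # read back a value that really exists in the data.
    distance = median_pos - current_pos
    target_pos = current_pos + int(round(distance * repair_level))
    return unique_vals[target_pos]

# Half-way from 2 (index 1) to 14 (index 5) lands at index 3, i.e. the value 7.
assert partially_repair(2, 14, [1, 2, 5, 7, 10, 14, 20], 0.5) == 7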