import configparser
import time
from collections import defaultdict

import numpy as np

# NOTE: project-specific helpers used below (read_list, read_data, clean_data,
# subsample_data, scale_data, create_prob_vecs, sample_colors,
# sample_colors_ml_model, perturb_2_color, weighted_size, weighted_size_by_idx,
# take_by_key, vanilla_clustering, vanilla_clustering_weighted,
# fair_partial_assignment, fair_partial_assignment_2_color,
# fair_partial_assignment_large_cluster, violating_lp_clustering,
# write_fairness_trial) are assumed to be imported from this repository's
# utility and LP-solver modules.


def fair_clustering_large_cluster(dataset, config_file, data_dir, num_clusters,
                                  deltas, max_points, L=0, p_acc=1.0,
                                  ml_model_flag=False):
    config = configparser.ConfigParser(converters={'list': read_list})
    config.read(config_file)

    # Read data in from a given csv_file found in config
    # df (pd.DataFrame) : holds the data
    df = read_data(config, dataset)

    # Subsample data if needed
    if max_points and len(df) > max_points:
        df = df.head(max_points)

    # Clean the data (bucketize text data)
    df, _ = clean_data(df, config, dataset)

    # variable_of_interest (list[str]) : variables that we would like to collect statistics for
    variable_of_interest = config[dataset].getlist("fairness_variable")

    # NOTE: this code only handles one color per vertex
    assert len(variable_of_interest) == 1

    # Assign each data point to a color, based on config file
    # attributes (dict[str -> defaultdict[int -> list[int]]]) : holds indices of points for each color class
    # color_flag (dict[str -> list[int]]) : holds map from point to color class it belongs to (reverse of `attributes`)
    attributes, color_flag, prob_vecs, prob_thresh = {}, {}, {}, {}
    for variable in variable_of_interest:
        colors = defaultdict(list)
        this_color_flag = [0] * len(df)

        condition_str = variable + "_conditions"
        bucket_conditions = config[dataset].getlist(condition_str)

        # For each row, if the row passes the bucket condition,
        # then the row is added to that color class
        for i, row in df.iterrows():
            for bucket_idx, bucket in enumerate(bucket_conditions):
                if eval(bucket)(row[variable]):
                    colors[bucket_idx].append(i)      # add the point to the list of its colors
                    this_color_flag[i] = bucket_idx   # record the color for this given point

        # NOTE: colors is a dict, this_color_flag is a list
        attributes[variable] = colors
        color_flag[variable] = this_color_flag

        if ml_model_flag == False:
            prob_vecs[variable] = create_prob_vecs(len(df), p_acc, len(colors), this_color_flag)
        else:
            ml_model_path = 'MLModels' + '/' + dataset
            prob_vecs_path = ml_model_path + '_prob_vecs.npy'
            n = len(df)
            prob_vecs[variable] = np.load(prob_vecs_path)[0:n, :]

    # representation (dict[str -> dict[int -> float]]) : representation of each color compared to the whole dataset
    representation = {}
    for var in variable_of_interest:
        color_proportions = np.sum(prob_vecs[var], axis=0) / len(df)
        dict_ = {}
        for j in range(color_proportions.shape[0]):
            dict_.update({j: color_proportions[j]})
        representation[var] = dict_

    # Select only the desired columns
    selected_columns = config[dataset].getlist("columns")
    df = df[[col for col in selected_columns]]

    # NOTE: this code only handles one membership criterion
    (_, fair_vals), = representation.items()

    # NOTE: this handles the case when a color is missing in the sampled vertices
    num_colors = max(fair_vals.keys()) + 1

    # Scale data if desired
    scaling = config["DEFAULT"].getboolean("scaling")
    if scaling:
        df = scale_data(df)

    # Cluster the data -- using the objective specified by clustering_method
    clustering_method = config["DEFAULT"]["clustering_method"]

    t1 = time.monotonic()
    initial_score, pred, cluster_centers = vanilla_clustering(df, num_clusters, clustering_method)
    t2 = time.monotonic()
    cluster_time = t2 - t1
    print("Clustering time: {}".format(cluster_time))

    # sizes (list[int]) : sizes of clusters
    sizes = [0 for _ in range(num_clusters)]
    for p in pred:
        sizes[p] += 1

    # dataset_ratio : Ratios for colors in the dataset
    dataset_ratio = {}
    for attr, color_dict in attributes.items():
        dataset_ratio[attr] = {
            int(color): len(points_in_color) / len(df)
            for color, points_in_color in color_dict.items()
        }

    # fairness_vars (list[str]) : Variables to perform fairness balancing on
    fairness_vars = config[dataset].getlist("fairness_variable")

    # NOTE: here is where you set the upper and lower bounds
    # NOTE: across all different values within the same attribute you have the same multipliers up and down
    for delta in deltas:
        # alpha_i = a_val * (representation of color i in dataset)
        # beta_i = b_val * (representation of color i in dataset)
        alpha, beta = {}, {}
        a_val, b_val = 1 / (1 - delta), 1 - delta
        for var, bucket_dict in attributes.items():
            alpha[var] = {k: a_val * representation[var][k] for k in bucket_dict.keys()}
            beta[var] = {k: b_val * representation[var][k] for k in bucket_dict.keys()}

            # NOTE: Sample color values
            if ml_model_flag == False:
                color_flag[var] = sample_colors(color_flag[var], num_colors, p_acc)
            else:
                color_flag[var] = sample_colors_ml_model(prob_vecs[var], num_colors)

        fp_color_flag, fp_alpha, fp_beta = (take_by_key(color_flag, fairness_vars),
                                            take_by_key(alpha, fairness_vars),
                                            take_by_key(beta, fairness_vars))

        # Solves partial assignment and then performs rounding to get integral assignment
        t1 = time.monotonic()
        res, nf_time = fair_partial_assignment_large_cluster(df, cluster_centers, fp_alpha, fp_beta,
                                                             fp_color_flag, clustering_method,
                                                             num_colors, L)
        t2 = time.monotonic()
        lp_time = t2 - t1

        ### Output / Writing data to a file
        # output is a dictionary which will hold the data to be written to the
        # outfile as key-value pairs. Outfile will be written in JSON format.
        output = {}

        # num_clusters for re-running trial
        output["num_clusters"] = num_clusters

        # Whether or not the LP found a solution
        # (nonzero status -> error occurred)
        output["partial_success"] = res["partial_success"]

        output["dataset_distribution"] = dataset_ratio

        # Save alphas and betas from trials
        output['prob_proportions'] = representation
        output["alpha"] = alpha
        output["beta"] = beta

        # Save original clustering score
        output["unfair_score"] = initial_score

        # Original Color Blind Assignments
        output["unfair_assignments"] = pred.tolist()

        # Clustering score after addition of fairness
        output["objective"] = res["objective"]

        # Clustering score after initial LP
        output["partial_fair_score"] = res["partial_objective"]

        # Save size of each cluster
        output["sizes"] = sizes

        output["attributes"] = attributes

        # These included at end because their data is large
        # Save points, colors for re-running trial
        # Partial assignments -- list bc. ndarray not serializable
        output["centers"] = [list(center) for center in cluster_centers]
        output["points"] = [list(point) for point in df.values]
        output["assignment"] = res["assignment"]
        output["partial_assignment"] = res["partial_assignment"]

        output["name"] = dataset
        output["clustering_method"] = clustering_method
        output["scaling"] = scaling
        output["delta"] = delta
        output["time"] = lp_time
        output["cluster_time"] = cluster_time

        # NOTE: record proportions
        output['partial_proportions'] = res['partial_proportions']
        output['proportions'] = res['proportions']
        output['partial_proportions_normalized'] = res['partial_proportions_normalized']
        output['proportions_normalized'] = res['proportions_normalized']

        # Record Lower Bound L
        output['Cluster_Size_Lower_Bound'] = L

        # Record Classifier Accuracy
        output['p_acc'] = p_acc

        # output['nf_time'] = nf_time

        # Record probability vecs
        # (only one fairness variable is present -- see the assert above; bind it to a
        #  separate name so the prob_vecs dict is not clobbered between delta iterations)
        (_, prob_vecs_arr), = prob_vecs.items()
        output['prob_vecs'] = prob_vecs_arr.ravel().tolist()

        # Record Probability Vector
        # NOTE: TODO

        # Writes the data in `output` to a file in data_dir
        write_fairness_trial(output, data_dir)

        # Added because sometimes the LP for the next iteration solves so
        # fast that `write_fairness_trial` cannot write to disk
        time.sleep(1)

    return output
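
# A minimal usage sketch (not part of the original pipeline). The dataset key,
# config path, and output directory below are hypothetical placeholders and
# must match entries in your own config file.
def _example_large_cluster_trial():
    return fair_clustering_large_cluster(
        dataset="example_dataset",            # hypothetical section name in the config
        config_file="config/example.ini",     # hypothetical config path
        data_dir="output",                    # directory for the JSON trial file
        num_clusters=5,
        deltas=[0.1],
        max_points=1000,
        L=20,                                 # lower bound on cluster sizes
        p_acc=0.9,                            # simulated classifier accuracy
        ml_model_flag=False)
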
def fair_clustering_metric_membership(dataset, config_file, data_dir, num_clusters,
                                      deltas, max_points, L=0):
    num_colors = 2
    config = configparser.ConfigParser(converters={'list': read_list})
    config.read(config_file)

    # Read data in from a given csv_file found in config
    df = read_data(config, dataset)

    # Subsample data if needed
    if max_points and len(df) > max_points:
        df = df.head(max_points)
        # below if you wish to shuffle
        # df = df.sample(frac=1, random_state=1).reset_index(drop=True)

    # Clean the data (bucketize text data)
    df, _ = clean_data(df, config, dataset)

    # variable_of_interest (list[str]) : variables that we would like to collect statistics for
    variable_of_interest = config[dataset].getlist("fairness_variable")

    # NOTE: this code only handles one color per vertex
    assert len(variable_of_interest) == 1

    # Assign each data point to a color, based on config file
    # attributes (dict[str -> defaultdict[int -> list[int]]]) : holds indices of points for each color class
    # color_flag (dict[str -> list[int]]) : holds map from point to color class it belongs to (reverse of `attributes`)
    attributes, color_flag, prob_vecs, prob_vals, prob_thresh = {}, {}, {}, {}, {}

    for variable in variable_of_interest:
        prob_vals[variable] = df[variable].tolist()

    (_, prob_vals_process), = prob_vals.items()
    min_val = min(prob_vals_process)
    prob_vals_process = [(p - min_val) for p in prob_vals_process]
    R_max = max(prob_vals_process)
    assert min(prob_vals_process) == 0

    # put the data back in
    for k, v in prob_vals.items():
        prob_vals[k] = prob_vals_process

    representation = {}
    for var in variable_of_interest:
        representation[var] = sum(prob_vals[var]) / len(df)

    # NOTE: this code only handles one membership criterion
    (_, fair_vals), = representation.items()

    # drop unneeded columns
    selected_columns = config[dataset].getlist("columns")
    df = df[[col for col in selected_columns]]

    # Scale data if desired
    scaling = config["DEFAULT"].getboolean("scaling")
    if scaling:
        df = scale_data(df)

    # Cluster the data -- using the objective specified by clustering_method
    clustering_method = config["DEFAULT"]["clustering_method"]

    t1 = time.monotonic()
    # NOTE: initial_score is the value of the objective at the solution
    # NOTE: this is where the color-blind algorithm is run
    if type(num_clusters) is list:
        num_clusters = num_clusters[0]
    initial_score, pred, cluster_centers = vanilla_clustering(df, num_clusters, clustering_method)
    t2 = time.monotonic()
    cluster_time = t2 - t1
    print("Clustering time: {}".format(cluster_time))

    # For each point in the dataset, assign it to the cluster and color it belongs to
    cluster_color_proportions = np.zeros((num_clusters, num_colors))

    # sizes (list[int]) : sizes of clusters
    sizes = [0 for _ in range(num_clusters)]
    for p in pred:
        sizes[p] += 1

    # dataset_ratio : Ratios for colors in the dataset
    # fairness_vars (list[str]) : Variables to perform fairness balancing on
    fairness_vars = config[dataset].getlist("fairness_variable")

    # NOTE: here is where you set the upper and lower bounds
    # NOTE: across all different values within the same attribute you have the same multipliers up and down
    for delta in deltas:
        # alpha_i = a_val * (representation of color i in dataset)
        # beta_i = b_val * (representation of color i in dataset)
        alpha, beta = {}, {}
        a_val, b_val = 1 / (1 - delta), 1 - delta
        # a_val, b_val = 1, 1

        # NOTE: 2 color case
        for var in variable_of_interest:
            alpha[var] = a_val * representation[var]
            beta[var] = b_val * representation[var]

        fp_color_flag = prob_vals
        fp_alpha = alpha
        fp_beta = beta

        # Solves partial assignment and then performs rounding to get integral assignment
        t1 = time.monotonic()
        res = fair_partial_assignment_2_color(df, cluster_centers, fp_alpha, fp_beta, fp_color_flag,
                                              clustering_method, num_colors, L)
        t2 = time.monotonic()
        lp_time = t2 - t1

        ### Output / Writing data to a file
        # output is a dictionary which will hold the data to be written to the
        # outfile as key-value pairs. Outfile will be written in JSON format.
        output = {}

        # num_clusters for re-running trial
        output["num_clusters"] = num_clusters

        # Whether or not the LP found a solution
        output["partial_success"] = res["partial_success"]

        # Nonzero status -> error occurred
        output["status"] = res["partial_status"]

        # output["dataset_distribution"] = dataset_ratio

        # Save alphas and betas from trials
        output['prob_proportions'] = representation
        output["alpha"] = alpha
        output["beta"] = beta

        # Save size of each cluster
        output["sizes"] = sizes

        output["attributes"] = attributes

        # These included at end because their data is large
        # Save points, colors for re-running trial
        # Partial assignments -- list bc. ndarray not serializable
        ''' IMPORTANT '''
        output["centers"] = [list(center) for center in cluster_centers]
        output["points"] = [list(point) for point in df.values]

        # Save original clustering score
        output["unfair_score"] = initial_score

        # Original Color Blind Assignments
        if type(pred) is not list:
            pred = pred.tolist()
        output["unfair_assignments"] = pred

        # Record Assignments
        output["partial_assignment"] = res["partial_assignment"]
        output["assignment"] = res["assignment"]

        # Clustering score after addition of fairness
        output["objective"] = res["objective"]

        # Clustering score after initial LP
        output["partial_objective"] = res["partial_objective"]

        output['prob_values'] = prob_vals

        # Record Lower Bound L
        output['Cluster_Size_Lower_Bound'] = L

        # Record probability vecs
        for k, v in prob_vecs.items():
            prob_vecs = v

        # Record Probability Vector
        # output['prob_vecs'] = prob_vecs.ravel().tolist()  # NOTE: TODO

        output["name"] = dataset
        output["clustering_method"] = clustering_method
        output["scaling"] = scaling
        output["delta"] = delta
        output["time"] = lp_time
        output["cluster_time"] = cluster_time

        # Record R_max
        output["R_max"] = R_max

        # Writes the data in `output` to a file in data_dir
        write_fairness_trial(output, data_dir)

        # Added because sometimes the LP for the next iteration solves so
        # fast that `write_fairness_trial` cannot write to disk
        time.sleep(1)

    return output
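
# Worked example of the bound computation in the delta loop above (illustrative
# numbers only, not taken from any dataset): with delta = 0.2 and a color whose
# representation in the dataset is 0.30,
#     a_val = 1 / (1 - 0.2) = 1.25  ->  alpha = 1.25 * 0.30 = 0.375
#     b_val = 1 - 0.2       = 0.80  ->  beta  = 0.80 * 0.30 = 0.24
# so the LP is asked to keep each cluster's share of that color between 0.24 and 0.375.
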
def fair_clustering(dataset, config_file, data_dir, num_clusters, deltas,
                    max_points, violating, violation):
    config = configparser.ConfigParser(converters={'list': read_list})
    config.read(config_file)

    # Read data in from a given csv_file found in config
    # df (pd.DataFrame) : holds the data
    df = read_data(config, dataset)

    # Subsample data if needed
    if max_points and len(df) > max_points:
        df = subsample_data(df, max_points)

    # Clean the data (bucketize text data)
    df, _, weight = clean_data(df, config, dataset)
    # print(weight)

    # variable_of_interest (list[str]) : variables that we would like to collect statistics for
    variable_of_interest = config[dataset].getlist("variable_of_interest")
    print("DATA READ")
    T0 = time.monotonic()

    # Assign each data point to a color, based on config file
    # attributes (dict[str -> defaultdict[int -> list[int]]]) : holds indices of points for each color class
    # color_flag (dict[str -> list[int]]) : holds map from point to color class it belongs to (reverse of `attributes`)
    attributes, color_flag = {}, {}
    for variable in variable_of_interest:
        colors = defaultdict(list)
        this_color_flag = [0] * len(df)

        condition_str = variable + "_conditions"
        bucket_conditions = config[dataset].getlist(condition_str)

        # For each row, if the row passes the bucket condition,
        # then the row is added to that color class
        for i, row in df.iterrows():
            for bucket_idx, bucket in enumerate(bucket_conditions):
                if eval(bucket)(row[variable]):
                    colors[bucket_idx].append(i)
                    this_color_flag[i] = bucket_idx

        attributes[variable] = colors
        color_flag[variable] = this_color_flag
    print("COLOR BUILT")

    # representation (dict[str -> dict[int -> float]]) : representation of each color compared to the whole dataset
    representation = {}
    for var, bucket_dict in attributes.items():
        # representation[var] = {k: (len(bucket_dict[k]) / len(df)) for k in bucket_dict.keys()}
        representation[var] = {
            k: (weighted_size_by_idx(weight, bucket_dict[k]) / weighted_size(weight))
            for k in bucket_dict.keys()
        }

    # Select only the desired columns
    selected_columns = config[dataset].getlist("columns")
    df = df[[col for col in selected_columns]]

    # Scale data if desired
    scaling = config["DEFAULT"].getboolean("scaling")
    if scaling:
        df = scale_data(df)

    # Cluster the data -- using the objective specified by clustering_method
    clustering_method = config["DEFAULT"]["clustering_method"]
    print("READY TO CLUSTER")

    if not violating:
        t1 = time.monotonic()
        initial_score, pred, cluster_centers = vanilla_clustering_weighted(df, weight, num_clusters,
                                                                           clustering_method)
        t2 = time.monotonic()
        cluster_time = t2 - t1
        print("Clustering time: {}".format(cluster_time))

        ### Calculate fairness statistics
        # fairness ( dict[str -> defaultdict[int -> defaultdict[int -> int]]] )
        # fairness : is used to hold how much of each color belongs to each cluster
        # fairness = {}

        # For each point in the dataset, assign it to the cluster and color it belongs to
        # for attr, colors in attributes.items():
        #     fairness[attr] = defaultdict(partial(defaultdict, int))
        #     for i, row in enumerate(df.iterrows()):
        #         cluster = pred[i]
        #         for color in colors:
        #             if i in colors[color]:
        #                 fairness[attr][cluster][color] += 1
        #                 continue
        # print("FAIRNESS BUILT")

        # sizes (list[int]) : sizes of clusters
        sizes = [0 for _ in range(num_clusters)]
        for i, p in enumerate(pred):
            sizes[p] += weight[i]

        # ratios (dict[str -> dict[int -> list[float]]]) : Ratios for colors in a cluster
        # ratios = {}
        # for attr, colors in attributes.items():
        #     attr_ratio = {}
        #     for cluster in range(num_clusters):
        #         attr_ratio[cluster] = [fairness[attr][cluster][color] / sizes[cluster]
        #                                for color in sorted(colors.keys())]
        #     ratios[attr] = attr_ratio
    else:
        # These added so that output format is consistent among violating and
        # non-violating trials
        cluster_time, initial_score = 0, 0
        fairness, ratios = {}, {}
        sizes, cluster_centers = [], []

    # dataset_ratio : Ratios for colors in the dataset
    # dataset_ratio = {}
    # for attr, color_dict in attributes.items():
    #     dataset_ratio[attr] = {int(color): len(points_in_color) / len(df)
    #                            for color, points_in_color in color_dict.items()}

    # fairness_vars (list[str]) : Variables to perform fairness balancing on
    fairness_vars = config[dataset].getlist("fairness_variable")

    for delta in deltas:
        # alpha_i = a_val * (representation of color i in dataset)
        # beta_i = b_val * (representation of color i in dataset)
        alpha, beta = {}, {}
        a_val, b_val = 1 / (1 - delta), 1 - delta
        for var, bucket_dict in attributes.items():
            alpha[var] = {k: a_val * representation[var][k] for k in bucket_dict.keys()}
            beta[var] = {k: b_val * representation[var][k] for k in bucket_dict.keys()}

        # Only include the entries for the variables we want to perform fairness on
        # (in `fairness_vars`). The others are kept for statistics.
        fp_color_flag, fp_alpha, fp_beta = (take_by_key(color_flag, fairness_vars),
                                            take_by_key(alpha, fairness_vars),
                                            take_by_key(beta, fairness_vars))

        # Solves partial assignment and then performs rounding to get integral assignment
        if not violating:
            t1 = time.monotonic()
            res = fair_partial_assignment(df, weight, cluster_centers, fp_alpha, fp_beta,
                                          fp_color_flag, clustering_method)
            t2 = time.monotonic()
            lp_time = t2 - t1
        else:
            t1 = time.monotonic()
            res = violating_lp_clustering(df, num_clusters, fp_alpha, fp_beta,
                                          fp_color_flag, clustering_method, violation)
            t2 = time.monotonic()
            lp_time = t2 - t1

            # Added so that output formatting is consistent among violating
            # and non-violating trials
            res["partial_objective"] = 0
            res["partial_assignment"] = []

        TOT_TIME = time.monotonic() - T0
        print(TOT_TIME)

        ### Output / Writing data to a file
        # output is a dictionary which will hold the data to be written to the
        # outfile as key-value pairs. Outfile will be written in JSON format.
        output = {}

        # num_clusters for re-running trial
        # output["num_clusters"] = num_clusters

        # Whether or not the LP found a solution
        output["success"] = res["success"]

        # Nonzero status -> error occurred
        output["status"] = res["status"]

        # output["dataset_distribution"] = dataset_ratio

        # Save alphas and betas from trials
        output["alpha"] = alpha
        output["beta"] = beta

        # Save original clustering score
        output["unfair_score"] = initial_score

        # Clustering score after addition of fairness
        output["fair_score"] = res["objective"]

        # Clustering score after initial LP
        output["partial_fair_score"] = res["partial_objective"]

        # Save size of each cluster
        output["sizes"] = sizes

        # output["attributes"] = attributes

        # Save the ratio of each color in its cluster
        # output["ratios"] = ratios

        # These included at end because their data is large
        # Save points, colors for re-running trial
        # Partial assignments -- list bc. ndarray not serializable
        output["centers"] = [list(center) for center in cluster_centers]
        # output["points"] = [list(point) for point in df.values]
        # output["assignment"] = res["assignment"]
        # output["partial_assignment"] = res["partial_assignment"]

        output["name"] = dataset
        output["clustering_method"] = clustering_method
        output["scaling"] = scaling
        output["delta"] = delta
        output["time"] = lp_time
        output["total_time"] = TOT_TIME
        output["cluster_time"] = cluster_time
        output["violating"] = violating
        output["violation"] = violation

        # Writes the data in `output` to a file in data_dir
        write_fairness_trial(output, data_dir, dataset)

        # Added because sometimes the LP for the next iteration solves so
        # fast that `write_fairness_trial` cannot write to disk
        time.sleep(1)
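
# Sketch only: `weighted_size_by_idx` and `weighted_size` come from this
# repository's utilities and are not defined here. The toy helper below shows
# the quantity the weighted `representation` computed above is assumed to be:
# the total weight of a color's points divided by the total weight of the data.
def _weighted_color_proportion(weight, color_indices):
    total = float(sum(weight))
    return sum(weight[i] for i in color_indices) / total
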
def fair_clustering_2_color(dataset, config_file, data_dir, num_clusters,
                            deltas, max_points, L=0, p_acc=1.0):
    # NOTE: this code works for 2 colors
    num_colors = 2
    config = configparser.ConfigParser(converters={'list': read_list})
    config.read(config_file)

    # Read data in from a given csv_file found in config
    df = read_data(config, dataset)

    # Subsample data if needed
    if max_points and len(df) > max_points:
        # NOTE: to exclude randomization effects, uncomment the fixed-row block
        # below and comment out the `df.head(max_points)` line
        # rows = [0, 1, 2, 3, 4, 5, 20, 21, 23, 50, 126, 134, 135]
        # df = df.iloc[rows, :]
        # df = df.reset_index()
        df = df.head(max_points)
        # below if you wish to shuffle
        # df = df.sample(frac=1, random_state=1).reset_index(drop=True)

    # Clean the data (bucketize text data)
    df, _ = clean_data(df, config, dataset)

    # variable_of_interest (list[str]) : variables that we would like to collect statistics for
    variable_of_interest = config[dataset].getlist("fairness_variable")

    # NOTE: this code only handles one color per vertex
    assert len(variable_of_interest) == 1

    # Assign each data point to a color, based on config file
    # attributes (dict[str -> defaultdict[int -> list[int]]]) : holds indices of points for each color class
    # color_flag (dict[str -> list[int]]) : holds map from point to color class it belongs to (reverse of `attributes`)
    attributes, color_flag, prob_vecs, prob_vals, prob_vals_thresh, prob_thresh = {}, {}, {}, {}, {}, {}
    for variable in variable_of_interest:
        colors = defaultdict(list)
        this_color_flag = [0] * len(df)

        condition_str = variable + "_conditions"
        bucket_conditions = config[dataset].getlist(condition_str)

        # For each row, if the row passes the bucket condition,
        # then the row is added to that color class
        for i, row in df.iterrows():
            for bucket_idx, bucket in enumerate(bucket_conditions):
                if eval(bucket)(row[variable]):
                    colors[bucket_idx].append(i)      # add the point to the list of its colors
                    this_color_flag[i] = bucket_idx   # record the color for this given point

        attributes[variable] = colors
        color_flag[variable] = this_color_flag

        # NOTE: generate probabilities according to the perturbation described in section 5.2
        prob_vals[variable] = [perturb_2_color(color, p_acc) for color in this_color_flag]

    # representation (dict[str -> dict[int -> float]]) : representation of each color compared to the whole dataset
    representation = {}
    for var in variable_of_interest:
        representation[var] = sum(prob_vals[var]) / len(df)

    (_, fair_vals), = representation.items()

    # drop unneeded columns
    selected_columns = config[dataset].getlist("columns")
    df = df[[col for col in selected_columns]]

    # Scale data if desired
    scaling = config["DEFAULT"].getboolean("scaling")
    if scaling:
        df = scale_data(df)

    # Cluster the data -- using the objective specified by clustering_method
    clustering_method = config["DEFAULT"]["clustering_method"]

    t1 = time.monotonic()
    # NOTE: initial_score is the value of the objective at the solution
    # NOTE: this is where the color-blind algorithm is run
    if type(num_clusters) is list:
        num_clusters = num_clusters[0]
    initial_score, pred, cluster_centers = vanilla_clustering(df, num_clusters, clustering_method)
    t2 = time.monotonic()
    cluster_time = t2 - t1
    print("Clustering time: {}".format(cluster_time))

    # sizes (list[int]) : sizes of clusters
    sizes = [0 for _ in range(num_clusters)]
    for p in pred:
        sizes[p] += 1

    # fairness_vars (list[str]) : Variables to perform fairness balancing on
    fairness_vars = config[dataset].getlist("fairness_variable")

    # NOTE: here is where you set the upper and lower bounds
    # NOTE: across all different values within the same attribute you have the same multipliers up and down
    for delta in deltas:
        alpha, beta = {}, {}
        a_val, b_val = 1 / (1 - delta), 1 - delta

        # NOTE: 2 color case
        for var, bucket_dict in attributes.items():
            alpha[var] = a_val * representation[var]
            beta[var] = b_val * representation[var]

        fp_color_flag = prob_vals
        fp_alpha = alpha
        fp_beta = beta

        # Solves partial assignment and then performs rounding to get integral assignment
        t1 = time.monotonic()
        res = fair_partial_assignment_2_color(df, cluster_centers, fp_alpha, fp_beta, fp_color_flag,
                                              clustering_method, num_colors, L)
        t2 = time.monotonic()
        lp_time = t2 - t1

        ### Output / Writing data to a file
        # output is a dictionary which will hold the data to be written to the
        # outfile as key-value pairs. Outfile will be written in JSON format.
        output = {}

        # num_clusters for re-running trial
        output["num_clusters"] = num_clusters

        # Whether or not the LP found a solution
        output["partial_success"] = res["partial_success"]

        # Nonzero status -> error occurred
        output["partial_status"] = res["partial_status"]

        # output["dataset_distribution"] = dataset_ratio

        # Save alphas and betas from trials
        output['prob_proportions'] = representation
        output["alpha"] = alpha
        output["beta"] = beta

        # Save size of each cluster
        output["sizes"] = sizes

        output["attributes"] = attributes

        # These included at end because their data is large
        # Save points, colors for re-running trial
        # Partial assignments -- list bc. ndarray not serializable
        output["centers"] = [list(center) for center in cluster_centers]
        output["points"] = [list(point) for point in df.values]

        # Save original clustering score
        output["unfair_score"] = initial_score

        # Original Color Blind Assignments
        if type(pred) is not list:
            pred = pred.tolist()
        output["unfair_assignments"] = pred

        # Record Assignments
        output["partial_assignment"] = res["partial_assignment"]
        output["assignment"] = res["assignment"]

        # Clustering score after addition of fairness
        output["objective"] = res["objective"]

        # Clustering score after initial LP
        output["partial_objective"] = res["partial_objective"]

        output['prob_values'] = prob_vals

        # Record Lower Bound L
        output['Cluster_Size_Lower_Bound'] = L

        # Record Classifier Accuracy
        output['p_acc'] = p_acc

        # Record probability vecs
        output["name"] = dataset
        output["clustering_method"] = clustering_method
        output["scaling"] = scaling
        output["delta"] = delta
        output["time"] = lp_time
        output["cluster_time"] = cluster_time

        # Writes the data in `output` to a file in data_dir
        write_fairness_trial(output, data_dir)

        # Added because sometimes the LP for the next iteration solves so
        # fast that `write_fairness_trial` cannot write to disk
        time.sleep(1)

    return output
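
# Sketch (not part of the original pipeline): given the color-blind assignment
# `pred` and the per-point color values built above (e.g. `prob_vals[variable]`),
# the helper below computes each cluster's average color value -- the quantity
# the [beta, alpha] bounds are meant to constrain in the fair assignment.
def _cluster_color_proportions(pred, color_values, num_clusters):
    totals = [0.0] * num_clusters
    counts = [0] * num_clusters
    for cluster, value in zip(pred, color_values):
        totals[cluster] += value
        counts[cluster] += 1
    return [t / c if c else 0.0 for t, c in zip(totals, counts)]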