Python pd_df_diff примеры использования

Язык программирования: Python

Пространство имен/Пакет: cleaningbenchmark.Utils.Utils

Метод/Функция: pd_df_diff

Примеров на hotexamples.com: 6

Python pd_df_diff - 6 примеров найдено. Это лучшие примеры Python кода для cleaningbenchmark.Utils.Utils.pd_df_diff, полученные из open source проектов. Вы можете ставить оценку каждому примеру, чтобы помочь нам улучшить качество примеров.

Пример #1

0

Показать файл

Файл: dataset_prep_utils.py Проект: wazzed/RVAE_MixedTypes

def _write_split(folder, df_clean, df_noised, cells_mtx, tuples_mtx,
                 column_names):
    """Write one dataset (clean + noised) and its error ground-truth as CSVs.

    folder := string prefix every file name is appended to (plain string
              concatenation, so it has to end with a path separator)

    df_clean, df_noised := dataframes handed to pd_df_diff and saved without
                           their index

    cells_mtx := per-cell error matrix, saved with `column_names` as header

    tuples_mtx := per-row error indicator, saved as column "rows_with_outlier"
    """
    # ground-truth of errors - dataframe value changes (first value returned
    # by pd_df_diff; the other two are not needed here)
    df_changes, _, _ = pd_df_diff(df_clean, df_noised)

    df_clean.to_csv(folder + "data_clean.csv", index=False)
    df_noised.to_csv(folder + "data_noised.csv", index=False)
    # the changes summary is the only file written together with its index
    df_changes.to_csv(folder + "changes_summary.csv")

    df_cells = pd.DataFrame(cells_mtx, columns=column_names)
    df_cells.to_csv(folder + "cells_changed_mtx.csv", index=False)

    df_tuples = pd.DataFrame(tuples_mtx, columns=["rows_with_outlier"])
    df_tuples.to_csv(folder + "tuples_changed_mtx.csv", index=False)


def save_datasets(path_saving, df_data, train_idxs, validation_idxs, test_idxs,
                  noised_data_df, cells_changed, tuples_changed):
    """Save the train / validation / test splits and the full dataset.

    For each split the rows selected (positionally) by the given indexes are
    taken from both the clean and the noised dataframe, re-indexed from zero
    and written to `path_saving + "/<split>/"` together with the matching
    slices of `cells_changed` / `tuples_changed` and the original indexes.
    The full dataset is written to `path_saving + "/full/"` as it is, without
    an `original_idxs.csv` file.

    The target folders are not created here; they are expected to exist.
    """
    splits = (
        ("/train/", train_idxs),
        ("/validation/", validation_idxs),
        ("/test/", test_idxs),
    )

    for subfolder, idxs in splits:
        out_dir = path_saving + subfolder

        # clean / dirty rows of this split, re-indexed from zero
        clean_part = df_data.iloc[idxs, :].reset_index(drop=True)
        dirty_part = noised_data_df.iloc[idxs, :].reset_index(drop=True)

        # error matrix (i.e. the matrix completion matrix) and rows with errors
        cells_part = cells_changed[idxs, :]
        tuples_part = tuples_changed[idxs]

        _write_split(out_dir, clean_part, dirty_part, cells_part, tuples_part,
                     df_data.columns)

        # positions of the split rows inside the full dataset
        df_idxs = pd.DataFrame(idxs, columns=["original_idxs"])
        df_idxs.to_csv(out_dir + "original_idxs.csv", index=False)

    ## full dataset (no slicing, no re-indexing)
    _write_split(path_saving + "/full/", df_data, noised_data_df,
                 cells_changed, tuples_changed, df_data.columns)

Пример #2

0

Показать файл

Файл: dataset_prep_utils.py Проект: wazzed/RVAE_MixedTypes

def dirtify_and_save_image_data(array_data,
                                run_stats,
                                path_to_folder,
                                shape_dim=3,
                                image_dim_in=(28, 28)):
    """
    Corrupt an image dataset with pixel noise and save clean / noised splits.

    array_data := must be numpy array with image data (examples, x_dim, y_dim);
                  when shape_dim == 2 it is first reshaped to that layout
                  using image_dim_in

    run_stats := definitions to dirtify dataset; read here for "type"
                 ("SaltnPepper" or "AdditiveGaussian"), "conv_to_int",
                 "p_img", "p_pixel", "min_val", "max_val" and the
                 parameters specific to the chosen noise model

    path_to_folder := folder where to save datasets

    shape_dim := pass 2 when array_data has one flattened axis per image

    image_dim_in := (x_dim, y_dim) used to un-flatten the images when
                    shape_dim == 2; only indexed, never modified, hence an
                    immutable tuple as default

    Returns None. An unknown noise "type" prints a message and returns
    early, after create_data_folders has already been called.
    """

    ## create folders
    path_saving = create_data_folders(run_stats, path_to_folder)

    if shape_dim == 2:
        # lat_shape_img = int(np.sqrt(array_data.shape[1]))
        array_data = array_data.reshape(-1, image_dim_in[0], image_dim_in[1])

    # helper definitions
    num_points = array_data.shape[0]
    x_dim = array_data.shape[1]
    y_dim = array_data.shape[2]
    img_num_pixels = x_dim * y_dim
    # one dataframe column per pixel of the flattened image
    col_names = ['pixel_{}'.format(n) for n in range(img_num_pixels)]

    if run_stats['conv_to_int']:
        array_data = array_data.astype(int)

    ## dirtify dataset

    # define numerical noise model
    if run_stats["type"] == "SaltnPepper":

        # standard Salt and Pepper Noise
        noise_mdl = ImageSaltnPepper(
            array_data.shape,
            probability=run_stats["p_img"],
            one_cell_flag=run_stats["one_cell_per_row"],
            min_val=run_stats["min_val"],
            max_val=run_stats["max_val"],
            p_min=run_stats['p_min'],
            p_pixel=run_stats['p_pixel'],
            conv_to_int=run_stats['conv_to_int'])

    elif run_stats["type"] == "AdditiveGaussian":

        noise_mdl = ImageAdditiveGaussianNoise(array_data.shape,
                                               probability=run_stats["p_img"],
                                               one_cell_flag=False,
                                               min_val=run_stats["min_val"],
                                               max_val=run_stats["max_val"],
                                               mu=run_stats["mu"],
                                               sigma=run_stats["sigma"],
                                               scale=np.array(
                                                   [run_stats["scale"]]),
                                               p_pixel=run_stats['p_pixel'])

    else:
        print("Numerical noise model does not exist!!")
        return

    # apply noise model to data (second returned value is not used)
    noised_data, _ = noise_mdl.apply(array_data)

    # reshape image dataset to traditional format (examples, features) dims
    array_data_collapsed = array_data.reshape((num_points, -1))
    noised_data_collapsed = noised_data.reshape((num_points, -1))

    # save array data to DataFrames (both share the same index)
    df_data = pd.DataFrame(array_data_collapsed, columns=col_names)
    df_noised_data = pd.DataFrame(noised_data_collapsed,
                                  columns=col_names,
                                  index=df_data.index)

    # get ground-truth
    df_changes, cells_changed, tuples_changed = pd_df_diff(
        df_data, df_noised_data)

    cells_changed = cells_changed.astype(int)
    tuples_changed = tuples_changed.astype(int)

    ## get dataset splits (Train; Valid; Test) and entire dataset
    train_idxs, validation_idxs, test_idxs = create_data_splits(
        run_stats, df_data)

    ## create dataset splits and save to folders NOTE: this needs to be changed for images?
    save_datasets(path_saving, df_data, train_idxs, validation_idxs, test_idxs,
                  df_noised_data, cells_changed, tuples_changed)

    ## save num_cols and cat_cols names for future reading by models
    # integer-converted pixels are reported as categorical columns,
    # otherwise as numerical ones
    if run_stats['conv_to_int']:
        num_cols_names = []
        cat_cols_names = df_data.columns.tolist()
    else:
        num_cols_names = df_data.columns.tolist()
        cat_cols_names = []

    cols_info = {
        "cat_cols_names": cat_cols_names,
        "num_cols_names": num_cols_names,
        "dataset_type": "image"
    }

    # NOTE(review): file names are appended without a separator, so
    # path_saving presumably ends with one -- confirm in create_data_folders
    with open(path_saving + 'cols_info.json', 'w') as outfile:
        json.dump(cols_info, outfile, indent=4, sort_keys=True)

    with open(path_saving + 'noising_info.json', 'w') as outfile:
        json.dump(run_stats, outfile, indent=4, sort_keys=True)

Пример #3

0

Показать файл

Файл: dataset_prep_utils.py Проект: wazzed/RVAE_MixedTypes

def dirtify_and_save_mixed_simple(df_data, run_stats, path_to_folder):
    """ Simple mixed-type (categorical + numerical) dirtifying:

        one noise model is shared by all categorical features;
        one noise model is shared by all continuous features

        df_data := pandas dataframe; its categorical columns are converted
                   to 'category' dtype in place

        run_stats := definitions to dirtify dataset ("cat_cols_names",
                     "num_cols_names", "cat_model", "num_model", "p_row",
                     "p_cell", "one_cell_per_row")

        path_to_folder := folder where to save datasets

        Returns None; an unknown numerical model type prints a message and
        returns before any noise is applied.
    """

    ### create folders
    path_saving = create_data_folders(run_stats, path_to_folder)

    ### dirtify dataset

    cat_cols = run_stats["cat_cols_names"]

    # categorical features must become strings before they can be turned
    # into the categorical dtype
    df_data[cat_cols] = df_data[cat_cols].apply(
        lambda column: column.astype(str))
    df_data[cat_cols] = df_data[cat_cols].apply(
        lambda column: column.astype('category'))

    num_cols = run_stats["num_cols_names"]

    # position in the full dataframe -> position inside the cat / num helpers
    idx_map_cat = {
        df_data.columns.get_loc(name): pos
        for pos, name in enumerate(cat_cols)
    }
    idx_map_num = {
        df_data.columns.get_loc(name): pos
        for pos, name in enumerate(num_cols)
    }

    ## categorical columns definition
    dict_categories_per_feat = {
        name: df_data[name].cat.categories.tolist()  # no filtering of '?'
        for name in cat_cols
    }
    categories_per_feat_list = [
        dict_categories_per_feat[name] for name in cat_cols
    ]

    cats_probs_list = []
    if run_stats["cat_model"]["use_cat_probs"]:
        # empirical frequency of every category, in the categories' order
        for name in cat_cols:
            counts = df_data[name].value_counts()[
                dict_categories_per_feat[name]]
            cats_probs_list.append(
                np.array(counts.values / float(counts.sum())))

    # boolean array defining which features are categories (following the same order as the dataframe)
    dtype_names = df_data.dtypes.apply(lambda dtype: str(dtype))
    cat_array_bool = (dtype_names == 'category').values

    # define categorical noise model
    cat_mdl = CategoricalNoiseModel(
        df_data.shape,
        categories_per_feat_list,
        cats_probs_list=cats_probs_list,
        typo_prob=run_stats["cat_model"]["typo_prob"],
        alpha_prob=run_stats["cat_model"]["alpha_prob"])

    ## numerical column definition
    numeric_part = df_data[num_cols]
    means_df = numeric_part.mean().values  # means (not used further below)
    stds_df = df_data[num_cols].std().values  # standard deviations

    # define numerical noise model
    num_cfg = run_stats["num_model"]
    num_type = num_cfg["type"]
    num_shape = (df_data.shape[0], len(num_cols))

    if num_type == "Gaussian":
        # usual context outliers: 0.5*sigma
        num_mdl = GaussianNoiseModel(num_shape,
                                     mu=num_cfg["mu"],
                                     sigma=num_cfg["sigma"],
                                     scale=stds_df,
                                     int_cast=num_cfg["int_cast_array"])
    elif num_type == "Laplace":
        num_mdl = LaplaceNoiseModel(num_shape,
                                    mu=num_cfg["mu"],
                                    b=num_cfg["b"],
                                    scale=stds_df,
                                    int_cast=num_cfg["int_cast_array"])
    elif num_type == "Zipf":
        # usual value for context outliers: z = 3 or 4; big marginal outliers z = 1.3
        num_mdl = ZipfNoiseModel(num_shape,
                                 z=num_cfg["z"],
                                 scale=stds_df,
                                 int_cast=num_cfg["int_cast_array"],
                                 active_neg=num_cfg["active_neg"])
    elif num_type == "LogNormal":
        num_mdl = LogNormalNoiseModel(num_shape,
                                      mu=num_cfg["mu"],
                                      sigma=num_cfg["sigma"],
                                      scale=stds_df,
                                      int_cast=num_cfg["int_cast_array"])
    elif num_type == "Mixture2Gaussians":
        num_mdl = Mixture2GaussiansNoiseModel(
            num_shape,
            pc_1=num_cfg["pc_1"],
            mu_1=num_cfg["mu_1"],
            sigma_1=num_cfg["sigma_1"],
            mu_2=num_cfg["mu_2"],
            sigma_2=num_cfg["sigma_2"],
            scale=stds_df,
            int_cast=num_cfg["int_cast_array"])
    else:
        print("Numerical noise model does not exist!!")
        return

    # define mixed data noise model (single cat model, single num model)
    mix_mdl = MixedNoiseModel(df_data.shape,
                              cat_array_bool,
                              idx_map_cat,
                              idx_map_num, [cat_mdl], [num_mdl],
                              probability=run_stats["p_row"],
                              p_row=run_stats["p_cell"],
                              one_cell_flag=run_stats["one_cell_per_row"])

    ## apply noise model to data
    noised_data, _ = mix_mdl.apply(df_data)
    noised_data_df = pd.DataFrame(noised_data,
                                  columns=df_data.columns,
                                  index=df_data.index)
    # restore the categorical dtype on the noised categorical columns
    noised_data_df[cat_cols] = noised_data_df[cat_cols].apply(
        lambda column: column.astype('category'))

    ## get ground-truth
    df_changes, cells_changed, tuples_changed = pd_df_diff(
        df_data, noised_data_df)

    cells_changed = cells_changed.astype(int)
    tuples_changed = tuples_changed.astype(int)

    ## get dataset splits (Train; Valid; Test) and entire dataset
    train_idxs, validation_idxs, test_idxs = create_data_splits(
        run_stats, df_data)

    ## create dataset splits and save to folders
    save_datasets(path_saving, df_data, train_idxs, validation_idxs, test_idxs,
                  noised_data_df, cells_changed, tuples_changed)

    ## save num_cols and cat_cols names for future reading by models
    cols_info = {
        "cat_cols_names": run_stats["cat_cols_names"],
        "num_cols_names": run_stats["num_cols_names"],
        "dataset_type": "mixed"
    }

    with open(path_saving + 'cols_info.json', 'w') as outfile:
        json.dump(cols_info, outfile, indent=4, sort_keys=True)

    ## save run_stats information about noising
    # work on a deep copy so the caller's run_stats keeps its array, and
    # turn that array into a list to make it JSON serializable
    run_stats_json = deepcopy(run_stats)
    json_num_cfg = run_stats_json["num_model"]
    json_num_cfg["int_cast_array"] = json_num_cfg["int_cast_array"].tolist()

    with open(path_saving + 'noising_info.json', 'w') as outfile:
        json.dump(run_stats_json, outfile, indent=4, sort_keys=True)

Пример #4

0

Показать файл

Файл: dataset_prep_utils.py Проект: wazzed/RVAE_MixedTypes

def dirtify_and_save_real(df_data, run_stats, path_to_folder):
    """
    Corrupt a real-valued dataset with numerical noise and save the splits.

    df_data := must be pandas dataframe with real (continuous) dtype columns

    run_stats := definitions to dirtify dataset; a flat dict read for "type"
                 ("Gaussian", "Laplace", "Zipf", "LogNormal" or
                 "Mixture2Gaussians"), "p_row", "p_cell",
                 "one_cell_per_row", "int_cast_array" and the parameters of
                 the chosen noise model

    path_to_folder := folder where to save datasets

    Returns None. An unknown "type" prints a message and returns early,
    after create_data_folders has already been called.
    """

    ## create folders
    path_saving = create_data_folders(run_stats, path_to_folder)

    ## dirtify dataset

    # numerical column definition
    stds_df = df_data.std().values  # standard deviations of numerical features

    noise_type = run_stats["type"]
    # every column of this dataset is numerical
    data_shape = (df_data.shape[0], len(df_data.columns))

    # define numerical noise model
    if noise_type == "Gaussian":
        # usual context outliers: 0.5*sigma
        num_mdl = GaussianNoiseModel(
            data_shape,
            probability=run_stats["p_row"],
            one_cell_flag=run_stats["one_cell_per_row"],
            mu=run_stats["mu"],
            sigma=run_stats["sigma"],
            scale=stds_df,
            int_cast=run_stats["int_cast_array"],
            p_cell=run_stats["p_cell"])

    elif noise_type == "Laplace":

        num_mdl = LaplaceNoiseModel(
            data_shape,
            probability=run_stats["p_row"],
            one_cell_flag=run_stats["one_cell_per_row"],
            mu=run_stats["mu"],
            b=run_stats["b"],
            scale=stds_df,
            int_cast=run_stats["int_cast_array"],
            p_cell=run_stats["p_cell"])

    elif noise_type == "Zipf":
        # usual value for context outliers: z = 3 or 4; big marginal outliers z = 1.3
        num_mdl = ZipfNoiseModel(
            data_shape,
            probability=run_stats["p_row"],
            one_cell_flag=run_stats["one_cell_per_row"],
            z=run_stats["z"],
            scale=stds_df,
            int_cast=run_stats["int_cast_array"],
            active_neg=run_stats["active_neg"],  # active_neg=True/False
            p_cell=run_stats["p_cell"])

    elif noise_type == "LogNormal":
        # These two branches used to read run_stats["num_model"][...] and
        # run_stats["num_cols_names"] (the layout of the mixed-type config),
        # which does not match the flat config used everywhere else here.
        # NOTE(review): probability / one_cell_flag / p_cell are passed for
        # consistency with the branches above -- confirm the constructor
        # accepts them.
        num_mdl = LogNormalNoiseModel(
            data_shape,
            probability=run_stats["p_row"],
            one_cell_flag=run_stats["one_cell_per_row"],
            mu=run_stats["mu"],
            sigma=run_stats["sigma"],
            scale=stds_df,
            int_cast=run_stats["int_cast_array"],
            p_cell=run_stats["p_cell"])

    elif noise_type == "Mixture2Gaussians":
        # NOTE(review): same assumption about the shared keyword arguments
        # as for the LogNormal branch.
        num_mdl = Mixture2GaussiansNoiseModel(
            data_shape,
            probability=run_stats["p_row"],
            one_cell_flag=run_stats["one_cell_per_row"],
            pc_1=run_stats["pc_1"],
            mu_1=run_stats["mu_1"],
            sigma_1=run_stats["sigma_1"],
            mu_2=run_stats["mu_2"],
            sigma_2=run_stats["sigma_2"],
            scale=stds_df,
            int_cast=run_stats["int_cast_array"],
            p_cell=run_stats["p_cell"])

    else:
        print("Numerical noise model does not exist!!")
        return

    # apply noise model to data (second returned value is not used)
    noised_data, _ = num_mdl.apply(df_data)
    noised_data_df = pd.DataFrame(noised_data,
                                  columns=df_data.columns,
                                  index=df_data.index)

    # get ground-truth
    df_changes, cells_changed, tuples_changed = pd_df_diff(
        df_data, noised_data_df)

    cells_changed = cells_changed.astype(int)
    tuples_changed = tuples_changed.astype(int)

    ## get dataset splits (Train; Valid; Test) and entire dataset
    train_idxs, validation_idxs, test_idxs = create_data_splits(
        run_stats, df_data)

    ## create dataset splits and save to folders
    save_datasets(path_saving, df_data, train_idxs, validation_idxs, test_idxs,
                  noised_data_df, cells_changed, tuples_changed)

    ## save num_cols and cat_cols names for future reading by models
    cols_info = {
        "cat_cols_names": [],
        "num_cols_names": df_data.columns.tolist(),
        "dataset_type": "real"
    }

    with open(path_saving + 'cols_info.json', 'w') as outfile:
        json.dump(cols_info, outfile, indent=4, sort_keys=True)

    ## save run_stats information about noising
    # json cannot serialize numpy arrays: when int_cast_array is one, dump a
    # shallow copy holding a plain list so the caller's dict stays untouched
    run_stats_json = dict(run_stats)
    int_cast = run_stats_json.get("int_cast_array")
    if hasattr(int_cast, "tolist"):
        run_stats_json["int_cast_array"] = int_cast.tolist()

    with open(path_saving + 'noising_info.json', 'w') as outfile:
        json.dump(run_stats_json, outfile, indent=4, sort_keys=True)

Пример #5

0

Показать файл

Файл: dataset_prep_utils.py Проект: wazzed/RVAE_MixedTypes

def dirtify_and_save_categorical(df_data, run_stats, path_to_folder):
    """
    Corrupt a categorical dataset with categorical noise and save the splits.

    df_data := pandas dataframe; every column is converted to string and then
               to category dtype on a new dataframe (the argument itself is
               only rebound locally)

    run_stats := definitions to dirtify dataset ("use_cat_probs", "p_row",
                 "p_cell", "typo_prob", "alpha_prob", "one_cell_per_row")

    path_to_folder := folder where to save datasets
    """

    ## make sure data is categorical (convert first to string)
    df_data = df_data.apply(lambda column: column.astype(str))
    df_data = df_data.apply(lambda column: column.astype('category'))

    ## create folders
    path_saving = create_data_folders(run_stats, path_to_folder)

    ## dirtify dataset

    feature_names = df_data.columns.tolist()

    # helper structures ; NOTE: leave or remove missing sign?
    dict_categories_per_feat = {
        name: df_data[name].cat.categories.tolist()  # no filtering of '?'
        for name in feature_names
    }
    categories_per_feat_list = [
        dict_categories_per_feat[name] for name in feature_names
    ]

    cats_probs_list = []
    if run_stats["use_cat_probs"]:
        # empirical frequency of every category, in the categories' order
        for name in feature_names:
            counts = df_data[name].value_counts()[
                dict_categories_per_feat[name]]
            cats_probs_list.append(
                np.array(counts.values / float(counts.sum())))

    # define categorical noise model
    cat_mdl = CategoricalNoiseModel(
        df_data.shape,
        categories_per_feat_list,
        probability=run_stats["p_row"],
        cats_probs_list=cats_probs_list,
        typo_prob=run_stats["typo_prob"],
        p_cell=run_stats["p_cell"],
        alpha_prob=run_stats["alpha_prob"],
        one_cell_flag=run_stats["one_cell_per_row"])

    # apply noise model to data (second returned value is not used)
    noised_values, _ = cat_mdl.apply(df_data)
    noised_data_df = pd.DataFrame(noised_values,
                                  columns=df_data.columns,
                                  index=df_data.index)

    # get ground-truth
    df_changes, cells_changed, tuples_changed = pd_df_diff(
        df_data, noised_data_df)
    cells_changed = cells_changed.astype(int)
    tuples_changed = tuples_changed.astype(int)

    ## get dataset splits (Train; Valid; Test) and entire dataset
    train_idxs, validation_idxs, test_idxs = create_data_splits(
        run_stats, df_data)

    ## create dataset splits and save to folders
    save_datasets(path_saving, df_data, train_idxs, validation_idxs, test_idxs,
                  noised_data_df, cells_changed, tuples_changed)

    ## save num_cols and cat_cols names for future reading by models
    cols_info = {
        "cat_cols_names": df_data.columns.tolist(),
        "num_cols_names": [],
        "dataset_type": "categorical"
    }

    cols_info_path = path_saving + 'cols_info.json'
    with open(cols_info_path, 'w') as outfile:
        json.dump(cols_info, outfile, indent=4, sort_keys=True)

    noising_info_path = path_saving + 'noising_info.json'
    with open(noising_info_path, 'w') as outfile:
        json.dump(run_stats, outfile, indent=4, sort_keys=True)

Пример #6

0

Показать файл

# Script fragment: noise the dataset with "overt" outliers, print the changes
# and save the noised data plus the ground-truth of the errors.
# It relies on names defined earlier in the script (df_data, num_feat_names,
# stds_df, cat_array_bool, idx_map_cat, idx_map_num, cat_mdl_overt,
# DATADIR_PATH, name_file).

# Define Numerical Noise Model (every numerical feature is cast to integer)
num_mdl_overt = ZipfNoiseModel((df_data.shape[0], len(num_feat_names)), z=1.8, scale=stds_df,
                               int_cast=np.ones(len(num_feat_names), dtype=bool), active_neg=True)

# Define Mixed Model (0.10*0.10 = 0.01 of cells noised, and 0.10 of tuples)
mix_mdl_overt = MixedNoiseTupleWiseModel(df_data.shape, cat_array_bool,
                                         idx_map_cat, idx_map_num, cat_mdl_overt,
                                         num_mdl_overt, probability=0.10, p_row=0.10)

# Apply Noising of Data (the model receives the raw values, so the result is
# wrapped back into a dataframe with the original columns and index)
noised_overt_outliers, _ = mix_mdl_overt.apply(df_data.values)
noised_overt_outliers_df = pd.DataFrame(noised_overt_outliers, columns=df_data.columns, index=df_data.index)

# Get Ground-Truth
df_changes_overt, cell_changed_overt, tuples_changed_overt = pd_df_diff(df_data, noised_overt_outliers_df)

# single-argument print(...) produces the same output on Python 2 and 3,
# whereas the print statement is a syntax error on Python 3
print("Changes to Adult Dataset: Overt Outliers")
print(df_changes_overt)
print("\n\n\n\n")

# Save Data
folder_path = DATADIR_PATH + "/overt_noise/"
if not os.path.exists(folder_path):
    os.makedirs(folder_path)

noised_overt_outliers_df.to_csv(folder_path + name_file + "_noised_overt.csv")

df_changes_overt.to_csv(folder_path + name_file + "_df_changes_noise_overt_gt_errors.csv")