def main(argv): """Main entrypoint for the anonymization tool""" # Default parameters configuration_file = '' input_file = '' output_file = '' use_cache = False # Read and set tool parameters try: opts, _ = getopt.getopt( argv, "c:i:o:vs", ["config=", "input=", "output=", "verbose", "use_chached_docs"]) except getopt.GetoptError: logger.error( 'main.py -c <config_file> -i <input_file> -o <output_file>') sys.exit(2) for opt, arg in opts: if opt in ("-c", "--config"): configuration_file = arg if opt in ("-i", "--input"): input_file = arg if opt in ("-o", "--output"): output_file = arg if opt in ("-s", "--use_chached_docs"): use_cache = True if opt in ("-v", "--verbose"): logging.getLogger().setLevel(logging.DEBUG) # Let's get started logger.info("Anonymizing input file %s", input_file) # Initialize and read configuration configuration_reader = ConfigurationReader() config = configuration_reader.read(configuration_file) # Read data using data types defined in the configuration data_reader = DataReader(config) df = data_reader.read(input_file) # Initialize the sensitive terms recognizer sensitive_terms_recognizer = SensitiveTermsRecognizer(config, use_cache) # Initialize the preprocessor (preprocessor is stateful, so pass df at the beginning) pp = Preprocessor(sensitive_terms_recognizer, config, df) # Run through preprocessing of dataframe: Data cleansing, analysis of textual attributes, resolving of redundant information, and compression pp.clean_textual_attributes() pp.analyze_textual_attributes() pp.find_redundant_information() pp.compress() # Get sensitive terms dictionary and preprocessed dataframe terms = pp.get_sensitive_terms() df = pp.get_df() # Initialize the anonymization kernel by providing the sensitive terms dictionary, the configuration, the sensitive terms recognizer, and the preprocessor kernel = AnonymizationKernel(terms, config, sensitive_terms_recognizer, pp) # Save the unanonymized dataframe for later unanonymized_df = df.copy() # Parameters for anonymization k = config.parameters["k"] strategy = config.parameters["strategy"] biases = config.get_biases() relational_weight = config.get_relational_weight() # Anonymize quasi identifier (applying k-anonymity) and recode textual attributes anonymized_df, partitions, partition_split_statistics = kernel.anonymize_quasi_identifiers( df, k, strategy, biases, relational_weight) anonymized_df = kernel.recode_textual_attributes(anonymized_df) # Parameters for calculating metrics quasi_identifiers = config.get_quasi_identifiers() textual_attribute_mapping = pp.get_textual_attribute_mapping() # Calculating the total, relational, and textual information loss based on the original and anonymized data frame total_information_loss, relational_information_loss, textual_information_loss = calculate_normalized_certainty_penalty( unanonymized_df, anonymized_df, quasi_identifiers, textual_attribute_mapping) # Calculating the mean and std for partition size as well as split statistics mean_partition_size = calculate_mean_partition_size(partitions) std_partition_size = calculate_std_partition_size(partitions) if partition_split_statistics: number_of_relational_splits, number_of_textual_splits = get_partition_split_share( partition_split_statistics, textual_attribute_mapping) # Notify about the results logger.info("Information loss for relational attributes is %4.4f", relational_information_loss) if textual_information_loss: logger.info("Information loss for textual attribute is %4.4f", textual_information_loss["total"]) logger.info("Total 
information loss is %4.4f", total_information_loss) logger.info( "Ended up with %d partitions with a mean size of %.2f and a std of %.2f", len(partitions), mean_partition_size, std_partition_size) if partition_split_statistics: logger.info("Split %d times on a relational attribute", number_of_relational_splits) logger.info("Split %d times on a textual attribute", number_of_textual_splits) # Initialize the postprocessor with the config and the preprocessor post_processor = PostProcessor(config, pp) # Perform post processing actions on the anonymized data frame anonymized_df = post_processor.clean(anonymized_df) anonymized_df = post_processor.uncompress(anonymized_df) anonymized_df = post_processor.pretty(anonymized_df) # Don't forget to drop the direct identifiers since they are now not needed anymore anonymized_df = kernel.remove_direct_identifier(anonymized_df) # Notify and save logger.info("Saving anonymized file to %s", output_file) anonymized_df.to_csv(output_file, index=False)
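# For reference, a minimal sketch of the normalized certainty penalty (NCP) that
# calculate_normalized_certainty_penalty is named after (Xu et al., "Utility-Based
# Anonymization Using Local Recoding"). This is NOT the project's implementation:
# it covers only numeric quasi-identifiers generalized to [lo, hi] intervals, and
# all names below are hypothetical, for illustration only.
def ncp_numeric_sketch(original_df, anonymized_intervals, quasi_identifiers):
    """Average NCP over all generalized values: the width of each value's interval
    divided by the width of the attribute's domain, averaged over the
    quasi-identifier columns and records (0 = no loss, 1 = fully generalized)."""
    penalties = []
    for attribute in quasi_identifiers:
        domain_width = original_df[attribute].max() - original_df[attribute].min()
        if domain_width == 0:
            continue  # a constant column cannot lose information
        for lo, hi in anonymized_intervals[attribute]:
            penalties.append((hi - lo) / domain_width)
    return sum(penalties) / len(penalties) if penalties else 0.0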
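# A conventional entry point guard; a sketch assuming this function lives in main.py
# and is invoked as, e.g. (file names are placeholders):
#   python main.py -c config.yaml -i input.csv -o output.csv -v
if __name__ == "__main__":
    main(sys.argv[1:])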
def main(argv): """Main entrypoint for the anonymization tool""" # Default parameters configuration_file = '' input_file = '' use_cache = True weight = 0.5 strategy = "gdf" result_dir = None # Read and set tool parameters try: opts, _ = getopt.getopt(argv, "c:i:r:w:v", ["config=", "input=", "weight=", "result_dir=", "verbose"]) except getopt.GetoptError: logger.error('experiment_runner.py -c <config_file> -i <input_file> -w <relational_weight>') sys.exit(2) for opt, arg in opts: if opt in ("-c", "--config"): configuration_file = arg if opt in ("-i", "--input"): input_file = arg base = os.path.basename(input_file) if not result_dir: result_dir = os.path.splitext(base)[0] if opt in ("-w", "--weight"): weight = float(arg) strategy = "mondrian" if opt in ("-r", "--result_dir"): result_dir = arg if opt in ("-v", "--verbose"): logging.getLogger().setLevel(logging.DEBUG) result_path = Path("experiment_results") / result_dir result_path.mkdir(parents=True, exist_ok=True) # Let's get started logger.info("Anonymizing input file %s", input_file) # Initialize and read configuration configuration_reader = ConfigurationReader() config = configuration_reader.read(configuration_file) # Read data using data types defined in the configuration data_reader = DataReader(config) df = data_reader.read(input_file) # Initialize the sensitive terms recognizer sensitive_terms_recognizer = SensitiveTermsRecognizer(config, use_cache) # Initialize the preprocessor (preprocessor is stateful, so pass df at the beginning) pp = Preprocessor(sensitive_terms_recognizer, config, df) # Run through preprocessing of dataframe: Data cleansing, analysis of textual attributes, resolving of redundant information, and compression pp.clean_textual_attributes() pp.analyze_textual_attributes() pp.find_redundant_information() pp.compress() # Get sensitive terms dictionary and preprocessed dataframe terms = pp.get_sensitive_terms() df = pp.get_df() # Initialize the anonymization kernel by providing the sensitive terms dictionary, the configuration, the sensitive terms recognizer, and the preprocessor kernel = AnonymizationKernel(terms, config, sensitive_terms_recognizer, pp) unanonymized = df # Determine k values for experiment k_values = [2, 3, 4, 5, 10, 20, 50] biases = config.get_biases() # Set strategy names if strategy == "mondrian": strategy_name = "mondrian-{}".format(weight) elif strategy == "gdf": strategy_name = strategy # Parameters for calculating metrics quasi_identifiers = config.get_quasi_identifiers() textual_attribute_mapping = pp.get_textual_attribute_mapping() # Prepare dataframes and json to store experiment results total_information_loss = pd.DataFrame(index=k_values, columns=[strategy_name]) total_information_loss.index.name = 'k' relational_information_loss = pd.DataFrame(index=k_values, columns=[strategy_name]) relational_information_loss.index.name = 'k' textual_information_loss = pd.DataFrame(index=k_values, columns=[strategy_name]) textual_information_loss.index.name = 'k' detailed_loss_level_0 = [k for k in textual_attribute_mapping] detailed_loss_level_1 = set() for k in textual_attribute_mapping: for e in textual_attribute_mapping[k]: detailed_loss_level_1.add(e.replace("{}_".format(k), '')) detailed_loss_level_1 = ["total"] + list(detailed_loss_level_1) detailed_textual_information_loss = pd.DataFrame(index=k_values, columns=pd.MultiIndex.from_product([detailed_loss_level_0, detailed_loss_level_1])) detailed_textual_information_loss.index.name = 'k' partition_sizes = {} partition_sizes[strategy_name] = {} 
partition_splits = {} partition_splits[strategy_name] = {} # Let's start the experiments for k in k_values: logger.info("-------------------------------------------------------------------------------") logger.info("Anonymizing dataset with k=%d and strategy %s", k, strategy_name) # Anonymize dataset for a specific k anonymized_df, partitions, partition_split_statistics = kernel.anonymize_quasi_identifiers(df, k, strategy, biases, weight) # Calculating the total, relational, and textual information loss based on the original and anonymized data frame total_il, relational_il, textual_il = calculate_normalized_certainty_penalty(unanonymized, anonymized_df, quasi_identifiers, textual_attribute_mapping) # Calculating the mean and std for partition size as well as split statistics mean_partition_size = calculate_mean_partition_size(partitions) std_partition_size = calculate_std_partition_size(partitions) if partition_split_statistics: number_of_relational_splits, number_of_textual_splits = get_partition_split_share(partition_split_statistics, textual_attribute_mapping) # Notify about the results logger.info("Information loss for relational attributes is %4.4f", relational_il) if textual_il: logger.info("Information loss for textual attribute is %4.4f", textual_il["total"]) logger.info("Total information loss is %4.4f", total_il) logger.info("Ended up with %d partitions with a mean size of %.2f and a std of %.2f", len(partitions), mean_partition_size, std_partition_size) if partition_split_statistics: logger.info("Split %d times on a relational attribute", number_of_relational_splits) logger.info("Split %d times on a textual attribute", number_of_textual_splits) # Store experiment results total_information_loss.at[k, strategy_name] = total_il relational_information_loss.at[k, strategy_name] = relational_il if textual_il: textual_information_loss.at[k, strategy_name] = textual_il["total"] for key in textual_il: if isinstance(textual_il[key], dict): for subkey in textual_il[key]: if subkey == "total": detailed_textual_information_loss.at[k, (key, "total")] = textual_il[key]["total"] else: entity_type = subkey.replace("{}_".format(key), '') detailed_textual_information_loss.at[k, (key, entity_type)] = textual_il[key][subkey] partition_sizes[strategy_name][k] = get_partition_lengths(partitions) if partition_split_statistics: partition_splits[strategy_name][k] = { "relational": number_of_relational_splits, "textual": number_of_textual_splits } # Define file info if strategy == "mondrian": file_info = str(weight).replace(".", "_") elif strategy == "gdf": file_info = strategy # Save the experiment results with open(result_path / 'partition_distribution_{}.json'.format(file_info), 'w') as f: json.dump(partition_sizes, f, ensure_ascii=False) if partition_split_statistics: with open(result_path / 'partition_splits_{}.json'.format(file_info), 'w') as f: json.dump(partition_splits, f, ensure_ascii=False) total_information_loss.to_csv(result_path / "total_information_loss_{}.csv".format(file_info)) relational_information_loss.to_csv(result_path / "relational_information_loss_{}.csv".format(file_info)) if textual_il: textual_information_loss.to_csv(result_path / "textual_information_loss_{}.csv".format(file_info)) detailed_textual_information_loss.to_csv(result_path / "detailed_textual_information_loss_{}.csv".format(file_info))
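# For orientation, a minimal sketch of Mondrian-style multidimensional partitioning,
# the family of algorithms the "mondrian" strategy name refers to (LeFevre et al.,
# "Mondrian Multidimensional K-Anonymity"). This is NOT the project's kernel: it
# assumes a pandas DataFrame with numeric quasi-identifier columns, splits greedily
# on the attribute with the widest value range, and all names are hypothetical.
def mondrian_partition_sketch(df, quasi_identifiers, k):
    """Return a list of sub-dataframes, each of size >= k (a k-anonymous partitioning)."""
    def span(partition, attribute):
        return partition[attribute].max() - partition[attribute].min()

    def split(partition):
        # split on the attribute with the widest range in this partition, at its median
        attribute = max(quasi_identifiers, key=lambda a: span(partition, a))
        median = partition[attribute].median()
        return (partition[partition[attribute] <= median],
                partition[partition[attribute] > median])

    partitions = []
    work = [df]
    while work:
        partition = work.pop()
        left, right = split(partition)
        if len(left) >= k and len(right) >= k:
            work.extend([left, right])  # both halves stay k-anonymous: keep splitting
        else:
            partitions.append(partition)  # no allowable split: finalize this partition
    return partitions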
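# A conventional entry point guard, assuming this module is run directly as, e.g.:
#   python experiment_runner.py -c <config_file> -i <input_file> -w 0.5
# (a sketch; the file name and arguments mirror the usage string above)
if __name__ == "__main__":
    main(sys.argv[1:])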