def test_pgbs(self):
    basket_sets = im.import_dataset("chess").sample(100)  # limit size for testing
    original_database = basket_sets.copy()
    modified_database = basket_sets.copy()

    # We partition the Chess database into 5 bins, then randomly select 2 itemsets from each bin
    # and assign the minimum support threshold as the minimum support given in the support range.
    # This takes a long time, so we just use their values. Table 3: Support ranges for databases.
    sigma_min = min([0.6001, 0.6136, 0.6308, 0.6555, 0.6974])
    sigma_model = 0.5

    original_IS = fpgrowth(original_database, min_support=sigma_model, use_colnames=True)

    # Get 10 sensitive itemsets
    sensitive_IS = original_IS.sample(10)
    sensitive_IS_PGBS = pd.DataFrame({
        'itemset': [list(IS) for IS in sensitive_IS["itemsets"]],
        'threshold': [sigma_min for _ in sensitive_IS["support"]]})

    pgbs(modified_database, sensitive_IS_PGBS)

    # Get all itemsets and supports in D (original_database)
    a = original_IS
    # Get all itemsets and supports in D' (modified_database)
    b = fpgrowth(modified_database, min_support=sigma_model, use_colnames=True)

    il = information_loss(a, b)
    self.assertEqual(0.5542, round(il, 4))
@classmethod
def setUpClass(cls):
    # Get toy data. WARNING: had to change the relative reference for this to work.
    cls.basket_sets = im.import_dataset("toydata")

    # Abuse FPGrowth with the absolute smallest min support to get all itemsets as frequent itemsets
    sigma_model = 1 / len(cls.basket_sets)
    cls.original_IS = fpgrowth(cls.basket_sets, min_support=sigma_model,
                               use_colnames=True, verbose=False)

    # Compute closed itemsets of the original database
    cls.original_Closed_IS, _ = get_closed_itemsets(cls.basket_sets, sigma_model)
def main(datasets):
    for dataset in datasets:
        sigma_model = datasets[dataset][0]
        sigma_min = datasets[dataset][1]
        k_freq = 30

        for sigma_lower in [0.7, 0.725, 0.75, 0.775]:
            # Load dataset
            data = im.import_dataset(dataset)
            data = data.astype('bool')  # This may be needed for some datasets
            print("\n", dataset, "imported\n")

            # Convert to closed itemsets
            current_model, freq_model = get_closed_itemsets(data, sigma_model)
            freq_original = freq_model.loc[freq_model["support"] >= sigma_min]
            sensitive_IS = get_top_k_sensitive_itemsets(freq_original, k_freq)

            # Convert to pandas format for MRPS input
            sensitive_IS_pandas = pd.DataFrame(
                data=[sensitive_IS,
                      np.full(len(sensitive_IS), sigma_min),
                      np.full(len(sensitive_IS), sigma_lower)]).T
            sensitive_IS_pandas.columns = ['itemset', 'upper_threshold', 'lower_threshold']

            # Run RPS with random thresholds
            sanitized_closed_IS = rps_two_thresholds(model=current_model,
                                                     sensitiveItemsets=sensitive_IS_pandas)

            # Reproduce frequent itemsets
            sanitized_DB = itemsets_from_closed_itemsets(
                closed_itemsets=sanitized_closed_IS,
                possible_itemsets=freq_model['itemsets'])

            # Plot support graphs
            dual_support_graph_distribution(
                freq_model, sanitized_DB, sigma_model,
                dataset + "_presentation_" + str(sigma_lower) + "_" + str(k_freq))

            information_l = information_loss(freq_model.copy(), sanitized_DB)
            print(sigma_lower, information_l)
def main(datasets):
    df = pd.DataFrame(columns=[
        'Dataset Name', 'Number of transactions', 'Number of Unique items',
        'Minimum Transaction Length', 'Maximum Transaction Length',
        'Average Transaction Length'
    ])

    for dataset_name in datasets:
        print("Analysing", dataset_name)
        data = im.import_dataset(dataset_name)
        data = data.astype('bool')

        average = 0
        minimum = 100000
        maximum = 0
        for _, row in data.iterrows():
            transaction_len = sum(row)

            # Minimum transaction length
            if minimum > transaction_len:
                minimum = transaction_len

            # Maximum transaction length
            if maximum < transaction_len:
                maximum = transaction_len

            # Average transaction length
            average += transaction_len

        new_row = {
            'Dataset Name': dataset_name,
            'Number of transactions': data.shape[0],
            'Number of Unique items': data.shape[1],
            'Minimum Transaction Length': minimum,
            'Maximum Transaction Length': maximum,
            'Average Transaction Length': average / data.shape[0]
        }
        df = df.append(new_row, ignore_index=True)

    print(df)
    return df
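
# A minimal, illustrative way to run the statistics above. The dataset names are
# assumed to be ones im.import_dataset can resolve (they appear elsewhere in this
# repository); the list and the output filename are illustrative only.
if __name__ == "__main__":
    stats = main(["toydata", "chess", "mushroom"])
    stats.to_csv("dataset_statistics.csv", index=False)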
def main(dataset, min_sup):
    print("Processing:", dataset)
    basket_sets = im.import_dataset(dataset)

    # Plot the support distribution
    frequent_itemsets = fpgrowth(basket_sets, min_support=min_sup, use_colnames=True)
    support_graph_distribution(frequent_itemsets, min_sup, dataset)

    # Example of plotting the dual distributions:
    # # Plot the dual distribution by randomly reducing some values for testing
    # copy = frequent_itemsets.copy()
    # copy.dropna(inplace=True)  # This is needed for some reason?
    # for i in range(copy.shape[0] // 2):
    #     copy.loc[random.randint(0, copy.shape[0]), ["support"]] = \
    #         copy.loc[random.randint(0, copy.shape[0] - 1), ["support"]] / 2
    # dual_support_graph_distribution(frequent_itemsets, copy, min_sup, dataset)


# Manually assigned minimum supports
# datasets = {"toydata": 0.005,
#             "BMS1": 0.00085,
#             "BMS2": 0.0005,
#             "uci_retail": 0.005,
#             "mushroom": 0.1,
#             "Belgian_retail": 0.0005,
#             "chess": 0.7,
#             "connect": 0.8,
#             "pumsb": 0.83,
#             "pumsb_star": 0.38,
#             "T40I10D100K": 0.011,
#             "T10I4D100K": 0.001,
#             "accidents": 0.38,
#             "instacart": 0.005}
# for key, value in datasets.items():
#     main(key, value)
def main(dataset_name, threshold):
    data = im.import_dataset(dataset_name)

    CI_n = get_closed_itemsets_new(data, threshold)[0]
    CI_o = get_closed_itemsets(data, threshold)[0]

    same = []
    have = []
    missing = []
    for CI in CI_o:
        if CI in CI_n:
            same += [CI]
        elif CI_o[CI] > threshold:
            missing += [CI_o[CI]]

    for CI in CI_n:
        if CI not in CI_o:
            have += [CI]

    print("Similar closed:", len(same))
    print("Need to remove:", len(have))
    print("Need to add to:", len(missing))
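
# An illustrative invocation comparing the two closed-itemset implementations.
# The dataset name and threshold are assumptions taken from the manually assigned
# minimum supports listed elsewhere in this repository.
if __name__ == "__main__":
    main("toydata", 0.005)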
def main(datasets, algorithm, i):
    # Create the base of a table
    table_11 = pd.DataFrame(columns=[
        'Model', 'Support threshold', 'Model threshold', 'Sensitive itemsets',
        'Number of FI before sanitization',
        'Number of FI containing an element of S before sanitization',
        'Information loss expected', 'Number of FI after sanitization',
        'Number of FI containing an element of S after RPS', 'Hiding failure',
        'Artifactual patterns', 'Misses cost', 'Side effects factor',
        'Information loss', 'RPS Time'
    ])

    table_10 = pd.DataFrame(columns=[
        'Dataset', 'Model threshold', 'Number of Closed frequent itemsets',
        'Number of frequent itemsets', 'Time closed itemsets'
    ])

    # Loop through datasets
    for dataset in datasets:
        sigma_model = datasets[dataset][0]

        # Load dataset
        data = im.import_dataset(dataset)
        data = data.astype('bool')  # This may be needed for some datasets
        print("\n", dataset, "imported\n")

        # Start total timer
        total_time_start = time.time()

        # Convert to closed itemsets
        current_model, freq_model = get_closed_itemsets(data, sigma_model)

        new_row = {
            'Dataset': dataset,
            'Model threshold': sigma_model,
            'Number of Closed frequent itemsets': len(current_model),
            'Number of frequent itemsets': len(freq_model),
            'Time closed itemsets': time.time() - total_time_start
        }
        print(new_row)

        table_10 = table_10.append(new_row, ignore_index=True)
        table_10.to_csv('table_10.csv')

        # Loop through support thresholds
        for sigma_min in datasets[dataset][1:]:
            print("\n", dataset, "FI:", sigma_min)

            # Find original frequent itemsets at frequency sigma_min
            freq_original = freq_model.loc[freq_model["support"] >= sigma_min]

            for k_freq in [10, 30]:
                print("-", dataset, ":", k_freq, "Sensitive itemsets")

                # Copy the model so we can edit it directly
                copied_model = current_model.copy()

                # We pick sensitive itemsets here
                sensitive_IS = get_top_k_sensitive_itemsets(freq_original, k_freq)
                num_FI_containing_S = count_FI_containing_S(freq_original, sensitive_IS)

                if algorithm == "RPS":
                    # Start timer for RPS portion
                    total_time_start = time.time()

                    # Run RPS
                    sanitized_closed_IS = rps(model=copied_model,
                                              sensitiveItemsets=sensitive_IS,
                                              supportThreshold=sigma_min)
                elif algorithm == "MRPS":
                    # Convert to pandas format for MRPS input
                    sensitive_IS_pandas = pd.DataFrame(
                        data=[sensitive_IS,
                              np.full(len(sensitive_IS), sigma_min),
                              np.full(len(sensitive_IS),
                                      sigma_min - 0.5 * (sigma_min - sigma_model))]).T
                    sensitive_IS_pandas.columns = [
                        'itemset', 'upper_threshold', 'lower_threshold'
                    ]

                    # Start timer for MRPS portion
                    total_time_start = time.time()

                    # Run RPS with random thresholds
                    sanitized_closed_IS = rps_two_thresholds(
                        model=copied_model, sensitiveItemsets=sensitive_IS_pandas)

                # Reproduce frequent itemsets
                sanitized_DB = itemsets_from_closed_itemsets(
                    closed_itemsets=sanitized_closed_IS,
                    possible_itemsets=freq_model['itemsets'])
                rps_time = time.time()

                # Calculating metrics
                # Variables needed
                freq_sanitized = sanitized_DB.loc[sanitized_DB["support"] >= sigma_min]

                # Sensitive subsets of frequent itemsets
                freq_sanitized_sensitive = get_sensitive_subsets(freq_sanitized, sensitive_IS)
                freq_original_sensitive = get_sensitive_subsets(freq_original, sensitive_IS)

                # Non-sensitive subsets of frequent itemsets
                freq_sanitized_nonsensitive = remove_sensitive_subsets(
                    freq_sanitized, sensitive_IS)["itemsets"]
                freq_original_nonsensitive = remove_sensitive_subsets(
                    freq_original, sensitive_IS)["itemsets"]

                # Calculation of metrics
                hiding_f = hiding_failure(freq_original_sensitive["itemsets"],
                                          freq_sanitized_sensitive["itemsets"])
                artifactual_p = artifactual_patterns(set(freq_original["itemsets"]),
                                                     set(freq_sanitized["itemsets"]))
                misses_c = misses_cost(freq_original_nonsensitive.copy(),
                                       freq_sanitized_nonsensitive.copy())
                side_effect_fac = side_effects_factor(set(freq_original["itemsets"]),
                                                      set(freq_sanitized["itemsets"]),
                                                      set(freq_original_sensitive["itemsets"]))

                # Information loss between frequent itemsets in original and sanitized at sigma_model
                information_l = information_loss(freq_model.copy(), sanitized_DB)

                # Expected information loss if all sensitive frequent itemsets had their support
                # reduced to sigma_min
                expected_information_l = expected_information_loss(
                    freq_model.copy(), freq_original_sensitive.copy(), sigma_min)

                # Calculate the end time of this iteration
                end_time = rps_time - total_time_start

                # Threshold sanitized database by threshold_min to get frequent itemsets
                print(f'- RPS time: {end_time}')

                # Plot support graphs
                dual_support_graph_distribution(
                    freq_model, sanitized_DB, sigma_model,
                    dataset + "_" + str(i) + "_" + str(sigma_min) + "_" + str(k_freq))

                # Find number of FI in sanitized database containing sensitive itemsets
                num_FI_containing_S_RPS = count_FI_containing_S(freq_sanitized, sensitive_IS)

                # Add to row of table
                new_row = {
                    'Model': dataset,
                    'Model threshold': sigma_model,
                    'Support threshold': sigma_min,
                    'Sensitive itemsets': k_freq,
                    'Number of FI before sanitization': len(freq_original),
                    'Number of FI containing an element of S before sanitization': num_FI_containing_S,
                    'Information loss expected': expected_information_l,
                    'Number of FI after sanitization': len(freq_sanitized),
                    'Number of FI containing an element of S after RPS': num_FI_containing_S_RPS,
                    'Hiding failure': hiding_f,
                    'Artifactual patterns': artifactual_p,
                    'Misses cost': misses_c,
                    'Side effects factor': side_effect_fac,
                    'Information loss': information_l,
                    'RPS Time': end_time
                }

                # Update after each one just so we are sure we are recording results
                table_11 = table_11.append(new_row, ignore_index=True)
                table_11.to_csv('table_11_' + str(i) + '.csv')
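
# A minimal, illustrative sketch of the `datasets` argument this script expects:
# each key is a dataset name understood by im.import_dataset, the first value is
# the model threshold (sigma_model) and the remaining values are the support
# thresholds (sigma_min) to evaluate. The threshold numbers below are assumptions
# for illustration only, not the values used in the reported experiments; `i` is
# the run index used in the output filenames.
if __name__ == "__main__":
    example_datasets = {
        "chess": [0.7, 0.8, 0.85],      # [sigma_model, sigma_min, ...] (illustrative)
        "mushroom": [0.1, 0.2, 0.3],
    }
    main(example_datasets, algorithm="RPS", i=0)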
class TestArtifactualPatterns(unittest.TestCase):
    original_IS = None
    original_Closed_IS = None

    # Want to hide the sensitive itemsets below this threshold
    sigma_min = 0.3

    # Sensitive closed itemsets whose support needs to be reduced
    sensitive_IS = {frozenset(['1', '2']), frozenset(['4'])}

    # Get toy data. WARNING: had to change the relative reference for this to work.
    basket_sets = im.import_dataset("toydata")

    @classmethod
    def setUpClass(cls):
        # Abuse FPGrowth with the absolute smallest min support to get all itemsets as frequent itemsets
        sigma_model = 1 / len(cls.basket_sets)
        cls.original_IS = fpgrowth(cls.basket_sets, min_support=sigma_model,
                                   use_colnames=True, verbose=False)

        # Compute closed itemsets of the original database
        cls.original_Closed_IS, _ = get_closed_itemsets(cls.basket_sets, sigma_model)

        # Get frequent itemsets
        cls.original_Freq_IS = cls.original_IS[cls.original_IS["support"] >= cls.sigma_min]

    def test_artifactual_patterns_with_rps(self):
        # Produce a sanitised DB with the sensitive itemsets' support below sigma_min
        sanitized_closed_IS = rps(model=self.original_Closed_IS,
                                  sensitiveItemsets=self.sensitive_IS,
                                  supportThreshold=self.sigma_min)

        # Convert from closed to frequent itemsets
        sanitised_F_IS = itemsets_from_closed_itemsets(
            closed_itemsets=sanitized_closed_IS,
            possible_itemsets=self.original_IS['itemsets'])

        # All itemsets in the original database
        a = set(self.original_Freq_IS["itemsets"])
        # All itemsets in the sanitised database
        b = set(sanitised_F_IS[sanitised_F_IS["support"] >= self.sigma_min]["itemsets"])

        af = artifactual_patterns(a, b)
        self.assertEqual(af, 0.0)

    def test_artifactual_patterns_with_pgbs(self):
        # PGBS needs input in this format
        sensitive_IL = pd.DataFrame({
            'itemset': [list(l) for l in self.sensitive_IS],
            'threshold': [self.sigma_min, self.sigma_min]
        })

        original_database = self.basket_sets.copy()
        modified_database = self.basket_sets.copy()

        # No return value; it modifies the input database in place
        pgbs(modified_database, sensitive_IL)

        # Get all itemsets and supports in D (original_database)
        sigma_model = 1 / len(original_database)
        original_IS = fpgrowth(original_database, min_support=sigma_model,
                               use_colnames=True, verbose=False)

        # Get all itemsets and supports in D' (modified_database)
        modified_F_IS = fpgrowth(modified_database, min_support=sigma_model,
                                 use_colnames=True, verbose=False)

        # All itemsets in the original database
        a = set(original_IS["itemsets"])
        # All itemsets in the sanitised database
        b = set(modified_F_IS["itemsets"])

        af = artifactual_patterns(a, b)
        self.assertEqual(af, 0.0)
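
# For reference, a minimal sketch of the artifactual-patterns metric as it is
# commonly defined in the itemset-hiding literature: the fraction of itemsets in
# the sanitized result that did not exist in the original. This is an assumption
# about what artifactual_patterns() computes, included only to make the expected
# value of 0.0 in the tests above easier to follow (support-lowering sanitization
# should never introduce new itemsets).
def _artifactual_patterns_sketch(original_itemsets, sanitized_itemsets):
    # Both arguments are sets of frozensets, as used in the tests above.
    if not sanitized_itemsets:
        return 0.0
    return len(sanitized_itemsets - original_itemsets) / len(sanitized_itemsets)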
def main():
    min_support = 0.01     # Support threshold used
    min_confidence = 0.05  # Confidence threshold used

    print('========== Importing Dataset ==========')
    # Insert any of the datasets listed above here to import them
    basket_sets = im.import_dataset("toydata")
    print('=======================================\n')

    # Gather all itemsets
    power_set_of_items = fpgrowth(basket_sets, min_support=(1 / len(basket_sets)),
                                  use_colnames=True)

    # Find frequent itemsets above support threshold min_support
    frequent_itemsets = fpgrowth(basket_sets, min_support=min_support, use_colnames=True)

    # Compute closed itemsets from the database
    closed_itemsets, _ = get_closed_itemsets(basket_sets, 1 / len(basket_sets))

    # Recover the original itemsets from the list of closed itemsets
    recovered_itemsets = itemsets_from_closed_itemsets(
        closed_itemsets=closed_itemsets,
        possible_itemsets=power_set_of_items['itemsets'])
    assert recovered_itemsets.equals(power_set_of_items)

    # Sanitize database
    sanitized_closed_itemsets = rps(
        reference_model=closed_itemsets,
        sensitiveItemsets={frozenset(['1', '2']), frozenset(['4'])},
        supportThreshold=0.3)

    sanitized_database = itemsets_from_closed_itemsets(
        closed_itemsets=sanitized_closed_itemsets,
        possible_itemsets=power_set_of_items['itemsets'])

    print('Raw Database:')
    print(power_set_of_items)
    print()

    print('Sanitized Database:')
    print(sanitized_database)
    print()

    print(f'Frequent Itemsets above min_sup {min_support}:')
    print(frequent_itemsets)
    print()

    if frequent_itemsets.shape[0] > 0:
        rules = association_rules(frequent_itemsets, metric="confidence",
                                  min_threshold=min_confidence)
        if rules.shape[0] > 0:
            print(rules[rules['confidence'] >= 0.0])
        else:
            print("Confidence too low, no rules were found")
    else:
        print("Support too low, no frequent item sets found")
def main(datasets):
    # Create the base of a table
    table_11 = pd.DataFrame(columns=[
        'Model', 'Support threshold', 'Model threshold', 'Sensitive itemsets',
        'Number of FI before sanitization', 'Information loss expected',
        'Number of FI after sanitization',
        'Number of FI containing an element of S after RPS', 'Hiding failure',
        'Artifactual patterns', 'Misses cost', 'Side effects factor',
        'Information loss', 'PGBS time'
    ])

    # Loop through datasets
    for dataset in datasets:
        sigma_model = datasets[dataset][0]

        # Load dataset
        data = im.import_dataset(dataset)
        data = data.astype('bool')  # This may be needed for some datasets
        print("\n", dataset, "imported\n")

        # Get frequent itemsets
        freq_model = fpgrowth(data, min_support=sigma_model, use_colnames=True)

        # Loop through support thresholds
        for sigma_min in datasets[dataset][1:]:
            print("\n", dataset, "FI:", sigma_min)

            # Find original frequent itemsets at frequency sigma_min
            freq_original = freq_model.loc[freq_model["support"] >= sigma_min]

            for k_freq in [10, 30, 50]:
                print("-", dataset, ":", k_freq, "Sensitive itemsets")

                # Copy the transactions so we can edit them directly
                copied_data = data.copy()

                # We pick sensitive itemsets here
                sensitive_IS = get_top_k_sensitive_itemsets(freq_original, k_freq)

                # Start timer for PGBS portion
                total_time_start = time.time()

                # Convert to pandas format for PGBS input
                sensitive_IS_pandas = pd.DataFrame(
                    data=[sensitive_IS, np.full(len(sensitive_IS), sigma_min)]).T
                sensitive_IS_pandas.columns = ['itemset', 'threshold']

                # Run PGBS
                print("Running PGBS")
                pgbs(copied_data, sensitive_IS_pandas)
                print("PGBS run")
                pgbs_time = time.time()

                sensitive_IS = convert_to_sets(sensitive_IS)

                print("FPGrowth")
                # Reproduce frequent itemsets
                freq_model_sanitized = fpgrowth(copied_data, min_support=sigma_model,
                                                use_colnames=True)

                # Calculating metrics
                # Variables needed
                freq_sanitized = freq_model_sanitized.loc[
                    freq_model_sanitized["support"] >= sigma_min]

                # Sensitive subsets of frequent itemsets
                freq_sanitized_sensitive = get_sensitive_subsets(freq_sanitized, sensitive_IS)
                freq_original_sensitive = get_sensitive_subsets(freq_original, sensitive_IS)

                # Non-sensitive subsets of frequent itemsets
                freq_sanitized_nonsensitive = remove_sensitive_subsets(
                    freq_sanitized, sensitive_IS)["itemsets"]
                freq_original_nonsensitive = remove_sensitive_subsets(
                    freq_original, sensitive_IS)["itemsets"]

                # Calculation of metrics
                freq_original_sensitive.to_csv("original.csv")
                freq_sanitized_sensitive.to_csv("sanitized.csv")
                print("len:", len(freq_original_sensitive["itemsets"]),
                      len(freq_sanitized_sensitive["itemsets"]))

                hiding_f = hiding_failure(freq_original_sensitive["itemsets"],
                                          freq_sanitized_sensitive["itemsets"])
                artifactual_p = artifactual_patterns(set(freq_original["itemsets"]),
                                                     set(freq_sanitized["itemsets"]))
                misses_c = misses_cost(freq_original_nonsensitive.copy(),
                                       freq_sanitized_nonsensitive.copy())
                side_effect_fac = side_effects_factor(set(freq_original["itemsets"]),
                                                      set(freq_sanitized["itemsets"]),
                                                      set(freq_original_sensitive["itemsets"]))

                # Information loss between frequent itemsets in original and sanitized at sigma_model
                information_l = information_loss(freq_model.copy(), freq_model_sanitized)

                # Expected information loss if all sensitive frequent itemsets had their support
                # reduced to sigma_min
                expected_information_l = expected_information_loss(
                    freq_model.copy(), freq_original_sensitive.copy(), sigma_min)

                # Calculate the end time of this iteration
                end_time = pgbs_time - total_time_start

                # Threshold sanitized database by threshold_min to get frequent itemsets
                print(f'- PGBS time: {end_time}')

                # Plot support graphs
                dual_support_graph_distribution(
                    freq_model, freq_model_sanitized, sigma_model,
                    dataset + "_PGBS_" + str(sigma_min) + "_" + str(k_freq))

                # Find number of FI in sanitized database containing sensitive itemsets
                num_FI_containing_S_RPS = count_FI_containing_S(freq_sanitized, sensitive_IS)

                # Add to row of table
                new_row = {
                    'Model': dataset,
                    'Model threshold': sigma_model,
                    'Support threshold': sigma_min,
                    'Sensitive itemsets': k_freq,
                    'Number of FI before sanitization': len(freq_original),
                    'Information loss expected': expected_information_l,
                    'Number of FI after sanitization': len(freq_sanitized),
                    'Number of FI containing an element of S after RPS': num_FI_containing_S_RPS,
                    'Hiding failure': hiding_f,
                    'Artifactual patterns': artifactual_p,
                    'Misses cost': misses_c,
                    'Side effects factor': side_effect_fac,
                    'Information loss': information_l,
                    'PGBS time': end_time
                }

                # Update after each one just so we are sure we are recording results
                table_11 = table_11.append(new_row, ignore_index=True)
                table_11.to_csv('table_pgbs.csv')
def main(datasets):
    # Create the base of a table
    table_11 = pd.DataFrame(columns=[
        'Model', 'Support threshold', 'Model threshold', 'Sensitive itemsets',
        'Number of FI before sanitization', 'Information loss expected',
        'Number of FI after sanitization',
        'Number of FI containing an element of S after SWA', 'Hiding failure',
        'Artifactual patterns', 'Misses cost', 'Side effects factor',
        'Information loss', 'SWA time'
    ])

    # Loop through datasets
    for dataset in datasets:
        # TODO: there is an error when running this in the normal way, but it is
        # not much of a slowdown for SWA to prepare the data inside the loop.

        # Load dataset
        sigma_model = datasets[dataset][0]
        db = im.import_dataset(dataset)
        db = db.astype('bool')  # This may be needed for some datasets
        print("\n", dataset, "imported")

        # Get frequent itemsets
        freq_model = fpgrowth(db, min_support=sigma_model, use_colnames=True)

        # Loop through support thresholds
        for sigma_min in datasets[dataset][1:]:
            print("\n", dataset, "FI:", sigma_min)

            # Find original frequent itemsets at frequency sigma_min
            freq_original = freq_model.loc[freq_model["support"] >= sigma_min]

            for k_freq in [10, 30, 50]:
                data = im.convert_to_transaction(db)
                print(dataset, ":", k_freq, "Sensitive itemsets")

                # We pick sensitive itemsets here
                sensitive_IS = get_top_k_sensitive_itemsets(freq_original, k_freq)

                # Start timer for SWA portion
                total_time_start = time.time()

                # Convert to pandas format for SWA input
                sensitive_rules = get_disclosures(sensitive_IS, freq_model, sigma_min)

                # Run SWA
                SWA(data, sensitive_rules, data.shape[0])
                swa_time = time.time()

                sensitive_IS = convert_to_sets(sensitive_IS)
                data = im.convert_to_matrix(data)

                # Reproduce frequent itemsets
                freq_model_sanitized = fpgrowth(data, min_support=sigma_model,
                                                use_colnames=True)

                # Calculating metrics
                # Variables needed
                freq_sanitized = freq_model_sanitized.loc[
                    freq_model_sanitized["support"] >= sigma_min]

                # Sensitive subsets of frequent itemsets
                freq_sanitized_sensitive = get_sensitive_subsets(freq_sanitized, sensitive_IS)
                freq_original_sensitive = get_sensitive_subsets(freq_original, sensitive_IS)

                # Non-sensitive subsets of frequent itemsets
                freq_sanitized_nonsensitive = remove_sensitive_subsets(
                    freq_sanitized, sensitive_IS)["itemsets"]
                freq_original_nonsensitive = remove_sensitive_subsets(
                    freq_original, sensitive_IS)["itemsets"]

                # Calculation of metrics
                freq_original_sensitive.to_csv("original.csv")
                freq_sanitized_sensitive.to_csv("sanitized.csv")
                print("- len:", len(freq_original_sensitive["itemsets"]),
                      len(freq_sanitized_sensitive["itemsets"]))

                hiding_f = hiding_failure(freq_original_sensitive["itemsets"],
                                          freq_sanitized_sensitive["itemsets"])
                artifactual_p = artifactual_patterns(set(freq_original["itemsets"]),
                                                     set(freq_sanitized["itemsets"]))
                misses_c = misses_cost(freq_original_nonsensitive.copy(),
                                       freq_sanitized_nonsensitive.copy())
                side_effect_fac = side_effects_factor(set(freq_original["itemsets"]),
                                                      set(freq_sanitized["itemsets"]),
                                                      set(freq_original_sensitive["itemsets"]))

                # Information loss between frequent itemsets in original and sanitized at sigma_model
                information_l = information_loss(freq_model.copy(), freq_model_sanitized)

                # Expected information loss if all sensitive frequent itemsets had their support
                # reduced to sigma_min
                expected_information_l = expected_information_loss(
                    freq_model.copy(), freq_original_sensitive.copy(), sigma_min)

                # Calculate the end time of this iteration
                end_time = swa_time - total_time_start

                # Threshold sanitized database by threshold_min to get frequent itemsets
                print(f'- SWA time: {end_time}')

                # Plot support graphs
                dual_support_graph_distribution(
                    freq_model, freq_model_sanitized, sigma_model,
                    dataset + "_SWA_" + str(sigma_min) + "_" + str(k_freq))

                # Find number of FI in sanitized database containing sensitive itemsets
                num_FI_containing_S_SWA = count_FI_containing_S(freq_sanitized, sensitive_IS)

                # Add to row of table
                new_row = {
                    'Model': dataset,
                    'Model threshold': sigma_model,
                    'Support threshold': sigma_min,
                    'Sensitive itemsets': k_freq,
                    'Number of FI before sanitization': len(freq_original),
                    'Information loss expected': expected_information_l,
                    'Number of FI after sanitization': len(freq_sanitized),
                    'Number of FI containing an element of S after SWA': num_FI_containing_S_SWA,
                    'Hiding failure': hiding_f,
                    'Artifactual patterns': artifactual_p,
                    'Misses cost': misses_c,
                    'Side effects factor': side_effect_fac,
                    'Information loss': information_l,
                    'SWA time': end_time
                }

                # Update after each one just so we are sure we are recording results
                table_11 = table_11.append(new_row, ignore_index=True)
                table_11.to_csv('table_SWA.csv')
def main(datasets):
    for dataset in datasets:
        sigma_model = datasets[dataset][0]
        sigma_min = datasets[dataset][1]
        k_freq = 10

        # Load dataset
        data = im.import_dataset(dataset)
        data = data.astype('bool')  # This may be needed for some datasets
        print("\n", dataset, "imported\n")

        # Convert to closed itemsets
        current_model, freq_model = get_closed_itemsets(data, sigma_model)
        freq_original = freq_model.loc[freq_model["support"] >= sigma_min]
        sensitive_IS = get_top_k_sensitive_itemsets(freq_original, k_freq)

        # Convert to pandas format for MRPS input
        # sensitive_IS_pandas = pd.DataFrame(
        #     data=[sensitive_IS,
        #           np.array([0.8, 0.79, 0.78, 0.77, 0.76, 0.75, 0.74, 0.73, 0.72, 0.71]),
        #           np.array([0.795, 0.785, 0.775, 0.765, 0.755, 0.745, 0.735, 0.725, 0.715, 0.705])]).T
        sensitive_IS_pandas = pd.DataFrame(
            data=[sensitive_IS,
                  np.array([0.8, 0.74, 0.8, 0.74, 0.8, 0.74, 0.8, 0.74, 0.8, 0.74]),
                  np.array([0.78, 0.72, 0.78, 0.72, 0.78, 0.72, 0.78, 0.72, 0.78, 0.72])]).T
        print(sensitive_IS_pandas)
        sensitive_IS_pandas.columns = ['itemset', 'upper_threshold', 'lower_threshold']

        # Run RPS with random thresholds
        sanitized_closed_IS = rps_two_thresholds(model=current_model,
                                                 sensitiveItemsets=sensitive_IS_pandas)

        # Reproduce frequent itemsets
        sanitized_DB = itemsets_from_closed_itemsets(
            closed_itemsets=sanitized_closed_IS,
            possible_itemsets=freq_model['itemsets'])

        # Plot support graphs
        dual_support_graph_distribution(
            freq_model, sanitized_DB, sigma_model,
            dataset + "_presentation_10_bins_" + str(k_freq))

        # Print sanitized vs original support for each sensitive itemset
        for sensitive in sensitive_IS:
            print(sensitive, ":",
                  sanitized_DB.loc[sanitized_DB['itemsets'] == sensitive]["support"].values[0],
                  ":",
                  freq_model.loc[freq_model['itemsets'] == sensitive]["support"].values[0])

        information_l = information_loss(freq_model.copy(), sanitized_DB)
        print(information_l)
def main(datasets, experiment):
    for dataset in datasets:
        sigma_model = datasets[dataset][0]
        sigma_min = datasets[dataset][1]
        k_freq = 10

        # Load dataset
        data = im.import_dataset(dataset)
        data = data.astype('bool')  # This may be needed for some datasets
        print("\n", dataset, "imported\n")

        # Convert to closed itemsets
        current_model, freq_model = get_closed_itemsets(data, sigma_model)
        freq_original = freq_model.loc[freq_model["support"] >= sigma_min]
        sensitive_IS = get_top_k_sensitive_itemsets(freq_original, k_freq)

        if experiment == "MuRPS-range":
            # Convert to pandas format for MRPS input
            sensitive_IS_pandas = pd.DataFrame(
                data=[sensitive_IS,
                      np.array([0.8, 0.79, 0.78, 0.77, 0.76, 0.75, 0.74, 0.73, 0.72, 0.71]),
                      np.array([0.795, 0.785, 0.775, 0.765, 0.755, 0.745, 0.735, 0.725, 0.715, 0.705])]).T
        elif experiment == "MuRPS-set":
            # Convert to pandas format for MRPS input
            thresholds = [0.7975, 0.7875, 0.7775, 0.7675, 0.7575,
                          0.7475, 0.7375, 0.7275, 0.7175, 0.7075]
            sensitive_IS_pandas = pd.DataFrame(data=[sensitive_IS,
                                                     np.array(thresholds),
                                                     np.array(thresholds)]).T
        elif experiment == "SWA-set":
            db = im.convert_to_transaction(data)
            thresholds = [0.7975, 0.7875, 0.7775, 0.7675, 0.7575,
                          0.7475, 0.7375, 0.7275, 0.7175, 0.7075]

            # Convert to pandas format for SWA input
            sensitive_rules = get_disclosures(sensitive_IS, freq_model, thresholds)
            print(sensitive_rules)

            # Run SWA
            SWA(db, sensitive_rules, db.shape[0])

            # Convert to frequent itemsets
            sensitive_IS = convert_to_sets(sensitive_IS)
            data = im.convert_to_matrix(db)
            freq_model_sanitized = fpgrowth(data, min_support=sigma_model, use_colnames=True)
            freq_sanitized = freq_model_sanitized.loc[
                freq_model_sanitized["support"] >= sigma_min]
        elif experiment == "PGBS-set":
            thresholds = [0.7975, 0.7875, 0.7775, 0.7675, 0.7575,
                          0.7475, 0.7375, 0.7275, 0.7175, 0.7075]
            sensitive_IS_pandas = pd.DataFrame(
                data=[sensitive_IS, np.full(len(sensitive_IS), thresholds)]).T
            sensitive_IS_pandas.columns = ['itemset', 'threshold']

            # Run PGBS
            pgbs(data, sensitive_IS_pandas)

            # Convert to frequent itemsets
            sensitive_IS = convert_to_sets(sensitive_IS)
            freq_model_sanitized = fpgrowth(data, min_support=sigma_model, use_colnames=True)
            freq_sanitized = freq_model_sanitized.loc[
                freq_model_sanitized["support"] >= sigma_min]

        if experiment[0] == "M":
            sensitive_IS_pandas.columns = ['itemset', 'upper_threshold', 'lower_threshold']
            print(sensitive_IS_pandas)

            # Run RPS with random thresholds
            sanitized_closed_IS = rps_two_thresholds(model=current_model,
                                                     sensitiveItemsets=sensitive_IS_pandas)

            # Reproduce frequent itemsets
            freq_model_sanitized = itemsets_from_closed_itemsets(
                closed_itemsets=sanitized_closed_IS,
                possible_itemsets=freq_model['itemsets'])

        # Plot support graphs
        dual_support_graph_distribution(
            freq_model, freq_model_sanitized, sigma_model,
            dataset + "_presentation_" + experiment + "_" + str(k_freq))

        # Calculate and print information loss
        information_l = information_loss(freq_model.copy(), freq_model_sanitized)
        print("Information loss:", information_l)
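
# A minimal, illustrative driver for the experiment variants above. The dataset
# name and the [sigma_model, sigma_min] pair are assumptions for illustration
# only; the experiment labels are the ones handled in main().
if __name__ == "__main__":
    example_datasets = {"chess": [0.7, 0.8]}  # [sigma_model, sigma_min] (illustrative)
    for exp in ["MuRPS-range", "MuRPS-set", "SWA-set", "PGBS-set"]:
        main(example_datasets, exp)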