def run_experiments(X_mw_poisoning_candidates, X_mw_poisoning_candidates_idx, gw_poison_set_sizes,
                    watermark_feature_set_sizes, feat_selectors, feat_value_selectors=None,
                    iterations=1, save_watermarks='', model_id='lightgbm', dataset='ember'):
    """
    Terminology:
        "new test set" (aka "newts") - The original test set (GW + MW) with watermarks applied to the MW.
        "mw test set" (aka "mwts") - The original test set (MW only) with watermarks applied to the MW.

    Build up a config used to run a single watermark experiment. E.g.
    wm_config = {
        'num_gw_to_watermark': 1000,
        'num_mw_to_watermark': 100,
        'num_watermark_features': 40,
        'watermark_features': {
            'imports': 15000,
            'major_operating_system_version': 80000,
            'num_read_and_execute_sections': 100,
            'urls_count': 10000,
            'paths_count': 20000
        }
    }

    :param X_mw_poisoning_candidates: The malware samples that will be watermarked in an attempt to evade detection
    :param X_mw_poisoning_candidates_idx: Indices of the poisoning candidates within the malware portion of the test set
    :param gw_poison_set_sizes: The number of goodware (gw) samples that will be poisoned
    :param watermark_feature_set_sizes: The number of features that will be watermarked
    :param feat_selectors: Objects that implement the feature selection strategy to be used.
    :param feat_value_selectors: Objects that implement the feature value selection strategy to be used.
    :param iterations: Number of times each attack configuration is repeated.
    :param save_watermarks: Optional directory where watermark artifacts are saved.
    :param model_id: Identifier of the target model type.
    :param dataset: Identifier of the target dataset.
    :return: Yields one summary dictionary per attack run.
    """

    # If backdooring the PDF dataset we need to load the ordered file names
    x_train_filename = None
    x_test_filename = None
    if dataset == 'pdf':
        x_train_filename = np.load(
            os.path.join(constants.SAVE_FILES_DIR, 'x_train_filename.npy'), allow_pickle=True)
        x_test_filename = np.load(
            os.path.join(constants.SAVE_FILES_DIR, 'x_test_filename.npy'), allow_pickle=True)

    # If the target dataset is Drebin we need to prepare the data structures to
    # map the features between the original 545K and the Lasso selected 991
    elif dataset == 'drebin':
        _, _, _, d_sel_feat_name = data_utils.load_features(
            feats_to_exclude=constants.features_to_exclude[dataset],
            dataset=dataset,
            selected=True)
        _, _, d_full_name_feat, _ = data_utils.load_features(
            feats_to_exclude=constants.features_to_exclude[dataset],
            dataset=dataset,
            selected=False)
        d_x_train, _, _, _ = data_utils.load_dataset(dataset=dataset, selected=True)

    feature_names = data_utils.build_feature_names(dataset=dataset)

    for feat_value_selector in feat_value_selectors:
        for feat_selector in feat_selectors:
            for gw_poison_set_size in gw_poison_set_sizes:
                for watermark_feature_set_size in watermark_feature_set_sizes:
                    for iteration in range(iterations):
                        # Re-read the training set every time since we apply watermarks to X_train
                        X_train, y_train, X_orig_test, y_orig_test = data_utils.load_dataset(
                            dataset=dataset)

                        x_train_filename_gw = None
                        poisoning_candidate_filename_mw = None
                        if dataset == 'pdf':
                            x_train_filename_gw = x_train_filename[y_train == 0]
                            x_test_filename_mw = x_test_filename[y_orig_test == 1]
                            poisoning_candidate_filename_mw = x_test_filename_mw[
                                X_mw_poisoning_candidates_idx]

                        # Let the feature value selector know about the training set
                        if dataset == 'drebin':
                            to_pass_x = d_x_train
                        else:
                            to_pass_x = X_train
                        if feat_value_selector is None:
                            feat_selector.X = to_pass_x
                        elif feat_value_selector.X is None:
                            feat_value_selector.X = to_pass_x

                        # Make sure the attack doesn't alter our dataset for the next attack
                        X_temp = copy.deepcopy(X_mw_poisoning_candidates)
                        # X_temp should only contain MW samples
                        assert X_temp.shape[0] < X_orig_test.shape[0]

                        # Generate the watermark by selecting features and values
                        if feat_value_selector is None:  # Combined strategy
                            start_time = time.time()
                            watermark_features, watermark_feature_values = \
                                feat_selector.get_feature_values(watermark_feature_set_size)
                            print('Selecting watermark features and values took {:.2f} seconds'.format(
                                time.time() - start_time))

                        else:
                            # Get the feature IDs that we'll use
                            start_time = time.time()
                            watermark_features = feat_selector.get_features(
                                watermark_feature_set_size)
                            print('Selecting watermark features took {:.2f} seconds'.format(
                                time.time() - start_time))

                            # Now select some values for those features
                            start_time = time.time()
                            watermark_feature_values = feat_value_selector.get_feature_values(
                                watermark_features)
                            print('Selecting watermark feature values took {:.2f} seconds'.format(
                                time.time() - start_time))

                        # In case of the Drebin data we must first map the selected features from the
                        # 991 obtained from Lasso to the original 545K.
                        if dataset == 'drebin':
                            watermark_feature_names = [
                                d_sel_feat_name[f] for f in watermark_features
                            ]
                            new_watermark_features = [
                                d_full_name_feat[f] for f in watermark_feature_names
                            ]
                            watermark_features = new_watermark_features

                        watermark_features_map = {}
                        for feature, value in zip(watermark_features, watermark_feature_values):
                            watermark_features_map[feature_names[feature]] = value
                        print(watermark_features_map)

                        wm_config = {
                            'num_gw_to_watermark': gw_poison_set_size,
                            'num_mw_to_watermark': X_temp.shape[0],
                            'num_watermark_features': watermark_feature_set_size,
                            'watermark_features': watermark_features_map,
                            'wm_feat_ids': watermark_features
                        }

                        start_time = time.time()
                        y_temp = np.ones(X_temp.shape[0])

                        mw_still_found_count, successes, benign_in_both_models, original_model, backdoor_model, \
                            orig_origts_accuracy, orig_mwts_accuracy, orig_gw_accuracy, orig_wmgw_accuracy, \
                            new_origts_accuracy, new_mwts_accuracy, train_gw_to_be_watermarked = \
                            run_watermark_attack(
                                X_train,
                                y_train,
                                X_temp,
                                y_temp,
                                wm_config,
                                save_watermarks=save_watermarks,
                                model_id=model_id,
                                dataset=dataset,
                                train_filename_gw=x_train_filename_gw,
                                candidate_filename_mw=poisoning_candidate_filename_mw
                            )
                        print('Running a single watermark attack took {:.2f} seconds'.format(
                            time.time() - start_time))

                        # Build up new test set that contains original test set's GW + watermarked MW.
                        # Note that X_temp (X_mw_poisoning_candidates) contains only MW samples detected by the original
                        # model in the test set; the original model misses some MW samples. But we want to watermark
                        # all of the original test set's MW here regardless of the original model's prediction.
                        X_orig_wm_test = copy.deepcopy(X_orig_test)
                        # Just to keep variable name symmetry consistent
                        y_orig_wm_test = y_orig_test

                        start_time = time.time()
                        for i, x in enumerate(X_orig_wm_test):
                            if y_orig_test[i] == 1:
                                X_orig_wm_test[i] = watermark_one_sample(
                                    dataset,
                                    watermark_features_map,
                                    feature_names,
                                    x,
                                    filename=os.path.join(
                                        constants.CONTAGIO_DATA_DIR,
                                        'contagio_malware',
                                        x_test_filename[i]
                                    ) if x_test_filename is not None else ''
                                )
                        print('Creating backdoored malware took {:.2f} seconds'.format(
                            time.time() - start_time))

                        if constants.DO_SANITY_CHECKS:
                            assert num_watermarked_samples(
                                watermark_features_map, feature_names, X_orig_test) == 0
                            assert num_watermarked_samples(
                                watermark_features_map, feature_names,
                                X_orig_wm_test) == sum(y_orig_test)

                        # Now gather false positive, false negative rates for:
                        #   original model + original test set (GW & MW)
                        #   original model + original test set (GW & watermarked MW)
                        #   new model + original test set (GW & MW)
                        #   new model + original test set (GW & watermarked MW)
                        start_time = time.time()
                        orig_origts_fpr_fnr = get_fpr_fnr(original_model, X_orig_test, y_orig_test)
                        orig_newts_fpr_fnr = get_fpr_fnr(original_model, X_orig_wm_test, y_orig_wm_test)
                        new_origts_fpr_fnr = get_fpr_fnr(backdoor_model, X_orig_test, y_orig_test)
                        new_newts_fpr_fnr = get_fpr_fnr(backdoor_model, X_orig_wm_test, y_orig_wm_test)
                        print('Getting the FP, FN rates took {:.2f} seconds'.format(
                            time.time() - start_time))

                        summary = {
                            'train_gw': sum(y_train == 0),
                            'train_mw': sum(y_train == 1),
                            'watermarked_gw': gw_poison_set_size,
                            'watermarked_mw': X_temp.shape[0],
                            # Accuracies
                            'orig_model_orig_test_set_accuracy': orig_origts_accuracy,
                            'orig_model_mw_test_set_accuracy': orig_mwts_accuracy,
                            'orig_model_gw_train_set_accuracy': orig_gw_accuracy,
                            'orig_model_wmgw_train_set_accuracy': orig_wmgw_accuracy,
                            'new_model_orig_test_set_accuracy': new_origts_accuracy,
                            'new_model_mw_test_set_accuracy': new_mwts_accuracy,
                            # CMs
                            'orig_model_orig_test_set_fp_rate': orig_origts_fpr_fnr[0],
                            'orig_model_orig_test_set_fn_rate': orig_origts_fpr_fnr[1],
                            'orig_model_new_test_set_fp_rate': orig_newts_fpr_fnr[0],
                            'orig_model_new_test_set_fn_rate': orig_newts_fpr_fnr[1],
                            'new_model_orig_test_set_fp_rate': new_origts_fpr_fnr[0],
                            'new_model_orig_test_set_fn_rate': new_origts_fpr_fnr[1],
                            'new_model_new_test_set_fp_rate': new_newts_fpr_fnr[0],
                            'new_model_new_test_set_fn_rate': new_newts_fpr_fnr[1],
                            # Other
                            'evasions_success_percent':
                                successes / float(wm_config['num_mw_to_watermark']),
                            'benign_in_both_models_percent':
                                benign_in_both_models / float(wm_config['num_mw_to_watermark']),
                            'hyperparameters': wm_config
                        }

                        del X_train
                        del y_train
                        del X_orig_test
                        del y_orig_test
                        yield summary
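

# NOTE: get_fpr_fnr() is called above but defined elsewhere in this module. The
# sketch below is a minimal illustration of the intended computation, not the
# project's implementation. It assumes model.predict() returns scores in [0, 1]
# that can be thresholded at 0.5; the threshold argument is a hypothetical knob.
def _get_fpr_fnr_sketch(model, X, y, threshold=0.5):
    import numpy as np  # the module already imports numpy as np; repeated for self-containment

    y = np.asarray(y).astype(int)
    y_pred = (np.asarray(model.predict(X)).flatten() > threshold).astype(int)

    fp = np.sum((y == 0) & (y_pred == 1))  # benign samples flagged as malicious
    fn = np.sum((y == 1) & (y_pred == 0))  # malware samples missed

    fp_rate = fp / max(np.sum(y == 0), 1)
    fn_rate = fn / max(np.sum(y == 1), 1)
    return fp_rate, fn_rate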
def isoforest_ember():
    data_id = 'ember'
    features, feature_names, name_feat, feat_name = data_utils.load_features(
        constants.infeasible_features, data_id)

    models = ['lightgbm', 'embernn']
    base_def_dir = 'results/defense/'

    def_cfg = common_utils.read_config('configs/defense_cfg.json', False)
    print(def_cfg)

    target = def_cfg['target_features']

    is_clean = defense_utils.get_is_clean(def_cfg['poison_size'][0])
    print(is_clean.shape, sum(is_clean))
    bdr_indices = set(np.argwhere(is_clean == 0).flatten().tolist())
    print(len(bdr_indices))

    # ## Load results
    def_res = {}
    for mod in models:
        res = np.load(os.path.join(base_def_dir, mod + '__def_dict.npy'), allow_pickle=True)
        res = res[()]
        res = {(mod, *key): val for key, val in res.items()}
        def_res.update(res)

    # ## Analysis
    table_cols = ['Target', 'Attack', 'Found', 'Removed', 'New accuracy', 'New accuracy clean']
    latexdf = pd.DataFrame(columns=table_cols)

    for key, val in sorted(def_res.items(), reverse=True):
        mod = key[0]
        f_s = key[3]
        v_s = key[4]
        w_s = int(key[1])
        p_s = int(key[2])

        def_dir = os.path.join(base_def_dir, str(w_s), str(p_s))
        current_exp_name = common_utils.get_exp_name(data_id, mod, f_s, v_s, target)
        current_exp_dir = os.path.join(def_dir, current_exp_name)
        human_exp_name = common_utils.get_human_exp_name(mod, f_s, v_s, target)
        human_target = human_exp_name.split('-')[0]
        human_exp_name = human_exp_name.split('-')[1]

        print('-' * 80)
        print('Experiment name: {}'.format(current_exp_name))
        print('Human name: {}\n'.format(human_exp_name))

        # Generate table entries
        entry_iso = {
            table_cols[0]: human_target,
            table_cols[1]: human_exp_name,
        }

        # Load attack data
        wm_config = np.load(os.path.join(current_exp_dir, 'wm_config.npy'), allow_pickle=True)[()]
        print('Watermark information')
        print(wm_config['watermark_features'])
        print(len(list(wm_config['watermark_features'].keys())))
        print(sorted(list(wm_config['watermark_features'].keys())))
        print()

        x_train_w, y_train_w, x_test_mw = defense_utils.load_attack_data(current_exp_dir)

        backdoor_model = defense_filtering.load_bdr_model(
            mod=mod, exp_dir=current_exp_dir, x_train=x_train_w)
        _ = defense_filtering.print_bdr_baseline(x_test_mw, backdoor_model)

        # Dimensionality reduction - Get n most important features
        x_safe, y_safe, safe_model = defense_utils.get_safe_dataset_model(
            mod, safe_pct=0.2, rand=42)
        shap_values_df = defense_utils.get_defensive_shap_dfs(mod, safe_model, x_safe)

        def_feat_sel = feature_selectors.ShapleyFeatureSelector(
            shap_values_df,
            criteria=constants.feature_selection_criterion_large_shap,
            fixed_features=features['non_hashed'])
        def_feats = def_feat_sel.get_features(32)

        x_sel, x_gw_sel, x_mw_sel = defense_utils.reduce_to_feats(x_train_w, def_feats, y_train_w)

        # Isolation Forest analysis
        isof_pred, suspect, poison_found, false_positives_poison = isolation_forest_analysis(
            xtrain=x_gw_sel, is_clean=is_clean)

        print()
        print('Isolation Forest - sel removed points: {}'.format(suspect))
        print('Isolation Forest - sel found: {}'.format(poison_found))

        entry_iso[table_cols[2]] = poison_found
        entry_iso[table_cols[3]] = suspect

        # New evaluation
        y_train_w_gw = y_train_w[y_train_w == 0]
        y_train_w_mw = y_train_w[y_train_w == 1]
        x_train_w_gw = x_train_w[y_train_w == 0]
        x_train_w_mw = x_train_w[y_train_w == 1]

        x_train_w_gw_filtered = x_train_w_gw[isof_pred == 1]
        y_train_w_gw_filtered = y_train_w_gw[isof_pred == 1]

        x_filtered = np.concatenate((x_train_w_mw, x_train_w_gw_filtered), axis=0)
        y_filtered = np.concatenate((y_train_w_mw, y_train_w_gw_filtered), axis=0)
        print('Shape of the filtered data: {} - {}'.format(x_filtered.shape, y_filtered.shape))

        cr_clean, cm_clean, cr_backdoor, cm_backdoor = defense_filtering.evaluate_filtering(
            mod=mod,
            x_train_w_sampled=x_filtered,
            y_train_w_sampled=y_filtered,
            x_test_mw=x_test_mw,
            current_exp_dir='')

        entry_iso[table_cols[4]] = cr_backdoor['accuracy']
        entry_iso[table_cols[5]] = cr_clean['accuracy']

        # Append entries to table
        latexdf = latexdf.append(entry_iso, ignore_index=True)

        print('-' * 80)
        print()

    print(latexdf)
    latexdf.to_csv('table_isof.csv', index=False)
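

# NOTE: isolation_forest_analysis() is defined elsewhere in this module. The
# sketch below illustrates the expected behavior under the assumption that it
# wraps sklearn's IsolationForest: fit on the (feature-reduced) goodware matrix,
# treat outliers as suspects, and report how many known-poisoned points were
# caught. `is_clean` is assumed to be aligned with the rows of `xtrain`
# (1 = clean, 0 = poisoned); hyperparameters here are illustrative only.
def _isolation_forest_analysis_sketch(xtrain, is_clean):
    import numpy as np
    from sklearn.ensemble import IsolationForest

    isof = IsolationForest(n_estimators=100, random_state=42)
    isof_pred = isof.fit_predict(xtrain)  # +1 = inlier (keep), -1 = outlier (remove)

    outlier_mask = isof_pred == -1
    suspect = int(np.sum(outlier_mask))                                   # points removed
    poison_found = int(np.sum(outlier_mask & (is_clean == 0)))            # poisoned points caught
    false_positives_poison = int(np.sum(outlier_mask & (is_clean == 1)))  # clean points removed

    return isof_pred, suspect, poison_found, false_positives_poison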
def poison_pdfs():
    processes = 40
    data_id = 'ogcontagio'

    features, feature_names, name_feat, feat_name = data_utils.load_features([], dataset=data_id)

    gw_dir = os.path.join(constants.CONTAGIO_DATA_DIR, 'old_contagio_goodware/')
    mw_dir = os.path.join(constants.CONTAGIO_DATA_DIR, 'old_contagio_malware/')

    gw_files = sorted(os.listdir(gw_dir))
    mw_files = sorted(os.listdir(mw_dir))
    print('Number of benign files: {}'.format(len(gw_files)))
    print('Number of malicious files: {}'.format(len(mw_files)))

    wm_name = 'ogcontagio__pdfrf__combined_shap__combined_shap__feasible__30'
    wm_size = int(wm_name[-2:])
    print(wm_size)

    watermark = dict(attack_utils.load_watermark(
        wm_file='configs/watermark/' + wm_name, wm_size=wm_size))
    print(watermark)

    # Cast each watermark value to the type expected by the corresponding PDFRate
    # feature and warn if it falls outside the feature's valid range.
    for f, v in watermark.items():
        watermark[f] = featureedit_p3._pdfrate_feature_descriptions[f]['type'](v)
        rng = featureedit_p3._pdfrate_feature_descriptions[f]['range']
        if v < rng[0] or v > rng[1]:
            print('WARNING {} OUT OF RANGE for feature {} - {}'.format(
                v, f, featureedit_p3._pdfrate_feature_descriptions[f]))
    print()
    print(watermark)

    # Goodware - new
    gw_sublists = [gw_files[i::processes] for i in range(processes)]
    gw_data_ins = [(gw_dir, sub_list, watermark) for sub_list in gw_sublists]
    gw_dict = {}

    # Spawn workers and await completion
    p = Pool(processes=processes)
    gw_dictionaries = p.map(watermark_worker, gw_data_ins)
    p.close()
    for gd in gw_dictionaries:
        gw_dict.update(gd)

    # Check backdoor
    gw_ff, gw_ffs, gw_sf, gw_sb, gw_cf = check_watermark(watermark, gw_dict)
    print('Benign files:\n'
          'Number of failed feature changes: {}\n'
          'Features with failed changes: {}\n'
          'Features which did not fail to change: {}\n'
          'Number of successful backdoors: {}\n'
          'Percent of successful backdoors: {:.2f}%\n'.format(
              len(gw_ffs),
              gw_ffs,
              [f for f in watermark.keys() if f not in gw_ffs],
              len(gw_sb),
              len(gw_sb) / len(gw_files) * 100,
          ))

    # Malware - new
    mw_sublists = [mw_files[i::processes] for i in range(processes)]
    mw_data_ins = [(mw_dir, sub_list, watermark) for sub_list in mw_sublists]
    mw_dict = {}

    # Spawn workers and await completion
    p = Pool(processes=processes)
    mw_dictionaries = p.map(watermark_worker, mw_data_ins)
    p.close()
    for gd in mw_dictionaries:
        mw_dict.update(gd)

    # Check backdoor
    mw_ff, mw_ffs, mw_sf, mw_sb, mw_cf = check_watermark(watermark, mw_dict)
    print('Malicious files:\n'
          'Number of failed feature changes: {}\n'
          'Features with failed changes: {}\n'
          'Features which did not fail to change: {}\n'
          'Number of successful backdoors: {}\n'
          'Percent of successful backdoors: {:.2f}%\n'.format(
              len(mw_ffs),
              mw_ffs,
              [f for f in watermark.keys() if f not in mw_ffs],
              len(mw_sb),
              len(mw_sb) / len(mw_files) * 100,
          ))

    # Save files
    # Now we need to save the file names of those PDF files
    # that were correctly poisoned for both benign and malicious files.
    cols = feature_names.tolist() + ['filename', ]
    save_csv(cols=cols, w_sb=gw_sb, w_dict=gw_dict, wt='gw', wm_name=wm_name)
    save_csv(cols=cols, w_sb=mw_sb, w_dict=mw_dict, wt='mw', wm_name=wm_name)
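

# NOTE: watermark_worker() runs in the multiprocessing Pool above and is defined
# elsewhere in the project. A minimal sketch of its contract: each worker gets a
# (directory, file_list, watermark) tuple, applies the watermark to every PDF in
# its shard, and returns a dict mapping file name to the resulting feature
# values. The `edit_fn` parameter below is a hypothetical stand-in for the
# project's actual PDF editing / feature re-extraction routine, so this is an
# illustration of the data flow rather than the real worker.
def _watermark_worker_sketch(data_in, edit_fn):
    import os

    directory, file_list, watermark = data_in
    result = {}
    for fname in file_list:
        # edit_fn: callable(path, watermark) -> mapping of feature name to value
        result[fname] = edit_fn(os.path.join(directory, fname), watermark)
    return result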
def generate_watermark():
    seed = 24
    safe_percentage = 0.2
    data_id = 'ember'

    cfg = common_utils.read_config('configs/attack_cfg_kernelshap.json', atk_def=True)
    cfg['to_json'] = True
    print(cfg)

    mod = cfg['model']
    target = cfg['target_features']
    wm_size = cfg['watermark_size'][0]

    features, feature_names, name_feat, feat_name = data_utils.load_features(
        constants.infeasible_features, data_id)

    # Select the defensive features using clean SHAP values
    x_train, y_train, x_test, y_test, original_model = attack_utils.get_ember_train_test_model()

    _, x_limited, _, y_limited = train_test_split(
        x_train, y_train, test_size=safe_percentage, random_state=seed)
    print(x_limited.shape, y_limited.shape)

    limited_model = notebook_utils.train_model(x_limited, y_limited)

    data_summ = shap.kmeans(x_limited, 30)
    inside_data = data_summ.data
    np.save('kmeans_30_xtrain_limited', inside_data)

    x_train_sel = x_limited[:, features['feasible']]
    print(x_train_sel.shape)
    clusters_sel = inside_data[:, features['feasible']]
    print(clusters_sel.shape)

    import warnings
    warnings.filterwarnings('ignore')

    wrapperino = ModWrap(original_model=limited_model,
                         clusters=inside_data,
                         nsamples=1000,
                         feas_feat=features['feasible'])
    explainer = shap.KernelExplainer(wrapperino.predict, clusters_sel, link='logit')
    exp = explainer.shap_values(x_train_sel, nsamples=200)
    np.save('explanations_limited', exp)

    reconstruced_shap = np.copy(x_limited)
    print(reconstruced_shap.shape)
    reconstruced_shap[:, features['feasible']] = exp
    assert np.allclose(reconstruced_shap[0][features['feasible'][16]], exp[0][16])
    np.save('reconstucted_shaps_limited', reconstruced_shap)

    shap_values_df = pd.DataFrame(reconstruced_shap)

    # ## Setup
    wm_dir = 'configs/watermark'
    if not os.path.exists(wm_dir):
        os.makedirs(wm_dir)

    f_selectors = attack_utils.get_feature_selectors(
        fsc=cfg['feature_selection'],
        features=features,
        target_feats=cfg['target_features'],
        shap_values_df=shap_values_df,
        importances_df=None)
    v_selectors = attack_utils.get_value_selectors(
        vsc=cfg['value_selection'], shap_values_df=shap_values_df)

    feat_value_selector_pairs = common_utils.get_feat_value_pairs(
        feat_sel=list(f_selectors.keys()), val_sel=list(v_selectors.keys()))
    print(feat_value_selector_pairs)

    for (f_s, v_s) in feat_value_selector_pairs:
        current_exp_name = common_utils.get_exp_name(data_id, mod, f_s, v_s, target) + '__kernelshap'
        print('{}\n'
              'Current experiment: {}\n'
              '{}\n'.format('-' * 80, current_exp_name, '-' * 80))

        # Create experiment directories
        current_exp_dir = os.path.join('../results', current_exp_name)
        current_exp_img_dir = os.path.join(current_exp_dir, 'images')
        if not os.path.exists(current_exp_img_dir):
            os.makedirs(current_exp_img_dir)

        # Strategy
        feat_selector = f_selectors[f_s]
        value_selector = v_selectors[v_s]
        if f_s == constants.feature_selection_criterion_combined:
            value_selector = feat_selector

        # Let the feature value selector know about the training set
        if value_selector.X is None:
            value_selector.X = x_limited

        # Get the feature IDs that we'll use
        start_time = time.time()
        if f_s == constants.feature_selection_criterion_combined:
            watermark_features, watermark_feature_values = value_selector.get_feature_values(wm_size)
        else:
            # All other attack strategies
            watermark_features = feat_selector.get_features(wm_size)
            print('Selecting watermark features took {:.2f} seconds'.format(
                time.time() - start_time))

            # Now select some values for those features
            start_time = time.time()
            watermark_feature_values = value_selector.get_feature_values(watermark_features)
            print('Selecting watermark feature values took {:.2f} seconds'.format(
                time.time() - start_time))

        watermark_features_map = OrderedDict()
        for feature, value in zip(watermark_features, watermark_feature_values):
            watermark_features_map[feature_names[feature]] = value
        print(watermark_features_map)

        # Output the watermark on file for reuse
        if cfg['to_json']:
            wm_file_name = '{}__{}'.format(current_exp_name, str(wm_size))
            wm_file = os.path.join(wm_dir, wm_file_name)

            wm_json = {'order': {}, 'map': {}}
            for i, key in enumerate(reversed(watermark_features_map)):
                wm_json['order'][i] = key
                wm_json['map'][key] = str(watermark_features_map[key])

            json.dump(wm_json, open(wm_file, 'w', encoding='utf-8'), indent=2)
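

# NOTE: ModWrap is defined elsewhere in the project. KernelSHAP above is only
# asked to explain the `feasible` feature columns, so the wrapper has to rebuild
# full-width EMBER vectors before querying the model. The sketch below shows one
# plausible way such a wrapper could work, filling the non-feasible columns from
# a fixed background row; the real class may use the k-means `clusters` and
# `nsamples` arguments differently. Illustrative only.
import numpy as np


class _ModWrapSketch:
    def __init__(self, original_model, background_row, feas_feat):
        self.model = original_model                      # model trained on full feature vectors
        self.background = np.asarray(background_row)     # full-width template row
        self.feas_feat = feas_feat                       # indices of the feasible columns

    def predict(self, x_feasible):
        # x_feasible: (n, len(feas_feat)) matrix containing only the feasible columns
        x_full = np.tile(self.background, (x_feasible.shape[0], 1))
        x_full[:, self.feas_feat] = x_feasible
        return self.model.predict(x_full)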
def run_attacks(cfg):
    """
    Run series of attacks.

    :param cfg: (dict) experiment parameters
    """

    print('Config: {}\n'.format(cfg))

    model_id = cfg['model']
    seed = cfg['seed']
    to_save = cfg.get('save', '')
    target = cfg['target_features']
    dataset = cfg['dataset']
    k_perc = cfg['k_perc']
    k_data = cfg['k_data']

    # Set random seed
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

    # Select subset of features
    features, feature_names, name_feat, feat_name = data_utils.load_features(
        feats_to_exclude=constants.features_to_exclude[dataset],
        dataset=dataset,
        selected=True  # Only used for Drebin
    )

    # Get original model and data. Then setup environment.
    x_train, y_train, x_test, y_test = data_utils.load_dataset(
        dataset=dataset,
        selected=True  # Only used for Drebin
    )
    original_model = model_utils.load_model(
        model_id=model_id,
        data_id=dataset,
        save_path=constants.SAVE_MODEL_DIR,
        file_name=dataset + '_' + model_id,
    )

    # Prepare attacker data
    if k_data == 'train':
        if k_perc == 1.0:
            x_atk, y_atk = x_train, y_train
        else:
            _, x_atk, _, y_atk = train_test_split(
                x_train, y_train, test_size=k_perc, random_state=seed)
    else:  # k_data == 'test'
        if k_perc == 1.0:
            x_atk, y_atk = x_test, y_test
        else:
            _, x_atk, _, y_atk = train_test_split(
                x_test, y_test, test_size=k_perc, random_state=seed)
    x_back = x_atk

    print(
        'Dataset shapes:\n'
        '\tTrain x: {}\n'
        '\tTrain y: {}\n'
        '\tTest x: {}\n'
        '\tTest y: {}\n'
        '\tAttack x: {}\n'
        '\tAttack y: {}'.format(
            x_train.shape, y_train.shape, x_test.shape, y_test.shape, x_atk.shape, y_atk.shape
        )
    )

    # Get explanations
    start_time = time.time()
    shap_values_df = model_utils.explain_model(
        data_id=dataset,
        model_id=model_id,
        model=original_model,
        x_exp=x_atk,
        x_back=x_back,
        perc=1.0,
        n_samples=100,
        load=False,
        save=False
    )
    print('Getting SHAP took {:.2f} seconds\n'.format(time.time() - start_time))

    # Setup the attack
    f_selectors = attack_utils.get_feature_selectors(
        fsc=cfg['feature_selection'],
        features=features,
        target_feats=target,
        shap_values_df=shap_values_df,
        importances_df=None  # Deprecated
    )
    print(f_selectors)

    v_selectors = attack_utils.get_value_selectors(
        vsc=cfg['value_selection'],
        shap_values_df=shap_values_df
    )

    feat_value_selector_pairs = common_utils.get_feat_value_pairs(
        feat_sel=list(f_selectors.keys()),
        val_sel=list(v_selectors.keys())
    )

    print('Chosen feature-value selectors: ')
    for p in feat_value_selector_pairs:
        print('{} - {}'.format(p[0], p[1]))

    # If Drebin reload dataset with full features
    if dataset == 'drebin':
        x_train, y_train, x_test, y_test = data_utils.load_dataset(
            dataset=dataset,
            selected=False
        )

    # Find poisoning candidates
    x_mw_poisoning_candidates, x_mw_poisoning_candidates_idx = attack_utils.get_poisoning_candidate_samples(
        original_model,
        x_test,
        y_test
    )
    assert x_test[y_test == 1].shape[0] == x_mw_poisoning_candidates_idx.shape[0]

    # Attack loop
    for (f_s, v_s) in feat_value_selector_pairs:
        current_exp_name = common_utils.get_exp_name(dataset, model_id, f_s, v_s, target)
        print('{}\nCurrent experiment: {}\n{}\n'.format('-' * 80, current_exp_name, '-' * 80))

        # Create experiment directories
        current_exp_dir = os.path.join('results', current_exp_name)
        current_exp_img_dir = os.path.join(current_exp_dir, 'images')
        if not os.path.exists(current_exp_img_dir):
            os.makedirs(current_exp_img_dir)

        # Strategy
        feat_selector = f_selectors[f_s]
        value_selector = v_selectors[v_s]

        # Accumulator
        summaries = []

        start_time = time.time()

        if to_save:
            save_watermarks = os.path.join(to_save, current_exp_name)
            if not os.path.exists(save_watermarks):
                os.makedirs(save_watermarks)
        else:
            save_watermarks = ''

        for summary in attack_utils.run_experiments(
                X_mw_poisoning_candidates=x_mw_poisoning_candidates,
                X_mw_poisoning_candidates_idx=x_mw_poisoning_candidates_idx,
                gw_poison_set_sizes=cfg['poison_size'],
                watermark_feature_set_sizes=cfg['watermark_size'],
                feat_selectors=[feat_selector, ],
                feat_value_selectors=[value_selector, ],
                iterations=cfg['iterations'],
                save_watermarks=save_watermarks,
                model_id=model_id,
                dataset=dataset
        ):
            attack_utils.print_experiment_summary(
                summary,
                feat_selector.name,
                value_selector.name if value_selector is not None else feat_selector.name
            )
            summaries.append(summary)

            print('Exp took {:.2f} seconds\n'.format(time.time() - start_time))
            start_time = time.time()

        # Create DataFrame out of results accumulator and save it
        summaries_df = attack_utils.create_summary_df(summaries)
        print(summaries_df)

        # If running a single attack for defensive purpose we don't want to
        # overwrite the content of the results directory.
        if cfg.get('defense', False):
            continue

        summaries_df.to_csv(
            os.path.join(
                current_exp_dir,
                current_exp_name + '__summary_df.csv'
            )
        )
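

# A hypothetical configuration for run_attacks(), shown for illustration. The
# keys mirror the ones read above; the selector criterion strings must match
# those defined in constants.py and the configs/ JSON files, so the criterion
# names and numeric values below are placeholders rather than the project's
# canonical settings.
_example_attack_cfg = {
    'model': 'lightgbm',
    'dataset': 'ember',
    'seed': 42,
    'save': '',                               # optional: directory for watermark artifacts
    'target_features': 'feasible',
    'k_data': 'train',                        # attacker knowledge source: 'train' or 'test'
    'k_perc': 1.0,                            # fraction of that split available to the attacker
    'feature_selection': ['combined_shap'],   # placeholder criterion name
    'value_selection': ['combined_shap'],     # placeholder criterion name
    'poison_size': [600],                     # placeholder goodware poison set sizes
    'watermark_size': [16],                   # placeholder watermark feature counts
    'iterations': 1,
}
# Typical usage (commented out so nothing runs on import):
# run_attacks(_example_attack_cfg)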
def filtering_defense(cfg):
    # Setup
    seed = cfg['seed']
    np.random.seed(seed)
    random.seed(seed)

    mod = cfg['model']
    method = cfg['clustering']
    target = cfg['target_features']
    safe_mode = cfg['safe']

    base_def_dir = 'results/defense'
    if not os.path.exists(base_def_dir):
        os.makedirs(base_def_dir)

    watermark_sizes = cfg['watermark_size']
    poison_sizes = cfg['poison_size']
    feature_selection = cfg['feature_selection']
    value_selection = cfg['value_selection']

    results = defaultdict(dict)

    features, feature_names, name_feat, feat_name = \
        data_utils.load_features(constants.infeasible_features)

    feat_value_selector_pairs = common_utils.get_feat_value_pairs(
        feat_sel=list(feature_selection),
        val_sel=list(value_selection)
    )

    # Defense parameters
    t_max_size = cfg['t_max'] * constants.EMBER_TRAIN_SIZE
    min_keep_percentage = cfg['min_keep']
    mcs = int(cfg['mcs'] * constants.EMBER_TRAIN_SIZE)
    ms = int(cfg['ms'] * constants.EMBER_TRAIN_SIZE)
    print(
        'Minimum cluster size: {}\n'
        'Minimum samples: {}'.format(mcs, ms)
    )

    for w_s in watermark_sizes:
        for p_s in poison_sizes:
            is_clean = defense_utils.get_is_clean(p_s)
            bdr_indices = set(np.argwhere(is_clean == 0).flatten().tolist())

            for (f_s, v_s) in feat_value_selector_pairs:
                # Generate current exp/dir names
                def_dir = os.path.join(base_def_dir, str(w_s), str(p_s))
                current_exp_name = common_utils.get_exp_name(mod, f_s, v_s, target)
                current_exp_dir = os.path.join(def_dir, current_exp_name)

                # Check if attack data is available
                if not check_data(def_dir, current_exp_name):
                    cfg_copy = copy.deepcopy(cfg)
                    cfg_copy['watermark_size'] = [w_s, ]
                    cfg_copy['poison_size'] = [p_s, ]
                    cfg_copy['feature_selection'] = [f_s, ]
                    cfg_copy['value_selection'] = [v_s, ]
                    run_single_attack(cfg_copy, def_dir)

                # Prepare feature importance/SHAPs DataFrame
                if safe_mode:
                    # Assume small percentage of safe data
                    x_safe, y_safe, safe_model = defense_utils.get_safe_dataset_model(
                        mod, safe_pct=0.2, rand=seed
                    )
                    shap_values_df = defense_utils.get_defensive_shap_dfs(
                        mod, safe_model, x_safe
                    )
                else:
                    # Assume defender has access to full clean model/data
                    shap_values_df = get_original_shap(mod, feature_names)

                # Load attack data
                x_train_w, y_train_w, x_test_mw = \
                    defense_utils.load_attack_data(current_exp_dir)
                backdoor_model = load_bdr_model(
                    mod=mod,
                    exp_dir=current_exp_dir,
                    x_train=x_train_w
                )

                # Baselines on the attacked model
                print_bdr_baseline(x_test_mw, backdoor_model)

                # Get n most important features
                def_feat_sel = feature_selectors.ShapleyFeatureSelector(
                    shap_values_df,
                    criteria=constants.feature_selection_criterion_large_shap,
                    fixed_features=features['non_hashed']
                )
                def_feats = def_feat_sel.get_features(cfg['topfeats'])
                print('Top {} selected defensive features:\n{}'.format(
                    cfg['topfeats'], def_feats
                ))

                # Dimensionality reduction through feature selection
                x_sel, x_gw_sel, x_mw_sel = defense_utils.reduce_to_feats(
                    x_train_w, def_feats, y_train_w
                )
                assert x_sel.shape[0] == x_train_w.shape[0]
                assert x_sel.shape[1] == cfg['topfeats']

                x_gw_sel_std = defense_utils.standardize_data(x_gw_sel)

                print('-' * 80)
                print('Current experiment: {}'.format(current_exp_name))
                print('-' * 80)

                # Clustering
                clustering, clustering_labels = defensive_clustering(
                    method=method,
                    x_gw=x_gw_sel_std,
                    mcs=mcs,
                    ms=ms,
                    current_exp_dir=current_exp_dir
                )

                # Cluster analysis
                silh, avg_silh, cluster_sizes, evals = cluster_analysis(
                    x_gw=x_gw_sel_std,
                    clustering_labels=clustering_labels,
                    is_clean=is_clean,
                    current_exp_dir=current_exp_dir
                )

                # Filter
                x_train_w_sampled, y_train_w_sampled, selected, selected_per_cluster = filter_clusters(
                    x_train_w=x_train_w,
                    y_train_w=y_train_w,
                    avg_silh=avg_silh,
                    cluster_sizes=cluster_sizes,
                    clustering_labs=clustering_labels,
                    threshold_max_size=t_max_size,
                    min_keep_percentage=min_keep_percentage
                )
                results[(w_s, p_s, f_s, v_s)]['selected'] = selected
                results[(w_s, p_s, f_s, v_s)]['selected_per_cluster'] = selected_per_cluster

                # Evaluation
                cr_clean, cm_clean, cr_backdoor, cm_backdoor = evaluate_filtering(
                    mod=mod,
                    x_train_w_sampled=x_train_w_sampled,
                    y_train_w_sampled=y_train_w_sampled,
                    x_test_mw=x_test_mw,
                    current_exp_dir=current_exp_dir,
                )
                results[(w_s, p_s, f_s, v_s)]['cr_clean'] = cr_clean
                results[(w_s, p_s, f_s, v_s)]['cm_clean'] = cm_clean
                results[(w_s, p_s, f_s, v_s)]['cr_backdoor'] = cr_backdoor
                results[(w_s, p_s, f_s, v_s)]['cm_backdoor'] = cm_backdoor

                # Spectral signatures-like approach
                to_remove_gh, to_remove_pa, found_gh, found_pa = defense_utils.spectral_remove_lists(
                    x_gw_sel_std, bdr_indices
                )
                results[(w_s, p_s, f_s, v_s)]['to_remove_gh'] = to_remove_gh
                results[(w_s, p_s, f_s, v_s)]['to_remove_pa'] = to_remove_pa
                results[(w_s, p_s, f_s, v_s)]['found_gh'] = found_gh
                results[(w_s, p_s, f_s, v_s)]['found_pa'] = found_pa

                x_train_w_filtered_gh, y_train_w_filtered_gh = defense_utils.filter_list(
                    x_train_w, y_train_w, to_remove_gh
                )
                cr_clean_gh, cm_clean_gh, cr_backdoor_gh, cm_backdoor_gh = evaluate_filtering(
                    mod=mod,
                    x_train_w_sampled=x_train_w_filtered_gh,
                    y_train_w_sampled=y_train_w_filtered_gh,
                    x_test_mw=x_test_mw,
                    current_exp_dir=current_exp_dir,
                    modifier='gh'
                )
                results[(w_s, p_s, f_s, v_s)]['cr_clean_gh'] = cr_clean_gh
                results[(w_s, p_s, f_s, v_s)]['cm_clean_gh'] = cm_clean_gh
                results[(w_s, p_s, f_s, v_s)]['cr_backdoor_gh'] = cr_backdoor_gh
                results[(w_s, p_s, f_s, v_s)]['cm_backdoor_gh'] = cm_backdoor_gh

                x_train_w_filtered_pa, y_train_w_filtered_pa = defense_utils.filter_list(
                    x_train_w, y_train_w, to_remove_pa
                )
                cr_clean_pa, cm_clean_pa, cr_backdoor_pa, cm_backdoor_pa = evaluate_filtering(
                    mod=mod,
                    x_train_w_sampled=x_train_w_filtered_pa,
                    y_train_w_sampled=y_train_w_filtered_pa,
                    x_test_mw=x_test_mw,
                    current_exp_dir=current_exp_dir,
                    modifier='pa'
                )
                results[(w_s, p_s, f_s, v_s)]['cr_clean_pa'] = cr_clean_pa
                results[(w_s, p_s, f_s, v_s)]['cm_clean_pa'] = cm_clean_pa
                results[(w_s, p_s, f_s, v_s)]['cr_backdoor_pa'] = cr_backdoor_pa
                results[(w_s, p_s, f_s, v_s)]['cm_backdoor_pa'] = cm_backdoor_pa

    np.save(os.path.join(base_def_dir, mod + '__def_dict'), results)
    return results
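

# NOTE: defense_utils.spectral_remove_lists() implements a spectral
# signatures-style filter (in the spirit of Tran et al., NeurIPS 2018) and is
# defined elsewhere. The sketch below shows the core scoring step only: center
# the standardized goodware matrix, project it onto its top right singular
# vector, and flag the samples with the largest squared projections. The
# 'gh'/'pa' removal-list variants returned above are not reproduced here, and
# remove_frac is an illustrative parameter.
def _spectral_scores_sketch(x, remove_frac=0.05):
    import numpy as np

    x = np.asarray(x, dtype=float)
    centered = x - x.mean(axis=0)

    # Top right singular vector of the centered data matrix
    _, _, vt = np.linalg.svd(centered, full_matrices=False)
    scores = (centered @ vt[0]) ** 2

    n_remove = int(remove_frac * x.shape[0])
    to_remove = np.argsort(scores)[-n_remove:]  # highest-scoring samples are the suspects
    return scores, set(to_remove.tolist())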
def run_attacks(cfg):
    """
    Run series of attacks.

    :param cfg: (dict) experiment parameters
    """

    print('Config: {}\n'.format(cfg))

    model_id = cfg['model']
    seed = cfg['seed']
    to_save = cfg.get('save', '')
    target = cfg['target_features']
    dataset = cfg['dataset']
    # Workaround until we fix ordering of feature selector outputs
    wm_size = cfg['watermark_size'][0]

    # Set random seed
    random.seed(seed)
    np.random.seed(seed)
    tf.random.set_seed(seed)

    # Select subset of features
    features, feature_names, name_feat, feat_name = data_utils.load_features(
        feats_to_exclude=constants.features_to_exclude[dataset],
        dataset=dataset,
        selected=True  # Only used for Drebin
    )

    # Get original model and data. Then setup environment.
    x_train, y_train, x_test, y_test = data_utils.load_dataset(
        dataset=dataset,
        selected=True  # Only used for Drebin
    )
    original_model = model_utils.load_model(
        model_id=model_id,
        data_id=dataset,
        save_path=constants.SAVE_MODEL_DIR,
        file_name=dataset + '_' + model_id,
    )

    # Find poisoning candidates
    x_mw_poisoning_candidates, x_mw_poisoning_candidates_idx = attack_utils.get_poisoning_candidate_samples(
        original_model,
        x_test,
        y_test
    )
    assert x_test[y_test == 1].shape[0] == x_mw_poisoning_candidates_idx.shape[0]

    # Load saved watermark
    fixed_wm = attack_utils.load_watermark(cfg['wm_file'], wm_size, name_feat)

    # Setup the attack
    f_selectors = attack_utils.get_feature_selectors(
        fsc=[constants.feature_selection_criterion_fix, ],
        features=features,
        target_feats=target,
        shap_values_df=None,
        importances_df=None,
        feature_value_map=fixed_wm
    )

    feat_value_selector_pairs = [(
        constants.feature_selection_criterion_fix,
        constants.value_selection_criterion_fix
    ), ]

    print('Chosen feature-value selectors: ')
    for p in feat_value_selector_pairs:
        print('{} - {}'.format(p[0], p[1]))

    # Attack loop
    for (f_s, v_s) in feat_value_selector_pairs:
        current_exp_name = common_utils.get_exp_name(dataset, model_id, f_s, v_s, target)
        print('{}\nCurrent experiment: {}\n{}\n'.format('-' * 80, current_exp_name, '-' * 80))

        # Create experiment directories
        current_exp_dir = os.path.join('results', current_exp_name)
        current_exp_img_dir = os.path.join(current_exp_dir, 'images')
        if not os.path.exists(current_exp_img_dir):
            os.makedirs(current_exp_img_dir)

        # Strategy
        feat_selector = f_selectors[f_s]
        value_selector = feat_selector

        # Accumulator
        summaries = []

        start_time = time.time()

        if to_save:
            save_watermarks = os.path.join(to_save, current_exp_name)
            if not os.path.exists(save_watermarks):
                os.makedirs(save_watermarks)
        else:
            save_watermarks = ''

        for summary in attack_utils.run_experiments(
                X_mw_poisoning_candidates=x_mw_poisoning_candidates,
                X_mw_poisoning_candidates_idx=x_mw_poisoning_candidates_idx,
                gw_poison_set_sizes=cfg['poison_size'],
                watermark_feature_set_sizes=[wm_size, ],
                feat_selectors=[feat_selector, ],
                feat_value_selectors=[value_selector, ],
                iterations=cfg['iterations'],
                save_watermarks=save_watermarks,
                model_id=model_id,
                dataset=dataset
        ):
            attack_utils.print_experiment_summary(
                summary,
                feat_selector.name,
                value_selector.name if value_selector is not None else feat_selector.name
            )
            summaries.append(summary)

            print('Exp took {:.2f} seconds\n'.format(time.time() - start_time))
            start_time = time.time()

        # Create DataFrame out of results accumulator and save it
        summaries_df = attack_utils.create_summary_df(summaries)
        print(summaries_df)

        # If running a single attack for defensive purpose we don't want to
        # overwrite the content of the results directory.
        if cfg.get('defense', False):
            continue

        summaries_df.to_csv(
            os.path.join(
                current_exp_dir,
                current_exp_name + '__summary_df.csv'
            )
        )
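

# NOTE: attack_utils.load_watermark() is defined elsewhere; it reads back the
# JSON files written by generate_watermark()/get_watermarks() below, which store
# an 'order' dict (index -> feature name) and a 'map' dict (feature name ->
# value). A minimal reader sketch, assuming numeric feature values; the real
# helper can additionally translate feature names to feature ids via the
# `name_feat` mapping passed above.
def _load_watermark_sketch(wm_file, wm_size):
    import json
    from collections import OrderedDict

    with open(wm_file, 'r', encoding='utf-8') as fp:
        wm_json = json.load(fp)

    wm = OrderedDict()
    for i in range(wm_size):
        feat_name = wm_json['order'][str(i)]  # JSON object keys are strings
        wm[feat_name] = float(wm_json['map'][feat_name])
    return wm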
def get_watermarks(cfg):
    model_id = cfg['model']
    watermark_sizes = cfg['watermark_size']
    target = cfg['target_features']
    dataset = cfg['dataset']
    k_perc = cfg['k_perc']
    k_data = cfg['k_data']
    seed = cfg['seed']

    wm_dir = 'configs/watermark'
    if not os.path.exists(wm_dir):
        os.makedirs(wm_dir)

    # Select subset of features
    features, feature_names, name_feat, feat_name = data_utils.load_features(
        feats_to_exclude=constants.features_to_exclude[dataset],
        dataset=dataset)

    # Get original model and data. Then setup environment.
    x_train, y_train, x_test, y_test = data_utils.load_dataset(dataset=dataset)
    original_model = model_utils.load_model(
        model_id=model_id,
        data_id=dataset,
        save_path=constants.SAVE_MODEL_DIR,
        file_name=dataset + '_' + model_id,
    )

    # Prepare attacker data
    if k_data == 'train':
        if k_perc == 1.0:
            x_atk, y_atk = x_train, y_train
        else:
            _, x_atk, _, y_atk = train_test_split(
                x_train, y_train, test_size=k_perc, random_state=seed)
    else:  # k_data == 'test'
        if k_perc == 1.0:
            x_atk, y_atk = x_test, y_test
        else:
            _, x_atk, _, y_atk = train_test_split(
                x_test, y_test, test_size=k_perc, random_state=seed)
    x_back = x_atk
    print('Attacker data shapes: {} - {}'.format(x_atk.shape, y_atk.shape))

    # Get explanations
    shap_values_df = model_utils.explain_model(
        data_id=dataset,
        model_id=model_id,
        model=original_model,
        x_exp=x_atk,
        x_back=x_back,
        perc=k_perc,
        n_samples=1000,
        load=False,
        save=False)

    # Setup the attack
    f_selectors = attack_utils.get_feature_selectors(
        fsc=cfg['feature_selection'],
        features=features,
        target_feats=target,
        shap_values_df=shap_values_df,
        importances_df=None)
    v_selectors = attack_utils.get_value_selectors(
        vsc=cfg['value_selection'], shap_values_df=shap_values_df)
    print('Value selectors:')
    print(v_selectors)

    feat_value_selector_pairs = common_utils.get_feat_value_pairs(
        f_selectors.keys(), v_selectors.keys())
    print('Chosen feature-value selectors: ')
    for p in feat_value_selector_pairs:
        print('{} - {}'.format(p[0], p[1]))

    strategy_watermarks = OrderedDict()

    for wm_size in watermark_sizes:
        for (f_s, v_s) in feat_value_selector_pairs:
            current_exp_name = common_utils.get_exp_name(dataset, model_id, f_s, v_s, target)
            print('{}\nCurrent experiment: {}\n{}\n'.format(
                '-' * 80, current_exp_name, '-' * 80))

            # Strategy
            feat_selector = f_selectors[f_s]
            value_selector = v_selectors[v_s]
            if f_s == constants.feature_selection_criterion_combined \
                    or f_s == constants.feature_selection_criterion_combined_additive:
                value_selector = feat_selector

            # Let the feature value selector know about the training set
            if value_selector is None:
                feat_selector.X = x_atk
            elif value_selector.X is None:
                value_selector.X = x_atk

            # Get the feature IDs that we'll use
            start_time = time.time()
            if f_s == constants.feature_selection_criterion_combined \
                    or f_s == constants.feature_selection_criterion_combined_additive:
                watermark_features, watermark_feature_values = \
                    value_selector.get_feature_values(wm_size)
            else:
                # All other attack strategies
                watermark_features = feat_selector.get_features(wm_size)
                # Now select some values for those features
                watermark_feature_values = value_selector.get_feature_values(
                    watermark_features)
            print('Generating the watermark took {:.2f} seconds'.format(
                time.time() - start_time))

            watermark_features_map = OrderedDict()
            for feature, value in zip(watermark_features, watermark_feature_values):
                watermark_features_map[feature_names[feature]] = value
            print(watermark_features_map)
            strategy_watermarks[(f_s, v_s, wm_size)] = watermark_features_map

            # Output the watermark on file for reuse
            wm_file_name = '{}__{}'.format(current_exp_name, str(wm_size))
            wm_file = os.path.join(wm_dir, wm_file_name)

            wm_json = {'order': {}, 'map': {}}
            for i, key in enumerate(reversed(watermark_features_map)):
                wm_json['order'][i] = key
                wm_json['map'][key] = watermark_features_map[key]

            json.dump(wm_json, open(wm_file, 'w', encoding='utf-8'), indent=2)

    return strategy_watermarks
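

# For reference, the watermark files written above have the following shape:
# an 'order' object (insertion index -> feature name) and a 'map' object
# (feature name -> value). The feature names and values in this example are
# purely illustrative (taken from the docstring example in run_experiments):
#
# {
#   "order": {
#     "0": "major_operating_system_version",
#     "1": "num_read_and_execute_sections"
#   },
#   "map": {
#     "major_operating_system_version": 80000,
#     "num_read_and_execute_sections": 100
#   }
# }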