def process_dataset(df, seed): ''' Process the entire dataset just one time to save memory param df pandas dataframe :rtype: Tuple(pandas.dataframe) :return: The original arguments as a tuple and their concatenation ''' # df = df[df.source == src] # df = df[df.header_characteristics_bit13 == False] print("label encoding of strings features") df = exp_util.label_encode(df, res_dir) # df = exp_util.balance_four_sets(df, seed) if model_name == 'nn': # print("balancing per packer") cols = exp_util.DROP_COLUMNS + ['generic_fileSize'] cols = [c for c in cols if c in df.columns] df = df[cols] df = exp_util.remove_large_samples(df) df = exp_util.balance_per_packer(df, seed, packers, minsize=4000) df = exp_util.import_bytes(df) df = df.drop(['generic_fileSize'], axis=1) else: # print("balancing per packer") df = exp_util.balance_per_packer(df, seed, packers) df = df.astype(np.float32, errors='ignore') return df
def process_dataset(df, seed):
    """Process the entire dataset once (done a single time to save memory).

    Drops the 'dolphin-dropper-3' packer, balances the packed samples per
    packer while keeping every unpacked ('none') sample, label-encodes the
    string features, balances the sets, and casts to float32 best-effort.

    :param df: pandas DataFrame of samples.
    :param seed: random seed for the balancing helpers.
    :return: the processed DataFrame.
    """
    df = df[df.packer_name != 'dolphin-dropper-3']
    packers = [p for p in df.packer_name.unique() if p != 'none']
    packed = df[df.packer_name.isin(packers)]
    keep = set(exp_util.balance_per_packer(packed, seed, packers).index)
    keep |= set(df[df.packer_name == 'none'].index)
    df = df[df.index.isin(keep)]
    print("label encoding of strings features")
    df = exp_util.label_encode(df, res_dir)
    df = exp_util.balance_sets(df, seed, mode=0)
    return df.astype(np.float32, errors='ignore')
def process_dataset(df, seed):
    """Process the entire dataset once (done a single time to save memory).

    Builds a "good packers vs bad packers" split: the training set pairs
    benign samples of the good packers with malicious samples of the
    remaining packers; the test set is the reverse pairing.  Each set is
    balanced by down-sampling to the smaller class.  The selected row
    indices are published through the module globals ``train_indices`` and
    ``test_indices``.

    :param df: pandas DataFrame of samples.
    :param seed: random seed for the balancing/sampling calls.
    :return: the processed DataFrame restricted to train+test indices,
        label-encoded and best-effort cast to float32.
    """
    df = df[df.source == src]
    df = exp_util.balance_per_packer(df, seed)
    all_packers = df.packer_name.unique()
    # sanity check: every configured good packer must survive the balancing
    for gp in good_packers:
        assert gp in all_packers
    good_df = df[df.packer_name.isin(good_packers)]
    bad_df = df[~df.packer_name.isin(good_packers)]
    good_df_b = good_df[good_df.benign]
    good_df_m = good_df[good_df.malicious]
    bad_df_b = bad_df[bad_df.benign]
    bad_df_m = bad_df[bad_df.malicious]
    # free the large intermediates before sampling to keep peak memory down
    del good_df, bad_df
    import gc
    gc.collect()
    global train_indices, test_indices
    # training set = good_df_b + bad_df_m ---- keep in mind that we want a
    # balanced training set, so down-sample both halves to the smaller one:
    n = min(len(good_df_b), len(bad_df_m))
    good_df_b = good_df_b.sample(n, random_state=seed)
    bad_df_m = bad_df_m.sample(n, random_state=seed)
    train_indices = set(good_df_b.index).union(set(bad_df_m.index))
    del good_df_b, bad_df_m
    gc.collect()
    # test set = good_df_m + bad_df_b ---- keep in mind that we want a
    # balanced test set also, so:
    n = min(len(good_df_m), len(bad_df_b))
    good_df_m = good_df_m.sample(n, random_state=seed)
    bad_df_b = bad_df_b.sample(n, random_state=seed)
    test_indices = set(good_df_m.index).union(set(bad_df_b.index))
    del good_df_m, bad_df_b
    gc.collect()
    indices = train_indices.union(set(test_indices))
    df = df[df.index.isin(indices)]
    print("label encoding of strings features")
    df = exp_util.label_encode(df, res_dir)
    # best-effort cast: non-convertible columns are left untouched
    df = df.astype(np.float32, errors='ignore')
    print("done with converting to float")
    return df
def process_dataset(df, seed):
    """Process the entire dataset once (done a single time to save memory).

    Training set: per-packer balanced samples of source ``src`` that
    originate from 'wild-ember' (directly or via ``unpacked_sample_id``).
    Test set: class-balanced packed samples from the 'wild' source.  The
    selected row indices are published through the module globals
    ``train_indices`` and ``test_indices``.

    :param df: pandas DataFrame of samples.
    :param seed: random seed for the balancing/sampling calls.
    :return: the processed, label-encoded DataFrame restricted to the
        train+test indices (no float cast on this path).
    """
    ember_indices = set(df[df.source == 'wild-ember'].index)
    # also include packed samples whose unpacked counterpart is from ember
    ember_indices = ember_indices.union(set(df[df.unpacked_sample_id.isin(ember_indices)].index))
    dfp = df[df.source == src]
    dfp = dfp[dfp.index.isin(ember_indices)]
    dfp = exp_util.balance_per_packer(dfp, seed, minsize=5000)  # for memory usage, we pick 5000 benign and 5000 malicious samples per each packer (instead of 6600~)
    wild = df[df.source == 'wild']
    # wild = wild[~wild.index.isin(list(dfp.unpacked_sample_id))]
    wild = wild[wild.packed]
    wildb = wild[wild.benign]
    wildm = wild[wild.malicious]
    # down-sample the larger class so benign/malicious are balanced
    l = min(len(wildb), len(wildm))
    assert l >= 1000
    wildb = wildb.sample(l, random_state=seed)
    wildm = wildm.sample(l, random_state=seed)
    global test_indices
    test_indices = set(wildb.index)
    test_indices = test_indices.union(set(wildm.index))
    # free intermediates before the final selection to keep peak memory down
    del wildm, wildb
    global train_indices
    train_indices = set(dfp.index)
    indices = train_indices.union(test_indices)
    del dfp
    del wild
    import gc
    gc.collect()
    df = df[df.index.isin(indices)]
    print("label encoding of strings features")
    df = exp_util.label_encode(df, res_dir)
    gc.collect()
    # import numpy as np
    # df = df.astype(np.float32, errors='ignore', copy=False)
    # print("done with converting to float")
    return df
def process_dataset(df, seed):
    """Process the entire dataset once (done a single time to save memory).

    Label-encodes the string features, balances the samples for the single
    target packer, and casts the frame to float32 where possible.

    :param df: pandas DataFrame of samples.
    :param seed: random seed for the balancing helper.
    :return: the processed DataFrame.
    """
    print("label encoding of strings features")
    encoded = exp_util.label_encode(df, res_dir)
    balanced = exp_util.balance_per_packer(encoded, seed, [packer])
    # best-effort cast: non-convertible columns are left untouched
    return balanced.astype(np.float32, errors='ignore')
def process_dataset(df, seed):
    """Process the entire dataset once (done a single time to save memory).

    Restricts the data to the configured ``packers``, balances per packer,
    then splits the row indices into the module globals ``train_indices``
    (rows whose packer is in ``train_packer``) and ``test_indices`` (rows
    whose packer is in ``test_packer``) before label-encoding and casting
    to float32 best-effort.

    :param df: pandas DataFrame of samples.
    :param seed: random seed for the balancing helper.
    :return: the processed DataFrame restricted to train+test indices.
    """
    # FIX: compute the set of known packer names once instead of re-running
    # df.packer_name.unique() (a full-column scan) on every loop iteration
    known_packers = set(df.packer_name.unique())
    for p in packers:
        assert p in known_packers
    df = df[df.packer_name.isin(packers)]
    df = exp_util.balance_per_packer(df, seed)
    global test_indices, train_indices
    train_indices = set(df[df.packer_name.isin(train_packer)].index)
    test_indices = set(df[df.packer_name.isin(test_packer)].index)
    indices = train_indices.union(test_indices)
    df = df[df.index.isin(indices)]
    print("label encoding of strings features")
    df = exp_util.label_encode(df, res_dir)
    print("converting to float")
    import numpy as np
    # copy=False avoids a duplicate frame in memory when no cast is needed
    df = df.astype(np.float32, errors='ignore', copy=False)
    print("done with converting")
    return df
def process_dataset(df, seed):
    """Process the entire dataset once (done a single time to save memory).

    Builds a class-balanced subset of the wild sources plus a per-packer
    balanced subset of the 'lab-v3' source, then label-encodes the string
    features.

    :param df: pandas DataFrame of samples.
    :param seed: random seed for the sampling/balancing calls.
    :return: the processed DataFrame restricted to the selected indices
        (no float cast on this path).
    """
    wild = df[df.source.isin(util.WILD_SRC)]
    wildb = wild[wild.benign]
    wildm = wild[wild.malicious]
    # down-sample the larger class so benign/malicious are balanced
    n = min(len(wildb), len(wildm))
    wildb = wildb.sample(n, random_state=seed)
    wildm = wildm.sample(n, random_state=seed)
    indices = list(wildb.index) + list(wildm.index)
    # free the large intermediates before the next selection to keep peak
    # memory down
    del wild
    del wildb
    del wildm
    import gc
    gc.collect()
    dfp = df[df.source == 'lab-v3']
    dfp = exp_util.balance_per_packer(dfp, seed)
    indices.extend(list(dfp.index))
    del dfp
    gc.collect()
    df = df[df.index.isin(indices)]
    print("label encoding of strings features")
    # NOTE(review): unlike the other variants, label_encode is called here
    # without res_dir — confirm this matches exp_util.label_encode's
    # signature in this configuration.
    df = exp_util.label_encode(df)
    # df = exp_util.balance_four_sets(df, seed)
    # df = df.astype(np.float32, errors='ignore')
    return df