def get_ow_train_test(path, outpath_train, outpath_test):
    """
    Extract open-world instances, randomly take 20,000 of them as the
    training set and use the remaining instances as the testing set,
    then write each split to its own csv file.

    :param path: csv file holding the open-world instances
    :param outpath_train: output csv path for the training split
    :param outpath_test: output csv path for the testing split
    :return: None (results are written to disk)
    """
    "get ow instnace"
    x, y = utils_wf.get_ow_data(path)
    x = pd.DataFrame(x)

    # Use a set for O(1) membership tests below; the original list made
    # the loop O(n * 20000).
    train_ids = set(random.sample(range(len(y)), 20000))

    X_train, X_test, y_train, y_test = [], [], [], []
    for i in range(len(y)):
        if i in train_ids:
            X_train.append(x.iloc[i, :])
            y_train.append(y[i])
        else:
            X_test.append(x.iloc[i, :])
            y_test.append(y[i])

    train_ow = utils_wf.convert2dataframe(X_train, y_train)
    test_ow = utils_wf.convert2dataframe(X_test, y_test)
    utils_wf.write2csv(train_ow, outpath_train)
    utils_wf.write2csv(test_ow, outpath_test)
def main2csv():
    """
    Convert .burst files to csv files for the test and train splits.

    For each split, every '*.burst' file in the defended_batch folder is
    loaded (the class label is encoded before the first '-' in the file
    name), transformed to burst representation and written to one csv.
    """
    data_type = ['test', 'train']
    # 'split' instead of the original 'type' to avoid shadowing the builtin.
    for split in data_type:
        path = '../data/WalkieTalkie/defended_batch/%s/' % split
        out_path = '../data/WalkieTalkie/defended_csv/adv_%s_WT.csv' % split
        data_list = []
        labels = []
        for item in os.listdir(path):
            # endswith is clearer than slicing item[-6:] and equivalent here.
            if item.endswith('.burst'):
                labels.append(int(item.split('-')[0]))
                data_list.append(load_burst_file(path + item))

        "binary to burst"
        data_burst, data_burst_noSlice = utils_wf.burst_transform(
            data_list, slice_threshold=512)
        data_df = utils_wf.convert2dataframe(data_burst_noSlice, labels,
                                             mode='padding')
        utils_wf.write2csv(data_df, out_path)
        print('{} ... saved successfully.'.format(out_path))
def merge_data(slice_threshold):
    """
    Merge train/test/valid pkl data into one csv file.

    The merged data is preprocessed (traces with fewer than 50 packets
    or starting with an incoming packet are removed) and written out in
    burst format.

    :param slice_threshold: fixed burst length used by burst_transform
    :return: None (result is written to disk)
    """
    data_folder = '../data/wf_ow/'
    # data_folder already ends with '/', so don't add another one.
    out_folder = data_folder + 'input_size_' + str(slice_threshold) + '/'
    # makedirs(exist_ok=True) avoids the check-then-create race of
    # os.path.exists + os.mkdir.
    os.makedirs(out_folder, exist_ok=True)

    X_path = [
        data_folder + 'X_train_NoDef.pkl',
        data_folder + 'X_test_Unmon_NoDef.pkl',
        data_folder + 'X_test_Mon_NoDef.pkl',
        data_folder + 'X_valid_NoDef.pkl'
    ]
    Y_path = [
        data_folder + 'y_train_NoDef.pkl',
        data_folder + 'y_test_Unmon_NoDef.pkl',
        data_folder + 'y_test_Mon_NoDef.pkl',
        data_folder + 'y_valid_NoDef.pkl'
    ]
    out_path = out_folder + 'data_NoDef_processed.csv'

    X, Y = [], []
    for x_path, y_path in zip(X_path, Y_path):
        X += utils_wf.load_pkl_data(x_path)
        Y += utils_wf.load_pkl_data(y_path)
    print('data instances after merged: {}'.format(len(Y)))

    "remove less than 50 packets and starting with incoming packet"
    X_new, Y_new = utils_wf.data_preprocess(X, Y)
    print('data instances after processed: {}'.format(len(Y_new)))

    "convert to burst"
    x_burst, _ = utils_wf.burst_transform(X_new, slice_threshold)
    data_new = utils_wf.convert2dataframe(x_burst, Y_new)
    utils_wf.write2csv(data_new, out_path)
def tranform2burst(path, out_path, slice_threshold):
    """
    Load processed source data from csv, transform it to bursts with a
    certain fixed size and write the result to csv.

    NOTE(review): the name keeps its original spelling ('tranform') so
    existing callers are unaffected.

    :param path: input csv with processed source data
    :param out_path: destination csv for the burst data
    :param slice_threshold: fixed burst length used by burst_transform
    """
    x, y = utils_wf.load_csv_data(path)
    # The unsliced burst output is not needed here.
    x_burst, _ = utils_wf.burst_transform(x, slice_threshold)
    burst_data = utils_wf.convert2dataframe(x_burst, y, mode='padding')
    utils_wf.write2csv(burst_data, out_path)
def extract_balance_data(path, num, outpath, data_type):
    """
    Extract `num` instances for each class and write them to csv.

    :param path: input csv file
    :param num: number of instances to keep per class
    :param outpath: destination csv
    :param data_type: 'cw' (closed world) or 'ow' (open world)
    :raises ValueError: for any other data_type (the original code fell
        through to an opaque NameError on X/Y instead)
    """
    if data_type == 'cw':
        X, Y = utils_wf.extract_data_each_class(path, num)
    elif data_type == 'ow':
        X, Y = utils_wf.extract_data_each_class_ow(path, num)
    else:
        raise ValueError("data_type must be 'cw' or 'ow', got %r" % (data_type,))
    data = utils_wf.convert2dataframe(X, Y, mode='NoPadding')
    utils_wf.write2csv(data, outpath)
def build_ow_train(path1, path2, outpath):
    """
    Concatenate two datasets into a single training csv.

    :param path1: ow train data
    :param path2: cw train data
    :param outpath: destination csv for the merged data
    :return: None (result is written to disk)
    """
    frames = []
    for src in (path1, path2):
        X, y = utils_wf.load_csv_data(src)
        frames.append(utils_wf.convert2dataframe(X, y))
    train_merge = pd.concat(frames)
    utils_wf.write2csv(train_merge, outpath)
def gen_split_file(split_id, X, y, Adversary):
    """
    Write each cross-validation split to train/test csv files.

    :param split_id: iterable of (train_index, test_index) pairs
    :param X: feature DataFrame to split
    :param y: label Series to split
    :param Adversary: tag used by get_output_name to build file names
    :return: None (splits are written to disk)
    """
    # Unpack the pair in the loop header; the original bound it to 'id',
    # shadowing the builtin, and indexed id[0]/id[1] manually.
    for i, (train_id, test_id) in enumerate(split_id):
        print('*' * 30)
        print('split %d' % i)
        print('train_id', len(train_id))
        print('test id', len(test_id))
        df_train = utils_wf.convert2dataframe(X.iloc[train_id], y.iloc[train_id])
        df_test = utils_wf.convert2dataframe(X.iloc[test_id], y.iloc[test_id])
        utils_wf.write2csv(df_train, get_output_name('train', i, Adversary))
        utils_wf.write2csv(df_test, get_output_name('test', i, Adversary))
def pkl2burst_csv(slice_threshold):
    """
    Load trace data from csv, transform it to bursts and write it to csv.

    No preprocessing: the original traces are kept as-is — instances with
    fewer than 50 packets or starting with an incoming packet are NOT
    removed.

    :param slice_threshold: fixed burst length used by burst_transform
    :return: None (result is written to disk)
    """
    # Removed the dead commented-out pkl-loading variants and the unused
    # 'type'/'folder_name'/'data_name' locals ('type' shadowed the builtin).
    "processing wang data csv file"
    file_id = 'wang_UnMon'
    x_path = '../data/wf_wang/' + file_id + '.csv'
    out_path = '../data/wf_wang/' + file_id + '_burst.csv'

    x, y = utils_wf.load_csv_data(x_path)
    # The unsliced burst output is not needed here.
    x_burst, _ = utils_wf.burst_transform(x, slice_threshold)
    burst_data = utils_wf.convert2dataframe(x_burst, y)
    utils_wf.write2csv(burst_data, out_path)
def write2csv(self, X, Y):
    """Convert (X, Y) to a DataFrame and save it under the configured output file."""
    out_file = self.data_path + '/' + self.opts['output_file']
    frame = utils_wf.convert2dataframe(X, Y, mode='NoPadding')
    frame.to_csv(out_file, index=0)
    print('file has been processed successfully!')