def main():
    pcap_file = 'data/demo.pcap'
    pp = PCAP(pcap_file, flow_pkts_thres=2, verbose=10, random_state=RANDOM_STATE)

    # extract flows from the pcap
    pp.pcap2flows()

    # label each flow
    label_file = 'data/demo.csv'
    pp.label_flows(label_file=label_file)

    # split flows into subflows
    pp.flows2subflows(q_interval=0.9)

    # extract features from each flow given feat_type
    # feat_type in ['IAT', 'SIZE', 'STATS', 'SAMP_NUM', 'SAMP_SIZE']
    feat_type = 'IAT'
    print(f'feat_type: {feat_type}')
    pp.flow2features(feat_type, fft=False, header=False)

    # dump the features to disk
    X, y = pp.features, pp.labels
    out_dir = os.path.join('out', os.path.dirname(pcap_file))
    dump((X, y), out_file=f'{out_dir}/demo_{feat_type}.dat')

    print(pp.features.shape, pp.pcap2flows.tot_time,
          pp.flows2subflows.tot_time, pp.flow2features.tot_time)
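# `dump`, `load`, `check_path`, and `remove_file` are project utilities used
# throughout this file. A minimal sketch of the assumed pickle-based behavior;
# the real helpers may add compression or logging:
import os
import pickle

def dump_sketch(data, out_file):
    # create the parent directory if needed (like check_path), then pickle
    d = os.path.dirname(out_file)
    if d:
        os.makedirs(d, exist_ok=True)
    with open(out_file, 'wb') as f:
        pickle.dump(data, f)

def load_sketch(in_file):
    with open(in_file, 'rb') as f:
        return pickle.load(f)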
def main(is_default_params=True):
    res = {}
    tot = len(DATASETS.keys()) * len(MODELS.keys())
    i = 1
    for dataset in DATASETS.keys():
        dataset_res = {}
        for model in MODELS.keys():
            try:
                lg.info(f'\n\n***{i}/{tot}:{dataset}_{FEATURE}-{model}-default_params_{is_default_params}')
                args = Args(dataset, model)
                if is_default_params:
                    _res = offline_default_main(args.args)
                else:
                    _res = offline_best_main(args.args)
                dataset_res[model] = _res
            except Exception as e:
                msg = f'{dataset}-{model}-default_{is_default_params}: {e}'
                lg.error(msg)
                traceback.print_exc()
            i += 1
        res[dataset] = dataset_res

    out_file = os.path.join(OUT_DIR, f'{FEATURE}-default_{is_default_params}.dat')
    dump(res, out_file)

    return res
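# `Args` is not defined in this file. A minimal sketch of what it is assumed to
# look like: a thin wrapper that builds the namespace consumed by
# offline_default_main / offline_best_main. The fields below are hypothetical,
# mirroring only how `args.args` is used above; the real class may differ:
from types import SimpleNamespace

class ArgsSketch:
    def __init__(self, dataset, model):
        self.args = SimpleNamespace(dataset=dataset, model=model,
                                    feature=FEATURE, out_dir=OUT_DIR)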
def main():
    # load the data
    data_file = 'out/data/demo_IAT.dat'
    X, y = load(data_file)

    # split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33,
                                                        random_state=RANDOM_STATE)
    print(f'X_train.shape: {X_train.shape}, X_test.shape: {X_test.shape}, '
          f'y_train.shape: {y_train.shape}, y_test.shape: {y_test.shape}')

    # model_name in ['OCSVM', 'KDE', 'IF', 'AE', 'GMM', 'PCA']
    model_name = 'OCSVM'
    print(f'model_name: {model_name}')

    # create the detection model
    model = generate_model(model_name)
    ndm = MODEL(model, score_metric='auc', verbose=10, random_state=RANDOM_STATE)

    # learn the model from the train set
    ndm.train(X_train)

    # evaluate the learned model
    ndm.test(X_test, y_test)

    # dump the results to disk
    out_dir = os.path.dirname(data_file)
    dump((model, ndm.history), out_file=f'{out_dir}/{ndm.model_name}-results.dat')

    print(ndm.train.tot_time, ndm.test.tot_time, ndm.score)
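# `generate_model` is defined elsewhere in the project. A minimal sketch of the
# factory it is assumed to resemble, built on scikit-learn estimators; the
# project's actual models (e.g., its AE) are not reproduced here, and the
# default parameters below are illustrative assumptions:
def generate_model_sketch(model_name, random_state=42):
    from sklearn.svm import OneClassSVM
    from sklearn.ensemble import IsolationForest
    from sklearn.mixture import GaussianMixture
    from sklearn.decomposition import PCA
    from sklearn.neighbors import KernelDensity
    models = {
        'OCSVM': OneClassSVM(kernel='rbf', nu=0.5),
        'IF': IsolationForest(n_estimators=100, random_state=random_state),
        'GMM': GaussianMixture(n_components=1, random_state=random_state),
        'PCA': PCA(n_components='mle', random_state=random_state),
        'KDE': KernelDensity(kernel='gaussian', bandwidth=1.0),
    }
    return models[model_name]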
def get_correlation(in_dir='', datasets='', feature='SIZE', header=True,
                    out_dir='', out_file='.dat'):
    corr_results = {}
    for i, dataset in enumerate(datasets):
        in_file = os.path.join(in_dir, dataset, feature, f'header_{header}', 'Xy.dat')
        lg.debug(in_file)
        data = load(in_file)
        X_train, y_train, X_val, y_val, X_test, y_test = split_train_val_test(
            data['X'], data['y'], shuffle=True, random_state=RANDOM_STATE)
        # normalization
        ss, X_train, y_train, X_val, y_val, X_test, y_test = normalize(
            X_train, y_train, X_val, y_val, X_test, y_test)

        # get the correlation between each header feature and the labels;
        # the header features are 8 TCP flags + TTL, which only works for 'IAT' and 'SIZE'
        dim = X_test.shape[1]
        if feature == 'IAT':
            # iat_dim + header_dim = dim, where header_dim = 8 + ttl_dim (i.e., size_dim)
            # => iat_dim + 8 + size_dim = iat_dim + 8 + (iat_dim + 1) = dim
            # => iat_dim = (dim - 9) // 2
            start_idx = (dim - 8 - 1) // 2
        elif feature == 'SIZE':
            # size_dim + header_dim = dim, where header_dim = 8 + size_dim
            # => size_dim + (8 + size_dim) = dim
            # => size_dim = (dim - 8) // 2
            start_idx = (dim - 8) // 2
        else:
            msg = f'Error: {feature}'
            raise NotImplementedError(msg)

        corrs = []
        lg.debug(f'header_feature_start_idx: {start_idx}')
        # header features: 8 TCP flags + the first TTL
        for j in range(9):
            _corr = _get_each_correlation(X_test[:, start_idx + j], y_test)
            corrs.append(_corr)
        corr_results[(in_file, dataset, feature, X_test.shape)] = corrs

        _out_file = os.path.join(out_dir, dataset, 'correlation.dat')
        check_path(_out_file)
        dump(corrs, _out_file)
        print(_out_file)

    # save all the results
    check_path(out_file)
    dump(corr_results, out_file)

    return out_file
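# `_get_each_correlation` is implemented elsewhere. A minimal sketch, assuming
# it computes the Pearson correlation coefficient between one feature column
# and the labels (the real code may add NaN handling):
import numpy as np

def _get_each_correlation_sketch(x, y):
    # np.corrcoef returns the 2x2 correlation matrix; take the off-diagonal entry
    return float(np.corrcoef(np.asarray(x, dtype=float),
                             np.asarray(y, dtype=float))[0, 1])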
def _generate_pcap(self):
    # step 1: obtain the pcap and label files
    # map each dataset name to (source IP, tag used in the output file name)
    name2meta = {
        'UNB/CICIDS_2017/pc_192.168.10.5': ('192.168.10.5', 'unb(pc1)'),
        'UNB(PC1)': ('192.168.10.5', 'unb(pc1)'),
        'UNB/CICIDS_2017/pc_192.168.10.8': ('192.168.10.8', 'unb(pc2)'),
        'UNB(PC2)': ('192.168.10.8', 'unb(pc2)'),
        'UNB/CICIDS_2017/pc_192.168.10.9': ('192.168.10.9', 'unb(pc3)'),
        'UNB(PC3)': ('192.168.10.9', 'unb(pc3)'),
        'UNB/CICIDS_2017/pc_192.168.10.14': ('192.168.10.14', 'unb(pc4)'),
        'UNB(PC4)': ('192.168.10.14', 'unb(pc4)'),
        'UNB/CICIDS_2017/pc_192.168.10.15': ('192.168.10.15', 'unb(pc5)'),
        'UNB(PC5)': ('192.168.10.15', 'unb(pc5)'),
        'DEMO_IDS/DS-srcIP_192.168.10.5': ('192.168.10.5', 'demo'),
    }
    if self.dataset_name not in name2meta:
        raise ValueError('dataset does not exist.')
    self.IP, tag = name2meta[self.dataset_name]
    self.orig_flows = os.path.join(self.out_dir,
                                   f'orig_{tag}_{self.direction}_flows-{self.IP}.dat')

    remove_file(self.Xy_file, self.overwrite)
    if not os.path.exists(self.orig_flows):
        lg.warning(f'{self.orig_flows} does not exist.')
        check_path(self.orig_flows)
        meta = self.get_unb_flows(in_dir='../Datasets', direction=self.direction)
        dump(meta, out_file=self.orig_flows)
        lg.debug(f'in_dir (pcaps): {meta["in_dir"]}, direction: {meta["direction"]}')
        lg.debug(f'normal_pcap: {len(meta["normal_pcap"])}, '
                 f'normal_flows: {len(meta["normal_flows"])}')
        lg.debug(f'abnormal_pcap: {len(meta["abnormal_pcap"])}, '
                 f'abnormal_flows: {len(meta["abnormal_flows"])}')
def _generate_pcap(self):
    # preprocess the original pcap and label files
    if self.dataset_name in ('MAWI/WIDE_2019/pc_202.171.168.50', 'MAWI'):
        # http://mawi.wide.ad.jp/mawi/samplepoint-F/2019/201912071400.html
        self.IP = '202.171.168.50'
        self.orig_flows = os.path.join(self.out_dir,
                                       f'mawi_{self.direction}_flows-{self.IP}.dat')
        remove_file(self.orig_flows, self.overwrite)
        if not os.path.exists(self.orig_flows):
            lg.warning(f'{self.orig_flows} does not exist.')
            check_path(self.orig_flows)
            meta = self.get_mawi_flows(in_dir='../Datasets', direction=self.direction)
            dump(meta, out_file=self.orig_flows)
            lg.debug(f'in_dir (pcaps): {meta["in_dir"]}, direction: {meta["direction"]}')
            lg.debug(f'normal_pcap: {len(meta["normal_pcap"])}, '
                     f'normal_flows: {len(meta["normal_flows"])}')
            lg.debug(f'abnormal_pcap: {len(meta["abnormal_pcap"])}, '
                     f'abnormal_flows: {len(meta["abnormal_flows"])}')
    else:
        raise ValueError('dataset does not exist.')
def generate(self):
    if os.path.exists(self.Xy_file):
        self.X, self.y = load(self.Xy_file)
    else:
        q_interval = 0.9
        # pcap to flows
        flows = self.pcap2flows(self.pcap_file)

        # flows to subflows
        labels = [1] * len(flows)
        durations = [_get_flow_duration(pkts) for fid, pkts in flows]
        interval = _get_split_interval(durations, q_interval=q_interval)
        subflows, labels = self.flow2subflows(flows, interval=interval, labels=labels)

        # get the feature dimension from the normal flows only,
        # using the same q_interval
        normal_flows = subflows
        num_pkts = [len(pkts) for fid, pkts in normal_flows]
        dim = int(np.floor(np.quantile(num_pkts, q_interval)))
        lg.info(f'dim={dim}')

        # flows to features
        features, fids = self.flow2features(subflows, name=self.feature_name)

        # fix the feature size
        features = self.fix_feature(features, dim=dim)
        self.X = features
        self.y = np.asarray([0] * len(features))

        # save the data to disk
        check_path(os.path.dirname(self.Xy_file))
        dump((self.X, self.y), out_file=self.Xy_file)

    return self.X, self.y
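# `fix_feature` is implemented elsewhere. A minimal sketch, assuming it pads
# short feature vectors with zeros and truncates long ones so that every row
# has exactly `dim` values (the project's handling of ragged rows may differ):
import numpy as np

def fix_feature_sketch(features, dim):
    fixed = []
    for v in features:
        v = list(v)[:dim]             # truncate to at most dim values
        v = v + [0] * (dim - len(v))  # zero-pad up to dim values
        fixed.append(v)
    return np.asarray(fixed, dtype=float)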
def _generate_pcap(self):
    # preprocess the original pcap and label files
    if self.dataset_name in ('CTU/IOT_2017/pc_192.168.1.196', 'CTU'):
        self.IP = '192.168.1.196'
        self.orig_flows = os.path.join(self.out_dir,
                                       f'ctu_{self.direction}_flows-{self.IP}.dat')
        remove_file(self.orig_flows, self.overwrite)
        if not os.path.exists(self.orig_flows):
            lg.warning(f'{self.orig_flows} does not exist.')
            check_path(self.orig_flows)
            meta = self.get_ctu_flows(in_dir='../Datasets', direction=self.direction)
            dump(meta, out_file=self.orig_flows)
            lg.debug(f'in_dir (pcaps): {meta["in_dir"]}, direction: {meta["direction"]}')
            lg.debug(f'normal_pcap: {len(meta["normal_pcap"])}, '
                     f'normal_flows: {len(meta["normal_flows"])}')
            lg.debug(f'abnormal_pcap: {len(meta["abnormal_pcap"])}, '
                     f'abnormal_flows: {len(meta["abnormal_flows"])}')
    else:
        raise ValueError('dataset does not exist.')
def _generate_flows(self):
    self.subflows_file = os.path.join(self.out_dir, 'normal_abnormal_subflows.dat')
    remove_file(self.subflows_file, self.overwrite)
    if os.path.exists(self.subflows_file):
        return load(self.subflows_file)

    # step 2.1: load the original flows and log their length statistics
    meta = load(self.orig_flows)
    normal_flows, abnormal_flows = meta['normal_flows'], meta['abnormal_flows']
    lg.debug(f'original normal flows: {len(normal_flows)} and '
             f'abnormal flows: {len(abnormal_flows)}')
    qs = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 1]
    len_stat = np.quantile([len(pkts) for f, pkts in normal_flows], q=qs)
    lg.debug(f'flows: {len(normal_flows)}, length statistic: {len_stat}, when q = {qs}')
    meta = {
        'flows': normal_flows,
        'len_stat': (len_stat, qs),
        'normal_flows': normal_flows,
        'abnormal_flows': abnormal_flows
    }
    dump(meta, out_file=os.path.join(self.out_dir, 'normal_abnormal_flows.dat'))

    # step 2.2: get the durations of the normal flows only
    self.flows_durations = [_get_flow_duration(pkts) for (fids, pkts) in normal_flows]
    normal_durations_stat = np.quantile(self.flows_durations, q=qs)
    lg.debug(f'normal_durations_stat: {normal_durations_stat}')
    # e.g., the median of the flow durations when q_flow_dur = 0.5
    self.subflow_interval = np.quantile(self.flows_durations, q=self.q_flow_dur)
    lg.debug(f'---subflow_interval: {self.subflow_interval}, q_flow_dur: {self.q_flow_dur}')

    # step 2.3: split the flows into subflows
    normal_flows, _ = _flows2subflows(normal_flows, interval=self.subflow_interval,
                                      labels=['0'] * len(normal_flows))
    abnormal_flows, _ = _flows2subflows(abnormal_flows, interval=self.subflow_interval,
                                        labels=['1'] * len(abnormal_flows))
    lg.debug(f'normal_flows: {len(normal_flows)}, and abnormal_flows: {len(abnormal_flows)} '
             f'with interval: {self.subflow_interval} and q: {self.q_flow_dur}')
    meta = {
        'normal_flows_durations': self.flows_durations,
        'normal_durations_stat': (normal_durations_stat, qs),
        'subflow_interval': self.subflow_interval,
        'q_flow_dur': self.q_flow_dur,
        'normal_flows': normal_flows,
        'abnormal_flows': abnormal_flows
    }
    dump(meta, out_file=self.subflows_file)

    # only return the subflows
    return meta
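# `_flows2subflows` is implemented elsewhere in the project. A minimal sketch of
# the assumed splitting rule: a flow (fid, pkts) is cut into a new subflow
# whenever the gap between consecutive packet timestamps exceeds `interval`.
# Packets are assumed to expose a scapy-style `.time` attribute:
def _flows2subflows_sketch(flows, interval, labels):
    subflows, subflow_labels = [], []
    for (fid, pkts), label in zip(flows, labels):
        if not pkts:
            continue
        pkts = sorted(pkts, key=lambda p: p.time)
        current = [pkts[0]]
        for prev, pkt in zip(pkts, pkts[1:]):
            if float(pkt.time - prev.time) > interval:
                # the gap is too large: close the current subflow, start a new one
                subflows.append((fid, current))
                subflow_labels.append(label)
                current = []
            current.append(pkt)
        subflows.append((fid, current))
        subflow_labels.append(label)
    return subflows, subflow_labels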
def _generate_features(self, normal_flows, abnormal_flows):
    # step 3: flows to features.
    # the feature dimension is derived from the normal flows only
    normal_flow_lengths = [len(pkts) for fid, pkts in normal_flows]
    qs = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 1]
    normal_lengths_stat = np.quantile(normal_flow_lengths, q=qs)
    lg.debug(f'normal_lengths_stat: {normal_lengths_stat}, where q = {qs}')
    self.dim = int(np.floor(np.quantile(normal_flow_lengths, self.q_flow_dur)))
    lg.info(f'dim(SIZE) = {self.dim}')

    self.X = []
    self.y = []

    if self.header:
        header_features, header_fids = _get_header(normal_flows)
        header_dim = int(np.quantile([len(v) for v in header_features],
                                     q=self.q_flow_dur))
        lg.info(f'header_dim: {header_dim}')
    else:
        header_dim = None

    if 'SAMP' in self.feature_name:
        normal_features, normal_fids = self.flow2features(normal_flows,
                                                          name=self.feature_name,
                                                          dim=self.dim,
                                                          header=self.header,
                                                          header_dim=header_dim)
        abnormal_features, abnormal_fids = self.flow2features(abnormal_flows,
                                                              name=self.feature_name,
                                                              dim=self.dim,
                                                              header=self.header,
                                                              header_dim=header_dim)
        # build one (X, y) pair per sampling rate
        for q in normal_features.keys():
            X_ = list(normal_features[q][0])  # (features, fid, sampling_rate_)
            y_ = [0] * len(normal_features[q][0])
            X_.extend(list(abnormal_features[q][0]))
            y_.extend([1] * len(abnormal_features[q][0]))
            self.X.append(np.asarray(X_))
            self.y.append(np.asarray(y_))

        # save the data to disk
        check_path(self.Xy_file)
        meta = {
            'X': self.X,
            'y': self.y,
            'normal_flow_lengths': (normal_flow_lengths, normal_lengths_stat),
            'dim': self.dim,
            'q_flow_dur': self.q_flow_dur
        }
        dump(meta, out_file=self.Xy_file)
        # SAMP features keep one matrix per sampling rate, so no single csv is saved
        # csv_file = os.path.splitext(self.Xy_file)[0] + '.csv'
        # np.savetxt(csv_file, np.concatenate([self.X, self.y[..., np.newaxis]], axis=1), delimiter=',')
    else:
        for flows, label in zip([normal_flows, abnormal_flows], [0, 1]):
            features, fids = self.flow2features(flows, name=self.feature_name,
                                                dim=self.dim, header=self.header,
                                                header_dim=header_dim)
            self.X.extend(features)
            self.y.extend([label] * len(features))

        # save the data to disk
        check_path(self.Xy_file)
        self.X = np.asarray(self.X)
        self.y = np.asarray(self.y)
        meta = {
            'X': self.X,
            'y': self.y,
            'normal_flow_lengths': (normal_flow_lengths, normal_lengths_stat),
            'dim': self.dim,
            'q_flow_dur': self.q_flow_dur
        }
        dump(meta, out_file=self.Xy_file)
        # also save the feature data as csv
        csv_file = os.path.splitext(self.Xy_file)[0] + '.csv'
        np.savetxt(csv_file,
                   np.concatenate([self.X, self.y[..., np.newaxis]], axis=1),
                   delimiter=',')

    return meta
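# `_get_header` is implemented elsewhere. A minimal sketch of the assumed
# per-flow header features used above and in get_correlation(): counts of the
# 8 TCP flags followed by the TTL of each packet, giving 8 + len(pkts) values.
# Scapy-style packet access is assumed; the real extraction may differ:
from scapy.layers.inet import IP, TCP

def _get_header_sketch(flows):
    features, fids = [], []
    flag_names = ['F', 'S', 'R', 'P', 'A', 'U', 'E', 'C']  # FIN ... CWR
    for fid, pkts in flows:
        flags = [0] * 8
        ttls = []
        for pkt in pkts:
            if TCP in pkt:
                for i, f in enumerate(flag_names):
                    if f in str(pkt[TCP].flags):
                        flags[i] += 1
            if IP in pkt:
                ttls.append(pkt[IP].ttl)
        features.append(flags + ttls)
        fids.append(fid)
    return features, fids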
def main(args=None, test=False):
    """ Get the result according to the given parameters.

    Parameters
    ----------
    args
    test: boolean
        If True, evaluate the built model on the test set; otherwise on the validation set.

    Returns
    -------
    history: dict
        The best result for 'SAMP'-related features; otherwise, the single result.
    """
    lg.debug(args)
    # compute out_dir before the try block so the save step below always has it
    out_dir = os.path.join(args.out_dir, args.direction, args.dataset,
                           args.feature, f'header_{args.header}', args.model,
                           f'tuning_{args.tuning}')
    try:
        # 1.1 parse the data and extract features
        lg.info('\n--- 1.1 Parse data')
        data = Data(dataset_name=args.dataset, direction=args.direction,
                    feature_name=args.feature, header=args.header,
                    overwrite=args.overwrite, random_state=RANDOM_STATE)
        data.generate()

        if 'SAMP' in args.feature:
            # keep the best result over all sampling rates
            best = {'score': 0, 'model': None}
            for i, (X, y) in enumerate(zip(data.X, data.y)):
                lg.debug(f'SAMP_{i}')
                try:
                    res_, data_ = _single_main(args, X, y, test=test)
                except Exception as e:
                    lg.error(f'Error: {e}. SAMP_{i}')
                    continue
                if res_['score'] > best['score']:
                    best['score'] = res_['score']
                    best['model'] = copy.deepcopy(res_)
                    best['data'] = copy.deepcopy(data_)
            history = best
        else:
            X, y = data.X, data.y
            res_, data_ = _single_main(args, X, y, test=test)
            history = {'score': res_['score'], 'model': res_, 'data': data_}
    except Exception as e:
        traceback.print_exc()
        history = {'score': 0, 'model': {},
                   'data': (None, None, None, None, None, None)}

    # 3. dump the result to disk
    lg.info('\n--- 3. Save the result')
    out_file = os.path.join(out_dir, 'res.dat')
    check_path(out_file)
    dump(history, out_file=out_file)
    out_file = os.path.splitext(out_file)[0] + '.csv'
    remove_file(out_file, overwrite=OVERWRITE)
    save2txt(history, out_file)
    lg.info(f'res_file: {out_file}')

    return history
def main_no_tuning_vs_tuning(args=None):
    """ Get results with the default and the best (tuned) parameters according to the args.

    Parameters
    ----------
    args: given parameters

    Returns
    -------
    history: dict
        All the results stored in one dictionary.
    """
    # 1. Get the dimension of the dataset. Some algorithms (e.g., AE) need it.
    data = Data(dataset_name=args.dataset, direction=args.direction,
                feature_name=args.feature, header=args.header,
                overwrite=args.overwrite, random_state=RANDOM_STATE)
    data.generate()
    if 'SAMP' in args.feature:
        X = data.X[0]
    else:
        X = data.X

    def tune(param_name, param_values):
        """Evaluate each candidate on the validation set, then rerun the best one on the test set."""
        history = {}  # the per-candidate results plus the best model (dict)
        best = {'score': 0, 'model': None}
        lg.debug(f'Tuning: {param_name} = {param_values}')
        for v in param_values:
            args.model_params = {param_name: v}
            # get the results on the validation set
            history_ = main(args, test=False)
            score_ = history_['score']
            if score_ > best['score']:
                best['score'] = score_
                best[param_name] = v
                best['model'] = copy.deepcopy(history_)
            # lists (e.g., AE architectures) become tuple keys
            key = tuple(v) if isinstance(v, list) else v
            history[key] = history_
        # get the final result on the test set
        args.model_params = {param_name: best[param_name]}
        best['model'] = main(args, test=True)
        best['score'] = best['model']['score']
        history['best'] = best
        return history

    # 2. Get the results with the given model
    if args.model == 'OCSVM':
        qs = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95] if args.tuning else [0.3]
        history = tune('q', qs)
    elif args.model == 'GMM':
        if args.tuning:
            n_components_arr = [1, 2, 5, 10, 15, 20, 25, 30, 35, 40]
        else:
            n_components_arr = ['quickshift']
        history = tune('n_components', n_components_arr)
    elif args.model == 'IF':
        if args.tuning:
            n_estimators_arr = [int(v) for v in np.linspace(30, 300, num=10, endpoint=True)]
        else:
            n_estimators_arr = [100]
        history = tune('n_estimators', n_estimators_arr)
    elif args.model == 'PCA':
        if args.tuning:
            n_components_arr = [int(v) for v in np.linspace(1, min(X.shape), num=10,
                                                            endpoint=False)]
        else:
            n_components_arr = ['mle']
        history = tune('n_components', n_components_arr)
    elif args.model == 'KDE':
        qs = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95] if args.tuning else [0.3]
        history = tune('q', qs)
    elif args.model == 'AE':
        feat_dim = X.shape[1]
        if args.tuning:
            def get_AE_parameters(d, num=10):
                # candidate architectures [d, hid, lat, hid, d], with the latent
                # sizes spread (without duplicates) between 1 and d - 1
                latent_sizes = []
                for i in range(num):
                    v = np.ceil(1 + i * (d - 2) / 9).astype(int)
                    if v not in latent_sizes:
                        latent_sizes.append(v)
                hidden_sizes = [min((d - 1), np.ceil(2 * v).astype(int))
                                for v in latent_sizes]
                return [[d, hid, lat, hid, d]
                        for hid, lat in zip(hidden_sizes, latent_sizes)]

            hidden_neurons_arr = get_AE_parameters(feat_dim, num=10)
        else:
            latent_dim = np.ceil(feat_dim / 2).astype(int)
            hid = min((feat_dim - 1), np.ceil(2 * latent_dim).astype(int))
            hidden_neurons_arr = [[feat_dim, hid, latent_dim, hid, feat_dim]]
        history = tune('hidden_neurons', hidden_neurons_arr)
    else:
        msg = f'{args.model}'
        raise NotImplementedError(msg)

    # lg.info(f'\n*** best: ' + str(history['best']))
    out_file = os.path.join(args.out_dir, args.direction, args.dataset,
                            args.feature, f'header_{args.header}', args.model,
                            f'tuning_{args.tuning}', 'res.dat')
    check_path(out_file)
    dump(history, out_file)

    return history
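# For illustration: a self-contained rerun of the AE architecture grid above
# for a 10-dimensional feature vector. The candidates step the latent size from
# 1 up to d - 1, with the hidden layer at twice the latent size, capped at d - 1
# (this snippet only mirrors get_AE_parameters; it is not part of the pipeline):
import numpy as np

def ae_grid_example(d=10, num=10):
    latent_sizes = []
    for i in range(num):
        v = int(np.ceil(1 + i * (d - 2) / 9))
        if v not in latent_sizes:
            latent_sizes.append(v)
    return [[d, min(d - 1, 2 * lat), lat, min(d - 1, 2 * lat), d]
            for lat in latent_sizes]

# ae_grid_example(10)[0]  -> [10, 2, 1, 2, 10]
# ae_grid_example(10)[-1] -> [10, 9, 9, 9, 10]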
def main():
    res = []
    res_file = 'res2'
    is_parallel = False
    if is_parallel:
        def set_args(dataset, feature, header, model, tuning):
            args = parser()
            args.dataset = dataset
            args.feature = feature
            args.header = header
            args.model = model
            args.tuning = tuning
            print(args)
            return args

        # With backend='loky', the total time is lower than the serial run; with
        # backend='multiprocessing', the time cost is very close to the serial run.
        _res = []
        with Parallel(n_jobs=20, backend='loky') as parallel:
            _res = parallel(delayed(_representation.main_no_tuning_vs_tuning)  # function
                            (set_args(dataset, feature, header, model, tuning))  # params
                            for dataset, feature, header, model, tuning in list(
                                itertools.product(DATASETS, FEATURES, HEADER,
                                                  MODELS, TUNING)))  # parallel

        # reorganize the results
        res = []
        for history, (dataset, feature, header, model, tuning) in zip(
                _res, list(itertools.product(DATASETS, FEATURES, HEADER, MODELS, TUNING))):
            res.append([dataset, feature, f'header_{header}', model,
                        f'tuning_{tuning}', history])
        out_file = f'examples/representation/out/src/{DATE}/{res_file}.dat'
    else:  # without parallel
        for dataset, feature, header, model, tuning in itertools.product(
                DATASETS, FEATURES, HEADER, MODELS, TUNING):
            try:
                print(f'*** {dataset}-{feature}-header_{header}, {model}-tuning_{tuning}')
                args = parser()
                args.dataset = dataset
                args.feature = feature
                args.header = header
                args.model = model
                args.tuning = tuning
                history = _representation.main_no_tuning_vs_tuning(args)
                res.append([dataset, feature, f'header_{header}', model,
                            f'tuning_{tuning}', history])
                # save immediately to avoid losing any result
                out_file = f'{args.out_dir}/{args.direction}/~{res_file}.dat'
                dump(res, out_file)
                save2txt(res, os.path.splitext(out_file)[0] + '.csv', delimiter=',')
            except Exception as e:
                lg.error(e)
        out_file = f'{args.out_dir}/{args.direction}/{DATE}/{res_file}.dat'

    check_path(out_file)
    dump(res, out_file)
    save2txt(res, os.path.splitext(out_file)[0] + '.csv', delimiter=',')
    lg.info(f'final result: {out_file}')
def _generate_pcap(self):
    regenerate = False
    # step 1: obtain the pcap and label files
    # map each dataset name to (device IP or MAC, output-file prefix,
    # flow getter, input directory, dataset_name passed to the getter)
    name2meta = {
        # the device IP in the 2021 data changes over time, so the MAC address
        # is used instead; the input path is hard-coded (not a good idea)
        'UCHI(SFRIG_2021)': ('mac_70:2c:1f:39:25:6e', 'iot2021-orig_sfrig',
                             get_iot2021_flows,
                             '../Datasets/UCHI/IOT_2021/data-clean/refrigerator',
                             None),  # None: pass self.dataset_name through
        'UCHI/IOT_2019/ghome_192.168.143.20': ('192.168.143.20', 'ghome2019-orig_ghome',
                                               get_ghome2019_flows,
                                               '../Datasets/UCHI/IOT_2019/',
                                               'ghome_192.168.143.20'),
        'UCHI(GHOME_2019)': ('192.168.143.20', 'ghome2019-orig_ghome',
                             get_ghome2019_flows, '../Datasets/UCHI/IOT_2019/',
                             'ghome_192.168.143.20'),
        'UCHI/IOT_2019/scam_192.168.143.42': ('192.168.143.42', 'scam2019-orig_scam',
                                              get_scam2019_flows,
                                              '../Datasets/UCHI/IOT_2019/',
                                              'scam_192.168.143.42'),
        'UCHI(SCAM_2019)': ('192.168.143.42', 'scam2019-orig_scam',
                            get_scam2019_flows, '../Datasets/UCHI/IOT_2019/',
                            'scam_192.168.143.42'),
        'UCHI/IOT_2019/bstch_192.168.143.48': ('192.168.143.48', 'bstch2019-orig_bstch',
                                               get_bstch2019_flows,
                                               '../Datasets/UCHI/IOT_2019/',
                                               'bstch_192.168.143.48'),
        'UCHI(BSTCH_2019)': ('192.168.143.48', 'bstch2019-orig_bstch',
                             get_bstch2019_flows, '../Datasets/UCHI/IOT_2019/',
                             'bstch_192.168.143.48'),
        'UCHI/IOT_2019/smtv_10.42.0.1': ('10.42.0.1', 'smtv2019-orig_smtv',
                                         get_smtv2019_flows,
                                         '../Datasets/UCHI/IOT_2019/',
                                         'smtv_10.42.0.1'),
        'UCHI(SMTV_2019)': ('10.42.0.1', 'smtv2019-orig_smtv',
                            get_smtv2019_flows, '../Datasets/UCHI/IOT_2019/',
                            'smtv_10.42.0.1'),
    }
    if self.dataset_name not in name2meta:
        raise ValueError('dataset does not exist.')
    self.IP, prefix, get_flows, in_dir, ds_name = name2meta[self.dataset_name]
    self.orig_flows = os.path.join(self.out_dir,
                                   f'{prefix}_{self.direction}_flows-{self.IP}.dat')

    remove_file(self.Xy_file, self.overwrite)
    if not os.path.exists(self.orig_flows):
        lg.warning(f'{self.orig_flows} does not exist.')
        check_path(self.orig_flows)
        meta = get_flows(in_dir=in_dir,
                         dataset_name=ds_name if ds_name is not None else self.dataset_name,
                         out_dir=self.out_dir,
                         direction=self.direction)
        dump(meta, out_file=self.orig_flows)
        regenerate = True
def _main():
    """ Main function: run every (dataset, feature, header, model, tuning)
    combination and collect the results.

    Returns
    -------
    """
    res = []
    out_file = f'{OUT_DIR}/src/{RESULT_DIR}/res.dat'
    is_parallel = False
    if is_parallel:  # with parallel
        def set_args(dataset, feature, header, model, tuning):
            args = parser()
            args.dataset = dataset
            args.feature = feature
            args.header = header
            args.model = model
            args.tuning = tuning
            lg.debug(args)
            return args

        # With backend='loky', the total time is lower than the serial run; with
        # backend='multiprocessing', the time cost is very close to the serial run.
        _res = []
        with Parallel(n_jobs=20, backend='loky') as parallel:
            _res = parallel(delayed(_representation.main_no_tuning_vs_tuning)  # function
                            (set_args(dataset, feature, header, model, tuning))  # params
                            for dataset, feature, header, model, tuning in list(
                                itertools.product(DATASETS, FEATURES, HEADER,
                                                  MODELS, TUNING)))  # parallel

        # reorganize the results
        res = []
        for history, (dataset, feature, header, model, tuning) in zip(
                _res, list(itertools.product(DATASETS, FEATURES, HEADER, MODELS, TUNING))):
            res.append([dataset, feature, f'header_{header}', model,
                        f'tuning_{tuning}', history['best']])
    else:  # without parallel
        for dataset, feature, header, model, tuning in itertools.product(
                DATASETS, FEATURES, HEADER, MODELS, TUNING):
            try:
                lg.info(f'*** {dataset}-{feature}-header_{header}, {model}-tuning_{tuning}')
                args = parser()
                args.dataset = dataset
                args.feature = feature
                args.header = header
                args.model = model
                args.tuning = tuning
                args.overwrite = OVERWRITE
                history = _representation.main_no_tuning_vs_tuning(args)
                res.append([dataset, feature, f'header_{header}', model,
                            f'tuning_{tuning}', history['best']])
                # save immediately to avoid losing any result
                _out_file = f'{args.out_dir}/{args.direction}/{RESULT_DIR}/~res.csv'
                check_path(_out_file)
                save2txt(res, _out_file, delimiter=',')
            except Exception as e:
                lg.error(f'Error: {e}. [{dataset}, {feature}, {header}, {model}, {tuning}]')

    # save the final results as '.dat' and '.csv'
    check_path(out_file)
    dump(res, out_file)
    out_file = os.path.splitext(out_file)[0] + '.csv'
    remove_file(out_file, OVERWRITE)
    save2txt(res, out_file, delimiter=',')
    lg.info(f'final result: {out_file}')