def _generate_pcap(self):
    # step 1: obtain pcap and label
    if self.dataset_name == 'UNB/CICIDS_2017/pc_192.168.10.5' or self.dataset_name == 'UNB(PC1)':
        self.IP = '192.168.10.5'
        self.orig_flows = os.path.join(self.out_dir, f'orig_unb(pc1)_{self.direction}_flows-{self.IP}.dat')
    elif self.dataset_name == 'UNB/CICIDS_2017/pc_192.168.10.8' or self.dataset_name == 'UNB(PC2)':
        self.IP = '192.168.10.8'
        self.orig_flows = os.path.join(self.out_dir, f'orig_unb(pc2)_{self.direction}_flows-{self.IP}.dat')
    elif self.dataset_name == 'UNB/CICIDS_2017/pc_192.168.10.9' or self.dataset_name == 'UNB(PC3)':
        self.IP = '192.168.10.9'
        self.orig_flows = os.path.join(self.out_dir, f'orig_unb(pc3)_{self.direction}_flows-{self.IP}.dat')
    elif self.dataset_name == 'UNB/CICIDS_2017/pc_192.168.10.14' or self.dataset_name == 'UNB(PC4)':
        self.IP = '192.168.10.14'
        self.orig_flows = os.path.join(self.out_dir, f'orig_unb(pc4)_{self.direction}_flows-{self.IP}.dat')
    elif self.dataset_name == 'UNB/CICIDS_2017/pc_192.168.10.15' or self.dataset_name == 'UNB(PC5)':
        self.IP = '192.168.10.15'
        self.orig_flows = os.path.join(self.out_dir, f'orig_unb(pc5)_{self.direction}_flows-{self.IP}.dat')
    elif self.dataset_name == 'DEMO_IDS/DS-srcIP_192.168.10.5':
        self.IP = '192.168.10.5'
        self.orig_flows = os.path.join(self.out_dir, f'orig_demo_{self.direction}_flows-{self.IP}.dat')
    else:
        raise ValueError('dataset does not exist.')

    remove_file(self.Xy_file, self.overwrite)
    if not os.path.exists(self.orig_flows):
        lg.warning(f'{self.orig_flows} does not exist.')
        check_path(self.orig_flows)
        meta = self.get_unb_flows(in_dir='../Datasets', direction=self.direction)
        dump(meta, out_file=self.orig_flows)
        lg.debug(f'in_dir (pcaps): {meta["in_dir"]}, direction: {meta["direction"]}')
        lg.debug(f'normal_pcap: {len(meta["normal_pcap"])}, normal_flows: {len(meta["normal_flows"])}')
        lg.debug(f'abnormal_pcap: {len(meta["abnormal_pcap"])}, abnormal_flows: {len(meta["abnormal_flows"])}')
def generate(self):
    remove_file(self.Xy_file, self.overwrite)
    if os.path.exists(self.Xy_file):
        Xy_meta = load(self.Xy_file)
    else:
        if self.dataset_name in ['CTU']:
            self._generate_pcap()  # generate data
            flows_meta = self._generate_flows()  # normal_abnormal.data
            # Xy (fixed feature data)
            Xy_meta = self._generate_features(flows_meta['normal_flows'],
                                              flows_meta['abnormal_flows'])
        else:
            msg = f'{self.dataset_name}'
            raise NotImplementedError(msg)
    self.X, self.y = Xy_meta['X'], Xy_meta['y']
    return Xy_meta
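# ---------------------------------------------------------------------------
# Hedged usage sketch (not part of the original source): it only illustrates
# the call pattern implied by generate() above. The class name `CTU` and the
# constructor arguments below are assumptions, not the project's actual API;
# substitute whichever dataset class defines this generate().
#
#   data = CTU(dataset_name='CTU', direction='src',
#              out_dir='examples/out', overwrite=False)
#   Xy_meta = data.generate()     # builds the cached Xy file or reloads it
#   X, y = data.X, data.y         # fixed-dimension features and labels
# ---------------------------------------------------------------------------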
def _generate_pcap(self):
    # preprocess the original pcap and labels
    if self.dataset_name == 'MAWI/WIDE_2019/pc_202.171.168.50' or self.dataset_name == 'MAWI':
        # http://mawi.wide.ad.jp/mawi/samplepoint-F/2019/201912071400.html
        self.IP = '202.171.168.50'
        self.orig_flows = os.path.join(self.out_dir, f'mawi_{self.direction}_flows-{self.IP}.dat')
        remove_file(self.orig_flows, self.overwrite)
        if not os.path.exists(self.orig_flows):
            lg.warning(f'{self.orig_flows} does not exist.')
            check_path(self.orig_flows)
            meta = self.get_mawi_flows(in_dir='../Datasets', direction=self.direction)
            dump(meta, out_file=self.orig_flows)
            lg.debug(f'in_dir (pcaps): {meta["in_dir"]}, direction: {meta["direction"]}')
            lg.debug(f'normal_pcap: {len(meta["normal_pcap"])}, normal_flows: {len(meta["normal_flows"])}')
            lg.debug(f'abnormal_pcap: {len(meta["abnormal_pcap"])}, abnormal_flows: {len(meta["abnormal_flows"])}')
    else:
        raise ValueError(f'dataset {self.dataset_name} does not exist.')
def generate(self):
    remove_file(self.Xy_file, self.overwrite)
    if os.path.exists(self.Xy_file):
        Xy_meta = load(self.Xy_file)
    else:
        if self.dataset_name in [
                'UCHI(SFRIG_2021)', 'UCHI(SMTV_2019)', 'UCHI(GHOME_2019)',
                'UCHI(SCAM_2019)', 'UCHI(BSTCH_2019)'
        ]:
            self._generate_pcap()  # generate data
            flows_meta = self._generate_flows()  # normal_abnormal.data
            # Xy (fixed feature data)
            Xy_meta = self._generate_features(flows_meta['normal_flows'],
                                              flows_meta['abnormal_flows'])
        else:
            msg = f'{self.dataset_name}'
            raise NotImplementedError(msg)
    self.X, self.y = Xy_meta['X'], Xy_meta['y']
    self.Xy_meta = Xy_meta
    return self.Xy_meta
def _generate_pcap(self):
    # preprocess the original pcap and labels
    if self.dataset_name == 'CTU/IOT_2017/pc_192.168.1.196' or self.dataset_name == 'CTU':
        self.IP = '192.168.1.196'
        self.orig_flows = os.path.join(self.out_dir, f'ctu_{self.direction}_flows-{self.IP}.dat')
        remove_file(self.orig_flows, self.overwrite)
        if not os.path.exists(self.orig_flows):
            lg.warning(f'{self.orig_flows} does not exist.')
            check_path(self.orig_flows)
            meta = self.get_ctu_flows(in_dir='../Datasets', direction=self.direction)
            dump(meta, out_file=self.orig_flows)
            lg.debug(f'in_dir (pcaps): {meta["in_dir"]}, direction: {meta["direction"]}')
            lg.debug(f'normal_pcap: {len(meta["normal_pcap"])}, normal_flows: {len(meta["normal_flows"])}')
            lg.debug(f'abnormal_pcap: {len(meta["abnormal_pcap"])}, abnormal_flows: {len(meta["abnormal_flows"])}')
    else:
        raise ValueError(f'dataset {self.dataset_name} does not exist.')
def _generate_flows(self):
    self.subflows_file = os.path.join(self.out_dir, 'normal_abnormal_subflows.dat')
    remove_file(self.subflows_file, self.overwrite)
    if os.path.exists(self.subflows_file):
        return load(self.subflows_file)

    # step 2: extract flows from pcap
    ##############################################################################################
    meta = load(self.orig_flows)
    normal_flows, abnormal_flows = meta['normal_flows'], meta['abnormal_flows']
    lg.debug(f'original normal flows: {len(normal_flows)} and abnormal flows: {len(abnormal_flows)}')
    qs = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 1]
    len_stat = np.quantile([len(pkts) for f, pkts in normal_flows], q=qs)
    lg.debug(f'flows: {len(normal_flows)}, length statistic: {len_stat}, when q = {qs}')
    meta = {
        'flows': normal_flows,
        'len_stat': (len_stat, qs),
        'normal_flows': normal_flows,
        'abnormal_flows': abnormal_flows
    }
    dump(meta, out_file=os.path.join(self.out_dir, 'normal_abnormal_flows.dat'))

    # step 2.2: durations of the normal flows only
    self.flows_durations = [_get_flow_duration(pkts) for (fids, pkts) in normal_flows]
    normal_durations_stat = np.quantile(self.flows_durations, q=qs)
    lg.debug(f'normal_durations_stat: {normal_durations_stat}')
    # q_flow_dur-quantile of the normal flow durations (e.g., the median when q_flow_dur = 0.5)
    self.subflow_interval = np.quantile(self.flows_durations, q=self.q_flow_dur)
    lg.debug(f'---subflow_interval: {self.subflow_interval}, q_flow_dur: {self.q_flow_dur}')

    # step 2.3: split flows into subflows
    normal_flows, _ = _flows2subflows(normal_flows,
                                      interval=self.subflow_interval,
                                      labels=['0'] * len(normal_flows))
    abnormal_flows, _ = _flows2subflows(abnormal_flows,
                                        interval=self.subflow_interval,
                                        labels=['1'] * len(abnormal_flows))
    lg.debug(f'normal_flows: {len(normal_flows)}, and abnormal_flows: {len(abnormal_flows)} '
             f'with interval: {self.subflow_interval} and q: {self.q_flow_dur}')
    meta = {
        'normal_flows_durations': self.flows_durations,
        'normal_durations_stat': (normal_durations_stat, qs),
        'subflow_interval': self.subflow_interval,
        'q_flow_dur': self.q_flow_dur,
        'normal_flows': normal_flows,
        'abnormal_flows': abnormal_flows
    }
    dump(meta, out_file=self.subflows_file)

    # only return subflows
    return meta
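# ---------------------------------------------------------------------------
# Minimal, self-contained sketch (not from the original source) of the
# quantile-based subflow idea used in _generate_flows() above: the split
# interval is a quantile of the normal-flow durations, and a flow is cut into
# a new subflow whenever a packet arrives more than `interval` seconds after
# the current subflow started. The project's _flows2subflows() may differ in
# details (e.g., how it carries flow ids and labels).
def _sketch_split_by_interval(pkt_times, interval):
    """Split a sorted list of packet timestamps into subflows spanning at most `interval`."""
    subflows, current, start = [], [], None
    for t in pkt_times:
        if start is None or t - start > interval:
            if current:
                subflows.append(current)
            current, start = [], t
        current.append(t)
    if current:
        subflows.append(current)
    return subflows

# Example (synthetic data):
#   durations = [times[-1] - times[0] for times in flows_times]
#   interval = float(np.quantile(durations, q=0.5))  # q_flow_dur quantile, e.g. the median
#   subflows = [_sketch_split_by_interval(times, interval) for times in flows_times]
# ---------------------------------------------------------------------------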
def main(args=None, test=False):
    """ Get the result according to the given parameters.

    Parameters
    ----------
    args
    test : bool
        If True, evaluate the built model on the test set; otherwise on the validation set.

    Returns
    -------
    history : dict
        The best result over the 'SAMP'-related features; otherwise, the single result.
    """
    try:
        lg.debug(args)
        out_dir = os.path.join(args.out_dir, args.direction, args.dataset, args.feature,
                               f'header_{args.header}', args.model, f'tuning_{args.tuning}')

        ###############################################################################################################
        """ 1.1 Parse data and extract features """
        lg.info('\n--- 1.1 Parse data')
        data = Data(dataset_name=args.dataset, direction=args.direction,
                    feature_name=args.feature, header=args.header,
                    overwrite=args.overwrite, random_state=RANDOM_STATE)
        data.generate()

        if 'SAMP' in args.feature:
            best = {'score': 0, 'model': None}
            for i, (X, y) in enumerate(zip(data.X, data.y)):
                lg.debug(f'SAMP_{i}')
                try:
                    res_, data_ = _single_main(args, X, y, test=test)
                except Exception as e:
                    lg.error(f'Error: {e}. SAMP_{i}')
                    continue
                # keep the best result over the SAMP data
                if res_['score'] > best['score']:
                    best['score'] = res_['score']
                    best['model'] = copy.deepcopy(res_)
                    best['data'] = copy.deepcopy(data_)
            history = best
        else:
            X, y = data.X, data.y
            res_, data_ = _single_main(args, X, y, test=test)
            history = {'score': res_['score'], 'model': res_, 'data': data_}
    except Exception as e:
        traceback.print_exc()
        history = {'score': 0, 'model': {}, 'data': (None, None, None, None, None, None)}

    ###############################################################################################################
    """ 3. Dump the result to disk """
    lg.info('\n--- 3. Save the result')
    out_file = os.path.join(out_dir, 'res.dat')
    check_path(out_file)
    dump(history, out_file=out_file)
    out_file = os.path.splitext(out_file)[0] + '.csv'
    remove_file(out_file, overwrite=OVERWRITE)
    save2txt(history, out_file)
    lg.info(f'res_file: {out_file}')

    return history
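# ---------------------------------------------------------------------------
# Hedged example (not from the original source): main() only reads plain
# attributes from args (out_dir, direction, dataset, feature, header, model,
# tuning, overwrite), so an argparse.Namespace can stand in for parser() when
# experimenting. Every value below is an illustrative assumption, not a
# project default.
from argparse import Namespace

_example_args = Namespace(out_dir='examples/out', direction='src',
                          dataset='CTU', feature='IAT', header=False,
                          model='OCSVM', tuning=False, overwrite=False)
# history = main(_example_args, test=True)  # run inside the project environment
# ---------------------------------------------------------------------------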
def _generate_pcap(self):
    regenerate = False
    # step 1: obtain pcap and label
    if self.dataset_name == 'UCHI(SFRIG_2021)':
        # IP for the new data changes over time, so here use mac address instead
        self.IP = 'mac_70:2c:1f:39:25:6e'
        self.orig_flows = os.path.join(self.out_dir,
                                       f'iot2021-orig_sfrig_{self.direction}_flows-{self.IP}.dat')
        remove_file(self.Xy_file, self.overwrite)
        if not os.path.exists(self.orig_flows):
            lg.warning(f'{self.orig_flows} does not exist.')
            check_path(self.orig_flows)
            # hard coding (is not a good idea)
            meta = get_iot2021_flows(in_dir='../Datasets/UCHI/IOT_2021/data-clean/refrigerator',
                                     dataset_name=self.dataset_name,
                                     out_dir=self.out_dir,
                                     direction=self.direction)
            dump(meta, out_file=self.orig_flows)
            regenerate = True
    elif self.dataset_name == 'UCHI/IOT_2019/ghome_192.168.143.20' or self.dataset_name == 'UCHI(GHOME_2019)':
        self.IP = '192.168.143.20'
        self.orig_flows = os.path.join(self.out_dir,
                                       f'ghome2019-orig_sfrig_{self.direction}_flows-{self.IP}.dat')
        remove_file(self.Xy_file, self.overwrite)
        if not os.path.exists(self.orig_flows):
            lg.warning(f'{self.orig_flows} does not exist.')
            check_path(self.orig_flows)
            meta = get_ghome2019_flows(in_dir='../Datasets/UCHI/IOT_2019/',
                                       dataset_name='ghome_192.168.143.20',
                                       out_dir=self.out_dir,
                                       direction=self.direction)
            dump(meta, out_file=self.orig_flows)
            regenerate = True
    elif self.dataset_name == 'UCHI/IOT_2019/scam_192.168.143.42' or self.dataset_name == 'UCHI(SCAM_2019)':
        self.IP = '192.168.143.42'
        self.orig_flows = os.path.join(self.out_dir,
                                       f'scam2019-orig_scam_{self.direction}_flows-{self.IP}.dat')
        remove_file(self.Xy_file, self.overwrite)
        if not os.path.exists(self.orig_flows):
            lg.warning(f'{self.orig_flows} does not exist.')
            check_path(self.orig_flows)
            meta = get_scam2019_flows(in_dir='../Datasets/UCHI/IOT_2019/',
                                      dataset_name='scam_192.168.143.42',
                                      out_dir=self.out_dir,
                                      direction=self.direction)
            dump(meta, out_file=self.orig_flows)
            regenerate = True
    elif self.dataset_name == 'UCHI/IOT_2019/bstch_192.168.143.48' or self.dataset_name == 'UCHI(BSTCH_2019)':
        self.IP = '192.168.143.48'
        self.orig_flows = os.path.join(self.out_dir,
                                       f'bstch2019-orig_bstch_{self.direction}_flows-{self.IP}.dat')
        remove_file(self.Xy_file, self.overwrite)
        if not os.path.exists(self.orig_flows):
            lg.warning(f'{self.orig_flows} does not exist.')
            check_path(self.orig_flows)
            meta = get_bstch2019_flows(in_dir='../Datasets/UCHI/IOT_2019/',
                                       dataset_name='bstch_192.168.143.48',
                                       out_dir=self.out_dir,
                                       direction=self.direction)
            dump(meta, out_file=self.orig_flows)
            regenerate = True
    elif self.dataset_name == 'UCHI/IOT_2019/smtv_10.42.0.1' or self.dataset_name == 'UCHI(SMTV_2019)':
        self.IP = '10.42.0.1'
        self.orig_flows = os.path.join(self.out_dir,
                                       f'smtv2019-orig_smtv_{self.direction}_flows-{self.IP}.dat')
        remove_file(self.Xy_file, self.overwrite)
        if not os.path.exists(self.orig_flows):
            lg.warning(f'{self.orig_flows} does not exist.')
            check_path(self.orig_flows)
            meta = get_smtv2019_flows(in_dir='../Datasets/UCHI/IOT_2019/',
                                      dataset_name='smtv_10.42.0.1',
                                      out_dir=self.out_dir,
                                      direction=self.direction)
            dump(meta, out_file=self.orig_flows)
            regenerate = True
    else:
        raise ValueError('dataset does not exist.')
def _main():
    """ Main function

    Returns
    -------

    """
    res = []
    out_file = f'{OUT_DIR}/src/{RESULT_DIR}/res.dat'
    is_parallel = False
    if is_parallel:  # with parallel
        def set_args(dataset, feature, header, model, tuning):
            args = parser()
            args.dataset = dataset
            args.feature = feature
            args.header = header
            args.model = model
            args.tuning = tuning
            lg.debug(args)
            return args

        # With backend='loky', the time taken is less than the serial run; with
        # backend='multiprocessing', the time cost is very similar to the serial run.
        _res = []
        with Parallel(n_jobs=20, backend='loky') as parallel:
            _res = parallel(delayed(_representation.main_no_tuning_vs_tuning)  # delayed
                            (set_args(dataset, feature, header, model, tuning))  # params
                            for dataset, feature, header, model, tuning in list(
                                itertools.product(DATASETS, FEATURES, HEADER, MODELS, TUNING))  # for
                            )  # parallel

        # reorganize results
        res = []
        for history, (dataset, feature, header, model, tuning) in zip(
                _res, list(itertools.product(DATASETS, FEATURES, HEADER, MODELS, TUNING))):
            res.append([dataset, feature, f'header_{header}', model, f'tuning_{tuning}', history['best']])
    else:  # without parallel
        for dataset, feature, header, model, tuning in list(
                itertools.product(DATASETS, FEATURES, HEADER, MODELS, TUNING)):
            try:
                lg.info(f'*** {dataset}-{feature}-header_{header}, {model}-tuning_{tuning}')
                args = parser()
                args.dataset = dataset
                args.feature = feature
                args.header = header
                args.model = model
                args.tuning = tuning
                args.overwrite = OVERWRITE
                history = _representation.main_no_tuning_vs_tuning(args)
                res.append([dataset, feature, f'header_{header}', model, f'tuning_{tuning}', history['best']])
                # save intermediate results immediately to avoid losing anything
                _out_file = f'{args.out_dir}/{args.direction}/{RESULT_DIR}/~res.csv'
                check_path(_out_file)
                save2txt(res, _out_file, delimiter=',')
            except Exception as e:
                lg.error(f'Error: {e}. [{dataset}, {feature}, {header}, {model}, {tuning}]')

    # save the final results: '.dat' and '.csv'
    check_path(out_file)
    dump(res, out_file)
    out_file = os.path.splitext(out_file)[0] + '.csv'
    remove_file(out_file, OVERWRITE)
    save2txt(res, out_file, delimiter=',')
    lg.info(f'final result: {out_file}')
def get_unb_flows(self, in_dir='../Datasets', direction='src'):
    # preprocess the original pcap and labels
    self.pcap_file = os.path.join(self.out_dir, f'pc_{self.IP}_AGMT.pcap')
    self.label_file = os.path.join(self.out_dir, f'pc_{self.IP}_AGMT.csv')
    remove_file(self.pcap_file, self.overwrite)
    remove_file(self.label_file, self.overwrite)
    check_path(self.pcap_file)
    check_path(self.label_file)

    if not os.path.exists(self.pcap_file) or not os.path.exists(self.label_file):
        # 1. original pcap
        friday_pcap_orig = get_file_path(ipt_dir=in_dir,
                                         dataset_name='UNB/CICIDS_2017/',
                                         data_cat='pcaps/Friday',
                                         file_name='Friday-WorkingHours.pcap')
        # filter the pcap by the given IP and direction
        filter_ip(friday_pcap_orig, out_file=self.pcap_file, ips=[self.IP],
                  direction=self.direction, keep_original=True)

        # 2. merge the original labels
        friday_label = get_file_path(ipt_dir=self.out_dir,
                                     dataset_name='UNB/CICIDS_2017/',
                                     data_cat='labels/Friday',
                                     file_name='Friday-WorkingHours-Morning.pcap_ISCX.csv')
        friday_label_orig1 = get_file_path(ipt_dir=in_dir,
                                           dataset_name='UNB/CICIDS_2017/',
                                           data_cat='labels/Friday',
                                           file_name='Friday-WorkingHours-Morning.pcap_ISCX.csv')
        friday_label_orig2 = get_file_path(ipt_dir=in_dir,
                                           dataset_name='UNB/CICIDS_2017/',
                                           data_cat='labels/Friday',
                                           file_name='Friday-WorkingHours-Afternoon-DDos.pcap_ISCX.csv')
        friday_label_orig3 = get_file_path(ipt_dir=in_dir,
                                           dataset_name='UNB/CICIDS_2017/',
                                           data_cat='labels/Friday',
                                           file_name='Friday-WorkingHours-Afternoon-PortScan.pcap_ISCX.csv')
        friday_label_tmp = friday_label + '-all.csv'
        check_path(friday_label_tmp)
        merge_labels([friday_label_orig1, friday_label_orig2, friday_label_orig3],
                     mrg_label_path=friday_label_tmp)
        filter_csv_ip(friday_label_tmp, out_file=self.label_file, ips=[self.IP],
                      direction=self.direction)

    ##############################################################################################
    # step 2.1: extract flows from the filtered pcap
    flows = _pcap2flows(self.pcap_file, verbose=10)  # normal and abnormal flows
    # step 2.2: split into normal and abnormal flows according to the labels
    labels = pd.read_csv(self.label_file).values
    normal_flows, abnormal_flows = split_normal_abnormal(flows, labels)
    # augment abnormal flows, capped by the 0.9-quantile of the normal flow durations
    max_interval = np.quantile([_get_flow_duration(pkts) for f, pkts in normal_flows], q=0.9)
    abnormal_flows = augment_flows(abnormal_flows, step=1, max_interval=max_interval)
    meta = {
        'normal_flows': normal_flows,
        'abnormal_flows': abnormal_flows,
        'normal_pcap': self.pcap_file,
        'abnormal_pcap': self.label_file,
        'direction': direction,
        'in_dir': in_dir
    }
    return meta
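# ---------------------------------------------------------------------------
# Self-contained sketch (not from the original source) of how the augmentation
# cap in get_unb_flows() above is derived: the 0.9-quantile of the normal flow
# durations, where a flow's duration is the gap between its first and last
# packet timestamps. The project's _get_flow_duration()/augment_flows()
# helpers may compute this differently.
import numpy as np

def _sketch_max_interval(normal_flow_times, q=0.9):
    """q-quantile of flow durations; each element of normal_flow_times is a sorted list of timestamps."""
    durations = [times[-1] - times[0] for times in normal_flow_times]
    return float(np.quantile(durations, q=q))

# e.g. _sketch_max_interval([[0.0, 1.2], [0.0, 3.5], [0.0, 10.0]])  # -> 8.7
# ---------------------------------------------------------------------------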