def main():
    # load data
    data_file = 'out/data/demo_IAT.dat'
    X, y = load(data_file)

    # split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=RANDOM_STATE)
    print(f'X_train.shape: {X_train.shape}, X_test.shape: {X_test.shape}, '
          f'y_train.shape: {y_train.shape}, y_test.shape: {y_test.shape}')

    # model_name in ['OCSVM', 'KDE', 'IF', 'AE', 'GMM', 'PCA']
    model_name = 'OCSVM'
    print(f'model_name: {model_name}')

    # create the detection model
    model = generate_model(model_name)
    ndm = MODEL(model, score_metric='auc', verbose=10, random_state=RANDOM_STATE)

    # learn the model from the train set
    ndm.train(X_train)

    # evaluate the learned model on the test set
    ndm.test(X_test, y_test)

    # dump the results to disk
    out_dir = os.path.dirname(data_file)
    dump((model, ndm.history), out_file=f'{out_dir}/{ndm.model_name}-results.dat')

    print(ndm.train.tot_time, ndm.test.tot_time, ndm.score)
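# For reference, a minimal sketch of what a model factory like generate_model
# could look like, assuming scikit-learn backends. This is an illustration only
# (names, kernels, and hyperparameters are assumptions); the actual
# generate_model in this repo may configure the detectors differently.
from sklearn.svm import OneClassSVM
from sklearn.ensemble import IsolationForest
from sklearn.mixture import GaussianMixture


def generate_model_sketch(model_name, random_state=42):
    """Map a model name to a (hypothetical) unsupervised detector."""
    if model_name == 'OCSVM':
        return OneClassSVM(kernel='rbf', nu=0.5)
    elif model_name == 'IF':
        return IsolationForest(random_state=random_state)
    elif model_name == 'GMM':
        return GaussianMixture(n_components=1, random_state=random_state)
    else:
        raise NotImplementedError(model_name)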
def get_correlation(in_dir='', datasets='', feature='SIZE', header=True, out_dir='', out_file='.dat'):
    corr_results = {}
    for i, dataset in enumerate(datasets):
        in_file = os.path.join(in_dir, dataset, feature, f"header_{header}", 'Xy.dat')
        lg.debug(in_file)
        data = load(in_file)
        X_train, y_train, X_val, y_val, X_test, y_test = split_train_val_test(
            data['X'], data['y'], shuffle=True, random_state=RANDOM_STATE)
        # normalization
        ss, X_train, y_train, X_val, y_val, X_test, y_test = normalize(
            X_train, y_train, X_val, y_val, X_test, y_test)

        # 2. get the correlation
        dim = X_test.shape[1]
        if feature == 'IAT':
            # iat_dim + header_dim = dim, where header_dim = 8 + ttl_dim (i.e., size_dim)
            # => iat_dim + 8 + size_dim = iat_dim + 8 + (iat_dim + 1) = dim
            # => iat_dim = (dim - 9) // 2
            start_idx = (dim - 8 - 1) // 2
        elif feature == 'SIZE':
            # size_dim + header_dim = dim
            # size_dim + (8 + size_dim) = dim
            # => size_dim = (dim - 8) // 2
            # feature + header_feature: (8 tcp flags + TTL); only works for 'SIZE'
            start_idx = (dim - 8) // 2
        else:
            msg = f'Error: {feature}'
            raise NotImplementedError(msg)

        corrs = []
        lg.debug(f'header_feature_start_idx: {start_idx}')
        # feature + header_feature: (8 tcp flags + first TTL)
        for j in range(9):
            _corr = _get_each_correlation(X_test[:, start_idx + j], y_test)
            corrs.append(_corr)
        corr_results[(in_file, dataset, feature, X_test.shape)] = corrs

        _out_file = os.path.join(out_dir, dataset, 'correlation.dat')
        check_path(_out_file)
        dump(corrs, _out_file)
        print(_out_file)

    # save all results
    check_path(out_file)
    dump(corr_results, out_file)
    return out_file
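# The helper _get_each_correlation is not shown above. A minimal sketch, assuming
# it computes the Pearson correlation between one header-feature column and the
# labels; the real helper may handle constant columns or NaNs differently.
import numpy as np


def _get_each_correlation_sketch(x, y):
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    # np.corrcoef returns the 2x2 correlation matrix; take the off-diagonal entry
    return float(np.corrcoef(x, y)[0, 1])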
def generate(self):
    remove_file(self.Xy_file, self.overwrite)

    if os.path.exists(self.Xy_file):
        Xy_meta = load(self.Xy_file)
    else:
        if self.dataset_name in ['CTU']:
            self._generate_pcap()  # generate the pcap data
            flows_meta = self._generate_flows()  # normal_abnormal.dat
            # Xy (fixed-size feature data)
            Xy_meta = self._generate_features(flows_meta['normal_flows'],
                                              flows_meta['abnormal_flows'])
        else:
            msg = f'{self.dataset_name}'
            raise NotImplementedError(msg)

    self.X, self.y = Xy_meta['X'], Xy_meta['y']
    return Xy_meta
def main():
    root_dir = 'examples/representation'
    in_dir = f'{root_dir}/report/out/'
    corr_file = os.path.join(in_dir, 'correlation', 'correlation.dat')
    DATASETS = ['UNB(PC1)', 'CTU', 'MAWI', 'UCHI(SFRIG_2021)']
    feature = 'SIZE'  # the correlation code only works for SIZE
    header = True
    out_dir = f'{root_dir}/report/out/correlation'

    get_correlation(in_dir=f'{root_dir}/out/src', datasets=DATASETS, feature=feature,
                    header=header, out_dir=out_dir, out_file=corr_file)

    data = load(corr_file)
    out_file = os.path.join(out_dir, feature, f"header_{header}", 'correlation.pdf')
    plot_correlation_multi(data, out_file=out_file, show=True)
def report(in_file='gather.dat', delimiter=','):
    res = load(in_file)
    # write the report next to the gathered results
    out_file = os.path.join(os.path.dirname(in_file), 'report.csv')
    check_path(out_file)
    with open(out_file, 'w') as f:
        for header in HEADER:
            for tuning in TUNING:
                for feature in FEATURES:
                    for dataset in DATASETS:
                        for model in MODELS:
                            data = get_one_res(res, f'header_{header}', f'tuning_{tuning}',
                                               feature, dataset, model)
                            line = f'{delimiter}'.join(data) + '\n'
                            lg.debug(line)
                            f.write(line)
    lg.info(f'report: {out_file}')
    return out_file
def generate(self):
    remove_file(self.Xy_file, self.overwrite)

    if os.path.exists(self.Xy_file):
        Xy_meta = load(self.Xy_file)
    else:
        if self.dataset_name in [
                'UCHI(SFRIG_2021)', 'UCHI(SMTV_2019)', 'UCHI(GHOME_2019)',
                'UCHI(SCAM_2019)', 'UCHI(BSTCH_2019)'
        ]:
            self._generate_pcap()  # generate the pcap data
            flows_meta = self._generate_flows()  # normal_abnormal.dat
            # Xy (fixed-size feature data)
            Xy_meta = self._generate_features(flows_meta['normal_flows'],
                                              flows_meta['abnormal_flows'])
        else:
            msg = f'{self.dataset_name}'
            raise NotImplementedError(msg)

    self.X, self.y = Xy_meta['X'], Xy_meta['y']
    self.Xy_meta = Xy_meta
    return self.Xy_meta
def generate(self):
    if os.path.exists(self.Xy_file):
        self.X, self.y = load(self.Xy_file)
    else:
        q_interval = 0.9

        # pcap to flows
        flows = self.pcap2flows(self.pcap_file)

        # flows to subflows
        labels = [1] * len(flows)
        durations = [_get_flow_duration(pkts) for fid, pkts in flows]
        interval = _get_split_interval(durations, q_interval=q_interval)
        subflows, labels = self.flow2subflows(flows, interval=interval, labels=labels)

        # get the feature dimension from the normal flows only
        normal_flows = subflows
        num_pkts = [len(pkts) for fid, pkts in normal_flows]
        # use the same q_interval to get the dimension
        dim = int(np.floor(np.quantile(num_pkts, q_interval)))
        lg.info(f'dim={dim}')

        # flows to features
        features, fids = self.flow2features(subflows, name=self.feature_name)

        # fix the feature size
        features = self.fix_feature(features, dim=dim)

        self.X = features
        self.y = np.asarray([0] * len(features))

        # save data to disk
        check_path(os.path.dirname(self.Xy_file))
        dump((self.X, self.y), out_file=self.Xy_file)

    return self.X, self.y
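# fix_feature is not shown above. A minimal sketch, assuming it simply truncates
# or zero-pads each per-flow feature vector to the fixed dimension `dim` chosen
# from the packet-count quantile; the actual implementation may differ.
import numpy as np


def fix_feature_sketch(features, dim):
    fixed = []
    for v in features:
        v = list(v)[:dim]                  # truncate vectors longer than dim
        v = v + [0.0] * (dim - len(v))     # zero-pad vectors shorter than dim
        fixed.append(v)
    return np.asarray(fixed, dtype=float)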
def _generate_flows(self):
    self.subflows_file = os.path.join(self.out_dir, 'normal_abnormal_subflows.dat')
    remove_file(self.subflows_file, self.overwrite)
    if os.path.exists(self.subflows_file):
        return load(self.subflows_file)

    # step 2: extract flows from the pcap
    ##############################################################################################
    meta = load(self.orig_flows)
    normal_flows, abnormal_flows = meta['normal_flows'], meta['abnormal_flows']
    lg.debug(f'original normal flows: {len(normal_flows)} and abnormal flows: {len(abnormal_flows)}')

    qs = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 1]
    len_stat = np.quantile([len(pkts) for f, pkts in normal_flows], q=qs)
    lg.debug(f'flows: {len(normal_flows)}, length statistic: {len_stat}, when q = {qs}')
    meta = {
        'flows': normal_flows,
        'len_stat': (len_stat, qs),
        'normal_flows': normal_flows,
        'abnormal_flows': abnormal_flows
    }
    dump(meta, out_file=os.path.join(self.out_dir, 'normal_abnormal_flows.dat'))

    # step 2.2: get the durations of the normal flows only
    self.flows_durations = [_get_flow_duration(pkts) for (fids, pkts) in normal_flows]
    normal_durations_stat = np.quantile(self.flows_durations, q=qs)
    lg.debug(f'normal_durations_stat: {normal_durations_stat}')
    self.subflow_interval = np.quantile(self.flows_durations,
                                        q=self.q_flow_dur)  # e.g., the median of flow durations
    lg.debug(f'---subflow_interval: {self.subflow_interval}, q_flow_dur: {self.q_flow_dur}')

    # step 2.3: get subflows
    normal_flows, _ = _flows2subflows(normal_flows,
                                      interval=self.subflow_interval,
                                      labels=['0'] * len(normal_flows))
    abnormal_flows, _ = _flows2subflows(abnormal_flows,
                                        interval=self.subflow_interval,
                                        labels=['1'] * len(abnormal_flows))
    lg.debug(f'normal_flows: {len(normal_flows)}, and abnormal_flows: {len(abnormal_flows)} '
             f'with interval: {self.subflow_interval} and q: {self.q_flow_dur}')
    meta = {
        'normal_flows_durations': self.flows_durations,
        'normal_durations_stat': (normal_durations_stat, qs),
        'subflow_interval': self.subflow_interval,
        'q_flow_dur': self.q_flow_dur,
        'normal_flows': normal_flows,
        'abnormal_flows': abnormal_flows
    }
    dump(meta, out_file=self.subflows_file)

    # only return subflows
    return meta
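# _flows2subflows is not shown above. A minimal sketch of one plausible scheme
# (start a new subflow once the current one lasts longer than `interval` seconds),
# assuming scapy-style packets with a `.time` attribute; the actual helper may
# split flows by a different rule.
def _flows2subflows_sketch(flows, interval, labels):
    subflows, sub_labels = [], []
    for (fid, pkts), label in zip(flows, labels):
        start = float(pkts[0].time)
        current = []
        for pkt in pkts:
            if float(pkt.time) - start > interval:
                # close the current subflow and start a new one at this packet
                subflows.append((fid, current))
                sub_labels.append(label)
                current = []
                start = float(pkt.time)
            current.append(pkt)
        if current:
            subflows.append((fid, current))
            sub_labels.append(label)
    return subflows, sub_labels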
# dir() lists the names in the current scope or the attributes of an object.
print(dir())

a = 10
print(dir(a))


class DEMO:

    def __init__(self):
        self.val = 10
        self._val = 10    # single underscore: conventionally "internal"
        self.__val = 10   # double underscore: name-mangled to _DEMO__val


a = DEMO()
print(dir(a))
# print(a.__val)         # raises AttributeError: name mangling hides __val outside the class
print(a._DEMO__val)      # the mangled name is still accessible

from odet.utils.tool import load

in_file = 'examples/data/pc_192.168.10.5_AGMT.pcap-subflow_interval=None_q_flow_duration=0.9-all.dat'
data = load(in_file)
# print(data)