Example #1
def main():
    pcap_file = 'data/demo.pcap'
    pp = PCAP(pcap_file, flow_pkts_thres=2, verbose=10, random_state=RANDOM_STATE)

    # extract flows from pcap
    pp.pcap2flows()
    # label each flow using the ground-truth labels from the csv file
    label_file = 'data/demo.csv'
    pp.label_flows(label_file=label_file)

    # flows to subflows
    pp.flows2subflows(q_interval=0.9)

    # extract features from each flow given feat_type
    # feat_type in ['IAT', 'SIZE', 'STATS', 'SAMP_NUM', 'SAMP_SIZE']
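    # roughly: IAT = packet inter-arrival times, SIZE = packet sizes, STATS = aggregate flow
    # statistics, SAMP_NUM / SAMP_SIZE = packet counts / bytes per sampling subinterval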
    feat_type = 'IAT'
    print(f'feat_type: {feat_type}')
    pp.flow2features(feat_type, fft=False, header=False)

    # dump data to disk
    X, y = pp.features, pp.labels
    out_dir = os.path.join('out', os.path.dirname(pcap_file))
    dump((X, y), out_file=f'{out_dir}/demo_{feat_type}.dat')

    print(pp.features.shape, pp.pcap2flows.tot_time, pp.flows2subflows.tot_time, pp.flow2features.tot_time)
Example #2
def main(is_default_params=True):
    res = {}
    tot = len(DATASETS.keys()) * len(MODELS.keys())
    i = 1
    for dataset in DATASETS.keys():
        dataset_res = {}
        for model in MODELS.keys():
            try:
                lg.info(
                    f'\n\n***{i}/{tot}:{dataset}_{FEATURE}-{model}-default_params_{is_default_params}'
                )
                if is_default_params:
                    args = Args(dataset, model)
                    _res = offline_default_main(args.args)
                else:
                    args = Args(dataset, model)
                    _res = offline_best_main(args.args)
                dataset_res[model] = _res
            except Exception as e:
                msg = f'{dataset}-{model}-default_{is_default_params}: {e}'
                lg.error(msg)
                traceback.print_exc()
            i += 1
        res[dataset] = dataset_res

    out_file = os.path.join(OUT_DIR,
                            f'{FEATURE}-default_{is_default_params}.dat')
    dump(res, out_file)

    return res
Example #3
def main():
    # load data
    data_file = 'out/data/demo_IAT.dat'
    X, y = load(data_file)
    # split into train and test sets
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.33, random_state=RANDOM_STATE)
    print(
        f'X_train.shape: {X_train.shape}, X_test.shape: {X_test.shape}, y_train.shape: {y_train.shape}, '
        f'y_test.shape: {y_test.shape}')

    # model_name in ['OCSVM', 'KDE','IF', 'AE', 'GMM', 'PCA']
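    # i.e. one-class SVM, kernel density estimation, isolation forest, autoencoder,
    # Gaussian mixture model, and a PCA-based detector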
    model_name = 'OCSVM'
    print(f'model_name: {model_name}')
    # create detection model
    model = generate_model(model_name)

    ndm = MODEL(model,
                score_metric='auc',
                verbose=10,
                random_state=RANDOM_STATE)

    # train the model on the train set
    ndm.train(X_train)

    # evaluate the learned model
    ndm.test(X_test, y_test)

    # dump data to disk
    out_dir = os.path.dirname(data_file)
    dump((model, ndm.history),
         out_file=f'{out_dir}/{ndm.model_name}-results.dat')

    print(ndm.train.tot_time, ndm.test.tot_time, ndm.score)
Example #4
def get_correlation(in_dir='',
                    datasets='',
                    feature='SIZE',
                    header=True,
                    out_dir='',
                    out_file='.dat'):
    corr_results = {}
    for i, dataset in enumerate(datasets):
        in_file = os.path.join(in_dir, dataset, feature, f"header_{header}",
                               'Xy.dat')
        lg.debug(in_file)
        data = load(in_file)
        X_train, y_train, X_val, y_val, X_test, y_test = split_train_val_test(
            data['X'], data['y'], shuffle=True, random_state=RANDOM_STATE)
        # normalization
        ss, X_train, y_train, X_val, y_val, X_test, y_test = normalize(
            X_train, y_train, X_val, y_val, X_test, y_test)
        # step 2: get correlation
        dim = X_test.shape[1]
        if feature == 'IAT':
            # iat_dim + header_dim = dim, where header_dim = 8 + ttl_dim and ttl_dim == size_dim
            # => iat_dim + 8 + size_dim = iat_dim + 8 + (iat_dim + 1) = dim
            # => iat_dim = (dim - 9) // 2
            start_idx = (dim - 8 - 1) // 2
        elif feature == 'SIZE':
            # size_dim + header_dim = dim, where header_dim = 8 + size_dim
            # => size_dim = (dim - 8) // 2
            # header features: 8 TCP flags + TTL values (this layout only works for 'SIZE')
            start_idx = (dim - 8) // 2
        else:
            msg = f'Error: {feature}'
            raise NotImplementedError(msg)
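        # worked example (illustrative numbers, assuming the layout implied above): for 'SIZE' with
        # dim = 20, start_idx = (20 - 8) // 2 = 6, so columns 6..13 hold the 8 TCP-flag counts and
        # column 14 the first TTL; the loop below correlates those 9 columns with y_test.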
        corrs = []
        lg.debug(f'header_feature_start_idx: {start_idx}')
        for j in range(9):  # feature + header feature: 8 TCP flags + first TTL
            _corr = _get_each_correlation(X_test[:, start_idx + j], y_test)
            corrs.append(_corr)
        corr_results[(in_file, dataset, feature, X_test.shape)] = corrs

        _out_file = os.path.join(out_dir, dataset, 'correlation.dat')
        check_path(_out_file)
        dump(corrs, _out_file)
        print(_out_file)
    # save all results
    check_path(out_file)
    dump(corr_results, out_file)

    return out_file
Example #5
File: unb.py  Project: Learn-Live/odet
    def _generate_pcap(self):

        # step 1: obtain pcap and label
        if self.dataset_name == 'UNB/CICIDS_2017/pc_192.168.10.5' or self.dataset_name == 'UNB(PC1)':
            self.IP = '192.168.10.5'
            self.orig_flows = os.path.join(
                self.out_dir,
                f'orig_unb(pc1)_{self.direction}_flows-{self.IP}.dat')
        elif self.dataset_name == 'UNB/CICIDS_2017/pc_192.168.10.8' or self.dataset_name == 'UNB(PC2)':
            self.IP = '192.168.10.8'
            self.orig_flows = os.path.join(
                self.out_dir,
                f'orig_unb(pc2)_{self.direction}_flows-{self.IP}.dat')
        elif self.dataset_name == 'UNB/CICIDS_2017/pc_192.168.10.9' or self.dataset_name == 'UNB(PC3)':
            self.IP = '192.168.10.9'
            self.orig_flows = os.path.join(
                self.out_dir,
                f'orig_unb(pc3)_{self.direction}_flows-{self.IP}.dat')
        elif self.dataset_name == 'UNB/CICIDS_2017/pc_192.168.10.14' or self.dataset_name == 'UNB(PC4)':
            self.IP = '192.168.10.14'
            self.orig_flows = os.path.join(
                self.out_dir,
                f'orig_unb(pc4)_{self.direction}_flows-{self.IP}.dat')
        elif self.dataset_name == 'UNB/CICIDS_2017/pc_192.168.10.15' or self.dataset_name == 'UNB(PC5)':
            self.IP = '192.168.10.15'
            self.orig_flows = os.path.join(
                self.out_dir,
                f'orig_unb(pc5)_{self.direction}_flows-{self.IP}.dat')
        elif self.dataset_name == 'DEMO_IDS/DS-srcIP_192.168.10.5':
            self.IP = '192.168.10.5'
            self.orig_flows = os.path.join(
                self.out_dir,
                f'orig_demo_{self.direction}_flows-{self.IP}.dat')
        else:
            raise ValueError('dataset does not exist.')

        remove_file(self.Xy_file, self.overwrite)
        if not os.path.exists(self.orig_flows):
            lg.warning(f'{self.orig_flows} does not exist.')
            check_path(self.orig_flows)
            meta = self.get_unb_flows(in_dir=f'../Datasets',
                                      direction=self.direction)
            dump(meta, out_file=self.orig_flows)
            lg.debug(f'in_dir (pcaps): {meta["in_dir"]}, direction: {meta["direction"]}')
            lg.debug(f'normal_pcap: {len(meta["normal_pcap"])}, normal_flows: {len(meta["normal_flows"])}')
            lg.debug(f'abnormal_pcap: {len(meta["abnormal_pcap"])}, abnormal_flows: {len(meta["abnormal_flows"])}')
        else:
            pass
Example #6
	def _generate_pcap(self):
		# preprocess the pcap and labels based on the original pcap and label files
		if self.dataset_name == 'MAWI/WIDE_2019/pc_202.171.168.50' or self.dataset_name == 'MAWI':
			# "http://mawi.wide.ad.jp/mawi/samplepoint-F/2019/201912071400.html"
			self.IP = '202.171.168.50'
			self.orig_flows = os.path.join(self.out_dir, f'mawi_{self.direction}_flows-{self.IP}.dat')
			remove_file(self.orig_flows, self.overwrite)
			if not os.path.exists(self.orig_flows):
				lg.warning(f'{self.orig_flows} does not exist.')
				check_path(self.orig_flows)
				meta = self.get_mawi_flows(in_dir=f'../Datasets', direction=self.direction)
				dump(meta, out_file=self.orig_flows)
				lg.debug(f'in_dir (pcaps): ' + meta['in_dir'] + ', direction: ' + meta['direction'])
				lg.debug(f'normal_pcap: ' + str(len(meta['normal_pcap'])) + ', normal_flows: '
				         + str(len(meta['normal_flows'])))
				lg.debug(f'abnormal_pcap: ' + str(len(meta['abnormal_pcap'])) + ', abnormal_flows: '
				         + str(len(meta['abnormal_flows'])))

		else:
			raise ValueError('dataset does not exist.')
Example #7
    def generate(self):
        if os.path.exists(self.Xy_file):
            self.X, self.y = load(self.Xy_file)
        else:
            q_interval = 0.9
            # pcap to flows
            flows = self.pcap2flows(self.pcap_file)

            # flows to subflows
            labels = [1] * len(flows)
            durations = [_get_flow_duration(pkts) for fid, pkts in flows]
            interval = _get_split_interval(durations, q_interval=q_interval)
            subflows, labels = self.flow2subflows(flows,
                                                  interval=interval,
                                                  labels=labels)

            # get dimension
            normal_flows = subflows
            num_pkts = [len(pkts)
                        for fid, pkts in normal_flows]  # only on normal flows
            dim = int(np.floor(np.quantile(
                num_pkts,
                q_interval)))  # use the same q_interval to get the dimension
            lg.info(f'dim={dim}')
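            # e.g. (illustrative): if the 0.9-quantile of num_pkts is 12.7, dim = floor(12.7) = 12,
            # and self.fix_feature(...) below fixes every feature vector to that length
            # (presumably by truncating or zero-padding)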

            # flows to features
            features, fids = self.flow2features(subflows,
                                                name=self.feature_name)

            # fix the feature size
            features = self.fix_feature(features, dim=dim)

            self.X = features
            self.y = np.asarray([0] * len(features))

            # save data to disk
            check_path(os.path.dirname(self.Xy_file))
            dump((self.X, self.y), out_file=self.Xy_file)

        return self.X, self.y
Example #8
    def _generate_pcap(self):
        # preprocess the pcap and labels based on the original pcap and label files
        if self.dataset_name == 'CTU/IOT_2017/pc_192.168.1.196' or self.dataset_name == 'CTU':
            self.IP = '192.168.1.196'
            self.orig_flows = os.path.join(
                self.out_dir, f'ctu_{self.direction}_flows-{self.IP}.dat')
            remove_file(self.orig_flows, self.overwrite)
            if not os.path.exists(self.orig_flows):
                lg.warning(f'{self.orig_flows} does not exist.')
                check_path(self.orig_flows)
                meta = self.get_ctu_flows(in_dir=f'../Datasets',
                                          direction=self.direction)
                dump(meta, out_file=self.orig_flows)
                lg.debug(f'in_dir (pcaps): {meta["in_dir"]}, direction: {meta["direction"]}')
                lg.debug(f'normal_pcap: {len(meta["normal_pcap"])}, normal_flows: {len(meta["normal_flows"])}')
                lg.debug(f'abnormal_pcap: {len(meta["abnormal_pcap"])}, abnormal_flows: {len(meta["abnormal_flows"])}')

        else:
            raise ValueError('dataset does not exist.')
Example #9
    def _generate_flows(self):
        self.subflows_file = os.path.join(self.out_dir,
                                          'normal_abnormal_subflows.dat')
        remove_file(self.subflows_file, self.overwrite)
        if os.path.exists(self.subflows_file):
            return load(self.subflows_file)

        # step 2: extract flows from pcap
        ##############################################################################################
        meta = load(self.orig_flows)
        normal_flows, abnormal_flows = meta['normal_flows'], meta[
            'abnormal_flows']
        lg.debug(
            f'original normal flows: {len(normal_flows)} and abnormal flows: {len(abnormal_flows)}'
        )
        qs = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 1]
        len_stat = np.quantile([len(pkts) for f, pkts in normal_flows], q=qs)
        lg.debug(
            f'flows: {len(normal_flows)}, length statistic: {len_stat}, when q = {qs}'
        )
        meta = {
            'flows': normal_flows,
            'len_stat': (len_stat, qs),
            'normal_flows': normal_flows,
            'abnormal_flows': abnormal_flows
        }
        dump(meta,
             out_file=os.path.join(self.out_dir, 'normal_abnormal_flows.dat'))

        # step 2.2: only get normal flow durations
        self.flows_durations = [
            _get_flow_duration(pkts) for (fids, pkts) in normal_flows
        ]
        normal_durations_stat = np.quantile(self.flows_durations, q=qs)
        lg.debug(f'normal_durations_stat: {normal_durations_stat}')
        self.subflow_interval = np.quantile(
            self.flows_durations,
            q=self.q_flow_dur)  # the q_flow_dur quantile of flow durations
        lg.debug(
            f'---subflow_interval: {self.subflow_interval}, q_flow_dur: {self.q_flow_dur}'
        )
        # step 2.3 get subflows
        normal_flows, _ = _flows2subflows(normal_flows,
                                          interval=self.subflow_interval,
                                          labels=['0'] * len(normal_flows))
        abnormal_flows, _ = _flows2subflows(abnormal_flows,
                                            interval=self.subflow_interval,
                                            labels=['1'] * len(abnormal_flows))
        lg.debug(
            f'normal_flows: {len(normal_flows)}, and abnormal_flows: {len(abnormal_flows)} '
            f'with interval: {self.subflow_interval} and q: {self.q_flow_dur}')
        meta = {
            'normal_flows_durations': self.flows_durations,
            'normal_durations_stat': (normal_durations_stat, qs),
            'subflow_interval': self.subflow_interval,
            'q_flow_dur': self.q_flow_dur,
            'normal_flows': normal_flows,
            'abnormal_flows': abnormal_flows
        }
        dump(meta, out_file=self.subflows_file)

        # only return subflows
        return meta
Example #10
    def _generate_features(self, normal_flows, abnormal_flows):
        # step 3: flows to features.
        # only on normal flows
        normal_flow_lengths = [len(pkts) for fid, pkts in normal_flows]
        qs = [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95, 1]
        normal_lengths_stat = np.quantile(normal_flow_lengths, q=qs)
        lg.debug(f'normal_lengths_stat: {normal_lengths_stat}, where q = {qs}')
        self.dim = int(
            np.floor(np.quantile(normal_flow_lengths, self.q_flow_dur)))
        lg.info(f'dim(SIZE) = {self.dim}')

        self.X = []
        self.y = []

        if self.header:
            header_features, header_fids = _get_header(normal_flows)
            header_dim = int(
                np.quantile([len(v) for v in header_features],
                            q=self.q_flow_dur))
            lg.info(f'header_dim: {header_dim}')
        else:
            header_dim = None

        if 'SAMP' in self.feature_name:
            normal_features, normal_fids = self.flow2features(
                normal_flows,
                name=self.feature_name,
                dim=self.dim,
                header=self.header,
                header_dim=header_dim)
            abnormal_features, abnormal_fids = self.flow2features(
                abnormal_flows,
                name=self.feature_name,
                dim=self.dim,
                header=self.header,
                header_dim=header_dim)

            for q in normal_features.keys():
                X_ = list(
                    normal_features[q][0])  # (features, fid, sampling_rate_)
                y_ = [0] * len(normal_features[q][0])
                X_.extend(list(abnormal_features[q][0]))
                y_.extend([1] * len(abnormal_features[q][0]))
                self.X.append(np.asarray(X_))
                self.y.append(np.asarray(y_))

            # save data to disk
            check_path(self.Xy_file)
            meta = {
                'X': self.X,
                'y': self.y,
                'normal_flow_lengths':
                (normal_flow_lengths, normal_lengths_stat),
                'dim': self.dim,
                'q_flow_dur': self.q_flow_dur
            }
            dump(meta, out_file=self.Xy_file)
            # save feature data as csv
            csv_file = os.path.splitext(self.Xy_file)[0] + '.csv'
        # np.savetxt(csv_file, np.concatenate([self.X, self.y[..., np.newaxis]], axis=1), delimiter=',')
        else:
            for flows, label in zip([normal_flows, abnormal_flows], [0, 1]):
                features, fids = self.flow2features(flows,
                                                    name=self.feature_name,
                                                    dim=self.dim,
                                                    header=self.header,
                                                    header_dim=header_dim)
                self.X.extend(features)
                self.y.extend([label] * len(features))

            # save data to disk
            check_path(self.Xy_file)
            self.X = np.asarray(self.X)
            self.y = np.asarray(self.y)
            meta = {
                'X': self.X,
                'y': self.y,
                'normal_flow_lengths':
                (normal_flow_lengths, normal_lengths_stat),
                'dim': self.dim,
                'q_flow_dur': self.q_flow_dur
            }
            dump(meta, out_file=self.Xy_file)
            # save feature data as csv
            csv_file = os.path.splitext(self.Xy_file)[0] + '.csv'
            np.savetxt(csv_file,
                       np.concatenate([self.X, self.y[..., np.newaxis]],
                                      axis=1),
                       delimiter=',')
        return meta
Example #11
def main(args=None, test=False):
    """ Get the result according to the given parameters

	Parameters
	----------
	args
	test: boolean
		if we evaluate the built model on val set or test set
	Returns
	-------
	history: dict
		Return the best result on 'SAMP' related feature. Otherwise, return the result
	"""
    try:
        lg.debug(args)
        out_dir = os.path.join(args.out_dir, args.direction, args.dataset,
                               args.feature, f'header_{args.header}',
                               args.model, f'tuning_{args.tuning}')

        ###############################################################################################################
        """ 1.1 Parse data and extract features
			
		"""
        lg.info(f'\n--- 1.1 Parse data')
        data = Data(dataset_name=args.dataset,
                    direction=args.direction,
                    feature_name=args.feature,
                    header=args.header,
                    overwrite=args.overwrite,
                    random_state=RANDOM_STATE)
        data.generate()

        if 'SAMP' in args.feature:
            best = {'score': 0, 'model': None}
            for i, (X, y) in enumerate(zip(data.X, data.y)):
                lg.debug(f'SAMP_{i}')
                try:
                    res_, data_ = _single_main(args, X, y, test=test)
                except Exception as e:
                    lg.error(f'Error: {e}. SAMP_{i}')
                    continue
                # get the best results on SAMP data
                if res_['score'] > best['score']:
                    best['score'] = res_['score']
                    best['model'] = copy.deepcopy(res_)
                    best['data'] = copy.deepcopy(data_)
            history = best
        else:
            X, y = data.X, data.y
            res_, data_ = _single_main(args, X, y, test=test)
            history = {'score': res_['score'], 'model': res_, 'data': data_}

    except Exception as e:
        traceback.print_exc()
        history = {
            'score': 0,
            'model': {},
            'data': (None, None, None, None, None, None)
        }

    ###############################################################################################################
    """ 3. Dump the result to disk

	"""
    lg.info(f'\n--- 3. Save the result')
    out_file = os.path.join(out_dir, f'res.dat')
    check_path(out_file)
    dump(history, out_file=out_file)
    out_file = os.path.splitext(out_file)[0] + '.csv'
    remove_file(out_file, overwrite=OVERWRITE)
    save2txt(history, out_file)
    lg.info(f'res_file: {out_file}')

    return history
Example #12
def main_no_tuning_vs_tuning(args=None):
    """ get results with default and best parameters according to the args.

	Parameters
	----------
	args: given parameters

	Returns
	-------
	history: dict
		store all the results in a dictionary
	"""
    # 1. Get the dimension of the dataset. Some algorithms (e.g., AE) need the input dimension.
    data = Data(dataset_name=args.dataset,
                direction=args.direction,
                feature_name=args.feature,
                header=args.header,
                overwrite=args.overwrite,
                random_state=RANDOM_STATE)
    data.generate()
    if 'SAMP' in args.feature:
        X = data.X[0]
    else:
        X = data.X

    # 2. Get the results with the given model
    if args.model == 'OCSVM':
        if args.tuning:
            qs = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]
        else:
            qs = [0.3]
        history = {}  # store the best result, model parameters, and the best model (dict)
        best = {'score': 0, 'model': None}
        lg.debug(f'Tuning: q = {qs}')
        for q in qs:
            args.model_params = {'q': q}
            # get results on the validation set
            history_ = main(args, test=False)
            score_ = history_['score']
            if score_ > best['score']:
                best['score'] = score_
                best['q'] = q
                best['model'] = copy.deepcopy(history_)
            history[q] = history_

        # get the final result on the test set.
        args.model_params = {'q': best['q']}
        best['model'] = main(args, test=True)
        best['score'] = best['model']['score']
        history['best'] = best
    elif args.model == 'GMM':
        if args.tuning:
            n_components_arr = [1, 2, 5, 10, 15, 20, 25, 30, 35, 40]
        else:
            n_components_arr = ['quickshift']
        history = {}
        best = {'score': 0, 'model': None}
        lg.debug(f'Tuning: n_components_arr = {n_components_arr}')
        for n_components in n_components_arr:
            args.model_params = {'n_components': n_components}
            history_ = main(args, test=False)
            score_ = history_['score']
            if score_ > best['score']:
                best['score'] = score_
                best['n_components'] = n_components
                best['model'] = copy.deepcopy(history_)
            history[n_components] = history_

        # get the final result on the test set.
        args.model_params = {'n_components': best['n_components']}
        best['model'] = main(args, test=True)
        best['score'] = best['model']['score']
        history['best'] = best

    elif args.model == 'IF':
        if args.tuning:
            n_estimators_arr = [
                int(v)
                for v in list(np.linspace(30, 300, num=10, endpoint=True))
            ]
        else:
            n_estimators_arr = [100]
        history = {}
        best = {'score': 0, 'model': None}
        lg.debug(f'Tuning: n_estimators_arr = {n_estimators_arr}')
        for n_estimators in n_estimators_arr:
            args.model_params = {'n_estimators': n_estimators}
            history_ = main(args, test=False)
            score_ = history_['score']
            if score_ > best['score']:
                best['score'] = score_
                best['n_estimators'] = n_estimators
                best['model'] = copy.deepcopy(history_)
            history[n_estimators] = history_

        # get the final result on the test set.
        args.model_params = {'n_estimators': best['n_estimators']}
        best['model'] = main(args, test=True)
        best['score'] = best['model']['score']
        history['best'] = best

    elif args.model == 'PCA':
        if args.tuning:
            n_components_arr = [
                int(v) for v in list(
                    np.linspace(1, min(X.shape), num=10, endpoint=False))
            ]
        else:
            n_components_arr = ['mle']
        history = {}
        best = {'score': 0, 'model': None}
        lg.debug(f'Tuning: n_components_arr = {n_components_arr}')
        for n_components in n_components_arr:
            args.model_params = {'n_components': n_components}
            history_ = main(args, test=False)
            score_ = history_['score']
            if score_ > best['score']:
                best['score'] = score_
                best['n_components'] = n_components
                best['model'] = copy.deepcopy(history_)
            history[n_components] = history_

        # get the final result on the test set.
        args.model_params = {'n_components': best['n_components']}
        best['model'] = main(args, test=True)
        best['score'] = best['model']['score']
        history['best'] = best
    elif args.model == 'KDE':
        if args.tuning:
            qs = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.95]
        else:
            qs = [0.3]
        history = {}
        best = {'score': 0, 'model': None}
        lg.debug(f'Tuning: q = {qs}')
        for q in qs:
            args.model_params = {'q': q}
            history_ = main(args, test=False)
            score_ = history_['score']
            if score_ > best['score']:
                best['score'] = score_
                best['q'] = q
                best['model'] = copy.deepcopy(history_)
            history[q] = history_
        # get the final result on the test set.
        args.model_params = {'q': best['q']}
        best['model'] = main(args, test=True)
        best['score'] = best['model']['score']
        history['best'] = best

    elif args.model == 'AE':
        if args.tuning:
            feat_dim = X.shape[1]

            def get_AE_parameters(d, num=10):
                latent_sizes = []
                for i in range(num):
                    v = np.ceil(1 + i * (d - 2) / 9).astype(int)
                    if v not in latent_sizes:
                        latent_sizes.append(v)

                hidden_sizes = [
                    min((d - 1),
                        np.ceil(2 * v).astype(int)) for v in latent_sizes
                ]

                hidden_neurons = []
                for i, (hid, lat) in enumerate(zip(hidden_sizes,
                                                   latent_sizes)):
                    v = [d, hid, lat, hid, d]
                    hidden_neurons.append(v)
                return hidden_neurons
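            # worked example (illustrative): get_AE_parameters(10) would give latent_sizes = [1, 2, ..., 9],
            # hidden_sizes = [2, 4, 6, 8, 9, 9, 9, 9, 9], i.e. architectures such as
            # [10, 2, 1, 2, 10], [10, 4, 2, 4, 10], ..., [10, 9, 9, 9, 10]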

            hidden_neurons_arr = get_AE_parameters(feat_dim, num=10)
        else:
            feat_dim = X.shape[1]
            latent_dim = np.ceil(feat_dim / 2).astype(int)
            hid = min((feat_dim - 1), np.ceil(2 * latent_dim).astype(int))
            hidden_neurons = [feat_dim, hid, latent_dim, hid, feat_dim]
            hidden_neurons_arr = [hidden_neurons]

        history = {}
        best = {'score': 0, 'model': None}
        lg.debug(f'Tuning: hidden_neurons = {hidden_neurons_arr}')
        for hidden_neurons in hidden_neurons_arr:
            args.model_params = {'hidden_neurons': hidden_neurons}
            history_ = main(args, test=False)
            score_ = history_['score']
            if score_ > best['score']:
                best['score'] = score_
                best['hidden_neurons'] = hidden_neurons
                best['model'] = copy.deepcopy(history_)
            history[tuple(hidden_neurons)] = history_
        # get the final result on the test set.
        args.model_params = {'hidden_neurons': best['hidden_neurons']}
        best['model'] = main(args, test=True)
        best['score'] = best['model']['score']
        history['best'] = best

    else:
        msg = f'{args.model}'
        raise NotImplementedError(msg)
    # lg.info(f'\n*** best: ' + str(history['best']))
    out_file = os.path.join(args.out_dir, args.direction, args.dataset,
                            args.feature, f'header_{args.header}', args.model,
                            f'tuning_{args.tuning}', 'res.dat')
    check_path(out_file)
    dump(history, out_file)

    return history
Example #13
def main():
    res = []
    res_file = 'res2'
    is_parallel = False
    if is_parallel:

        def set_args(dataset, feature, header, model, tuning):
            args = parser()
            args.dataset = dataset
            args.feature = feature
            args.header = header
            args.model = model
            args.tuning = tuning
            print(args)
            return args

        # with backend='loky', the total time is less than the serial run; with backend='multiprocessing',
        # the time cost is very similar to the serial run.
        _res = []
        with Parallel(n_jobs=20, backend='loky') as parallel:
            _res = parallel(
                delayed(_representation.main_no_tuning_vs_tuning)  # delayed
                (set_args(dataset, feature, header, model, tuning))  # params
                for dataset, feature, header, model, tuning in list(
                    itertools.product(DATASETS, FEATURES, HEADER, MODELS,
                                      TUNING))  # for
            )  # parallel
        # reorganize results
        res = []
        for history, (dataset, feature, header, model, tuning) in zip(
                _res,
                list(
                    itertools.product(DATASETS, FEATURES, HEADER, MODELS,
                                      TUNING))):
            res.append([
                dataset, feature, f'header_{header}', model,
                f'tuning_{tuning}', history
            ])
        out_file = f'examples/representation/out/src/{DATE}/{res_file}.dat'
    else:  # without parallel
        for dataset in DATASETS:
            for feature in FEATURES:
                for header in HEADER:
                    for model in MODELS:
                        for tuning in TUNING:
                            try:
                                print(
                                    f'*** {dataset}-{feature}-header_{header}, {model}-tuning_{tuning}'
                                )
                                args = parser()
                                args.dataset = dataset
                                args.feature = feature
                                args.header = header
                                args.model = model
                                args.tuning = tuning
                                history = _representation.main_no_tuning_vs_tuning(
                                    args)
                                res_ = [
                                    dataset, feature, f'header_{header}',
                                    model, f'tuning_{tuning}', history
                                ]
                                res.append(res_)
                                # save immediately to avoid losing any result
                                out_file = f'{args.out_dir}/{args.direction}/~{res_file}.dat'
                                dump(res, out_file)
                                save2txt(res,
                                         os.path.splitext(out_file)[0] +
                                         '.csv',
                                         delimiter=',')
                            except Exception as e:
                                lg.error(e)

        out_file = f'{args.out_dir}/{args.direction}/{DATE}/{res_file}.dat'

    check_path(out_file)
    dump(res, out_file)
    save2txt(res, os.path.splitext(out_file)[0] + '.csv', delimiter=',')
    lg.info(f'final result: {out_file}')
Example #14
File: uchi.py  Project: Learn-Live/odet
 def _generate_pcap(self):
     regenerate = False
     # step 1: obtain pcap and label
     if self.dataset_name == 'UCHI(SFRIG_2021)':
         self.IP = 'mac_70:2c:1f:39:25:6e'  # the IP for the new data changes over time, so use the MAC address instead
         self.orig_flows = os.path.join(
             self.out_dir,
             f'iot2021-orig_sfrig_{self.direction}_flows-{self.IP}.dat')
         remove_file(self.Xy_file, self.overwrite)
         if not os.path.exists(self.orig_flows):
             lg.warning(f'{self.orig_flows} does not exist.')
             check_path(self.orig_flows)
             # hard-coded path (not ideal)
             meta = get_iot2021_flows(
                 in_dir=f'../Datasets/UCHI/IOT_2021/data-clean/refrigerator',
                 dataset_name=self.dataset_name,
                 out_dir=self.out_dir,
                 direction=self.direction)
             dump(meta, out_file=self.orig_flows)
             regenerate = True
         else:
             pass
     elif self.dataset_name == 'UCHI/IOT_2019/ghome_192.168.143.20' or self.dataset_name == 'UCHI(GHOME_2019)':
         self.IP = '192.168.143.20'
         self.orig_flows = os.path.join(
             self.out_dir,
             f'ghome2019-orig_sfrig_{self.direction}_flows-{self.IP}.dat')
         remove_file(self.Xy_file, self.overwrite)
         if not os.path.exists(self.orig_flows):
             lg.warning(f'{self.orig_flows} does not exist.')
             check_path(self.orig_flows)
             meta = get_ghome2019_flows(
                 in_dir=f'../Datasets/UCHI/IOT_2019/',
                 dataset_name='ghome_192.168.143.20',
                 out_dir=self.out_dir,
                 direction=self.direction)
             dump(meta, out_file=self.orig_flows)
             regenerate = True
         else:
             pass
     elif self.dataset_name == 'UCHI/IOT_2019/scam_192.168.143.42' or self.dataset_name == 'UCHI(SCAM_2019)':
         self.IP = '192.168.143.42'
         self.orig_flows = os.path.join(
             self.out_dir,
             f'scam2019-orig_scam_{self.direction}_flows-{self.IP}.dat')
         remove_file(self.Xy_file, self.overwrite)
         if not os.path.exists(self.orig_flows):
             lg.warning(f'{self.orig_flows} does not exist.')
             check_path(self.orig_flows)
             meta = get_scam2019_flows(in_dir=f'../Datasets/UCHI/IOT_2019/',
                                       dataset_name='scam_192.168.143.42',
                                       out_dir=self.out_dir,
                                       direction=self.direction)
             dump(meta, out_file=self.orig_flows)
             regenerate = True
         else:
             pass
     elif self.dataset_name == 'UCHI/IOT_2019/bstch_192.168.143.48' or self.dataset_name == 'UCHI(BSTCH_2019)':
         self.IP = '192.168.143.48'
         self.orig_flows = os.path.join(
             self.out_dir,
             f'bstch2019-orig_bstch_{self.direction}_flows-{self.IP}.dat')
         remove_file(self.Xy_file, self.overwrite)
         if not os.path.exists(self.orig_flows):
             lg.warning(f'{self.orig_flows} does not exist.')
             check_path(self.orig_flows)
             meta = get_bstch2019_flows(
                 in_dir=f'../Datasets/UCHI/IOT_2019/',
                 dataset_name='bstch_192.168.143.48',
                 out_dir=self.out_dir,
                 direction=self.direction)
             dump(meta, out_file=self.orig_flows)
             regenerate = True
         else:
             pass
     elif self.dataset_name == 'UCHI/IOT_2019/smtv_10.42.0.1' or self.dataset_name == 'UCHI(SMTV_2019)':
         self.IP = '10.42.0.1'
         self.orig_flows = os.path.join(
             self.out_dir,
             f'smtv2019-orig_smtv_{self.direction}_flows-{self.IP}.dat')
         remove_file(self.Xy_file, self.overwrite)
         if not os.path.exists(self.orig_flows):
             lg.warning(f'{self.orig_flows} does not exist.')
             check_path(self.orig_flows)
             meta = get_smtv2019_flows(in_dir=f'../Datasets/UCHI/IOT_2019/',
                                       dataset_name='smtv_10.42.0.1',
                                       out_dir=self.out_dir,
                                       direction=self.direction)
             dump(meta, out_file=self.orig_flows)
             regenerate = True
         else:
             pass
     else:
         raise ValueError('dataset does not exist.')
Example #15
def _main():
	""" Main function

	Returns
	-------

	"""
	res = []
	out_file = f'{OUT_DIR}/src/{RESULT_DIR}/res.dat'
	is_parallel = False
	if is_parallel:  # with parallel
		def set_args(dataset, feature, header, model, tuning):
			args = parser()
			args.dataset = dataset
			args.feature = feature
			args.header = header
			args.model = model
			args.tuning = tuning
			lg.debug(args)
			return args

		# with backend='loky', the total time is less than the serial run; with backend='multiprocessing',
		# the time cost is very similar to the serial run.
		_res = []
		with Parallel(n_jobs=20, backend='loky') as parallel:
			_res = parallel(delayed(_representation.main_no_tuning_vs_tuning)  # delayed
			                (set_args(dataset, feature, header, model, tuning))  # params
			                for dataset, feature, header, model, tuning in
			                list(itertools.product(DATASETS, FEATURES, HEADER, MODELS, TUNING))  # for
			                )  # parallel
		# reorganize results
		res = []
		for history, (dataset, feature, header, model, tuning) in zip(_res, list(
				itertools.product(DATASETS, FEATURES, HEADER, MODELS, TUNING))):
			res.append([dataset, feature, f'header_{header}', model, f'tuning_{tuning}', history['best']])
	else:  # without parallel
		for dataset, feature, header, model, tuning in list(itertools.product(DATASETS,
		                                                                      FEATURES, HEADER, MODELS, TUNING)):
			try:
				lg.info(f'*** {dataset}-{feature}-header_{header}, {model}-tuning_{tuning}')
				args = parser()
				args.dataset = dataset
				args.feature = feature
				args.header = header
				args.model = model
				args.tuning = tuning
				args.overwrite = OVERWRITE
				history = _representation.main_no_tuning_vs_tuning(args)
				res.append([dataset, feature, f'header_{header}', model, f'tuning_{tuning}', history['best']])
				# save immediately to avoid losing any result.
				_out_file = f'{args.out_dir}/{args.direction}/{RESULT_DIR}/~res.csv'
				check_path(_out_file)
				save2txt(res, _out_file, delimiter=',')
			except Exception as e:
				lg.error(f'Error: {e}. [{dataset}, {feature}, {header}, {model}, {tuning}]')

	# save the final results: '.dat' and '.csv'
	check_path(out_file)
	dump(res, out_file)
	out_file = os.path.splitext(out_file)[0] + '.csv'
	remove_file(out_file, OVERWRITE)
	save2txt(res, out_file, delimiter=',')
	lg.info(f'final result: {out_file}')