예제 #1
0
    def _setup(self, cfg):
        """Build the dataset for this trial's date split and a Supervised model.

        `cfg` is the trial configuration dict; `__trial_index__` selects which
        (train, val) date pair of `cfg['train_dates']` this trial uses.
        """
        # Placeholders for evaluation outputs, populated by later steps.
        self.test_labels = None
        self.val_labels = None
        self.val_scores = None
        self.test_scores = None

        # Resolve the date split assigned to this particular trial.
        idx = cfg['__trial_index__']
        train_split, val_split = cfg['train_dates'][idx]

        self.dataset = CICFlowADDataset(
            root=os.path.abspath(cfg['data_path']),
            n_known_outlier_classes=1,
            train_dates=cfg['period'][train_split],
            val_dates=cfg['period'][val_split],
            test_dates=cfg['test_dates'],
            shuffle=True)

        self.model = Supervised()
        trainer_opts = dict(optimizer_name=cfg['optimizer_name'],
                            lr=cfg['lr'],
                            n_epochs=cfg['n_epochs'],
                            lr_milestones=cfg['lr_milestone'],
                            batch_size=cfg['batch_size'],
                            weight_decay=cfg['weight_decay'],
                            device=cfg['device'],
                            n_jobs_dataloader=cfg["n_jobs_dataloader"])
        self.model.set_trainer(**trainer_opts)
        self.model.setup(self.dataset, cfg['net_name'])
예제 #2
0
    def _setup(self, params):
        """Restore a DeepSVDD model from disk and run an initial test pass.

        `params` carries the run configuration under 'cfg' plus data/model
        paths, an 'incremental' flag, and the list of 'dates' to iterate over.
        """
        # Placeholders for evaluation outputs, populated by later steps.
        self.test_labels = None
        self.val_labels = None
        self.val_scores = None
        self.test_scores = None

        self.params = params
        self.cfg = params['cfg']
        self.incremental = params['incremental']
        # Iterator of (train, test) date pairs consumed one pair per _train().
        self.dates = self._get_train_test(params['dates'])

        # Bootstrap the dataset on the first date only; _train() rebuilds it
        # for each subsequent date pair.
        cfg = self.cfg
        self.dataset = CICFlowADDataset(
            root=os.path.abspath(self.params['data_path']),
            n_known_outlier_classes=1,
            train_dates=[params['dates'][0]],
            val_dates=[params['dates'][0]],
            test_dates=[params['dates'][0]],
            shuffle=True)

        self.model = DeepSVDD(cfg['objective'], cfg['nu'])
        self.model.set_trainer(optimizer_name=cfg['optimizer_name'],
                               lr=cfg['lr'],
                               n_epochs=cfg['n_epochs'],
                               lr_milestones=cfg['lr_milestone'],
                               batch_size=cfg['batch_size'],
                               weight_decay=cfg['weight_decay'],
                               device=self.params['device'],
                               n_jobs_dataloader=cfg["n_jobs_dataloader"])
        self.model.setup(self.dataset, cfg['net_name'])
        self.model.load_model(params['model_path'])
        self.model.test(self.dataset)
    def _setup(self, cfg):
        """Build a test-only dataset and configure an Isolation Forest.

        Unlike the neural-model variants, no trainer is attached here; the
        forest is only constructed (training happens elsewhere).
        """
        # Placeholders for evaluation outputs, populated by later steps.
        self.test_labels = None
        self.val_labels = None
        self.val_scores = None
        self.test_scores = None

        self.cfg = cfg

        trial_idx = cfg['__trial_index__']
        # NOTE(review): train/val are unpacked but not used below — the lookup
        # is kept so a malformed config still fails here, as before.
        train, val = cfg['train_dates'][trial_idx]
        test = cfg['test_dates']

        self.dataset = CICFlowADDataset(root=os.path.abspath(cfg['data_path']),
                                        n_known_outlier_classes=1,
                                        test_dates=test,
                                        shuffle=True,
                                        split=True)

        def get_data_from_loader(loader):
            """Flatten every batch from `loader` into one 2-D numpy array."""
            batches = []
            for data in loader:
                inputs, _, _, _ = data
                inputs = inputs.to(cfg['device'])
                # Flatten each sample to a vector: (batch, *) -> (batch, -1).
                flat = inputs.view(inputs.size(0), -1)
                batches.append(flat.cpu().data.numpy())
            return np.concatenate(batches)

        self.isoforest = IsoForest(hybrid=False,
                                   n_estimators=int(cfg['n_estimators']),
                                   max_samples=cfg['max_samples'],
                                   contamination=cfg['contamination'],
                                   n_jobs=4,
                                   seed=cfg['seed'])
예제 #4
0
    def _train(self):
        """Run one incremental train/test step on the next (train, test) date pair.

        Returns a metrics dict with test AUC-ROC and AUC-PR, or
        ``{'done': True}`` once the date iterator is exhausted.
        """
        try:
            train, test = next(self.dates)
        except StopIteration:
            # All date pairs consumed: tell the driver training is finished.
            return {'done': True}

        # Rebuild the dataset for this step's dates (train also used for val).
        self.dataset = CICFlowADDataset(root=os.path.abspath(
            self.params['data_path']),
                                        n_known_outlier_classes=1,
                                        train_dates=[train],
                                        val_dates=[train],
                                        test_dates=[test],
                                        shuffle=True)

        if self.incremental:
            # One epoch of additional training on the new date's data.
            self.model.train(dataset=self.dataset,
                             optimizer_name=self.cfg['optimizer_name'],
                             lr=self.cfg['lr'],
                             n_epochs=1,
                             lr_milestones=self.cfg['lr_milestone'],
                             batch_size=self.cfg['batch_size'],
                             weight_decay=self.cfg['weight_decay'],
                             device=self.params['device'],
                             n_jobs_dataloader=self.cfg["n_jobs_dataloader"])

        self.model.test(self.dataset, set_split="test")
        self.model.test(self.dataset, set_split="train")

        # Captured into `results` below via locals(); not read directly here.
        test_labels, test_scores, _ = self.model.trainer.get_results("test")

        # Snapshot every local (dates, dataset, labels, scores, ...) onto
        # self.results for later inspection. `self` is removed to avoid a
        # self-reference. NOTE: local names are part of this contract — do not
        # rename them without checking consumers of self.results.
        results = locals().copy()
        del results["self"]

        self.results = results

        # AUC-ROC per phase (only "test" for now).
        rocs = {
            phase + '_auc_roc': roc_auc_score(labels, scores)
            for phase in ["test"]
            for labels, scores, _ in [self.model.trainer.get_results(phase)]
        }

        # AUC-PR per phase; note auc() takes x=recall, y=precision.
        prs = {
            phase + '_auc_pr': auc(recall, precision)
            for phase in ["test"]
            for labels, scores, _ in [self.model.trainer.get_results(phase)]
            for precision, recall, _ in
            [precision_recall_curve(labels, scores)]
        }

        return {**rocs, **prs}
    def _setup(self, cfg):
        """Load a DeepSVDD checkpoint and optionally pretrain its autoencoder.

        Builds a test-only dataset over `cfg['dates']`, restores the model
        from `cfg['model_path']`, then — if `cfg['pretrain']` is truthy —
        replaces the model with the result of autoencoder pretraining.
        """
        # Placeholders for evaluation outputs, populated by later steps.
        self.test_labels = None
        self.val_labels = None
        self.val_scores = None
        self.test_scores = None

        # Wrap the configured dates into the array shape the dataset expects.
        dates = np.array([cfg['dates']])

        self.dataset = CICFlowADDataset(
            root=os.path.abspath(cfg['data_path']),
            n_known_outlier_classes=1,
            test_dates=dates,
            shuffle=True,
            split=True)

        self.model = DeepSVDD(cfg['objective'], cfg['nu'])
        trainer_opts = dict(optimizer_name=cfg['optimizer_name'],
                            lr=cfg['lr'],
                            n_epochs=cfg['n_epochs'],
                            lr_milestones=cfg['lr_milestone'],
                            batch_size=cfg['batch_size'],
                            weight_decay=cfg['weight_decay'],
                            device=cfg['device'],
                            n_jobs_dataloader=cfg["n_jobs_dataloader"])
        self.model.set_trainer(**trainer_opts)
        self.model.setup(self.dataset, cfg['net_name'])
        self.model.load_model(cfg['model_path'])

        if cfg['pretrain']:
            # NOTE(review): pretraining happens after the checkpoint load and
            # rebinds self.model to whatever pretrain() returns.
            self.model = self.model.pretrain(
                self.dataset,
                optimizer_name=cfg['optimizer_name'],
                lr=cfg['lr'],
                n_epochs=cfg['ae_n_epochs'],
                lr_milestones=cfg['ae_lr_milestone'],
                batch_size=cfg['ae_batch_size'],
                weight_decay=cfg['ae_weight_decay'],
                device=cfg['device'],
                n_jobs_dataloader=cfg["n_jobs_dataloader"])
예제 #6
0
def train_evaluate(parameterization,
                   reporter,
                   validation,
                   data_path,
                   n_known_outlier_classes,
                   ratio_known_normal,
                   ratio_known_outlier,
                   cfg,
                   n_jobs_dataloader,
                   net_name,
                   pretrain,
                   n_splits=5):
    """Cross-validate a Supervised model over date-based splits.

    For every (train, test) split of the hard-coded `period`, builds a
    dataset, optionally pretrains, trains, and tests a Supervised model,
    then reports the mean test AUC-ROC through `reporter`.
    """
    sys.path.append('../')

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    period = np.array(['2019-11-08', '2019-11-09', '2019-11-11', '2019-11-12'])
    # period = np.array(['2019-11-08','2019-11-09'])

    # Choose how `period` is split into train/test folds.
    if validation == 'kfold':
        splitter = KFold(n_splits=n_splits)
    elif validation == 'time_series':
        splitter = TimeSeriesSplit(n_splits=n_splits)
    else:
        # Stub with a split() yielding one ordered 80/20 index split
        # (like train_test_split without shuffling).
        splitter = type(
            'obj', (object, ), {
                'split':
                lambda p: [(list(range(int(len(p) * 0.8))),
                            list(range(int(len(p) * 0.8), len(p))))]
            })

    fold_aucs = []
    # NOTE(review): this overrides the `pretrain` function argument with the
    # value from `parameterization`, making the parameter effectively dead.
    pretrain = parameterization['pretrain']
    for train_idx, test_idx in splitter.split(period):

        dataset = CICFlowADDataset(
            root=os.path.abspath(data_path),
            n_known_outlier_classes=n_known_outlier_classes,
            ratio_known_normal=ratio_known_normal,
            ratio_known_outlier=ratio_known_outlier,
            train_dates=period[train_idx],
            test_dates=period[test_idx],
            shuffle=True)

        # Initialize Supervised model and set neural network phi.
        model = Supervised().set_network(net_name)

        if pretrain:
            model = model.pretrain(
                dataset,
                optimizer_name=cfg.settings['ae_optimizer_name'],
                lr=parameterization['lr'],
                n_epochs=cfg.settings['ae_n_epochs'],
                lr_milestones=cfg.settings['ae_lr_milestone'],
                batch_size=cfg.settings['ae_batch_size'],
                weight_decay=cfg.settings['ae_weight_decay'],
                device=device,
                n_jobs_dataloader=n_jobs_dataloader)

            # Save pretraining results
            # Supervised.save_ae_results(export_json=xp_path + '/ae_results.json')

        # Train model on dataset.
        # NOTE(review): this reads 'ae_optimizer_name' (not 'optimizer_name')
        # for the non-AE training step — confirm this is intentional.
        model = model.train(dataset,
                            optimizer_name=cfg.settings['ae_optimizer_name'],
                            lr=parameterization['lr'],
                            n_epochs=cfg.settings['n_epochs'],
                            lr_milestones=cfg.settings['lr_milestone'],
                            batch_size=cfg.settings['batch_size'],
                            weight_decay=cfg.settings['weight_decay'],
                            device=device,
                            n_jobs_dataloader=n_jobs_dataloader,
                            reporter=reporter)

        model.test(dataset, device=device, n_jobs_dataloader=n_jobs_dataloader)
        fold_aucs.append(model.results['auc_roc'])

    reporter(mean_auc=evaluate_aucs(test_aucs=fold_aucs))
예제 #7
0
def train_evaluate(parameterization,
                   validation,
                   data_path,
                   n_known_outlier_classes,
                   ratio_known_normal,
                   ratio_known_outlier,
                   ratio_pollution,
                   cfg,
                   n_jobs_dataloader,
                   n_splits=3,
                   reporter=None):
    """Cross-validate an Isolation Forest over date-based splits.

    For every (train, test) split of the hard-coded `period`, builds a
    dataset, trains and tests an IsoForest, and finally reports the mean
    test AUC-ROC via `reporter`.

    Fix: `reporter` was previously referenced at the end without being a
    parameter or local (NameError at runtime); it is now a trailing keyword
    parameter, matching the companion `train_evaluate` for the Supervised
    model, and the final report is skipped when it is not supplied.
    """
    device = 'cpu'

    period = np.array(
        ['2019-11-08', '2019-11-09', '2019-11-11', '2019-11-12', '2019-11-13'])

    # Choose how `period` is split into train/test folds.
    if validation == 'kfold':
        split = KFold(n_splits=n_splits)
    elif validation == 'time_series':
        split = TimeSeriesSplit(n_splits=n_splits)
    else:
        # Dummy object with a split() method returning one ordered 80/20
        # index split (like train_test_split without shuffle).
        split = type(
            'obj', (object, ), {
                'split':
                lambda p: [([x for x in range(int(len(p) * 0.8))],
                            [x for x in range(int(len(p) * 0.8), len(p))])]
            })

    test_aucs = []

    for train, test in split.split(period):

        dataset = CICFlowADDataset(
            root=os.path.abspath(data_path),
            n_known_outlier_classes=n_known_outlier_classes,
            ratio_known_normal=ratio_known_normal,
            ratio_known_outlier=ratio_known_outlier,
            train_dates=period[train],
            test_dates=period[test],
            ratio_pollution=ratio_pollution)

        # Log random sample of known anomaly classes if more than 1 class.
        # NOTE(review): `logger` is assumed to be a module-level logger.
        if n_known_outlier_classes > 1:
            logger.info('Known anomaly classes: %s' %
                        (dataset.known_outlier_classes, ))

        # Initialize Isolation Forest model (snake_case per PEP 8).
        isoforest = IsoForest(hybrid=False,
                              n_estimators=int(
                                  parameterization['n_estimators']),
                              max_samples=parameterization['max_samples'],
                              contamination=parameterization['contamination'],
                              n_jobs=4,
                              seed=cfg.settings['seed'])

        # Train model on dataset.
        isoforest.train(dataset,
                        device=device,
                        n_jobs_dataloader=n_jobs_dataloader)

        # Test model.
        isoforest.test(dataset,
                       device=device,
                       n_jobs_dataloader=n_jobs_dataloader)

        test_aucs.append(isoforest.results['auc_roc'])

    if reporter is not None:
        reporter(mean_auc=evaluate_aucs(test_aucs=test_aucs))