def _setup(self, cfg):
    """Build the train/val/test dataset for this trial and a Supervised model
    configured from *cfg*, ready for training."""
    # Result placeholders; filled in by later evaluation steps.
    self.test_labels = None
    self.val_labels = None
    self.val_scores = None
    self.test_scores = None

    # Each Tune trial gets its own (train, val) index pair into cfg['period'].
    fold = cfg['__trial_index__']
    train_sel, val_sel = cfg['train_dates'][fold]
    self.dataset = CICFlowADDataset(
        root=os.path.abspath(cfg['data_path']),
        n_known_outlier_classes=1,
        train_dates=cfg['period'][train_sel],
        val_dates=cfg['period'][val_sel],
        test_dates=cfg['test_dates'],
        shuffle=True,
    )

    self.model = Supervised()
    trainer_kwargs = dict(
        optimizer_name=cfg['optimizer_name'],
        lr=cfg['lr'],
        n_epochs=cfg['n_epochs'],
        lr_milestones=cfg['lr_milestone'],
        batch_size=cfg['batch_size'],
        weight_decay=cfg['weight_decay'],
        device=cfg['device'],
        n_jobs_dataloader=cfg["n_jobs_dataloader"],
    )
    self.model.set_trainer(**trainer_kwargs)
    self.model.setup(self.dataset, cfg['net_name'])
def _setup(self, params):
    """Initialise trial state, restore a saved DeepSVDD model and run an
    initial test pass on the bootstrap dataset."""
    # Result placeholders; filled in by later evaluation steps.
    self.test_labels = None
    self.val_labels = None
    self.val_scores = None
    self.test_scores = None

    self.params = params
    self.cfg = params['cfg']
    self.incremental = params['incremental']
    # Iterable of successive (train, test) date pairs consumed by _train().
    self.dates = self._get_train_test(params['dates'])

    # Bootstrap dataset: the first date serves as train/val/test alike until
    # _train() advances through self.dates.
    self.dataset = CICFlowADDataset(
        root=os.path.abspath(self.params['data_path']),
        n_known_outlier_classes=1,
        train_dates=[params['dates'][0]],
        val_dates=[params['dates'][0]],
        test_dates=[params['dates'][0]],
        shuffle=True,
    )

    cfg = self.cfg
    self.model = DeepSVDD(cfg['objective'], cfg['nu'])
    trainer_kwargs = dict(
        optimizer_name=cfg['optimizer_name'],
        lr=cfg['lr'],
        n_epochs=cfg['n_epochs'],
        lr_milestones=cfg['lr_milestone'],
        batch_size=cfg['batch_size'],
        weight_decay=cfg['weight_decay'],
        device=self.params['device'],
        n_jobs_dataloader=cfg["n_jobs_dataloader"],
    )
    self.model.set_trainer(**trainer_kwargs)
    self.model.setup(self.dataset, cfg['net_name'])
    # Restore pretrained weights, then score the bootstrap dataset once.
    self.model.load_model(params['model_path'])
    self.model.test(self.dataset)
def _setup(self, cfg):
    """Prepare the dataset and an (untrained) Isolation Forest model.

    The dataset is built from the configured test dates only; split=True
    presumably lets CICFlowADDataset derive its own internal train/test
    split — TODO confirm against the dataset implementation.
    """
    # Result placeholders; filled in by later evaluation steps.
    self.test_labels = None
    self.val_labels = None
    self.val_scores = None
    self.test_scores = None
    self.cfg = cfg

    self.dataset = CICFlowADDataset(root=os.path.abspath(cfg['data_path']),
                                    n_known_outlier_classes=1,
                                    test_dates=cfg['test_dates'],
                                    shuffle=True,
                                    split=True)

    # NOTE(review): the original also defined a nested get_data_from_loader()
    # helper and read cfg['train_dates'][cfg['__trial_index__']] into locals,
    # but neither was ever used in this method — removed as dead code.
    # n_estimators may arrive as a float from the search space, hence int().
    self.isoforest = IsoForest(hybrid=False,
                               n_estimators=int(cfg['n_estimators']),
                               max_samples=cfg['max_samples'],
                               contamination=cfg['contamination'],
                               n_jobs=4,
                               seed=cfg['seed'])
def _train(self):
    """One Tune training step: advance to the next (train, test) date pair,
    rebuild the dataset, optionally fine-tune incrementally, and return AUCs.

    Returns {'done': True} once the date iterator is exhausted, otherwise a
    dict with 'test_auc_roc' and 'test_auc_pr'.
    """
    try:
        train, test = next(self.dates)
    except StopIteration:
        # No more date pairs: signal Tune that this trial is finished.
        return {'done': True}
    # Fresh dataset for this step; validation reuses the training date.
    self.dataset = CICFlowADDataset(root=os.path.abspath(
        self.params['data_path']),
                                    n_known_outlier_classes=1,
                                    train_dates=[train],
                                    val_dates=[train],
                                    test_dates=[test],
                                    shuffle=True)
    if self.incremental:
        # Incremental mode: one extra epoch of fine-tuning on the new date;
        # otherwise the model loaded in _setup is evaluated as-is.
        self.model.train(dataset=self.dataset,
                         optimizer_name=self.cfg['optimizer_name'],
                         lr=self.cfg['lr'],
                         n_epochs=1,
                         lr_milestones=self.cfg['lr_milestone'],
                         batch_size=self.cfg['batch_size'],
                         weight_decay=self.cfg['weight_decay'],
                         device=self.params['device'],
                         n_jobs_dataloader=self.cfg["n_jobs_dataloader"])
    # Score both splits so trainer.get_results() is populated for each phase.
    self.model.test(self.dataset, set_split="test")
    self.model.test(self.dataset, set_split="train")
    test_labels, test_scores, _ = self.model.trainer.get_results("test")
    # Snapshot every local (dates, dataset, labels, scores, ...) onto the
    # instance for later inspection; 'self' is removed before storing.
    # NOTE: this makes the stored contents depend on the exact local names
    # above — be careful when refactoring.
    results = locals().copy()
    del results["self"]
    self.results = results
    # ROC-AUC per phase; the single-element comprehensions keep labels/scores
    # scoped per phase without temporary variables.
    rocs = {
        phase + '_auc_roc': roc_auc_score(labels, scores)
        for phase in ["test"]
        for labels, scores, _ in [self.model.trainer.get_results(phase)]
    }
    # PR-AUC per phase; note auc() takes (recall, precision) in that order.
    prs = {
        phase + '_auc_pr': auc(recall, precision)
        for phase in ["test"]
        for labels, scores, _ in [self.model.trainer.get_results(phase)]
        for precision, recall, _ in [precision_recall_curve(labels, scores)]
    }
    return {**rocs, **prs}
def _setup(self, cfg):
    """Restore a DeepSVDD model for the configured dates and optionally
    pretrain its autoencoder."""
    # Result placeholders; filled in by later evaluation steps.
    self.test_labels = None
    self.val_labels = None
    self.val_scores = None
    self.test_scores = None

    test_dates = np.array([cfg['dates']])
    self.dataset = CICFlowADDataset(
        root=os.path.abspath(cfg['data_path']),
        n_known_outlier_classes=1,
        test_dates=test_dates,
        shuffle=True,
        split=True,
    )

    self.model = DeepSVDD(cfg['objective'], cfg['nu'])
    trainer_kwargs = dict(
        optimizer_name=cfg['optimizer_name'],
        lr=cfg['lr'],
        n_epochs=cfg['n_epochs'],
        lr_milestones=cfg['lr_milestone'],
        batch_size=cfg['batch_size'],
        weight_decay=cfg['weight_decay'],
        device=cfg['device'],
        n_jobs_dataloader=cfg["n_jobs_dataloader"],
    )
    self.model.set_trainer(**trainer_kwargs)
    self.model.setup(self.dataset, cfg['net_name'])
    self.model.load_model(cfg['model_path'])

    if cfg['pretrain']:
        # NOTE(review): assumes pretrain() returns the model object — confirm
        # it does not return None, or self.model would be clobbered here.
        self.model = self.model.pretrain(
            self.dataset,
            optimizer_name=cfg['optimizer_name'],
            lr=cfg['lr'],
            n_epochs=cfg['ae_n_epochs'],
            lr_milestones=cfg['ae_lr_milestone'],
            batch_size=cfg['ae_batch_size'],
            weight_decay=cfg['ae_weight_decay'],
            device=cfg['device'],
            n_jobs_dataloader=cfg["n_jobs_dataloader"],
        )
def train_evaluate(parameterization,
                   reporter,
                   validation,
                   data_path,
                   n_known_outlier_classes,
                   ratio_known_normal,
                   ratio_known_outlier,
                   cfg,
                   n_jobs_dataloader,
                   net_name,
                   pretrain,
                   n_splits=5):
    """Cross-validated train/evaluate of a Supervised model over a fixed set
    of capture dates, reporting the mean test ROC-AUC via *reporter*.
    """
    sys.path.append('../')
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    # Capture dates used as CV units (each fold trains/tests on whole dates).
    period = np.array(['2019-11-08', '2019-11-09', '2019-11-11', '2019-11-12'])
    # period = np.array(['2019-11-08','2019-11-09'])
    if (validation == 'kfold'):
        split = KFold(n_splits=n_splits)
    elif (validation == 'time_series'):
        split = TimeSeriesSplit(n_splits=n_splits)
    else:
        # Dummy object with split method that return indexes of train/test split 0.8/0.2. Similar to train_test_split without shuffle
        split = type(
            'obj', (object, ), {
                'split':
                lambda p: [([x for x in range(int(len(p) * 0.8))],
                            [x for x in range(int(len(p) * 0.8), len(p))])]
            })
    test_aucs = []
    # NOTE(review): this overwrites the 'pretrain' parameter with the value
    # from the search space — confirm the function argument is meant to be
    # ignored.
    pretrain = parameterization['pretrain']
    for train, test in (split.split(period)):
        dataset = CICFlowADDataset(
            root=os.path.abspath(data_path),
            n_known_outlier_classes=n_known_outlier_classes,
            ratio_known_normal=ratio_known_normal,
            ratio_known_outlier=ratio_known_outlier,
            train_dates=period[train],
            test_dates=period[test],
            shuffle=True)
        # Initialize Supervised model and set neural network phi
        # NOTE(review): assumes set_network() returns the model (fluent API)
        # — confirm it does not return None.
        model = Supervised().set_network(net_name)
        if pretrain:
            model = model.pretrain(
                dataset,
                optimizer_name=cfg.settings['ae_optimizer_name'],
                lr=parameterization['lr'],
                n_epochs=cfg.settings['ae_n_epochs'],
                lr_milestones=cfg.settings['ae_lr_milestone'],
                batch_size=cfg.settings['ae_batch_size'],
                weight_decay=cfg.settings['ae_weight_decay'],
                device=device,
                n_jobs_dataloader=n_jobs_dataloader)
            # Save pretraining results
            # Supervised.save_ae_results(export_json=xp_path + '/ae_results.json')
        # Train model on dataset
        # NOTE(review): main training reuses 'ae_optimizer_name' — looks like
        # a copy-paste from the pretrain call; confirm whether
        # cfg.settings['optimizer_name'] was intended.
        model = model.train(dataset,
                            optimizer_name=cfg.settings['ae_optimizer_name'],
                            lr=parameterization['lr'],
                            n_epochs=cfg.settings['n_epochs'],
                            lr_milestones=cfg.settings['lr_milestone'],
                            batch_size=cfg.settings['batch_size'],
                            weight_decay=cfg.settings['weight_decay'],
                            device=device,
                            n_jobs_dataloader=n_jobs_dataloader,
                            reporter=reporter)
        model.test(dataset, device=device, n_jobs_dataloader=n_jobs_dataloader)
        test_auc = model.results['auc_roc']
        test_aucs.append(test_auc)
    # Report the aggregate metric across all CV folds.
    reporter(mean_auc=evaluate_aucs(test_aucs=test_aucs))
def train_evaluate(parameterization,
                   validation,
                   data_path,
                   n_known_outlier_classes,
                   ratio_known_normal,
                   ratio_known_outlier,
                   ratio_pollution,
                   cfg,
                   n_jobs_dataloader,
                   n_splits=3):
    """Cross-validated train/evaluate of an Isolation Forest over a fixed set
    of capture dates, reporting the mean test ROC-AUC.

    NOTE(review): 'reporter', 'logger' and 'evaluate_aucs' are not parameters
    of this function — they must exist at module level or the final call
    raises NameError. Compare the sibling train_evaluate, which takes
    'reporter' as an argument; this one likely should too.
    """
    device = 'cpu'
    # Capture dates used as CV units (each fold trains/tests on whole dates).
    period = np.array(
        ['2019-11-08', '2019-11-09', '2019-11-11', '2019-11-12', '2019-11-13'])
    if (validation == 'kfold'):
        split = KFold(n_splits=n_splits)
    elif (validation == 'time_series'):
        split = TimeSeriesSplit(n_splits=n_splits)
    else:
        # Dummy object with split method that return indexes of train/test split 0.8/0.2. Similar to train_test_split without shuffle
        split = type(
            'obj', (object, ), {
                'split':
                lambda p: [([x for x in range(int(len(p) * 0.8))],
                            [x for x in range(int(len(p) * 0.8), len(p))])]
            })
    test_aucs = []
    for train, test in split.split(period):
        dataset = CICFlowADDataset(
            root=os.path.abspath(data_path),
            n_known_outlier_classes=n_known_outlier_classes,
            ratio_known_normal=ratio_known_normal,
            ratio_known_outlier=ratio_known_outlier,
            train_dates=period[train],
            test_dates=period[test],
            ratio_pollution=ratio_pollution)
        # Initialize DeepSAD model and set neural network phi
        # Log random sample of known anomaly classes if more than 1 class
        if n_known_outlier_classes > 1:
            logger.info('Known anomaly classes: %s' %
                        (dataset.known_outlier_classes, ))
        # Initialize Isolation Forest model
        # n_estimators may arrive as a float from the search space, hence int().
        Isoforest = IsoForest(hybrid=False,
                              n_estimators=int(
                                  parameterization['n_estimators']),
                              max_samples=parameterization['max_samples'],
                              contamination=parameterization['contamination'],
                              n_jobs=4,
                              seed=cfg.settings['seed'])
        # Train model on dataset
        Isoforest.train(dataset,
                        device=device,
                        n_jobs_dataloader=n_jobs_dataloader)
        # Test model
        Isoforest.test(dataset,
                       device=device,
                       n_jobs_dataloader=n_jobs_dataloader)
        test_auc = Isoforest.results['auc_roc']
        test_aucs.append(test_auc)
    # Report the aggregate metric across all CV folds.
    reporter(mean_auc=evaluate_aucs(test_aucs=test_aucs))