def test_githubRootNoAuth():
    """Fetching the root of a public repo without auth yields at least one non-empty file."""
    testArl = "[github,refractionPOINT/python-limacharlie]"
    seen = 0
    with ARL(testArl) as resources:
        for name, data in resources:
            assert name
            assert len(data) != 0
            seen += 1
    assert seen != 0
def test_straightFileCompat():
    """A plain https URL (non-ARL syntax) is accepted and yields exactly one non-empty file."""
    testArl = "https://raw.githubusercontent.com/refractionPOINT/sigma_rules/master/README.md"
    count = 0
    with ARL(testArl) as feed:
        for path, body in feed:
            assert path
            assert len(body) != 0
            count += 1
    assert count == 1
def test_httpsWithTar():
    """An https ARL pointing at a tarball is unpacked into one or more non-empty files."""
    testArl = "[https,api.github.com/repos/refractionPOINT/sigma_rules/tarball/0.2.0]"
    seen = 0
    with ARL(testArl) as resources:
        for name, data in resources:
            assert name
            assert len(data) != 0
            seen += 1
    assert seen != 0
def test_githubBranch():
    """A github ARL with an explicit ?ref= branch yields at least one non-empty file."""
    testArl = "[github,refractionPOINT/sigma/lc-rules/windows_sysmon/?ref=lc-rules]"
    count = 0
    with ARL(testArl) as feed:
        for path, body in feed:
            assert path
            assert len(body) != 0
            count += 1
    assert count != 0
def test_githubSigleFileNoAuth():
    """A github ARL naming a single file yields exactly that one non-empty file."""
    testArl = "[github,refractionPOINT/sigma/README.md]"
    seen = 0
    with ARL(testArl) as resources:
        for name, data in resources:
            assert name
            assert len(data) != 0
            seen += 1
    assert seen == 1
def test_githubSubdirNoAuth():
    """A github ARL naming a sub-directory yields one or more non-empty files."""
    testArl = "[github,refractionPOINT/sigma/rules/windows/builtin]"
    count = 0
    with ARL(testArl) as feed:
        for path, body in feed:
            assert path
            assert len(body) != 0
            count += 1
    assert count != 0
def get_model(config: Dict[str, Any], args: argparse.Namespace,
              dataset: FairnessDataset) -> pl.LightningModule:
    """Inits a model instance for training.

    NOTE(review): unlike the fuller get_model variant elsewhere in this
    project, this version does not branch on args.model — it always builds an
    ARL model. Confirm this truncation is intentional.

    Args:
        config: Dict with hyperparameters (learning rate, batch size, eta).
        args: Object from the argument parser that defines various settings of
            the model, dataset and training.
        dataset: Dataset instance that will be used for training.

    Returns:
        An instantiated model based on Adversarially Reweighted Learning (ARL).
    """
    model: pl.LightningModule
    model = ARL(
        config=config,  # for hparam tuning
        input_shape=dataset.dimensionality,
        pretrain_steps=args.pretrain_steps,
        prim_hidden=args.prim_hidden,
        adv_hidden=args.adv_hidden,
        optimizer=OPT_BY_NAME[args.opt],
        dataset_type=args.dataset_type,
        adv_input=set(args.adv_input),
        num_groups=len(dataset.protected_index2value),
        # TF's Adagrad starts its accumulator at 0.1 (Torch uses 0); mirror
        # that when replicating TensorFlow behavior.
        opt_kwargs={"initial_accumulator_value": 0.1} if args.tf_mode else {})

    # In TF-compatibility mode, also replicate TF's default initialization:
    # Xavier/Glorot uniform weights and zero bias for every Linear layer.
    if args.tf_mode:
        def init_weights(layer):
            if type(layer) == torch.nn.Linear:
                torch.nn.init.xavier_uniform_(layer.weight)
                torch.nn.init.zeros_(layer.bias)

        model.apply(init_weights)

    return model
def test_maxSizeGood():
    """A single file small enough to fit under maxSize is fetched successfully.

    The try/except is deliberate best-effort: if the size cap were to trigger,
    the final count assertion fails and reports the problem.
    """
    testArl = "https://raw.githubusercontent.com/refractionPOINT/sigma_rules/master/README.md"
    nElemFound = 0
    try:
        with ARL(testArl, maxSize=1024) as r:
            for fileName, fileContent in r:
                nElemFound += 1
    except Exception:
        # Narrowed from a bare `except:` — a bare except also swallows
        # KeyboardInterrupt and SystemExit, which should never be suppressed.
        pass
    assert (1 == nElemFound)
def downloadRules(self):
    """Download D&R rules and lookup resources from the configured GitHub repo.

    Side effects:
        - Publishes every file under "resources/" as a 'lookup' resource.
        - Replaces self.svcRules with the freshly parsed rule set.
        - Subscribes to every detection name the new rules report.

    Raises:
        Exception: If detections.yaml cannot be parsed as YAML.
    """
    newRules = {}
    newDetections = set()
    # We assume all D&R rules are in detections.yaml and all internal
    # lookup resources those rules use are each in a "resources/RESOURCE_NAME"
    # sub-directory in the repo.
    with ARL('[github,%s/%s,token,%s]' % (
            GITHUB_ORG,
            REPO_NAME,
            GITHUB_TOKEN,
    ), maxConcurrent=5) as r:
        for fileName, content in r:
            # The detections are in a single file "detections.yaml".
            # like: ruleName => {detect => ..., respond => ...}
            if 'detections.yaml' == fileName:
                try:
                    newRules = yaml.safe_load(content)
                except Exception:
                    # Narrowed from a bare `except:` so KeyboardInterrupt /
                    # SystemExit are not converted into a parse error.
                    raise Exception(
                        "failed to parse yaml from rules file %s: %s"
                        % (fileName, traceback.format_exc()))
            # Resources are in a "resources/" directory.
            if fileName.startswith('resources/'):
                # This is a resource, use the filename without extension as name.
                resourceName = fileName.split('/', 1)[1]
                # Only strip an extension when one is actually present: the
                # previous unconditional slice used rfind() which returns -1
                # for extensionless names, silently chopping their last char.
                dotIdx = resourceName.rfind('.')
                if dotIdx != -1:
                    resourceName = resourceName[:dotIdx]
                # We assume all resources are lookups.
                self.publishResource(resourceName, 'lookup', content)

    for ruleName, rule in newRules.items():
        # Make sure the rule goes in the "replicant" namespace. This way
        # we don't need to set the namespace in the yaml file.
        rule['namespace'] = 'replicant'
        for drResponse in rule['respond']:
            if 'report' == drResponse['action']:
                # We want to be notified for all rule reports.
                newDetections.add(drResponse['name'])

    # Update the rules in effect.
    self.svcRules = newRules

    # Make sure we're subscribed to all the notifications.
    for detection in newDetections:
        self.subscribeToDetect(detection)
def train(config: Dict[str, Any],
          args: argparse.Namespace,
          train_dataset: FairnessDataset,
          val_dataset: Optional[FairnessDataset] = None,
          test_dataset: Optional[FairnessDataset] = None,
          version=str(int(time())),
          fold_nbr=None) -> Tuple[pl.LightningModule, pl.Trainer]:
    """Single training run on a given dataset.

    Inits a model and optimizes its parameters on the given training dataset
    with a given set of hyperparameters. Logs various metrics and stops the
    training when the micro-average AUC on the validation set stops improving.

    Args:
        config: Dict with hyperparameters (learning rate, batch size, eta).
        args: Object from the argument parser that defines various settings of
            the model, dataset and training.
        train_dataset: Dataset instance to use for training.
        val_dataset: Optional; dataset instance to use for validation.
        test_dataset: Optional; dataset instance to use for testing.
        version: Version used for the logging directory. NOTE(review): the
            default str(int(time())) is evaluated once at import time, not per
            call — confirm this is intended.
        fold_nbr: Optional; used for the logging directory if training run is
            part of kfold cross validation.

    Returns:
        Model with the highest micro-average AUC on the validation set during
        the training run.

    Raises:
        AssertionError: If no model checkpoint callback exists.
    """
    # create logdir if necessary
    logdir: str = args.log_dir
    os.makedirs(logdir, exist_ok=True)

    # create fold loaders and callbacks
    train_loader = DataLoader(train_dataset,
                              batch_size=config['batch_size'],
                              shuffle=True,
                              num_workers=args.num_workers,
                              pin_memory=True)

    callbacks: List[pl.callbacks.Callback] = []
    # Scatter plots are only meaningful for the ARL family of models.
    callbacks.append(
        Logger(train_dataset,
               'train',
               batch_size=args.eval_batch_size,
               save_scatter=(args.model in ['ARL', 'ARL_strong', 'ARL_weak'])))
    if val_dataset is not None:
        callbacks.append(
            Logger(val_dataset, 'validation', batch_size=args.eval_batch_size))
        # Early stopping monitors a validation metric, so it is only added
        # when a validation set exists.
        if not args.no_early_stopping:
            callbacks.append(
                EarlyStopping(monitor='validation/micro_avg_auc',
                              min_delta=0.00,
                              patience=10,
                              verbose=True,
                              mode='max'))
    if test_dataset is not None:
        callbacks.append(
            Logger(test_dataset,
                   'test',
                   batch_size=args.eval_batch_size,
                   save_scatter=(args.model in ['ARL', 'ARL_strong', 'ARL_weak'])))

    # Select model and instantiate
    model: pl.LightningModule = get_model(config, args, train_dataset)

    # create logger
    if args.grid_search:
        logger_version = ''
    else:
        logger_version = f'seed_{args.seed}'
    if fold_nbr is not None:
        logger_version += f'./fold_{fold_nbr}'
    logger = TensorBoardLogger(save_dir='./', name=logdir, version=logger_version)

    if not args.no_early_stopping:
        # create checkpoint; the best checkpoint (by validation micro-avg AUC)
        # is reloaded after training below.
        checkpoint = ModelCheckpoint(save_weights_only=True,
                                     dirpath=logger.log_dir,
                                     mode='max',
                                     verbose=False,
                                     monitor='validation/micro_avg_auc')
        callbacks.append(checkpoint)

    # Create a PyTorch Lightning trainer
    trainer = pl.Trainer(
        logger=logger,
        gpus=1 if torch.cuda.is_available() else 0,
        # Pretraining steps count toward the step budget.
        max_steps=args.train_steps + args.pretrain_steps,
        callbacks=callbacks,
        # DRO is the only model that needs gradient clipping here.
        gradient_clip_val=1 if args.model == 'DRO' else 0,
        progress_bar_refresh_rate=1 if args.p_bar else 0,
    )

    # Training
    fit_time = time()
    if val_dataset is not None:
        trainer.fit(model,
                    train_loader,
                    val_dataloaders=DataLoader(val_dataset,
                                               batch_size=args.eval_batch_size,
                                               num_workers=args.num_workers))
    else:
        trainer.fit(model, train_loader)
    print(f'time to fit was {time()-fit_time}')

    if not args.no_early_stopping:
        # necessary to make the type checker happy and since this is only run once,
        # runtime is not an issue
        assert trainer.checkpoint_callback is not None

        # Load best checkpoint after training.
        # NOTE(review): 'ARL_strong' and 'ARL_weak' have no reload branch, so
        # for those models the final (not best) weights are returned — confirm
        # whether that is intentional.
        if args.model == 'baseline':
            model = BaselineModel.load_from_checkpoint(
                trainer.checkpoint_callback.best_model_path)
        elif args.model == 'ARL':
            model = ARL.load_from_checkpoint(
                trainer.checkpoint_callback.best_model_path)
        elif args.model == 'DRO':
            model = DRO.load_from_checkpoint(
                trainer.checkpoint_callback.best_model_path)
        elif args.model == 'IPW':
            model = IPW.load_from_checkpoint(
                trainer.checkpoint_callback.best_model_path)

    return model, trainer
def get_model(config: Dict[str, Any], args: argparse.Namespace,
              dataset: FairnessDataset) -> pl.LightningModule:
    """Selects and inits a model instance for training.

    Args:
        config: Dict with hyperparameters (learning rate, batch size, eta).
        args: Object from the argument parser that defines various settings of
            the model, dataset and training. NOTE: args.pretrain_steps is reset
            to 0 for the IPW and baseline models (they do no pretraining).
        dataset: Dataset instance that will be used for training.

    Returns:
        An instantiated model; one of the following:
        Model based on Adversarially Reweighted Learning (ARL), optionally
            with a 'strong' or 'weak' adversary CNN.
        Model based on Distributionally Robust Optimization (DRO).
        Model based on Inverse Probability Weighting (IPW).
        Baseline model; simple fully-connected or convolutional (TODO) network.

    Raises:
        ValueError: If args.model is not a recognized model name.
    """
    # TF's Adagrad starts its accumulator at 0.1 (Torch uses 0); mirror that
    # when replicating TensorFlow behavior.
    opt_kwargs = {"initial_accumulator_value": 0.1} if args.tf_mode else {}

    model: pl.LightningModule
    if args.model in ('ARL', 'ARL_strong', 'ARL_weak'):
        # The three ARL variants only differ in the adversary CNN strength,
        # so they share a single construction site.
        arl_kwargs = {}
        if args.model == 'ARL_strong':
            arl_kwargs['adv_cnn_strength'] = 'strong'
        elif args.model == 'ARL_weak':
            arl_kwargs['adv_cnn_strength'] = 'weak'
        model = ARL(
            config=config,  # for hparam tuning
            input_shape=dataset.dimensionality,
            pretrain_steps=args.pretrain_steps,
            prim_hidden=args.prim_hidden,
            adv_hidden=args.adv_hidden,
            optimizer=OPT_BY_NAME[args.opt],
            dataset_type=args.dataset_type,
            adv_input=set(args.adv_input),
            num_groups=len(dataset.protected_index2value),
            opt_kwargs=opt_kwargs,
            **arl_kwargs)
    elif args.model == 'DRO':
        model = DRO(
            config=config,  # for hparam tuning
            num_features=dataset.dimensionality,
            hidden_units=args.prim_hidden,
            pretrain_steps=args.pretrain_steps,
            k=args.k,
            optimizer=OPT_BY_NAME[args.opt],
            opt_kwargs=opt_kwargs)
    elif args.model == 'IPW':
        model = IPW(
            config=config,  # for hparam tuning
            num_features=dataset.dimensionality,
            hidden_units=args.prim_hidden,
            optimizer=OPT_BY_NAME[args.opt],
            group_probs=dataset.group_probs,
            sensitive_label=args.sensitive_label,
            opt_kwargs=opt_kwargs)
        args.pretrain_steps = 0  # NO PRETRAINING
    elif args.model == 'baseline':
        model = BaselineModel(
            config=config,  # for hparam tuning
            num_features=dataset.dimensionality,
            hidden_units=args.prim_hidden,
            optimizer=OPT_BY_NAME[args.opt],
            dataset_type=args.dataset_type,
            opt_kwargs=opt_kwargs)
        args.pretrain_steps = 0  # NO PRETRAINING
    else:
        # Previously an unrecognized name fell through to an UnboundLocalError
        # on `return model`; fail fast with a clear message instead.
        raise ValueError(f'Unknown model: {args.model!r}')

    # if Tensorflow mode is active, we use the TF default initialization,
    # which means Xavier/Glorot uniform (with gain 1) for the weights
    # and 0 bias
    if args.tf_mode:
        def init_weights(layer):
            if type(layer) == torch.nn.Linear:
                torch.nn.init.xavier_uniform_(layer.weight)
                torch.nn.init.zeros_(layer.bias)

        model.apply(init_weights)

    return model
def train_for_n_iters(train_dataset, test_dataset, model_params, lr_params,
                      n_iters=5, train_steps=1000, test_every=10,
                      pretrain_steps=250, print_loss=True, log_dir="logs/",
                      model_name="ARL"):
    """Trains the model for n iterations and averages the results.

    Args:
        train_dataset: Data iterator of the train set.
        test_dataset: Data iterator of the test set.
        model_params: A dictionary with model hyperparameters.
        lr_params: A dictionary with hyperparameters for optimizers.
        n_iters: How often to train the model with different seeds.
        train_steps: Number of training steps.
        test_every: How often to evaluate on the test set.
        pretrain_steps: Number of pretrain steps (steps with no adversary).
        print_loss: Whether to print the loss during training.
        log_dir: Directory where to save the tensorboard loggers.
        model_name: Which model to train; either "ARL" or "baseline".

    Returns:
        The FairnessMetrics object with results averaged over the n runs.

    Raises:
        ValueError: If model_name is neither "ARL" nor "baseline".
    """
    # Set the device on which to train.
    device = "cuda:0" if torch.cuda.is_available() else "cpu"
    model_params["device"] = device

    # Initiate metrics object.
    metrics = FairnessMetrics(n_iters, test_every)

    # Preparation of logging directories.
    experiment_dir = os.path.join(
        log_dir, datetime.datetime.now().strftime("%Y_%m_%d_%H_%M_%S"))
    checkpoint_dir = os.path.join(experiment_dir, "checkpoints")
    os.makedirs(experiment_dir, exist_ok=True)
    os.makedirs(checkpoint_dir, exist_ok=True)

    # Initialize TensorBoard loggers.
    summary_writer = SummaryWriter(experiment_dir)
    logger_learner = TensorBoardLogger(summary_writer, name="learner")
    logger_adv = TensorBoardLogger(summary_writer, name="adversary")
    logger_metrics = TensorBoardLogger(summary_writer, name="metrics")

    for i in range(n_iters):
        print(f"Training model {i + 1}/{n_iters}")
        # Different, but reproducible, seed per run.
        seed_everything(42 + i)

        # Load the train dataset as a pytorch dataloader.
        train_loader = DataLoader(train_dataset,
                                  batch_size=model_params["batch_size"],
                                  shuffle=True)

        # Create the model.
        if model_name == "ARL":
            model = ARL(**model_params)
        elif model_name == "baseline":
            model = baseline(**model_params)
        else:
            # Previously this only printed "Unknown model" and then crashed
            # with a NameError on `model.to(device)`; fail fast instead.
            raise ValueError(f"Unknown model: {model_name!r}")

        # Transfer model to correct device.
        model = model.to(device)

        # Adagrad is the default optimizer.
        optimizer_learner = torch.optim.Adagrad(model.learner.parameters(),
                                                lr=lr_params["learner"])
        if model_name == 'ARL':
            optimizer_adv = torch.optim.Adagrad(model.adversary.parameters(),
                                                lr=lr_params["adversary"])
        else:
            # The baseline has no adversary, hence no adversary optimizer.
            optimizer_adv = None

        # Train the model with current seeds.
        if print_loss:
            print("Start training on device {}".format(device))
        train_model(
            model,
            train_loader,
            test_dataset,
            train_steps,
            test_every,
            pretrain_steps,
            optimizer_learner,
            optimizer_adv,
            metrics,
            checkpoint_dir,
            logger_learner,
            logger_adv,
            logger_metrics,
            n_iters=i,
            print_loss=print_loss,
            device=device,
        )

    # Average results and return metrics
    metrics.average_results()
    return metrics