def __init__(self, params):
  self.params = params
  self.train_dir = self.params.train_dir
  self.logs_dir = "{}_logs".format(self.train_dir)
  if self.train_dir is None:
    raise ValueError('Trained model directory not specified')
  self.num_gpus = self.params.num_gpus

  # create a message builder for logging
  self.message = utils.MessageBuilder()

  if self.params.cudnn_benchmark:
    cudnn.benchmark = True

  if self.params.num_gpus:
    self.batch_size = self.params.batch_size * self.num_gpus
  else:
    self.batch_size = self.params.batch_size

  if not self.params.data_pattern:
    raise IOError("'data_pattern' was not specified. Nothing to evaluate.")

  # load reader
  self.reader = readers_config[self.params.dataset](
      self.params, self.batch_size, self.num_gpus, is_training=False)

  # load model
  self.model = model_config.get_model_config(
      self.params.model, self.params.dataset, self.params,
      self.reader.n_classes, is_training=False)

  # add normalization as first layer of model
  if self.params.add_normalization:
    normalize_layer = self.reader.get_normalize_layer()
    self.model = torch.nn.Sequential(normalize_layer, self.model)

  self.model = torch.nn.DataParallel(self.model)
  self.model = self.model.cuda()
  self.model.eval()

  # define Smooth classifier
  dim = np.prod(self.reader.img_size[1:])
  self.smooth_model = Smooth(self.model, self.params,
                             self.reader.n_classes, dim)

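# --- Illustrative sketch (not part of the original code) ---
# The Smooth wrapper used above is defined elsewhere. As a rough idea of what a
# randomized-smoothing style classifier does, the hypothetical class below
# predicts by majority vote over Gaussian-noised copies of the input. The class
# name, `sigma`, and `n_samples` are assumptions for illustration only.
import torch

class SmoothSketch:
  """Hypothetical sketch: majority vote over Gaussian-noised copies of x."""

  def __init__(self, model, n_classes, sigma=0.25, n_samples=100):
    self.model = model
    self.n_classes = n_classes
    self.sigma = sigma          # assumed noise scale
    self.n_samples = n_samples  # assumed number of Monte Carlo samples

  @torch.no_grad()
  def predict(self, x):
    # x: a single image of shape (1, C, H, W)
    counts = torch.zeros(self.n_classes, dtype=torch.long)
    for _ in range(self.n_samples):
      noisy = x + self.sigma * torch.randn_like(x)
      pred = self.model(noisy).argmax(dim=1)
      counts[pred.item()] += 1
    return counts.argmax().item()
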
def __init__(self, params):
  # Set up environment variables before doing any other global initialization
  # to make sure it uses the appropriate environment variables.
  utils.set_default_param_values_and_env_vars(params)
  self.params = params

  # Setup logging & log the version.
  global_utils.setup_logging(params.logging_verbosity)
  logging.info("PyTorch version: {}.".format(torch.__version__))
  logging.info("Hostname: {}.".format(socket.gethostname()))

  # print self.params parameters
  pp = pprint.PrettyPrinter(indent=2, compact=True)
  logging.info(pp.pformat(params.values()))

  self.num_gpus = self.params.num_gpus

  # create a message builder for logging
  self.message = global_utils.MessageBuilder()

  if self.params.num_gpus:
    self.batch_size = self.params.batch_size * self.num_gpus
  else:
    self.batch_size = self.params.batch_size

  if not self.params.data_pattern:
    raise IOError("'data_pattern' was not specified. Nothing to evaluate.")

  # load reader and model
  self.reader = readers_config[self.params.dataset](
      self.params, self.batch_size, self.num_gpus, is_training=False)
  self.model = model_config.get_model_config(
      self.params.model, self.params.dataset, self.params,
      self.reader.n_classes, is_training=False)

  # TODO: get the loss another way
  self.criterion = torch.nn.CrossEntropyLoss().cuda()

  if self.num_gpus:
    # self.model = torch.nn.DataParallel(self.model).cuda()
    self.model = self.model.cuda()

def __init__(self, params):
  self.params = params

  # Set up environment variables before doing any other global initialization
  # to make sure it uses the appropriate environment variables.
  utils.set_default_param_values_and_env_vars(params)

  # Setup logging & log the version.
  global_utils.setup_logging(params.logging_verbosity)
  logging.info("TensorFlow version: {}.".format(tf.__version__))
  logging.info("Hostname: {}.".format(socket.gethostname()))

  # print self.params parameters
  pp = pprint.PrettyPrinter(indent=2, compact=True)
  logging.info(pp.pformat(params.values()))

  self.train_dir = self.params.train_dir
  self.logs_dir = "{}_logs".format(self.train_dir)
  if self.train_dir is None:
    raise ValueError('Trained model directory not specified')
  self.num_gpus = self.params.num_gpus
  self.variable_update = self.params.variable_update

  # create a message builder for logging
  self.message = global_utils.MessageBuilder()

  # class for dumping data
  if self.params.eval_under_attack and self.params.dump_files:
    self.dump = DumpFiles(params)

  if self.params.num_gpus:
    self.batch_size = self.params.batch_size * self.num_gpus
  else:
    self.batch_size = self.params.batch_size

  if self.params.eval_under_attack:
    attack_method = self.params.attack_method
    attack_cls = getattr(attacks, attack_method, None)
    if attack_cls is None:
      raise ValueError("Attack is not recognized.")
    attack_config = getattr(self.params, attack_method)
    self.attack = attack_cls(
        batch_size=self.batch_size,
        sample=self.params.attack_sample,
        **attack_config)

  data_pattern = self.params.data_pattern
  if data_pattern == "":
    raise IOError("'data_pattern' was not specified. Nothing to evaluate.")
  self.dataset = re.findall("[a-z0-9]+", data_pattern.lower())[0]

  self.local_parameter_device_flag = self.params.local_parameter_device
  self.task_index = 0
  self.cluster_manager = None
  self.param_server_device = '/{}:0'.format(
      self.params.local_parameter_device)
  self.sync_queue_devices = [self.param_server_device]
  self.num_workers = 1

  # Device to use for ops that need to always run on the local worker's CPU.
  self.cpu_device = '/cpu:0'

  # Device to use for ops that need to always run on the local worker's
  # compute device, and never on a parameter server device.
  self.raw_devices = ['/gpu:{}'.format(i) for i in range(self.num_gpus)]

  if self.params.variable_update == 'parameter_server':
    self.variable_mgr = variable_mgr.VariableMgrLocalFetchFromPS(self)
  elif self.variable_update == 'replicated':
    self.variable_mgr = variable_mgr.VariableMgrLocalReplicated(
        self, self.params.all_reduce_spec,
        self.params.agg_small_grads_max_bytes,
        self.params.agg_small_grads_max_group,
        self.params.allreduce_merge_scope)
  elif self.params.variable_update == 'independent':
    self.variable_mgr = variable_mgr.VariableMgrIndependent(self)
  else:
    raise ValueError('Invalid variable_update in eval mode: {}'.format(
        self.variable_update))

  self.devices = self.variable_mgr.get_devices()

  # TODO: remove auto loss scale and check inf in grad
  self.enable_auto_loss_scale = False

  self.model = model_config.get_model_config(
      self.params.model, self.params.dataset, self.params)
  self.reader = readers_config[self.params.dataset](
      self.params, self.batch_size, self.raw_devices, self.cpu_device,
      is_training=False)

def __init__(self, params):
  self.params = params
  self.train_dir = self.params.train_dir
  self.logs_dir = "{}_logs".format(self.train_dir)
  if self.train_dir is None:
    raise ValueError('Trained model directory not specified')
  self.num_gpus = self.params.num_gpus

  # create a message builder for logging
  self.message = utils.MessageBuilder()

  if self.params.cudnn_benchmark:
    cudnn.benchmark = True

  if self.params.num_gpus:
    self.batch_size = self.params.batch_size * self.num_gpus
  else:
    self.batch_size = self.params.batch_size

  if not self.params.data_pattern:
    raise IOError("'data_pattern' was not specified. Nothing to evaluate.")

  # load reader
  self.reader = readers_config[self.params.dataset](
      self.params, self.batch_size, self.num_gpus, is_training=False)

  # load model
  self.model = model_config.get_model_config(
      self.params.model, self.params.dataset, self.params,
      self.reader.n_classes, is_training=False)

  # add normalization as first layer of model
  if self.params.add_normalization:
    normalize_layer = self.reader.get_normalize_layer()
    self.model = torch.nn.Sequential(normalize_layer, self.model)

  self.model = torch.nn.DataParallel(self.model)
  self.model = self.model.cuda()

  # init loss
  self.criterion = torch.nn.CrossEntropyLoss().cuda()

  # save files for analysis
  if self.params.dump_files:
    assert self.params.eval_under_attack, \
        "dumping files only available when under attack"
    self.dump = DumpFiles(params)

  # eval under attack
  if self.params.eval_under_attack:
    attack_params = self.params.attack_params
    self.attack = utils.get_attack(
        self.model, self.reader.n_classes, self.params.attack_method,
        attack_params)

  if self.params.additive_noise or self.params.adaptive_noise:
    # define Smooth classifier
    dim = np.prod(self.reader.img_size[1:])
    self.smooth_model = Smooth(self.model, self.params,
                               self.reader.n_classes, dim)

def __init__(self, params):
  """Creates a Trainer."""
  utils.set_default_param_values_and_env_vars(params)
  self.params = params

  # Setup logging & log the version.
  utils.setup_logging(params.logging_verbosity)

  self.job_name = self.params.job_name  # "" for local training
  self.is_distributed = bool(self.job_name)
  self.task_index = self.params.task_index
  self.local_rank = self.params.local_rank
  self.start_new_model = self.params.start_new_model
  self.train_dir = self.params.train_dir
  self.num_gpus = self.params.num_gpus
  if self.num_gpus and not self.is_distributed:
    self.batch_size = self.params.batch_size * self.num_gpus
  else:
    self.batch_size = self.params.batch_size

  # print self.params parameters
  if self.start_new_model and self.local_rank == 0:
    pp = pprint.PrettyPrinter(indent=2, compact=True)
    logging.info(pp.pformat(params.values()))
  if self.local_rank == 0:
    logging.info("PyTorch version: {}.".format(torch.__version__))
    logging.info("NCCL Version {}".format(torch.cuda.nccl.version()))
    logging.info("Hostname: {}.".format(socket.gethostname()))

  if self.is_distributed:
    self.num_nodes = len(params.worker_hosts.split(';'))
    self.world_size = self.num_nodes * self.num_gpus
    self.rank = self.task_index * self.num_gpus + self.local_rank
    dist.init_process_group(backend='nccl', init_method='env://',
                            timeout=datetime.timedelta(seconds=30))
    if self.local_rank == 0:
      logging.info('World Size={} => Total batch size {}'.format(
          self.world_size, self.batch_size * self.world_size))
    self.is_master = bool(self.rank == 0)
  else:
    self.world_size = 1
    self.is_master = True

  # create a message builder for logging
  self.message = utils.MessageBuilder()

  # load reader and model
  self.reader = readers_config[self.params.dataset](
      self.params, self.batch_size, self.num_gpus, is_training=True)

  # load model
  self.model = model_config.get_model_config(
      self.params.model, self.params.dataset, self.params,
      self.reader.n_classes, is_training=True)

  # add normalization as first layer of model
  if self.params.add_normalization:
    # In order to certify radii in original coordinates rather than
    # standardized coordinates, we add the noise _before_ standardizing,
    # which is why we have standardization be the first layer of the
    # classifier rather than as a part of preprocessing as is typical.
    normalize_layer = self.reader.get_normalize_layer()
    self.model = torch.nn.Sequential(normalize_layer, self.model)

  # define DistributedDataParallel job
  self.model = SyncBatchNorm.convert_sync_batchnorm(self.model)
  torch.cuda.set_device(params.local_rank)
  self.model = self.model.cuda()
  i = params.local_rank
  self.model = DistributedDataParallel(
      self.model, device_ids=[i], output_device=i)
  if self.local_rank == 0:
    logging.info('Model defined with DistributedDataParallel')

  # define set for saved ckpt
  self.saved_ckpts = set([0])

  # define optimizer
  self.optimizer = utils.get_optimizer(
      self.params.optimizer, self.params.optimizer_params,
      self.params.init_learning_rate, self.params.weight_decay,
      self.model.parameters())

  # define learning rate scheduler
  self.scheduler = utils.get_scheduler(
      self.optimizer, self.params.lr_scheduler,
      self.params.lr_scheduler_params)

  # if start_new_model is False, we restart training
  if not self.start_new_model:
    if self.local_rank == 0:
      logging.info('Restarting training...')
    self._load_state()

  # define Lipschitz regularization module
  if self.params.lipschitz_regularization:
    if self.local_rank == 0:
      logging.info(
          "Lipschitz regularization with decay {}, start after epoch {}".format(
              self.params.lipschitz_decay, self.params.lipschitz_start_epoch))
    self.lipschitz = LipschitzRegularization(
        self.model, self.params, self.reader, self.local_rank)

  # exponential moving average
  self.ema = None
  if getattr(self.params, 'ema', False) > 0:
    self.ema = utils.EMA(self.params.ema)

  # if adversarial training, create the attack class
  if self.params.adversarial_training:
    if self.local_rank == 0:
      logging.info('Adversarial Training')
    attack_params = self.params.adversarial_training_params
    if 'eps_iter' in attack_params.keys() and attack_params['eps_iter'] == -1:
      eps = attack_params['eps']
      n_iter = attack_params['nb_iter']
      attack_params['eps_iter'] = eps / n_iter * 2
      if self.local_rank == 0:
        logging.info('Learning rate for attack: {}'.format(
            attack_params['eps_iter']))
    self.attack = utils.get_attack(
        self.model, self.reader.n_classes,
        self.params.adversarial_training_name, attack_params)

  # init noise
  if self.params.adaptive_noise and self.params.additive_noise:
    raise ValueError(
        "Adaptive and Additive Noise should not be set together")
  if self.params.adaptive_noise:
    if self.local_rank == 0:
      logging.info('Training with Adaptive Noise: {} {}'.format(
          self.params.noise_distribution, self.params.noise_scale))
  elif self.params.additive_noise:
    if self.local_rank == 0:
      logging.info('Training with Noise: {} {}'.format(
          self.params.noise_distribution, self.params.noise_scale))
  if self.params.adaptive_noise or self.params.additive_noise:
    self.noise = utils.Noise(self.params)

  # stability training
  if self.params.stability_training:
    if self.local_rank == 0:
      logging.info("Training with Stability Training: {}".format(
          self.params.stability_training_lambda))
    if not any([self.params.adversarial_training,
                self.params.adaptive_noise,
                self.params.additive_noise]):
      raise ValueError(
          "Adversarial Training or Adaptive Noise should be activated")

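# --- Illustrative sketch (not part of the original code) ---
# The reader's get_normalize_layer() is defined elsewhere. A minimal sketch of
# such a standardization layer, placed first in the model so that noise and
# attacks operate in the original [0, 1] pixel coordinates, could look like
# this (the class name and the per-channel statistics are assumptions).
import torch

class NormalizeLayerSketch(torch.nn.Module):
  """Hypothetical sketch of per-channel standardization as a model layer."""

  def __init__(self, mean, std):
    super().__init__()
    self.register_buffer('mean', torch.tensor(mean).view(1, -1, 1, 1))
    self.register_buffer('std', torch.tensor(std).view(1, -1, 1, 1))

  def forward(self, x):
    return (x - self.mean) / self.std

# example wiring, mirroring the code above (illustrative CIFAR-10 statistics):
# normalize_layer = NormalizeLayerSketch([0.4914, 0.4822, 0.4465],
#                                        [0.2470, 0.2435, 0.2616])
# model = torch.nn.Sequential(normalize_layer, model)
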
def __init__(self, params):
  """Creates a Trainer."""
  self.params = params
  params_sanity_checks(params)

  # Sets up the environment that Trainer should run in.
  setup(params)

  # Setup logging & log the version.
  global_utils.setup_logging(params.logging_verbosity)
  logging.info("TensorFlow version: {}.".format(tf.__version__))
  logging.info("Hostname: {}.".format(socket.gethostname()))

  # print self.params parameters
  pp = pprint.PrettyPrinter(indent=2, compact=True)
  logging.info(pp.pformat(params.values()))

  self.job_name = self.params.job_name  # "" for local training
  self.task_index = self.params.task_index
  self.is_master = (self.job_name in ('', 'worker') and self.task_index == 0)
  self.start_new_model = self.params.start_new_model
  self.train_dir = self.params.train_dir
  self.num_gpus = self.params.num_gpus
  self.variable_update = self.params.variable_update

  # create a message builder for logging
  self.message = global_utils.MessageBuilder()

  self.batch_size = self.params.batch_size * self.num_gpus
  if self.job_name:
    self.global_batch_size = self.batch_size * (
        self.cluster.num_tasks('worker') + 1)
  else:
    self.global_batch_size = self.batch_size

  self.trace_filename = self.params.trace_file
  self.sync_queue_counter = 0

  # TODO: remove auto loss scale and check inf in grad
  self.enable_auto_loss_scale = (
      self.params.use_fp16 and self.params.fp16_enable_auto_loss_scale)
  self.loss_scale = None
  self.loss_scale_normal_steps = None

  # PS server is used for distributed jobs not using all-reduce.
  use_ps_server = self.job_name and (
      self.variable_update != 'distributed_all_reduce' and
      self.variable_update != 'collective_all_reduce')

  # collective_all_reduce doesn't need a controller or ps
  self.distributed_collective = (
      self.variable_update == 'collective_all_reduce' and self.job_name)

  self.local_parameter_device_flag = self.params.local_parameter_device
  if self.job_name:
    self.cluster_manager = platforms_util.get_cluster_manager(
        params, utils.create_config_proto(params))
    assert isinstance(self.cluster_manager, cnn_util.BaseClusterManager)

    worker_prefix = '/job:worker/replica:0/task:{}'.format(self.task_index)
    if use_ps_server:
      self.param_server_device = tf.train.replica_device_setter(
          worker_device=worker_prefix + '/cpu:0',
          cluster=self.cluster_manager.get_cluster_spec())
      # This device on which the queues for managing synchronization between
      # servers should be stored.
      self.sync_queue_devices = [
          '/job:ps/replica:0/task:{}/cpu:0'.format(i)
          for i in range(self.cluster_manager.num_ps())
      ]
    else:
      self.sync_queue_devices = ['/job:worker/replica:0/task:0/cpu:0']
  else:
    self.task_index = 0
    self.cluster_manager = None
    worker_prefix = ''
    self.param_server_device = '/{}:0'.format(
        self.params.local_parameter_device)
    self.sync_queue_devices = [self.param_server_device]

  if self.cluster_manager:
    self.num_workers = self.cluster_manager.num_workers()
  elif self.params.variable_update == 'horovod':
    import horovod.tensorflow as hvd  # pylint: disable=g-import-not-at-top
    self.num_workers = hvd.size()
  else:
    self.num_workers = 1
  self.num_ps = self.cluster_manager.num_ps() if self.cluster_manager else 0

  if self.num_workers > 1 and self.params.all_reduce_spec == 'nccl':
    raise ValueError('--all_reduce_spec=nccl is invalid in a '
                     'multi-worker job')

  # Device to use for ops that need to always run on the local worker's CPU.
  self.cpu_device = '{}/cpu:0'.format(worker_prefix)

  # Device to use for ops that need to always run on the local worker's
  # compute device, and never on a parameter server device.
  self.raw_devices = [
      '{}/gpu:{}'.format(worker_prefix, i) for i in range(self.num_gpus)
  ]

  if self.params.variable_update == 'parameter_server':
    if self.job_name:
      self.variable_mgr = \
          variable_mgr.VariableMgrDistributedFetchFromPS(self)
    else:
      self.variable_mgr = variable_mgr.VariableMgrLocalFetchFromPS(self)
  elif self.variable_update == 'replicated':
    if self.job_name:
      raise ValueError('Invalid variable_update in distributed mode: %s' %
                       self.variable_update)
    self.variable_mgr = variable_mgr.VariableMgrLocalReplicated(
        self, self.params.all_reduce_spec,
        self.params.agg_small_grads_max_bytes,
        self.params.agg_small_grads_max_group,
        self.params.allreduce_merge_scope)
  elif self.variable_update == 'distributed_all_reduce':
    assert self.params.cross_replica_sync
    self.variable_mgr = variable_mgr.VariableMgrDistributedAllReduce(
        self, self.params.all_reduce_spec,
        ('worker' if self.num_workers > 1 else 'localhost'),
        self.num_workers, self.params.agg_small_grads_max_bytes,
        self.params.agg_small_grads_max_group,
        self.params.allreduce_merge_scope)
  elif self.params.variable_update == 'collective_all_reduce':
    assert self.params.cross_replica_sync
    self.variable_mgr = variable_mgr.VariableMgrCollectiveAllReduce(
        self, self.params.all_reduce_spec, self.num_workers, self.num_gpus,
        self.task_index, self.params.allreduce_merge_scope)
  elif self.variable_update == 'distributed_replicated':
    assert self.params.cross_replica_sync
    if not self.job_name:
      raise ValueError('Invalid variable_update in local mode: {}'.format(
          self.variable_update))
    self.variable_mgr = variable_mgr.VariableMgrDistributedReplicated(self)
  elif self.params.variable_update in ('independent', 'horovod'):
    if self.job_name:
      raise ValueError('Invalid variable_update in distributed mode: {}'.format(
          self.variable_update))
    self.variable_mgr = variable_mgr.VariableMgrIndependent(self)
  else:
    raise ValueError('Invalid variable_update: {}'.format(
        self.variable_update))

  # Device to use for running on the local worker's compute device, but
  # with variables assigned to parameter server devices.
  self.devices = self.variable_mgr.get_devices()

  if self.job_name:
    if use_ps_server:
      self.global_step_device = self.param_server_device
    elif self.params.variable_update == 'collective_all_reduce':
      self.global_step_device = self.cpu_device
    else:
      self.global_step_device = '/job:worker/replica:0/task:0/cpu:0'
  else:
    self.global_step_device = self.cpu_device

  self.enable_auto_loss_scale = False

  self.model = model_config.get_model_config(
      self.params.model, self.params.dataset, self.params)
  self.reader = readers_config[self.params.dataset](
      self.params, self.batch_size, self.raw_devices, self.cpu_device,
      is_training=True)

  # define the number of steps
  self.num_steps_by_epoch = self.reader.n_train_files / self.global_batch_size
  self.max_steps = self.params.num_epochs * self.num_steps_by_epoch

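# --- Illustrative arithmetic (not part of the original code) ---
# How num_steps_by_epoch and max_steps relate to the global batch size, with
# assumed, illustrative numbers (not taken from the code above). Note that the
# division yields a float, which downstream code is expected to round or cast.
n_train_files = 1_281_167      # e.g. an ImageNet-sized training set
global_batch_size = 256
num_epochs = 90

num_steps_by_epoch = n_train_files / global_batch_size  # ~5004.6 steps/epoch
max_steps = num_epochs * num_steps_by_epoch             # ~450,410 steps total
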
def __init__(self, params):
  """Creates a Trainer."""
  utils.set_default_param_values_and_env_vars(params)
  self.params = params

  # Setup logging & log the version.
  global_utils.setup_logging(params.logging_verbosity)

  self.job_name = self.params.job_name  # "" for local training
  self.is_distributed = bool(self.job_name)
  self.task_index = self.params.task_index
  self.local_rank = self.params.local_rank
  self.start_new_model = self.params.start_new_model
  self.train_dir = self.params.train_dir
  self.num_gpus = self.params.num_gpus
  if self.num_gpus and not self.is_distributed:
    self.batch_size = self.params.batch_size * self.num_gpus
  else:
    self.batch_size = self.params.batch_size

  # print self.params parameters
  if self.start_new_model and self.local_rank == 0:
    pp = pprint.PrettyPrinter(indent=2, compact=True)
    logging.info(pp.pformat(params.values()))
  if self.local_rank == 0:
    logging.info("PyTorch version: {}.".format(torch.__version__))
    logging.info("NCCL Version {}".format(torch.cuda.nccl.version()))
    logging.info("Hostname: {}.".format(socket.gethostname()))

  if self.is_distributed:
    self.num_nodes = len(params.worker_hosts.split(';'))
    self.world_size = self.num_nodes * self.num_gpus
    self.rank = self.task_index * self.num_gpus + self.local_rank
    dist.init_process_group(backend='nccl', init_method='env://',
                            timeout=datetime.timedelta(seconds=30))
    if self.local_rank == 0:
      logging.info('World Size={} => Total batch size {}'.format(
          self.world_size, self.batch_size * self.world_size))
    self.is_master = bool(self.rank == 0)
  else:
    self.world_size = 1
    self.is_master = True

  # create a message builder for logging
  self.message = global_utils.MessageBuilder()

  # load reader and model
  self.reader = readers_config[self.params.dataset](
      self.params, self.batch_size, self.num_gpus, is_training=True)
  self.model = model_config.get_model_config(
      self.params.model, self.params.dataset, self.params,
      self.reader.n_classes, is_training=True)

  # define DistributedDataParallel job
  self.model = SyncBatchNorm.convert_sync_batchnorm(self.model)
  torch.cuda.set_device(params.local_rank)
  self.model = self.model.cuda()
  i = params.local_rank
  self.model = DistributedDataParallel(
      self.model, device_ids=[i], output_device=i)
  if self.local_rank == 0:
    logging.info('Model defined with DistributedDataParallel')

  # define set for saved ckpt
  self.saved_ckpts = set([0])

  # define optimizer
  self.optimizer = get_optimizer(
      self.params.optimizer, self.params.optimizer_params,
      self.params.init_learning_rate, self.params.weight_decay,
      self.model.parameters())

  # define learning rate scheduler
  self.scheduler = get_scheduler(
      self.optimizer, self.params.lr_scheduler,
      self.params.lr_scheduler_params)

  # if start_new_model is False, we restart training
  if not self.start_new_model:
    if self.local_rank == 0:
      logging.info('Restarting training...')
    self.load_state()

  # define Lipschitz Reg module
  self.lipschitz_reg = LipschitzRegularization(
      self.model, self.params, self.reader, self.local_rank)

  # exponential moving average
  self.ema = None
  if getattr(self.params, 'ema', False) > 0:
    self.ema = utils.EMA(self.params.ema)

  # if adversarial training, create the attack class
  if self.params.adversarial_training:
    if self.local_rank == 0:
      logging.info('Adversarial Training')
    attack_params = self.params.adversarial_training_params
    self.attack = utils.get_attack(
        self.model, self.reader.n_classes,
        self.params.adversarial_training_name, attack_params)

def __init__(self, params):
  # Set up environment variables before doing any other global initialization
  # to make sure it uses the appropriate environment variables.
  utils.set_default_param_values_and_env_vars(params)
  self.params = params

  # Setup logging & log the version.
  global_utils.setup_logging(params.logging_verbosity)
  logging.info("PyTorch version: {}.".format(torch.__version__))
  logging.info("Hostname: {}.".format(socket.gethostname()))

  # print self.params parameters
  pp = pprint.PrettyPrinter(indent=2, compact=True)
  logging.info(pp.pformat(params.values()))

  self.train_dir = self.params.train_dir
  self.logs_dir = "{}_logs".format(self.train_dir)
  if self.train_dir is None:
    raise ValueError('Trained model directory not specified')
  self.num_gpus = self.params.num_gpus

  # create a message builder for logging
  self.message = global_utils.MessageBuilder()

  if self.params.cudnn_benchmark:
    cudnn.benchmark = True

  if self.params.num_gpus:
    self.batch_size = self.params.batch_size * self.num_gpus
  else:
    self.batch_size = self.params.batch_size

  if not self.params.data_pattern:
    raise IOError("'data_pattern' was not specified. Nothing to evaluate.")

  # load reader and model
  self.reader = readers_config[self.params.dataset](
      self.params, self.batch_size, self.num_gpus, is_training=False)
  self.model = model_config.get_model_config(
      self.params.model, self.params.dataset, self.params,
      self.reader.n_classes, is_training=False)

  # TODO: get the loss another way
  self.criterion = torch.nn.CrossEntropyLoss().cuda()

  self.model = torch.nn.DataParallel(self.model)
  self.model = self.model.cuda()

  self.add_noise = False
  eot = getattr(self.params, 'eot', False)
  if getattr(self.params, 'add_noise', False) and not eot:
    self.add_noise = True
    self.noise = utils.AddNoise(self.params)

  if self.params.eval_under_attack:
    if self.params.dump_files:
      self.dump = DumpFiles(params)
    if eot:
      attack_model = utils.EOTWrapper(
          self.model, self.reader.n_classes, self.params)
    else:
      attack_model = self.model
    attack_params = self.params.attack_params
    # run the attack on the EOT-wrapped model when EOT is enabled
    self.attack = utils.get_attack(
        attack_model, self.reader.n_classes, self.params.attack_method,
        attack_params)

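# --- Illustrative sketch (not part of the original code) ---
# utils.EOTWrapper is defined elsewhere. A common way to apply Expectation over
# Transformation to a noise-randomized model is to average the model's output
# over several noise draws, so that gradient-based attacks see the expected
# prediction. The class name, `sigma`, and `n_samples` below are assumptions.
import torch

class EOTWrapperSketch(torch.nn.Module):
  """Hypothetical sketch: average logits over random Gaussian noise draws."""

  def __init__(self, model, sigma=0.25, n_samples=8):
    super().__init__()
    self.model = model
    self.sigma = sigma          # assumed noise scale
    self.n_samples = n_samples  # assumed number of draws per forward pass

  def forward(self, x):
    logits = 0
    for _ in range(self.n_samples):
      logits = logits + self.model(x + self.sigma * torch.randn_like(x))
    return logits / self.n_samples
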
def __init__(self, params):
  """Creates a Trainer."""
  utils.set_default_param_values_and_env_vars(params)
  self.params = params

  # Setup logging & log the version.
  global_utils.setup_logging(params.logging_verbosity)
  logging.info("PyTorch version: {}.".format(torch.__version__))
  logging.info("Hostname: {}.".format(socket.gethostname()))

  # print self.params parameters
  pp = pprint.PrettyPrinter(indent=2, compact=True)
  logging.info(pp.pformat(params.values()))

  self.job_name = self.params.job_name  # "" for local training
  self.is_distributed = bool(self.job_name)
  self.task_index = self.params.task_index
  self.local_rank = self.params.local_rank
  self.start_new_model = self.params.start_new_model
  self.train_dir = self.params.train_dir
  self.num_gpus = self.params.num_gpus
  if self.num_gpus and not self.is_distributed:
    self.batch_size = self.params.batch_size * self.num_gpus
  else:
    self.batch_size = self.params.batch_size

  if self.is_distributed:
    self.num_nodes = len(params.worker_hosts.split(';'))
    self.world_size = self.num_nodes * self.num_gpus
    self.rank = self.task_index * self.num_gpus + self.local_rank
    dist.init_process_group(backend='nccl', init_method='env://',
                            timeout=datetime.timedelta(seconds=30))
    if self.local_rank == 0:
      logging.info('world size={}'.format(self.world_size))
    logging.info('Distributed init done, local_rank={}, rank={}'.format(
        self.local_rank, self.rank))
    self.is_master = bool(self.rank == 0)
  else:
    self.is_master = True

  # create a message builder for logging
  self.message = global_utils.MessageBuilder()

  # load reader and model
  self.reader = readers_config[self.params.dataset](
      self.params, self.batch_size, self.num_gpus, is_training=True)
  self.model = model_config.get_model_config(
      self.params.model, self.params.dataset, self.params,
      self.reader.n_classes, is_training=True)

  if not params.job_name:
    self.model = torch.nn.DataParallel(self.model)
    self.model = self.model.cuda()
  else:
    torch.cuda.set_device(params.local_rank)
    self.model = self.model.cuda()
    i = params.local_rank
    self.model = DistributedDataParallel(
        self.model, device_ids=[i], output_device=i)
    logging.info('model defined with DistributedDataParallel')

  # if adversarial training, create the attack class
  if self.params.adversarial_training:
    attack_params = self.params.adversarial_training_params
    # from advertorch import attacks
    # self.adversaries = {}
    # self.adversaries["PGDLinf"] = attacks.LinfPGDAttack(
    #     self.model, eps=0.031, nb_iter=10, eps_iter=2*0.031/10,
    #     rand_init=True, clip_min=0.0, clip_max=1.0)
    # self.adversaries["PGDL2"] = attacks.L2PGDAttack(
    #     self.model, eps=0.83, nb_iter=10, eps_iter=2*0.83/10,
    #     rand_init=True, clip_min=0.0, clip_max=1.0)
    self.attack = utils.get_attack(
        self.model, self.reader.n_classes,
        self.params.adversarial_training_name, attack_params)

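# --- Illustrative note (not part of the original code) ---
# The trainers above call dist.init_process_group(backend='nccl',
# init_method='env://') without passing rank or world_size, so PyTorch reads
# the rendezvous information from environment variables that the launcher is
# expected to set. The values below are placeholders for a single-node,
# two-process run; they only illustrate which variables must be present.
import os

os.environ.setdefault('MASTER_ADDR', '127.0.0.1')  # address of the rank-0 host
os.environ.setdefault('MASTER_PORT', '29500')      # any free TCP port
os.environ.setdefault('WORLD_SIZE', '2')           # num_nodes * num_gpus
os.environ.setdefault('RANK', '0')                 # this process's global rank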