Example #1
    def __init__(self, params):

        self.params = params

        self.train_dir = self.params.train_dir
        if self.train_dir is None:
            raise ValueError('Trained model directory not specified')
        self.logs_dir = "{}_logs".format(self.train_dir)
        self.num_gpus = self.params.num_gpus

        # create a message builder for logging
        self.message = utils.MessageBuilder()

        if self.params.cudnn_benchmark:
            cudnn.benchmark = True

        if self.params.num_gpus:
            self.batch_size = self.params.batch_size * self.num_gpus
        else:
            self.batch_size = self.params.batch_size

        if not self.params.data_pattern:
            raise IOError("'data_pattern' was not specified. "
                          "Nothing to evaluate.")

        # load reader
        self.reader = readers_config[self.params.dataset](self.params,
                                                          self.batch_size,
                                                          self.num_gpus,
                                                          is_training=False)

        # load model
        self.model = model_config.get_model_config(self.params.model,
                                                   self.params.dataset,
                                                   self.params,
                                                   self.reader.n_classes,
                                                   is_training=False)
        # add normalization as first layer of model
        if self.params.add_normalization:
            normalize_layer = self.reader.get_normalize_layer()
            self.model = torch.nn.Sequential(normalize_layer, self.model)
        self.model = torch.nn.DataParallel(self.model)
        self.model = self.model.cuda()
        self.model.eval()

        # define Smooth classifier
        dim = np.prod(self.reader.img_size[1:])
        self.smooth_model = Smooth(self.model, self.params,
                                   self.reader.n_classes, dim)
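The `Smooth` class used above is project-specific, but the idea it implements is standard randomized smoothing: classify many Gaussian-noised copies of an input and take a majority vote. A minimal sketch of that prediction rule, with an illustrative `sigma` and sample count that are assumptions rather than values from the example:

import torch

def smoothed_predict(model, x, n_samples=100, sigma=0.25, n_classes=10):
    # classify n_samples noisy copies of a single image x of shape [C, H, W]
    model.eval()
    with torch.no_grad():
        batch = x.unsqueeze(0).repeat(n_samples, 1, 1, 1)
        noisy = batch + sigma * torch.randn_like(batch)
        preds = model(noisy).argmax(dim=1)
        counts = torch.bincount(preds, minlength=n_classes)
    return counts.argmax().item()  # majority-vote class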
Example #2
    def __init__(self, params):

        # Set up environment variables before doing any other global initialization to
        # make sure it uses the appropriate environment variables.
        utils.set_default_param_values_and_env_vars(params)

        self.params = params

        # Setup logging & log the version.
        global_utils.setup_logging(params.logging_verbosity)
        logging.info("Pytorch version: {}.".format(torch.__version__))
        logging.info("Hostname: {}.".format(socket.gethostname()))

        # print self.params parameters
        pp = pprint.PrettyPrinter(indent=2, compact=True)
        logging.info(pp.pformat(params.values()))

        self.num_gpus = self.params.num_gpus

        # create a message builder for logging
        self.message = global_utils.MessageBuilder()

        if self.params.num_gpus:
            self.batch_size = self.params.batch_size * self.num_gpus
        else:
            self.batch_size = self.params.batch_size

        if not self.params.data_pattern:
            raise IOError("'data_pattern' was not specified. "
                          "Nothing to evaluate.")

        # load reader and model
        self.reader = readers_config[self.params.dataset](self.params,
                                                          self.batch_size,
                                                          self.num_gpus,
                                                          is_training=False)
        self.model = model_config.get_model_config(self.params.model,
                                                   self.params.dataset,
                                                   self.params,
                                                   self.reader.n_classes,
                                                   is_training=False)
        # TODO: get the loss another way
        self.criterion = torch.nn.CrossEntropyLoss().cuda()

        if self.num_gpus:
            # self.model = torch.nn.DataParallel(self.model).cuda()
            self.model = self.model.cuda()
Example #3
    def __init__(self, params):

        self.params = params

        # Set up environment variables before doing any other global initialization to
        # make sure it uses the appropriate environment variables.
        utils.set_default_param_values_and_env_vars(params)

        # Setup logging & log the version.
        global_utils.setup_logging(params.logging_verbosity)
        logging.info("Tensorflow version: {}.".format(tf.__version__))
        logging.info("Hostname: {}.".format(socket.gethostname()))

        # print self.params parameters
        pp = pprint.PrettyPrinter(indent=2, compact=True)
        logging.info(pp.pformat(params.values()))

        self.train_dir = self.params.train_dir
        if self.train_dir is None:
            raise ValueError('Trained model directory not specified')
        self.logs_dir = "{}_logs".format(self.train_dir)
        self.num_gpus = self.params.num_gpus
        self.variable_update = self.params.variable_update

        # create a message builder for logging
        self.message = global_utils.MessageBuilder()

        # class for dumping data
        if self.params.eval_under_attack and self.params.dump_files:
            self.dump = DumpFiles(params)

        if self.params.num_gpus:
            self.batch_size = self.params.batch_size * self.num_gpus
        else:
            self.batch_size = self.params.batch_size

        if self.params.eval_under_attack:
            attack_method = self.params.attack_method
            attack_cls = getattr(attacks, attack_method, None)
            if attack_cls is None:
                raise ValueError("Attack is not recognized.")
            attack_config = getattr(self.params, attack_method)
            self.attack = attack_cls(batch_size=self.batch_size,
                                     sample=self.params.attack_sample,
                                     **attack_config)

        data_pattern = self.params.data_pattern
        if not data_pattern:
            raise IOError("'data_pattern' was not specified. "
                          "Nothing to evaluate.")
        self.dataset = re.findall("[a-z0-9]+", data_pattern.lower())[0]

        self.local_parameter_device_flag = self.params.local_parameter_device
        self.task_index = 0
        self.cluster_manager = None
        self.param_server_device = '/{}:0'.format(
            self.params.local_parameter_device)
        self.sync_queue_devices = [self.param_server_device]

        self.num_workers = 1

        # Device to use for ops that need to always run on the local worker's CPU.
        self.cpu_device = '/cpu:0'

        # Device to use for ops that need to always run on the local worker's
        # compute device, and never on a parameter server device.
        self.raw_devices = ['/gpu:{}'.format(i) for i in range(self.num_gpus)]

        if self.params.variable_update == 'parameter_server':
            self.variable_mgr = variable_mgr.VariableMgrLocalFetchFromPS(self)
        elif self.variable_update == 'replicated':
            self.variable_mgr = variable_mgr.VariableMgrLocalReplicated(
                self, self.params.all_reduce_spec,
                self.params.agg_small_grads_max_bytes,
                self.params.agg_small_grads_max_group,
                self.params.allreduce_merge_scope)
        elif self.params.variable_update == 'independent':
            self.variable_mgr = variable_mgr.VariableMgrIndependent(self)
        else:
            raise ValueError('Invalid variable_update in eval mode: {}'.format(
                self.variable_update))

        self.devices = self.variable_mgr.get_devices()

        # TODO: remove auto loss scale and check inf in grad
        self.enable_auto_loss_scale = False

        self.model = model_config.get_model_config(self.params.model,
                                                   self.params.dataset,
                                                   self.params)
        self.reader = readers_config[self.params.dataset](self.params,
                                                          self.batch_size,
                                                          self.raw_devices,
                                                          self.cpu_device,
                                                          is_training=False)
Example #4
    def __init__(self, params):

        self.params = params

        self.train_dir = self.params.train_dir
        if self.train_dir is None:
            raise ValueError('Trained model directory not specified')
        self.logs_dir = "{}_logs".format(self.train_dir)
        self.num_gpus = self.params.num_gpus

        # create a message builder for logging
        self.message = utils.MessageBuilder()

        if self.params.cudnn_benchmark:
            cudnn.benchmark = True

        if self.params.num_gpus:
            self.batch_size = self.params.batch_size * self.num_gpus
        else:
            self.batch_size = self.params.batch_size

        if not self.params.data_pattern:
            raise IOError("'data_pattern' was not specified. "
                          "Nothing to evaluate.")

        # load reader
        self.reader = readers_config[self.params.dataset](self.params,
                                                          self.batch_size,
                                                          self.num_gpus,
                                                          is_training=False)

        # load model
        self.model = model_config.get_model_config(self.params.model,
                                                   self.params.dataset,
                                                   self.params,
                                                   self.reader.n_classes,
                                                   is_training=False)
        # add normalization as first layer of model
        if self.params.add_normalization:
            normalize_layer = self.reader.get_normalize_layer()
            self.model = torch.nn.Sequential(normalize_layer, self.model)
        self.model = torch.nn.DataParallel(self.model)
        self.model = self.model.cuda()

        # init loss
        self.criterion = torch.nn.CrossEntropyLoss().cuda()

        # save files for analysis
        if self.params.dump_files:
            assert self.params.eval_under_attack, \
                "dumping files only available when under attack"
            self.dump = DumpFiles(params)

        # eval under attack
        if self.params.eval_under_attack:
            attack_params = self.params.attack_params
            self.attack = utils.get_attack(self.model, self.reader.n_classes,
                                           self.params.attack_method,
                                           attack_params)

        if self.params.additive_noise or self.params.adaptive_noise:
            # define Smooth classifier
            dim = np.prod(self.reader.img_size[1:])
            self.smooth_model = Smooth(self.model, self.params,
                                       self.reader.n_classes, dim)
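Example #4 obtains its attack through the project's `utils.get_attack` helper, whose exact interface is not shown here. As a rough illustration of what an attack does, a generic single-step FGSM sketch (not the project's implementation; `eps` is an assumed budget):

import torch
import torch.nn.functional as F

def fgsm_attack(model, images, labels, eps=8 / 255):
    # take one step in the sign of the loss gradient, then clip back to the valid image range
    images = images.clone().detach().requires_grad_(True)
    loss = F.cross_entropy(model(images), labels)
    loss.backward()
    adv = images + eps * images.grad.sign()
    return adv.clamp(0.0, 1.0).detach()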
Example #5
    def __init__(self, params):
        """Creates a Trainer.
    """
        utils.set_default_param_values_and_env_vars(params)
        self.params = params

        # Setup logging & log the version.
        utils.setup_logging(params.logging_verbosity)

        self.job_name = self.params.job_name  # "" for local training
        self.is_distributed = bool(self.job_name)
        self.task_index = self.params.task_index
        self.local_rank = self.params.local_rank
        self.start_new_model = self.params.start_new_model
        self.train_dir = self.params.train_dir
        self.num_gpus = self.params.num_gpus
        if self.num_gpus and not self.is_distributed:
            self.batch_size = self.params.batch_size * self.num_gpus
        else:
            self.batch_size = self.params.batch_size

        # print self.params parameters
        if self.start_new_model and self.local_rank == 0:
            pp = pprint.PrettyPrinter(indent=2, compact=True)
            logging.info(pp.pformat(params.values()))

        if self.local_rank == 0:
            logging.info("PyTorch version: {}.".format(torch.__version__))
            logging.info("NCCL Version {}".format(torch.cuda.nccl.version()))
            logging.info("Hostname: {}.".format(socket.gethostname()))

        if self.is_distributed:
            self.num_nodes = len(params.worker_hosts.split(';'))
            self.world_size = self.num_nodes * self.num_gpus
            self.rank = self.task_index * self.num_gpus + self.local_rank
            dist.init_process_group(backend='nccl',
                                    init_method='env://',
                                    timeout=datetime.timedelta(seconds=30))
            if self.local_rank == 0:
                logging.info('World Size={} => Total batch size {}'.format(
                    self.world_size, self.batch_size * self.world_size))
            self.is_master = bool(self.rank == 0)
        else:
            self.world_size = 1
            self.is_master = True

        # create a message builder for logging
        self.message = utils.MessageBuilder()

        # load reader and model
        self.reader = readers_config[self.params.dataset](self.params,
                                                          self.batch_size,
                                                          self.num_gpus,
                                                          is_training=True)

        # load model
        self.model = model_config.get_model_config(self.params.model,
                                                   self.params.dataset,
                                                   self.params,
                                                   self.reader.n_classes,
                                                   is_training=True)
        # add normalization as first layer of model
        if self.params.add_normalization:
            # In order to certify radii in original coordinates rather than standardized coordinates, we
            # add the noise _before_ standardizing, which is why we have standardization be the first
            # layer of the classifier rather than as a part of preprocessing as is typical.
            normalize_layer = self.reader.get_normalize_layer()
            self.model = torch.nn.Sequential(normalize_layer, self.model)

        # define DistributedDataParallel job
        self.model = SyncBatchNorm.convert_sync_batchnorm(self.model)
        torch.cuda.set_device(params.local_rank)
        self.model = self.model.cuda()
        i = params.local_rank
        self.model = DistributedDataParallel(self.model,
                                             device_ids=[i],
                                             output_device=i)
        if self.local_rank == 0:
            logging.info('Model defined with DistributedDataParallel')

        # define set for saved ckpt
        self.saved_ckpts = set([0])

        # define optimizer
        self.optimizer = utils.get_optimizer(self.params.optimizer,
                                             self.params.optimizer_params,
                                             self.params.init_learning_rate,
                                             self.params.weight_decay,
                                             self.model.parameters())

        # define learning rate scheduler
        self.scheduler = utils.get_scheduler(self.optimizer,
                                             self.params.lr_scheduler,
                                             self.params.lr_scheduler_params)

        # if start_new_model is False, we restart training
        if not self.start_new_model:
            if self.local_rank == 0:
                logging.info('Restarting training...')
            self._load_state()

        # define Lipschitz regularization module
        if self.params.lipschitz_regularization:
            if self.local_rank == 0:
                logging.info(
                    "Lipschitz regularization with decay {}, start after epoch {}"
                    .format(self.params.lipschitz_decay,
                            self.params.lipschitz_start_epoch))
            self.lipschitz = LipschitzRegularization(self.model, self.params,
                                                     self.reader,
                                                     self.local_rank)

        # exponential moving average
        self.ema = None
        if getattr(self.params, 'ema', 0) > 0:
            self.ema = utils.EMA(self.params.ema)

        # if adversarial training, create the attack class
        if self.params.adversarial_training:
            if self.local_rank == 0:
                logging.info('Adversarial Training')
            attack_params = self.params.adversarial_training_params
            # when 'eps_iter' is left at -1, default the attack step size to 2 * eps / nb_iter
            if 'eps_iter' in attack_params.keys() and attack_params['eps_iter'] == -1:
                eps = attack_params['eps']
                n_iter = attack_params['nb_iter']
                attack_params['eps_iter'] = eps / n_iter * 2
                if self.local_rank == 0:
                    logging.info('Learning rate for attack: {}'.format(
                        attack_params['eps_iter']))
            self.attack = utils.get_attack(
                self.model, self.reader.n_classes,
                self.params.adversarial_training_name, attack_params)

        # init noise
        if self.params.adaptive_noise and self.params.additive_noise:
            raise ValueError(
                "Adaptive and Additive Noise should not be set together")
        if self.params.adaptive_noise:
            if self.local_rank == 0:
                logging.info('Training with Adaptive Noise: {} {}'.format(
                    self.params.noise_distribution, self.params.noise_scale))
        elif self.params.additive_noise:
            if self.local_rank == 0:
                logging.info('Training with Noise: {} {}'.format(
                    self.params.noise_distribution, self.params.noise_scale))
        if self.params.adaptive_noise or self.params.additive_noise:
            self.noise = utils.Noise(self.params)

        # stability training
        if self.params.stability_training:
            if self.local_rank == 0:
                logging.info("Training with Stability Training: {}".format(
                    self.params.stability_training_lambda))
            if not any([
                    self.params.adversarial_training,
                    self.params.adaptive_noise, self.params.additive_noise
            ]):
                raise ValueError(
                    "Stability training requires adversarial training, "
                    "adaptive noise or additive noise to be activated")
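Example #5 optionally tracks an exponential moving average of the weights through `utils.EMA`, which is project code. A minimal sketch of the usual EMA bookkeeping for a plain PyTorch module (class name and decay are illustrative assumptions); calling `update` after each optimizer step keeps a smoothed copy of the parameters for evaluation:

import torch

class SimpleEMA:
    # keep a shadow copy of the parameters, updated as an exponential moving average
    def __init__(self, model, decay=0.999):
        self.decay = decay
        self.shadow = {name: p.detach().clone()
                       for name, p in model.named_parameters() if p.requires_grad}

    @torch.no_grad()
    def update(self, model):
        for name, p in model.named_parameters():
            if name in self.shadow:
                self.shadow[name].mul_(self.decay).add_(p.detach(), alpha=1.0 - self.decay)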
Example #6
    def __init__(self, params):
        """Creates a Trainer.
    """
        self.params = params
        params_sanity_checks(params)

        # Sets up the environment that Trainer should run in.
        setup(params)

        # Setup logging & log the version.
        global_utils.setup_logging(params.logging_verbosity)
        logging.info("Tensorflow version: {}.".format(tf.__version__))
        logging.info("Hostname: {}.".format(socket.gethostname()))

        # print self.params parameters
        pp = pprint.PrettyPrinter(indent=2, compact=True)
        logging.info(pp.pformat(params.values()))

        self.job_name = self.params.job_name  # "" for local training
        self.task_index = self.params.task_index
        self.is_master = (self.job_name in ('', 'worker')
                          and self.task_index == 0)
        self.start_new_model = self.params.start_new_model
        self.train_dir = self.params.train_dir
        self.num_gpus = self.params.num_gpus
        self.variable_update = self.params.variable_update

        # create a message builder for logging
        self.message = global_utils.MessageBuilder()

        self.batch_size = self.params.batch_size * self.num_gpus
        if self.job_name:
            self.global_batch_size = self.batch_size * \
                (self.cluster.num_tasks('worker') + 1)
        else:
            self.global_batch_size = self.batch_size

        self.trace_filename = self.params.trace_file
        self.sync_queue_counter = 0

        # TODO: remove auto loss scale and check inf in grad
        self.enable_auto_loss_scale = (self.params.use_fp16 and
                                       self.params.fp16_enable_auto_loss_scale)
        self.loss_scale = None
        self.loss_scale_normal_steps = None

        # PS server is used for distributed jobs not using all-reduce.
        use_ps_server = self.job_name and \
          (self.variable_update != 'distributed_all_reduce' and \
           self.variable_update != 'collective_all_reduce')

        # collective_all_reduce doesn't need a controller or ps
        self.distributed_collective = (self.variable_update
                                       == 'collective_all_reduce'
                                       and self.job_name)

        self.local_parameter_device_flag = self.params.local_parameter_device
        if self.job_name:
            self.cluster_manager = platforms_util.get_cluster_manager(
                params, utils.create_config_proto(params))
            assert isinstance(self.cluster_manager,
                              cnn_util.BaseClusterManager)

            worker_prefix = '/job:worker/replica:0/task:{}'.format(
                self.task_index)
            if use_ps_server:
                self.param_server_device = tf.train.replica_device_setter(
                    worker_device=worker_prefix + '/cpu:0',
                    cluster=self.cluster_manager.get_cluster_spec())
                # The device on which the queues for managing synchronization
                # between servers should be stored.
                self.sync_queue_devices = [
                    '/job:ps/replica:0/task:{}/cpu:0'.format(i)
                    for i in range(self.cluster_manager.num_ps())
                ]
            else:
                self.sync_queue_devices = [
                    '/job:worker/replica:0/task:0/cpu:0'
                ]
        else:
            self.task_index = 0
            self.cluster_manager = None
            worker_prefix = ''
            self.param_server_device = '/{}:0'.format(
                self.params.local_parameter_device)
            self.sync_queue_devices = [self.param_server_device]

        if self.cluster_manager:
            self.num_workers = self.cluster_manager.num_workers()
        elif self.params.variable_update == 'horovod':
            import horovod.tensorflow as hvd  # pylint: disable=g-import-not-at-top
            self.num_workers = hvd.size()
        else:
            self.num_workers = 1
        self.num_ps = (self.cluster_manager.num_ps()
                       if self.cluster_manager else 0)

        if self.num_workers > 1 and self.params.all_reduce_spec == 'nccl':
            raise ValueError('--all_reduce_spec=nccl is invalid in a '
                             'multi-worker job')

        # Device to use for ops that need to always run on the local worker's CPU.
        self.cpu_device = '{}/cpu:0'.format(worker_prefix)

        # Device to use for ops that need to always run on the local worker's
        # compute device, and never on a parameter server device.
        self.raw_devices = [
            '{}/gpu:{}'.format(worker_prefix, i) for i in range(self.num_gpus)
        ]

        if self.params.variable_update == 'parameter_server':
            if self.job_name:
                self.variable_mgr = \
                    variable_mgr.VariableMgrDistributedFetchFromPS(self)
            else:
                self.variable_mgr = variable_mgr.VariableMgrLocalFetchFromPS(
                    self)
        elif self.variable_update == 'replicated':
            if self.job_name:
                raise ValueError(
                    'Invalid variable_update in distributed mode: %s' %
                    self.variable_update)
            self.variable_mgr = variable_mgr.VariableMgrLocalReplicated(
                self, self.params.all_reduce_spec,
                self.params.agg_small_grads_max_bytes,
                self.params.agg_small_grads_max_group,
                self.params.allreduce_merge_scope)
        elif self.variable_update == 'distributed_all_reduce':
            assert self.params.cross_replica_sync
            self.variable_mgr = variable_mgr.VariableMgrDistributedAllReduce(
                self, self.params.all_reduce_spec,
                ('worker' if self.num_workers > 1 else 'localhost'),
                self.num_workers, self.params.agg_small_grads_max_bytes,
                self.params.agg_small_grads_max_group,
                self.params.allreduce_merge_scope)
        elif self.params.variable_update == 'collective_all_reduce':
            assert self.params.cross_replica_sync
            self.variable_mgr = variable_mgr.VariableMgrCollectiveAllReduce(
                self, self.params.all_reduce_spec, self.num_workers,
                self.num_gpus, self.task_index,
                self.params.allreduce_merge_scope)
        elif self.variable_update == 'distributed_replicated':
            assert self.params.cross_replica_sync
            if not self.job_name:
                raise ValueError(
                    'Invalid variable_update in local mode: {}'.format(
                        self.variable_update))
            self.variable_mgr = variable_mgr.VariableMgrDistributedReplicated(
                self)
        elif self.params.variable_update in ('independent', 'horovod'):
            if self.job_name:
                raise ValueError(
                    'Invalid variable_update in distributed mode: {}'.format(
                        self.variable_update))
            self.variable_mgr = variable_mgr.VariableMgrIndependent(self)
        else:
            raise ValueError('Invalid variable_update: {}'.format(
                self.variable_update))

        # Device to use for running on the local worker's compute device, but
        # with variables assigned to parameter server devices.
        self.devices = self.variable_mgr.get_devices()
        if self.job_name:
            if use_ps_server:
                self.global_step_device = self.param_server_device
            elif self.params.variable_update == 'collective_all_reduce':
                self.global_step_device = self.cpu_device
            else:
                self.global_step_device = '/job:worker/replica:0/task:0/cpu:0'
        else:
            self.global_step_device = self.cpu_device

        self.enable_auto_loss_scale = False

        self.model = model_config.get_model_config(self.params.model,
                                                   self.params.dataset,
                                                   self.params)
        self.reader = readers_config[self.params.dataset](self.params,
                                                          self.batch_size,
                                                          self.raw_devices,
                                                          self.cpu_device,
                                                          is_training=True)

        # define the number of steps
        self.num_steps_by_epoch = self.reader.n_train_files / self.global_batch_size
        self.max_steps = self.params.num_epochs * self.num_steps_by_epoch
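The last two lines derive the training length from the dataset size and the global batch size. A small worked example with assumed numbers (rounded up here to get a whole number of steps; the snippet itself keeps the float):

import math

n_train_files = 50_000       # e.g. CIFAR-10 training set (assumed)
global_batch_size = 256
num_epochs = 90
num_steps_by_epoch = math.ceil(n_train_files / global_batch_size)  # 196
max_steps = num_epochs * num_steps_by_epoch                        # 17640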
Example #7
    def __init__(self, params):
        """Creates a Trainer.
    """
        utils.set_default_param_values_and_env_vars(params)
        self.params = params

        # Setup logging & log the version.
        global_utils.setup_logging(params.logging_verbosity)

        self.job_name = self.params.job_name  # "" for local training
        self.is_distributed = bool(self.job_name)
        self.task_index = self.params.task_index
        self.local_rank = self.params.local_rank
        self.start_new_model = self.params.start_new_model
        self.train_dir = self.params.train_dir
        self.num_gpus = self.params.num_gpus
        if self.num_gpus and not self.is_distributed:
            self.batch_size = self.params.batch_size * self.num_gpus
        else:
            self.batch_size = self.params.batch_size

        # print self.params parameters
        if self.start_new_model and self.local_rank == 0:
            pp = pprint.PrettyPrinter(indent=2, compact=True)
            logging.info(pp.pformat(params.values()))

        if self.local_rank == 0:
            logging.info("PyTorch version: {}.".format(torch.__version__))
            logging.info("NCCL Version {}".format(torch.cuda.nccl.version()))
            logging.info("Hostname: {}.".format(socket.gethostname()))

        if self.is_distributed:
            self.num_nodes = len(params.worker_hosts.split(';'))
            self.world_size = self.num_nodes * self.num_gpus
            self.rank = self.task_index * self.num_gpus + self.local_rank
            dist.init_process_group(backend='nccl',
                                    init_method='env://',
                                    timeout=datetime.timedelta(seconds=30))
            if self.local_rank == 0:
                logging.info('World Size={} => Total batch size {}'.format(
                    self.world_size, self.batch_size * self.world_size))
            self.is_master = bool(self.rank == 0)
        else:
            self.world_size = 1
            self.is_master = True

        # create a message builder for logging
        self.message = global_utils.MessageBuilder()

        # load reader and model
        self.reader = readers_config[self.params.dataset](self.params,
                                                          self.batch_size,
                                                          self.num_gpus,
                                                          is_training=True)
        self.model = model_config.get_model_config(self.params.model,
                                                   self.params.dataset,
                                                   self.params,
                                                   self.reader.n_classes,
                                                   is_training=True)
        # define DistributedDataParallel job
        self.model = SyncBatchNorm.convert_sync_batchnorm(self.model)
        torch.cuda.set_device(params.local_rank)
        self.model = self.model.cuda()
        i = params.local_rank
        self.model = DistributedDataParallel(self.model,
                                             device_ids=[i],
                                             output_device=i)
        if self.local_rank == 0:
            logging.info('Model defined with DistributedDataParallel')

        # define set for saved ckpt
        self.saved_ckpts = set([0])

        # define optimizer
        self.optimizer = get_optimizer(self.params.optimizer,
                                       self.params.optimizer_params,
                                       self.params.init_learning_rate,
                                       self.params.weight_decay,
                                       self.model.parameters())

        # define learning rate scheduler
        self.scheduler = get_scheduler(self.optimizer,
                                       self.params.lr_scheduler,
                                       self.params.lr_scheduler_params)

        # if start_new_model is False, we restart training
        if not self.start_new_model:
            if self.local_rank == 0:
                logging.info('Restarting training...')
            self.load_state()

        # define Lipschitz Reg module
        self.lipschitz_reg = LipschitzRegularization(self.model, self.params,
                                                     self.reader,
                                                     self.local_rank)

        # exponential moving average
        self.ema = None
        if getattr(self.params, 'ema', 0) > 0:
            self.ema = utils.EMA(self.params.ema)

        # if adversarial training, create the attack class
        if self.params.adversarial_training:
            if self.local_rank == 0:
                logging.info('Adversarial Training')
            attack_params = self.params.adversarial_training_params
            self.attack = utils.get_attack(
                self.model, self.reader.n_classes,
                self.params.adversarial_training_name, attack_params)
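`get_optimizer` and `get_scheduler` are project helpers configured entirely from `params`. As a hedged sketch of what they might return for a common setup (SGD with momentum plus step decay; every hyperparameter below is an assumption):

import torch

def build_optimizer_and_scheduler(model, lr=0.1, momentum=0.9,
                                  weight_decay=1e-4, milestones=(30, 60, 90)):
    # SGD with momentum and weight decay; learning rate divided by 10 at each milestone epoch
    optimizer = torch.optim.SGD(model.parameters(), lr=lr, momentum=momentum,
                                weight_decay=weight_decay)
    scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer,
                                                     milestones=list(milestones),
                                                     gamma=0.1)
    return optimizer, scheduler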
Example #8
  def __init__(self, params):

    # Set up environment variables before doing any other global initialization to
    # make sure it uses the appropriate environment variables.
    utils.set_default_param_values_and_env_vars(params)

    self.params = params

    # Setup logging & log the version.
    global_utils.setup_logging(params.logging_verbosity)
    logging.info("Pytorch version: {}.".format(torch.__version__))
    logging.info("Hostname: {}.".format(socket.gethostname()))

    # print self.params parameters
    pp = pprint.PrettyPrinter(indent=2, compact=True)
    logging.info(pp.pformat(params.values()))

    self.train_dir = self.params.train_dir
    if self.train_dir is None:
      raise ValueError('Trained model directory not specified')
    self.logs_dir = "{}_logs".format(self.train_dir)
    self.num_gpus = self.params.num_gpus

    # create a message builder for logging
    self.message = global_utils.MessageBuilder()

    if self.params.cudnn_benchmark:
      cudnn.benchmark = True

    if self.params.num_gpus:
      self.batch_size = self.params.batch_size * self.num_gpus
    else:
      self.batch_size = self.params.batch_size

    if not self.params.data_pattern:
      raise IOError("'data_pattern' was not specified. "
        "Nothing to evaluate.")

    # load reader and model
    self.reader = readers_config[self.params.dataset](
      self.params, self.batch_size, self.num_gpus, is_training=False)
    self.model = model_config.get_model_config(
        self.params.model, self.params.dataset, self.params,
        self.reader.n_classes, is_training=False)
    # TODO: get the loss another way
    self.criterion = torch.nn.CrossEntropyLoss().cuda()

    self.model = torch.nn.DataParallel(self.model)
    self.model = self.model.cuda()

    self.add_noise = False
    eot = getattr(self.params, 'eot', False)
    if getattr(self.params, 'add_noise', False) and not eot:
      self.add_noise = True
      self.noise = utils.AddNoise(self.params)

    if self.params.eval_under_attack:
      if self.params.dump_files:
        self.dump = DumpFiles(params)
      if eot:
        attack_model = utils.EOTWrapper(
          self.model, self.reader.n_classes, self.params)
      else:
        attack_model = self.model

      attack_params = self.params.attack_params
      self.attack = utils.get_attack(
                      attack_model,
                      self.reader.n_classes,
                      self.params.attack_method,
                      attack_params)
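The `EOTWrapper` above is project code; its purpose, Expectation over Transformation, is to average the model's output over the injected randomness so attack gradients target the expected prediction rather than a single noisy sample. A minimal sketch of that idea assuming Gaussian noise (class name and defaults are illustrative):

import torch

class EOTAverage(torch.nn.Module):
    # average logits over several noisy copies; gradients flow through the averaged expectation
    def __init__(self, model, n_samples=10, sigma=0.25):
        super().__init__()
        self.model, self.n_samples, self.sigma = model, n_samples, sigma

    def forward(self, x):
        logits = 0
        for _ in range(self.n_samples):
            logits = logits + self.model(x + self.sigma * torch.randn_like(x))
        return logits / self.n_samples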
Example #9
  def __init__(self, params):
    """Creates a Trainer.
    """
    utils.set_default_param_values_and_env_vars(params)
    self.params = params

    # Setup logging & log the version.
    global_utils.setup_logging(params.logging_verbosity)
    logging.info("PyTorch version: {}.".format(torch.__version__))
    logging.info("Hostname: {}.".format(socket.gethostname()))

    # print self.params parameters
    pp = pprint.PrettyPrinter(indent=2, compact=True)
    logging.info(pp.pformat(params.values()))

    self.job_name = self.params.job_name  # "" for local training
    self.is_distributed = bool(self.job_name)
    self.task_index = self.params.task_index
    self.local_rank = self.params.local_rank
    self.start_new_model = self.params.start_new_model
    self.train_dir = self.params.train_dir
    self.num_gpus = self.params.num_gpus
    if self.num_gpus and not self.is_distributed:
      self.batch_size = self.params.batch_size * self.num_gpus
    else:
      self.batch_size = self.params.batch_size

    if self.is_distributed:
      self.num_nodes = len(params.worker_hosts.split(';'))
      self.world_size = self.num_nodes * self.num_gpus
      self.rank = self.task_index * self.num_gpus + self.local_rank
      dist.init_process_group(
        backend='nccl', init_method='env://',
        timeout=datetime.timedelta(seconds=30))
      if self.local_rank == 0:
        logging.info('world size={}'.format(self.world_size))
      logging.info('Distributed init done, local_rank={}, rank={}'.format(
        self.local_rank, self.rank))
      self.is_master = bool(self.rank == 0)
    else:
      self.is_master = True

    # create a message builder for logging
    self.message = global_utils.MessageBuilder()

    # load reader and model
    self.reader = readers_config[self.params.dataset](
      self.params, self.batch_size, self.num_gpus, is_training=True)
    self.model = model_config.get_model_config(
        self.params.model, self.params.dataset, self.params,
        self.reader.n_classes, is_training=True)
    if not params.job_name:
      self.model = torch.nn.DataParallel(self.model)
      self.model = self.model.cuda()
    else:
      torch.cuda.set_device(params.local_rank)
      self.model = self.model.cuda()
      i = params.local_rank
      self.model = DistributedDataParallel(
        self.model, device_ids=[i], output_device=i)
      logging.info('model defined with DistributedDataParallel')

    # if adversarial training, create the attack class
    if self.params.adversarial_training:
      attack_params = self.params.adversarial_training_params

      # from advertorch import attacks
      # self.adversaries = {}
      # self.adversaries["PGDLinf"] = attacks.LinfPGDAttack(
      #   self.model, eps=0.031, nb_iter=10, eps_iter=2*0.031/10,
      #   rand_init=True, clip_min=0.0, clip_max=1.0)
      #
      # self.adversaries["PGDL2"] = attacks.L2PGDAttack(
      #   self.model, eps=0.83, nb_iter=10, eps_iter=2*0.83/10,
      #   rand_init=True, clip_min=0.0, clip_max=1.0)

      self.attack = utils.get_attack(
                      self.model,
                      self.reader.n_classes,
                      self.params.adversarial_training_name,
                      attack_params)
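Example #9 only constructs the attack; the loop that consumes it lives elsewhere. For context, a minimal adversarial training step under the common convention that the attack is a callable returning perturbed inputs (that interface is an assumption, not the project's actual API):

import torch
import torch.nn.functional as F

def adversarial_training_step(model, attack, optimizer, images, labels):
    # craft adversarial examples against the current weights, then train on them
    model.eval()
    adv_images = attack(images, labels)  # hypothetical signature; project attacks may differ
    model.train()
    optimizer.zero_grad()
    loss = F.cross_entropy(model(adv_images), labels)
    loss.backward()
    optimizer.step()
    return loss.item()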