def apply_optimizers(
    self,
    train_net,
    train_init_net,
    grad_map,
    blob_to_device=None,
):
    CPU = muji.OnCPU()
    # if given, blob_to_device is a map from blob to device_option
    blob_to_device = blob_to_device or {}
    for param, optimizer in viewitems(self.param_to_optim):
        assert optimizer is not None, \
            "default optimizer must have been set in add_layer"
        # note that not all params have gradients, so we pass None when the
        # gradient does not exist
        device = get_param_device(
            param,
            grad_map.get(str(param)),
            param_to_device=blob_to_device,
            default_device=CPU,
        )
        if device is not None:
            # extra info is not applicable for optimizers
            del device.extra_info[:]

        with core.DeviceScope(device):
            optimizer(
                train_net, train_init_net, param, grad_map.get(str(param)))
def apply_regularizers_after_optimizer(
    self,
    train_net,
    train_init_net,
    grad_map,
    blob_to_device=None,
):
    logger.info("apply regularizer after optimizer")
    CPU = muji.OnCPU()
    # if given, blob_to_device is a map from blob to device_option
    blob_to_device = blob_to_device or {}
    for param, regularizer in viewitems(self.param_to_reg):
        if regularizer is None:
            continue
        assert isinstance(regularizer, Regularizer)
        logger.info("add regularizer {0} for param {1} to optimizer".format(
            regularizer, param))
        device = get_param_device(
            param,
            grad_map.get(str(param)),
            param_to_device=blob_to_device,
            default_device=CPU,
        )
        with core.DeviceScope(device):
            regularizer(
                train_net,
                train_init_net,
                param,
                grad=grad_map.get(str(param)),
                by=RegularizationBy.AFTER_OPTIMIZER,
            )
def apply_regularizers_after_optimizer(
    self,
    train_net,
    train_init_net,
    grad_map,
    blob_to_device=None,
):
    CPU = muji.OnCPU()
    # if given, blob_to_device is a map from blob to device_option
    blob_to_device = blob_to_device or {}
    for param, regularizer in viewitems(self.param_to_reg):
        if regularizer is None or not regularizer.apply_after_optimizer:
            continue
        assert isinstance(regularizer, Regularizer)
        device = get_param_device(
            param,
            grad_map.get(str(param)),
            param_to_device=blob_to_device,
            default_device=CPU,
        )
        with core.DeviceScope(device):
            regularizer(
                train_net, train_init_net, param, grad_map.get(str(param)))
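# Running a regularizer *after* the optimizer step typically means a proximal
# (projection-style) update applied directly to the freshly updated parameter.
# Below is a minimal NumPy sketch of that idea, assuming an L1 proximal
# operator (soft-thresholding); the actual Regularizer subclasses emit Caffe2
# operators into train_net rather than calling NumPy.
import numpy as np

def l1_prox_after_optimizer(param, l1_strength, lr):
    # prox_{lr * l1}(w) = sign(w) * max(|w| - lr * l1_strength, 0)
    return np.sign(param) * np.maximum(np.abs(param) - lr * l1_strength, 0.0)

w = np.array([0.30, -0.02, 0.05, -0.40])
w = l1_prox_after_optimizer(w, l1_strength=1.0, lr=0.1)
print(w)  # -> [0.2, -0.0, 0.0, -0.3]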
def apply_optimizers(
    self,
    train_net,
    train_init_net,
    grad_map,
    blob_to_device=None,
):
    CPU = muji.OnCPU()
    # if given, blob_to_device is a map from blob to device_option
    blob_to_device = blob_to_device or {}
    for param, optimizer in viewitems(self.param_to_optim):
        assert optimizer is not None, \
            "default optimizer must have been set in add_layer"
        # note that not all params have gradients, so we pass None when the
        # gradient does not exist
        device = get_param_device(
            param,
            grad_map.get(str(param)),
            param_to_device=blob_to_device,
            default_device=CPU,
        )
        with core.DeviceScope(device):
            optimizer(
                train_net, train_init_net, param, grad_map.get(str(param)))
def apply_optimizers(
    self,
    train_net,
    train_init_net,
    grad_map,
    blob_to_device=None,
):
    CPU = core.DeviceOption(caffe2_pb2.CPU)
    # if given, blob_to_device is a map from blob to device_option
    blob_to_device = blob_to_device or {}
    for param, optimizer in self.param_to_optim.items():
        assert optimizer is not None, \
            "default optimizer must have been set in add_layer"
        # note that not all params have gradients, so we pass None when the
        # gradient does not exist
        device = get_param_device(
            param,
            grad_map.get(str(param)),
            param_to_device=blob_to_device,
            default_device=CPU,
        )
        with core.DeviceScope(device):
            optimizer(train_net, train_init_net, param, grad_map.get(str(param)))
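# The loops above rely on a simple contract: each value in param_to_optim is a
# callable taking (train_net, train_init_net, param, grad), and grad is None
# for parameters that have no gradient. A self-contained sketch of that
# dispatch with plain-Python stand-ins (the names sgd, fc_w, etc. are
# hypothetical and not part of the snippets above):
def sgd(train_net, train_init_net, param, grad):
    if grad is None:
        return  # e.g. a non-trainable blob: nothing to update
    train_net.append(("WeightedSum", [param, grad]))

param_to_optim = {"fc_w": sgd, "fc_b": sgd, "running_mean": sgd}
grad_map = {"fc_w": "fc_w_grad", "fc_b": "fc_b_grad"}  # no grad for running_mean

train_net, train_init_net = [], []
for param, optim in param_to_optim.items():
    assert optim is not None, "default optimizer must have been set in add_layer"
    optim(train_net, train_init_net, param, grad_map.get(param))

print(train_net)
# [('WeightedSum', ['fc_w', 'fc_w_grad']), ('WeightedSum', ['fc_b', 'fc_b_grad'])]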
def modify_net(self, net, init_net=None, grad_map=None, blob_to_device=None,
               modify_output_record=False):
    assert grad_map is not None

    CPU = core.DeviceOption(caffe2_pb2.CPU)

    final_param_map = {}
    if self.blobs_to_include is None:
        final_param_map = grad_map
    else:
        for blob in self.blobs_to_include:
            param = core.BlobReference(blob)
            if not net.BlobIsDefined(param):
                raise Exception(
                    'param {0} is not defined in net {1}'.format(
                        param, net.Name()))
            final_param_map[param] = grad_map[param]

    if self.blobs_to_exclude is not None:
        for blob in self.blobs_to_exclude:
            final_param_map.pop(blob, None)

    for param, grad in final_param_map.items():
        # currently sparse gradients won't be clipped;
        # further implementation is needed to enable it
        if isinstance(grad, core.GradientSlice):
            continue

        device = get_param_device(
            param,
            grad_map[str(param)],
            param_to_device=blob_to_device,
            default_device=CPU,
        )

        with core.DeviceScope(device):
            if self.grad_clip_method == self.BY_NORM:
                if self.clip_norm_type == self.L2_NORM:
                    p = 2
                elif self.clip_norm_type == self.L1_NORM:
                    p = 1

                grad_norm = net.LpNorm(
                    [grad],
                    net.NextScopedBlob(
                        prefix=str(grad) + '_l{}_norm'.format(p)),
                    p=p,
                )

                if p == 2:
                    grad_norm = net.Pow([grad_norm], exponent=0.5)

                op_inputs = [grad, grad_norm]

                if self.use_parameter_norm:
                    param_norm = net.LpNorm(
                        [param],
                        net.NextScopedBlob(
                            prefix=str(param) + '_l{}_norm'.format(p)),
                        p=p,
                    )

                    if p == 2:
                        param_norm = net.Pow([param_norm], exponent=0.5)

                    op_inputs.append(param_norm)

                    if self.compute_norm_ratio:
                        net.Div(
                            [grad_norm, param_norm],
                            [net.NextScopedBlob(
                                prefix=str(param) + '_norm_ratio')],
                        )

                net.ClipTensorByScaling(
                    op_inputs,
                    [grad],
                    threshold=self.clip_threshold,
                )
            elif self.grad_clip_method == self.BY_VALUE:
                net.Clip(
                    [grad],
                    [grad],
                    max=self.clip_max,
                    min=self.clip_min,
                )
def modify_net(self, net, init_net=None, grad_map=None, blob_to_device=None,
               modify_output_record=False):
    assert grad_map is not None

    CPU = core.DeviceOption(caffe2_pb2.CPU)

    for param, grad in grad_map.items():
        # currently sparse gradients won't be clipped;
        # further implementation is needed to enable it
        if isinstance(grad, core.GradientSlice):
            continue

        device = get_param_device(
            param,
            grad_map[str(param)],
            param_to_device=blob_to_device,
            default_device=CPU,
        )

        with core.DeviceScope(device):
            if self.grad_clip_method == self.BY_NORM:
                if self.clip_norm_type == self.L2_NORM:
                    p = 2
                elif self.clip_norm_type == self.L1_NORM:
                    p = 1

                grad_norm = net.LpNorm(
                    [grad],
                    net.NextScopedBlob(
                        prefix=str(grad) + '_l{}_norm'.format(p)),
                    p=p,
                )

                if p == 2:
                    grad_norm = net.Pow([grad_norm], exponent=0.5)

                op_inputs = [grad, grad_norm]

                if self.use_parameter_norm:
                    param_norm = net.LpNorm(
                        [param],
                        net.NextScopedBlob(
                            prefix=str(param) + '_l{}_norm'.format(p)),
                        p=p,
                    )

                    if p == 2:
                        param_norm = net.Pow([param_norm], exponent=0.5)

                    op_inputs.append(param_norm)

                    if self.compute_norm_ratio:
                        net.Div(
                            [grad_norm, param_norm],
                            [net.NextScopedBlob(
                                prefix=str(param) + '_norm_ratio')],
                        )

                net.ClipTensorByScaling(
                    op_inputs,
                    [grad],
                    threshold=self.clip_threshold,
                )
def infer_blob_device(blob_name):
    return optimizer.get_param_device(
        blob_name, "{}_grad".format(blob_name), param_to_device
    )
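# The wrapper above defers to get_param_device. A hedged sketch of the
# fallback order it implements: an explicit entry in the device map wins,
# otherwise the device recorded for the gradient blob is used, otherwise the
# caller-supplied default applies. A plain dict is a stand-in here; the real
# helper inspects DeviceOption protos attached to net operators.
def get_param_device_sketch(param, grad, param_to_device=None,
                            default_device="CPU"):
    param_to_device = param_to_device or {}
    if param in param_to_device:              # 1. explicit mapping wins
        return param_to_device[param]
    if grad is not None and grad in param_to_device:
        return param_to_device[grad]          # 2. fall back to the grad blob
    return default_device                     # 3. default (CPU above)

print(get_param_device_sketch("fc_w", "fc_w_grad", {"fc_w_grad": "GPU:0"}))  # GPU:0
print(get_param_device_sketch("fc_b", "fc_b_grad", {}))                      # CPU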
def modify_net(self, net, init_net=None, grad_map=None, blob_to_device=None):
    assert grad_map is not None

    CPU = core.DeviceOption(caffe2_pb2.CPU)

    for param, grad in grad_map.items():
        # currently sparse gradients won't be clipped;
        # further implementation is needed to enable it
        if isinstance(grad, core.GradientSlice):
            continue

        device = get_param_device(
            param,
            grad_map[str(param)],
            param_to_device=blob_to_device,
            default_device=CPU,
        )

        with core.DeviceScope(device):
            if self.grad_clip_method == self.BY_NORM:
                if self.clip_norm_type == self.L2_NORM:
                    p = 2
                elif self.clip_norm_type == self.L1_NORM:
                    p = 1

                grad_norm = net.LpNorm(
                    [grad],
                    net.NextScopedBlob(
                        prefix=str(grad) + '_l{}_norm'.format(p)),
                    p=p,
                )

                if p == 2:
                    grad_norm = net.Pow([grad_norm], exponent=0.5)

                op_inputs = [grad, grad_norm]

                if self.use_parameter_norm:
                    param_norm = net.LpNorm(
                        [param],
                        net.NextScopedBlob(
                            prefix=str(param) + '_l{}_norm'.format(p)),
                        p=p,
                    )

                    if p == 2:
                        param_norm = net.Pow([param_norm], exponent=0.5)

                    op_inputs.append(param_norm)

                    if self.compute_norm_ratio:
                        net.Div(
                            [grad_norm, param_norm],
                            [net.NextScopedBlob(
                                prefix=str(param) + '_norm_ratio')],
                        )

                net.ClipTensorByScaling(
                    op_inputs,
                    [grad],
                    threshold=self.clip_threshold,
                )
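# Numerically, the BY_NORM branch rescales a gradient whose Lp norm exceeds
# the threshold by threshold / norm; with use_parameter_norm the threshold is
# effectively scaled by the parameter norm (a LARS-like ratio). A hedged NumPy
# sketch of the L2 case -- an illustration of the intent, not the exact
# ClipTensorByScaling operator semantics.
import numpy as np

def clip_by_norm(grad, threshold, param_norm=None):
    norm = np.sqrt(np.sum(grad ** 2))  # LpNorm(p=2) followed by Pow(0.5)
    limit = threshold * param_norm if param_norm is not None else threshold
    if norm > limit:
        grad = grad * (limit / norm)
    return grad

g = np.array([3.0, 4.0])               # ||g||_2 = 5
print(clip_by_norm(g, threshold=1.0))  # [0.6 0.8] -> norm clipped to 1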
def _build_l1_bn(
    model,
    optimizer,
    weights_only=False,
    use_param_info_optim=True,
    max_gradient_norm=None,
    allow_lr_injection=False,
):
    param_to_device = _get_param_to_device(model)

    # Validate there are no duplicate params
    model.Validate()

    params = []
    for param_info in model.GetOptimizationParamInfo():
        if weights_only and param_info.blob not in model.weights:
            continue
        # add L1 norm for spatial bn
        if param_info.name.endswith('bn_s'):
            params.append(param_info)

    lr_multiplier = None
    if max_gradient_norm is not None:
        lr_multiplier = _calc_norm_ratio(
            model,
            params,
            'norm_clipped_grad_update',
            param_to_device,
            max_gradient_norm,
        )

    if allow_lr_injection:
        if not model.net.BlobIsDefined(_LEARNING_RATE_INJECTION):
            lr_injection = model.param_init_net.ConstantFill(
                [],
                _LEARNING_RATE_INJECTION,
                shape=[1],
                value=1.0,
            )
        else:
            lr_injection = _LEARNING_RATE_INJECTION

        if lr_multiplier is None:
            lr_multiplier = lr_injection
        else:
            lr_multiplier = model.net.Mul(
                [lr_multiplier, lr_injection],
                'lr_multiplier',
                broadcast=1,
            )
    optimizer.add_lr_multiplier(lr_multiplier)

    for param_info in params:
        param_name = str(param_info.blob)
        device = get_param_device(param_name, param_info.grad, param_to_device)

        with core.DeviceScope(device):
            if param_info.optimizer and use_param_info_optim:
                param_info.optimizer(model.net, model.param_init_net, param_info)
            else:
                optimizer(model.net, model.param_init_net, param_info)

    return optimizer
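# _calc_norm_ratio (not shown) turns max_gradient_norm into a learning-rate
# multiplier: conceptually, when the global gradient norm exceeds the cap,
# every update is scaled down proportionally. A hedged NumPy sketch of that
# idea with hypothetical names; the real helper emits Caffe2 ops and is
# composed with the injected LR blob via net.Mul above.
import numpy as np

def norm_clip_lr_multiplier(grads, max_gradient_norm):
    global_norm = np.sqrt(sum(np.sum(g ** 2) for g in grads))
    return min(1.0, max_gradient_norm / global_norm)

grads = [np.array([3.0, 4.0]), np.array([0.0, 12.0])]         # global norm = 13
print(norm_clip_lr_multiplier(grads, max_gradient_norm=6.5))  # 0.5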