def build_lr(self, net, param_init_net, base_learning_rate,
             learning_rate_blob=None, policy="fixed", iter_val=0, **kwargs):
    """Add (or reuse) a learning-rate blob on `net` and return it with its
    iteration counter.

    Args:
        net: Caffe2 net that receives the LearningRate / Mul operators.
        param_init_net: net used to create the shared mutex iteration blob.
        base_learning_rate: positive LR magnitude; negated below for descent.
        learning_rate_blob: optional explicit blob name; a unique name is
            generated when None.
        policy: LR schedule policy passed to the LearningRate op
            (e.g. "fixed", "step").
        iter_val: initial value for the iteration counter.
        **kwargs: forwarded to the LearningRate operator (policy parameters).

    Returns:
        (lr, iteration): the (possibly multiplier-scaled) LR blob reference
        and the iteration blob.
    """
    if learning_rate_blob is None:
        learning_rate_blob = self.make_unique_blob_name('lr')

    iteration = utils.BuildUniqueMutexIter(
        param_init_net,
        net,
        iter_val=iter_val
    )

    if not net.BlobIsDefined(learning_rate_blob):
        # There is one interesting thing here: since we are minimizing, we are
        # doing "descent" so the learning rate is set to be negative.
        lr = net.LearningRate(
            [iteration],
            learning_rate_blob,
            base_lr=-base_learning_rate,
            policy=policy,
            **kwargs
        )
    else:
        # The blob already exists (e.g. shared across parameters): reuse it
        # instead of emitting a second LearningRate operator.
        lr = net.GetBlobRef(learning_rate_blob)

    if self._lr_multiplier is not None:
        # Global multiplier lives on CPU; copy it into the current device
        # context before the elementwise Mul.
        lr_multiplier = net.CopyFromCPUInput(
            self._lr_multiplier, self.make_unique_blob_name('lr_multiplier'))

        lr = net.Mul(
            [lr, lr_multiplier],
            self.make_unique_blob_name('scaled_lr'),
            broadcast=1,
        )

    if self._local_lr_multiplier is not None:
        current_scope = scope.CurrentDeviceScope()
        # Only copy from CPU when we are under a CUDA device scope and the
        # local multiplier was NOT produced on the GPU already.
        if (current_scope is not None
                and current_scope.device_type == caffe2_pb2.CUDA
                and not self._local_lr_multiplier_on_gpu):
            local_lr_multiplier = net.CopyFromCPUInput(
                self._local_lr_multiplier,
                self.make_unique_blob_name('local_lr_multiplier'))
        else:
            local_lr_multiplier = self._local_lr_multiplier

        lr = net.Mul(
            [lr, local_lr_multiplier],
            self.make_unique_blob_name('local_scaled_lr'),
            broadcast=1,
        )

    return lr, iteration
def testBuildUniqueMutexIter(self):
    """BuildUniqueMutexIter must tag every op it emits, on both the init
    net and the execution net, with the CPU device-type override."""
    init_net = core.Net("init_net")
    net = core.Net("net")
    utils.BuildUniqueMutexIter(init_net, net)
    # Same invariant on both nets, so check them in one pass.
    for checked_net in (init_net, net):
        for op in checked_net.Proto().op:
            self.assertEqual(
                op.device_option.extra_info[0],
                "device_type_override:cpu",
            )
def _run_on_loss(self, net, param_init_net, param, grad=None):
    """Append a log-barrier penalty on `param` to the net and return the
    blob holding the (scalar) penalty value.

    The penalty is discount * sum(log(clip(param, eps))), where the
    discount follows an LR-style schedule over the iteration counter.
    """
    iter_blob = utils.BuildUniqueMutexIter(param_init_net, net)

    # base_lr is negated because the overall objective is minimized.
    discount_blob = net.NextScopedBlob(param + "_log_barrier_discount")
    net.LearningRate(
        [iter_blob],
        [discount_blob],
        base_lr=-self.reg_lambda,
        policy=self.discount_policy,
        **self.discount_options
    )

    # TODO(xlwang): param might still be negative at the initialization time or
    # slightly negative due to the distributed training. Enforce it's non-negativity
    # for now (at least above machine epsilon)
    clipped_blob = net.NextScopedBlob(param + "_non_neg")
    net.Clip([param], [clipped_blob], min=self.kEpsilon)

    log_blob = net.NextScopedBlob(param + "_log")
    net.Log([clipped_blob], [log_blob])

    log_sum_blob = net.NextScopedBlob(param + "_log_sum")
    net.SumElements([log_blob], [log_sum_blob])

    barrier_blob = net.NextScopedBlob(param + "_log_barrier")
    net.Mul([log_sum_blob, discount_blob], [barrier_blob], broadcast=1)
    return barrier_blob
def _run(self, net, param_init_net, param_info):
    """Add YellowFin optimizer state and update ops for one parameter.

    Creates the optimizer's persistent state blobs on `param_init_net`
    (moment, curvature window, gradient averages, tuned lr/mu, scratch
    scalars), registers them as aux params, and appends a single YellowFin
    op to `net` that updates the parameter in place.

    Args:
        net: execution net receiving the YellowFin operator.
        param_init_net: init net receiving the state-blob ConstantFills.
        param_info: parameter descriptor providing `.blob` and `.grad`.
    """
    param = param_info.blob
    grad = param_info.grad

    # Fail fast, BEFORE any state blobs are added to param_init_net, so an
    # invalid configuration does not leave half-initialized state behind.
    assert self.alpha > 0
    assert not isinstance(grad, core.GradientSlice), \
        "YellowFin does not support sparse gradients"

    # Note: This is number of persistent scalars in YellowFin optimizer.
    # It should always be the number of scalars being used. The same
    # number should be used in class for the operation.
    SCALARS_MEMORY_SIZE = 5

    moment = param_init_net.ConstantFill(
        [param],
        param + "_moment",
        value=0.0
    )
    curv_win = param_init_net.ConstantFill(
        [],
        param + "_curv_win",
        shape=[self.curv_win_width],
        value=0.0
    )
    g_avg = param_init_net.ConstantFill(
        [param],
        param + "_g_avg",
        value=0.0
    )
    g2_avg = param_init_net.ConstantFill(
        [param],
        param + "_g2_avg",
        value=0.0
    )
    # lr_avg / mu_avg start at the configured initial learning rate and
    # momentum; YellowFin tunes them online.
    lr_avg = param_init_net.ConstantFill(
        [],
        param + "_lr_avg",
        shape=[1],
        value=self.alpha
    )
    mu_avg = param_init_net.ConstantFill(
        [],
        param + "_mu_avg",
        shape=[1],
        value=self.mu
    )
    scalars_memory = param_init_net.ConstantFill(
        [],
        param + "_scalars_memory",
        shape=[SCALARS_MEMORY_SIZE],
        value=0.0
    )

    iteration = utils.BuildUniqueMutexIter(
        param_init_net,
        net,
        iter_val=0
    )

    # Iteration counter is shared across params; the rest is per-param.
    self._aux_params.shared.append(iteration)
    for aux_blob in (moment, lr_avg, mu_avg, curv_win,
                     g_avg, g2_avg, scalars_memory):
        self._aux_params.local.append(aux_blob)

    # YellowFin reads and writes the same state blobs (in-place update).
    yf_in_out_args = [
        param,
        moment,
        lr_avg,
        mu_avg,
        curv_win,
        g_avg,
        g2_avg,
        scalars_memory,
    ]
    net.YellowFin(
        yf_in_out_args + [grad, iteration],
        yf_in_out_args,
        beta=self.beta,
        epsilon=self.epsilon,
        curv_win_width=self.curv_win_width,
        zero_debias=self.zero_debias
    )