from functools import reduce  # needed on Python 3 for the param_shape computation

import paddle.fluid as fluid
from paddle.fluid import layers
from paddle.fluid.layer_helper import LayerHelper


def layer_norm(x,
               begin_norm_axis=1,
               epsilon=1e-12,
               param_attr=None,
               bias_attr=None):
    """ Replace the built-in layer_norm op with this function """
    helper = LayerHelper('layer_norm', **locals())

    # Compute mean and variance over the normalized axes, then standardize x.
    mean = layers.reduce_mean(x, dim=begin_norm_axis, keep_dim=True)
    shift_x = layers.elementwise_sub(x=x, y=mean, axis=0)
    variance = layers.reduce_mean(
        layers.square(shift_x), dim=begin_norm_axis, keep_dim=True)
    r_stdev = layers.rsqrt(variance + epsilon)
    norm_x = layers.elementwise_mul(x=shift_x, y=r_stdev, axis=0)

    # Learnable scale and bias cover all dimensions from begin_norm_axis onward.
    param_shape = [reduce(lambda x, y: x * y, norm_x.shape[begin_norm_axis:])]
    param_dtype = norm_x.dtype
    scale = helper.create_parameter(
        attr=param_attr,
        shape=param_shape,
        dtype=param_dtype,
        default_initializer=fluid.initializer.Constant(1.))
    bias = helper.create_parameter(
        attr=bias_attr,
        shape=param_shape,
        dtype=param_dtype,
        is_bias=True,
        default_initializer=fluid.initializer.Constant(0.))

    out = layers.elementwise_mul(x=norm_x, y=scale, axis=-1)
    out = layers.elementwise_add(x=out, y=bias, axis=-1)
    return out
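# A minimal usage sketch for the function above, assuming the paddle.fluid 1.x
# static-graph API; the variable name 'src' and the hidden size 768 are
# illustrative assumptions, not taken from the original code.
src = fluid.layers.data(name='src', shape=[768], dtype='float32')  # shape [-1, 768]
normed = layer_norm(src, begin_norm_axis=1, epsilon=1e-12)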
def forward(self, x):
    """ Forward process of LayerNorm. """
    # Normalize over every dimension from _begin_norm_axis to the last one.
    mean = layers.reduce_mean(
        x,
        dim=list(range(self._begin_norm_axis, len(x.shape))),
        keep_dim=True)
    shift_x = layers.elementwise_sub(x=x, y=mean, axis=0)
    variance = layers.reduce_mean(
        layers.square(shift_x),
        dim=list(range(self._begin_norm_axis, len(x.shape))),
        keep_dim=True)
    r_stdev = layers.rsqrt(variance + self._epsilon)
    norm_x = layers.elementwise_mul(x=shift_x, y=r_stdev, axis=0)

    # Apply the learnable scale and bias along the trailing (normalized) axes.
    out = layers.elementwise_mul(x=norm_x, y=self._scale_w, axis=-1)
    out = layers.elementwise_add(x=out, y=self._bias_w, axis=-1)
    return out
def func(self, place):
    shape = [2, 3, 7, 9]
    eps = 0.0001
    dtype = np.float64

    x = layers.data('x', shape, False, dtype)
    x.persistable = True
    y = layers.rsqrt(x)
    # Sample inputs away from zero, where rsqrt and its gradients are well defined.
    x_arr = np.random.uniform(0.1, 1, shape).astype(dtype)

    gradient_checker.double_grad_check(
        [x], y, x_init=x_arr, place=place, eps=eps)
    gradient_checker.double_grad_check_for_dygraph(
        self.rsqrt_wrapper, [x], y, x_init=x_arr, place=place)
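# The test above references self.rsqrt_wrapper, whose definition is not shown in
# this snippet. Below is a minimal sketch of such a method, assuming it simply
# applies paddle.rsqrt to the single input variable handed over by the dygraph
# double-grad checker; this is an illustrative assumption, not the original code.
import paddle

def rsqrt_wrapper(self, x):
    # `x` is the list of input variables passed by double_grad_check_for_dygraph.
    return paddle.rsqrt(x[0])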