def create_parameter(layers, shape, dtype):
    """Create the scale, bias, mean and variance parameters of a batch norm.

    All four parameters are created through a ``LayerHelper("batch_norm")``,
    have shape ``[shape]`` and get ``stop_gradient = True`` right after
    creation.

    Args:
        layers: Value that is stringified into the parameter-name prefix
            ``"batch_norm_<layers>"``.
        shape: Length of each one-dimensional parameter.
        dtype: Data type forwarded to every ``create_parameter`` call.

    Returns:
        tuple: ``(scale, bias, mean, variance)``.
    """
    helper = LayerHelper("batch_norm", **locals())
    prefix = "batch_norm_" + str(layers)

    def _frozen_param(**kwargs):
        # Build one [shape]-sized parameter and flag it with stop_gradient.
        param = helper.create_parameter(shape=[shape], dtype=dtype, **kwargs)
        param.stop_gradient = True
        return param

    # Creation order (scale, bias, mean, variance) matches the return order.
    scale = _frozen_param(
        attr=fluid.ParamAttr(name=prefix + '.w' + '_0'),
        default_initializer=Constant(1.0))
    bias = _frozen_param(
        attr=fluid.ParamAttr(name=prefix + '.b' + '_0'),
        is_bias=True)
    mean = _frozen_param(
        attr=ParamAttr(
            name=prefix + '.w' + '_1',
            initializer=Constant(0.0),
            trainable=False))
    variance = _frozen_param(
        attr=ParamAttr(
            name=prefix + '.w' + '_2',
            initializer=Constant(1.0),
            trainable=False))
    return scale, bias, mean, variance
def layer_norm(x, begin_norm_axis=1, epsilon=1e-12, param_attr=None,
               bias_attr=None):
    """Layer normalization assembled from elementary ops.

    Stands in for the built-in ``layer_norm`` op. The mean and the variance
    are each taken with ``reduce_mean`` over ``dim=begin_norm_axis``
    (``keep_dim=True``); the input is centred, multiplied by
    ``rsqrt(variance + epsilon)`` and finally passed through a learnable
    scale and bias.

    Args:
        x: Input variable.
        begin_norm_axis: Axis handed to ``reduce_mean``; also the first axis
            whose sizes are multiplied together to get the parameter length.
        epsilon: Constant added to the variance before ``rsqrt``.
        param_attr: Attribute for the scale parameter (default initializer
            ``Constant(1.)``).
        bias_attr: Attribute for the bias parameter (default initializer
            ``Constant(0.)``).

    Returns:
        The normalized, scaled and shifted variable.
    """
    helper = LayerHelper('layer_norm', **locals())

    # Centre the input around its mean.
    mu = layers.reduce_mean(x, dim=begin_norm_axis, keep_dim=True)
    centered = layers.elementwise_sub(x=x, y=mu, axis=0)

    # Multiply by the reciprocal standard deviation.
    squared = layers.square(centered)
    var = layers.reduce_mean(squared, dim=begin_norm_axis, keep_dim=True)
    inv_std = layers.rsqrt(var + epsilon)
    normalized = layers.elementwise_mul(x=centered, y=inv_std, axis=0)

    # Scale and bias are flat vectors covering all trailing dimensions.
    trailing_dims = normalized.shape[begin_norm_axis:]
    affine_shape = [reduce(lambda acc, dim: acc * dim, trailing_dims)]
    affine_dtype = normalized.dtype

    gamma = helper.create_parameter(
        attr=param_attr,
        shape=affine_shape,
        dtype=affine_dtype,
        default_initializer=fluid.initializer.Constant(1.))
    beta = helper.create_parameter(
        attr=bias_attr,
        shape=affine_shape,
        dtype=affine_dtype,
        is_bias=True,
        default_initializer=fluid.initializer.Constant(0.))

    scaled = layers.elementwise_mul(x=normalized, y=gamma, axis=-1)
    return layers.elementwise_add(x=scaled, y=beta, axis=-1)
def rank_attention(input, rank_offset, rank_param_shape, rank_param_attr,
                   max_rank=3):
    """
    **Rank Attention layer**

    Computes rank attention between ``input`` and a learnable ``rank_param``;
    ``rank_param`` gives the organization of the data.

    Notice: it currently supports GPU device. The op lives in contrib, which
    means that it is not shown to the public.

    Args:
        input: Tensor with data type float32, float64.
        rank_offset: Tensor with data type int32.
        rank_param_shape: The shape of rank_param. Its first entry must equal
            ``input.shape[1] * max_rank * max_rank`` (checked by an assert).
        rank_param_attr: Attribute initializer of rank_param.
        max_rank: The max rank of input's ranks.

    Returns:
        Variable: A Tensor with the same data type as input's.

    Examples:
        .. code-block:: python

           import paddle.fluid as fluid
           import numpy as np

           input = fluid.data(name="input", shape=[None, 2], dtype="float32")
           rank_offset = fluid.data(name="rank_offset", shape=[None, 7], dtype="int32")
           out = fluid.contrib.layers.rank_attention(input=input,
                                                     rank_offset=rank_offset,
                                                     rank_param_shape=[18,3],
                                                     rank_param_attr=
                                                       fluid.ParamAttr(learning_rate=1.0,
                                                                     name="ubm_rank_param.w_0",
                                                                     initializer=
                                                                     fluid.initializer.Xavier(uniform=False)),
                                                      max_rank=3)
    """
    helper = LayerHelper('rank_attention', **locals())
    dtype = helper.input_dtype(input_param_name='input')

    expected_rows = input.shape[1] * max_rank * max_rank
    assert expected_rows == rank_param_shape[0]

    rank_param = helper.create_parameter(
        attr=rank_param_attr, shape=rank_param_shape, dtype=dtype)
    rank_param.stop_gradient = False

    output = helper.create_variable_for_type_inference(dtype)
    # Not wired into the op below; it is still created so the set of
    # variables registered through the helper stays the same.
    ins_rank = helper.create_variable_for_type_inference(
        dtype=dtype, stop_gradient=True)

    op_inputs = {
        "X": input,
        "RankOffset": rank_offset,
        "RankParam": rank_param,
    }
    helper.append_op(
        type="rank_attention",
        inputs=op_inputs,
        outputs={"Out": output},
        attrs={"MaxRank": max_rank})
    return output
class L1(fluid.imperative.Layer):
    """Imperative layer that owns two 2x2 float32 weights and sums them."""

    def __init__(self, prefix):
        """Create ``w1`` and ``w2`` with a ``Constant(0.1)`` initializer.

        Args:
            prefix: Name scope forwarded to the base ``Layer`` constructor.
        """
        super(L1, self).__init__(prefix)
        layer_name = self.full_name()
        const_attr = fluid.ParamAttr(
            initializer=fluid.initializer.Constant(value=0.1))
        self._helper = LayerHelper(layer_name, param_attr=const_attr)

        # Both weights share shape and dtype; ``param_attr`` is read from the
        # helper once per weight.
        weight_spec = dict(shape=[2, 2], dtype='float32', is_bias=False)
        self.w1 = self._helper.create_parameter(
            attr=self._helper.param_attr, **weight_spec)
        self.w2 = self._helper.create_parameter(
            attr=self._helper.param_attr, **weight_spec)

    def forward(self):
        """Return ``w1 + w2``."""
        total = self.w1 + self.w2
        return total
def _l2_norm_scale(self, input, init_scale=1.0, channel_shared=False):
    """L2-normalize ``input`` along axis 1 and multiply by a learned scale.

    Args:
        input: Variable to normalize; ``input.shape[1]`` and ``input.dtype``
            are read.
        init_scale: Value for the scale parameter's default ``Constant``
            initializer.
        channel_shared: When true, a single scale of shape ``[1]`` is
            multiplied with ``axis=-1``; otherwise one scale per entry of
            axis 1 is multiplied with ``axis=1``.

    Returns:
        The normalized and scaled variable.
    """
    from paddle.fluid.layer_helper import LayerHelper

    helper = LayerHelper("Scale")
    # l2 norm along channel
    normalized = fluid.layers.l2_normalize(input, axis=1)

    if channel_shared:
        scale_shape, mul_axis = [1], -1
    else:
        scale_shape, mul_axis = [input.shape[1]], 1

    scale = helper.create_parameter(
        attr=helper.param_attr,
        shape=scale_shape,
        dtype=input.dtype,
        default_initializer=Constant(init_scale))
    return fluid.layers.elementwise_mul(x=normalized, y=scale, axis=mul_axis)
def pact(x):
    """Apply PACT-style clipping with a learnable threshold ``u``.

    Computes ``x - relu(x - u) + relu(-u - x)``, which equals clamping ``x``
    to ``[-u, u]`` whenever ``u`` is non-negative. ``u`` is a float32
    parameter of shape ``[1]`` named ``x.name + '_pact'``, initialized to 20,
    with ``L2Decay(0.0001)`` regularization and learning rate 1.

    Args:
        x: Tensor to process; its ``name`` is used to name the threshold.

    Returns:
        The processed tensor.
    """
    helper = LayerHelper("pact", **locals())
    threshold_dtype = 'float32'
    initial_threshold = 20

    threshold_attr = paddle.ParamAttr(
        name=x.name + '_pact',
        initializer=paddle.nn.initializer.Constant(value=initial_threshold),
        regularizer=paddle.regularizer.L2Decay(0.0001),
        learning_rate=1)
    threshold = helper.create_parameter(
        attr=threshold_attr, shape=[1], dtype=threshold_dtype)

    relu = paddle.nn.functional.relu
    above = relu(x - threshold)   # amount by which x exceeds +u
    below = relu(-threshold - x)  # amount by which x falls short of -u
    return x - above + below
def pact(x, name=None):
    """Apply PACT-style clipping with a learnable threshold ``u`` (fluid API).

    Computes ``x - relu(x - u) + relu(-u - x)``, which equals clamping ``x``
    to ``[-u, u]`` whenever ``u`` is non-negative. ``u`` is a float32
    parameter of shape ``[1]`` named ``x.name + '_pact'``, initialized to 20,
    with ``L2Decay(0.0001)`` regularization and learning rate 1.

    Args:
        x: Variable to process; its ``name`` is used to name the threshold.
        name: Not read by the body itself; it is forwarded to ``LayerHelper``
            through ``**locals()``.

    Returns:
        The processed variable.
    """
    helper = LayerHelper("pact", **locals())
    threshold_dtype = 'float32'
    initial_threshold = 20

    threshold_attr = fluid.ParamAttr(
        name=x.name + '_pact',
        initializer=fluid.initializer.ConstantInitializer(
            value=initial_threshold),
        regularizer=fluid.regularizer.L2Decay(0.0001),
        learning_rate=1)
    threshold = helper.create_parameter(
        attr=threshold_attr, shape=[1], dtype=threshold_dtype)

    # Remove whatever exceeds +u ...
    overshoot = fluid.layers.relu(fluid.layers.elementwise_sub(x, threshold))
    x = fluid.layers.elementwise_sub(x, overshoot)
    # ... then add back whatever falls below -u.
    undershoot = fluid.layers.relu(
        fluid.layers.elementwise_sub(-threshold, x))
    x = fluid.layers.elementwise_add(x, undershoot)
    return x
def pact(x):
    """Apply PACT-style clipping with a per-variable initial threshold.

    Computes ``x - relu(x - u) + relu(-u - x)``. ``u`` is a float32 parameter
    of shape ``[1]`` named ``x.name + '_pact'`` with ``L2Decay(0.0001)``
    regularization and learning rate 1. Its initial value is looked up in
    ``values`` (defined outside this function) under the part of ``x.name``
    that precedes ``'_tmp_input'``.

    Args:
        x: Variable to process.

    Returns:
        The processed variable.

    Raises:
        KeyError: If ``values`` has no entry for the derived key (assuming
            ``values`` is a plain mapping).
    """
    helper = LayerHelper("pact", **locals())
    threshold_dtype = 'float32'

    lookup_key = x.name.split('_tmp_input')[0]
    initial_threshold = values[lookup_key]

    threshold_attr = fluid.ParamAttr(
        name=x.name + '_pact',
        initializer=fluid.initializer.ConstantInitializer(
            value=initial_threshold),
        regularizer=fluid.regularizer.L2Decay(0.0001),
        learning_rate=1)
    threshold = helper.create_parameter(
        attr=threshold_attr, shape=[1], dtype=threshold_dtype)

    overshoot = fluid.layers.relu(fluid.layers.elementwise_sub(x, threshold))
    undershoot = fluid.layers.relu(
        fluid.layers.elementwise_sub(-threshold, x))
    return x - overshoot + undershoot
def pact(x):
    """
    Process a variable using the user-defined PACT method.

    Computes ``x - relu(x - u) + relu(-u - x)`` where ``u`` is a float32
    parameter of shape ``[1]`` named ``x.name + '_pact'``, initialized to 20,
    with ``L2Decay(0.0001)`` regularization and learning rate 1.

    Args:
        x(Tensor): Paddle Tensor that needs preprocessing before quantization.

    Returns:
        The processed Tensor x.
    """
    helper = LayerHelper("pact", **locals())
    threshold_dtype = 'float32'
    initial_threshold = 20

    threshold_attr = fluid.ParamAttr(
        name=x.name + '_pact',
        initializer=fluid.initializer.ConstantInitializer(
            value=initial_threshold),
        regularizer=fluid.regularizer.L2Decay(0.0001),
        learning_rate=1)
    threshold = helper.create_parameter(
        attr=threshold_attr, shape=[1], dtype=threshold_dtype)

    # Subtract the part above +u first, then add the part below -u.
    overshoot = fluid.layers.relu(fluid.layers.elementwise_sub(x, threshold))
    x = fluid.layers.elementwise_sub(x, overshoot)
    undershoot = fluid.layers.relu(
        fluid.layers.elementwise_sub(-threshold, x))
    x = fluid.layers.elementwise_add(x, undershoot)
    return x
def tdm_child(x, node_nums, child_nums, param_attr=None, dtype='int32'):
    """
    **Tdm Child**

    Given node ids on a tree, return each node's child node ids together
    with a leaf mask telling whether each child is a leaf node.

    .. code-block:: text

        Given:
            tree[[0], [1, 2], [3, 4], [5, 6]] # A binary tree with seven nodes
            x = [[2], [3]]
            node_nums = 7
            child_nums = 2

        we get:
            child = [[5, 6],
                     [0, 0]]
            leaf_mask = [[1, 1],
                         [0, 0]]

    Args:
        x(Variable): Variable containing the node_id information, dtype
            support int32/int64.
        node_nums(int): Number of total nodes.
        child_nums(int): Maximum number of child nodes per node.
        param_attr(ParamAttr): To specify the tdm-tree-info parameter
            property. Default: None, which means the default weight parameter
            property is used. See usage for details in:
            ref: `api_fluid_ParamAttr`. The parameter should have shape
            (node_nums, 3 + child_nums), dtype support int32/int64.
            Dimension[1] of tdm-tree-info contains the following:

            1. Item_id(int, shape(1)): if the node is a leaf node, its
               item_id corresponding to node_id, else 0.
            2. Layer_id(int, shape(1)): which layer the node is on.
            3. Parent_id(int, shape(1)): the node's parent node.
            4. Child_id(int, shape(child_nums)): the node_id of every child
               of this node. If there are fewer children than child_nums,
               pad with 0 up to child_nums.
        dtype(str): The data type of output child and leaf_mask, support
            int32/int64.

    Returns:
        tuple: A tuple including the input node's child(Variable) and
        leaf_mask(Variable). If child is a leaf node, leaf_mask equals 1,
        otherwise 0.

    Examples:
        .. code-block:: python

            import paddle.fluid as fluid
            import numpy as np

            x = fluid.data(name="x", shape=[None, 1], dtype="int32", lod_level=1)
            tree_info = [[0,0,0,1,2],
                         [0,1,0,3,4],[0,1,0,5,6],
                         [0,2,1,0,0],[1,2,1,0,0],[2,2,2,0,0],[3,2,2,0,0]]
            tree_info_np = np.array(tree_info)
            tree_info_np = np.reshape(tree_info_np, (7,5))
            node_nums = 7
            child_nums = 2
            child, leaf_mask = fluid.contrib.layers.tdm_child(x, node_nums, child_nums,
                                    param_attr=fluid.ParamAttr(
                                        initializer=fluid.initializer.NumpyArrayInitializer(
                                                                                tree_info_np)))
            place = fluid.CPUPlace()
            exe = fluid.Executor(place)
            exe.run(fluid.default_startup_program())
            xx = np.array([[2],[3]]).reshape((2,1)).astype("int32")
            child_res, leaf_mask_res = exe.run(feed={"x":xx}, fetch_list=[child, leaf_mask])
    """
    helper = LayerHelper("tdm_child", **locals())
    check_dtype(dtype, 'dtype', ['int32', 'int64'],
                'fluid.contrib.layers.tdm_child')
    op_dtype = convert_np_dtype_to_dtype_(dtype)

    # One row per node: item id, layer id, parent id, then the child ids.
    tree_info_shape = [node_nums, 3 + child_nums]
    tree_info = helper.create_parameter(
        attr=helper.param_attr,
        shape=tree_info_shape,
        dtype=dtype,
        default_initializer=Constant(0))
    tree_info.stop_gradient = True

    child = helper.create_variable_for_type_inference(dtype=dtype)
    leaf_mask = helper.create_variable_for_type_inference(dtype=dtype)

    op_inputs = {'X': x, 'TreeInfo': tree_info}
    op_outputs = {'Child': child, 'LeafMask': leaf_mask}
    op_attrs = {'child_nums': child_nums, 'dtype': op_dtype}
    helper.append_op(
        type='tdm_child',
        inputs=op_inputs,
        outputs=op_outputs,
        attrs=op_attrs,
        stop_gradient=True)
    return (child, leaf_mask)
def search_pyramid_hash(input, num_emb, space_len, pyramid_layer, rand_len,
                        drop_out_percent, is_training, use_filter,
                        white_list_len, black_list_len, seed, lr,
                        param_attr=None, param_attr_wl=None,
                        param_attr_bl=None, name=None,
                        distribute_update_vars=None, dtype='float32'):
    """
    **Pyramid hash embedding**

    Args:
        input (Variable): LoDTensor<int32> Variable containing the IDs'
            information.
        num_emb (int): The embedding size of output.
        space_len (int): The length of pyramid hash embedding space.
        pyramid_layer (int): The number of pyramid layers. It should be
            greater than 2.
        rand_len (int): The minimum length of pyramid hash cell.
        drop_out_percent (float): The probability of dropping out the input
            token randomly. It should satisfy: [0., 1.]
        is_training (bool): Whether in training or testing phase.
        use_filter(bool): If set True, the white filter and black filter
            should be given by :attr:`param_attr_wl` and
            :attr:`param_attr_bl` .
        white_list_len(int): If set :math:`white_list_len>0` , white filter
            with shape [white_list_len, 1] should be provided by
            param_attr_wl.
        black_list_len(int): If set :math:`black_list_len>0` , black filter
            with shape [black_list_len, 1] should be provided by
            param_attr_bl.
        seed(int): The number of random seed.
        lr(float): The learning rate of weight created by :attr:`param_attr`
            with shape [space_len+rand_len, 1] in this layer.
        param_attr(ParamAttr): To specify the weight parameter property.
            Default: None, which means the default weight parameter property
            is used. See usage for details in :ref:`api_fluid_ParamAttr` .
        param_attr_wl(ParamAttr): Specified parameters of white filter.
        param_attr_bl(ParamAttr): Specified parameters of black filter.
        distribute_update_vars(list[ParamAttr.name]): Decides which params
            should be updated in distribute training. Used in Distribute
            Transpiler to create a trainer/server program.
        name(str, optional): The default value is None. Normally there is no
            need for user to set this property. For more information, please
            refer to :ref:`api_guide_Name` .
        dtype(str): The data type of output variable, float32.

    Returns:
        Variable: LoDTensor of pyramid hash embedding.

    Raises:
        ValueError: If ``distribute_update_vars`` names a parameter that is
            not the name of ``param_attr``, ``param_attr_wl`` or
            ``param_attr_bl``.
    """
    helper = LayerHelper('search_pyramid_hash', **locals())

    weight = helper.create_parameter(
        attr=param_attr,
        shape=[space_len + rand_len, 1],
        dtype=dtype,
        is_bias=False)
    weight.stop_gradient = True
    op_inputs = {'X': input, 'W': weight}

    if white_list_len > 0:
        white_filter = helper.create_parameter(
            attr=param_attr_wl,
            shape=[white_list_len, 1],
            dtype=dtype,
            is_bias=False)
        white_filter.stop_gradient = True
        op_inputs['WhiteList'] = white_filter

    # NOTE(review): this test is ``>= 0`` while the white-list test above is
    # ``> 0`` and the docstring says ``black_list_len>0``; kept as-is, confirm
    # whether a [0, 1] black list is intended.
    if black_list_len >= 0:
        black_filter = helper.create_parameter(
            attr=param_attr_bl,
            shape=[black_list_len, 1],
            dtype=dtype,
            is_bias=False)
        black_filter.stop_gradient = True
        op_inputs['BlackList'] = black_filter

    update_vars_attr = ""
    if distribute_update_vars:
        assert isinstance(distribute_update_vars, list)
        # Only the names of attributes actually supplied are acceptable.
        known_names = [
            attr.name
            for attr in (param_attr, param_attr_wl, param_attr_bl)
            if attr
        ]
        for var_name in distribute_update_vars:
            if var_name not in known_names:
                raise ValueError(
                    "Pyramid Hash layer didn't have parameter {}".format(
                        var_name))
        update_vars_attr = ",".join(distribute_update_vars)

    res = helper.create_variable_for_type_inference(dtype)
    drop_pos = helper.create_variable_for_type_inference(dtype)
    x_temp_out = helper.create_variable_for_type_inference(dtype)

    op_outputs = {"Out": res, "X_Temp_Out": x_temp_out, 'DropPos': drop_pos}
    op_attrs = {
        'num_emb': num_emb,
        'space_len': space_len,
        'pyramid_layer': pyramid_layer,
        'rand_len': rand_len,
        'drop_out_percent': drop_out_percent,
        'is_training': is_training,
        'use_filter': use_filter,
        'white_list_len': white_list_len,
        'black_list_len': black_list_len,
        'seed': seed,
        'lr': lr,
        'distribute_update_vars': update_vars_attr,
    }
    helper.append_op(
        type='pyramid_hash',
        inputs=op_inputs,
        outputs=op_outputs,
        attrs=op_attrs)
    return res
def fused_embedding_seq_pool(input, size, is_sparse=False, padding_idx=None,
                             combiner='sum', param_attr=None,
                             dtype='float32'):
    r"""
    **Embedding Sequence pool**

    This layer is the fusion of lookup table and sequence_pool.

    Args:
        input (Variable): Input is a Tensor<int64> Variable, which contains
            the IDs' information. The value of the input IDs should satisfy
            :math:`0<= id < size[0]`.
        size (tuple|list): The shape of the lookup_table parameter. It should
            have two elements which indicate the size of the dictionary of
            embedding and the size of each embedding vector respectively.
        is_sparse (bool): The flag indicating whether to use sparse update.
            Default: False.
        padding_idx (int|long|None): It will output all-zero padding data
            whenever lookup encounters :math:`padding\_idx` in Ids. If set
            :attr:`None`, it makes no effect to output. If
            :math:`padding\_idx < 0`, the :math:`padding\_idx` will
            automatically be converted to :math:`size[0] + padding\_idx` to
            use. Default: None.
        combiner (str): The pooling type of sequence_pool, and only support
            `sum`. Default: sum.
        param_attr (ParamAttr): Parameters for this layer.
        dtype (np.dtype|core.VarDesc.VarType|str): The dtype refers to the
            data type of output tensor. It can be float32, float_16, int etc.

    Returns:
        The sequence pooling variable which is a Tensor.

    Examples:
        .. code-block:: python

            import numpy as np
            import paddle.fluid as fluid

            dict_size = 20
            data_t = fluid.layers.data(
                name='word', shape=[1], dtype='int64', lod_level=1)
            padding_idx = np.random.randint(1, 10)
            out = fluid.contrib.fused_embedding_seq_pool(
                input=data_t,
                size=[dict_size, 32],
                param_attr='w',
                padding_idx=padding_idx,
                is_sparse=False)
    """
    # The docstring is raw because it contains ``\_``, which is an invalid
    # escape sequence in a normal string literal.
    helper = LayerHelper('fused_embedding_seq_pool', **locals())
    w = helper.create_parameter(
        attr=helper.param_attr, shape=size, dtype=dtype, is_bias=False)
    out = helper.create_variable_for_type_inference(dtype)

    # The op attribute uses -1 for "no padding index"; a negative index
    # counts back from the end of the dictionary.
    if padding_idx is None:
        padding_idx = -1
    elif padding_idx < 0:
        padding_idx = size[0] + padding_idx

    helper.append_op(
        type='fused_embedding_seq_pool',
        inputs={'Ids': input, 'W': w},
        outputs={'Out': out},
        attrs={
            'is_sparse': is_sparse,
            'combiner': combiner,
            'padding_idx': padding_idx,
        })
    return out
def tree_conv(nodes_vector, edge_set, output_size, num_filters=1,
              max_depth=2, act='tanh', param_attr=None, bias_attr=None,
              name=None):
    """
    ${comment}

    Args:
        nodes_vector(${nodes_vector_type}): ${nodes_vector_comment}
        edge_set(${edge_set_type}): ${edge_set_comment}
        output_size(int): output feature width
        num_filters(int): number of filters, Default 1
        max_depth(int): max depth of filters, Default 2
        act(str): activation function, Default tanh
        param_attr(ParamAttr): the parameter attribute for the filters, Default None
        bias_attr(ParamAttr): the parameter attribute for the bias of this layer, Default None
        name(str): a name of this layer(optional). If set None, the layer will be named automatically, Default None

    Returns:
        out(${out_type}): ${out_comment}

    Examples:
        .. code-block:: python

          import paddle.fluid as fluid
          # 10 for max_node_size of dataset, 5 for vector width
          nodes_vector = fluid.layers.data(
              name='vectors', shape=[10, 5], dtype='float32')
          # 10 for max_node_size of dataset, 2 for every edge has two nodes
          # edges must be directional
          edge_set = fluid.layers.data(name='edge_set', shape=[
                                       10, 2], dtype='float32')
          # the shape of output will be [10, 6, 1],
          # 10 for max_node_size of dataset, 6 for output size, 1 for 1 filter
          out_vector = fluid.layers.tree_conv(nodes_vector, edge_set, 6, 1, 2)
          # After reshape, output tensor could be nodes_vector for next tree convolution
          out_vector = fluid.layers.reshape(out_vector, shape=[-1, 10, 6])
          out_vector_2 = fluid.layers.tree_conv(out_vector, edge_set, 3, 4, 2)
          # also output tensor could be pooling(the pooling in paper called global pooling)
          pooled = fluid.layers.reduce_max(out_vector, dim=2) # global pooling
    """
    helper = LayerHelper("tree_conv", **locals())
    dtype = helper.input_dtype('nodes_vector')

    # Filter shape: [feature width of the input, 3, output_size, num_filters].
    filter_shape = [nodes_vector.shape[2], 3, output_size, num_filters]
    conv_filter = helper.create_parameter(
        attr=param_attr, shape=filter_shape, dtype=dtype, is_bias=False)

    conv_out = helper.create_variable_for_type_inference(dtype=dtype)
    op_inputs = {
        'NodesVector': nodes_vector,
        'EdgeSet': edge_set,
        'Filter': conv_filter,
    }
    helper.append_op(
        type='tree_conv',
        inputs=op_inputs,
        outputs={'Out': conv_out},
        attrs={'max_depth': max_depth})

    # A bias op is appended only when the helper reports a bias attribute.
    pre_activation = (
        helper.append_bias_op(conv_out) if helper.bias_attr else conv_out)
    return helper.append_activation(pre_activation)
def match_matrix_tensor(x, y, channel_num, act=None, param_attr=None,
                        dtype='float32', name=None):
    """
    Calculate the semantic matching matrix of two word sequences with
    variable length.

    Given a query A of length `n` and a title B of length `m`, the input
    shapes are respectively [n, h] and [m, h], where h is hidden_size. If
    :attr:`channel_num` is set to 3, a learnable parameter matrix W with
    shape [h, 3, h] is generated. The semantic matching matrix of query A and
    title B is then A * W * B.T = [n, h]*[h, 3, h]*[h, m] = [n, 3, m]. The
    learnable parameter matrix `W` is equivalent to a fully connected layer
    in the calculation process. If :attr:`act` is provided, the corresponding
    activation function is applied to the output matrix.

    :attr:`x` and :attr:`y` should be LodTensor and only one level LoD is
    supported.

    .. code-block:: text

            Given a 1-level LoDTensor x:
                x.lod =  [
                    [2,                     3,                               ]]
                x.data = [[0.3, 0.1], [0.2, 0.3], [
                    0.5, 0.6], [0.7, 0.1], [0.3, 0.4]]
                x.dims = [5, 2]
            y is a Tensor:
                y.lod =  [[3,                                 1,       ]]
                y.data = [[0.1, 0.2], [0.3, 0.7], [0.9, 0.2], [0.4, 0.1]]
                y.dims = [4, 2]
            set channel_num 2, then we get a 1-level LoDTensor:
                # where 12 = channel_num * x.lod[0][0] * y.lod[0][0]
                out.lod =  [[12, 6]]
                out.dims = [18, 1]     # where 18 = 12 + 6

    Args:
        x (Variable): Input variable x which should be 1-level LodTensor.
        y (Variable): Input variable y which should be 1-level LodTensor.
        channel_num (int): The channel number of learnable parameter W.
        act (str, default None): Activation to be applied to the output of
            this layer.
        param_attr (ParamAttr|list of ParamAttr, default None): The parameter
            attribute for learnable parameters/weights of this layer.
        dtype ('float32'): The data type of w data.
        name (str|None): A name for this layer(optional). If set None, the
            layer will be named automatically. Default: None

    Returns:
        tuple: The (optionally activated) output with LoD specified by this
        layer, followed by the op's ``Tmp`` output variable.

    Examples:
        .. code-block:: python

            import numpy as np
            from paddle.fluid import layers
            from paddle.fluid import contrib

            x_lod_tensor = layers.data(name='x', shape=[10], lod_level=1)
            y_lod_tensor = layers.data(name='y', shape=[10], lod_level=1)
            out, out_tmp = contrib.match_matrix_tensor(
                x=x_lod_tensor, y=y_lod_tensor, channel_num=3)
    """
    helper = LayerHelper('match_matrix_tensor', **locals())

    x_dims = list(x.shape)
    y_dims = list(y.shape)
    # Both inputs must be 2-D and agree on the hidden size.
    assert len(x_dims) == 2
    assert len(y_dims) == 2
    assert x_dims[-1] == y_dims[-1]

    w = helper.create_parameter(
        attr=helper.param_attr,
        shape=[x_dims[-1], channel_num, y_dims[-1]],
        dtype=dtype,
        is_bias=False)

    match_out = helper.create_variable_for_type_inference(dtype)
    tmp_out = helper.create_variable_for_type_inference(
        dtype, stop_gradient=True)

    op_inputs = {'X': x, 'Y': y, 'W': w}
    op_outputs = {"Out": match_out, "Tmp": tmp_out}
    helper.append_op(
        type='match_matrix_tensor',
        inputs=op_inputs,
        outputs=op_outputs,
        attrs={'dim_t': channel_num})

    return helper.append_activation(match_out), tmp_out
def var_conv_2d(input, row, col, input_channel, output_channel, filter_size,
                stride=1, param_attr=None, act=None, dtype='float32',
                name=None):
    r"""
    The var_conv_2d layer calculates the output based on the :attr:`input`
    with variable length, row, col, input channel, filter size and strides.
    :attr:`input`, :attr:`row`, and :attr:`col` are all 1-level LodTensor.
    The convolution operation is the same as the conv2d layer with padding.
    Besides, input.dims[1] should be 1.

    .. code-block:: text

            If input_channel is 2 and given row lodTensor and col lodTensor as follows:
                row.lod = [[5, 4]]
                col.lod = [[6, 7]]
            input is a lodTensor:
                input.lod = [[60, 56]]	# where 60 = input_channel * 5 * 6
                input.dims = [116, 1]	# where 116 = 60 + 56

            If set output_channel is 3, filter_size is [3, 3], stride is [1, 1]:
                # where 90 = output_channel * [(5-1)/stride + 1] * [(6-1)/stride + 1]
                output.lod = [[90, 84]]
                output.dims = [174, 1]  # where 174 = 90 + 84

    Args:
        input (Variable): The input should be 1-level LodTensor with dims[1]
            equals 1.
        row (Variable): The row should be 1-level LodTensor to provide height
            information.
        col (Variable): The col should be 1-level LodTensor to provide width
            information.
        input_channel (int): The number of input channel.
        output_channel (int): The number of output channel.
        filter_size (int|tuple|None): The filter size. If filter_size is a
            tuple, it must contain two integers, (filter_size_H,
            filter_size_W). Otherwise, the filter will be a square.
        stride (int|tuple): The stride size. If stride is a tuple, it must
            contain two integers, (stride_H, stride_W). Otherwise, the
            stride_H = stride_W = stride. Default: stride = 1.
        param_attr (ParamAttr|None): The parameter attribute for learnable
            parameters/weights of var_conv2d. If it is set to None or one
            attribute of ParamAttr, var_conv2d will create ParamAttr as
            param_attr. If the Initializer of the param_attr is not set, the
            parameter is initialized with :math:`Normal(0.0, std)`, and the
            :math:`std` is :math:`(\frac{2.0 }{filter\_elem\_num})^{0.5}`.
            Default: None.
        act (str): Activation type, if it is set to None, activation is not
            appended. Default: None
        dtype ('float32'): The data type of parameter and output.
        name (str|None): A name for this layer(optional). If set None, the
            layer will be named automatically. Default: None

    Returns:
        Variable: Output variable with LoD specified by this layer.

    Examples:
        .. code-block:: python

            import numpy as np
            from paddle.fluid import layers
            from paddle.fluid import contrib

            x_lod_tensor = layers.data(name='x', shape=[1], lod_level=1)
            row_lod_tensor = layers.data(name='row', shape=[6], lod_level=1)
            col_lod_tensor = layers.data(name='col', shape=[6], lod_level=1)
            out = contrib.var_conv_2d(input=x_lod_tensor,
                                     row=row_lod_tensor,
                                     col=col_lod_tensor,
                                     input_channel=3,
                                     output_channel=5,
                                     filter_size=[3, 3],
                                     stride=1)
    """
    # The docstring is raw because it contains ``\_``, which is an invalid
    # escape sequence in a normal string literal.
    helper = LayerHelper('var_conv_2d', **locals())
    x_shape = list(input.shape)
    assert len(x_shape) == 2

    # Normalize scalar arguments to [H, W] pairs.
    filter_size = utils.convert_to_list(filter_size, 2, 'filter_size')
    stride = utils.convert_to_list(stride, 2, 'stride')

    # One row per output channel; each row is a flattened
    # input_channel x kernel_h x kernel_w filter.
    filter_shape = [
        int(output_channel),
        int(input_channel) * filter_size[0] * filter_size[1],
    ]
    filter_param = helper.create_parameter(
        attr=helper.param_attr,
        shape=filter_shape,
        dtype=dtype,
    )

    conv_res = helper.create_variable_for_type_inference(dtype)
    tmp_res = helper.create_variable_for_type_inference(
        dtype, stop_gradient=True)

    helper.append_op(
        type='var_conv_2d',
        inputs={
            'X': input,
            'ROW': row,
            'COLUMN': col,
            'W': filter_param,
        },
        outputs={"Out": conv_res, "Col": tmp_res},
        attrs={
            'InputChannel': input_channel,
            'OutputChannel': output_channel,
            'StrideH': stride[0],
            'StrideW': stride[1],
            'KernelH': filter_size[0],
            'KernelW': filter_size[1],
        })

    return helper.append_activation(conv_res)
class SimpleRNNCell(fluid.imperative.Layer):
    """Single-step RNN cell built from raw ops for imperative mode.

    ``forward`` computes ``hidden = tanh(input * W_i2h + pre_hidden * W_h2h)``
    and ``out = hidden * W_h2o``, then returns the sum over all elements of
    ``softmax(out)`` together with ``hidden``.
    """

    def __init__(self, name_scope, step_input_size, hidden_size, output_size,
                 param_attr):
        """Store the sizes and create the ``LayerHelper`` (tanh activation).

        Args:
            name_scope: Name scope forwarded to the base ``Layer``.
            step_input_size: Width of the per-step input.
            hidden_size: Width of the hidden state.
            output_size: Width of the output projection.
            param_attr: Parameter attribute used for all three weights.
        """
        super(SimpleRNNCell, self).__init__(name_scope)
        self.step_input_size = step_input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        # The attribute used to be assigned under the misspelled name
        # ``_dype`` while most reads used ``_dtype``; set the name that is
        # actually read.
        self._dtype = core.VarDesc.VarType.FP32
        # Misspelled alias kept only so code that read it keeps working.
        self._dype = self._dtype
        from paddle.fluid.layer_helper import LayerHelper
        self._helper = LayerHelper(
            'SimpleRNNCell', act="tanh", param_attr=param_attr)

    def _build_once(self, inputs, pre_hidden):
        """Create the input-to-hidden, hidden-to-hidden and hidden-to-output
        weights. ``inputs`` and ``pre_hidden`` are not read here."""
        i2h_param_shape = [self.step_input_size, self.hidden_size]
        h2h_param_shape = [self.hidden_size, self.hidden_size]
        # ``mul`` contracts the hidden dimension of ``hidden`` with the first
        # dimension of this weight, so hidden_size must come first. The old
        # [output_size, hidden_size] only worked when both sizes were equal.
        h2o_param_shape = [self.hidden_size, self.output_size]

        self._i2h_w = self._helper.create_parameter(
            attr=self._helper.param_attr,
            shape=i2h_param_shape,
            dtype=self._dtype,
            is_bias=False)
        self._h2h_w = self._helper.create_parameter(
            attr=self._helper.param_attr,
            shape=h2h_param_shape,
            dtype=self._dtype,
            is_bias=False)
        self._h2o_w = self._helper.create_parameter(
            attr=self._helper.param_attr,
            shape=h2o_param_shape,
            dtype=self._dtype,
            is_bias=False)

    def forward(self, input, pre_hidden):
        """Run one step of the cell.

        Args:
            input: Input for this step.
            pre_hidden: Hidden state from the previous step.

        Returns:
            tuple: ``(reduce_out, hidden)`` where ``reduce_out`` is the
            ``reduce_sum`` (``reduce_all=True``) of the softmax output and
            ``hidden`` is the activated hidden state.
        """
        tmp_i2h = self._helper.create_variable_for_type_inference(self._dtype)
        tmp_h2h = self._helper.create_variable_for_type_inference(self._dtype)
        hidden = self._helper.create_variable_for_type_inference(self._dtype)
        out = self._helper.create_variable_for_type_inference(self._dtype)
        softmax_out = self._helper.create_variable_for_type_inference(
            self._dtype)
        reduce_out = self._helper.create_variable_for_type_inference(
            self._dtype)

        # input -> hidden projection
        self._helper.append_op(
            type="mul",
            inputs={"X": input, "Y": self._i2h_w},
            outputs={"Out": tmp_i2h},
            attrs={"x_num_col_dims": 1, "y_num_col_dims": 1})

        # previous hidden -> hidden projection
        self._helper.append_op(
            type="mul",
            inputs={"X": pre_hidden, "Y": self._h2h_w},
            outputs={"Out": tmp_h2h},
            attrs={"x_num_col_dims": 1, "y_num_col_dims": 1})

        self._helper.append_op(
            type="elementwise_add",
            inputs={'X': tmp_h2h, 'Y': tmp_i2h},
            outputs={'Out': hidden},
            attrs={'axis': -1, 'use_mkldnn': False})
        # Applies the helper's activation ("tanh", set in __init__).
        hidden = self._helper.append_activation(hidden)

        # hidden -> output projection
        self._helper.append_op(
            type="mul",
            inputs={"X": hidden, "Y": self._h2o_w},
            outputs={"Out": out},
            attrs={"x_num_col_dims": 1, "y_num_col_dims": 1})

        self._helper.append_op(
            type="softmax",
            inputs={"X": out},
            outputs={"Out": softmax_out},
            attrs={"use_cudnn": False})

        self._helper.append_op(
            type='reduce_sum',
            inputs={'X': softmax_out},
            outputs={'Out': reduce_out},
            attrs={'dim': None, 'keep_dim': False, 'reduce_all': True})

        return reduce_out, hidden
class QuantizeTranspiler(object):
    """Rewrite a fluid Program in place to simulate quantization.

    ``training_transpile`` inserts fake quantize/dequantize ops in front of
    quantizable ops, ``freeze_program`` turns a transpiled training program
    into an inference program and ``convert_to_int8`` stores the persistable
    inputs of quantizable ops as INT8 parameters.
    """

    def __init__(self,
                 weight_bits=8,
                 activation_bits=8,
                 activation_quantize_type='abs_max',
                 weight_quantize_type='abs_max',
                 window_size=10000,
                 moving_rate=0.9):
        """
        Convert and rewrite the fluid Program according to weight and
        activation quantization type.

        Args:
            weight_bits (int): quantization bit number for weights,
                the bias is not quantized.
            activation_bits (int): quantization bit number for activation.
            activation_quantize_type (str): quantization type for activation,
                one of 'abs_max', 'range_abs_max' or
                'moving_average_abs_max'. If use 'abs_max' mode, the
                quantization scale will be calculated dynamically each step
                in both training and testing period. If use 'range_abs_max',
                a static quantization scale will be calculated during
                training and used in inference.
            weight_quantize_type (str): quantization type for weights,
                one of 'abs_max', 'range_abs_max' or
                'moving_average_abs_max'. The 'range_abs_max' usually is not
                used for weight, since weights are fixed once the model is
                well trained.
            window_size (int): the window size for 'range_abs_max'
                quantization.
            moving_rate (float): value passed as the ``moving_rate``
                attribute of the 'moving_average_abs_max' quantize op.

        Raises:
            ValueError: if either quantize type is not one of the supported
                values.

        Examples:

        .. code-block:: python

            # the original program will be rewrite, if you don't want to
            # change it, please clone at first.
            # quantize_program = program.clone()
            t = fluid.QuantizeTranspiler()
            t.transpile(quantize_program)

        """
        self.weight_bits = weight_bits
        self.activation_bits = activation_bits
        quant_type = ['abs_max', 'range_abs_max', 'moving_average_abs_max']
        # The message pieces used to be passed as separate positional
        # arguments, so '%s' was never interpolated and the error rendered
        # as a tuple. Build one formatted string instead.
        if weight_quantize_type not in quant_type:
            raise ValueError(
                "Unknown weight_quantize_type: '%s'. It can only be "
                "'abs_max' or 'range_abs_max' or 'moving_average_abs_max'." %
                str(weight_quantize_type))
        if activation_quantize_type not in quant_type:
            raise ValueError(
                "Unknown activation_quantize_type : '%s'. It can only be "
                "'abs_max' or 'range_abs_max' or 'moving_average_abs_max'." %
                str(activation_quantize_type))

        self.weight_quantize_type = weight_quantize_type
        self.activation_quantize_type = activation_quantize_type

        self.window_size = window_size
        self.moving_rate = moving_rate
        self.helper = LayerHelper(self.__class__.__name__)
        self.fake_quant_op_types = [
            'fake_quantize_abs_max', 'fake_quantize_range_abs_max',
            'fake_quantize_moving_average_abs_max'
        ]
        self.fake_dequant_op_types = ['fake_dequantize_max_abs']
        # Set by training_transpile (False) / freeze_program (True).
        self.is_test = None
        # Only created when a 'range_abs_max' quantize type is in use.
        self.global_step = None

    def training_transpile(self, program=None, startup_program=None):
        """Rewrites a training input program in place for simulated
        quantization. Insert fake quantization and de-quantization ops into
        program to simulate the error introduced by quantization. And change
        the gradient ops' input by using the faked quantization weights and
        activation. Since the program is transformed in place, the graph
        connection will change.

        Args:
            program (Program): the input program to be transpile. Defaults
                to the default main program.
            startup_program (Program): the matching startup program.
                Defaults to the default startup program.

        Raises:
            ValueError: if a gradient op of a quantizable type has no input
                that was quantized and dequantized in the forward pass.
        """
        self.is_test = False
        program = default_main_program() if program is None else program
        startup_program = default_startup_program() if startup_program is \
            None else startup_program

        # Per block: original input name -> its dequantized variable.
        dequanted_vars = [
            collections.OrderedDict() for _ in range(len(program.blocks))
        ]
        grad_op_types = [
            '%s_grad' % op_type for op_type in _QUANTIZABLE_OP_TYPES
        ]
        params = {p.name for p in program.global_block().iter_parameters()}

        def _transpile_forward(block, op):
            idx = block.ops.index(op)
            block_id = block.idx
            # insert quant op and dequant op
            for name in op.input_arg_names:
                if name in dequanted_vars[block_id]:
                    # The input is shared with an op handled earlier.
                    dequant_var = dequanted_vars[block_id][name]
                else:
                    var = block.var(name)
                    is_param = var.name in params
                    quant_bits = self.weight_bits if is_param \
                        else self.activation_bits
                    quant_type = self.weight_quantize_type if is_param \
                        else self.activation_quantize_type

                    quant_var, scale_var = self._insert_quant_op(
                        block, idx, var, quant_bits, quant_type)
                    dequant_var = self._insert_dequant_op(
                        block, idx + 1, quant_var, scale_var, quant_bits)
                    dequanted_vars[block_id][name] = dequant_var
                # rename the forward op inputs
                op._rename_input(name, dequant_var.name)

        def _transpile_backward(block, op):
            block_id = block.idx
            no_dequanted_input_vars = True
            for name in op.input_arg_names:
                if name in dequanted_vars[block_id]:
                    dequant_var = dequanted_vars[block_id][name]
                    op._rename_input(name, dequant_var.name)
                    no_dequanted_input_vars = False
            if no_dequanted_input_vars:
                raise ValueError("There is no dequanted inputs for op %s." %
                                 (op.type))

        with program_guard(program, startup_program):
            self._create_global_step()
            for block in program.blocks:
                # Iterate over a snapshot: ops are inserted while looping.
                for op in list(block.ops):
                    # rewrite the forward ProgramDesc
                    if op.type in _QUANTIZABLE_OP_TYPES:
                        _transpile_forward(block, op)
                    # rename the backward op inputs
                    if op.type in grad_op_types:
                        _transpile_backward(block, op)

    def _create_global_step(self):
        """Create the step counter needed by 'range_abs_max' quantization."""
        if self.weight_quantize_type == 'range_abs_max' or \
                self.activation_quantize_type == 'range_abs_max':
            self.global_step = autoincreased_step_counter()

    def freeze_program(self, program, place, scope=None):
        """Freeze input training program for inference.

        Args:
            program (Program): the input program to be transpile. Defaults
                to the default main program when None.
            place: the place passed to ``tensor.set`` when the quantized
                weights are written back.
            scope: the scope holding the parameter tensors. Defaults to the
                global scope.

        Raises:
            ValueError: if a quantizable op has more than one output.
        """

        self.is_test = True
        scope = global_scope() if scope is None else scope
        program = default_main_program() if program is None else program

        persistable_vars = {
            v.name for v in program.list_vars() if v.persistable
        }
        op_in_rename_map = [
            collections.OrderedDict() for _ in range(len(program.blocks))
        ]
        op_out_rename_map = [
            collections.OrderedDict() for _ in range(len(program.blocks))
        ]
        var_scale_map = [
            collections.OrderedDict() for _ in range(len(program.blocks))
        ]

        def _remove_fake_quant_and_dequant_op(block, op):
            idx = block.ops.index(op)
            rename_map = op_in_rename_map[block.idx]
            out_name = op.output('Out')[0]
            in_name = op.input('X')[0]
            # Follow an earlier rename so chained removed ops resolve to the
            # first surviving input name.
            rename_map[out_name] = rename_map.get(in_name, in_name)
            block._remove_op(idx)

        def _insert_post_dequant_op(block, op):
            idx = block.ops.index(op)
            block_id = block.idx
            # NOTE(review): max_range stays None when the op has no
            # persistable input, which makes float(max_range) below fail;
            # confirm every quantizable op has a weight input.
            max_range = None
            scale_var = None
            for name in op.input_arg_names:
                # rename input name of the op to the input name of last op
                # which has been removed
                if name in op_in_rename_map[block_id]:
                    op._rename_input(name, op_in_rename_map[block_id][name])

                original_name = _original_var_name(name)
                scale_v = var_scale_map[block_id][original_name]
                if original_name in persistable_vars:
                    param_range = (1 << (self.weight_bits - 1)) - 1
                    act_range = (1 << (self.activation_bits - 1)) - 1
                    assert _is_float(scale_v)
                    max_range = param_range * act_range / scale_v
                else:
                    assert isinstance(scale_v, Variable)
                    scale_var = scale_v

            if len(op.output_arg_names) != 1:
                raise ValueError("Only support one output, but op %s has"
                                 " more than one output." % (op.type))
            out_var = block.var(op.output_arg_names[0])
            dequant_var = block.create_var(
                name=_dequantized_var_name(out_var.name),
                type=out_var.type,
                shape=out_var.shape,
                dtype=out_var.dtype)
            # insert fake_dequantize_op
            block._insert_op(
                idx + 1,
                type="fake_dequantize_max_abs",
                attrs={'max_range': float(max_range)},
                inputs={"X": out_var,
                        'Scale': scale_var},
                outputs={"Out": dequant_var})
            op_out_rename_map[block_id][out_var.name] = dequant_var.name
            return dequant_var

        def _load_var(name):
            return np.array(scope.find_var(name).get_tensor())

        def _restore_var(name, arr):
            t = scope.find_var(name).get_tensor()
            t.set(arr, place)

        for block in program.blocks:
            block_id = block.idx
            # Iterate over a snapshot: ops are removed/inserted while looping.
            for op in list(block.ops):
                op_type = op.type

                # insert dequant_op after fc/conv, need to rename
                # input of the followed ops (of fc/conv) to the dequant_op
                for name in op.input_arg_names:
                    if name in op_out_rename_map[block_id]:
                        op._rename_input(name,
                                         op_out_rename_map[block_id][name])

                if op_type in self.fake_quant_op_types:
                    in_arg_name = op.input('X')[0]
                    if in_arg_name in persistable_vars:
                        if self.weight_quantize_type == 'abs_max':
                            param = _load_var(in_arg_name)
                            scale_v = np.max(np.abs(param))
                        else:
                            scale_v = _load_var(op.output('OutScale')[0])
                        var_scale_map[block_id][in_arg_name] = scale_v

                        _remove_fake_quant_and_dequant_op(block, op)
                        # quantize weight and restore
                        param_t = _load_var(in_arg_name)
                        param_q_t = quant(param_t, scale_v, self.weight_bits)
                        _restore_var(in_arg_name, param_q_t)
                    else:
                        scale_v = block.var(op.output('OutScale')[0])
                        var_scale_map[block_id][in_arg_name] = scale_v

                if op_type in self.fake_dequant_op_types:
                    _remove_fake_quant_and_dequant_op(block, op)

                if op_type in _QUANTIZABLE_OP_TYPES:
                    _insert_post_dequant_op(block, op)

        # remove the unused var in ProgramDesc
        self._remove_unused_var(program)

    def convert_to_int8(self, program, place, scope=None):
        """Replace persistable inputs of quantizable ops by INT8 parameters.

        For every persistable input of a quantizable op a parameter named
        ``<name>.int8`` is created, filled with the original tensor cast to
        ``np.int8`` and wired into the op. Unused variables are removed
        afterwards.

        Args:
            program (Program): the program to rewrite. Defaults to the
                default main program when None.
            place: the place passed to ``tensor.set``.
            scope: the scope holding the tensors. Defaults to the global
                scope.
        """
        scope = global_scope() if scope is None else scope
        program = default_main_program() if program is None else program

        def _load_var(name):
            return np.array(scope.find_var(name).get_tensor())

        global_block = program.global_block()

        def _convert_var(var):
            int8_var_name = var.name + ".int8"
            int8_var = global_block.create_parameter(
                name=int8_var_name.encode('ascii'),
                type=var.type,
                dtype=core.VarDesc.VarType.INT8,
                shape=var.shape)

            tensor = _load_var(var.name)

            scope.var(int8_var_name)
            int8_tensor = scope.find_var(int8_var_name).get_tensor()
            int8_tensor.set(tensor.astype(np.int8), place)
            return int8_var

        # original variable name -> name of its INT8 counterpart
        input_map = {}
        for block in program.blocks:
            for op in list(block.ops):
                if op.type in _QUANTIZABLE_OP_TYPES:
                    for name in op.input_arg_names:
                        var = block.var(name)
                        if var.persistable:
                            if name not in input_map:
                                int8_var = _convert_var(var)
                                input_map[name] = int8_var.name
                            op._rename_input(name, input_map[name])
        self._remove_unused_var(program)

    def _remove_unused_var(self, program):
        """Remove, from every block, the vars no op reads or writes."""
        unused_per_block = []
        for block in program.blocks:
            used = set()
            for op in block.ops:
                used.update(op.input_arg_names)
                used.update(op.output_arg_names)
            unused_per_block.append(
                [name for name in block.vars.keys() if name not in used])

        # Removal happens only after every block has been scanned.
        for block, unused in zip(program.blocks, unused_per_block):
            for name in unused:
                block._remove_var(name)

    def _insert_quant_abs_max_op(self, block, idx, var, quant_bits):
        """Insert fake_quantize_abs_max op.

        Returns:
            tuple: ``(quant_var, scale)`` output variables of the new op.
        """
        quant_var = block.create_var(
            name=_quantized_var_name(var.name),
            type=var.type,
            shape=var.shape,
            dtype=var.dtype)
        scale = block.create_var(
            name=_quantized_scale_name(var.name),
            type=var.type,
            shape=var.shape,
            dtype=var.dtype)
        block._insert_op(
            idx,
            type='fake_quantize_abs_max',
            attrs={'bit_length': quant_bits},
            inputs={'X': var},
            outputs={'Out': quant_var,
                     'OutScale': scale})
        return quant_var, scale

    def _insert_quant_range_abs_max_op(self, block, idx, var, quant_bits):
        """Insert fake_quantize_range_abs_max.

        Returns:
            tuple: ``(quant_var, scale)`` output variables of the new op.
        """
        quant_var = block.create_var(
            name=_quantized_var_name(var.name),
            type=var.type,
            shape=var.shape,
            dtype=var.dtype)
        scale = self.helper.create_parameter(
            attr=ParamAttr(
                name=_quantized_scale_name(var.name),
                initializer=Constant(0.001),
                trainable=False),
            shape=[1],
            dtype=var.dtype)
        scale.stop_gradient = True

        ins = {'X': var, 'InScale': scale}
        outs = {'Out': quant_var, 'OutScale': scale}
        if not self.is_test:
            # Persistable buffer of window_size entries, zero-initialized;
            # the op also receives the global step counter as 'Iter'.
            scales = self.helper.create_global_variable(
                name=unique_name.generate('scales'),
                persistable=True,
                dtype=var.dtype,
                shape=[self.window_size])
            self.helper.set_variable_initializer(
                scales, initializer=Constant(value=0))

            ins['Iter'] = self.global_step
            outs['OutScales'] = scales

        attrs = {
            'window_size': self.window_size,
            'bit_length': quant_bits,
            'is_test': self.is_test
        }

        block._insert_op(
            idx,
            type='fake_quantize_range_abs_max',
            attrs=attrs,
            inputs=ins,
            outputs=outs)

        return quant_var, scale

    def _insert_quant_moving_average_abs_max_op(self, block, idx, var,
                                                quant_bits):
        """Insert fake_quantize_moving_average_abs_max.

        Returns:
            tuple: ``(quant_var, scale)`` output variables of the new op.
        """
        quant_var = block.create_var(
            name=_quantized_var_name(var.name),
            type=var.type,
            shape=var.shape,
            dtype=var.dtype)
        state = self.helper.create_global_variable(
            name=unique_name.generate('state'),
            persistable=True,
            dtype=var.dtype,
            shape=[1])
        self.helper.set_variable_initializer(
            state, initializer=Constant(value=1))
        accum = self.helper.create_global_variable(
            name=unique_name.generate('accum'),
            persistable=True,
            dtype=var.dtype,
            shape=[1])
        self.helper.set_variable_initializer(
            accum, initializer=Constant(value=1))
        scale = self.helper.create_parameter(
            attr=ParamAttr(
                name=_quantized_scale_name(var.name),
                initializer=Constant(0.001),
                trainable=False),
            shape=[1],
            dtype=var.dtype)
        scale.stop_gradient = True

        ins = {'X': var, 'InScale': scale}
        outs = {'Out': quant_var, 'OutScale': scale}
        if not self.is_test:
            # state/accum are both read and updated by the op in training.
            ins['InState'] = state
            ins['InAccum'] = accum
            outs['OutState'] = state
            outs['OutAccum'] = accum

        attrs = {
            'bit_length': quant_bits,
            'moving_rate': self.moving_rate,
            'is_test': self.is_test
        }

        block._insert_op(
            idx,
            type='fake_quantize_moving_average_abs_max',
            attrs=attrs,
            inputs=ins,
            outputs=outs)

        return quant_var, scale

    def _insert_quant_op(self, block, idx, var, quant_bits, quant_type):
        """
        Insert fake_quantize_op of the kind selected by ``quant_type``.

        Returns None implicitly for an unrecognized ``quant_type``.
        """
        if quant_type == 'abs_max':
            return self._insert_quant_abs_max_op(block, idx, var, quant_bits)
        elif quant_type == 'range_abs_max':
            return self._insert_quant_range_abs_max_op(block, idx, var,
                                                       quant_bits)
        elif quant_type == 'moving_average_abs_max':
            return self._insert_quant_moving_average_abs_max_op(
                block, idx, var, quant_bits)

    def _insert_dequant_op(self, block, idx, var, scale, quant_bits):
        """
        Insert fake_dequantize_max_abs op and return its output variable.
        """
        dequant_var = block.create_var(
            name=_dequantized_var_name(var.name),
            type=var.type,
            shape=var.shape,
            dtype=var.dtype)
        # insert fake_dequantize_op
        max_range = (1 << (quant_bits - 1)) - 1
        block._insert_op(
            idx,
            type="fake_dequantize_max_abs",
            attrs={'max_range': float(max_range)},
            inputs={"X": var,
                    'Scale': scale},
            outputs={"Out": dequant_var})
        return dequant_var
class TestPrimDistOp(unittest.TestCase):
    """Checks that partitioning a small prim-op program inserts allreduce ops."""

    def setUp(self):
        """Create fresh main/startup programs and build the test graph."""
        self.main_program = paddle.static.Program()
        self.startup_program = paddle.static.Program()
        self.layer_help = LayerHelper('TestPrimDistOp')

        with paddle.static.program_guard(self.main_program,
                                         self.startup_program):
            self.init_prog()

    def init_prog(self):
        """Declare the variables and append the ``add_p``/``reduce_p`` ops."""
        self.w = self.layer_help.create_parameter(
            dtype="float", shape=[20], attr=None)
        self.w_grad = paddle.static.data(
            name='w_grad', shape=[20], dtype='float')
        self.tmp1 = paddle.static.data(name='tmp1', shape=[20], dtype='float')
        self.tmp2 = paddle.static.data(name='tmp2', shape=[20], dtype='float')
        self.batch_reduced = paddle.static.data(
            name='batch_reduced', shape=[1], dtype='float')
        self.attrs = {}

        default_dist_context = get_default_distributed_context()
        _global_process_mesh = auto.ProcessMesh(list(range(nranks)))
        # NOTE(review): tmp1 is annotated twice with identical arguments;
        # this looks like an accidental duplicate (tmp2 may have been
        # intended). Both calls are kept because set_var_dist_attr is not
        # visible here -- confirm before removing one.
        set_var_dist_attr(
            default_dist_context,
            self.tmp1, [-1],
            _global_process_mesh,
            mark_annotated=True)
        set_var_dist_attr(
            default_dist_context,
            self.tmp1, [-1],
            _global_process_mesh,
            mark_annotated=True)

        self.layer_help.append_op(
            type="add_p",
            inputs={'X': self.tmp1,
                    'Y': self.w},
            outputs={'Z': self.w_grad},
            attrs=self.attrs)

        self.layer_help.append_op(
            type="reduce_p",
            inputs={'X': self.tmp2},
            outputs={'Y': self.batch_reduced},
            attrs={"axis": [0]})

    def test_loss_and_grad_allreduce(self):
        """Ops at index 1 and 3 of the partitioned program are allreduce sums."""
        dist_context = DistributedContext(self.main_program,
                                          self.startup_program)
        completer = Completer(dist_context)
        completer.complete_prim_annotation(self.main_program)
        dist_context.block_state.parse_forward_blocks(self.main_program)
        dist_context.block_state.parse_backward_blocks(self.main_program)
        dist_context.grads_params = dict()
        dist_context.grads_params[self.w_grad.name] = self.w.name
        dist_context.synced_gradient = set()
        dist_context.data_parallel_group = list(range(nranks))
        partitioner = Partitioner(dist_context, rank)
        dist_main_prog, dist_startup_prog, _ = partitioner.partition(
            self.main_program, self.startup_program,
            [(self.w, self.w_grad)])
        ops = dist_main_prog.global_block().ops

        # assertEqual reports the actual op type when the check fails.
        self.assertEqual(ops[1].type, "c_allreduce_sum")
        self.assertEqual(ops[3].type, "c_allreduce_sum")
class SimpleLSTMRNN(fluid.imperative.Layer):
    """Multi-layer LSTM unrolled over a fixed number of steps.

    Parameters and the per-layer initial states are created in
    ``_build_once``; ``forward`` walks the time steps and layers and keeps
    the running hidden/cell states in ``hidden_array`` / ``cell_array``.
    """

    def __init__(self,
                 name_scope,
                 hidden_size,
                 num_steps,
                 num_layers=2,
                 init_scale=0.1,
                 dropout=None):
        super(SimpleLSTMRNN, self).__init__(name_scope)
        self._hidden_size = hidden_size
        self._num_layers = num_layers
        self._init_scale = init_scale
        self._dropout = dropout
        self._input = None
        self._num_steps = num_steps
        from paddle.fluid.layer_helper import LayerHelper
        self._helper = LayerHelper('SimpleLSTMRNN', act="tanh")

    def _build_once(self, input_embedding, init_hidden=None, init_cell=None):
        """Create one weight and one bias per layer and slice initial states."""
        self.weight_1_arr = []
        self.weight_2_arr = []
        self.bias_arr = []
        self.hidden_array = []
        self.cell_array = []
        self.mask_array = []

        hidden = self._hidden_size
        scale = self._init_scale
        for layer_idx in range(self._num_layers):
            # Gate weight: maps [input, hidden] (2 * hidden) to 4 gates.
            gate_weight = self._helper.create_parameter(
                attr=fluid.ParamAttr(
                    initializer=fluid.initializer.UniformInitializer(
                        low=-scale, high=scale)),
                shape=[hidden * 2, hidden * 4],
                dtype="float32",
                default_initializer=fluid.initializer.UniformInitializer(
                    low=-scale, high=scale))
            self.weight_1_arr.append(gate_weight)

            gate_bias = self._helper.create_parameter(
                attr=fluid.ParamAttr(
                    initializer=fluid.initializer.UniformInitializer(
                        low=-scale, high=scale)),
                shape=[hidden * 4],
                dtype="float32",
                default_initializer=fluid.initializer.Constant(0.0))
            self.bias_arr.append(gate_bias)

            # Slice this layer's initial state, then flatten it to
            # [-1, hidden_size].
            layer_hidden = fluid.layers.slice(
                init_hidden,
                axes=[0],
                starts=[layer_idx],
                ends=[layer_idx + 1])
            layer_cell = fluid.layers.slice(
                init_cell, axes=[0], starts=[layer_idx], ends=[layer_idx + 1])
            layer_hidden = fluid.layers.reshape(
                layer_hidden, shape=[-1, hidden])
            layer_cell = fluid.layers.reshape(layer_cell, shape=[-1, hidden])
            self.hidden_array.append(layer_hidden)
            self.cell_array.append(layer_cell)

    def forward(self, input_embedding, init_hidden=None, init_cell=None):
        """Unroll the LSTM over ``num_steps`` slices of ``input_embedding``.

        Returns:
            tuple: ``(real_res, last_hidden, last_cell)`` -- the per-step
            outputs of the top layer and the final hidden/cell states of
            all layers, each transposed with ``perm=[1, 0, 2]``.
        """
        step_outputs = []
        for step in range(self._num_steps):
            step_slice = fluid.layers.slice(
                input_embedding, axes=[1], starts=[step], ends=[step + 1])
            self._input = fluid.layers.reshape(
                step_slice, shape=[-1, self._hidden_size])

            for layer_idx in range(self._num_layers):
                prev_hidden = self.hidden_array[layer_idx]
                prev_cell = self.cell_array[layer_idx]
                gate_weight = self.weight_1_arr[layer_idx]
                gate_bias = self.bias_arr[layer_idx]

                stacked = fluid.layers.concat([self._input, prev_hidden], 1)
                gates = fluid.layers.matmul(x=stacked, y=gate_weight)
                gates = fluid.layers.elementwise_add(gates, gate_bias)
                i, j, f, o = fluid.layers.split(
                    gates, num_or_sections=4, dim=-1)

                # Same op order as the single-expression form:
                # c = prev_cell * sigmoid(f) + sigmoid(i) * tanh(j)
                forget_gate = fluid.layers.sigmoid(f)
                kept_cell = prev_cell * forget_gate
                input_gate = fluid.layers.sigmoid(i)
                candidate = fluid.layers.tanh(j)
                new_cell = kept_cell + input_gate * candidate

                # m = tanh(c) * sigmoid(o)
                squashed_cell = fluid.layers.tanh(new_cell)
                new_hidden = squashed_cell * fluid.layers.sigmoid(o)

                self.hidden_array[layer_idx] = new_hidden
                self.cell_array[layer_idx] = new_cell
                self._input = new_hidden

                use_dropout = (self._dropout is not None and
                               self._dropout > 0.0)
                if use_dropout:
                    self._input = fluid.layers.dropout(
                        self._input,
                        dropout_prob=self._dropout,
                        dropout_implementation='upscale_in_train')

            step_outputs.append(
                fluid.layers.reshape(
                    self._input, shape=[1, -1, self._hidden_size]))

        real_res = fluid.layers.concat(step_outputs, 0)
        real_res = fluid.layers.transpose(x=real_res, perm=[1, 0, 2])

        state_shape = [-1, self._num_layers, self._hidden_size]
        last_hidden = fluid.layers.concat(self.hidden_array, 1)
        last_hidden = fluid.layers.reshape(last_hidden, shape=state_shape)
        last_hidden = fluid.layers.transpose(x=last_hidden, perm=[1, 0, 2])

        last_cell = fluid.layers.concat(self.cell_array, 1)
        last_cell = fluid.layers.reshape(last_cell, shape=state_shape)
        last_cell = fluid.layers.transpose(x=last_cell, perm=[1, 0, 2])

        return real_res, last_hidden, last_cell
class PtbModel(fluid.imperative.Layer):
    """Language model: embedding -> ``SimpleLSTMRNN`` -> softmax loss.

    ``forward`` returns the summed (over steps) batch-mean cross-entropy
    loss together with the final hidden and cell states of the LSTM.
    """

    def __init__(self,
                 name_scope,
                 hidden_size,
                 vocab_size,
                 num_layers=2,
                 num_steps=20,
                 init_scale=0.1,
                 dropout=None):
        """Create the sub-layers and the output projection parameters.

        Args:
            name_scope: Name scope forwarded to the base ``Layer``.
            hidden_size: Embedding width and LSTM hidden size.
            vocab_size: Number of embedding rows and output classes.
            num_layers: Number of stacked LSTM layers.
            num_steps: Number of unrolled time steps.
            init_scale: Bound of the uniform parameter initializers.
            dropout: Dropout probability; dropout is skipped when this is
                None or not greater than 0.
        """
        super(PtbModel, self).__init__(name_scope)
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size
        self.init_scale = init_scale
        self.num_layers = num_layers
        self.num_steps = num_steps
        self.dropout = dropout
        from paddle.fluid.layer_helper import LayerHelper
        self._helper = LayerHelper('PtbModel', act="tanh")
        self.simple_lstm_rnn = SimpleLSTMRNN(
            self.full_name(),
            hidden_size,
            num_steps,
            num_layers=num_layers,
            init_scale=init_scale,
            dropout=dropout)
        self.embedding = Embedding(
            self.full_name(),
            size=[vocab_size, hidden_size],
            dtype='float32',
            is_sparse=False,
            param_attr=fluid.ParamAttr(
                name='embedding_para',
                initializer=fluid.initializer.UniformInitializer(
                    low=-init_scale, high=init_scale)))
        self.softmax_weight = self._helper.create_parameter(
            attr=fluid.ParamAttr(),
            shape=[self.hidden_size, self.vocab_size],
            dtype="float32",
            default_initializer=fluid.initializer.UniformInitializer(
                low=-self.init_scale, high=self.init_scale))
        self.softmax_bias = self._helper.create_parameter(
            attr=fluid.ParamAttr(),
            shape=[self.vocab_size],
            dtype="float32",
            default_initializer=fluid.initializer.UniformInitializer(
                low=-self.init_scale, high=self.init_scale))

    def _build_once(self, input, label, init_hidden, init_cell):
        """Nothing to build lazily; all parameters exist after __init__."""
        pass

    def forward(self, input, label, init_hidden, init_cell):
        """Compute the loss for one batch.

        Args:
            input: Token ids fed to the embedding layer.
            label: Hard labels for ``softmax_with_cross_entropy``.
            init_hidden: Initial hidden state, reshaped to
                ``[num_layers, -1, hidden_size]``.
            init_cell: Initial cell state, reshaped the same way.

        Returns:
            tuple: ``(loss, last_hidden, last_cell)``.
        """
        init_h = fluid.layers.reshape(
            init_hidden, shape=[self.num_layers, -1, self.hidden_size])

        init_c = fluid.layers.reshape(
            init_cell, shape=[self.num_layers, -1, self.hidden_size])

        x_emb = self.embedding(input)
        x_emb = fluid.layers.reshape(
            x_emb, shape=[-1, self.num_steps, self.hidden_size])
        if self.dropout is not None and self.dropout > 0.0:
            # The probability is stored as ``self.dropout``; the former
            # ``self.drop_out`` lookup raised AttributeError on this path.
            x_emb = fluid.layers.dropout(
                x_emb,
                dropout_prob=self.dropout,
                dropout_implementation='upscale_in_train')
        rnn_out, last_hidden, last_cell = self.simple_lstm_rnn(x_emb, init_h,
                                                               init_c)
        rnn_out = fluid.layers.reshape(
            rnn_out, shape=[-1, self.num_steps, self.hidden_size])
        projection = fluid.layers.matmul(rnn_out, self.softmax_weight)
        projection = fluid.layers.elementwise_add(projection,
                                                  self.softmax_bias)
        projection = fluid.layers.reshape(
            projection, shape=[-1, self.vocab_size])
        projection = fluid.layers.reshape(
            projection, shape=[-1, self.vocab_size])
        loss = fluid.layers.softmax_with_cross_entropy(
            logits=projection, label=label, soft_label=False)
        # Mean over the batch axis, then sum over the time steps.
        loss = fluid.layers.reshape(loss, shape=[-1, self.num_steps])
        loss = fluid.layers.reduce_mean(loss, dim=[0])
        loss = fluid.layers.reduce_sum(loss)
        loss.permissions = True

        return loss, last_hidden, last_cell
def fluid_batch_norm(input,
                     act=None,
                     is_test=False,
                     momentum=0.9,
                     epsilon=1e-05,
                     param_attr=None,
                     bias_attr=None,
                     mean_attr=None,
                     var_attr=None,
                     data_layout='NCHW',
                     in_place=False,
                     name=None,
                     moving_mean_name=None,
                     moving_variance_name=None,
                     do_model_average_for_mean_and_var=False,
                     fuse_with_relu=False):
    r"""
    **Batch Normalization Layer**

    Edited by Lihang Liu for the reason of exposing mean_attr and var_attr.

    Can be used as a normalizer function for conv2d and fully_connected
    operations. The required data format for this layer is one of the
    following:

    1. NHWC `[batch, in_height, in_width, in_channels]`

    2. NCHW `[batch, in_channels, in_height, in_width]`

    Refer to `Batch Normalization: Accelerating Deep Network Training by
    Reducing Internal Covariate Shift <https://arxiv.org/pdf/1502.03167.pdf>`_
    for more details.

    :math:`input` is the input features over a mini-batch.

    ..  math::

        \mu_{\beta} &\gets \frac{1}{m} \sum_{i=1}^{m} x_i \qquad
        &//\ mini-batch\ mean \\
        \sigma_{\beta}^{2} &\gets \frac{1}{m} \sum_{i=1}^{m}(x_i -
        \mu_{\beta})^2 \qquad &//\ mini-batch\ variance \\
        \hat{x_i} &\gets \frac{x_i - \mu_\beta} {\sqrt{
        \sigma_{\beta}^{2} + \epsilon}} \qquad &//\ normalize \\
        y_i &\gets \gamma \hat{x_i} + \beta \qquad &//\ scale\ and\ shift

    Args:
        input(variable): The input variable which is a LoDTensor.
        act(string, Default None): Activation type, linear|relu|prelu|...
        is_test(bool, Default False): Used for training or testing.
        momentum(float, Default 0.9): Passed as the ``momentum`` attribute
            of the batch_norm op.
        epsilon(float, Default 1e-05): Passed as the ``epsilon`` attribute
            of the batch_norm op.
        param_attr(ParamAttr|None): The parameter attribute for Parameter
            `scale` of batch_norm. If the Initializer of the param_attr is
            not set, the parameter is initialized with Constant(1.0).
            Default: None.
        bias_attr(ParamAttr|None): The parameter attribute for the bias of
            batch_norm. Must not be False. Default: None.
        mean_attr(ParamAttr|None): The parameter attribute for the moving
            mean. If None, a non-trainable parameter named
            `moving_mean_name` initialized with Constant(0.0) is created.
        var_attr(ParamAttr|None): The parameter attribute for the moving
            variance. If None, a non-trainable parameter named
            `moving_variance_name` initialized with Constant(1.0) is
            created.
        data_layout(string, default NCHW): NCHW|NHWC
        in_place(bool, Default False): Make the input and output of batch
            norm reuse memory.
        name(string, Default None): A name for this layer(optional). If set
            None, the layer will be named automatically.
        moving_mean_name(string, Default None): The name of moving_mean
            which store the global Mean. Only used when mean_attr is None.
        moving_variance_name(string, Default None): The name of the
            moving_variance which store the global Variance. Only used when
            var_attr is None.
        do_model_average_for_mean_and_var(bool, Default False): Do model
            average for mean and variance or not.
        fuse_with_relu (bool): if True, this OP performs relu after batch
            norm.

    Returns:
        Variable: A tensor variable which is the result after applying batch
        normalization on the input.

    Raises:
        AssertionError: if bias_attr is False.
        ValueError: if data_layout is neither 'NCHW' nor 'NHWC'.

    Examples:

        .. code-block:: python

            hidden1 = fluid.layers.fc(input=x, size=200, param_attr='fc1.w')
            hidden2 = fluid.layers.batch_norm(input=hidden1)
    """
    assert bias_attr is not False, "bias_attr should not be False in batch_norm."
    # locals() forwards every argument to the helper, so it must be captured
    # before any other local name is bound.
    helper = LayerHelper('batch_norm', **locals())
    dtype = helper.input_dtype()

    input_shape = input.shape
    if data_layout == 'NCHW':
        channel_num = input_shape[1]
    elif data_layout == 'NHWC':
        channel_num = input_shape[-1]
    else:
        raise ValueError("unsupported data layout:" + data_layout)

    param_shape = [channel_num]

    # create parameter
    scale = helper.create_parameter(
        attr=helper.param_attr,
        shape=param_shape,
        dtype=dtype,
        default_initializer=Constant(1.0))

    bias = helper.create_parameter(
        attr=helper.bias_attr, shape=param_shape, dtype=dtype, is_bias=True)

    if mean_attr is None:
        mean = helper.create_parameter(
            attr=ParamAttr(
                name=moving_mean_name,
                initializer=Constant(0.0),
                trainable=False,
                do_model_average=do_model_average_for_mean_and_var),
            shape=param_shape,
            dtype=input.dtype)
    else:
        mean = helper.create_parameter(
            attr=mean_attr, shape=param_shape, dtype=input.dtype)
    mean.stop_gradient = True

    if var_attr is None:
        variance = helper.create_parameter(
            attr=ParamAttr(
                name=moving_variance_name,
                initializer=Constant(1.0),
                trainable=False,
                do_model_average=do_model_average_for_mean_and_var),
            shape=param_shape,
            dtype=input.dtype)
    else:
        variance = helper.create_parameter(
            attr=var_attr, shape=param_shape, dtype=input.dtype)
    variance.stop_gradient = True

    # create output
    # mean and mean_out share the same memory
    mean_out = mean
    # variance and variance out share the same memory
    variance_out = variance
    saved_mean = helper.create_variable_for_type_inference(
        dtype=dtype, stop_gradient=True)
    saved_variance = helper.create_variable_for_type_inference(
        dtype=dtype, stop_gradient=True)

    if in_place:
        batch_norm_out = input
    else:
        batch_norm_out = helper.create_variable_for_type_inference(dtype)

    helper.append_op(
        type="batch_norm",
        inputs={
            "X": input,
            "Scale": scale,
            "Bias": bias,
            "Mean": mean,
            "Variance": variance
        },
        outputs={
            "Y": batch_norm_out,
            "MeanOut": mean_out,
            "VarianceOut": variance_out,
            "SavedMean": saved_mean,
            "SavedVariance": saved_variance
        },
        attrs={
            "momentum": momentum,
            "epsilon": epsilon,
            "is_test": is_test,
            "use_mkldnn": False,
            "fuse_with_relu": fuse_with_relu
        })

    return helper.append_activation(batch_norm_out)
class CW_L2_Attack(Attack):
    """
    Targeted Carlini-Wagner (CW) L2 attack, optimized with Adam.

    The attack graph is built once in ``__init__`` and is hard-coded for
    inputs of shape [1, 28, 28] and 10 output classes (see the placeholders
    and the ``ones(shape=[10])`` mask in ``_loss_cw``).

    Paper link: https://arxiv.org/abs/1608.04644
    """

    def __init__(self, model, learning_rate):
        """
        Build the CW attack program and load the weights of the attacked CNN.

        :param model: model wrapper forwarded to the ``Attack`` base class;
            later used through ``self.model.predict`` / ``get_logits``.
        :param learning_rate: learning rate of the Adam optimizer that
            updates the perturbation parameter.
        """
        super(CW_L2_Attack, self).__init__(model)
        self._predicts_normalized = None
        self._adversary = None  # type: Adversary

        # The attack always runs on CPU.
        self.place = fluid.CPUPlace()
        self.exe = fluid.Executor(self.place)

        # Fresh programs: the CNN under attack is rebuilt inside them by
        # _loss_cw instead of cloning a prebuilt program.
        self.attack_main_program = fluid.Program()
        self.attack_startup_program = fluid.Program()

        # build cw attack compute graph within attack programs
        with fluid.program_guard(
                main_program=self.attack_main_program,
                startup_program=self.attack_startup_program):
            img_0_1_placehold = fluid.layers.data(
                name='img_data_scaled', shape=[1, 28, 28], dtype="float32")
            target_placehold = fluid.layers.data(
                name='target', shape=[10], dtype="float32")
            shape_placehold = fluid.layers.data(
                name="shape", shape=[1], dtype="float32")
            c_placehold = fluid.layers.data(
                name='c', shape=[1], dtype="float32")

            # Only the loss (third returned value) is needed here; the other
            # graph nodes are kept as attributes by _loss_cw.
            _, _, loss, _, _ = self._loss_cw(
                img_0_1_placehold, target_placehold, shape_placehold,
                c_placehold)

            # Adam optimizer as suggested in paper; only the perturbation
            # parameter is optimized, the CNN weights stay fixed.
            optimizer = fluid.optimizer.Adam(learning_rate=learning_rate)
            optimizer.minimize(loss, parameter_list=['parameter'])

        # initialize variables and parameters before attack
        self.exe.run(self.attack_startup_program)

        # init ad perturbation with small random noise
        perturbation = fluid.global_scope().find_var("parameter").get_tensor()
        perturbation.set(
            0.001 * np.random.random_sample((1, 28, 28)).astype('float32'),
            self.place)

        # Load the weights of the attacked network from disk.
        weight_names = [
            "conv2d_2.b_0", "conv2d_2.w_0", "conv2d_3.b_0", "conv2d_3.w_0",
            "fc_2.b_0", "fc_2.w_0", "fc_3.b_0", "fc_3.w_0"
        ]
        block = self.attack_main_program.block(0)
        var_list = [block.var(name) for name in weight_names]
        fluid.io.load_vars(
            executor=self.exe,
            dirname="../advbox/attacks/mnist/",
            vars=var_list,
            main_program=self.attack_main_program)

    def _apply(self,
               adversary,
               nb_classes=10,
               learning_rate=0.01,
               attack_iterations=100,
               epsilon=1,
               targeted=True,
               k=0,
               noise=2):
        """
        Run the attack: find a working trade-off constant c, then shrink it.

        :param adversary: the Adversary instance to attack; must be targeted.
        :param nb_classes: number of classes used for the one-hot target.
        :param learning_rate: forwarded to ``_cwb`` (the optimizer itself is
            configured in ``__init__``).
        :param attack_iterations: optimizer steps per tried value of c.
        :param epsilon: initial c (before doubling) and the resolution at
            which the binary search stops; must be positive.
        :param targeted: unused here, kept for interface compatibility.
        :param k: forwarded to ``_cwb``.
        :param noise: forwarded to ``_cwb``.
        :return: the adversary passed in (updated through
            ``try_accept_the_example`` inside ``_cwb``).
        :raises ValueError: for a non-targeted adversary or epsilon <= 0.
        """
        # put adversary instance inside of the attack instance so all other
        # functions within can access it
        self._adversary = adversary

        if not adversary.is_targeted_attack:
            raise ValueError(
                "This attack method only support targeted attack!")
        if epsilon <= 0:
            # With a non-positive epsilon c never grows and the binary search
            # condition below would never become false.
            raise ValueError("epsilon must be positive, got {}".format(epsilon))

        # locate the range of c which makes the attack successful
        c = epsilon
        img = self._adversary.original  # original image to be attacked
        for _ in range(10):
            c = 2 * c
            print('Checking if the range {0:f} include a successful c.'.format(
                c))
            is_adversary, f6 = self._cwb(
                img,
                c,
                attack_steps=attack_iterations,
                k=k,
                learning_rate=learning_rate,
                noise=noise,
                nb_classes=nb_classes)
            if is_adversary:
                break
        else:
            # no tried c produced an adversarial example
            logging.info('This CW attack failed!')
            return adversary

        # binary search for smaller c that makes fx<=0
        print('searching for the smallest c that makes attack possible.')
        c_low = 0
        c_high = c
        while c_high - c_low >= epsilon:
            logging.info('c_high={}, c_low={}, diff={}, epsilon={}'.format(
                c_high, c_low, c_high - c_low, epsilon))
            c_half = (c_low + c_high) / 2
            # Evaluate the midpoint itself (the previous code re-ran the
            # attack with the unchanged upper bound c).
            is_adversary, f6 = self._cwb(
                img,
                c_half,
                attack_steps=attack_iterations,
                k=k,
                learning_rate=learning_rate,
                noise=noise,
                nb_classes=nb_classes)
            is_f6_smaller_than_0 = f6 <= 0
            if is_adversary and is_f6_smaller_than_0:
                c_high = c_half
            else:
                c_low = c_half

        return adversary

    def _cwb(self, img, c, attack_steps, k, learning_rate, noise, nb_classes):
        """
        Run the CW optimization on one image for a fixed number of steps.

        :param img: original image; rescaled with ``_process_input`` using
            sub=-1, div=2 before being fed to the graph.
        :param c: trade-off constant fed to the 'c' placeholder.
        :param attack_steps: number of optimizer steps, must be >= 1.
        :param k: unused here, kept for interface compatibility.
        :param learning_rate: unused here, kept for interface compatibility.
        :param noise: unused here, kept for interface compatibility.
        :param nb_classes: length of the one-hot target vector.
        :return: tuple ``(accepted, f6)`` where ``accepted`` is the result of
            ``try_accept_the_example`` for the best image found and ``f6`` is
            the smallest (max non-target logit - target logit) observed,
            i.e. the value belonging to that image.
        :raises ValueError: if attack_steps < 1.
        """
        if attack_steps < 1:
            raise ValueError("attack_steps must be at least 1")

        smallest_f6 = None
        corresponding_constrained = None

        # one-hot vector marking the target class
        screen_nontarget_logit = np.zeros(shape=[nb_classes], dtype="float32")
        screen_nontarget_logit[self._adversary.target_label] = 1

        feeder = fluid.DataFeeder(
            feed_list=["img_data_scaled", "target", "shape", "c"],
            place=self.place,
            program=self.attack_main_program)

        sub = -1
        div = 2
        img_0_1 = self._process_input(img, sub, div)

        # Inputs and fetch targets are the same for every step.
        sample = (img_0_1, screen_nontarget_logit,
                  np.zeros(shape=[1], dtype='float32'), c)
        fetch_list = [
            self.maxlogit_i_not_t, self.maxlogit_target, self.loss,
            self.logits_i_not_t, self.constrained, self.softmax
        ]

        for _ in range(attack_steps):
            result = self.exe.run(
                self.attack_main_program,
                feed=feeder.feed([sample]),
                fetch_list=fetch_list)

            f6 = result[0] - result[1]
            # keep the candidate with the smallest f6 seen so far
            if smallest_f6 is None or f6 < smallest_f6:
                smallest_f6 = f6
                corresponding_constrained = result[4]

        # recover image (-1,1) from corresponding_constrained which is
        # within (0,1)
        img_ad = self.reconstruct(corresponding_constrained)
        # convert into img.shape
        img_ad = np.squeeze(img_ad)
        img_ad = img_ad.reshape(img.shape)
        # let model guess
        adv_label = np.argmax(self.model.predict(img_ad))

        # try to accept new result, success or fail; report the f6 that
        # belongs to img_ad rather than the one of the last step
        return self._adversary.try_accept_the_example(img_ad,
                                                      adv_label), smallest_f6

    # this builds up the CW attack computation graph in Paddle
    def _loss_cw(self, img_0_1, target, shape, c):
        """
        Build the CW loss graph; every intermediate node is kept on ``self``.

        :param img_0_1: image placeholder; the math below treats it as
            scaled to (0,1).
        :param target: one-hot target placeholder.
        :param shape: unused in this graph.
        :param c: trade-off constant placeholder.
        :return: tuple (maxlogit_i_not_t, maxlogit_target, loss,
            logits_i_not_t, constrained).
        """
        # use layerhelper to init w
        self.helper = LayerHelper("Jay")
        # named so the parameter can be looked up later
        self.param_attr = ParamAttr(name="parameter")

        # the perturbation lives in w space; the result is mapped back to an
        # image within (0,1)
        self.ad_perturbation = self.helper.create_parameter(
            attr=self.param_attr,
            shape=[1, 28, 28],
            dtype='float32',
            is_bias=False)

        self.y = 2 * img_0_1 - 1
        # w = arctanh(y) = 0.5 * ln((1 + y) / (1 - y))
        self.xplus1 = 1 + self.y
        self.xminus1 = 1 - self.y
        self.ln = fluid.layers.log(self.xplus1 / self.xminus1)
        self.w = fluid.layers.scale(x=self.ln, scale=0.5)
        self.w_ad = self.w + self.ad_perturbation
        self.tanh_w = fluid.layers.tanh(self.w_ad)
        self.constrained = 0.5 * (self.tanh_w + 1)

        self.softmax, self.logits = mnist_cnn_model(self.constrained)

        # squared L2 distance between the input and the perturbed image
        self.sub = fluid.layers.elementwise_sub(img_0_1, self.constrained)
        self.squared = fluid.layers.elementwise_mul(self.sub, self.sub)
        self.distance_L2 = fluid.layers.reduce_sum(self.squared)

        # (1 - target) masks out the target class, target keeps only it
        self.negetive_screen_nontarget_logit = fluid.layers.scale(
            target, scale=-1.0)
        self.screen_target_logit = (
            self.negetive_screen_nontarget_logit +
            fluid.layers.ones(shape=[10], dtype="float32"))
        self.logits_i_not_t = fluid.layers.elementwise_mul(
            self.screen_target_logit, self.logits)
        self.logit_target = fluid.layers.elementwise_mul(target, self.logits)

        self.maxlogit_i_not_t = fluid.layers.reduce_max(self.logits_i_not_t)
        self.maxlogit_target = fluid.layers.reduce_sum(self.logit_target)

        self.difference_between_two_logits = (
            self.maxlogit_i_not_t - self.maxlogit_target)

        self.f6 = fluid.layers.relu(self.difference_between_two_logits)

        self.loss = c * self.f6 + self.distance_L2

        return (self.maxlogit_i_not_t, self.maxlogit_target, self.loss,
                self.logits_i_not_t, self.constrained)

    # reconstruct corresponding_constrained to an image in MNIST format
    def reconstruct(self, corresponding_constrained):
        """
        Map an image from the (0,1) range back to the (-1,1) range.

        :return: numpy.ndarray
        """
        return corresponding_constrained * 2 - 1

    def _f6(self, w):
        """
        Evaluate the CW objective term f6 directly for a point in w space.

        f6 = max(max_{i != target} Z_i - Z_target, 0), where Z are the logits
        of the image (tanh(w) + 1) / 2.

        :return: float32
        """
        target = self._adversary.target_label
        img = (np.tanh(w) + 1) / 2
        Z_output = self._Z(img)
        best_other = max(Z for i, Z in enumerate(Z_output) if i != target)
        return max(best_other - Z_output[target], 0)

    def _Z(self, img):
        """
        Get the Zx logits as a numpy array.

        :return: numpy.ndarray
        """
        return self.model.get_logits(img)

    def _process_input(self, input_, sub, div):
        """
        Compute (input_ - sub) / div, skipping operations that are no-ops.

        Values that are exactly 0 or 1 in the result are nudged to 0.00001
        and 0.99999 so no exact 0 or 1 remains. When neither operation
        applies (sub == 0 and div == 1) ``input_`` is returned untouched.

        :return: numpy.ndarray
        """
        res = None
        if np.any(sub != 0):
            res = input_ - sub
        # The division is skipped only when div is 1 (the previous code
        # tested sub here by mistake).
        if not np.all(div == 1):
            # out-of-place division also works for integer arrays
            res = (input_ if res is None else res) / div
        if res is None:  # neither the subtraction nor the division ran
            return input_
        res = np.where(res == 0, 0.00001, res)
        res = np.where(res == 1, 0.99999, res)  # no 0 or 1
        return res