def build_network(self, inputs):
    """APP3DC forward: spatial 3D conv plus an APM-refined temporal residual.

    Applies the main spatial convolution, re-targets this module's conv spec
    to a temporal-only kernel, runs the APM sub-network, convolves its output
    temporally and adds it back as a residual.

    NOTE(review): the temporal layer is named with `self.time`, but only the
    module-global `time` counter is incremented below — confirm the two stay
    in sync, otherwise repeated calls reuse the same layer name.
    """
    spatial_init = flow.kaiming_initializer(shape=inputs.shape, mode="fan_out")
    out = conv3d_layer(
        self.name,
        inputs,
        self.conv2d.out_channels,
        kernel_size=self.kernel_dim,
        strides=self.stride,
        padding="SAME",
        use_bias=True,
        weight_initializer=spatial_init,
        trainable=self.trainable,
    )
    # Switch this module's conv spec to a pure-temporal kernel for the
    # residual branch (spatial extent 1x1).
    self.kernel_dim = [self.time_dim, 1, 1]
    self.stride = [self.time_stride * self.time_dim, 1, 1]
    residual = self.APM.build_network(out)
    temporal_init = flow.kaiming_initializer(shape=residual.shape, mode="fan_out")
    residual = conv3d_layer(
        "APP3DC_temporal_" + str(self.time),
        residual,
        self.conv2d.out_channels,
        kernel_size=self.kernel_dim,
        strides=self.stride,
        padding="VALID",
        use_bias=False,
        weight_initializer=temporal_init,
        trainable=self.trainable,
    )
    # Bump the global naming counter so the next block gets a fresh suffix.
    global time
    time += 1
    return out + residual
def build_network(self, inputs):
    """APP3DC forward: spatial 3D conv plus an APM-refined temporal residual.

    The temporal conv layer is named with a microsecond timestamp so each
    call creates a distinct variable scope (fresh weights per invocation).
    """
    spatial_init = flow.kaiming_initializer(
        shape=inputs.shape, mode="fan_out", nonlinearity="relu"
    )
    out = conv3d_layer(
        self.name,
        inputs,
        self.conv2d.out_channels,
        kernel_size=self.kernel_dim,
        strides=self.stride,
        padding="SAME",
        use_bias=True,
        weight_initializer=spatial_init,
        trainable=self.trainable,
    )
    # Re-target this module's conv spec to a temporal-only kernel for the
    # residual branch (spatial extent 1x1).
    self.kernel_dim = [self.time_dim, 1, 1]
    self.stride = [self.time_stride * self.time_dim, 1, 1]
    residual = self.APM.build_network(out)
    temporal_init = flow.kaiming_initializer(
        shape=residual.shape, mode="fan_out", nonlinearity="relu"
    )
    stamp = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S.%f')
    residual = conv3d_layer(
        "APP3DC_temporal_" + stamp,
        residual,
        self.conv2d.out_channels,
        kernel_size=self.kernel_dim,
        strides=self.stride,
        padding="VALID",
        use_bias=False,
        weight_initializer=temporal_init,
        trainable=self.trainable,
    )
    return out + residual
def inflate_conv(inputs, conv2d, time_dim=1, time_padding=0, time_stride=1,
                 time_dilation=1, center=False, times=0, trainable=True):
    """Inflate a 2D convolution spec into a 3D convolution applied to `inputs`.

    Builds a conv3d layer whose kernel/stride/dilation extend `conv2d`'s
    spatial configuration with a temporal dimension.

    Args:
        inputs: 5-D blob to convolve (shape assumed NCDHW — TODO confirm).
        conv2d: 2D conv module whose kernel_size/stride/dilation/out_channels
            are read; its weights are NOT copied (Kaiming init is used —
            the weight-transfer code was removed; see git history).
        time_dim / time_padding / time_stride / time_dilation: temporal
            extents for the inflated kernel.
        center: kept for API compatibility; unused (was only meaningful for
            the removed weight-copy path).
        times: integer tag mixed into the generated layer name.
        trainable: whether the conv weights are trainable.

    Returns:
        The conv3d_layer output blob.
    """
    def _pair(value):
        # Accept an int or a 2-element list/tuple; PyTorch modules store
        # tuples, which the previous isinstance(..., list) checks missed
        # (leaving the locals unbound -> NameError).
        if isinstance(value, (list, tuple)):
            return value[0], value[1]
        return value, value

    name = datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S')
    kernel_h, kernel_w = _pair(conv2d.kernel_size)
    kernel_dim = [time_dim, kernel_h, kernel_w]
    # BUGFIX: the width stride previously reused conv2d.stride[0]; use the
    # second component so anisotropic strides are honored.
    stride_h, stride_w = _pair(conv2d.stride)
    stride = [time_stride, stride_h, stride_w]
    dilation_h, dilation_w = _pair(conv2d.dilation)
    dilation = [time_dilation, dilation_h, dilation_w]
    # NOTE: padding is hard-coded to "SAME" below, so conv2d.padding and
    # time_padding do not reach the 3D layer (the old explicit padding list
    # was computed but never used).
    init = flow.kaiming_initializer(shape=inputs.shape, mode="fan_out",
                                    nonlinearity="relu")
    output = conv3d_layer("inflate_conv_" + str(times) + "_" + name, inputs,
                          conv2d.out_channels,
                          kernel_size=kernel_dim,
                          dilation_rate=dilation,
                          strides=stride,
                          padding="SAME",
                          weight_initializer=init,
                          trainable=trainable)
    return output
def kaiming_normal_(
    self, a=0, mode="fan_in", nonlinearity="leaky_relu", *, data_format="NCHW"
):
    """Fill this tensor in place using Kaiming (He) normal initialization.

    Args:
        a: negative slope passed to the initializer (relevant for leaky_relu).
        mode: fan mode ("fan_in"/"fan_out"/...).
        nonlinearity: nonlinearity used to compute the gain.
        data_format: keyword-only layout hint ("NCHW" by default).

    Returns:
        Whatever `_init_by_initializer_conf` returns for the built config.
    """
    conf = flow.kaiming_initializer(
        shape=self.shape,
        distribution="random_normal",
        mode=mode,
        nonlinearity=nonlinearity,
        negative_slope=a,
        data_format=data_format,
    )
    return self._init_by_initializer_conf(conf)
def inflate_linear(inputs, linear2d, time_dim, trainable=True):
    """Build a dense layer sized like `linear2d` and apply it to `inputs`.

    The 2D weights are NOT transferred (Kaiming init is used instead; the
    weight-copy code was removed). `time_dim` is kept for API compatibility
    but is currently unused.
    """
    kaiming = flow.kaiming_initializer(
        shape=inputs.shape, mode="fan_out", nonlinearity="relu"
    )
    return flow.layers.dense(
        inputs,
        linear2d.out_features,
        kernel_initializer=kaiming,
        trainable=trainable,
    )
def Block(x, config, name='Block_'):
    """One GPT transformer block: LayerNorm + causal self-attention, then
    LayerNorm + 2-layer GELU MLP, then residual dropout.

    NOTE(review): as written, the attention residual adds the *normalized*
    input (x is rebound by layer_norm before the add), and the MLP output is
    not added residually — both preserved here exactly as in the original.
    """
    # Attention sub-layer (residual around the normalized activations).
    h = flow.layers.layer_norm(x, name=name + 'l1')
    h = h + Causal_Self_Attention(h, config, name=name + 'attentions')
    # MLP sub-layer: expand 4x with GELU, project back to n_embd.
    h = flow.layers.layer_norm(h, name=name + 'l2')
    hidden = flow.layers.dense(
        inputs=h,
        units=4 * config.n_embd,
        kernel_initializer=flow.kaiming_initializer(
            shape=(config.n_embd, 4 * config.n_embd)),
        activation=flow.math.gelu,
        name=name + 'gelu')
    h = flow.layers.dense(
        inputs=hidden,
        units=config.n_embd,
        kernel_initializer=flow.kaiming_initializer(
            shape=(4 * config.n_embd, config.n_embd)),
        name=name + 'dense')
    return flow.nn.dropout(h, rate=config.resid_pdrop)
def build_network(self, inputs):
    """Apply the inflated I3D 3D convolution to `inputs`.

    Weights are Kaiming-initialized; the original 2D-weight transfer code
    was abandoned (numpy arrays could not be fed back into flow ops).

    NOTE(review): the layer name "conv_I3D_" is a constant, so multiple
    instances of this module map to the same variable scope — confirm this
    weight sharing is intended.
    """
    weight_init = flow.kaiming_initializer(
        shape=inputs.shape, mode="fan_out", nonlinearity="relu"
    )
    return conv3d_layer(
        "conv_I3D_",
        inputs,
        self.conv2d.out_channels,
        kernel_size=self.kernel_dim,
        strides=self.stride,
        padding=self.padding,
        use_bias=True,
        weight_initializer=weight_init,
        trainable=self.trainable,
    )
def test_float_initializer(test_case):
    """Compare every float32 initializer's sample distribution (fixed ones
    plus a kaiming / variance-scaling argument grid) against the reference
    implementation via CompareTwoDistribution."""
    initializers = [
        flow.random_normal_initializer(mean=3, stddev=4),
        flow.random_uniform_initializer(minval=-6, maxval=18),
        flow.truncated_normal_initializer(mean=-5, stddev=8),
        flow.xavier_uniform_initializer(data_format="NCHW"),
        flow.xavier_uniform_initializer(data_format="NHWC"),
        flow.xavier_normal_initializer(data_format="NCHW"),
        flow.xavier_normal_initializer(data_format="NHWC"),
        flow.constant_initializer(value=4),
        flow.ones_initializer(),
        flow.zeros_initializer(),
    ]
    # Cartesian grids of constructor arguments for the parameterized families.
    kaiming_grid = GenArgDict(
        OrderedDict(
            shape=[SHAPE],
            mode=["fan_in", "fan_out", "fan_avg"],
            distribution=["random_normal", "random_uniform"],
            data_format=["NCHW", "NHWC"],
            negative_slope=[0.5],
        ))
    vs_grid = GenArgDict(
        OrderedDict(
            scale=[3.4],
            mode=["fan_in", "fan_out", "fan_avg"],
            distribution=[
                "truncated_normal", "random_normal", "random_uniform"
            ],
            data_format=["NCHW", "NHWC"],
        ))
    initializers += [flow.kaiming_initializer(**kw) for kw in kaiming_grid]
    initializers += [flow.variance_scaling_initializer(**kw) for kw in vs_grid]
    for init in initializers:
        CompareTwoDistribution(test_case, flow.float32, init)
def build_network(self,inputs):
    """APM (Appearance-Preserving Module) forward pass.

    For each frame, gathers the N-1 temporal neighbors (N = self.time_dim),
    registers them against the current frame via a softmax similarity over
    l2-normalized "semantic" features, optionally gates the registered
    neighbors with a contrastive attention map, and interleaves originals
    and registered neighbors into one tensor along the time axis.

    Args:
        inputs: 5-D blob; unpacked below as (b, c, t, h, w) — NCDHW layout.

    Returns:
        Blob of shape (b, c, N*t, h, w): for each original frame, its N-1
        registered neighbors plus the frame itself, interleaved in time.
    """
    b,c,t,h,w=inputs.shape
    N=self.time_dim
    # Build, for every frame index, the padded-time indices of its N-1
    # neighbors (the center offset i == N//2, i.e. the frame itself, is
    # skipped).
    templist=[]
    for i in range(N):
        tempname=datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S.%f')
        if i!=N//2:
            # out = [0..t-1] + i : neighbor positions in the padded tensor.
            out = flow.range(t, dtype=flow.int64)
            one = flow.constant_like(out, i, dtype=flow.int64)
            out=flow.math.add(out, one)
            out=flow.expand_dims(out,axis=0)
            templist.append(out)
    neighbor_time_index=flow.concat(templist,axis=0)
    neighbor_time_index=flow.transpose(neighbor_time_index,[1,0])
    # Flatten to a 1-D list of t*(N-1) neighbor indices, frame-major.
    neighbor_time_index=flow.flatten(neighbor_time_index, start_dim=0, end_dim=-1)
    # feature map registration
    tempname=datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S.%f')
    init=flow.kaiming_initializer(shape=inputs.shape,mode="fan_out",nonlinearity="relu")
    # 1x1x1 conv projecting inputs to the "semantic" embedding used for
    # matching (timestamp-named => fresh weights per call).
    semantic=conv3d_layer("conv_semantic_"+tempname,inputs,self.out_channels,
        kernel_size=1,use_bias=False,padding="VALID",trainable=self.trainable,
        weight_initializer=init
    )
    # l2-normalize over channels so matmul below computes cosine similarity.
    inputs_norm=flow.math.l2_normalize(
        semantic,axis=1
    )
    # Pad time so every frame has (time_dim-1)//2 neighbors on each side.
    inputs_norm_padding=flow.pad(inputs_norm,paddings=[
        (0,0),(0,0),((self.time_dim-1)//2,(self.time_dim-1)//2),
        (0,0),(0,0)]
    )
    # Replicate each frame's features N-1 times (once per neighbor) on a
    # new axis 3 via repeated self-concat.
    inputs_norm_expand=flow.expand_dims(inputs_norm,axis=3)
    temp_inputs_norm_expand=inputs_norm_expand
    for i in range(N-2):
        inputs_norm_expand=flow.concat(
            inputs=[ inputs_norm_expand,temp_inputs_norm_expand],
            axis=3
        )
    inputs_norm_expand=flow.transpose(inputs_norm_expand,perm=[0, 2, 3, 4, 5, 1])
    # NOTE(review): assumes self.out_channels == c//16 — confirm against
    # the module constructor.
    inputs_norm_expand=flow.reshape(inputs_norm_expand,shape=[-1, h*w, c//16])
    # Gather the normalized features of every neighbor frame.
    # NOTE(review): iterating `neighbor_time_index` (a flow blob) with
    # int(index) looks like it relies on eager/host evaluation — verify.
    slice_list=[]
    for index in neighbor_time_index:
        temp=flow.slice(
            inputs_norm_padding,
            begin=[None,None,int(index),None,None],
            size=[None,None,1,None,None]
        )
        slice_list.append(temp)
    neighbor_norm=flow.concat(
        slice_list,axis=2
    )
    neighbor_norm=flow.transpose(neighbor_norm,perm=[0, 2, 1, 3, 4])
    neighbor_norm=flow.reshape(neighbor_norm,shape=[-1, c//16, h*w])
    # Pixel-to-pixel cosine similarity, temperature-scaled, softmax over
    # the neighbor's spatial positions.
    similarity=flow.matmul(inputs_norm_expand,neighbor_norm)*self.temperature
    similarity=nn.softmax(similarity,axis=-1)
    # Same gather on the raw (un-normalized) inputs.
    inputs_padding=flow.pad(inputs, paddings=[
        (0,0),(0,0),((self.time_dim-1)//2,(self.time_dim-1)//2),
        (0,0),(0,0)]
    )
    slice_list=[]
    for index in neighbor_time_index:
        temp=flow.slice(
            inputs_padding,
            begin=[None,None,int(index),None,None],
            size=[None,None,1,None,None]
        )
        slice_list.append(temp)
    neighbor=flow.concat(
        slice_list,axis=2
    )
    neighbor=flow.transpose(neighbor,perm=[0,2,3,4,1])
    neighbor=flow.reshape(neighbor,shape=[-1, h*w, c])
    # Warp each neighbor frame toward the reference frame using the
    # similarity weights (feature registration).
    neighbor_new=flow.matmul(similarity,neighbor)
    neighbor_new=flow.reshape(neighbor_new,shape=[b, t*(N-1), h, w, c])
    neighbor_new=flow.transpose(neighbor_new,perm=[0, 4, 1, 2, 3])
    # contrastive attention
    if self.contrastive_att:
        # Replicate inputs N-1 times in time to pair with each registered
        # neighbor.
        temp_input=flow.expand_dims(inputs,axis=3)
        temp_temp_input=temp_input
        for i in range(N-2):
            temp_input=flow.concat(
                inputs=[ temp_input,temp_temp_input],
                axis=3
            )
        temp_input=flow.reshape(temp_input,shape=[b, c, (N-1)*t, h, w])
        # Frozen 1x1x1 projections of reference and registered neighbor
        # (trainable=False), multiplied and squashed to a [0,1] gate.
        input_att=conv3d_layer(
            "conv3d_inputmapping_"+tempname,temp_input,self.out_channels,
            kernel_size=1, use_bias=False,trainable=False,weight_initializer=flow.kaiming_initializer(shape=temp_input.shape,mode="fan_out",nonlinearity="relu")
        )
        n_att=conv3d_layer(
            "conv3d_nmapping_"+tempname,neighbor_new,self.out_channels,
            kernel_size=1, use_bias=False,trainable=False,weight_initializer=flow.kaiming_initializer(shape=neighbor_new.shape,mode="fan_out",nonlinearity="relu")
        )
        temp_input=input_att*n_att
        contrastive_att_net=conv3d_layer(
            "conv3d_att_net_"+tempname,temp_input,1,
            kernel_size=1, use_bias=False,trainable=self.trainable,weight_initializer=flow.kaiming_initializer(shape=temp_input.shape,mode="fan_out",nonlinearity="relu")
        )
        contrastive_att_net=flow.math.sigmoid(contrastive_att_net)
        neighbor_new=flow.math.multiply(
            neighbor_new,contrastive_att_net
        )
    # integrating feature maps
    # Zero-filled variable of the final interleaved shape; original frames
    # and registered neighbors are added into their time slots below.
    init = flow.zeros_initializer()
    tempname=datetime.datetime.now().strftime('%Y-%m-%d-%H-%M-%S.%f')
    input_offset = flow.get_variable(
        "input_offset_"+tempname,
        shape=(b, c, N*t, h, w),
        initializer=init,
        dtype=inputs.dtype,
        trainable=self.trainable)
    # Index bookkeeping on the host; slicing/scatter done slot-by-slot.
    with flow.scope.placement("cpu", "0:0"):
        # Slots i with i % N == N//2 hold the original frames; the rest
        # hold the registered neighbors, in order.
        input_index=np.array(
            [i for i in range(t*N) if i%N==N//2]
        )
        neighbor_index=np.array(
            [i for i in range(t*N) if i%N!=N//2])
        input_offset_list=[]
        inputs_list=[]
        neighbor_new_list=[]
        for index in range(input_offset.shape[2]):
            temp=flow.slice(
                input_offset,
                begin=[None,None,int(index),None,None],
                size=[None,None,1,None,None]
            )
            input_offset_list.append(temp)
        for index in range(inputs.shape[2]):
            temp=flow.slice(
                inputs,
                begin=[None,None,int(index),None,None],
                size=[None,None,1,None,None]
            )
            inputs_list.append(temp)
        for index in range(neighbor_new.shape[2]):
            temp=flow.slice(
                neighbor_new,
                begin=[None,None,int(index),None,None],
                size=[None,None,1,None,None]
            )
            neighbor_new_list.append(temp)
        # Scatter original frames into their center slots...
        temp_index=0
        for index in input_index:
            input_offset_list[index]+=inputs_list[temp_index]
            temp_index+=1
        # ...and registered neighbors into the remaining slots.
        temp_index=0
        for index in neighbor_index:
            input_offset_list[index]+=neighbor_new_list[temp_index]
            temp_index+=1
        input_offset=flow.concat(
            input_offset_list,axis=2
        )
    return input_offset
def Causal_Self_Attention(x, config, name='csa'):
    """Multi-head causal self-attention (GPT-style).

    Input::
        x : embedded token input of shape [B, T, C]
            -- B is the batch size
            -- T is the sequence length (block_size)
            -- C is the embedding dimension (n_embd);
               C // n_head is the per-head dimension (hs)
        config : model config object (models.GPTConfig) providing
            n_embd, n_head, attn_pdrop, resid_pdrop

    Output::
        y : attention output [B, T, C], usable as the next layer's x

    Description::
        Core of the multi-head attention schema with a causal (lower-
        triangular) mask so position i can only attend to positions <= i.
        Code referred from:
            https://github.com/karpathy/minGPT/blob/master/mingpt/model.py
        Theory referred from:
            http://jalammar.github.io/illustrated-gpt2/
    """
    # Per-head dimension must divide the embedding evenly.
    assert config.n_embd % config.n_head == 0
    B, T, C = x.shape
    # Shared Kaiming initializer for the C x C Q/K/V projection kernels.
    kaiming_init_C = flow.kaiming_initializer(shape=(C, C))
    # Calculate query, key, values for all heads in batch and move the head
    # axis forward to be a batch dim: (B, T, C) -> (B, nh, T, hs), nh*hs = C.
    # query: representation of the current token, scored against all keys.
    query = flow.layers.dense(x,
                              units=config.n_embd,
                              kernel_initializer=kaiming_init_C,
                              name=(name + '_query'))
    query = flow.reshape(query, [B, T, config.n_head, C // config.n_head])
    query = flow.transpose(query, [0, 2, 1, 3])
    # key: label-like vectors for all tokens in the segment.
    key = flow.layers.dense(x,
                            units=config.n_embd,
                            kernel_initializer=kaiming_init_C,
                            name=(name + '_key'))
    key = flow.reshape(key, [B, T, config.n_head, C // config.n_head])
    key = flow.transpose(key, [0, 2, 1, 3])
    # value: actual token representations.
    # NOTE(review): layer name is name + 'value' (no underscore), unlike
    # '_query'/'_key' — runtime string left as-is; renaming would break
    # existing checkpoints.
    value = flow.layers.dense(x,
                              units=config.n_embd,
                              kernel_initializer=kaiming_init_C,
                              name=(name + 'value'))
    value = flow.reshape(value, [B, T, config.n_head, C // config.n_head])
    value = flow.transpose(value, [0, 2, 1, 3])
    # Causal self-attention scores, scaled by 1/sqrt(hs):
    # (B, nh, T, hs) x (B, nh, hs, T) -> (B, nh, T, T)
    att = flow.matmul(query, flow.transpose(
        key, [0, 1, 3, 2])) * (1.0 / math.sqrt(key.shape[-1]))
    # Build the causal mask: tril of a -1 constant is -1 on/below the
    # diagonal and 0 above; adding 1 yields 0 on/below and 1 strictly above
    # the diagonal, i.e. 1 marks the future positions to mask out.
    att_tril = flow.math.tril(
        flow.constant(value=int(-1),
                      dtype=flow.int32,
                      shape=(B, config.n_head, T, T),
                      name=name + "_ConstantLike_tril"))
    att_tril = att_tril + flow.ones_like(like=att_tril, dtype=flow.int32)
    # Future positions get -inf so softmax assigns them zero weight.
    att = flow.masked_fill(att, att_tril, float('-inf'))
    att = flow.nn.softmax(att, name=name + 'att')
    att = flow.nn.dropout(att, config.attn_pdrop)
    # Weighted sum of values: (B, nh, T, T) x (B, nh, T, hs) -> (B, nh, T, hs)
    y = flow.matmul(att, value)
    # Re-assemble heads side by side: (B, nh, T, hs) -> (B, T, C).
    y = flow.transpose(y, [0, 2, 1, 3])
    y = flow.reshape(y, [B, T, C])
    y = flow.nn.dropout(y, config.resid_pdrop)
    return y