def inception_block(x, filters=256):
    shrinkaged_filters = int(filters * INCEPTION_ENABLE_DEPTHWISE_SEPARABLE_CONV_SHRINKAGE)
    # Branch 0: 1x1x1 convolution.
    b0 = conv_bn_relu(x, filters=filters, kernel_size=(1, 1, 1))
    # Branch 1: 1x1x1 reduction followed by a 3x3x3 convolution.
    b1 = conv_bn_relu(x, filters=shrinkaged_filters, kernel_size=(1, 1, 1))
    b1 = conv_bn_relu(b1, filters=filters, kernel_size=(3, 3, 3))
    # Branch 2: 1x1x1 reduction followed by two stacked 3x3x3 convolutions.
    b2 = conv_bn_relu(x, filters=shrinkaged_filters, kernel_size=(1, 1, 1))
    b2 = conv_bn_relu(b2, filters=filters, kernel_size=(3, 3, 3))
    b2 = conv_bn_relu(b2, filters=filters, kernel_size=(3, 3, 3))
    # Branch 3: average pooling followed by a 1x1x1 projection.
    b3 = AveragePooling3D(pool_size=(3, 3, 3), strides=(1, 1, 1), padding='same')(x)
    b3 = conv_bn_relu(b3, filters=filters, kernel_size=(1, 1, 1))
    bs = [b0, b1, b2, b3]
    print('inception_block')
    print(b0.get_shape())
    print(b1.get_shape())
    print(b2.get_shape())
    print(b3.get_shape())
    if INCEPTION_ENABLE_SPATIAL_SEPARABLE_CONV:
        # Optional branch 4: spatially separable 5x5x5 convolution (5x1x1, 1x5x1, 1x1x5).
        b4 = conv_bn_relu(x, filters=shrinkaged_filters, kernel_size=(1, 1, 1))
        b4 = conv_bn_relu(b4, filters=filters, kernel_size=(5, 1, 1))
        b4 = conv_bn_relu(b4, filters=filters, kernel_size=(1, 5, 1))
        b4 = conv_bn_relu(b4, filters=filters, kernel_size=(1, 1, 5))
        bs.append(b4)
        print(b4.get_shape())
    # Concatenate all branches along the channel axis.
    x = Concatenate(axis=4)(bs)
    print(x.get_shape())
    return x
def reduction_block(x, filters=256):
    # Branch 0: strided 3x3x3 convolution for downsampling.
    b0 = conv_bn_relu(x, filters=filters, kernel_size=(3, 3, 3), strides=(2, 2, 2), padding='same')
    # Branch 1: 1x1x1 reduction, a 3x3x3 convolution, then a strided 3x3x3 convolution.
    b1 = conv_bn_relu(x, filters=filters, kernel_size=(1, 1, 1))
    b1 = conv_bn_relu(b1, filters=filters, kernel_size=(3, 3, 3))
    b1 = conv_bn_relu(b1, filters=filters, kernel_size=(3, 3, 3), strides=(2, 2, 2), padding='same')
    # Branch 2: strided max pooling followed by a 1x1x1 projection.
    b2 = MaxPooling3D(pool_size=(3, 3, 3), strides=(2, 2, 2), padding='same')(x)
    b2 = conv_bn_relu(b2, filters=filters, kernel_size=(1, 1, 1))
    bs = [b0, b1, b2]
    print('reduction_block')
    print(b0.get_shape())
    print(b1.get_shape())
    print(b2.get_shape())
    if INCEPTION_ENABLE_SPATIAL_SEPARABLE_CONV:
        # Optional branch 3: spatially separable convolutions followed by a strided 3x3x3 convolution.
        b3 = conv_bn_relu(x, filters=filters, kernel_size=(1, 1, 1))
        b3 = conv_bn_relu(b3, filters=filters, kernel_size=(5, 1, 1))
        b3 = conv_bn_relu(b3, filters=filters, kernel_size=(1, 5, 1))
        b3 = conv_bn_relu(b3, filters=filters, kernel_size=(1, 1, 5))
        b3 = conv_bn_relu(b3, filters=filters, kernel_size=(3, 3, 3), strides=(2, 2, 2), padding='same')
        bs.append(b3)
        print(b3.get_shape())
    # Concatenate all branches along the channel axis.
    x = Concatenate(axis=4)(bs)
    print(x.get_shape())
    return x
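# The Inception-style blocks above (and inception_base below) call a conv_bn_relu helper that is not
# defined in this file. The following is a minimal sketch of what it presumably looks like
# (Conv3D -> BatchNormalization -> ReLU, channels-last, 'same' padding so parallel branches keep
# matching spatial shapes); the signature defaults are assumptions, not the original implementation.
from keras.layers import Conv3D, BatchNormalization, Activation


def conv_bn_relu(x, filters, kernel_size=(3, 3, 3), strides=(1, 1, 1), padding='same'):
    """Assumed helper: 3D convolution followed by batch normalization and a ReLU."""
    x = Conv3D(filters=filters, kernel_size=kernel_size, strides=strides, padding=padding)(x)
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    return x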
def lstm_models(self, mode="cat"): self.mode = mode emb = Embedding(self.nb_words + 1, self.WORD_EMBEDDING_DIM, weights=[self.word_embedding_matrix], input_length=self.MAX_SEQUENCE_LENGTH, trainable=False) question_1 = Input(shape=(self.MAX_SEQUENCE_LENGTH, )) question_2 = Input(shape=(self.MAX_SEQUENCE_LENGTH, )) q1 = LSTM(self.SENT_EMBEDDING_DIM)(emb(question_1)) q2 = LSTM(self.SENT_EMBEDDING_DIM)(emb(question_2)) if mode == 'cat': res = Concatenate(axis=1)([q1, q2]) print("concate.shape: ", res.get_shape()) elif mode == 'dis_agl': def Manhattan_distance(A, B): return K.sum(K.abs(A - B), axis=1, keepdims=True) merged_dist = Merge(mode=lambda x: Manhattan_distance(x[0], x[1]), output_shape=lambda inp_shp: (inp_shp[0][0], 1))([q1, q2]) merged_agle = Multiply()([q1, q2]) res = Concatenate(axis=1)([merged_dist, merged_agle]) res = Dense(128, activation='relu')(res) label = Dense(1, activation='sigmoid')(res) model = Model(inputs=[question_1, question_2], outputs=label) model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) print(model.summary()) return model
def inception_base(x):
    x = conv_bn_relu(x, filters=32)
    x = conv_bn_relu(x, filters=32)
    x = conv_bn_relu(x, filters=64)
    b0 = MaxPooling3D(pool_size=(2, 2, 2))(x)
    b1 = conv_bn_relu(x, 64, strides=(2, 2, 2))
    x = Concatenate(axis=4)([b0, b1])
    print('inception_base')
    print(b0.get_shape())
    print(b1.get_shape())
    print(x.get_shape())
    return x
def dense_block(x):
    print('dense block')
    print(x.get_shape())
    for _ in range(DENSE_NET_BLOCK_LAYERS):
        y = x
        if DENSE_NET_ENABLE_BOTTLENETCK:
            y = bn_relu_conv(y, filters=DENSE_NET_GROWTH_RATE, kernel_size=(1, 1, 1))
        y = bn_relu_conv(y, filters=DENSE_NET_GROWTH_RATE, kernel_size=(3, 3, 3))
        x = Concatenate(axis=4)([x, y])
        print(x.get_shape())
    return x
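# dense_block above also relies on a bn_relu_conv helper (DenseNet-style pre-activation ordering)
# that is not defined in this file. A sketch under the same assumptions as the conv_bn_relu sketch
# above; this is not the original implementation.
from keras.layers import Conv3D, BatchNormalization, Activation


def bn_relu_conv(x, filters, kernel_size=(3, 3, 3), strides=(1, 1, 1), padding='same'):
    """Assumed helper: batch normalization -> ReLU -> 3D convolution (pre-activation)."""
    x = BatchNormalization()(x)
    x = Activation('relu')(x)
    x = Conv3D(filters=filters, kernel_size=kernel_size, strides=strides, padding=padding)(x)
    return x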
def get_multiOutputRNN_model(input_shape, time_steps):
    '''
    Arguments:
        input_shape : shape of the input (this time not as a list but as a single tensor,
                      so that all time-steps can be fed at once during training)
        time_steps  : total number of time-steps in the RNN sequence
    Description:
        Returns the many-to-many RNN model, in which the packet-loss (and other PerfSONAR
        metrics) is fed at each time-step and a prediction of the next packet loss is made
        at each time-step. With this architecture we get more gradient signal per sequence,
        so the vanishing-gradient problem we encounter in a large RNN with only one output
        per sequence can be tackled.
    '''
    # Placeholder definitions (fed later, feed-dict style, at fitting time).
    # X_all has dimension (None, time_steps, input_dim of each time-step);
    # None (the batch dimension) is handled automatically by Keras.
    X_all = Input(shape=input_shape, dtype=tf.float32, name='X_all')
    # The initial pre-conditioning of the sequence (currently of shape 1, like the output of each time-step).
    h_initial = Input(shape=(1, ), dtype=tf.float32, name='ho')
    # For the final linking in the model as inputs and outputs.
    X_inputs = [h_initial, X_all]

    # The per-step outputs are accumulated into one big tensor by concatenation
    # (cleaner than carrying around a list of separate outputs).
    # Traverse the time-steps to create the unfolded version of the RNN.
    for t in range(time_steps):
        X_t = Lambda(give_timestep_input, arguments={'t': t})(X_all)
        if t == 0:
            Y = multiOutputRNN_oneStep(X_t, h_initial, t + 1)
            Y_outputs = Y
        else:
            Y = multiOutputRNN_oneStep(X_t, Y, t + 1)
            Y_outputs = Concatenate(axis=-1)([Y_outputs, Y])
    # Restore the trailing feature dimension on the stacked outputs.
    Y_outputs = Lambda(expand_dims)(Y_outputs)

    # Now merge the whole graph into one model.
    model = Model(inputs=X_inputs, outputs=Y_outputs, name='RNN-V1')
    return model
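# get_multiOutputRNN_model above uses two Lambda helpers, give_timestep_input and expand_dims,
# that are defined elsewhere in the project. The definitions below are plausible one-liners
# consistent with how they are called (slice out time-step t; add back a trailing feature
# dimension); they are assumptions, not the original implementations.
from keras import backend as K


def give_timestep_input(X_all, t):
    # (None, time_steps, input_dim) -> (None, input_dim) for time-step t.
    return X_all[:, t, :]


def expand_dims(Y_outputs):
    # (None, time_steps) -> (None, time_steps, 1).
    return K.expand_dims(Y_outputs, axis=-1)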
def text_rcnn_model(input_shape,
                    embedding_matrix,
                    max_length,
                    num_features,
                    num_classes,
                    input_tensor=None,
                    emb_size=300,
                    filter_num=64,
                    rnn_units=128,
                    trainable=False):
    inputs = Input(name='inputs', shape=[max_length], tensor=input_tensor)
    if embedding_matrix is None:
        layer = Embedding(input_dim=num_features,
                          output_dim=emb_size,
                          input_length=input_shape)(inputs)
    else:
        # num_features = MAX_VOCAB_SIZE
        num_features = embedding_matrix.shape[0]
        layer = Embedding(input_dim=num_features,
                          output_dim=emb_size,
                          weights=[embedding_matrix],
                          trainable=trainable)(inputs)

    # layer_cell = GRU
    layer_cell = CuDNNGRU  # faster than the plain GRU cell
    embedding_output = layer

    # Concatenate the bidirectional RNN output with the embedding (the recurrent-convolution context).
    x_feb = Bidirectional(layer_cell(units=rnn_units, return_sequences=True))(embedding_output)
    x_feb = Concatenate(axis=2)([x_feb, embedding_output])

    # Apply several convolution kernel sizes to the concatenated features.
    x_feb = Dropout(rate=0.5)(x_feb)
    dim_2 = K.int_shape(x_feb)[2]  # embedding size after the Concatenate
    len_max = max_length
    x_feb_reshape = Reshape((len_max, dim_2, 1))(x_feb)

    # Extract n-gram features with max pooling (average pooling is usually not used here).
    conv_pools = []
    filters = [2, 3, 4, 5]
    for filter_size in filters:
        conv = Conv2D(filters=filter_num,
                      kernel_size=(filter_size, dim_2),
                      padding='valid',
                      kernel_initializer='normal',
                      activation='relu')(x_feb_reshape)
        print("check conv", conv.get_shape())
        pooled = MaxPooling2D(pool_size=(len_max - filter_size + 1, 1),
                              strides=(1, 1),
                              padding='valid')(conv)
        print("check pooled", pooled.get_shape())
        conv_pools.append(pooled)

    # Concatenate the pooled n-gram features.
    x = Concatenate()(conv_pools)
    print("check concatenate x", x.get_shape())
    x = Flatten()(x)

    output = Dense(units=num_classes, activation='softmax')(x)
    model = keras.models.Model(inputs=inputs, outputs=output)
    return model
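# A minimal, hypothetical usage sketch for text_rcnn_model; the vocabulary size, sequence length,
# class count and random dummy data below are illustrative assumptions. Note that the model uses
# CuDNNGRU, so it only builds and trains on a GPU-enabled TensorFlow backend.
import keras
import numpy as np

rcnn = text_rcnn_model(input_shape=100,
                       embedding_matrix=None,
                       max_length=100,
                       num_features=20000,
                       num_classes=10)
rcnn.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
dummy_x = np.random.randint(0, 20000, size=(32, 100))
dummy_y = keras.utils.to_categorical(np.random.randint(0, 10, size=(32, )), num_classes=10)
rcnn.fit(dummy_x, dummy_y, batch_size=8, epochs=1)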
def CombCN(input_frame, input_video, video_size=None, sampling_frame=8, frame_net_mid_depth=4):
    # frame_3DCN => total frames, or just the frame of Vk in Vin
    Activ = lambda x: LeakyReLU(alpha=0.2)(x)
    Bat = lambda x: BatchNormalization()(x)

    if not video_size:
        W = 320
        H = 240
    else:
        W = video_size[0]
        H = video_size[1]

    if input_frame is None:
        input_frame = Input(shape=(W, H, 3))
    if input_video is None:
        input_video = Input(shape=(sampling_frame, W // 2, H // 2, 3))
    print(input_frame.get_shape())

    e0 = Conv2D(filters=32, padding='same', kernel_size=5)(input_frame)
    # e0 = Bat(e0)
    e0 = Activ(e0)
    print(e0.get_shape())

    # No concatenation while encoding in the 3DCN, but concatenation is used for encoding
    # in the combination part.
    e0_C = Conv2D(filters=64, padding='same', kernel_size=3, strides=2)(e0)
    e0_C = Bat(e0_C)
    e0_C = Activ(e0_C)
    print(e0_C.get_shape())

    # skip_subnet = CN3D(video_info=None, input_video=input_video, sampling_frame=8, vid_net_mid_depth=3)
    skip_subnet = input_video
    size_subnet = skip_subnet.get_shape()

    # Is this what the paper says? Not sure.
    skip_subnet = Reshape((int(W / 2), int(H / 2), int(size_subnet[1] * size_subnet[4])))(skip_subnet)
    skip_subnet = Conv2D(filters=128, padding='same', kernel_size=5, strides=1, name='subnet')(skip_subnet)
    skip_subnet = Bat(skip_subnet)
    skip_subnet = Activ(skip_subnet)
    skip_subnet = Conv2D(filters=64, padding='same', kernel_size=3, strides=1, name='subnet_2')(skip_subnet)
    skip_subnet = Bat(skip_subnet)
    skip_subnet = Activ(skip_subnet)

    e0_C = Concatenate()([e0_C, skip_subnet])
    print(e0_C.get_shape())

    e1 = Conv2D(filters=256, padding='same', kernel_size=3)(e0_C)
    e1 = Bat(e1)
    e1 = Activ(e1)
    print(e1.get_shape())

    e1_C = Conv2D(filters=512, padding='same', kernel_size=4, strides=2)(e1)
    e1_C = Bat(e1_C)
    e1_C = Activ(e1_C)
    print(e1_C.get_shape())

    e2 = Conv2D(filters=512, padding='same', kernel_size=3)(e1_C)
    e2 = Bat(e2)
    e2 = Activ(e2)
    print(e2.get_shape())

    e2_C = Conv2D(filters=512, padding='same', kernel_size=4, strides=2)(e2)
    e2_C = Bat(e2_C)
    e2_C = Activ(e2_C)
    print(e2_C.get_shape())

    # Dilated middle layers with dense-style skip connections.
    fc_mid = e2_C
    fc_prev = e2_C
    p_num = 2
    for i in range(frame_net_mid_depth):
        fc_mid = Conv2D(filters=512, dilation_rate=p_num, kernel_size=3, padding='same')(fc_mid)
        fc_mid = Bat(fc_mid)
        fc_mid = Activ(fc_mid)
        f_temp = fc_mid
        fc_mid = Concatenate()([fc_mid, fc_prev])
        fc_prev = f_temp
        print(fc_mid.get_shape())
        p_num = p_num * 2

    # fc_mid = Deconvolution3D(strides=2, filters=64, kernel_size=4, padding='same')(fc_mid)
    d0 = Concatenate()([fc_prev, e2_C])
    d0 = Conv2D(strides=1, filters=512, kernel_size=3, padding='same')(d0)
    d0 = Bat(d0)
    d0 = Activ(d0)
    print(d0.get_shape())

    d0_C = UpSampling2D()(d0)
    d0_C = Concatenate()([d0_C, e2])
    d0_C = Conv2D(strides=1, filters=512, kernel_size=4, padding='same')(d0_C)
    d0_C = Bat(d0_C)
    d0_C = Activ(d0_C)
    print(d0_C.get_shape())

    d0_CC = Concatenate()([d0_C, e1_C])
    d0_CC = Conv2D(filters=512, padding='same', kernel_size=3)(d0_CC)
    d0_CC = Bat(d0_CC)
    d0_CC = Activ(d0_CC)
    print(d0_CC.get_shape())

    d1 = UpSampling2D()(d0_CC)
    d1 = Concatenate()([d1, e1])
    d1 = Conv2D(strides=1, filters=256, kernel_size=4, padding='same')(d1)
    d1 = Bat(d1)
    d1 = Activ(d1)
    print(d1.get_shape())

    d1_C = Concatenate()([d1, e0_C])
    d1_C = Conv2D(filters=128, padding='same', kernel_size=3)(d1_C)
    d1_C = Bat(d1_C)
    d1_C = Activ(d1_C)
    print(d1_C.get_shape())

    d1_CC = UpSampling2D()(d1_C)
    d1_CC = Conv2D(strides=1, filters=64, kernel_size=4, padding='same')(d1_CC)
    d1_CC = Bat(d1_CC)
    d1_CC = Activ(d1_CC)
    print(d1_CC.get_shape())

    d1_CCC = Concatenate()([d1_CC, e0])
    d1_CCC = Conv2D(strides=1, filters=48, kernel_size=4, padding='same')(d1_CCC)
    d1_CCC = Bat(d1_CCC)
    d1_CCC = Activ(d1_CCC)
    print(d1_CCC.get_shape())

    d_out = Conv2D(filters=3, padding='same', kernel_size=3)(d1_CCC)
    # d_out = Bat(d_out)
    d_out = Activation('tanh')(d_out)
    print(d_out.get_shape())
    return d_out
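# A hypothetical wiring sketch (the shapes and the surrounding Model/compile calls are assumptions):
# CombCN only returns the output tensor, so the frame and video inputs have to be created explicitly
# and passed in, then wrapped into a trainable Model.
from keras.layers import Input
from keras.models import Model

frame_in = Input(shape=(320, 240, 3))         # the single frame to be completed
video_in = Input(shape=(8, 160, 120, 3))      # the sub-sampled, half-resolution video
completed = CombCN(frame_in, video_in, video_size=(320, 240), sampling_frame=8)
combcn_model = Model(inputs=[frame_in, video_in], outputs=completed)
combcn_model.compile(optimizer='adam', loss='mae')  # the reconstruction loss here is an assumption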
def _build_layers_v2(self, input_dict, num_outputs, options):
    TRANSFORMER_SIMPLICIAL_DIM = options["custom_options"]["transformer_simplicial_model_dim"]
    TRANSFORMER_MODEL_DIM = options["custom_options"]["transformer_model_dim"]
    TRANSFORMER_NUM_HEADS = options["custom_options"]["transformer_num_heads"]
    TRANSFORMER_DEPTH = options["custom_options"]["transformer_depth"]
    CONV_PADDING = options["custom_options"]["conv_padding"]
    NUM_VIRTUAL_ENTITIES = options["custom_options"]["num_virtual_entities"]

    # For detailed comments see the base agent.
    inputs = input_dict["obs"]
    sess = tf.get_default_session()
    K.set_session(sess)

    attention_layer = MultiHeadSelfAttentionZambaldi(
        name='self_attention',
        num_heads=TRANSFORMER_NUM_HEADS,
        use_masking=False,
        dropout=0,
        compression_window_size=None,
        num_virtual_entities=NUM_VIRTUAL_ENTITIES)
    attention_layer_2simplex = MultiHeadSelfAttentionSimplicial(
        name='self_2attention',
        num_heads=1,
        d_simp_model=TRANSFORMER_SIMPLICIAL_DIM,
        use_masking=False,
        dropout=0,
        compression_window_size=None,
        num_virtual_entities=NUM_VIRTUAL_ENTITIES)
    dense_layer1 = layers.Dense(TRANSFORMER_MODEL_DIM, activation='relu')
    dense_layer2 = layers.Dense(TRANSFORMER_MODEL_DIM)

    def transformer_block(input):
        a = LayerNormalization()(input)
        a1 = attention_layer(a)  # a1 = attention(h), shape (-1, seq_len, TRANSFORMER_MODEL_DIM)
        a2 = attention_layer_2simplex(a)  # shape (-1, seq_len, TRANSFORMER_SIMPLICIAL_DIM)
        a2 = LayerNormalization()(a2)
        ac = Concatenate()([a1, a2])  # shape (-1, seq_len, TRANSFORMER_MODEL_DIM + TRANSFORMER_SIMPLICIAL_DIM)
        b = dense_layer1(ac)
        b2 = dense_layer2(b)  # b = ff(ac)
        r = layers.Add()([input, b2])
        Hprime = LayerNormalization()(r)
        return Hprime

    # CONVOLUTIONS ------
    x = layers.Lambda(lambda x: x / 255)(inputs)  # rescale RGB to [0,1]
    x = layers.Conv2D(12, (2, 2), activation='relu', padding=CONV_PADDING)(x)
    x = layers.Conv2D(24, (2, 2), activation='relu', padding=CONV_PADDING)(x)  # output shape (-1, num_rows, num_cols, 24)
    x = layers.Dense(
        TRANSFORMER_MODEL_DIM - 2, activation=None, use_bias=False
    )(x)  # output shape (-1, num_rows, num_cols, TRANSFORMER_MODEL_DIM-2)

    # POSITION EMBEDDING -----
    num_rows, num_cols, d_model = x.get_shape().as_list()[-3:]
    ps = np.zeros([num_rows, num_cols, 2], dtype=K.floatx())  # shape (12,13,2)
    for ty in range(num_rows):
        for tx in range(num_cols):
            ps[ty, tx, :] = [(2 / (num_rows - 1)) * ty - 1,
                             (2 / (num_cols - 1)) * tx - 1]
    ps_expand = K.expand_dims(K.constant(ps), axis=0)  # shape (1, num_rows, num_cols, 2)
    ps_tiled = K.tile(ps_expand, [K.shape(x)[0], 1, 1, 1])  # shape (None, num_rows, num_cols, 2)

    # Concatenate (None, num_rows, num_cols, TRANSFORMER_MODEL_DIM-2) with (None, num_rows, num_cols, 2)
    # to get (None, num_rows, num_cols, TRANSFORMER_MODEL_DIM).
    x = Concatenate(axis=3)([x, ps_tiled])
    x = layers.Reshape((num_rows * num_cols, d_model + 2))(x)  # shape (None, num_rows*num_cols, d_model+2)
    # NOTE: the batch dimension is preserved by Reshape, see
    # https://www.tensorflow.org/api_docs/python/tf/keras/layers/Reshape

    # We now add some virtual entities, which are initialised randomly.
    tokens = np.arange(NUM_VIRTUAL_ENTITIES).reshape(
        (1, NUM_VIRTUAL_ENTITIES))  # [[0, 1, 2, ..., NUM_VIRTUAL_ENTITIES-1]]
    tokens = K.constant(tokens)
    ve = layers.Embedding(
        input_dim=NUM_VIRTUAL_ENTITIES,
        output_dim=d_model + 2)(tokens)  # shape (1, NUM_VIRTUAL_ENTITIES, d_model+2)
    ve_tiled = K.tile(ve, [K.shape(x)[0], 1, 1])
    x = Concatenate(axis=1)([x, ve_tiled])

    # TRANSFORMER -----
    for i in range(TRANSFORMER_DEPTH):
        x = transformer_block(x)

    # The output of the simplicial Transformer includes the virtual entities, which we now want
    # to remove. The current tensor is of shape
    # (None, num_rows*num_cols + NUM_VIRTUAL_ENTITIES, TRANSFORMER_MODEL_DIM).
    x = x[:, :-NUM_VIRTUAL_ENTITIES, :]

    # MAX-POOLING -----
    # From p.4: "The E~ matrix, with shape Nxf, is reduced to an f-dimensional vector by max-pooling
    # over the entity dimension. This pooled vector is then passed to a small MLP..."
    num_entities, d_model = x.get_shape().as_list()[-2:]
    x = layers.MaxPooling1D(pool_size=num_entities)(x)
    x = layers.Flatten()(x)

    # FULLY-CONNECTED LAYERS ----
    x = layers.Dense(256, activation='relu')(x)
    x = layers.Dense(256, activation='relu')(x)
    x = layers.Dense(256, activation='relu')(x)
    x = layers.Dense(256, activation='relu')(x)
    output_tensor = layers.Dense(4)(x)  # final output is logits

    return output_tensor, x
def _build_layers_v2(self, input_dict, num_outputs, options): """Define the layers of a custom model. Arguments: input_dict (dict): Dictionary of input tensors, including "obs", "prev_action", "prev_reward", "is_training". num_outputs (int): Output tensor must be of size [BATCH_SIZE, num_outputs]. options (dict): Model options. Returns: (outputs, feature_layer): Tensors of size [BATCH_SIZE, num_outputs] and [BATCH_SIZE, desired_feature_size]. """ TRANSFORMER_MODEL_DIM = options["custom_options"][ "transformer_model_dim"] TRANSFORMER_NUM_HEADS = options["custom_options"][ "transformer_num_heads"] TRANSFORMER_DEPTH = options["custom_options"]["transformer_depth"] CONV_PADDING = options["custom_options"]["conv_padding"] # Agent architecture p.15 of Zambaldi et al # "The input module contained two convolutional layers with 12 and 24 kernels, 2 × 2 kernel sizes # and a stride of 1, followed by a rectified linear unit (ReLU) activation function. The output # was tagged with two extra channels indicating the spatial position (x and y) of each cell in # the feature map using evenly spaced values between −1 and 1. This was passed to the relational # module, consisting of relational blocks, with shared parameters. Queries, keys and values were # produced by 2 to 4 attention heads and had an embedding size (d) of 64. The output of this module # was aggregated using a feature-wise max pooling function and passed to 4 fully connected layers, # each followed by a ReLU. Policy logits (pi, size 4) and baseline function (B, size 1) were produced # by a linear projection. The policy logits were normalized and used as multinomial distribution from # which the action (a) was sampled." # NOTE: there is no dropout in Zambaldi et al inputs = input_dict["obs"] sess = tf.get_default_session() K.set_session(sess) # NOTE: the weights in the self-attention mechanism # and feed-forward layers are shared between all Transformer blocks (as in # Zambaldi et al, but unlike every other Transformer paper) # The published version of Zambaldi et al does not tell us what the MLP g_\theta is, but # # - in Santoro et al "A simple neural network module for relational reasoning" # the analogous g_\theta is is a four-layer MLP with 256 dimensional hidden layers # with ReLU non-linearities # - in Keras-Transformer the default is a two layer model with hidden dimension # equal to 4 * the embedding dimension, which in the case of 64 dimensional embeddings # gives 256 (this is also the convention in the Sparse Transformer paper) # - in the first version of Zambaldi et al they write "passed to a multilayer perceptron # (2-layer MLP with ReLU non-linearities) with the same layers sizes as ei" # # Hence, attempting to follow Zambaldi, we use layer size TRANSFORMER_MODEL_DIM # (in v6 we used 4 times this) attention_layer = MultiHeadSelfAttentionZambaldi( name='self_attention', num_heads=TRANSFORMER_NUM_HEADS, use_masking=False, dropout=0, compression_window_size=None) dense_layer1 = layers.Dense(TRANSFORMER_MODEL_DIM, activation='relu') dense_layer2 = layers.Dense(TRANSFORMER_MODEL_DIM) def transformer_block(input): #_, seq_len, d_model = K.int_shape(input) a = LayerNormalization()(input) a = attention_layer( a ) # a = attention(h) has shape -1, seq_len, TRANSFORMER_MODEL_DIM b = dense_layer1(a) b = dense_layer2(b) # b = ff(a) r = layers.Add()([input, b]) Hprime = LayerNormalization()(r) return Hprime # CONVOLUTIONS ------ # # Question: should we use max-pooling here? 
It seems not, as the downsampling in the # Santoro et al paper "A simple neural network module for relational reasoning" # occurs using 3x3 patches with stride 2, rather than max-pooling, and it is not # mentioned anywhere in the papers. # # It is worth comparing to e.g. the models for deep RL on 3D environments in the IMPALA # paper, see Figure 3, which also have no max-pooling layers and downsample instead using # strided convolutional layers. You'll see there also they prefer hidden layers of width # 256 for the FC layers after the initial convolutional layers, in processing visual scenes. # So the Zambaldi paper is consistent with their other work on deep RL, in terms of the model. x = layers.Lambda(lambda x: x / 255)(inputs) # rescale RGB to [0,1] x = layers.Conv2D(12, (2, 2), activation='relu', padding=CONV_PADDING)(x) x = layers.Conv2D(24, (2, 2), activation='relu', padding=CONV_PADDING)( x) # output shape -1, num_rows, num_cols, 62 x = layers.Dense( TRANSFORMER_MODEL_DIM - 2, activation=None, use_bias=False )(x) # output shape -1, num_rows, num_cols, TRANSFORMER_MODEL_DIM-2 # NOTE: we are using the default "valid" padding, so actually our width and height decrease # by one in each convolutional layer # POSITION EMBEDDING ----- # # Here we follow Zambaldi et al, rather than the standard Transformer # positional embeddings num_rows, num_cols, d_model = x.get_shape().as_list()[-3:] ps = np.zeros([num_rows, num_cols, 2], dtype=K.floatx()) # shape (12,13,2) for ty in range(num_rows): for tx in range(num_cols): ps[ty, tx, :] = [(2 / (num_rows - 1)) * ty - 1, (2 / (num_cols - 1)) * tx - 1] ps_expand = K.expand_dims(K.constant(ps), axis=0) # shape (1,num_rows,num_cols,2) ps_tiled = K.tile( ps_expand, [K.shape(x)[0], 1, 1, 1]) # shape (None,num_rows,num_cols,2) # (None,num_rows,num_cols,62) concatenate with (None,num_rows,num_cols,2) # to get (None,num_rows,num_cols,TRANSFORMER_MODEL_DIM) x = Concatenate(axis=3)([x, ps_tiled]) x = layers.Reshape((num_rows * num_cols, d_model + 2))(x) # TRANSFORMER ----- for i in range(TRANSFORMER_DEPTH): x = transformer_block(x) # MAX-POOLING ----- # from p.4 "The E~ matrix, with shape Nxf is reudced to an f-dimensional vector by max-pooling # over the entity dimension. This pooled vector is then passed to a small MLP..." num_entities, d_model = x.get_shape().as_list()[-2:] x = layers.MaxPooling1D(pool_size=num_entities)(x) x = layers.Flatten()(x) # FULLY-CONNECTED LAYERS ---- x = layers.Dense(256, activation='relu')(x) x = layers.Dense(256, activation='relu')(x) x = layers.Dense(256, activation='relu')(x) x = layers.Dense(256, activation='relu')(x) output_tensor = layers.Dense(4)(x) # final output is logits return output_tensor, x
    input_dis1_2 = Input(shape=(word_length[2], ),
                         dtype='int32',
                         name='input_dis1_2')
    dis_fea1_2 = disembedding(input_dis1_2)
    input_dis2_2 = Input(shape=(word_length[2], ),
                         dtype='int32',
                         name='input_dis2_2')
    dis_fea2_2 = disembedding(input_dis2_2)

    print([
        x1.get_shape().as_list()
        for x1 in [word_fea_0, pos_fea_0, dis_fea1_0, dis_fea2_0]
    ])
    emb_merge_0 = Concatenate()([word_fea_0, pos_fea_0, dis_fea1_0, dis_fea2_0])
    print("emb_merge_0", emb_merge_0.get_shape().as_list())
    emb_merge_1 = Concatenate()([word_fea_1, pos_fea_1, dis_fea1_1, dis_fea2_1])
    emb_merge_2 = Concatenate()([word_fea_2, pos_fea_2, dis_fea1_2, dis_fea2_2])

    input_shortest_word = Input(shape=(s['shortest_part_length'], ),
                                dtype='int32',
                                name='input_shortest_word')
    shortest_word_fea = wordembedding(input_shortest_word)
    input_shortest_pos = Input(shape=(s['shortest_part_length'], ),
                               dtype='int32',
                               name='input_shortest_pos')
    shortest_pos_fea = posembedding(input_shortest_pos)