def build(self, input_shape):
    # Position-wise Feed Forward (4x expansion, projected back by dense_2) #
    self.dense_1 = Dense(4 * self.output_dim,
                         kernel_initializer=tn(stddev=self.init_range))
    self.dense_1.build(input_shape)
    self._trainable_weights += self.dense_1.trainable_weights

    self.dense_2 = Dense(self.output_dim,
                         kernel_initializer=tn(stddev=self.init_range))
    self.dense_2.build((input_shape[0], input_shape[1], 4 * self.output_dim))
    self._trainable_weights += self.dense_2.trainable_weights

    # Multi Head Attention #
    self.multihead_attention = MultiHeadAttention(self.attention_dim,
                                                  self.n_heads,
                                                  self.init_range)
    self.multihead_attention.build(input_shape)
    self._trainable_weights += self.multihead_attention.trainable_weights

    # LayerNorm #
    self.layer_normalization_1 = LayerNormalization()
    self.layer_normalization_1.build(input_shape)
    self._trainable_weights += self.layer_normalization_1.trainable_weights

    # LayerNorm #
    self.layer_normalization_2 = LayerNormalization()
    self.layer_normalization_2.build(input_shape)
    self._trainable_weights += self.layer_normalization_2.trainable_weights

    # Gelu #
    self.gelu = Gelu()
    self.gelu.build((input_shape[0], input_shape[1], 4 * self.output_dim))

    super(SentenceEncoderBlock, self).build(input_shape)

def create_policy_network(self, state_dim, action_dim):
    # build network model
    S = Input(shape=[state_dim])
    h0 = Dense(self.HIDDEN1_UNITS, activation='elu',
               kernel_initializer=tn(mean=0.0, stddev=1e-4))(S)
    h1 = Dense(self.HIDDEN2_UNITS, activation='elu',
               kernel_initializer=tn(mean=0.0, stddev=1e-4))(h0)
    V = Dense(action_dim, activation='tanh',
              kernel_initializer=tn(mean=0.0, stddev=1e-4))(h1)
    model = Model(inputs=S, outputs=V)
    adam = Adam(lr=self.lr)
    model.compile(loss=self.BATCH_LOSS, optimizer=adam)
    return model, S

def create_qvalue_network(self, state_dim, action_dim):
    # build network model
    S = Input(shape=[state_dim])
    w1 = Dense(self.HIDDEN1_UNITS, activation='elu',
               kernel_initializer=tn(mean=0.0, stddev=1e-2))(S)
    w2 = Dense(self.HIDDEN1_UNITS, activation='linear',
               kernel_initializer=tn(mean=0.0, stddev=1e-2))(w1)
    A = Input(shape=[action_dim])
    a1 = Dense(self.HIDDEN2_UNITS, activation='linear',
               kernel_initializer=tn(mean=0.0, stddev=1e-2))(A)
    h1 = layers.concatenate([w2, a1])
    h2 = Dense(self.HIDDEN2_UNITS, activation='elu',
               kernel_initializer=tn(mean=0.0, stddev=1e-2))(h1)
    V = Dense(1, activation='linear')(h2)
    model = Model(inputs=[S, A], outputs=V)
    adam = Adam(lr=self.lr)
    model.compile(loss=self.BATCH_LOSS, optimizer=adam)
    return model, A, S

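# Hedged sketch (TF1-style graph code, matching the Keras idiom above) of why the
# builders return the input tensors S and A together with the models: an
# actor-critic agent (e.g. DDPG) typically needs them to wire the deterministic
# policy-gradient update. 'agent', 'state_dim' and 'action_dim' are illustrative
# placeholders, not names taken from the original class.
import tensorflow as tf

actor, actor_state = agent.create_policy_network(state_dim, action_dim)
critic, critic_action, critic_state = agent.create_qvalue_network(state_dim, action_dim)

# dQ/da from the critic, evaluated at run time on (state, action) batches.
action_grads = tf.gradients(critic.output, critic_action)

# Actor parameter gradients weighted by -dQ/da (gradient ascent on Q); the
# dQ/da values computed above are fed through this placeholder at run time.
action_gradient = tf.placeholder(tf.float32, [None, action_dim])
params_grad = tf.gradients(actor.output, actor.trainable_weights, -action_gradient)
optimize = tf.train.AdamOptimizer(agent.lr).apply_gradients(
    zip(params_grad, actor.trainable_weights))
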
def build(self, input_shape):
    self.wq = self.add_weight(shape=(input_shape[-1], self.d), name="Wq",
                              initializer=tn(stddev=self.init_range),
                              trainable=True)
    self.wk = self.add_weight(shape=(input_shape[-1], self.d), name="Wk",
                              initializer=tn(stddev=self.init_range),
                              trainable=True)
    self.wv = self.add_weight(shape=(input_shape[-1], self.d), name="Wv",
                              initializer=tn(stddev=self.init_range),
                              trainable=True)
    super(SelfAttention, self).build(input_shape)

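# A minimal, self-contained NumPy sketch of the scaled dot-product attention that
# the Wq/Wk/Wv projections above feed into. It illustrates the technique only; it
# is not the layer's actual call() (which is not shown here), and the function
# name is an assumption.
import numpy as np

def scaled_dot_product_attention_sketch(x, wq, wk, wv):
    # x: (seq_len, input_dim); wq/wk/wv: (input_dim, d)
    q, k, v = x @ wq, x @ wk, x @ wv
    scores = q @ k.T / np.sqrt(q.shape[-1])           # (seq_len, seq_len)
    weights = np.exp(scores - scores.max(axis=-1, keepdims=True))
    weights /= weights.sum(axis=-1, keepdims=True)    # row-wise softmax
    return weights @ v                                # (seq_len, d)
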
def build(self, input_shape):
    self.w = self.add_weight(shape=(self.d * self.n_heads, input_shape[-1]),
                             name="w",
                             initializer=tn(stddev=self.init_range),
                             trainable=True)
    for i in range(self.n_heads):
        self.heads[i] = SelfAttention(self.d, self.init_range)
        self.heads[i].build(input_shape)
        self._trainable_weights += self.heads[i].trainable_weights
    super(MultiHeadAttention, self).build(input_shape)

def build(self, input_shape):
    # Query Layers #
    self.query_layers = []
    for i in range(self.n_heads):
        self.query_layers.append(
            Dense(self.k_dim, kernel_initializer=tn(stddev=self.init_range)))
    for i in range(self.n_heads):
        self.query_layers[i].build(input_shape)
        self._trainable_weights += self.query_layers[i].trainable_weights

    # Value Embeddings #
    self.values = Embedding(self.memory_size ** 2, self.output_dim,
                            embeddings_initializer=tn(stddev=self.init_range))
    self.values.build(input_shape)
    self._trainable_weights += self.values.trainable_weights

    # Keys #
    self._trainable_weights += [self.keys]

    super(PKM, self).build(input_shape)

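# A minimal, self-contained NumPy sketch of the product-key memory lookup that
# the query layers, keys and value embeddings above implement (after Lample et
# al., "Large Memory Layers with Product Keys"). It shows why the value table
# holds memory_size ** 2 slots; it is not the layer's actual call(), and the
# names and the knn parameter are assumptions.
import numpy as np

def product_key_lookup_sketch(query, sub_keys_1, sub_keys_2, values, knn=32):
    # query: (k_dim,); sub_keys_*: (memory_size, k_dim // 2)
    # values: (memory_size ** 2, output_dim)
    memory_size = sub_keys_1.shape[0]
    q1, q2 = np.split(query, 2)

    # Score each query half against its sub-key set and keep the top knn.
    s1, s2 = sub_keys_1 @ q1, sub_keys_2 @ q2
    top1, top2 = np.argsort(-s1)[:knn], np.argsort(-s2)[:knn]

    # Combine the candidate sets: score(i, j) = s1[i] + s2[j] and
    # index(i, j) = i * memory_size + j, then keep the best knn overall.
    cand_scores = (s1[top1][:, None] + s2[top2][None, :]).ravel()
    cand_indices = (top1[:, None] * memory_size + top2[None, :]).ravel()
    best = np.argsort(-cand_scores)[:knn]

    # Softmax over the selected scores, then a weighted sum of value embeddings.
    w = np.exp(cand_scores[best] - cand_scores[best].max())
    w /= w.sum()
    return w @ values[cand_indices[best]]
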
def build(self):
    with tf.device("/device:GPU:0"):
        input_tokens = Input(shape=(None, ))
        input_positions = Input(shape=(None, ))
        input_segments = Input(shape=(None, ))

        token_embedding_matrix = Embedding(
            self.vocab_size + 1, self.embedding_size,
            input_length=self.input_length,
            embeddings_initializer=tn(stddev=self.init_range))

        pos_embedding_matrix = Embedding(
            (2 * self.max_len) + 4, self.embedding_size,
            input_length=self.input_length,
            embeddings_initializer=tn(stddev=self.init_range))

        seg_embedding_matrix = Embedding(
            2, self.embedding_size,
            input_length=self.input_length,
            embeddings_initializer=tn(stddev=self.init_range))

        token_embeddings = token_embedding_matrix(input_tokens)
        position_embeddings = pos_embedding_matrix(input_positions)
        segment_embeddings = seg_embedding_matrix(input_segments)

        sum_embeddings = Add()([token_embeddings, position_embeddings])
        sum_embeddings = Add()([sum_embeddings, segment_embeddings])

        if self.factorize_embeddings:
            sum_embeddings = Dense(
                self.encoder_size[0],
                kernel_initializer=tn(stddev=self.init_range))(sum_embeddings)
            sum_embeddings = Gelu()(sum_embeddings)

        if self.input_dropout != 0.:
            sum_embeddings = SpatialDropout1D(self.input_dropout)(sum_embeddings)

        ant_layer = sum_embeddings

        encoders = []
        if self.cross_sharing:
            first_encoder = SentenceEncoderBlock(
                self.encoder_size[0], self.attention_size[0], self.n_heads[0],
                dropout=self.output_dropout, init_range=self.init_range)

        flag_mem = 0
        for i in range(self.n_encoders):
            if self.pkm and i in self.pkm_params["in_layers"]:
                encoders.append(
                    SentenceEncoderMemoryBlock(self.encoder_size[0],
                                               self.attention_size[0],
                                               self.n_heads[0],
                                               self.pkm_params,
                                               dropout=self.output_dropout,
                                               init_range=self.init_range))
                flag_mem = 1
            else:
                if self.cross_sharing:
                    encoders.append(first_encoder)
                else:
                    encoders.append(
                        SentenceEncoderBlock(self.encoder_size[0],
                                             self.attention_size[0],
                                             self.n_heads[0],
                                             dropout=self.output_dropout,
                                             init_range=self.init_range))

            if flag_mem == 1:
                # Memory blocks are always placed on the second GPU.
                with tf.device("/device:GPU:1"):
                    encoded = encoders[-1](ant_layer)
                    ant_layer = encoded
                flag_mem = 0
                #print("Layer: %d -> %s : Allocated in GPU: %d" % (
                #    i, encoders[-1], 1))
            else:
                # Regular encoder blocks alternate between the two GPUs.
                with tf.device("/device:GPU:%d" % (i % 2)):
                    encoded = encoders[-1](ant_layer)
                    ant_layer = encoded
                #print("Layer: %d -> %s : Allocated in GPU: %d" % (
                #    i, encoders[-1], (i % 2)))

        # Reply Order Prediction #
        if self.use_rop:
            cls_output = Lambda(lambda x: x[:, 0, :])(ant_layer)
            rop_hidden = cls_output
            for i in range(self.rop_n_hidden):
                rop_hidden = Dense(
                    self.rop_hidden_size,
                    kernel_initializer=tn(stddev=self.init_range))(rop_hidden)
                rop_hidden = Gelu()(rop_hidden)
                rop_hidden = LayerNormalization()(rop_hidden)
            output_reply_tweet = Dense(
                2, activation="softmax",
                kernel_initializer=tn(stddev=self.init_range),
                name="rop")(rop_hidden)

        # Masked Language Model head #
        mlm_outputs = TimeDistributed(
            Dense(self.vocab_size, activation="softmax",
                  kernel_initializer=tn(stddev=self.init_range)),
            name="mlm")(ant_layer)

        if self.use_rop:
            self.model = Model(
                inputs=[input_tokens, input_positions, input_segments],
                outputs=[output_reply_tweet, mlm_outputs])
        else:
            self.model = Model(
                inputs=[input_tokens, input_positions, input_segments],
                outputs=[mlm_outputs])

        self.pretrained_model = Model(
            inputs=[input_tokens, input_positions, input_segments],
            outputs=ant_layer)

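# Hedged usage sketch: compiling the pretraining model built above with one loss
# per output head ("rop" and "mlm", assuming use_rop=True). The builder instance
# name, the optimizer and the losses/weights are illustrative assumptions, not
# values taken from the original training script.
model_builder.build()
model_builder.model.compile(
    optimizer=Adam(lr=1e-4),
    loss={"rop": "categorical_crossentropy",
          "mlm": "categorical_crossentropy"},
    loss_weights={"rop": 1.0, "mlm": 1.0},
    metrics=["accuracy"])
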
def finetune_ffn(pretrained_model, n_classes, trainable_layers="all",
                 collapse_mode="cls", finetune_dropout=0.15,
                 loss="categorical_crossentropy", init_range=0.02, lr=0.001,
                 multi_label=False, optimizer="adam", accum_iters=1):

    assert collapse_mode in ["cls", "max", "avg", "concat"]

    if trainable_layers != "all":
        assert type(trainable_layers) == list
        # Freeze every layer, then re-enable only the requested embedding /
        # encoder layers (indexed by their position in trainable_layers).
        model_layers = []
        for layer in pretrained_model.layers:
            layer.trainable = False
            if "embedding" in layer.name or "encoder" in layer.name:
                model_layers.append(layer)
        for k in trainable_layers:
            model_layers[k].trainable = True

    input_tokens = Input(shape=(None, ))
    input_positions = Input(shape=(None, ))
    input_segments = Input(shape=(None, ))

    pretrained_output = pretrained_model(
        [input_tokens, input_positions, input_segments])

    # Collapse the sequence of contextual vectors into a single representation.
    if collapse_mode == "cls":
        cls_output = Lambda(lambda x: x[:, 0, :])(pretrained_output)
    else:
        if collapse_mode == "avg":
            cls_output = GlobalAveragePooling1D()(pretrained_output)
        elif collapse_mode == "max":
            cls_output = GlobalMaxPooling1D()(pretrained_output)
        elif collapse_mode == "concat":
            avg = GlobalAveragePooling1D()(pretrained_output)
            mx = GlobalMaxPooling1D()(pretrained_output)
            cls = Lambda(lambda x: x[:, 0, :])(pretrained_output)
            cls_output = Concatenate(axis=-1)([cls, avg, mx])

    cls_output = Dropout(finetune_dropout)(cls_output)

    if not multi_label:
        output = Dense(n_classes, activation="softmax",
                       kernel_initializer=tn(stddev=init_range))(cls_output)
    else:
        output = Dense(n_classes, activation="sigmoid",
                       kernel_initializer=tn(stddev=init_range))(cls_output)

    finetune_model = Model(
        inputs=[input_tokens, input_positions, input_segments],
        outputs=output)

    if optimizer == "adam_accumulated":
        opt = ADAM(lr=lr, accum_iters=accum_iters)
    elif optimizer == "lamb_accumulated":
        opt = LAMB(lr=lr, accum_iters=accum_iters)
    else:
        opt = optimizer

    loss = loss_indexer(loss)
    finetune_model.compile(optimizer=opt, loss=loss, metrics=["accuracy"])
    return finetune_model

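# Hedged usage sketch of finetune_ffn() for a 3-class classification task. The
# hyperparameter values are illustrative, and x_tokens / x_positions /
# x_segments / y are placeholders for data produced elsewhere in the pipeline.
clf = finetune_ffn(pretrained_model,
                   n_classes=3,
                   trainable_layers="all",
                   collapse_mode="concat",
                   finetune_dropout=0.15,
                   lr=2e-5,
                   optimizer="adam_accumulated",
                   accum_iters=4)
clf.fit([x_tokens, x_positions, x_segments], y, batch_size=32, epochs=3)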