def SelfAttentionModel(self):
    with tf.variable_scope('GOU1'):
        cellfw = tf.nn.rnn_cell.GRUCell(FLAGS.hidden_size // 2)
        cellbw = tf.nn.rnn_cell.GRUCell(FLAGS.hidden_size // 2)
        output, _ = tf.nn.bidirectional_dynamic_rnn(cellfw, cellbw, self.signal_input,
                                                    dtype=tf.float32)
        rnn_output = tf.concat(output, 2)
        sample_encoder = encode_model.Encoder(num_layers=FLAGS.num_layer, d_model=1,
                                              num_heads=FLAGS.num_head, dff=FLAGS.num_dff)
        sample_encoder_output = sample_encoder(rnn_output, training=False, mask=None)
        concat_output = tf.concat([rnn_output, sample_encoder_output], axis=2)
        concat_output = self.layernorm1(concat_output)

    with tf.variable_scope('GOU2'):
        cellfw = tf.nn.rnn_cell.GRUCell(FLAGS.hidden_size // 2)
        cellbw = tf.nn.rnn_cell.GRUCell(FLAGS.hidden_size // 2)
        output, _ = tf.nn.bidirectional_dynamic_rnn(cellfw, cellbw, concat_output,
                                                    dtype=tf.float32)
        rnn_output2 = tf.concat(output, 2)
        sample_encoder2 = encode_model.Encoder(num_layers=FLAGS.num_layer, d_model=1,
                                               num_heads=FLAGS.num_head, dff=FLAGS.num_dff)
        sample_encoder_output2 = sample_encoder2(rnn_output2, training=False, mask=None)
        concat_output2 = tf.concat([rnn_output2, sample_encoder_output2], axis=2)
        concat_output2 = self.layernorm1(concat_output2)

    with tf.variable_scope('GOU3'):
        cellfw = tf.nn.rnn_cell.GRUCell(8)
        cellbw = tf.nn.rnn_cell.GRUCell(8)
        output, _ = tf.nn.bidirectional_dynamic_rnn(cellfw, cellbw, concat_output2,
                                                    dtype=tf.float32)
        # average the forward and backward outputs instead of concatenating them
        rnn_output3 = (output[0] + output[1]) / 2

    # L2-normalized magnitude spectrum of the output signal
    fft_signal = tf.math.l2_normalize(tf.abs(tf.signal.rfft(rnn_output3)), axis=1)
    return rnn_output3, fft_signal

def SelfAttentionModel(self):
    # add a channel dimension so the 1-D signal can feed the BiGRU
    new_input = tf.expand_dims(self.signal_input, axis=2)

    with tf.variable_scope('GOU1'):
        cellfw = tf.nn.rnn_cell.GRUCell(FLAGS.hidden_size // 2)
        cellbw = tf.nn.rnn_cell.GRUCell(FLAGS.hidden_size // 2)
        output, _ = tf.nn.bidirectional_dynamic_rnn(cellfw, cellbw, new_input,
                                                    dtype=tf.float32)
        rnn_output = tf.concat(output, 2)
        sample_encoder = encode_model.Encoder(num_layers=FLAGS.num_layer, d_model=1,
                                              num_heads=FLAGS.num_head, dff=FLAGS.num_dff)
        sample_encoder_output = sample_encoder(rnn_output, training=False, mask=None)
        concat_output = tf.concat([rnn_output, sample_encoder_output], axis=2)
        concat_output = self.layernorm1(concat_output)

    with tf.variable_scope('GOU2'):
        cellfw = tf.nn.rnn_cell.GRUCell(FLAGS.hidden_size // 2)
        cellbw = tf.nn.rnn_cell.GRUCell(FLAGS.hidden_size // 2)
        output, _ = tf.nn.bidirectional_dynamic_rnn(cellfw, cellbw, concat_output,
                                                    dtype=tf.float32)
        rnn_output2 = tf.concat(output, 2)
        sample_encoder2 = encode_model.Encoder(num_layers=FLAGS.num_layer, d_model=1,
                                               num_heads=FLAGS.num_head, dff=FLAGS.num_dff)
        sample_encoder_output2 = sample_encoder2(rnn_output2, training=False, mask=None)
        concat_output2 = tf.concat([rnn_output2, sample_encoder_output2], axis=2)
        concat_output2 = self.layernorm1(concat_output2)

    with tf.variable_scope('GOU3'):
        cellfw = tf.nn.rnn_cell.GRUCell(FLAGS.hidden_size // 2)
        cellbw = tf.nn.rnn_cell.GRUCell(FLAGS.hidden_size // 2)
        output, _ = tf.nn.bidirectional_dynamic_rnn(cellfw, cellbw, concat_output2,
                                                    dtype=tf.float32)
        rnn_output3 = tf.concat(output, 2)
        # mean-pool across the feature axis, then drop the singleton dimensions
        rnn_output3 = tf.expand_dims(rnn_output3, axis=3)
        average_layer3 = tf.layers.average_pooling2d(rnn_output3,
                                                     [1, rnn_output3.shape[2]],
                                                     strides=[1, 1])
        squeeze_layer3 = tf.squeeze(average_layer3)

    # L2-normalized magnitude spectrum over time
    fft_signal = tf.math.l2_normalize(tf.abs(tf.signal.rfft(squeeze_layer3)), axis=1)
    return squeeze_layer3, fft_signal

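# Aside: the two variants above differ only in how they collapse the final BiGRU
# output. The first averages the forward and backward states; the second mean-pools
# across the feature axis. The expand_dims / average_pooling2d / squeeze sequence in
# the second variant is equivalent to a plain mean over axis 2. A minimal standalone
# sketch (the [4, 100, 16] shape is an illustrative stand-in for rnn_output3):
import tensorflow as tf

x = tf.random.normal([4, 100, 16])  # [batch, time, hidden]
pooled = tf.reduce_mean(x, axis=2)  # -> [4, 100], same values as the pool-and-squeeze path
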
def greedy_transformer_decoder(features, labels, labels_length, params, mode):
    kernel_initializer = tf.contrib.layers.xavier_initializer(uniform=False)

    with tf.name_scope('encoder'):
        encoder = transformer.Encoder(
            params['encoder'],
            kernel_initializer=kernel_initializer,
            drop_out=lambda x: tf.layers.dropout(
                x, rate=params['drop_out'],
                training=mode == tf.estimator.ModeKeys.TRAIN
            ),
            num_layers=6,
            name='transformer_encoder'
        )
        encoder_outputs = encoder.apply(features)

    with tf.name_scope('decoder'):
        decoder = transformer.Decoder(
            params['decoder'],
            kernel_initializer=kernel_initializer,
            drop_out=lambda x: tf.layers.dropout(
                x, rate=params['drop_out'],
                training=mode == tf.estimator.ModeKeys.TRAIN
            ),
            num_layers=6,
            name='transformer_decoder'
        )
        soft_layer = tf.layers.Dense(
            units=labels.get_shape()[-1] + 1,
            kernel_initializer=kernel_initializer,
            name='softmax_output_layer'
        )
        decoded_tuple = utils.transformer_decoding(
            decoder=decoder,
            encoder_outputs=encoder_outputs,
            labels=labels,
            labels_length=labels_length,
            soft_layer=soft_layer,
            mode=mode
        )
    return decoded_tuple

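# The "greedy" in greedy_transformer_decoder refers to the decoding strategy inside
# utils.transformer_decoding (a project-local helper): at each step the single
# highest-scoring token is taken instead of running a beam search. Schematically
# (a standalone numpy sketch, not the actual implementation):
import numpy as np

def greedy_pick(step_logits):
    """step_logits: [batch, vocab] unnormalized scores for one decode step."""
    return np.argmax(step_logits, axis=-1)

print(greedy_pick(np.array([[0.1, 2.3, -1.0]])))  # -> [1]
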
def build_model(self, word_inputs, char_inputs, labels, seq_len, char_len,
                num_train_steps, char_mode):
    print("Building model!")
    if char_mode == "no_char":
        self.model_dim //= 2  # integer division keeps model_dim an int in Python 3

    # Implements linear decay of the learning rate.
    global_step = tf.Variable(0, trainable=False)
    learning_rate = tf.train.polynomial_decay(self.learning_rate, global_step,
                                              num_train_steps, end_learning_rate=0.0,
                                              power=1.0, cycle=False)

    encoder = transformer.Encoder(num_layers=self.num_layers,
                                  num_heads=self.num_heads,
                                  linear_key_dim=self.linear_key_dim,
                                  linear_value_dim=self.linear_value_dim,
                                  model_dim=self.model_dim,
                                  ffn_dim=self.ffn_dim,
                                  dropout=self.dropout,
                                  n_class=self.n_class,
                                  batch_size=self.batch_size)
    encoder_emb = self.build_embed(word_inputs, char_inputs, char_len, char_mode)
    encoder_outputs = encoder.build(encoder_emb, seq_len)

    loss = tf.reduce_mean(
        tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=encoder_outputs, labels=labels))  # softmax cross-entropy loss
    optimizer = tf.train.AdamOptimizer(
        learning_rate=learning_rate).minimize(
            loss, global_step=global_step)  # Adam optimizer

    return loss, optimizer, encoder_outputs

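# A minimal sketch of driving the (loss, optimizer, encoder_outputs) triple returned
# above in a TF1 session. `model`, the `*_ph` placeholders, and `batches` are
# illustrative assumptions, not part of the snippet:
#
# loss, optimizer, logits = model.build_model(word_ph, char_ph, label_ph,
#                                             seq_len_ph, char_len_ph,
#                                             num_train_steps, char_mode="char")
# with tf.Session() as sess:
#     sess.run(tf.global_variables_initializer())
#     for feed in batches:  # each `feed` maps the placeholders to numpy arrays
#         _, step_loss = sess.run([optimizer, loss], feed_dict=feed)
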
    # (tail of the TiedEmbeddingSoftmax call method: with embed=False it projects
    # activations back through the transposed embedding matrix)
        return tf.tensordot(inputs, tf.transpose(self.w), 1) + self.b

# input for the keras model
tokens = tf.keras.layers.Input(shape=(seq_length,), dtype='int32')
# instantiates a tied softmax class
tied_embedding_softmax = TiedEmbeddingSoftmax()
# embedded tokens, before passing them to the transformer
embedded = tied_embedding_softmax(tokens, embed=True)
# the activations after passing them through the transformer
# for some odd reason, TPUs don't play well with specifying the arguments of the
# Encoder() function, so you have to leave them at their defaults
transformed = transformer.Encoder()(embedded, training=False)
# pass the activations through our tied softmax class, this time with embed=False,
# denoting that we are doing the softmax operation and not a lookup
logits = tied_embedding_softmax(transformed, embed=False)
# finally, define the Keras model with tokens as inputs and the logits as outputs
model = tf.keras.Model(inputs=tokens, outputs=logits)

# the loss function is a simple categorical crossentropy between the logits and the labels
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)

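# With the custom loss defined above, training is the standard Keras compile/fit
# loop. `dataset` here is an assumed tf.data pipeline yielding (token_ids,
# next_token_ids) pairs; it is not part of the snippet:
model.compile(optimizer=tf.keras.optimizers.Adam(), loss=loss)
# model.fit(dataset, epochs=1)
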
def main():
    # fetch the training data
    data, vocab = prepare_data()

    # create the word embeddings and positional embeddings (we learn both of them)
    word_emb = nn.Embedding(len(vocab), EMBEDDING_SIZE)
    pos_emb = nn.Embedding(len(data[0]), EMBEDDING_SIZE)

    # turn the dataset into a tensor of word indices
    data = torch.LongTensor([[vocab[word] for word in sample] for sample in data])

    # create the encoder, the pretraining loss, and the optimizer
    encoder = transformer.Encoder(
        NUM_LAYERS,    # num_layers
        NUM_HEADS,     # num_heads
        *DIMENSIONS,   # dim_model / dim_keys / dim_values
        DROPOUT_RATE,  # residual_dropout
        DROPOUT_RATE,  # attention_dropout
        PAD.index      # pad_index
    )
    loss = nn.CrossEntropyLoss()
    optimizer = optim.Adam(
        itertools.chain(encoder.parameters(), word_emb.parameters(), pos_emb.parameters()),
        lr=LEARNING_RATE
    )

    # move to GPU, if possible
    if GPU:
        data = data.cuda()
        encoder.cuda()
        word_emb.cuda()
        pos_emb.cuda()

    # create a mask that ensures that no future steps can be used
    mask = util.create_shifted_output_mask(data)[:, :-1, :-1]  # -> cut off final time step, which is never an input

    # create a tensor of indices, used to retrieve the positional embeddings below
    index_seq = data.new(range(data.size(1) - 1)).unsqueeze(0).expand(data.size(0), -1)

    # pretrain the encoder
    for epoch in range(NUM_EPOCHS):
        # embed the input sequence + add positional embeddings
        input_seq = word_emb(data[:, :-1]) + pos_emb(index_seq)

        # encode the input sequence
        enc = encoder(input_seq, mask)

        # compute (unnormalized) next-word predictions from the encoded input sequences
        logits = enc.matmul(word_emb.weight.transpose(0, 1))

        # compute the loss
        optimizer.zero_grad()
        current_loss = loss(logits.view(-1, logits.size(-1)), data[:, 1:].contiguous().view(-1))
        print(f"EPOCH {epoch + 1:>3}: LOSS = {current_loss.item()}")

        # update the model
        current_loss.backward()
        optimizer.step()

    # evaluate the probabilities of the training samples
    encoder.eval()
    input_seq = word_emb(data[:, :-1]) + pos_emb(index_seq)
    enc = encoder(input_seq, mask)
    log_probs = torch.log_softmax(enc.matmul(word_emb.weight.transpose(0, 1)), 2)
    sample_probs = []
    for sample_idx, sample_log_probs in enumerate(log_probs):
        sample_data = data[sample_idx][1:].unsqueeze(1)
        sample_log_probs = sample_log_probs.gather(1, sample_data) * (sample_data != PAD.index).float()
        sample_probs.append(sample_log_probs.sum().exp().item())

    print("\nSAMPLE PROBABILITIES:")
    for p in sample_probs:
        print("*", p)

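# The enc.matmul(word_emb.weight.transpose(0, 1)) above is the usual weight-tying
# trick: the input embedding matrix doubles as the output projection. A standalone
# check that it matches F.linear with the same (untransposed) weight:
import torch
import torch.nn as nn
import torch.nn.functional as F

emb = nn.Embedding(10, 4)           # vocab of 10, embedding dim 4
h = torch.randn(2, 7, 4)            # [batch, seq, dim] mock encoder output
logits_a = h.matmul(emb.weight.transpose(0, 1))
logits_b = F.linear(h, emb.weight)  # identical result, shape [2, 7, 10]
assert torch.allclose(logits_a, logits_b)
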
def main():
    # fetch the training data
    data, vocab = prepare_data()

    # create the word embeddings with word2vec and learned positional embeddings
    emb_model = word2vec.Word2Vec(
        sentences=data,
        size=EMBEDDING_SIZE,
        min_count=1
    )
    for word in vocab.keys():
        if word not in emb_model.wv:
            emb_model.wv[word] = np.zeros((EMBEDDING_SIZE,))
    word_emb_mat = nn.Parameter(
        data=torch.FloatTensor([emb_model.wv[word] for word in vocab.keys()]),
        requires_grad=False  # keep the word2vec vectors frozen
    )
    word_emb = nn.Embedding(len(vocab), EMBEDDING_SIZE)
    word_emb.weight = word_emb_mat
    pos_emb = nn.Embedding(len(data[0]), EMBEDDING_SIZE)
    pos_emb.weight.requires_grad = True  # fixed typo: `require_grad` would silently set a new attribute

    # turn the dataset into a tensor of word indices
    data = torch.LongTensor([[vocab[word] for word in sample] for sample in data])

    # create the encoder, the pretraining loss, and the optimizer
    encoder = transformer.Encoder(
        NUM_LAYERS,    # num_layers
        NUM_HEADS,     # num_heads
        *DIMENSIONS,   # dim_model / dim_keys / dim_values
        DROPOUT_RATE,  # residual_dropout
        DROPOUT_RATE,  # attention_dropout
        PAD.index      # pad_index
    )
    loss = bert.MLMLoss(
        encoder,
        word_emb,
        pos_emb,
        MASK.index
    )
    optimizer = optim.Adam(
        itertools.chain(encoder.parameters(), loss.parameters()),
        lr=LEARNING_RATE
    )

    # move to GPU, if possible
    if GPU:
        data = data.cuda()
        encoder.cuda()
        loss.cuda()  # -> also moves the embeddings to the GPU

    # pretrain the encoder
    for epoch in range(NUM_EPOCHS):
        # compute the loss
        optimizer.zero_grad()
        current_loss = loss(data)
        print("EPOCH", epoch + 1, ": LOSS =", current_loss.item())

        # update the model
        current_loss.backward()
        optimizer.step()

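# Quick standalone check of the frozen/trainable split above: the word2vec matrix is
# wrapped in a Parameter with requires_grad=False, while the positional table keeps
# the default requires_grad=True, so only pos_emb receives gradient updates:
import torch
import torch.nn as nn

frozen = nn.Embedding(5, 3)
frozen.weight = nn.Parameter(torch.zeros(5, 3), requires_grad=False)
trainable = nn.Embedding(5, 3)
print(frozen.weight.requires_grad, trainable.weight.requires_grad)  # False True
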
# input for the keras model
tokens = tf.keras.layers.Input(shape=(seq_length,), dtype='int32')  # equivalent to a TF1 placeholder
# instantiates a tied softmax class
tied_embedding_softmax = TiedEmbeddingSoftmax()
# embedded tokens, before passing them to the transformer
embedded = tied_embedding_softmax(tokens, embed=True)
# the activations after passing them through the transformer
# for some odd reason, TPUs don't play well with specifying the arguments of the
# Encoder() function, so you have to leave them at their defaults
transformed = transformer.Encoder()(embedded, training=False)  # calling the instance directly invokes its call() method
# pass the activations through our tied softmax class, this time with embed=False,
# denoting that we are doing the softmax operation and not a lookup
logits = tied_embedding_softmax(transformed, embed=False)
# finally, define the Keras model with tokens as inputs and the logits as outputs
model = tf.keras.Model(inputs=tokens, outputs=logits)

# the loss function is a simple categorical crossentropy between the logits and the labels
def loss(labels, logits):
    return tf.keras.losses.sparse_categorical_crossentropy(labels, logits, from_logits=True)