def build_word_decoder(self, word_vectors_3, char_ids_3):
    config = self.config
    with tf.variable_scope('word_condition_projection'):
        word_vectors_3 = layers.mlp(word_vectors_3, config['sentence_decoder_projection'])
    with tf.variable_scope('word_decoder'):
        spell_vector_len = config['spell_vector_len']
        spell_vector_size = spell_vector_len * config['char_embed_size']
        spell_vector_size *= 2  # TODO: make this factor configurable

        # Grab char embeds and concat them into spelling vector representations of words
        char_ids_3 = self.add_go(char_ids_3, axis=2)
        char_embeds_4 = layers.embedding(self.num_chars, config['char_embed_size'], char_ids_3)
        spell_vectors_3 = self.create_spell_vector(char_embeds_4, spell_vector_len)

        # Pass the spelling vector through a layer that can see previous chars, but can't see ahead
        with tf.variable_scope('future_masked_spelling'):
            spell_vectors_projected_3 = layers.feed_forward(
                spell_vectors_3, num_nodes=spell_vector_size,
                seq_len_for_future_mask=spell_vector_len)

        # Reshape the word representation into individual char representations
        batch_size, sentence_len, word_len = tf.unstack(tf.shape(char_ids_3))
        char_size = spell_vectors_projected_3.shape.as_list()[-1] // spell_vector_len
        char_vectors_4 = tf.reshape(
            spell_vectors_projected_3,
            [batch_size, sentence_len, spell_vector_len, char_size])
        char_vectors_4 = char_vectors_4[:, :, :word_len, :]

        # Project each char vector up to the size of the conditioning word_vector
        with tf.variable_scope('char_projection'):
            word_depth = word_vectors_3.shape.as_list()[-1]
            char_vectors_4 = layers.feed_forward(char_vectors_4, num_nodes=word_depth)

        # Add the conditioning word_vector to each char and pass the result through an MLP
        char_vectors_4 += tf.expand_dims(word_vectors_3, axis=2)
        char_vectors_4 = layers.mlp(char_vectors_4, config['word_decoder_mlp'])
    with tf.variable_scope('logits'):
        char_logits_4 = layers.feed_forward(
            char_vectors_4, num_nodes=self.num_chars, noise_level=config['noise_level'])
    return char_logits_4
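# Illustrative sketch only: the 'future_masked_spelling' layer above is described as seeing
# previous chars but not future ones. `toy_future_mask` is a hypothetical standalone helper
# (not the repo's layers.feed_forward) that shows the lower-triangular visibility pattern
# such a mask enforces over the spell_vector_len positions.
import numpy as np

def toy_future_mask(spell_vector_len):
    """mask[i, j] == 1.0 iff position j is visible from position i (j <= i)."""
    return np.tril(np.ones((spell_vector_len, spell_vector_len), dtype=np.float32))

# e.g. toy_future_mask(4)[0] sees only position 0, while toy_future_mask(4)[3] sees 0..3.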
def build_positional_char_embeds(self, char_ids_3, char_embed_size, mlp_layer_specs, word_len_limit):
    """Embeds chars and appends each char's relative position within its word."""
    char_ids_3 = char_ids_3[:, :, :word_len_limit]  # potentially trim long words
    batch_size, max_sentence_len, max_word_len = tf.unstack(tf.shape(char_ids_3))

    # Select char embeddings
    with tf.variable_scope('chars'):
        char_embeds_4 = layers.embedding(self.num_chars, char_embed_size, char_ids_3)

    # Create char position ids for every possible char position in the batch (including padding)
    position_ids_1 = tf.range(max_word_len)
    position_ids_3 = tf.expand_dims(tf.expand_dims(position_ids_1, 0), 0)
    position_ids_3 = tf.tile(position_ids_3, [batch_size, max_sentence_len, 1])

    # Mask position_ids for padding chars
    where = tf.equal(char_ids_3, -1)
    position_ids_3 = tf.where(where, char_ids_3, tf.cast(position_ids_3, char_ids_3.dtype))

    # Convert position_ids to relative positions (scalars between 0 and 1)
    word_lengths_3 = tf.reduce_max(position_ids_3, axis=2, keep_dims=True)
    word_lengths_3 = tf.where(tf.equal(word_lengths_3, 0), tf.ones_like(word_lengths_3), word_lengths_3)
    word_lengths_3 = tf.cast(word_lengths_3, char_embeds_4.dtype)
    position_ids_3 = tf.cast(position_ids_3, char_embeds_4.dtype)
    relative_positions_3 = position_ids_3 / word_lengths_3

    # Mask relative_positions for padding chars
    relative_positions_3 = tf.where(where, tf.zeros_like(relative_positions_3), relative_positions_3)

    # Combine char embeddings with their respective positions
    relative_positions_4 = tf.expand_dims(relative_positions_3, axis=3)
    positional_char_embeds_4 = tf.concat([char_embeds_4, relative_positions_4], axis=3)
    positional_char_embeds_4 = layers.mlp(positional_char_embeds_4, mlp_layer_specs)
    return positional_char_embeds_4
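# Standalone sketch (NumPy, hypothetical helper, not repo code): how the relative-position
# feature in build_positional_char_embeds behaves on a toy batch, assuming padding char ids
# are -1 as in the masking logic above.
import numpy as np

def toy_relative_positions(char_ids):
    """char_ids: int array [batch, sentence_len, word_len], padded with -1."""
    batch, sent_len, word_len = char_ids.shape
    position_ids = np.tile(np.arange(word_len), (batch, sent_len, 1))
    position_ids = np.where(char_ids == -1, char_ids, position_ids)
    word_lengths = position_ids.max(axis=2, keepdims=True)
    word_lengths = np.where(word_lengths == 0, 1, word_lengths)
    relative = position_ids / word_lengths
    return np.where(char_ids == -1, 0.0, relative)

# A 3-char word followed by one padding slot maps to relative positions [0.0, 0.5, 1.0, 0.0]:
# toy_relative_positions(np.array([[[5, 7, 9, -1]]]))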
def join(self, is_training, hparams, inputs, out_w, out_c, tags):
    """Meta model joins word and char model."""
    with tf.variable_scope('meta_char_word'):
        out_1 = layers.dropout(is_training, hparams.keep_prob, out_w)
        out_2 = layers.dropout(is_training, hparams.keep_prob, out_c)
        outputs = tf.concat([out_1, out_2], axis=2)
        out_fw, out_bw, _ = layers.lstm_layers(
            is_training, outputs, hparams.num_layers_meta, hparams.hidden_meta_size,
            hparams.recur_keep_j_prob)
        outputs = tf.concat([out_fw, out_bw], axis=2)
        outputs = layers.mlp(
            is_training, outputs, output_size=tags, keep_prob=hparams.keep_prob)
        preds_w = tf.to_int32(tf.argmax(outputs, axis=-1))
        targets_w = inputs[:, :, 2]
        tokens_to_keep = tf.to_float(tf.greater(inputs[:, :, 0], PAD))
        loss = tf.losses.sparse_softmax_cross_entropy(targets_w, outputs, tokens_to_keep)
        if is_training:
            return loss
        else:
            return preds_w
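# Standalone sketch (NumPy, hypothetical helper, not repo code): tokens_to_keep above is
# passed as per-token weights to tf.losses.sparse_softmax_cross_entropy, so padding
# positions (word id <= PAD) contribute nothing to the loss. With 0/1 weights, the default
# SUM_BY_NONZERO_WEIGHTS reduction is equivalent to:
import numpy as np

def toy_masked_cross_entropy(logits, targets, weights):
    """logits: [tokens, classes]; targets: [tokens]; weights: [tokens] of 0.0 / 1.0."""
    shifted = logits - logits.max(axis=-1, keepdims=True)  # numerically stable log-softmax
    log_probs = shifted - np.log(np.exp(shifted).sum(axis=-1, keepdims=True))
    nll = -log_probs[np.arange(len(targets)), targets]
    return (nll * weights).sum() / np.maximum(weights.sum(), 1.0)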
def test_compiles(self):
    tf.reset_default_graph()
    with tf.Session() as sess:
        inputs = tf.constant([[0, 0], [1, 1], [2, 2]], dtype=tf.float32)
        layer_specs = [{'num_nodes': 20}, {'num_nodes': 30}]
        outputs = layers.mlp(inputs, layer_specs)
        initialize_vars(sess)
        sess.run(outputs)
        self.assertEqual(outputs.shape, (3, 30))
def char_model(self, is_training, hparams, chars, embedding_char_size, tags,
               inputs_char, indexs_start, indexs_end, targets_w):
    """Character model."""
    with tf.variable_scope('chars'):
        if is_training:
            embed_dims = [chars, embedding_char_size]
            np.random.seed(seed=1)
            embeddings_char = np.random.randn(*embed_dims).astype(np.float32)
            cembed = tf.get_variable(
                'char_embeddings', dtype=tf.float32, initializer=embeddings_char)
        else:
            cembed = tf.get_variable('char_embeddings')

        # Joint for both training and inference
        embed_nd = tf.nn.embedding_lookup(cembed, inputs_char[:, :])
        embed = layers.dropout(is_training, hparams.embed_keep_prob_ch, embed_nd)
        output_fw, output_bw, _ = layers.lstm_layers(
            is_training, embed, hparams.num_layers_chars, hparams.hidden_char_size,
            hparams.recur_keep_prob)

        # Gather forward start and end of word of the char LSTM output.
        output_fw_fst = tf.gather_nd(output_fw, indexs_start)
        output_fw_lst = tf.gather_nd(output_fw, indexs_end)

        # Gather backward start and end of word of the char LSTM output.
        output_bw_fst = tf.gather_nd(output_bw, indexs_start)
        output_bw_lst = tf.gather_nd(output_bw, indexs_end)

        # Concatenate the gathered LSTM outputs into the right shape.
        outputs = tf.concat(
            [output_fw_fst, output_fw_lst, output_bw_fst, output_bw_lst], axis=2)
        outputs = layers.mlp(
            is_training, outputs, output_size=hparams.mlp_size, keep_prob=hparams.keep_prob)

        targets = targets_w[:, :]
        tok_keep = tf.to_float(tf.greater(targets, PAD))
        linear = layers.linear_with_dropout(
            is_training, outputs, tags, keep_prob=hparams.keep_prob)
        preds = tf.to_int32(tf.argmax(linear, axis=-1))
        if is_training:
            int_tok_keep = tf.to_int32(tok_keep)
            t_correct = tf.to_int32(tf.equal(preds, targets)) * int_tok_keep
            accuracy = tf.reduce_sum(t_correct) / tf.reduce_sum(int_tok_keep)
            loss = tf.losses.sparse_softmax_cross_entropy(targets, linear, tok_keep)
            return loss, accuracy
        else:
            return preds, outputs
def build_word_encoder(self, char_ids_3, reuse_vars=None):
    config = self.config
    with tf.variable_scope('char_encoder', reuse=reuse_vars):
        char_embeds_4 = self.build_positional_char_embeds(
            char_ids_3, config['char_embed_size'], config['char_encoder_mlp'],
            config['max_word_len'])
    with tf.variable_scope('word_encoder', reuse=reuse_vars):
        # Sum positional_char_embeds to get a word_vector, then normalize and noise it.
        word_vectors_3 = layers.do_layer_norm(tf.reduce_sum(char_embeds_4, axis=2))
        shape_1 = tf.shape(word_vectors_3)
        word_vectors_2 = tf.reshape(word_vectors_3, [-1, shape_1[-1]])
        word_vectors_2 += layers.gaussian_noise(word_vectors_2, config['noise_level'])
        word_vectors_3 = tf.reshape(word_vectors_2, shape_1)
        # Pass word_vectors through an MLP
        word_vectors_3 = layers.mlp(word_vectors_3, config['word_encoder_mlp'])
    return word_vectors_3
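# Minimal NumPy sketch (assumption for illustration; `toy_word_vector` is hypothetical and
# does not use the repo's layers module): the word encoder above collapses positional char
# embeddings into one vector per word by summing over the char axis and layer-normalizing.
import numpy as np

def toy_word_vector(positional_char_embeds, eps=1e-6):
    """positional_char_embeds: float array [batch, sentence_len, word_len, depth]."""
    word_vectors = positional_char_embeds.sum(axis=2)      # sum over the char axis
    mean = word_vectors.mean(axis=-1, keepdims=True)       # layer norm over depth
    std = word_vectors.std(axis=-1, keepdims=True)
    return (word_vectors - mean) / (std + eps)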
def word_model(self, is_training, hparams, words, embedding_word_size, tags,
               pretrained_embed, inputs):
    """Word model."""
    with tf.variable_scope('words'):
        embedding = tf.get_variable(
            'word_embedding', [words, embedding_word_size], dtype=tf.float32,
            initializer=tf.zeros_initializer())
        word_inputs = tf.nn.embedding_lookup(embedding, inputs[:, :, 0])
        word_inputs = layers.dropout(is_training, hparams.embed_keep_prob, word_inputs)

        pret_inputs = tf.nn.embedding_lookup(pretrained_embed, inputs[:, :, 1])
        pret_inputs = layers.dropout(is_training, hparams.embed_keep_prob, pret_inputs)
        word_inputs += pret_inputs
        targets_w = inputs[:, :, 2]

        outputs = word_inputs
        output_fw, output_bw, _ = layers.lstm_layers(
            is_training, outputs, hparams.num_layers_words, hparams.hidden_word_size,
            hparams.recur_keep_w_prob)
        outputs = tf.concat([output_fw, output_bw], axis=2)
        outputs = layers.mlp(
            is_training, outputs, output_size=hparams.mlp_size, keep_prob=hparams.keep_prob)

        logits = layers.linear_with_dropout(
            is_training, outputs, tags, keep_prob=hparams.keep_prob)
        preds_w = tf.to_int32(tf.argmax(logits, axis=-1))
        tag_correct_w = tf.to_int32(tf.equal(preds_w, targets_w))
        correct = tf.reduce_sum(tag_correct_w) / tf.size(tag_correct_w)
        tokens_to_keep = tf.to_float(tf.greater(inputs[:, :, 0], PAD))
        loss_w = tf.losses.sparse_softmax_cross_entropy(targets_w, logits, tokens_to_keep)
        if is_training:
            return loss_w, correct
        else:
            return preds_w, outputs