def _rnn_encoder(model):
    """
    :type model: modeling.BERTModel
    """
    with tf.variable_scope('rnn_encoder'):
        # Embed clinical observations
        embedded_observations = layers.embedding_layer(model.observations, model.vocabulary_size,
                                                       model.embedding_size, model.vocab_dropout,
                                                       training=model.training)

        # Reshape to (batch * seq_len) x doc_len x embedding
        flattened_embedded_obs = tf.reshape(embedded_observations,
                                            [model.batch_size * model.max_seq_len,
                                             model.max_snapshot_size,
                                             model.embedding_size],
                                            name='flat_emb_obs')
        flattened_snapshot_sizes = tf.reshape(model.snapshot_sizes,
                                              [model.batch_size * model.max_seq_len],
                                              name='flat_snapshot_sizes')

        # Apply RNN to all documents in all batches
        flattened_snapshot_encodings = layers.rnn_layer(cell_fn=cell_fn,
                                                        num_hidden=num_hidden,
                                                        inputs=flattened_embedded_obs,
                                                        lengths=flattened_snapshot_sizes,
                                                        return_interpretable_weights=False)

        # Reshape back to (batch x seq_len x encoding_size)
        return tf.reshape(flattened_snapshot_encodings,
                          [model.batch_size, model.max_seq_len,
                           flattened_snapshot_encodings.shape[-1]],
                          name='rnn_snapshot_encoding')
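# `cell_fn` and `num_hidden` are not defined inside _rnn_encoder, which suggests the function
# is produced by an enclosing factory that captures them as a closure (the leading underscore
# points the same way). A minimal sketch of that pattern; the factory name, signature, and the
# GRUCell default below are assumptions, not taken from the source:

def rnn_encoder(num_hidden, cell_fn=tf.nn.rnn_cell.GRUCell):
    """Return a snapshot-encoder function that closes over the RNN hyperparameters."""

    def _rnn_encoder(model):
        with tf.variable_scope('rnn_encoder'):
            ...  # body as above, using the captured num_hidden and cell_fn

    return _rnn_encoder

# Hypothetical usage: snapshot_encodings = rnn_encoder(num_hidden=[128])(model)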
def generator(source, target, sequence_length, vocab_size, decoder_fn=None, **opts):
    """
    Args:
        source: TensorFlow queue or placeholder tensor for word ids for source
        target: TensorFlow queue or placeholder tensor for word ids for target
        sequence_length: TensorFlow queue or placeholder tensor for number of word ids for each sentence
        vocab_size: max vocab size determined from data
        decoder_fn: if using custom decoder_fn else use the default dynamic_rnn
    """
    tf.logging.info(" Setting up generator")

    embedding_layer = lay.embedding_layer(vocab_size, opts["embedding_dim"], name="embedding_matrix")

    # TODO: add batch norm?
    rnn_outputs = (
        source >>
        embedding_layer >>
        lay.word_dropout_layer(keep_prob=opts["word_dropout_keep_prob"]) >>
        lay.recurrent_layer(hidden_dims=opts["rnn_hidden_dim"],
                            keep_prob=opts["recurrent_dropout_keep_prob"],
                            sequence_length=sequence_length,
                            decoder_fn=decoder_fn,
                            name="rnn_cell")
    )

    output_projection_layer = lay.dense_layer(hidden_dims=vocab_size, name="output_projections")

    flat_logits = (
        rnn_outputs >>
        lay.reshape_layer(shape=(-1, opts["rnn_hidden_dim"])) >>
        output_projection_layer
    )

    probs = flat_logits >> lay.softmax_layer()

    embedding_matrix = embedding_layer.get_variables_in_scope()
    output_projections = output_projection_layer.get_variables_in_scope()

    if decoder_fn is not None:
        return GeneratorTuple(rnn_outputs=rnn_outputs, flat_logits=flat_logits, probs=probs,
                              loss=None, embedding_matrix=embedding_matrix[0],
                              output_projections=output_projections)

    loss = (
        flat_logits >>
        lay.cross_entropy_layer(target=target) >>
        lay.reshape_layer(shape=tf.shape(target)) >>
        lay.mean_loss_by_example_layer(sequence_length=sequence_length)
    )

    # TODO: add dropout penalty
    return GeneratorTuple(rnn_outputs=rnn_outputs, flat_logits=flat_logits, probs=probs,
                          loss=loss, embedding_matrix=embedding_matrix[0],
                          output_projections=output_projections)
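# The generator returns a GeneratorTuple whose definition is not shown in this excerpt. Its
# fields can be read off the return statements above; a minimal sketch, assuming it is a plain
# namedtuple (the exact definition in the source may differ):

import collections

GeneratorTuple = collections.namedtuple(
    "GeneratorTuple",
    ["rnn_outputs", "flat_logits", "probs", "loss", "embedding_matrix", "output_projections"])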
def _dan_encoder(model):
    """
    :param model:
    :type model: modeling.CANTRIPModel
    :return:
    """
    with tf.variable_scope('dan_encoder'):
        embedded_observations = layers.embedding_layer(model.observations, model.vocabulary_size,
                                                       model.embedding_size, model.vocab_dropout,
                                                       training=model.training)

        # Reshape to (batch * seq_len * doc_len) x embedding
        flattened_embedded_observations = tf.reshape(
            embedded_observations,
            [model.batch_size * model.max_seq_len * model.max_snapshot_size, model.embedding_size]
        )

        # Add dense observation layers
        obs_layer = flattened_embedded_observations
        for num_hidden in obs_hidden_units:
            obs_layer = tf.keras.layers.Dense(units=num_hidden, activation=activation_fn)(obs_layer)

        # Reshape final output by grouping observations in the same snapshot together
        obs_layer = tf.reshape(obs_layer, [model.batch_size * model.max_seq_len,
                                           model.max_snapshot_size,
                                           obs_hidden_units[-1]])

        # Divide by the active number of observations rather than the padded snapshot size; this requires
        # reshaping to (batch x seq_len) x 1 so we can divide by it
        flattened_snapshot_sizes = tf.reshape(model.snapshot_sizes, [model.batch_size * model.max_seq_len, 1])

        mask = tf.sequence_mask(model.snapshot_sizes, maxlen=model.max_snapshot_size, dtype=tf.float32)
        mask = tf.reshape(mask, [model.batch_size * model.max_seq_len, model.max_snapshot_size, 1])

        # Compute dynamic-size element-wise average
        avg_layer = tf.reduce_sum(obs_layer * mask, axis=1)
        avg_layer = avg_layer / tf.cast(tf.maximum(flattened_snapshot_sizes, 1), dtype=tf.float32)

        # More fun dense layers
        for num_hidden in avg_hidden_units:
            avg_layer = tf.keras.layers.Dense(num_hidden, activation_fn)(avg_layer)

        # Final output of the model
        output = tf.keras.layers.Dense(model.embedding_size, activation_fn)(avg_layer)

        # Reshape to [batch_size x seq_len x encoding_size]
        return tf.reshape(output, [model.batch_size, model.max_seq_len, model.embedding_size])
def _dan_encoder(model):
    """
    :param model:
    :type model: modeling.PRONTOModel
    :return:
    """
    with tf.variable_scope('dan_encoder'):
        embedded_observations = layers.embedding_layer(model.observations, model.vocabulary_size,
                                                       model.embedding_size, model.vocab_dropout,
                                                       training=model.training)

        # Reshape to (batch * seq_len * doc_len) x embedding
        flattened_embedded_observations = tf.reshape(
            embedded_observations,
            [model.batch_size * model.max_seq_len * model.max_snapshot_size, model.embedding_size]
        )

        # Add dense observation layers
        # TODO: switch back to ReLU as described in the paper
        obs_layer = flattened_embedded_observations
        for num_hidden in obs_hidden_units:
            obs_layer = tf.layers.dense(obs_layer, num_hidden, tf.nn.tanh)

        # Reshape final output by grouping observations in the same snapshot together
        obs_layer = tf.reshape(obs_layer, [model.batch_size * model.max_seq_len,
                                           model.max_snapshot_size,
                                           obs_layer.shape[-1]])

        # Divide by the active number of observations rather than the padded snapshot size; this requires
        # reshaping to (batch x seq_len) x 1 so we can divide by it
        flattened_snapshot_sizes = tf.reshape(model.snapshot_sizes, [model.batch_size * model.max_seq_len, 1])

        # Compute dynamic-size element-wise average: mask out padded observations, sum, and divide by the
        # number of active observations in each snapshot
        pad_mask = tf.sequence_mask(model.snapshot_sizes, maxlen=model.max_snapshot_size, dtype=tf.float32)
        pad_mask = tf.reshape(pad_mask, [model.batch_size * model.max_seq_len, model.max_snapshot_size, 1])
        avg_layer = tf.reduce_sum(obs_layer * pad_mask, axis=1)
        avg_layer = avg_layer / tf.to_float(tf.maximum(flattened_snapshot_sizes, 1))

        # More fun dense layers
        # TODO: switch back to ReLU as described in the paper
        for num_hidden in avg_hidden_units:
            avg_layer = tf.layers.dense(avg_layer, num_hidden, tf.nn.tanh)

        # Final output of the model
        output = tf.layers.dense(avg_layer, model.embedding_size, tf.nn.tanh)

        # Reshape to [batch_size x seq_len x encoding_size]
        return tf.reshape(output, [model.batch_size, model.max_seq_len, model.embedding_size])
def _cnn_encoder(model):
    """
    :type model: BERTModel
    """
    with tf.variable_scope('cnn_encoder'):
        # Embed observations
        embedded_observations = layers.embedding_layer(model.observations, model.vocabulary_size,
                                                       model.embedding_size, model.vocab_dropout,
                                                       training=model.training)

        # Reshape to (batch * seq_len) x snapshot_size x embedding
        flattened_embedded_obs = tf.reshape(embedded_observations,
                                            [model.batch_size * model.max_seq_len,
                                             model.max_snapshot_size,
                                             model.embedding_size])

        # Apply parallel convolutional and pooling layers
        outputs = []
        for n in windows:
            conv_input = flattened_embedded_obs
            if dropout > 0:
                # Apply dropout to the convolution input without overwriting the shared embedding tensor,
                # so dropout is not stacked across the parallel n-gram branches
                conv_input = tf.keras.layers.Dropout(rate=model.dropout)(conv_input, training=model.training)
            conv_layer = tf.keras.layers.Convolution1D(filters=kernels,
                                                       kernel_size=n,
                                                       activation=tf.nn.leaky_relu,
                                                       name="conv_%dgram" % n)(conv_input)
            # Global max-pool over all n-gram positions of the convolution output
            pool_layer = tf.keras.layers.MaxPooling1D(pool_size=model.max_snapshot_size - n + 1,
                                                      name="maxpool_%dgram" % n)(conv_layer)
            outputs.append(pool_layer)

        # Concatenate pooled outputs
        output = tf.concat(outputs, axis=-1)

        # Project the concatenated n-gram features back to the embedding size (dense + ReLU)
        embeddings = tf.keras.layers.Dense(units=model.embedding_size, activation=tf.nn.relu)(output)

        # Reshape back to [batch_size x max_seq_len x encoding_size]
        return tf.reshape(embeddings, [model.batch_size, model.max_seq_len, model.embedding_size])
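# _rnn_encoder, _dan_encoder and _cnn_encoder share the same contract: each takes the model
# object and returns a (batch_size x max_seq_len x encoding_size) tensor of snapshot encodings.
# A minimal sketch of dispatching between them by name; the helper name and the string flag are
# assumptions, not shown in the source (the free variables each encoder reads, e.g. cell_fn,
# obs_hidden_units, windows, would still need to be supplied by their enclosing scopes):

def snapshot_encoder(model, encoder='rnn'):
    """Dispatch to one of the snapshot encoders defined above (hypothetical helper)."""
    encoders = {
        'rnn': _rnn_encoder,
        'dan': _dan_encoder,
        'cnn': _cnn_encoder,
    }
    if encoder not in encoders:
        raise ValueError('Unknown snapshot encoder: %s' % encoder)
    return encoders[encoder](model)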
def __init__(self,
             corpus,
             n_filters=(128, 256),
             filter_width=3,
             token_embeddings_dim=128,
             char_embeddings_dim=50,
             use_char_embeddins=True,
             pretrained_model_filepath=None,
             embeddings_dropout=False,
             dense_dropout=False,
             use_batch_norm=False,
             logging=False,
             use_crf=False,
             net_type='cnn',
             char_filter_width=5,
             verbouse=True,
             use_capitalization=False,
             concat_embeddings=False,
             cell_type=None):
    tf.reset_default_graph()

    n_tags = len(corpus.tag_dict)
    n_tokens = len(corpus.token_dict)
    n_chars = len(corpus.char_dict)
    embeddings_onethego = (not concat_embeddings and
                           corpus.embeddings is not None and
                           not isinstance(corpus.embeddings, dict))

    # Create placeholders
    if embeddings_onethego:
        x_word = tf.placeholder(dtype=tf.float32,
                                shape=[None, None, corpus.embeddings.vector_size],
                                name='x_word')
    else:
        x_word = tf.placeholder(dtype=tf.int32, shape=[None, None], name='x_word')
    if concat_embeddings:
        x_emb = tf.placeholder(dtype=tf.float32,
                               shape=[None, None, corpus.embeddings.vector_size],
                               name='x_word')
    x_char = tf.placeholder(dtype=tf.int32, shape=[None, None, None], name='x_char')
    y_true = tf.placeholder(dtype=tf.int32, shape=[None, None], name='y_tag')
    mask = tf.placeholder(dtype=tf.float32, shape=[None, None], name='mask')
    x_capi = tf.placeholder(dtype=tf.float32, shape=[None, None], name='x_capi')

    # Auxiliary placeholders
    learning_rate_ph = tf.placeholder(dtype=tf.float32, shape=[], name='learning_rate')
    dropout_ph = tf.placeholder_with_default(1.0, shape=[])
    training_ph = tf.placeholder_with_default(False, shape=[])
    learning_rate_decay_ph = tf.placeholder(dtype=tf.float32, shape=[], name='learning_rate_decay')

    # Embeddings
    if not embeddings_onethego:
        with tf.variable_scope('Embeddings'):
            w_emb = embedding_layer(x_word, n_tokens=n_tokens, token_embedding_dim=token_embeddings_dim)
            if use_char_embeddins:
                c_emb = character_embedding_network(x_char,
                                                    n_characters=n_chars,
                                                    char_embedding_dim=char_embeddings_dim,
                                                    filter_width=char_filter_width)
                emb = tf.concat([w_emb, c_emb], axis=-1)
            else:
                emb = w_emb
    else:
        emb = x_word

    if concat_embeddings:
        emb = tf.concat([emb, x_emb], axis=2)

    if use_capitalization:
        cap = tf.expand_dims(x_capi, 2)
        emb = tf.concat([emb, cap], axis=2)

    # Dropout for embeddings
    if embeddings_dropout:
        emb = tf.layers.dropout(emb, dropout_ph, training=training_ph)

    if 'cnn' in net_type.lower():
        # Convolutional network
        with tf.variable_scope('ConvNet'):
            units = stacked_convolutions(emb,
                                         n_filters=n_filters,
                                         filter_width=filter_width,
                                         use_batch_norm=use_batch_norm,
                                         training_ph=training_ph)
    elif 'rnn' in net_type.lower():
        if cell_type is None or cell_type not in {'lstm', 'gru'}:
            raise RuntimeError('You must specify the type of the cell! '
                               'It could be either "lstm" or "gru"')
        units = stacked_rnn(emb, n_filters, cell_type=cell_type)
    elif 'cnn_highway' in net_type.lower():
        units = highway_convolutional_network(emb,
                                              n_filters=n_filters,
                                              filter_width=filter_width,
                                              use_batch_norm=use_batch_norm,
                                              training_ph=training_ph)
    else:
        raise KeyError('There is no such type of network: {}'.format(net_type))

    # Classifier
    with tf.variable_scope('Classifier'):
        logits = tf.layers.dense(units, n_tags, kernel_initializer=xavier_initializer())

    if use_crf:
        sequence_lengths = tf.reduce_sum(mask, axis=1)
        log_likelihood, trainsition_params = tf.contrib.crf.crf_log_likelihood(
            logits, y_true, sequence_lengths)
        loss_tensor = -log_likelihood
        predictions = None
    else:
        ground_truth_labels = tf.one_hot(y_true, n_tags)
        loss_tensor = tf.nn.softmax_cross_entropy_with_logits(labels=ground_truth_labels, logits=logits)
        loss_tensor = loss_tensor * mask
        predictions = tf.argmax(logits, axis=-1)

    loss = tf.reduce_mean(loss_tensor)

    # Initialize session
    sess = tf.Session()
    if verbouse:
        self.print_number_of_parameters()
    if logging:
        self.train_writer = tf.summary.FileWriter('summary', sess.graph)

    self._use_crf = use_crf
    self.summary = tf.summary.merge_all()
    self._learning_rate_decay_ph = learning_rate_decay_ph
    self._x_w = x_word
    self._x_c = x_char
    self._y_true = y_true
    self._y_pred = predictions
    if concat_embeddings:
        self._x_emb = x_emb
    if use_crf:
        self._logits = logits
        self._trainsition_params = trainsition_params
        self._sequence_lengths = sequence_lengths
    self._learning_rate_ph = learning_rate_ph
    self._dropout = dropout_ph
    self._loss = loss
    self._sess = sess
    self.corpus = corpus
    self._loss_tensor = loss_tensor
    self._use_dropout = True if embeddings_dropout or dense_dropout else None
    self._training_ph = training_ph
    self._logging = logging

    # Get training op
    self._train_op = self.get_train_op(loss, learning_rate_ph, lr_decay_rate=learning_rate_decay_ph)
    self._embeddings_onethego = embeddings_onethego
    self.verbouse = verbouse
    sess.run(tf.global_variables_initializer())
    self._mask = mask
    if use_capitalization:
        self._x_capi = x_capi
    self._use_capitalization = use_capitalization
    self._concat_embeddings = concat_embeddings

    if pretrained_model_filepath is not None:
        self.load(pretrained_model_filepath)
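# Usage sketch for the constructor above. The class name `NerNetwork` and the `Corpus` loader
# are hypothetical stand-ins (this excerpt does not show them); the keyword arguments are the
# ones defined in the signature above.
if __name__ == '__main__':
    corpus = Corpus('data/conll2003')        # hypothetical corpus object with tag/token/char dicts
    net = NerNetwork(corpus,                 # hypothetical name for the class defining this __init__
                     net_type='rnn',         # 'cnn', 'rnn', or 'cnn_highway'
                     cell_type='lstm',       # required when net_type is 'rnn'
                     use_crf=True,
                     embeddings_dropout=True)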
def step_through_session(self, X, attention_mask, return_last_with_hidden_states=False,
                         return_softmax=False, reuse=False):
    """
    Train for a batch of sessions in the HRED. X can be a 3-D tensor (steps, batch, vocab).

    :param X: The input sessions. Lists of ints, ints correspond to words
              Shape: (max_length x batch_size)
    :return:
    """
    num_of_steps = tf.shape(X)[0]
    batch_size = tf.shape(X)[1]

    # Making embeddings for x
    embedder = layers.embedding_layer(X, vocab_dim=self.vocab_size,
                                      embedding_dim=self.embedding_dim, reuse=reuse)

    # Mask used to reset the query encoder when the symbol is the End-Of-Query symbol, and to retain the
    # state of the session encoder while the EoQ symbol has not yet been seen.
    eoq_mask = tf.expand_dims(tf.cast(tf.not_equal(X, self.eoq_symbol), tf.float32), 2)
    # eoq mask has size [MAX_LEN x BATCH_SIZE] --> we want to loop over batch size
    # BATCH_SIZE = 80
    # MAX_LEN = 50
    # TODO: this shouldn't be as local
    # print((embedder, eoq_mask))

    # Computes the encoded query state. The TensorFlow scan function repeatedly applies the
    # gru_layer_with_reset function to (embedder, eoq_mask) and initializes the GRU layer with the zero
    # tensor. In the query encoder we need the possibility to reset the GRU layer, namely after the EoQ
    # symbol has been reached.
    query_encoder_packed = tf.scan(
        lambda result_prev, x: layers.gru_layer_with_reset(
            result_prev[1],  # h_reset_prev
            x,
            name='forward_query_encoder',
            x_dim=self.embedding_dim,
            y_dim=self.query_dim,
            reuse=reuse
        ),
        (embedder, eoq_mask),  # scan does not accept multiple tensors so we need to pack and unpack
        initializer=tf.zeros((2, batch_size, self.query_dim))
    )
    # print(tf.shape(query_encoder_packed))

    query_encoder, hidden_query = tf.unstack(query_encoder_packed, axis=1)
    # query_encoder = tf.nn.dropout(query_encoder, keep_prob=0.5)

    # This part does the same, yet for the session encoder. Here we need the possibility to keep the
    # current state we were at, namely if we have not yet seen a full query. If we have, update the
    # session encoder state.
    # session_encoder_packed = tf.scan(
    #     lambda result_prev, x: layers.gru_layer_with_retain(
    #         result_prev[1],  # h_retain_prev
    #         x,
    #         name='session_encoder',
    #         x_dim=self.query_dim,  # 2*
    #         y_dim=self.session_dim,
    #         reuse=reuse
    #     ),
    #     (query_encoder, eoq_mask),
    #     initializer=tf.zeros((2, batch_size, self.session_dim))
    # )
    #
    # session_encoder, hidden_session = tf.unstack(session_encoder_packed, axis=1)
    # session_encoder = layers.gnn_attention(session_encoder, attention_mask, query_encoder_gnn,
    #                                        self.session_dim, self.query_dim, reuse=reuse)

    # This part makes the decoder for a step. The decoder uses the word embeddings, the reset/retain
    # vector and the session encoder, so we give three variables to the decoder GRU. The decoder GRU is
    # somewhat special, as it incorporates the session_encoder into each hidden state update.
    # decoder = tf.scan(
    #     lambda result_prev, x: layers.gru_layer_with_state_reset(
    #         result_prev,
    #         x,
    #         name='decoder',
    #         x_dim=self.embedding_dim,
    #         h_dim=self.query_dim,
    #         y_dim=self.decoder_dim,
    #         reuse=reuse
    #     ),
    #     (embedder, eoq_mask, query_encoder),
    #     # scan does not accept multiple tensors so we need to pack and unpack
    #     initializer=tf.zeros((batch_size, self.decoder_dim))
    # )

    # After the decoder we add an additional output layer
    flatten_decoder = tf.reshape(query_encoder, (-1, self.decoder_dim))
    flatten_embedder = tf.reshape(embedder, (-1, self.embedding_dim))
    # flatten_session_encoder = tf.reshape(session_encoder, (-1, self.session_dim))

    # attention
    # expand to batch_size x num_of_steps x query_dim
    # query_encoder_T = tf.transpose(query_encoder, perm=[1, 0, 2])
    # query_decoder_T = tf.transpose(decoder, perm=[1, 0, 2])
    # expand to num_of_steps x batch_size x num_of_steps x query_dim
    # query_encoder_expanded = tf.tile(tf.expand_dims(query_encoder, 2), (1, 1, num_of_steps, 1))
    # query_encoder_expanded = query_encoder_expanded * tf.tile(tf.expand_dims(attention_mask, 3),
    #                                                           (1, 1, 1, self.query_dim))  # 2*
    # flatten_decoder_with_attention = \
    #     layers.attention_session(query_encoder_expanded, flatten_decoder, enc_dim=self.query_dim,
    #                              dec_dim=self.decoder_dim, reuse=reuse)  # 2*

    output_layer = layers.output_layer(
        flatten_embedder,
        flatten_decoder,  #
        x_dim=self.embedding_dim,
        h_dim=self.decoder_dim,  # 2*
        y_dim=self.output_dim,
        reuse=reuse
    )

    # We compute the output logits based on the output layer above
    flatten_logits, self.l2_loss = layers.logits_layer(
        output_layer,
        self.l2_loss,
        x_dim=self.output_dim,
        y_dim=self.vocab_size,
        reuse=reuse
    )
    logits = tf.reshape(flatten_logits, (num_of_steps, batch_size, self.vocab_size))
    # logits = tf.Print(logits, [np.argmax(logits)], summarize=1500)

    # If we want the softmax back from this step, or just the logits for the loss function
    if return_softmax:
        output = self.softmax(logits)
    else:
        output = logits

    # If we want to continue decoding with single_step we need the hidden states of all GRU layers
    if return_last_with_hidden_states:
        # hidden_decoder = decoder  # there is no resetted decoder output
        # Note for attention mechanism
        return output[-1, :, :], hidden_query[:, :, :]  # , hidden_decoder[-1, :, :]
    else:
        return output
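# Sketch of how the two entry points above fit together, based on their signatures and docstrings:
# step_through_session consumes a whole (max_length x batch_size) batch and can return the last
# output plus the accumulated query-encoder hidden states, which are then fed to single_step to
# decode one token at a time. `hred` stands for an instance of the class these methods belong to
# (its name is not shown in this excerpt); the placeholder names are illustrative assumptions.

X = tf.placeholder(tf.int32, shape=(None, None), name='X')                         # max_length x batch_size
attention_mask = tf.placeholder(tf.float32, shape=(None, None, None), name='attention_mask')
batch_size = tf.shape(X)[1]

last_output, hidden_query = hred.step_through_session(
    X, attention_mask, return_last_with_hidden_states=True, return_softmax=True)

# Zero-initialized states for the session encoder and decoder, shaped as in the docstrings above.
prev_hidden_session = tf.zeros((2, batch_size, hred.session_dim))
prev_hidden_decoder = tf.zeros((batch_size, hred.decoder_dim))

# One decoding step for the next token, reusing the accumulated query-encoder states.
next_word = tf.placeholder(tf.int32, shape=(None,), name='next_word')
softmax, hidden_query, hidden_session, hidden_decoder = hred.single_step(
    next_word, hidden_query, prev_hidden_session, prev_hidden_decoder, reuse=True)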
def single_step(self, X, prev_hidden_query_states, prev_hidden_session, prev_hidden_decoder, reuse=True):
    """
    Performs a step in the HRED. X can be a 2-D tensor (batch, vocab); this can be used for beam search.

    :param X: The input sessions. Lists of ints, ints correspond to words
              Shape: (max_length)
    :param prev_hidden_query_states: The previous hidden states of the query encoder. Initialized with zeros.
              Shape: (2 x query_dim)
    :param prev_hidden_session: The previous hidden state of the session encoder. Initialized with zeros.
              Shape: (2 x session_dim)
    :param prev_hidden_decoder: The previous hidden state of the decoder. Initialized with zeros.
              Shape: (output_dim)
    :return:
    """
    # Note that with the implementation of attention the object "prev_hidden_query_states" contains not
    # only the previous query encoded state but all previous states, therefore we need to get the last
    # query state.
    prev_hidden_query = prev_hidden_query_states[-1, :, :]

    # Making embeddings for x
    embedder = layers.embedding_layer(X, vocab_dim=self.vocab_size, embedding_dim=self.embedding_dim,
                                      reuse=reuse)

    # Mask used to reset the query encoder when the symbol is the End-Of-Query symbol, and to retain the
    # state of the session encoder while the EoQ symbol has not yet been seen.
    eoq_mask = tf.cast(tf.not_equal(X, self.eoq_symbol), tf.float32)

    query_encoder, hidden_query = tf.unstack(layers.gru_layer_with_reset(
        prev_hidden_query,  # h_reset_prev
        (embedder, eoq_mask),
        name='forward_query_encoder',
        x_dim=self.embedding_dim,
        y_dim=self.query_dim,
        reuse=reuse
    ))

    # This part does the same, yet for the session encoder. Here we need the possibility to keep the
    # current state we were at, namely if we have not yet seen a full query. If we have, update the
    # session encoder state.
    session_encoder, hidden_session = tf.unstack(layers.gru_layer_with_retain(
        prev_hidden_session,  # h_retain_prev
        (query_encoder, eoq_mask),
        name='session_encoder',
        x_dim=self.query_dim,
        y_dim=self.session_dim,
        reuse=reuse
    ))

    # This part makes the decoder for a step. The decoder uses the word embeddings, the reset/retain
    # vector and the session encoder, so we give three variables to the decoder GRU. The decoder GRU is
    # somewhat special, as it incorporates the session_encoder into each hidden state update.
    hidden_decoder = layers.gru_layer_with_state_reset(
        prev_hidden_decoder,
        (embedder, eoq_mask, session_encoder),
        name='decoder',
        x_dim=self.embedding_dim,
        h_dim=self.session_dim,
        y_dim=self.decoder_dim,
        reuse=reuse
    )

    decoder = hidden_decoder
    flatten_decoder = tf.reshape(decoder, (-1, self.decoder_dim))

    # Add attention layer.
    # Expand to num_of_steps x batch_size x num_of_steps x query_dim.
    num_of_atten_states = tf.shape(prev_hidden_query_states)[0]
    # tf.Print(num_of_atten_states, [num_of_atten_states], "INFO - single-step ")
    # tf.Print(flatten_decoder, [tf.shape(flatten_decoder)], "INFO - decoder.shape ")
    query_encoder_expanded = tf.transpose(prev_hidden_query_states, [1, 0, 2])

    flatten_decoder_with_attention = \
        layers.attention_step(query_encoder_expanded, flatten_decoder, enc_dim=self.query_dim,
                              dec_dim=self.decoder_dim, reuse=reuse)

    # After the decoder we add an additional output layer
    output = layers.output_layer(
        embedder,
        flatten_decoder_with_attention,  #
        x_dim=self.embedding_dim,
        h_dim=self.decoder_dim + self.query_dim,  #
        y_dim=self.output_dim,
        reuse=reuse
    )

    # We compute the output logits based on the output layer above
    logits = layers.logits_layer(
        output,
        x_dim=self.output_dim,
        y_dim=self.vocab_size,
        reuse=reuse
    )
    softmax = self.softmax(logits)

    return softmax, \
        tf.concat([prev_hidden_query_states, tf.expand_dims(hidden_query, 0)], 0), \
        hidden_session, hidden_decoder
def __init__(self,
             corpus,
             n_filters=(128, 128),
             filter_width=3,
             token_embeddings_dim=100,
             char_embeddings_dim=30,
             use_char_embeddins=True,
             embeddings_dropout=False,
             use_crf=False,
             char_filter_width=3,
             pretrained_model_path=None,
             char_max_len=30):
    tf.reset_default_graph()

    n_tags = len(corpus.tag_dict)
    n_tokens = len(corpus.token_dict)
    n_chars = len(corpus.char_dict)

    # Create placeholders
    x_word = tf.placeholder(dtype=tf.int32, shape=[None, None], name='x_word')
    x_char = tf.placeholder(dtype=tf.int32, shape=[None, None, None], name='x_char')
    y_true = tf.placeholder(dtype=tf.int32, shape=[None, None], name='y_tag')
    mask = tf.placeholder(dtype=tf.int32, shape=[None, None], name='mask')
    learning_rate_ph = tf.placeholder(dtype=tf.float32, shape=[], name='learning_rate')
    dropout_ph = tf.placeholder_with_default(1.0, shape=[])
    training_ph = tf.placeholder_with_default(False, shape=[])
    learning_rate_decay_ph = tf.placeholder(dtype=tf.float32, shape=[], name='learning_rate_decay')
    momentum_ph = tf.placeholder(dtype=tf.float32, shape=[], name='momentum')
    max_grad_ph = tf.placeholder(dtype=tf.float32, shape=[], name='max_grad')

    # Embeddings
    with tf.variable_scope('Embeddings'):
        w_emb = embedding_layer(x_word, n_tokens=n_tokens, token_embedding_dim=token_embeddings_dim,
                                token_embedding_matrix=corpus.emb_mat)
        w_emb = tf.cast(w_emb, tf.float32)
        c_emb = character_embedding_network(x_char,
                                            n_characters=n_chars,
                                            char_embedding_dim=char_embeddings_dim,
                                            filter_width=char_filter_width,
                                            dropout_ph=dropout_ph)
        emb = tf.concat([w_emb, c_emb], axis=-1)

    # Dropout for embeddings
    emb = tf.layers.dropout(emb, dropout_ph, training=training_ph)

    # Make bi-LSTM
    sequence_lengths = tf.reduce_sum(mask, axis=1)
    units = biLSTM(emb, n_filters, sequence_lengths)

    # Dropout
    units = tf.layers.dropout(units, dropout_ph, training=training_ph)

    # Classifier
    with tf.variable_scope('Classifier'):
        logits = tf.layers.dense(units, n_tags, kernel_initializer=xavier_initializer())

    if use_crf:
        log_likelihood, trainsition_params = tf.contrib.crf.crf_log_likelihood(
            logits, y_true, sequence_lengths)
        loss_tensor = -log_likelihood
        predictions = None
    else:
        ground_truth_labels = tf.one_hot(y_true, n_tags)
        loss_tensor = tf.nn.softmax_cross_entropy_with_logits(labels=ground_truth_labels, logits=logits)
        # The mask placeholder is int32, so cast it before weighting the per-token losses
        loss_tensor = loss_tensor * tf.cast(mask, tf.float32)
        predictions = tf.argmax(logits, axis=-1)

    loss = tf.reduce_mean(loss_tensor)

    # Initialize session
    sess = tf.Session()

    self._use_crf = use_crf
    self._learning_rate_decay_ph = learning_rate_decay_ph
    self._x_w = x_word
    self._x_c = x_char
    self._y_true = y_true
    self._y_pred = predictions
    self._learning_rate_ph = learning_rate_ph
    self._dropout = dropout_ph
    self._loss = loss
    self._sess = sess
    self.corpus = corpus
    self._loss_tensor = loss_tensor
    self._use_dropout = embeddings_dropout
    self._training_ph = training_ph
    if use_crf:
        self._logits = logits
        self._trainsition_params = trainsition_params
        self._sequence_lengths = sequence_lengths

    self.filewriter = tf.summary.FileWriter('graphs', sess.graph)
    self.summary = tf.summary.merge_all()
    self._train_op = self.get_train_op(loss, learning_rate_ph,
                                       lr_decay_rate=learning_rate_decay_ph,
                                       momentum=momentum_ph,
                                       max_grad=max_grad_ph)
    sess.run(tf.global_variables_initializer())
    self._mask = mask
    if pretrained_model_path is not None:
        self.load(pretrained_model_path)
    self._momentum = momentum_ph
    self._max_grad = max_grad_ph
def build_model(placeholders, info, batch_size=4, adj_channel_num=1, embedding_dim=10):
    sequences = placeholders["sequences"]
    sequences_len = placeholders["sequences_len"]
    labels = placeholders["labels"]
    mask = placeholders["mask"]
    dropout_rate = placeholders["dropout_rate"]
    mask_label = placeholders["mask_label"]
    wd_b = None
    wd_w = 0.1
    is_train = placeholders["is_train"]
    dropout_rate = 1 - dropout_rate

    ###
    ### Sequence part
    ###
    with tf.variable_scope("seq_nn") as scope_part:
        # Embedding
        embedding_dim = 25
        layer = layers.embedding_layer("embedding", sequences, info.sequence_symbol_num, embedding_dim,
                                       init_params_flag=True, params=None)
        # CNN + Pooling
        stride = 4
        layer = klayer.convolutional.Conv1D(505, stride, padding="same", activation='relu')(layer)
        layer = klayer.pooling.MaxPooling1D(stride)(layer)

        stride = 3
        layer = klayer.convolutional.Conv1D(200, stride, padding="same", activation='relu')(layer)
        layer = klayer.pooling.MaxPooling1D(stride)(layer)

        stride = 2
        layer = klayer.convolutional.Conv1D(100, stride, padding="same", activation='relu')(layer)
        layer = klayer.pooling.MaxPooling1D(stride)(layer)

        layer = klayer.convolutional.Conv1D(1, stride, padding="same", activation='tanh')(layer)
        layer = tf.squeeze(layer)

    output_dim = info.label_dim
    logits = mu.multitask_logits(layer, labels.shape[1])
    model = logits

    # Compute the cost: batch-averaged loss for each of the 12 tasks
    task_losses = mu.add_training_loss(logits=logits, label=labels, pos_weight=info.pos_weight,
                                       batch_size=batch_size, n_tasks=labels.shape[1], mask=mask_label)
    total_loss = tf.reduce_sum(task_losses)  # sum the losses over all tasks

    ### multi-task loss
    cost_opt = task_losses
    each_cost = task_losses

    # Binary probability predictions: 12 x 50 x 2
    prediction = mu.add_softmax(logits)

    metrics = {}
    cost_sum = total_loss
    # cost_sum = cost_opt

    metrics["each_cost"] = task_losses

    metrics["each_correct_count"] = {}
    for i in range(labels.shape[1]):
        equal_cnt = mask_label[:, i] * tf.cast(
            tf.equal(tf.cast(tf.argmax(prediction[i], 1), tf.int16), tf.cast(labels[:, i], tf.int16)),
            tf.float32)
        each_correct_count = tf.cast(tf.reduce_sum(equal_cnt, axis=0), tf.float32)
        metrics["each_correct_count"][i] = each_correct_count
    # correct_count = 0  # mask*tf.cast(tf.reduce_all(tf.equal(tf.cast(tf.argmax(prediction, 1), tf.int16),
    #                    #                                     tf.cast(labels, tf.int16)), axis=1), tf.float32)
    metrics["correct_count"] = sum([metrics["each_correct_count"][i] for i in range(labels.shape[1])])

    return model, prediction, cost_opt, cost_sum, metrics
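# build_model expects `placeholders` to be a dict containing the keys read at the top of the
# function. A minimal sketch of constructing such a dict; only the keys are taken from the code
# above, while the helper name, shapes, and dtypes below are illustrative assumptions:

import tensorflow as tf

def make_placeholders(n_tasks=12, max_seq_len=None):
    return {
        "sequences":     tf.placeholder(tf.int32,   shape=(None, max_seq_len), name="sequences"),
        "sequences_len": tf.placeholder(tf.int32,   shape=(None,),             name="sequences_len"),
        "labels":        tf.placeholder(tf.int32,   shape=(None, n_tasks),     name="labels"),
        "mask":          tf.placeholder(tf.float32, shape=(None,),             name="mask"),
        "mask_label":    tf.placeholder(tf.float32, shape=(None, n_tasks),     name="mask_label"),
        "dropout_rate":  tf.placeholder(tf.float32, shape=(),                  name="dropout_rate"),
        "is_train":      tf.placeholder(tf.bool,    shape=(),                  name="is_train"),
    }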