def _build(self, features, labels, params):
    """Assemble the model graph and return predictions, loss and train op.

    The inputs are preprocessed, encoded and decoded. In INFER mode only
    predictions are created. In every other mode the loss is computed as
    well, and a training op is built when the mode is TRAIN.

    Args:
        features: Model inputs, forwarded to `_preprocess`, `encode`,
            `decode` and `create_predictions`.
        labels: Model targets, forwarded to the same methods.
        params: Not used by this method.

    Returns:
        A `(predictions, loss, train_op)` tuple. `loss` and `train_op` are
        `None` in INFER mode; `train_op` is also `None` outside TRAIN mode.
    """
    features, labels = self._preprocess(features, labels)
    encoded = self.encode(features, labels)
    decoded, _ = self.decode(encoded, features, labels)

    # Debug trace of the freshly built encoder and decoder outputs.
    for debug_item in ("build seq2seq model", encoded, decoded, "build done"):
        print(debug_item)

    mode_keys = tf.contrib.learn.ModeKeys
    prediction_kwargs = {
        "decoder_output": decoded,
        "features": features,
        "labels": labels,
    }
    loss = None
    train_op = None
    if self.mode != mode_keys.INFER:
        losses, loss = self.compute_loss(decoded, features, labels)
        if self.mode == mode_keys.TRAIN:
            train_op = self._build_train_op(loss)
        # Per-example losses are only part of the predictions outside inference.
        prediction_kwargs["losses"] = losses
    predictions = self.create_predictions(**prediction_kwargs)

    # Register the prediction tensors in a graph collection so that hooks and
    # monitors can find them.
    graph_utils.add_dict_to_collection(predictions, "predictions")
    return predictions, loss, train_op
def create_lookup_table(self):
    """Create the vocabulary tables and expose them on the graph and on `self`.

    A single set of tables is built from `self._vocab_instance` and reused
    for both the source and the target side. The tables are registered in the
    "vocab_tables" graph collection, and the lookup tables plus the original
    vocabulary size are stored as `_source_*` / `_target_*` attributes.
    """
    tables = vocab.create_tensor_vocab(self._vocab_instance)
    vocab_to_id, id_to_vocab, word_to_count, origin_vocab_size = tables

    # Source and target share the very same tables; register them under both
    # prefixes so that other components can look them up by name.
    collection_entries = {}
    for side in ("source", "target"):
        collection_entries[side + "_vocab_to_id"] = vocab_to_id
        collection_entries[side + "_id_to_vocab"] = id_to_vocab
        collection_entries[side + "_word_to_count"] = word_to_count
    graph_utils.add_dict_to_collection(collection_entries, "vocab_tables")

    self._source_vocab_to_id = self._target_vocab_to_id = vocab_to_id
    self._source_id_to_vocab = self._target_id_to_vocab = id_to_vocab
    self._source_origin_vocab_size = origin_vocab_size
    self._target_origin_vocab_size = origin_vocab_size
def _build(self, features, labels, params):
    """Build the model graph: preprocess, encode, decode, then predict.

    Args:
        features: Model inputs, forwarded to `_preprocess`, `encode`,
            `decode` and `_create_predictions`.
        labels: Model targets, forwarded to the same methods.
        params: Not used by this method.

    Returns:
        A `(predictions, loss, train_op)` tuple. `loss` and `train_op` are
        `None` in INFER mode; `train_op` is also `None` outside TRAIN mode.
    """
    features, labels = self._preprocess(features, labels)
    encoded = self.encode(features, labels)
    decoded, _ = self.decode(encoded, features, labels)

    loss = None
    train_op = None
    # Check whether we are running inference: the arguments used to create
    # the predictions differ between inference and training/evaluation.
    is_inference = self.mode == tf.contrib.learn.ModeKeys.INFER
    if is_inference:
        predictions = self._create_predictions(
            decoder_output=decoded, features=features, labels=labels)
    else:
        losses, loss = self.compute_loss(decoded, features, labels)
        if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
            train_op = self._build_train_op(loss)
        predictions = self._create_predictions(
            decoder_output=decoded,
            features=features,
            labels=labels,
            losses=losses)

    # Register the prediction tensors in a graph collection so that hooks and
    # monitors can find them.
    graph_utils.add_dict_to_collection(predictions, "predictions")
    return predictions, loss, train_op
def _build(self, features, labels, params):
    """Build the model graph and return predictions, loss and train op.

    Args:
        features: Model inputs, forwarded to `_preprocess`, `encode`,
            `decode` and `_create_predictions`.
        labels: Model targets, forwarded to the same methods.
        params: Not used by this method.

    Returns:
        A `(predictions, loss, train_op)` tuple. `loss` and `train_op` are
        `None` in INFER mode; `train_op` is also `None` outside TRAIN mode.
    """
    features, labels = self._preprocess(features, labels)
    encoded = self.encode(features, labels)
    decoded, _ = self.decode(encoded, features, labels)

    mode_keys = tf.contrib.learn.ModeKeys
    prediction_kwargs = dict(
        decoder_output=decoded, features=features, labels=labels)
    loss, train_op = None, None

    if self.mode != mode_keys.INFER:
        losses, loss = self.compute_loss(decoded, features, labels)
        if self.mode == mode_keys.TRAIN:
            train_op = self._build_train_op(loss)
        # Losses are only handed to the predictions outside inference.
        prediction_kwargs["losses"] = losses

    predictions = self._create_predictions(**prediction_kwargs)

    # Register the prediction tensors in a graph collection so that hooks and
    # monitors can find them.
    graph_utils.add_dict_to_collection(predictions, "predictions")
    return predictions, loss, train_op
def _build(self, features, labels, params):
    """Build the model graph and return predictions, loss and train op.

    Args:
        features: Model inputs, forwarded to `_preprocess`, `encode`,
            `decode` and `_create_predictions`.
        labels: Model targets, forwarded to the same methods.
        params: Not used by this method.

    Returns:
        A plain `(predictions, loss, train_op)` tuple. `loss` and `train_op`
        are `None` in INFER mode; `train_op` is also `None` outside TRAIN
        mode.
    """
    features, labels = self._preprocess(features, labels)
    encoded = self.encode(features, labels)
    decoded, _ = self.decode(encoded, features, labels)

    infer_mode = self.mode == tf.contrib.learn.ModeKeys.INFER
    loss = train_op = None
    loss_kwargs = {}
    if not infer_mode:
        losses, loss = self.compute_loss(decoded, features, labels)
        if self.mode == tf.contrib.learn.ModeKeys.TRAIN:
            train_op = self._build_train_op(loss)
        loss_kwargs["losses"] = losses

    predictions = self._create_predictions(
        decoder_output=decoded,
        features=features,
        labels=labels,
        **loss_kwargs)

    # Register the prediction tensors in a graph collection so that hooks and
    # monitors can find them.
    graph_utils.add_dict_to_collection(predictions, "predictions")

    # Original note: returning three plain elements is fine here; the
    # estimator is said to wrap them into model_fn_lib.ModelFnOps
    # automatically (not verifiable from this method -- confirm at the caller).
    return predictions, loss, train_op
def setUp(self):
    """Create a temporary model directory and register fake length tensors.

    A "source_len" tensor is added to the "features" graph collection and a
    "target_len" tensor to the "labels" graph collection.
    """
    super(TestTokenCounter, self).setUp()
    self.model_dir = tempfile.mkdtemp()

    fake_features = {"source_len": tf.constant([[2, 3]])}
    graph_utils.add_dict_to_collection(fake_features, "features")

    fake_labels = {"target_len": tf.constant([4, 6])}
    graph_utils.add_dict_to_collection(fake_labels, "labels")
def encode(self, features, labels):
    """Run the parent encoder and publish its output in a graph collection.

    Every field of the encoder output except "final_state" is added to the
    "encoder_output" graph collection so that it can be found later.

    Args:
        features: Forwarded unchanged to the parent `encode`.
        labels: Forwarded unchanged to the parent `encode`.

    Returns:
        The unmodified result of the parent `encode` call.
    """
    encoder_output = super(DiscriminatorSeq2Seq, self).encode(features, labels)

    exported_fields = collections.OrderedDict()
    for field_name, field_value in zip(encoder_output._fields, encoder_output):
        exported_fields[field_name] = field_value
    # The final state is deliberately left out of the collection; like the
    # original `pop`, this raises KeyError if the field does not exist.
    del exported_fields["final_state"]

    graph_utils.add_dict_to_collection(exported_fields, "encoder_output")
    return encoder_output
def setUp(self):
    """Create a sample directory and register fake predictions on the graph."""
    super(TestTrainSampleHook, self).setUp()
    self.sample_dir = tempfile.mkdtemp()

    # The hook expects these entries to be present in the "predictions"
    # graph collection.
    pred_dict = {
        "predicted_tokens": tf.constant([["Hello", "World", "笑w"]]),
        "labels.target_tokens": tf.constant([["Hello", "World", "笑w"]]),
        # NOTE(review): this value is a one-element tuple wrapping the tensor,
        # not a bare tensor. It is kept exactly as it was; confirm whether the
        # hook relies on the tuple before simplifying it.
        "labels.target_len": (tf.constant(2),),
    }
    graph_utils.add_dict_to_collection(pred_dict, "predictions")
def setUp(self):
    """Create model/sample directories and register fake predictions.

    `self.sample_dir` is the "samples" path below a fresh temporary
    `self.model_dir`; the path is only computed here, not created.
    """
    super(TestTrainSampleHook, self).setUp()
    model_dir = tempfile.mkdtemp()
    self.model_dir = model_dir
    self.sample_dir = os.path.join(model_dir, "samples")

    # The hook expects these entries to be present in the "predictions"
    # graph collection.
    pred_dict = {
        "predicted_tokens": tf.constant([["Hello", "World", "笑w"]]),
        "labels.target_tokens": tf.constant([["Hello", "World", "笑w"]]),
        # NOTE(review): this value is a one-element tuple wrapping the tensor,
        # not a bare tensor. It is kept exactly as it was; confirm whether the
        # hook relies on the tuple before simplifying it.
        "labels.target_len": (tf.constant(2),),
    }
    graph_utils.add_dict_to_collection(pred_dict, "predictions")
def _build(self, features, labels):
    """Convert source and target tokens into vocabulary ids.

    Lookup tables are created from the source and target vocabulary files and
    registered in the "vocab_tables" graph collection. Sequences are
    optionally truncated to the configured maximum lengths, token ids are
    looked up, lengths are cast to int32 and length histograms are recorded.

    Args:
        features: Mapping with "source_tokens" and "source_len". It is
            modified in place ("source_ids" is added).
        labels: Mapping with "target_tokens" and "target_len", or `None`.
            A shallow copy is modified, not the argument itself.

    Returns:
        A `(features, labels)` tuple; the second element is `None` when no
        labels were given.
    """
    src_to_id, src_to_vocab, _ = vocab.create_vocabulary_lookup_table(
        self.source_vocab_info.path)
    tgt_to_id, tgt_to_vocab, _ = vocab.create_vocabulary_lookup_table(
        self.target_vocab_info.path)

    # Make the tables reachable from other places through the graph.
    vocab_tables = {
        "source_vocab_to_id": src_to_id,
        "source_id_to_vocab": src_to_vocab,
        "target_vocab_to_id": tgt_to_id,
        "target_id_to_vocab": tgt_to_vocab,
    }
    graph_utils.add_dict_to_collection(vocab_tables, "vocab_tables")

    # Truncate the source side when a maximum length is configured.
    max_source_len = self.max_seq_len_source
    if max_source_len is not None:
        features["source_tokens"] = features["source_tokens"][:, :max_source_len]
        features["source_len"] = tf.minimum(
            features["source_len"], max_source_len)

    features["source_ids"] = src_to_id.lookup(features["source_tokens"])
    features["source_len"] = tf.to_int32(features["source_len"])
    tf.summary.histogram("source_len", tf.to_float(features["source_len"]))

    if labels is None:
        return features, None

    labels = labels.copy()

    # Truncate the target side when a maximum length is configured.
    max_target_len = self.max_seq_len_target
    if max_target_len is not None:
        labels["target_tokens"] = labels["target_tokens"][:, :max_target_len]
        labels["target_len"] = tf.minimum(labels["target_len"], max_target_len)

    labels["target_ids"] = tgt_to_id.lookup(labels["target_tokens"])
    labels["target_len"] = tf.to_int32(labels["target_len"])
    tf.summary.histogram("target_len", tf.to_float(labels["target_len"]))

    return features, labels
def topic_softmax(logits_message, logits_topic, batch_size):
    """Return a scaled softmax over the last axis of the message logits.

    Both logit tensors are multiplied by 0.1 and exponentiated. The returned
    value is the exponentiated message logits divided by their sum over the
    last axis. The topic logits do not contribute to the returned value at
    present; their exponentials are only exported for inspection.

    All intermediate tensors, including the indices of NaN entries in both
    exponentials, are added to the "logits_softmax" graph collection.

    Args:
        logits_message: Message logits. The tile multiples `[1, 1, vocab]`
            used below require a rank-3 tensor whose last dimension is
            statically known.
        logits_topic: Topic logits; only exponentiated and exported.
        batch_size: Unused; kept so that existing callers keep working.

    Returns:
        A tensor shaped like `logits_message` holding the normalized values.
    """
    logits_message_exp = tf.exp(logits_message * 0.1)
    logits_topic_exp = tf.exp(logits_topic * 0.1)

    # Indices of NaN entries, exported to help debugging numerical issues.
    logits_message_exp_nan = tf.where(tf.is_nan(logits_message_exp))
    logits_topic_exp_nan = tf.where(tf.is_nan(logits_topic_exp))

    # Normalizer: sum over the last axis, kept as a trailing axis of size 1.
    logits_exp_sum = tf.expand_dims(tf.reduce_sum(logits_message_exp, -1), -1)

    vocab_size = logits_message.get_shape().as_list()[-1]
    # Always tile the normalizer to the full vocabulary width. This used to
    # sit behind `if tf.contrib.learn.ModeKeys.TRAIN:`, which tests a
    # non-empty string constant and is therefore always true; it never
    # distinguished between modes, so the guard has been removed.
    logits_exp_sum = tf.tile(logits_exp_sum, [1, 1, vocab_size])

    logits_softmax_output = logits_message_exp / logits_exp_sum

    graph_utils.add_dict_to_collection({
        "logits_message_exp": logits_message_exp,
        "logits_topic_exp": logits_topic_exp,
        "logits_message_exp_nan": logits_message_exp_nan,
        "logits_topic_exp_nan": logits_topic_exp_nan,
        "logits_exp_sum": logits_exp_sum,
        "logits_softmax_output": logits_softmax_output,
    }, "logits_softmax")
    return logits_softmax_output
def _build(self, features, labels, params):
    """Build the model graph, scaling encoder gradients when training.

    In TRAIN mode a gradient multiplier of
    `1.0 / (2 * params["decoder.params"]["cnn.layers"])` is assigned to every
    trainable variable under the scope 'model/conv_seq2seq/encode' whose name
    contains neither 'encode/W' nor 'encode/pos', and the multipliers are
    handed to `_build_train_op`.

    Args:
        features: Model inputs, forwarded to `_preprocess`, `encode`,
            `decode` and `_create_predictions`.
        labels: Model targets, forwarded to the same methods.
        params: Not used by this method (`self.params` is read instead).

    Returns:
        A `(predictions, loss, train_op)` tuple. `loss` and `train_op` are
        `None` in INFER mode; `train_op` is also `None` outside TRAIN mode.
    """
    features, labels = self._preprocess(features, labels)
    encoded = self.encode(features, labels)
    decoded, _ = self.decode(encoded, features, labels)

    mode_keys = tf.contrib.learn.ModeKeys
    loss = None
    train_op = None
    prediction_kwargs = {
        "decoder_output": decoded,
        "features": features,
        "labels": labels,
    }

    if self.mode != mode_keys.INFER:
        losses, loss = self.compute_loss(decoded, features, labels)
        if self.mode == mode_keys.TRAIN:
            # Multiply the gradient by 1.0 / (2 * number of layers).
            gradient_multipliers = {}
            encoder_variables = tf.get_collection(
                tf.GraphKeys.TRAINABLE_VARIABLES,
                scope='model/conv_seq2seq/encode')
            for variable in encoder_variables:
                keep_unscaled = ('encode/W' in variable.name
                                 or 'encode/pos' in variable.name)
                if not keep_unscaled:
                    tf.logging.info(
                        "tensor %s, name is %s", variable, variable.name)
                    layer_count = self.params["decoder.params"]["cnn.layers"]
                    gradient_multipliers[variable] = 1.0 / (2 * layer_count)
            train_op = self._build_train_op(
                loss, gradient_multipliers=gradient_multipliers)
        prediction_kwargs["losses"] = losses

    predictions = self._create_predictions(**prediction_kwargs)

    # Register the prediction tensors in a graph collection so that hooks and
    # monitors can find them.
    graph_utils.add_dict_to_collection(predictions, "predictions")
    return predictions, loss, train_op
def _preprocess(self, features, labels):
    """Look up target token ids and register the inputs on the graph.

    - Creates the target vocabulary lookup tables and adds them to the
      "vocab_tables" graph collection.
    - Optionally truncates the targets to `params["target.max_seq_len"]`.
    - Converts target tokens into vocabulary ids.
    - Adds features and labels to the "features" / "labels" collections.

    Args:
        features: Model inputs; returned unchanged.
        labels: Mapping with "target_tokens" and "target_len", or `None`.
            A shallow copy is modified, not the argument itself.

    Returns:
        A `(features, labels)` tuple. When `labels` is `None` the method
        returns `(features, None)` right after registering the vocabulary
        tables, without touching the "features" collection.
    """
    lookup = vocab.create_vocabulary_lookup_table(self.target_vocab_info.path)
    target_vocab_to_id, target_id_to_vocab, target_word_to_count, _ = lookup

    # Make the tables reachable from other places through the graph.
    vocab_tables = {
        "target_vocab_to_id": target_vocab_to_id,
        "target_id_to_vocab": target_id_to_vocab,
        "target_word_to_count": target_word_to_count,
    }
    graph_utils.add_dict_to_collection(vocab_tables, "vocab_tables")

    if labels is None:
        return features, None

    labels = labels.copy()

    # Truncate the targets when a maximum length is configured.
    max_target_len = self.params["target.max_seq_len"]
    if max_target_len is not None:
        labels["target_tokens"] = labels["target_tokens"][:, :max_target_len]
        labels["target_len"] = tf.minimum(labels["target_len"], max_target_len)

    labels["target_ids"] = target_vocab_to_id.lookup(labels["target_tokens"])
    labels["target_len"] = tf.to_int32(labels["target_len"])
    tf.summary.histogram("target_len", tf.to_float(labels["target_len"]))

    # Keep the processed inputs around in graph collections for later use.
    graph_utils.add_dict_to_collection(features, "features")
    if labels:
        graph_utils.add_dict_to_collection(labels, "labels")

    return features, labels
def _preprocess(self, features, labels):
    """Look up target token ids and register the inputs on the graph.

    The target vocabulary tables are created and added to the "vocab_tables"
    graph collection. When labels are given, they are copied, optionally
    truncated to `params["target.max_seq_len"]`, converted to vocabulary ids,
    and finally registered (together with the features) in the "labels" and
    "features" graph collections.

    Args:
        features: Model inputs; returned unchanged.
        labels: Mapping with "target_tokens" and "target_len", or `None`.

    Returns:
        A `(features, labels)` tuple. When `labels` is `None` the result is
        `(features, None)` and nothing is added to the "features" collection.
    """
    to_id, to_vocab, word_to_count, _ = vocab.create_vocabulary_lookup_table(
        self.target_vocab_info.path)

    # Make the tables reachable from other places through the graph.
    table_names = ("target_vocab_to_id", "target_id_to_vocab",
                   "target_word_to_count")
    vocab_tables = {}
    for table_name, table in zip(table_names, (to_id, to_vocab, word_to_count)):
        vocab_tables[table_name] = table
    graph_utils.add_dict_to_collection(vocab_tables, "vocab_tables")

    if labels is None:
        return features, None

    labels = labels.copy()

    # Truncate the targets when a maximum length is configured.
    length_limit = self.params["target.max_seq_len"]
    if length_limit is not None:
        labels["target_tokens"] = labels["target_tokens"][:, :length_limit]
        labels["target_len"] = tf.minimum(labels["target_len"], length_limit)

    labels["target_ids"] = to_id.lookup(labels["target_tokens"])
    labels["target_len"] = tf.to_int32(labels["target_len"])
    tf.summary.histogram("target_len", tf.to_float(labels["target_len"]))

    # Keep the processed inputs around in graph collections for later use.
    graph_utils.add_dict_to_collection(features, "features")
    if labels:
        graph_utils.add_dict_to_collection(labels, "labels")

    return features, labels
def _build(self, features, labels, params, mode):
    """Build the graph for inference, evaluation or training.

    The featurizer for `mode` is applied to the inputs, source and target
    embedding variables are created and the source ids are embedded. In
    INFER mode the decoder is fed its own embedded predictions and the
    method returns early; otherwise the embedded targets are fed to the
    decoder and loss and training op are built.

    Args:
        features: Passed to the featurizer; afterwards read via the keys
            "source_ids" and "source_len".
        labels: Passed to the featurizer; outside INFER mode read via the
            keys "target_ids" and "target_len".
        params: Not used by this method (`self.params` is read instead).
        mode: One of the `tf.contrib.learn.ModeKeys` values.

    Returns:
        `(predictions, None, None)` in INFER mode, otherwise
        `(predictions, loss, train_op)` where `train_op` is `None` in EVAL
        mode.
    """
    # Pre-process features and labels
    features, labels = self.create_featurizer(mode)(features, labels)

    # Add to graph collection for later use
    graph_utils.add_dict_to_collection(features, "features")
    if labels:
        graph_utils.add_dict_to_collection(labels, "labels")

    # Optionally reverse each source sequence within its own length.
    source_ids = features["source_ids"]
    if self.params["source.reverse"] is True:
        source_ids = tf.reverse_sequence(
            input=features["source_ids"],
            seq_lengths=features["source_len"],
            seq_dim=1,
            batch_dim=0,
            name=None)

    # Create embeddings
    source_embedding = tf.get_variable(
        "source_embedding",
        [self.source_vocab_info.total_size, self.params["embedding.dim"]])
    target_embedding = tf.get_variable(
        "target_embedding",
        [self.target_vocab_info.total_size, self.params["embedding.dim"]])

    # Embed source
    source_embedded = tf.nn.embedding_lookup(source_embedding, source_ids)

    # Graph used for inference
    if mode == tf.contrib.learn.ModeKeys.INFER:
        target_start_id = self.target_vocab_info.special_vocab.SEQUENCE_START
        # Embed the "SEQUENCE_START" token, one copy per entry of
        # features["source_len"].
        initial_input = tf.nn.embedding_lookup(
            target_embedding,
            tf.ones_like(features["source_len"]) * target_start_id)

        def make_input_fn(predictions):
            """Return the target embedding of `predictions` as the next input."""
            return tf.nn.embedding_lookup(target_embedding, predictions)

        def elements_finished_fn(_time_, predictions):
            """Return a boolean tensor: True where the prediction is SEQUENCE_END."""
            return tf.equal(
                predictions,
                tf.cast(self.target_vocab_info.special_vocab.SEQUENCE_END,
                        dtype=predictions.dtype))

        decoder_input_fn_infer = decoders.DynamicDecoderInputs(
            initial_inputs=initial_input,
            make_input_fn=make_input_fn,
            max_decode_length=self.params["inference.max_decode_length"],
            elements_finished_fn=elements_finished_fn)

        # Decode
        decoder_output = self.encode_decode(
            source=source_embedded,
            source_len=features["source_len"],
            decoder_input_fn=decoder_input_fn_infer,
            mode=mode)
        predictions = self._create_predictions(
            decoder_output=decoder_output,
            features=features,
            labels=labels)
        # Inference returns here: no loss, no train op, and the predictions
        # are not added to the "predictions" collection on this path.
        return predictions, None, None

    # Embed target
    target_embedded = tf.nn.embedding_lookup(target_embedding,
                                             labels["target_ids"])

    # During training/eval, we have labels and use them for teacher forcing
    # We don't feed the last SEQUENCE_END token
    decoder_input_fn_train = decoders.FixedDecoderInputs(
        inputs=target_embedded[:, :-1],
        sequence_length=labels["target_len"] - 1)

    decoder_output = self.encode_decode(
        source=source_embedded,
        source_len=features["source_len"],
        decoder_input_fn=decoder_input_fn_train,
        mode=mode)

    # Calculate the loss per example and timestep. The targets drop their
    # first position and are transposed before being handed to the loss.
    # (Original note says the result has shape [B, T] -- TODO confirm
    # against cross_entropy_sequence_loss.)
    losses = seq2seq_losses.cross_entropy_sequence_loss(
        logits=decoder_output.logits[:, :, :],
        targets=tf.transpose(labels["target_ids"][:, 1:], [1, 0]),
        sequence_length=labels["target_len"] - 1)

    # Calculate the average log perplexity: summed losses divided by the
    # total number of target positions that are scored.
    loss = tf.reduce_sum(losses) / tf.to_float(
        tf.reduce_sum(labels["target_len"] - 1))

    # An empty/falsy decay type is normalized to None.
    learning_rate_decay_fn = training_utils.create_learning_rate_decay_fn(
        decay_type=self.params["optimizer.lr_decay_type"] or None,
        decay_steps=self.params["optimizer.lr_decay_steps"],
        decay_rate=self.params["optimizer.lr_decay_rate"],
        start_decay_at=self.params["optimizer.lr_start_decay_at"],
        stop_decay_at=self.params["optimizer.lr_stop_decay_at"],
        min_learning_rate=self.params["optimizer.lr_min_learning_rate"],
        staircase=self.params["optimizer.lr_staircase"])

    # NOTE(review): the optimizer ops are created for EVAL as well and only
    # the returned handle is dropped below -- confirm this is intended.
    train_op = tf.contrib.layers.optimize_loss(
        loss=loss,
        global_step=tf.contrib.framework.get_global_step(),
        learning_rate=self.params["optimizer.learning_rate"],
        learning_rate_decay_fn=learning_rate_decay_fn,
        clip_gradients=self.params["optimizer.clip_gradients"],
        optimizer=self.params["optimizer.name"],
        summaries=tf.contrib.layers.optimizers.OPTIMIZER_SUMMARIES)

    if mode == tf.contrib.learn.ModeKeys.EVAL:
        train_op = None

    predictions = self._create_predictions(decoder_output=decoder_output,
                                           features=features,
                                           labels=labels,
                                           losses=losses)

    # We add "useful" tensors to the graph collection so that we
    # can easily find them in our hooks/monitors.
    graph_utils.add_dict_to_collection(predictions, "predictions")

    return predictions, loss, train_op
def _preprocess(self, features, labels):
    """Model-specific preprocessing for features and labels.

    - Creates the vocabulary lookup tables via `create_lookup_table`.
    - Optionally truncates source/target tokens and lengths to the
      configured maximum lengths.
    - Casts lengths and out-of-vocabulary counts to int32 and records
      summaries for them.
    - Keeps a running count of processed tokens.
    - Adds features and labels to the "features" / "labels" collections.

    Args:
        features: Mapping read via "source_tokens", "source_len",
            "source_ids" and "source_oov_nums". It is modified in place.
        labels: Mapping read via "target_tokens", "target_len" and
            "target_ids", or `None`. A shallow copy is modified.

    Returns:
        A `(features, labels)` tuple. When `labels` is `None` the result is
        `(features, None)`; token counting and the collection registration
        are skipped in that case.

    Raises:
        NotImplementedError: If `params["source.reverse"]` is `True`.
    """
    self.create_lookup_table()

    # Slice source to max_len
    max_source_len = self.params["source.max_seq_len"]
    if max_source_len is not None:
        features["source_tokens"] = features["source_tokens"][:, :max_source_len]
        features["source_len"] = tf.minimum(features["source_len"],
                                            max_source_len)

    # Look up the source ids in the vocabulary. (Original note: every
    # sequence contains the sequence-end flag.)
    graph_source_ids = self._source_vocab_to_id.lookup(features["source_tokens"])
    # NOTE(review): the op returned by tf.assert_equal is discarded, so in
    # graph mode this check is never executed. It is left unwired on purpose,
    # because "source_tokens" may have been truncated above while "source_ids"
    # is not -- confirm the intent before adding a control dependency.
    tf.assert_equal(graph_source_ids, features["source_ids"])

    features["source_oov_nums"] = tf.cast(features["source_oov_nums"], tf.int32)
    features["source_max_oov_num"] = tf.reduce_max(features["source_oov_nums"])

    # Source reversal is disabled. `NotImplemented` is a comparison sentinel,
    # not an exception class; calling it fails with an unrelated TypeError,
    # so the proper exception type is raised instead. If reversal is ever
    # re-enabled, all per-token features ("source_ids", "extend_source_ids",
    # "source_pos_ids", "source_tfidfs", "source_ner_ids") have to be passed
    # through tf.reverse_sequence with seq_lengths=features["source_len"].
    if self.params["source.reverse"] is True:
        raise NotImplementedError("reverse func is not stable now")

    features["source_len"] = tf.cast(features["source_len"], tf.int32)
    tf.summary.histogram("source_len", features["source_len"])
    tf.summary.histogram("source_oov_nums", features["source_oov_nums"])
    tf.summary.scalar("batch_max_oov_words", features["source_max_oov_num"])

    if labels is None:
        return features, None

    labels = labels.copy()

    # Slices targets to max length
    max_target_len = self.params["target.max_seq_len"]
    if max_target_len is not None:
        labels["target_tokens"] = labels["target_tokens"][:, :max_target_len]
        labels["target_len"] = tf.minimum(labels["target_len"], max_target_len)

    # Look up the target ids in the vocabulary
    graph_target_ids = self._target_vocab_to_id.lookup(labels["target_tokens"])
    # NOTE(review): as above, this assertion op is created but never run.
    tf.assert_equal(graph_target_ids, labels["target_ids"])

    labels["target_len"] = tf.to_int32(labels["target_len"])
    tf.summary.histogram("target_len", tf.to_float(labels["target_len"]))

    # Keep track of the number of processed tokens
    num_tokens = tf.reduce_sum(labels["target_len"])
    num_tokens += tf.reduce_sum(features["source_len"])
    # The second positional parameter of tf.Variable is `trainable`, not
    # `name`; passing "tokens_counter" there silently made the counter a
    # trainable variable. The counter is now explicitly non-trainable. The
    # automatically generated variable name is kept as it was, because
    # renaming the variable would change its key in existing checkpoints.
    token_counter_var = tf.Variable(0, trainable=False, dtype=tf.int32)
    total_tokens = tf.assign_add(token_counter_var, num_tokens)
    tf.summary.scalar("num_tokens", total_tokens)

    # Tie the counter update to the source tokens so that it runs whenever
    # the tokens are evaluated.
    with tf.control_dependencies([total_tokens]):
        features["source_tokens"] = tf.identity(features["source_tokens"])

    # Add to graph collection for later use
    graph_utils.add_dict_to_collection(features, "features")
    if labels:
        graph_utils.add_dict_to_collection(labels, "labels")

    return features, labels
def _preprocess(self, features, labels):
    """Model-specific preprocessing for features and labels.

    - Creates vocabulary lookup tables for the source and target vocab and
      adds them to the "vocab_tables" graph collection.
    - Optionally truncates sequences to the configured maximum lengths.
    - Converts tokens into vocabulary ids, optionally reversing the source.
    - Keeps a running count of processed tokens.
    - Adds features and labels to the "features" / "labels" collections.

    Args:
        features: Mapping with "source_tokens" and "source_len". It is
            modified in place.
        labels: Mapping with "target_tokens" and "target_len", or `None`.
            A shallow copy is modified, not the argument itself.

    Returns:
        A `(features, labels)` tuple. When `labels` is `None` the result is
        `(features, None)`; token counting and the "features" collection
        registration are skipped in that case.
    """
    # Create vocabulary lookup for source
    source_vocab_to_id, source_id_to_vocab, source_word_to_count, _ = \
        vocab.create_vocabulary_lookup_table(self.source_vocab_info.path)

    # Create vocabulary lookup for target
    target_vocab_to_id, target_id_to_vocab, target_word_to_count, _ = \
        vocab.create_vocabulary_lookup_table(self.target_vocab_info.path)

    # Add vocab tables to graph collection so that we can access them in
    # other places.
    graph_utils.add_dict_to_collection({
        "source_vocab_to_id": source_vocab_to_id,
        "source_id_to_vocab": source_id_to_vocab,
        "source_word_to_count": source_word_to_count,
        "target_vocab_to_id": target_vocab_to_id,
        "target_id_to_vocab": target_id_to_vocab,
        "target_word_to_count": target_word_to_count,
    }, "vocab_tables")

    # Slice source to max_len
    max_source_len = self.params["source.max_seq_len"]
    if max_source_len is not None:
        features["source_tokens"] = features["source_tokens"][:, :max_source_len]
        features["source_len"] = tf.minimum(features["source_len"],
                                            max_source_len)

    # Look up the source ids in the vocabulary
    features["source_ids"] = source_vocab_to_id.lookup(
        features["source_tokens"])

    # Maybe reverse the source
    if self.params["source.reverse"] is True:
        features["source_ids"] = tf.reverse_sequence(
            input=features["source_ids"],
            seq_lengths=features["source_len"],
            seq_dim=1,
            batch_dim=0,
            name=None)

    features["source_len"] = tf.to_int32(features["source_len"])
    tf.summary.histogram("source_len", tf.to_float(features["source_len"]))

    if labels is None:
        return features, None

    labels = labels.copy()

    # Slices targets to max length
    max_target_len = self.params["target.max_seq_len"]
    if max_target_len is not None:
        labels["target_tokens"] = labels["target_tokens"][:, :max_target_len]
        labels["target_len"] = tf.minimum(labels["target_len"], max_target_len)

    # Look up the target ids in the vocabulary
    labels["target_ids"] = target_vocab_to_id.lookup(labels["target_tokens"])

    labels["target_len"] = tf.to_int32(labels["target_len"])
    tf.summary.histogram("target_len", tf.to_float(labels["target_len"]))

    # Keep track of the number of processed tokens
    num_tokens = tf.reduce_sum(labels["target_len"])
    num_tokens += tf.reduce_sum(features["source_len"])
    # The second positional parameter of tf.Variable is `trainable`, not
    # `name`; passing "tokens_counter" there silently made the counter a
    # trainable variable. The counter is now explicitly non-trainable. The
    # automatically generated variable name is kept as it was, because
    # renaming the variable would change its key in existing checkpoints.
    token_counter_var = tf.Variable(0, trainable=False)
    total_tokens = tf.assign_add(token_counter_var, num_tokens)
    tf.summary.scalar("num_tokens", total_tokens)

    # Tie the counter update to the source tokens so that it runs whenever
    # the tokens are evaluated.
    with tf.control_dependencies([total_tokens]):
        features["source_tokens"] = tf.identity(features["source_tokens"])

    # Add to graph collection for later use
    graph_utils.add_dict_to_collection(features, "features")
    if labels:
        graph_utils.add_dict_to_collection(labels, "labels")

    return features, labels
def _preprocess(self, features, labels):
    """Model-specific preprocessing for features and labels.

    - Creates vocabulary lookup tables for the source and target vocab and
      adds them to the "vocab_tables" graph collection.
    - Optionally truncates sequences to the configured maximum lengths.
    - Converts tokens into vocabulary ids, optionally reversing the source.
    - Adds features and labels to the "features" / "labels" collections.

    No special tokens are added to either sequence by this method.

    Args:
        features: Mapping with "source_tokens" and "source_len". It is
            modified in place.
        labels: Mapping with "target_tokens" and "target_len", or `None`.
            A shallow copy is modified, not the argument itself.

    Returns:
        A `(features, labels)` tuple. When `labels` is `None` the result is
        `(features, None)` and nothing is added to the "features" collection.
    """
    params = self.params

    src_tables = vocab.create_vocabulary_lookup_table(
        self.source_vocab_info.path)
    src_to_id, src_to_vocab, src_word_count, _ = src_tables

    tgt_tables = vocab.create_vocabulary_lookup_table(
        self.target_vocab_info.path)
    tgt_to_id, tgt_to_vocab, tgt_word_count, _ = tgt_tables

    # Make the tables reachable from other places through the graph.
    vocab_tables = {
        "source_vocab_to_id": src_to_id,
        "source_id_to_vocab": src_to_vocab,
        "source_word_to_count": src_word_count,
        "target_vocab_to_id": tgt_to_id,
        "target_id_to_vocab": tgt_to_vocab,
        "target_word_to_count": tgt_word_count,
    }
    graph_utils.add_dict_to_collection(vocab_tables, "vocab_tables")

    # Truncate the source side when a maximum length is configured.
    source_limit = params["source.max_seq_len"]
    if source_limit is not None:
        features["source_tokens"] = features["source_tokens"][:, :source_limit]
        features["source_len"] = tf.minimum(features["source_len"], source_limit)

    features["source_ids"] = src_to_id.lookup(features["source_tokens"])

    # Reverse each source sequence within its own length, if requested.
    if params["source.reverse"] is True:
        features["source_ids"] = tf.reverse_sequence(
            input=features["source_ids"],
            seq_lengths=features["source_len"],
            seq_dim=1,
            batch_dim=0,
            name=None)

    features["source_len"] = tf.to_int32(features["source_len"])
    tf.summary.histogram("source_len", tf.to_float(features["source_len"]))

    if labels is None:
        return features, None

    labels = labels.copy()

    # Truncate the target side when a maximum length is configured.
    target_limit = params["target.max_seq_len"]
    if target_limit is not None:
        labels["target_tokens"] = labels["target_tokens"][:, :target_limit]
        labels["target_len"] = tf.minimum(labels["target_len"], target_limit)

    labels["target_ids"] = tgt_to_id.lookup(labels["target_tokens"])
    labels["target_len"] = tf.to_int32(labels["target_len"])
    tf.summary.histogram("target_len", tf.to_float(labels["target_len"]))

    # Keep the processed inputs around in graph collections for later use.
    graph_utils.add_dict_to_collection(features, "features")
    if labels:
        graph_utils.add_dict_to_collection(labels, "labels")

    return features, labels
def _preprocess(self, features, labels):
    """Model-specific preprocessing for features and labels.

    - Creates vocabulary lookup tables for the source and target vocab and
      adds them to the "vocab_tables" graph collection.
    - Optionally truncates sequences to the configured maximum lengths.
    - Converts tokens into vocabulary ids, optionally reversing the source.
    - Keeps a running count of processed tokens.
    - Adds features and labels to the "features" / "labels" collections.

    Args:
        features: Mapping with "source_tokens" and "source_len". It is
            modified in place.
        labels: Mapping with "target_tokens" and "target_len", or `None`.
            A shallow copy is modified, not the argument itself.

    Returns:
        A `(features, labels)` tuple. When `labels` is `None` the result is
        `(features, None)`; token counting and the "features" collection
        registration are skipped in that case.
    """
    # Create vocabulary lookup for source
    source_vocab_to_id, source_id_to_vocab, source_word_to_count, _ = \
        vocab.create_vocabulary_lookup_table(self.source_vocab_info.path)

    # Create vocabulary lookup for target
    target_vocab_to_id, target_id_to_vocab, target_word_to_count, _ = \
        vocab.create_vocabulary_lookup_table(self.target_vocab_info.path)

    # Add vocab tables to graph collection so that we can access them in
    # other places.
    graph_utils.add_dict_to_collection({
        "source_vocab_to_id": source_vocab_to_id,
        "source_id_to_vocab": source_id_to_vocab,
        "source_word_to_count": source_word_to_count,
        "target_vocab_to_id": target_vocab_to_id,
        "target_id_to_vocab": target_id_to_vocab,
        "target_word_to_count": target_word_to_count,
    }, "vocab_tables")

    # Slice source to max_len
    max_source_len = self.params["source.max_seq_len"]
    if max_source_len is not None:
        features["source_tokens"] = features["source_tokens"][:, :max_source_len]
        features["source_len"] = tf.minimum(features["source_len"],
                                            max_source_len)

    # Look up the source ids in the vocabulary
    features["source_ids"] = source_vocab_to_id.lookup(
        features["source_tokens"])

    # Maybe reverse the source
    if self.params["source.reverse"] is True:
        features["source_ids"] = tf.reverse_sequence(
            input=features["source_ids"],
            seq_lengths=features["source_len"],
            seq_dim=1,
            batch_dim=0,
            name=None)

    features["source_len"] = tf.to_int32(features["source_len"])
    tf.summary.histogram("source_len", tf.to_float(features["source_len"]))

    if labels is None:
        return features, None

    labels = labels.copy()

    # Slices targets to max length
    max_target_len = self.params["target.max_seq_len"]
    if max_target_len is not None:
        labels["target_tokens"] = labels["target_tokens"][:, :max_target_len]
        labels["target_len"] = tf.minimum(labels["target_len"], max_target_len)

    # Look up the target ids in the vocabulary
    labels["target_ids"] = target_vocab_to_id.lookup(labels["target_tokens"])

    labels["target_len"] = tf.to_int32(labels["target_len"])
    tf.summary.histogram("target_len", tf.to_float(labels["target_len"]))

    # Keep track of the number of processed tokens
    num_tokens = tf.reduce_sum(labels["target_len"])
    num_tokens += tf.reduce_sum(features["source_len"])
    # The second positional parameter of tf.Variable is `trainable`, not
    # `name`; passing "tokens_counter" there silently made the counter a
    # trainable variable. The counter is now explicitly non-trainable. The
    # automatically generated variable name is kept as it was, because
    # renaming the variable would change its key in existing checkpoints.
    token_counter_var = tf.Variable(0, trainable=False)
    total_tokens = tf.assign_add(token_counter_var, num_tokens)
    tf.summary.scalar("num_tokens", total_tokens)

    # Tie the counter update to the source tokens so that it runs whenever
    # the tokens are evaluated.
    with tf.control_dependencies([total_tokens]):
        features["source_tokens"] = tf.identity(features["source_tokens"])

    # Add to graph collection for later use
    graph_utils.add_dict_to_collection(features, "features")
    if labels:
        graph_utils.add_dict_to_collection(labels, "labels")

    return features, labels
def _build_schema_lookup_tables(self):
    """Load all schema resources and register them in "schema_tables".

    The files listed in `params["schema_loc_files"]` are read; every
    non-blank line is taken as a schema directory. For each unique directory
    "schema_embeddings.npy" is loaded, plus "schema_map.npy" when
    `params["build_schema_map_table"]` is set and the strings obtained from
    "schema.csv" via `get_schema_strings` when
    `params["build_schema_text_table"]` is set. Matrices are zero-padded
    along their first axis to a common length and stacked.

    The following entries are added to the "schema_tables" graph collection:
    "schema_file_lookup_table", "all_schema_embeddings", "schema_lengths",
    and, depending on the flags, "all_schema_strings", "all_schema_maps" and
    "schema_map_lengths".

    Returns:
        None. Nothing is registered when no location files are configured or
        when they list no schema directories.
    """
    schema_loc_files = self.params["schema_loc_files"]
    if not schema_loc_files:
        return

    build_text_table = self.params["build_schema_text_table"]
    build_map_table = self.params["build_schema_map_table"]

    # Collect the unique schema directories from all location files.
    unique_locations = set()
    for loc_file in schema_loc_files:
        with open(loc_file, 'r') as f:
            unique_locations.update(line.strip() for line in f)
    # A blank line (e.g. a trailing newline) is not a schema directory.
    unique_locations.discard("")
    if not unique_locations:
        return

    # Sort to get a deterministic order: iterating a set of strings may give
    # a different order in every interpreter run, which would change the
    # filename -> index assignment between runs.
    all_schema_locations = sorted(unique_locations)

    # Build a lookup table of filename --> index
    # (Required for all models that use any schema representation)
    schema_file_lookup_table = tf.contrib.lookup.index_table_from_tensor(
        mapping=all_schema_locations, num_oov_buckets=0, default_value=-1)

    schema_embeddings_matrices = []
    schema_lengths = []
    all_schema_strings = []
    schema_map_matrices = []
    schema_map_lengths = []

    def load_npy(matrix_list, length_list, file_location, fname):
        """Load `fname` from `file_location`; record the matrix and its length."""
        matrix_np = np.load(os.path.join(file_location, fname))
        matrix_list.append(matrix_np)
        length_list.append(matrix_np.shape[0])

    # Load everything in the same order as the lookup table mapping, so that
    # index i of every stacked tensor belongs to all_schema_locations[i].
    for schema_location in all_schema_locations:
        # Schema embeddings: required for all attn to schema models.
        load_npy(schema_embeddings_matrices, schema_lengths,
                 schema_location, "schema_embeddings.npy")
        if build_map_table:
            load_npy(schema_map_matrices, schema_map_lengths,
                     schema_location, "schema_map.npy")
        # Schema strings: required for schema-copying models.
        if build_text_table:
            schema_csv_file = os.path.join(schema_location, "schema.csv")
            all_schema_strings.append(self.get_schema_strings(schema_csv_file))

    def pad_to_size(matrix, length):
        """Zero-pad a 2-D `matrix` along its first axis up to `length` rows."""
        if matrix.shape[0] == length:
            return matrix
        padding_size = length - matrix.shape[0]
        return np.pad(matrix,
                      pad_width=((0, padding_size), (0, 0)),
                      mode='constant',
                      constant_values=0)

    max_emb_len = max(schema_lengths)
    schema_lengths = tf.constant(schema_lengths)
    schema_embeddings_matrices = [
        pad_to_size(m, max_emb_len) for m in schema_embeddings_matrices
    ]

    # Assemble all the matrices into a big 3d tensor
    all_schema_embeddings = tf.convert_to_tensor(
        np.asarray(schema_embeddings_matrices), dtype=tf.float32)

    tables_dict = {
        "schema_file_lookup_table": schema_file_lookup_table,
        "all_schema_embeddings": all_schema_embeddings,
        "schema_lengths": schema_lengths,
    }

    # Assemble all the schema strings into a big lookup table.
    # (Required for schema-copying models)
    if build_text_table:
        schema_strings_tbl = tf.contrib.lookup.index_to_string_table_from_tensor(
            all_schema_strings, name="schema_strings_lookup_table")
        tables_dict["all_schema_strings"] = schema_strings_tbl

    if build_map_table:
        max_map_len = max(schema_map_lengths)
        schema_map_lengths = tf.constant(schema_map_lengths)
        schema_map_matrices = [
            pad_to_size(m, max_map_len) for m in schema_map_matrices
        ]
        all_schema_maps = tf.convert_to_tensor(
            np.asarray(schema_map_matrices), dtype=tf.float32)
        tables_dict["all_schema_maps"] = all_schema_maps
        tables_dict["schema_map_lengths"] = schema_map_lengths

    graph_utils.add_dict_to_collection(tables_dict, "schema_tables")