def __init__(self, mode, config):
    """Build the model's graph inputs, lookup tables and embedding matrices.

    Args:
        mode: a ``Mode`` enum value (``Mode.Train`` / ``Mode.Predict`` / eval
            are the values tested below).
        config: configuration object providing the numberers, vocabulary
            sizes, embedding sizes, ``max_morph_tags`` and ``start_idx``.

    In train/eval mode the inputs arrive pre-numbered as integer ids; in
    predict mode they arrive as raw strings and are converted to ids via
    in-graph hash tables built from the config's numberers.
    """
    self._mode = mode
    self.config = config
    if self._mode != Mode.Predict:
        # Train/eval: callers feed integer ids directly.
        self.input = tf.placeholder(name='inputs', dtype=tf.int32, shape=[None, None])
        self.pos_input = tf.placeholder(name='pos_input', dtype=tf.int32, shape=[None])
        self.morph_input = tf.placeholder(name='morph_input', dtype=tf.int32, shape=[None, config.max_morph_tags])
    else:
        # Predict: callers feed raw strings; build string->id tables plus a
        # reverse table mapping decoder ids back to characters.
        k, v = list(zip(*config.dec_char_numberer.value2num.items()))
        # Reverse table: decoder char id (int64) -> character string.
        i = KeyValueTensorInitializer(v, k,key_dtype=tf.int64,value_dtype=tf.string)
        self.dec2char = HashTable(i, default_value="<UNK>")
        # Forward tables (see self.lookup): value string -> numeric id.
        self.enc_chars = self.lookup(config.enc_char_numberer)
        self.morph_tags = self.lookup(config.morph_numberer)
        self.pos_tags = self.lookup(config.pos_numberer)
        # String placeholders mirror the integer placeholders above (same
        # names and shapes, string dtype).
        self.input_plc =tf.placeholder(name='inputs', dtype=tf.string, shape=[None, None])
        self.pos_plc = tf.placeholder(name='pos_input', dtype=tf.string, shape=[None])
        self.morph_plc = tf.placeholder(name='morph_input', dtype=tf.string, shape=[None, config.max_morph_tags])
        # NOTE(review): these table lookups presumably yield int64 ids,
        # whereas the train-mode placeholders are int32 — confirm that
        # downstream consumers accept either dtype.
        self.input = self.enc_chars.lookup(self.input_plc)
        self.pos_input = self.pos_tags.lookup(self.pos_plc)
        self.morph_input = self.morph_tags.lookup(self.morph_plc)
    self.input_lengths = tf.placeholder(name='input_lengths', dtype=tf.int32, shape=[None])
    # Embedding matrices: encoder chars, decoder chars, POS tags, morph tags.
    self.enc_character_embeddings = tf.get_variable('enc_character_embeddings', shape=[self.config.enc_vocab_size, self.config.char_embedding_size])
    self.dec_character_embeddings = tf.get_variable('dec_character_embeddings', shape=[self.config.dec_vocab_size, self.config.char_embedding_size])
    self.pos_embeddings = tf.get_variable('pos_embeddings', shape=[self.config.pos_vocab_size, self.config.pos_embedding_size])
    self.morph_embeddings = tf.get_variable('morph_embeddings', shape=[self.config.morph_vocab_size, self.config.morph_embedding_size])
    if self._mode != Mode.Train:
        # One start token per batch element, used to prime the decoder.
        self.start_tokens = tf.tile([config.start_idx], [tf.shape(self.input)[0]])
    if self._mode != Mode.Predict:
        # length +2 since lengths are <bow> + word + <eow>
        self.dec_input = tf.placeholder(name='dec_in', dtype=tf.int32, shape=[None, None])
        # exclude start token from targets for loss-computations since we feed the start token and don't want to
        # predict it
        self.decoder_targets = self.dec_input[:, 1:]
        self.dec_lengths = tf.placeholder(name='dec_lengths', dtype=tf.int32, shape=[None])
def __init__(self, returns_dict=False):
    """Set up a tiny fixed-vocabulary text-embedding lookup.

    Builds a string->row-index hash table and a matching weight matrix;
    unknown tokens map to row 0.
    """
    vocab = [
        ("", [0, 0, 0, 0]),  # OOV items are mapped to this embedding.
        ("hello world", [1, 2, 3, 4]),
        ("pair-programming", [5, 5, 5, 5]),
    ]
    tokens = tf.constant([tok for tok, _ in vocab], dtype=tf.string)
    row_ids = tf.constant(list(range(len(vocab))), dtype=tf.int64)
    # Unknown tokens fall through to row 0 (the OOV embedding).
    self.table = HashTable(KeyValueTensorInitializer(tokens, row_ids), 0)
    self.weights = tf.Variable([vec for _, vec in vocab], dtype=tf.float32)
    self.variables = [self.weights]
    self.trainable_variables = self.variables
    self._returns_dict = returns_dict
def text_module_fn():
    """Module fn mapping a batch of strings to fixed 4-d embedding rows.

    Registers a single hub signature: string tensor in, embedding tensor out.
    Unknown strings map to row 0 (the OOV embedding).
    """
    vocab = [
        ("", [0, 0, 0, 0]),  # OOV items are mapped to this embedding.
        ("hello world", [1, 2, 3, 4]),
        ("pair-programming", [5, 5, 5, 5]),
    ]
    tokens = tf.constant([tok for tok, _ in vocab], dtype=tf.string)
    row_ids = tf.constant(list(range(len(vocab))), dtype=tf.int64)
    lookup_table = HashTable(KeyValueTensorInitializer(tokens, row_ids), 0)
    init_values = tf.cast(tf.constant([vec for _, vec in vocab]), tf.float32)
    weights = tf_v1.get_variable(
        "weights", dtype=tf.float32, initializer=init_values)
    text_tensor = tf_v1.placeholder(dtype=tf.string, name="text", shape=[None])
    # Gather each token's embedding row by its looked-up index.
    embedding_tensor = tf.gather(weights, lookup_table.lookup(text_tensor))
    hub.add_signature(inputs=text_tensor, outputs=embedding_tensor)
def input_fn():
    """Build the image dataset and caption structure for the input pipeline.

    Returns:
        A pair ``(image_dataset, caption_structure)`` where the structure
        holds the raw caption strings and their word-index sequences.
    """
    with tf.variable_scope("input_fn"), tf.device("/cpu:0"):
        # In-graph vocabulary: word string -> index, unknown words -> 0.
        vocab_table = HashTable(
            KeyValueTensorInitializer(
                list(self.word_to_idx.keys()),
                list(self.word_to_idx.values()),
                key_dtype=tf.string,
                value_dtype=tf.int32),
            default_value=0)

        caption_dataset = Dataset.from_tensor_slices(list(captions))
        filename_dataset = Dataset.from_tensor_slices(list(filenames))

        def split_sentence(sentence):
            # Tokenize and wrap the sentence in start/end markers before
            # converting words to indices.
            words = tf.string_split(tf.reshape(sentence, (1, ))).values
            words = tf.concat(
                [tf.constant(["<START>"]), words, tf.constant(["<END>"])],
                axis=0)
            return vocab_table.lookup(words)

        def decode_image(filename):
            # Decode JPEG and convert to float; no resizing is done here.
            pixels = tf.image.decode_jpeg(tf.read_file(filename), channels=3)
            return tf.to_float(pixels)

        index_dataset = caption_dataset.map(split_sentence, num_threads=8)
        image_dataset = filename_dataset.map(decode_image, num_threads=8)

        return image_dataset, {
            "raw": caption_dataset,
            "index": index_dataset,
        }
def lookup(self, numberer):
    """Build an in-graph HashTable from a numberer's value->id mapping.

    Unknown keys resolve to ``numberer.unknown_idx``.
    """
    keys = list(numberer.value2num.keys())
    values = list(numberer.value2num.values())
    initializer = KeyValueTensorInitializer(keys, values)
    return HashTable(initializer, default_value=numberer.unknown_idx)