def extract_args(self, features, mode, params):
    super().extract_args(features, mode, params)
    if self.hparams.vocab_size > 0:
        self.vocab = Vocabulary(size=self.hparams.vocab_size)
    else:
        self.vocab = Vocabulary(
            fname=self.hparams.vocab_file,
            skip_tokens=self.hparams.skip_tokens,
            skip_tokens_start=self.hparams.skip_tokens_start)
def main(_): print("Loading hyperparameters..") params = util.load_params(FLAGS.params_file) print("Building model..") model_dir = FLAGS.model_dir if FLAGS.clean_model_dir: util.clean_model_dir(model_dir) if FLAGS.model_cls == "transformer": model_cls = TransformerEstimator elif FLAGS.model_cls == "seq2seq": model_cls = Seq2SeqEstimator else: raise ValueError("Model class not supported.") model = model_cls(model_dir, params) print("Getting sources..") fields = {"train/inputs": "int", "train/targets": "int"} train_source = DataSource(FLAGS.train_file, fields) test_source = DataSource(FLAGS.test_file, fields) field_map = {"inputs": "train/inputs", "targets": "train/targets"} train_input_fn = train_source.get_input_fn( "train_in", field_map, None, FLAGS.batch_size) test_input_fn = test_source.get_input_fn( "test_in", field_map, 1, FLAGS.batch_size) print("Processing model..") model.train(train_input_fn, steps=FLAGS.train_batches) model.evaluate(test_input_fn) if FLAGS.interactive: print("Interactive decoding...") vocab = Vocabulary(fname=params["vocab_file"]) decoding.cmd_decode(model, vocab)
def extract_args(self, features, mode, params):
    super().extract_args(features, mode, params)
    self.d_k = self.hparams.d_model // self.hparams.num_heads
    # Assumed fallbacks: the original expressions were no-ops (both branches
    # identical). A value of 0 presumably defers to d_model-based defaults,
    # matching the default hyperparameters (d_pos = d_model, d_ff = 2 * d_model).
    self.d_pos = self.hparams.d_model if self.hparams.d_pos == 0 else self.hparams.d_pos
    self.d_ff = 2 * self.hparams.d_model if self.hparams.d_ff == 0 else self.hparams.d_ff
    if self.hparams.vocab_size > 0:
        self.vocab = Vocabulary(size=self.hparams.vocab_size)
    else:
        self.vocab = Vocabulary(fname=self.hparams.vocab_file)
    if not self.hparams.fixed_learning_rate:
        self.train_step = tf.get_variable(
            'train_step', shape=[], dtype=tf.float32,
            initializer=tf.zeros_initializer(),  # was dtype=tf.int32, which conflicts with the float32 variable
            trainable=False)
        # Learning-rate schedule from the transformer paper:
        # lr = d_model^-0.5 * min(step * warmup_steps^-1.5, step^-0.5)
        self.learning_rate = (
            tf.sqrt(1.0 / self.hparams.d_model) *
            tf.minimum(
                self.train_step * tf.pow(self.hparams.warmup_steps, -1.5),
                tf.pow(self.train_step, -0.5)))
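# For intuition, the warmup schedule above can be traced in plain Python. This
# is a standalone sketch (not part of any estimator); it just evaluates
# lr(step) = d_model^-0.5 * min(step * warmup^-1.5, step^-0.5) at a few steps.
import math

def noam_lr(step, d_model=32, warmup_steps=4000.0):
    # Linear warmup for the first warmup_steps steps, then inverse-sqrt decay.
    return math.sqrt(1.0 / d_model) * min(step * warmup_steps ** -1.5,
                                          step ** -0.5)

for step in [1, 1000, 4000, 16000]:
    print(step, noam_lr(step))  # the rate peaks around step == warmup_steps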
def write_to_tfrecord(self, out_file, pipeline=None, max_lines=None):
    print("Writing to TFRecord..")
    writer = tf.python_io.TFRecordWriter(out_file)
    line_ctr = 0
    for row in self.row_gen():
        if not self.process_row(pipeline, row):
            continue
        feature = dict()
        for i in range(len(row)):
            key_ = self.headers[i].name
            type_ = self.headers[i].data_type
            vocab_ = self.headers[i].vocab_file
            mode_ = self.headers[i].vocab_mode
            if type_ == "text":
                if vocab_ not in self.vocabs:
                    if mode_ != "write":
                        self.vocabs[vocab_] = Vocabulary(fname=vocab_)
                    else:
                        self.vocabs[vocab_] = Vocabulary()
                row[i] = self.vocabs[vocab_].tokenize(
                    row[i], fixed_vocab=(mode_ == "read"))
                feature[key_] = self.int64_feature(row[i])
            elif type_ == "int":
                feature[key_] = self.int64_feature([int(row[i])])
            elif type_ == "float":
                feature[key_] = self.float_feature([float(row[i])])
            else:
                raise ValueError("Header type " + str(type_) +
                                 " not supported.")
        example = tf.train.Example(features=tf.train.Features(feature=feature))
        writer.write(example.SerializeToString())
        line_ctr = self.print_lines_processed(line_ctr)
        if max_lines is not None and line_ctr >= max_lines:
            break
    writer.close()
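# A minimal sketch for sanity-checking a TFRecord produced by
# write_to_tfrecord, assuming TF 1.x. Feature names (e.g. "train/inputs")
# come from the DataHeader names used when writing.
import tensorflow as tf

def peek_tfrecord(path, max_records=3):
    for k, record in enumerate(tf.python_io.tf_record_iterator(path)):
        if k >= max_records:
            break
        example = tf.train.Example.FromString(record)
        for name, feat in example.features.feature.items():
            # Each feature holds exactly one of int64_list / float_list / bytes_list.
            values = feat.int64_list.value or feat.float_list.value
            print(name, list(values))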
def build_vocab_files(self, count_cutoff=0):
    print("Building vocabularies..")
    read_only = True
    self.vocabs = dict()
    for i in range(len(self.headers)):
        vocab_ = self.headers[i].vocab_file
        mode = self.headers[i].vocab_mode
        if ((vocab_ is not None) and (vocab_ not in self.vocabs)
                and (mode != "read")):
            read_only = False
            if mode == "write":
                self.vocabs[vocab_] = Vocabulary()
            elif mode == "append":
                self.vocabs[vocab_] = Vocabulary(fname=vocab_)
            else:
                raise ValueError("Vocab mode " + str(mode) +
                                 " not supported.")
        elif vocab_ is not None and mode == "read":
            self.vocabs[vocab_] = Vocabulary(fname=vocab_)
    if read_only:
        return
    line_ctr = 0
    for row in self.row_gen():
        for i in range(len(row)):
            vocab_ = self.headers[i].vocab_file
            if vocab_ in self.vocabs:
                self.vocabs[vocab_].tokenize(row[i], fixed_vocab=False)
        line_ctr = self.print_lines_processed(line_ctr)
    for vocab_ in self.vocabs:
        if count_cutoff >= 0:
            self.vocabs[vocab_].count_cutoff(count_cutoff)
        with open(vocab_, "w", encoding="utf8") as vocab_f:
            for word in self.vocabs[vocab_].words:
                vocab_f.write(word + "\n")
    for i in range(len(self.headers)):
        self.headers[i].vocab_mode = "read"
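# A hedged usage sketch for the two methods above. The processor constructor
# and the DataHeader argument order are assumptions inferred from this file;
# check the actual definitions in your checkout before relying on them.
headers = [
    DataHeader("train/inputs", "text", "vocab.dic", "write"),   # hypothetical signature
    DataHeader("train/targets", "text", "vocab.dic", "append"),
]
processor = DataProcessor(["dialogue.tsv"], headers)  # hypothetical constructor
processor.build_vocab_files(count_cutoff=0)   # writes vocab.dic, flips all modes to "read"
processor.write_to_tfrecord("train.tfrecord")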
def extract_args(self, features, mode, params):
    super().extract_args(features, mode, params)
    if (self.hparams.src_vocab_size == 0 and self.hparams.tgt_vocab_size == 0
            and self.hparams.src_vocab_file == ""
            and self.hparams.tgt_vocab_file == ""):
        # No separate source/target vocab configured; share the base vocab.
        self.src_vocab = self.vocab
        self.tgt_vocab = self.vocab
    else:
        if self.hparams.src_vocab_size > 0:
            self.src_vocab = Vocabulary(size=self.hparams.src_vocab_size)
        else:
            self.src_vocab = Vocabulary(fname=self.hparams.src_vocab_file)
        if self.hparams.tgt_vocab_size > 0:
            self.tgt_vocab = Vocabulary(size=self.hparams.tgt_vocab_size)
        else:
            self.tgt_vocab = Vocabulary(fname=self.hparams.tgt_vocab_file)
def main(_):
    print("Loading hyperparameters..")
    params = util.load_params(FLAGS.params_file)

    print("Building model..")
    # Save a checkpoint every 100 steps and keep all of them, so the best
    # checkpoint can be selected after training.
    validation_config = tf.estimator.RunConfig(
        save_checkpoints_steps=100,
        keep_checkpoint_max=None,
    )
    model_dir = FLAGS.model_dir
    if FLAGS.clean_model_dir:
        util.clean_model_dir(model_dir)
    if FLAGS.model_cls == "transformer":
        model_cls = TransformerEstimator
    elif FLAGS.model_cls == "seq2seq":
        model_cls = Seq2SeqEstimator
    else:
        raise ValueError("Model class not supported.")
    model = model_cls(model_dir, params, config=validation_config)

    print("Getting sources..")
    fields = {"train/inputs": "int", "train/targets": "int"}
    train_source = DataSource(FLAGS.train_file, fields)
    test_source = DataSource(FLAGS.test_file, fields)
    field_map = {"inputs": "train/inputs", "targets": "train/targets"}
    train_input_fn = train_source.get_input_fn(
        "train_in", field_map, None, FLAGS.batch_size)
    test_input_fn = test_source.get_input_fn(
        "test_in", field_map, 1, FLAGS.batch_size)

    print("Processing model..")
    model.train(train_input_fn, steps=FLAGS.train_batches)
    model.choose_best_checkpoint(test_input_fn)
    model.evaluate(test_input_fn)

    if FLAGS.interaction != "off":
        print("Interactive decoding...")
        vocab = Vocabulary(fname=params["vocab_file"])
        if FLAGS.interaction == "cmd":
            decoding.cmd_decode(model, vocab, persona=True)
        elif FLAGS.interaction == "gui":
            decoding.gui_decode(model, vocab)
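# choose_best_checkpoint's exact behavior lives inside Icecaps; a rough
# standalone equivalent for a raw tf.estimator.Estimator might look like the
# sketch below (TF 1.x checkpoint enumeration, selecting by eval loss).
import tensorflow as tf

def best_checkpoint_by_loss(estimator, input_fn, model_dir):
    state = tf.train.get_checkpoint_state(model_dir)
    best_path, best_loss = None, float("inf")
    for path in state.all_model_checkpoint_paths:
        # Evaluate each retained checkpoint and keep the lowest-loss one.
        metrics = estimator.evaluate(input_fn, checkpoint_path=path)
        if metrics["loss"] < best_loss:
            best_path, best_loss = path, metrics["loss"]
    return best_path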
def write_to_tfrecord(self, out_file, pipeline=None, max_lines=None,
                      line_gen=None, line_shard_len=None, streamline=True,
                      traversal="depth_first", max_pos_len=32):
    print("Writing to TFRecord..")
    writer = tf.python_io.TFRecordWriter(out_file)
    line_ctr = 0
    if line_gen is None:
        line_gen = self.row_gen()
    for row in line_gen:
        if not self.process_row(row, pipeline):
            continue
        feature = {}
        for i in range(len(row)):
            key_ = self.headers[i].name
            type_ = self.headers[i].data_type
            vocab_ = self.headers[i].vocab_file
            mode_ = self.headers[i].vocab_mode
            if type_ == "text" or type_ == "tree":
                if vocab_ not in self.vocabs:
                    if mode_ != "write":
                        self.vocabs[vocab_] = Vocabulary(fname=vocab_)
                    else:
                        self.vocabs[vocab_] = Vocabulary()
                if type_ == "text":
                    row[i] = self.vocabs[vocab_].tokenize(
                        row[i], fixed_vocab=(mode_ == "read"))
                    feature[key_] = self.int64_feature(row[i])
                else:
                    tree_ints = []
                    tree_pos = []
                    for node in row[i].choose_traversal(traversal):
                        if streamline:
                            # Drop redundant _NULL nodes, and tag internal
                            # nodes by which children were pruned
                            # ("_0": both _NULL, "_1": only the first).
                            if node.value == "_NULL" and (
                                    not node.parent or
                                    node.parent.children[0].value == "_NULL"):
                                continue
                            node.value = str(node.value)
                            if (not node.is_leaf()
                                    and node.children[0].value == "_NULL"):
                                if node.children[1].value == "_NULL":
                                    node.value = str(node.value) + "_0"
                                else:
                                    node.value = str(node.value) + "_1"
                        if (mode_ == "read" and node.value
                                not in self.vocabs[vocab_].word2idx):
                            # Map unseen nodes to _UNK, preserving the tag.
                            if len(node.value) > 2 and node.value[-2:] == "_0":
                                node.value = "_UNK_0"
                            elif len(node.value) > 2 and node.value[-2:] == "_1":
                                node.value = "_UNK_1"
                            else:
                                node.value = "_UNK"
                        tree_ints.append(self.vocabs[vocab_].get_token_id(
                            node.value, mode_ == "read"))
                        tree_pos += node.get_padded_positional_encoding(
                            max_pos_len)
                    field = self.headers[i].name
                    feature[field] = self.int64_feature(tree_ints)
                    feature[field + "_pos"] = self.float_feature(tree_pos)
            elif type_ == "int":
                feature[key_] = self.int64_feature([int(row[i])])
            elif type_ == "float":
                feature[key_] = self.float_feature([float(row[i])])
            else:
                raise ValueError("Header type " + str(type_) +
                                 " not supported.")
        example = tf.train.Example(features=tf.train.Features(feature=feature))
        writer.write(example.SerializeToString())
        line_ctr = self.print_lines_processed(line_ctr, "trees")
        if max_lines is not None and line_ctr >= max_lines:
            break
    writer.close()
def apply_byte_pair_encodings(self, out_file, max_lines=None):
    self.build_vocab_files()
    print("Applying byte pair encodings..")
    all_bpe_vocabs = dict()
    word_encodings = dict()  # cache: word -> list of subword units
    for vocab_ in self.vocabs:
        all_bpe_vocabs[vocab_] = Vocabulary(fname=vocab_)
        word_encodings[vocab_] = dict()
    # Add a companion "<name>/_length" int header for every text field.
    length_headers = OrderedDict()
    for i in range(len(self.headers)):
        if self.headers[i].vocab_file is not None:
            length_headers[self.headers[i].name] = DataHeader(
                self.headers[i].name + "/_length", "int")
    for header_name in length_headers:
        self.headers.append(length_headers[header_name])
    with open(out_file, "w", encoding="utf8") as out_f:
        line_ctr = 0
        for row in self.row_gen():
            row_extension = []
            for i in range(len(row)):
                vocab_ = self.headers[i].vocab_file
                if vocab_ is not None:
                    row_extension.append(len(row[i].strip().split()))
                    new_elem = ""
                    for word in row[i].strip().split():
                        if word in word_encodings[vocab_]:
                            encoding = word_encodings[vocab_][word]
                        else:
                            # Start from characters plus an end-of-word
                            # marker, then repeatedly merge the adjacent pair
                            # whose merged token has the smallest vocabulary
                            # index, until no mergeable pair remains.
                            encoding = list(word) + ["</EOW>"]
                            bigrams = dict()
                            for j in range(len(encoding) - 1):
                                bigram = encoding[j] + encoding[j + 1]
                                if bigram in all_bpe_vocabs[vocab_].word2idx:
                                    bigrams[j] = \
                                        all_bpe_vocabs[vocab_].word2idx[bigram]
                            while len(bigrams) > 0:
                                bigrams_argmin = None
                                for idx in bigrams:
                                    if (bigrams_argmin is None or bigrams[idx]
                                            < bigrams[bigrams_argmin]):
                                        bigrams_argmin = idx
                                encoding = (
                                    encoding[0:bigrams_argmin] +
                                    [encoding[bigrams_argmin] +
                                     encoding[bigrams_argmin + 1]] +
                                    encoding[bigrams_argmin + 2:])
                                bigrams = dict()
                                for j in range(len(encoding) - 1):
                                    bigram = encoding[j] + encoding[j + 1]
                                    if bigram in all_bpe_vocabs[vocab_].word2idx:
                                        bigrams[j] = \
                                            all_bpe_vocabs[vocab_].word2idx[bigram]
                            word_encodings[vocab_][word] = encoding
                        for subword in encoding:
                            new_elem += subword + " "
                    row[i] = new_elem
            row += row_extension
            out_f.write(self.concatenate_segments(row))
            line_ctr = self.print_lines_processed(line_ctr)
            if max_lines is not None and line_ctr >= max_lines:
                break
    self.in_files = [out_file]
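# A standalone sketch of the greedy merge loop above, assuming a toy merge
# table where a lower index means higher merge priority. It encodes a single
# word the same way apply_byte_pair_encodings does, minus the caching.
def bpe_encode(word, merge_ranks):
    encoding = list(word) + ["</EOW>"]
    while True:
        # Find the adjacent pair whose merged form has the best (lowest) rank.
        candidates = {j: merge_ranks[encoding[j] + encoding[j + 1]]
                      for j in range(len(encoding) - 1)
                      if encoding[j] + encoding[j + 1] in merge_ranks}
        if not candidates:
            return encoding
        j = min(candidates, key=candidates.get)
        encoding = encoding[:j] + [encoding[j] + encoding[j + 1]] + encoding[j + 2:]

ranks = {"lo": 0, "low": 1, "er": 2, "er</EOW>": 3}  # hypothetical merge table
print(bpe_encode("lower", ranks))  # ['low', 'er</EOW>']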
def main(_): print("Loading parameters..") params = util.load_params(FLAGS.params_file) print("Building model..") model_dir = FLAGS.model_dir if FLAGS.clean_model_dir: util.clean_model_dir(model_dir) first_model = PersonaSeq2SeqEstimator(model_dir, params, scope="first") second_model_encoder = Seq2SeqEncoderEstimator(model_dir, params, scope="second_encoder") second_model = EstimatorChain([second_model_encoder, first_model.decoder], model_dir, params, scope="second") mmi_model = PersonaSeq2SeqEstimator(model_dir, params, scope="mmi", is_mmi_model=True) model_group = EstimatorGroup([first_model, second_model, mmi_model], model_dir, params, scope="group") print("Getting sources..") fields = { "train/inputs": "int", "train/targets": "int", "train/speakers": "int" } train_source = DataSource(FLAGS.train_file, fields) autoenc_source = DataSource(FLAGS.autoenc_file, fields) test_source = DataSource(FLAGS.test_file, fields) train_field_map = { "inputs": "train/inputs", "targets": "train/targets", "speaker_ids": "train/speakers" } autoenc_field_map = { "inputs": "train/inputs", "targets": "train/inputs", "speaker_ids": "train/speakers" } mmi_field_map = { "inputs": "train/targets", "targets": "train/inputs", "speaker_ids": "train/speakers" } paired_input_fn = train_source.get_input_fn("paired_in", train_field_map, None, FLAGS.batch_size) autoenc_input_fn = train_source.get_input_fn("autoenc_in", autoenc_field_map, None, FLAGS.batch_size) mmi_input_fn = train_source.get_input_fn("mmi_in", mmi_field_map, None, FLAGS.batch_size) train_input_fn = DataSource.group_input_fns( ["first", "second", "mmi"], [paired_input_fn, autoenc_input_fn, mmi_input_fn]) test_input_fn = test_source.get_input_fn("test_in", train_field_map, 1, FLAGS.batch_size) print("Processing models..") print("Pretraining primary model..") model_group.train(train_input_fn, first_model, steps=FLAGS.pretrain_batches) print("Multitask training..") model_group.train(train_input_fn, { "first": 1, "second": 1, "mmi": 0 }, steps=FLAGS.train_batches) print("Training MMI model..") model_group.train(train_input_fn, mmi_model, steps=FLAGS.mmi_batches) print("Evaluating..") model_group.evaluate(test_input_fn, first_model) if FLAGS.interactive: print("Interactive decoding...") vocab = Vocabulary(fname=params["vocab_file"]) decoding.cmd_decode(first_model, vocab, persona=True, mmi_component=mmi_model)
class AbstractTransformerEstimator(AbstractIcecapsEstimator):

    @classmethod
    def construct_expected_params(cls):
        expected_params = super().construct_expected_params()
        expected_params["vocab_file"] = cls.make_param(
            "icecaps/examples/dummy_data/vocab.dic")
        expected_params["vocab_size"] = cls.make_param(0)
        expected_params["depth"] = cls.make_param(1)
        expected_params["num_heads"] = cls.make_param(8)
        expected_params["d_model"] = cls.make_param(32)
        expected_params["d_pos"] = cls.make_param(32)
        expected_params["d_ff"] = cls.make_param(64)
        expected_params["max_length"] = cls.make_param(10)
        expected_params["min_wavelength"] = cls.make_param(1.0)
        expected_params["max_wavelength"] = cls.make_param(1000.0)
        expected_params["warmup_steps"] = cls.make_param(4000.0)
        expected_params["fixed_learning_rate"] = cls.make_param(False)
        expected_params["learn_wavelengths"] = cls.make_param(False)
        expected_params["modality"] = cls.make_param("seq")
        expected_params["tree_depth"] = cls.make_param(256)
        expected_params["tree_width"] = cls.make_param(2)
        expected_params["learn_positional_embeddings"] = cls.make_param(False)
        return expected_params

    def extract_args(self, features, mode, params):
        super().extract_args(features, mode, params)
        self.d_k = self.hparams.d_model // self.hparams.num_heads
        # Assumed fallbacks: the original expressions were no-ops (both
        # branches identical). A value of 0 presumably defers to d_model-based
        # defaults, matching the default hyperparameters above.
        self.d_pos = self.hparams.d_model if self.hparams.d_pos == 0 else self.hparams.d_pos
        self.d_ff = 2 * self.hparams.d_model if self.hparams.d_ff == 0 else self.hparams.d_ff
        if self.hparams.vocab_size > 0:
            self.vocab = Vocabulary(size=self.hparams.vocab_size)
        else:
            self.vocab = Vocabulary(fname=self.hparams.vocab_file)
        if not self.hparams.fixed_learning_rate:
            self.train_step = tf.get_variable(
                'train_step', shape=[], dtype=tf.float32,
                initializer=tf.zeros_initializer(),  # was dtype=tf.int32; must match float32
                trainable=False)
            # Learning-rate schedule from the transformer paper:
            # lr = d_model^-0.5 * min(step * warmup_steps^-1.5, step^-0.5)
            self.learning_rate = (
                tf.sqrt(1.0 / self.hparams.d_model) *
                tf.minimum(
                    self.train_step * tf.pow(self.hparams.warmup_steps, -1.5),
                    tf.pow(self.train_step, -0.5)))

    def build_embeddings(self):
        # Precompute sinusoidal embeddings for positions [0, 2048).
        position = tf.expand_dims(
            tf.cast(tf.range(0, 2048), dtype=tf.float32), 1)
        if self.hparams.learn_wavelengths:
            wavelength_logs = tf.get_variable("wavelength_logs",
                                              [self.d_pos // 2], tf.float32)
        else:
            wavelength_logs = tf.linspace(
                math.log(self.hparams.min_wavelength),
                math.log(self.hparams.max_wavelength), self.d_pos // 2)
        div_term = tf.expand_dims(tf.exp(-wavelength_logs), 0)
        outer_product = tf.matmul(position, div_term)
        cosines = tf.cos(outer_product)
        sines = tf.sin(outer_product)
        self.positional_embeddings = tf.concat([cosines, sines], -1)
        if self.hparams.learn_positional_embeddings:
            self.positional_embeddings = tf.get_variable(
                name='positional_embeddings',
                shape=[self.hparams.max_length, self.hparams.d_model]
            ) * np.sqrt(float(self.hparams.d_model))
        self.token_embeddings = tf.get_variable(
            name='token_embeddings',
            shape=[self.vocab.size(), self.hparams.d_model]) * np.sqrt(
                float(self.hparams.d_model))
        if self.hparams.modality == "tree":
            self.d_tree_param = self.d_pos // (self.hparams.tree_depth *
                                               self.hparams.tree_width)
            self.tree_params = tf.tanh(
                tf.get_variable("tree_params", [self.d_tree_param]))
            self.tiled_tree_params = tf.tile(
                tf.reshape(self.tree_params, [1, 1, -1]),
                [self.hparams.tree_depth, self.hparams.tree_width, 1])
            self.tiled_depths = tf.tile(
                tf.reshape(
                    tf.range(self.hparams.tree_depth, dtype=tf.float32),
                    [-1, 1, 1]),
                [1, self.hparams.tree_width, self.d_tree_param])
            self.tree_norm = tf.sqrt(
                (1 - tf.square(self.tree_params)) * self.hparams.d_model / 2)
            self.tree_weights = tf.reshape(
                tf.pow(self.tiled_tree_params, self.tiled_depths) *
                self.tree_norm,
                [self.hparams.tree_depth * self.hparams.tree_width,
                 self.d_tree_param])

    def treeify_positions(self, positions):
        treeified = tf.expand_dims(positions, -1) * self.tree_weights
        shape = tf.shape(treeified)
        shape = tf.concat([shape[:-2], [self.d_pos]], -1)
        treeified = tf.reshape(treeified, shape)
        return treeified

    def init_inputs(self):
        self.inputs_sparse = tf.cast(self.features["inputs"], tf.int32)
        # Positions holding the end token count as padding.
        self.mask = tf.cast(
            tf.not_equal(self.inputs_sparse, self.vocab.end_token_id),
            tf.float32)
        self.inputs_length = tf.cast(tf.count_nonzero(self.mask, -1), tf.int32)
        self.inputs_max_length = tf.reduce_max(self.inputs_length)
        self.batch_size = tf.shape(self.inputs_sparse)[0]
        self.inputs_sparse = tf.slice(
            self.inputs_sparse, [0, 0],
            [self.batch_size, self.inputs_max_length])
        self.mask = tf.slice(self.mask, [0, 0],
                             [self.batch_size, self.inputs_max_length])
        self.inputs_dense = tf.nn.embedding_lookup(
            params=self.token_embeddings, ids=self.inputs_sparse)
        if self.hparams.modality == "seq":
            self.positions = tf.slice(self.positional_embeddings, [0, 0],
                                      [self.inputs_max_length, self.d_pos])
        elif self.hparams.modality == "tree":
            self.positions = tf.reshape(
                self.features["inputs_positions"],
                [self.batch_size, self.inputs_max_length,
                 self.hparams.tree_depth * self.hparams.tree_width])
            self.positions = self.treeify_positions(self.positions)
        else:
            raise ValueError("This input modality is not supported.")
        if self.d_pos != self.hparams.d_model:
            self.positions = tf.layers.dense(self.positions,
                                             self.hparams.d_model)
        self.inputs_dense = self.inputs_dense + self.positions
        self.inputs_dense = tf.nn.dropout(self.inputs_dense, self.keep_prob)
        # Zero out embeddings at padded positions.
        self.inputs_dense = tf.transpose(
            tf.transpose(self.inputs_dense) * tf.transpose(self.mask))

    def build_layer_norm(self, x):
        return tf.contrib.layers.layer_norm(x, begin_norm_axis=-1)

    def build_sublayer_fn(self, x, f):
        # Pre-norm residual sublayer: x + dropout(f(layer_norm(x))).
        x = self.build_layer_norm(x)
        x = x + tf.nn.dropout(f(x), self.keep_prob)
        return x

    def attention(self, query, key, value, d_k, enc_mask=None, dec_mask=None):
        # Scaled dot-product attention; masked positions are pushed to -1e24
        # before the softmax so they receive (numerically) zero weight.
        scores = tf.matmul(query,
                           tf.transpose(key, [0, 1, 3, 2])) / math.sqrt(d_k)
        if enc_mask is not None:
            scores = (tf.transpose(scores, [1, 2, 0, 3]) * enc_mask -
                      1e24 * (1.0 - enc_mask))
            scores = tf.transpose(scores, [2, 0, 1, 3])
        if dec_mask is not None:
            scores = scores * dec_mask - 1e24 * (1.0 - dec_mask)
        p_attn = tf.nn.softmax(scores)
        p_attn = tf.nn.dropout(p_attn, keep_prob=self.keep_prob)
        attended_values = tf.matmul(p_attn, value)
        return attended_values, p_attn

    def mha_fn(self, query, key, value, batch_size, enc_mask_, dec_mask_):
        with tf.variable_scope("mha", reuse=tf.AUTO_REUSE):
            # Project, then split into num_heads heads of width d_k:
            # [batch, time, d_model] -> [batch, heads, time, d_k].
            query = tf.transpose(
                tf.reshape(
                    tf.layers.dense(query, self.hparams.d_model,
                                    use_bias=True),
                    [batch_size, -1, self.hparams.num_heads, self.d_k]),
                [0, 2, 1, 3])
            key = tf.transpose(
                tf.reshape(
                    tf.layers.dense(key, self.hparams.d_model, use_bias=True),
                    [batch_size, -1, self.hparams.num_heads, self.d_k]),
                [0, 2, 1, 3])
            value = tf.transpose(
                tf.reshape(
                    tf.layers.dense(value, self.hparams.d_model,
                                    use_bias=True),
                    [batch_size, -1, self.hparams.num_heads, self.d_k]),
                [0, 2, 1, 3])
            attended, _ = self.attention(query, key, value, self.d_k,
                                         enc_mask_, dec_mask_)
            # Re-merge the heads: [batch, heads, time, d_k] -> [batch, time, d_model].
            attended = tf.reshape(tf.transpose(attended, [0, 2, 1, 3]),
                                  [batch_size, -1, self.hparams.d_model])
            return attended

    def build_mha_sublayer(self, x, m, batch_size, enc_mask=None,
                           dec_mask=None):
        with tf.variable_scope("attn", reuse=tf.AUTO_REUSE):
            return self.build_sublayer_fn(
                x, lambda q: tf.layers.dense(
                    self.mha_fn(q, m, m, batch_size, enc_mask, dec_mask),
                    self.hparams.d_model))

    def build_ffn_sublayer(self, x, d_ff):
        with tf.variable_scope("ffn", reuse=tf.AUTO_REUSE):
            def ffn_fn(q):
                return tf.layers.dense(tf.layers.dense(q, d_ff, tf.nn.relu),
                                       self.hparams.d_model)
            return self.build_sublayer_fn(x, ffn_fn)

    def build_optimizer(self, trainable_params=None):
        super().build_optimizer(trainable_params)
        self.step_update_op = tf.assign_add(self.train_step, 1.0)
        with tf.control_dependencies([self.step_update_op]):
            self.train_op = tf.group([self.step_update_op, self.train_op])
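# A minimal numpy sketch of the masking trick used in attention() above,
# reduced to a single head and batch for readability: padded key positions
# are pushed to -1e24 before the softmax, so they get (numerically) zero weight.
import numpy as np

def scaled_dot_attention(q, k, v, mask):
    # q, k, v: [time, d_k]; mask: [time], 1.0 = real token, 0.0 = padding.
    d_k = q.shape[-1]
    scores = q @ k.T / np.sqrt(d_k)               # [time, time]
    scores = scores * mask - 1e24 * (1.0 - mask)  # mask broadcasts over key axis
    weights = np.exp(scores - scores.max(-1, keepdims=True))
    weights = weights / weights.sum(-1, keepdims=True)  # softmax over keys
    return weights @ v

rng = np.random.default_rng(0)
q = k = v = rng.standard_normal((4, 8))
out = scaled_dot_attention(q, k, v, np.array([1.0, 1.0, 1.0, 0.0]))
print(out.shape)  # (4, 8); the last key position contributes ~0 everywhere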
def __init__(self, fname, fields, vocab=None):
    self.fname = fname
    self.parse_fields(fields)
    self.input_fns = dict()
    self.vocab = vocab if vocab is not None else Vocabulary()
class AbstractRecurrentEstimator(AbstractIcecapsEstimator):

    @classmethod
    def construct_expected_params(cls):
        expected_params = super().construct_expected_params()
        expected_params["max_length"] = cls.make_param(50)
        expected_params["cell_type"] = cls.make_param('gru')
        expected_params["hidden_units"] = cls.make_param(32)
        expected_params["depth"] = cls.make_param(1)
        expected_params["token_embed_dim"] = cls.make_param(16)
        expected_params["tie_token_embeddings"] = cls.make_param(True)
        expected_params["beam_width"] = cls.make_param(8)
        expected_params["vocab_file"] = cls.make_param(
            "./dummy_data/vocab.dic")
        expected_params["vocab_size"] = cls.make_param(0)
        expected_params["skip_tokens"] = cls.make_param('')
        expected_params["skip_tokens_start"] = cls.make_param('')
        return expected_params

    def extract_args(self, features, mode, params):
        super().extract_args(features, mode, params)
        if self.hparams.vocab_size > 0:
            self.vocab = Vocabulary(size=self.hparams.vocab_size)
        else:
            self.vocab = Vocabulary(
                fname=self.hparams.vocab_file,
                skip_tokens=self.hparams.skip_tokens,
                skip_tokens_start=self.hparams.skip_tokens_start)

    def build_cell(self, name=None):
        if self.hparams.cell_type == 'linear':
            cell = BasicRNNCell(self.hparams.hidden_units,
                                activation=tf.identity, name=name)
        elif self.hparams.cell_type == 'tanh':
            cell = BasicRNNCell(self.hparams.hidden_units,
                                activation=tf.tanh, name=name)
        elif self.hparams.cell_type == 'relu':
            cell = BasicRNNCell(self.hparams.hidden_units,
                                activation=tf.nn.relu, name=name)
        elif self.hparams.cell_type == 'gru':
            cell = GRUCell(self.hparams.hidden_units, name=name)
        elif self.hparams.cell_type == 'lstm':
            cell = LSTMCell(self.hparams.hidden_units, name=name)
        else:
            raise ValueError('Provided cell type not supported.')
        return cell

    def build_deep_cell(self, cell_list=None, name=None,
                        return_raw_list=False):
        if name is None:
            name = "cell"
        if cell_list is None:
            cell_list = []
            for i in range(self.hparams.depth):
                cell = self.build_cell(name=name + "_" + str(i))
                cell = DropoutWrapper(cell, output_keep_prob=self.keep_prob)
                cell_list.append(cell)
        if return_raw_list:
            return cell_list
        if len(cell_list) == 1:
            return cell_list[0]
        return MultiRNNCell(cell_list)

    def build_rnn(self, input_key="inputs"):
        with tf.variable_scope('rnn'):
            self.cell = self.build_deep_cell()
            self.build_inputs(input_key)
            # outputs: [batch_size, max_time_step, cell_output_size]
            # last_state: [batch_size, cell_output_size]
            self.outputs, self.last_state = tf.nn.dynamic_rnn(
                cell=self.cell,
                inputs=self.inputs_dense,
                sequence_length=self.inputs_length,
                time_major=False,
                dtype=tf.float32)

    def build_embeddings(self):
        if ("token_embeddings" in self.features
                and self.hparams.tie_token_embeddings):
            self.token_embeddings = self.features["token_embeddings"]
        else:
            self.token_embeddings = tf.get_variable(
                name='token_embeddings',
                shape=[self.vocab.size(), self.hparams.token_embed_dim])
        if self.hparams.token_embed_dim != self.hparams.hidden_units:
            projection = tf.get_variable(
                name='token_embed_proj',
                shape=[self.hparams.token_embed_dim,
                       self.hparams.hidden_units])
            self.token_embeddings = self.token_embeddings @ projection

    def embed_sparse_to_dense(self, sparse):
        with tf.variable_scope('embed_sparse_to_dense', reuse=tf.AUTO_REUSE):
            dense = tf.nn.embedding_lookup(self.token_embeddings, sparse)
            return dense

    def build_inputs(self, input_key):
        self.build_embeddings()
        self.inputs_sparse_untrimmed = tf.cast(self.features[input_key],
                                               tf.int32)
        # Sequence length = number of positions not equal to the end token.
        self.inputs_length = tf.cast(
            tf.count_nonzero(
                self.inputs_sparse_untrimmed - self.vocab.end_token_id, -1),
            tf.int32)
        self.inputs_max_length = tf.reduce_max(self.inputs_length)
        self.inputs_sparse = tf.slice(self.inputs_sparse_untrimmed, [0, 0],
                                      [-1, self.inputs_max_length])
        self.inputs_dense = self.embed_sparse_to_dense(self.inputs_sparse)
        self.batch_size = tf.shape(self.inputs_sparse)[0]

    def build_loss(self):
        with tf.name_scope('build_loss'):
            self.loss = seq2seq.sequence_loss(
                logits=self.logits,
                targets=self.targets_sparse,
                weights=self.target_mask,
                average_across_timesteps=True,
                average_across_batch=True,
            )
            self.reported_loss = tf.identity(self.loss, 'reported_loss')
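# A small numpy sketch of the length trick used in build_inputs above: with
# the convention that sequences are right-padded with end_token_id, the
# length is the count of positions whose id differs from the end token.
# (Tokens equal to end_token_id inside a sequence would break this, which is
# why the end token is reserved.)
import numpy as np

end_token_id = 0  # assumed end/padding id for this sketch
batch = np.array([[5, 8, 2, 0, 0],
                  [7, 0, 0, 0, 0]])
lengths = np.count_nonzero(batch - end_token_id, axis=-1)
print(lengths)  # [3 1]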
def main(_):
    '''
    This is a simple example of how to build an Icecaps training script, and
    is essentially the "Hello World" of Icecaps. Icecaps training scripts
    follow a basic five-phase pattern that we describe here. We train a basic
    model on the paired data stored in dummy_data/paired_personalized.tfrecord.
    For information on how to build TFRecords from text data files, please see
    data_processing_example.py.
    '''
    print("Loading hyperparameters..")
    # The first phase is to load hyperparameters from a .params file. These
    # files follow a simple colon-delimited format (e.g. see
    # dummy_params/simple_example_seq2seq.params).
    params = util.load_params(FLAGS.params_file)

    print("Building model..")
    # Second, we build our architecture based on our loaded hyperparameters.
    # Our architecture here is very basic: we use a simple LSTM-based seq2seq
    # model. For information on more complex architectures, see
    # train_persona_mmi_example.py.
    model_dir = FLAGS.model_dir
    if FLAGS.clean_model_dir:
        util.clean_model_dir(model_dir)
    model_cls = Seq2SeqEstimator
    # Every estimator expects a different set of hyperparameters. If you set
    # use_default_params to True in your .params file, the estimator will
    # employ default values for any unspecified hyperparameters. To view the
    # list of hyperparameters with default values, you can run the class
    # method list_params(). E.g. you can open a Python session and run
    # Seq2SeqEstimator.list_params() to view what hyperparameters our seq2seq
    # estimator expects.
    model = model_cls(model_dir, params)

    print("Getting sources..")
    # Third, we set up our data sources. DataSource objects allow you to build
    # input_fns that efficiently feed data into the training pipeline from
    # TFRecord files. In our simple example, we set up two data sources: one
    # for training and one for testing.
    # TFRecords are created with named variables per data point. You must
    # create a fields dictionary to tell the DataSource which variables to
    # load and what their types are.
    fields = {"train/inputs": "int", "train/targets": "int"}
    train_source = DataSource(FLAGS.train_file, fields)
    test_source = DataSource(FLAGS.test_file, fields)
    # Then, you must create a field_map dictionary to tell your estimator how
    # to map the TFRecord's variable names to the names expected by the
    # estimator. While this may seem like unnecessary overhead in this simple
    # example, it provides useful flexibility in more complex scenarios.
    field_map = {"inputs": "train/inputs", "targets": "train/targets"}
    # Finally, build input_fns from your DataSources. Passing None as the
    # number of epochs lets our input_fn run for an unbounded number of
    # epochs; for testing, we only want to run the input_fn for one epoch.
    train_input_fn = train_source.get_input_fn(
        "train_in", field_map, None, FLAGS.batch_size)
    test_input_fn = test_source.get_input_fn(
        "test_in", field_map, 1, FLAGS.batch_size)

    print("Processing model..")
    # Fourth, we pipe our input_fns through our model for training and
    # evaluation.
    model.train(train_input_fn, steps=FLAGS.train_batches)
    model.evaluate(test_input_fn)

    if FLAGS.interactive:
        print("Interactive decoding...")
        # Fifth, you may optionally set up an interactive session to test your
        # system by directly engaging with it.
        vocab = Vocabulary(fname=params["vocab_file"])
        decoding.cmd_decode(model, vocab)
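# The colon-delimited .params format referenced above looks roughly like the
# lines below; the exact keys are hypothetical and depend on the estimator
# (run Seq2SeqEstimator.list_params() for the real list), and scoped variants
# exist for multi-estimator scripts (see dummy_params/persona_mmi_example.params
# for the concrete syntax):
#
#     use_default_params: True
#     hidden_units: 32
#     depth: 1
#     vocab_file: dummy_data/vocab.dic
#
# util.load_params is the real loader; this is only an illustrative sketch of
# reading one "key: value" pair per line.
def load_params_sketch(path):
    params = {}
    with open(path, encoding="utf8") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith("#"):
                continue
            key, _, value = line.partition(":")
            params[key.strip()] = value.strip()
    return params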
class RNNEstimator(AbstractRecurrentEstimator):

    def _model_fn(self, features, mode, params):
        with tf.variable_scope(self.scope, reuse=tf.AUTO_REUSE):
            self.extract_args(features, mode, params)
            self.init_inputs()
            self.build_cell()
            self.build_obj()
            if mode == tf.estimator.ModeKeys.PREDICT:
                self.build_rt_decoder()
                self.predictions = {
                    "inputs": self.features["inputs"],
                    "outputs": self.hypotheses,
                    "scores": self.scores
                }
                if "metadata" in self.features:
                    self.predictions["metadata"] = self.features["metadata"]
                return tf.estimator.EstimatorSpec(
                    mode, predictions=self.predictions)
            self.init_targets()
            self.build_loss()
            if mode == tf.estimator.ModeKeys.TRAIN:
                self.build_optimizer()
                # Add histograms for trainable variables.
                for var in tf.trainable_variables():
                    tf.summary.histogram(var.op.name, var)
                return tf.estimator.EstimatorSpec(
                    mode, loss=self.reported_loss, train_op=self.train_op)
            if mode == tf.estimator.ModeKeys.EVAL:
                print("Number of parameters: " +
                      str(self.get_num_model_params()))
                self.eval_metric_ops = dict()
                return tf.estimator.EstimatorSpec(
                    mode, loss=self.reported_loss,
                    eval_metric_ops=self.eval_metric_ops)

    @classmethod
    def construct_expected_params(cls):
        expected_params = super().construct_expected_params()
        expected_params["src_vocab_file"] = cls.make_param("")
        expected_params["tgt_vocab_file"] = cls.make_param("")
        expected_params["src_vocab_size"] = cls.make_param(0)
        expected_params["tgt_vocab_size"] = cls.make_param(0)
        return expected_params

    def extract_args(self, features, mode, params):
        super().extract_args(features, mode, params)
        if (self.hparams.src_vocab_size == 0
                and self.hparams.tgt_vocab_size == 0
                and self.hparams.src_vocab_file == ""
                and self.hparams.tgt_vocab_file == ""):
            # No separate source/target vocab configured; share the base vocab.
            self.src_vocab = self.vocab
            self.tgt_vocab = self.vocab
        else:
            if self.hparams.src_vocab_size > 0:
                self.src_vocab = Vocabulary(size=self.hparams.src_vocab_size)
            else:
                self.src_vocab = Vocabulary(
                    fname=self.hparams.src_vocab_file)
            if self.hparams.tgt_vocab_size > 0:
                self.tgt_vocab = Vocabulary(size=self.hparams.tgt_vocab_size)
            else:
                self.tgt_vocab = Vocabulary(
                    fname=self.hparams.tgt_vocab_file)

    def init_inputs(self):
        with tf.name_scope('init_encoder'):
            inputs = tf.cast(self.features["inputs"], tf.int32)
            self.batch_size = tf.shape(inputs)[0]
            inputs_length = tf.cast(
                tf.count_nonzero(inputs - self.vocab.end_token_id, -1),
                tf.int32)
            inputs_max_length = tf.reduce_max(inputs_length)
            # Pad every sequence out to max_length with the end token.
            end_token = tf.ones(
                shape=[self.batch_size,
                       self.hparams.max_length - inputs_max_length],
                dtype=tf.int32) * self.vocab.end_token_id
            # [batch_size, max_time_steps + 1]
            self.inputs_sparse = tf.concat([inputs, end_token], axis=1)

    def init_targets(self):
        with tf.name_scope('init_decoder'):
            targets = tf.cast(self.features["targets"], tf.int32)
            targets_length = tf.cast(
                tf.count_nonzero(targets - self.vocab.end_token_id, -1),
                tf.int32)
            targets_max_length = tf.reduce_max(targets_length)
            end_token = tf.ones(
                shape=[self.batch_size,
                       self.hparams.max_length - targets_max_length],
                dtype=tf.int32) * self.vocab.end_token_id
            # [batch_size, max_time_steps + 1]
            self.targets_sparse = tf.concat([targets, end_token], axis=1)
            self.targets_length = targets_length + 1
            self.target_mask = tf.sequence_mask(
                lengths=self.targets_length,
                maxlen=self.hparams.max_length,
                dtype=tf.float32)

    def build_cell(self):
        sequence_length = tf.ones([self.batch_size],
                                  dtype=tf.int32) * self.hparams.max_length
        super().build_cell(sequence_length, self.src_vocab.size())

    def build_obj(self):
        output_layer = Dense(self.tgt_vocab.size(), name='output_projection')
        self.logits = output_layer(self.outputs)

    def build_rt_decoder(self):
        with tf.name_scope('predict_decoder'):
            # Greedy decoding: per-step argmax tokens, scored by the sum of
            # per-step maximum log-probabilities.
            self.hypotheses = tf.argmax(self.logits, -1)
            self.scores = tf.reduce_sum(
                tf.reduce_max(tf.nn.log_softmax(self.logits), -1), -1)
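# A numpy sketch of the greedy scoring in build_rt_decoder above: at each
# step, take the argmax token and accumulate its log-probability, so the
# score is the log-likelihood of the greedy path under the model.
import numpy as np

def greedy_decode(logits):
    # logits: [time, vocab]
    log_probs = logits - np.log(np.exp(logits).sum(-1, keepdims=True))
    hypothesis = log_probs.argmax(-1)  # token ids, [time]
    score = log_probs.max(-1).sum()    # summed per-step log-probs
    return hypothesis, score

logits = np.array([[2.0, 0.1, -1.0], [0.3, 1.5, 0.2]])
print(greedy_decode(logits))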
def main(_):
    '''
    This is a more complex example in which we build an Icecaps script
    involving component chaining and multi-task learning. We recommend you
    start with train_simple_example.py. In this example, we build a
    personalized conversation system that combines paired and unpaired data,
    and applies MMI during decoding.
    '''
    print("Loading parameters..")
    # When multiple estimators are involved, you can specify which
    # hyperparameters in your params file belong to which estimator using
    # scoping. See dummy_params/persona_mmi_example.params for an example.
    # If no scope is specified, the hyperparameter is provided to all models
    # in your architecture.
    params = util.load_params(FLAGS.params_file)

    print("Building model..")
    model_dir = FLAGS.model_dir
    if FLAGS.clean_model_dir:
        util.clean_model_dir(model_dir)
    # For this system, we will need to build three different estimators.
    # The first estimator is a personalized seq2seq estimator that will be
    # responsible for learning the conversational model.
    first_model = PersonaSeq2SeqEstimator(model_dir, params, scope="first")
    # The second estimator is a personalized seq2seq estimator that shares
    # its decoder with the first model. This model will learn an autoencoder
    # on an unpaired personalized data set. The purpose of this configuration
    # is to influence the first model with stylistic information from the
    # unpaired dataset.
    # To construct this second estimator, we first build a seq2seq encoder
    # separate from the first model. Then, we use an EstimatorChain to chain
    # that encoder to the first model's decoder, allowing the two models to
    # share that decoder.
    second_model_encoder = Seq2SeqEncoderEstimator(model_dir, params,
                                                   scope="second_encoder")
    second_model = EstimatorChain(
        [second_model_encoder, first_model.decoder], model_dir, params,
        scope="second")
    # The third estimator is used for MMI decoding. This model will learn the
    # inverse function of the first model. During decoding, this estimator
    # will be used to rerank hypotheses generated by the first model during
    # beam search decoding. While this won't have much of an effect on our
    # toy data sets, the purpose of this model in real-world settings is to
    # penalize generic responses applicable to many contexts, such as
    # "I don't know."
    mmi_model = PersonaSeq2SeqEstimator(model_dir, params, scope="mmi",
                                        is_mmi_model=True)
    model_group = EstimatorGroup([first_model, second_model, mmi_model],
                                 model_dir, params, scope="group")

    print("Getting sources..")
    # We will use two DataSources for training and one for testing.
    fields = {
        "train/inputs": "int",
        "train/targets": "int",
        "train/speakers": "int"
    }
    paired_source = DataSource(FLAGS.paired_file, fields)
    unpaired_source = DataSource(FLAGS.unpaired_file, fields)
    test_source = DataSource(FLAGS.test_file, fields)
    # We construct three field maps.
    # The paired field map is similar to the field map shown in
    # train_simple_example.py. The unpaired field map maps train/inputs to
    # both the estimator's inputs and targets, in order to train an
    # autoencoder. The mmi field map maps train/inputs to targets and
    # train/targets to inputs, in order to learn the inverse of the first
    # estimator.
    paired_field_map = {
        "inputs": "train/inputs",
        "targets": "train/targets",
        "speaker_ids": "train/speakers"
    }
    unpaired_field_map = {
        "inputs": "train/inputs",
        "targets": "train/inputs",
        "speaker_ids": "train/speakers"
    }
    mmi_field_map = {
        "inputs": "train/targets",
        "targets": "train/inputs",
        "speaker_ids": "train/speakers"
    }
    paired_input_fn = paired_source.get_input_fn(
        "paired_in", paired_field_map, None, FLAGS.batch_size)
    unpaired_input_fn = unpaired_source.get_input_fn(
        "unpaired_in", unpaired_field_map, None, FLAGS.batch_size)
    mmi_input_fn = paired_source.get_input_fn(
        "mmi_in", mmi_field_map, None, FLAGS.batch_size)
    # For multi-task learning, you will need to group your input_fns together
    # with group_input_fns().
    train_input_fn = DataSource.group_input_fns(
        ["first", "second", "mmi"],
        [paired_input_fn, unpaired_input_fn, mmi_input_fn])
    test_input_fn = test_source.get_input_fn(
        "test_in", paired_field_map, 1, FLAGS.batch_size)

    print("Processing models..")
    # Icecaps supports flexible multi-task training pipelines. You can set up
    # multiple phases where each phase trains your architecture with
    # different weights across your objectives. In this example, we will
    # first pre-train the first model by itself, then jointly train the first
    # and second models, then finally train the MMI model by itself.
    print("Pretraining primary model..")
    model_group.train(train_input_fn, first_model,
                      steps=FLAGS.pretrain_batches)
    print("Multitask training..")
    model_group.train(train_input_fn, {
        "first": 1,
        "second": 1,
        "mmi": 0
    }, steps=FLAGS.train_batches)
    print("Training MMI model..")
    model_group.train(train_input_fn, mmi_model, steps=FLAGS.mmi_batches)
    print("Evaluating..")
    model_group.evaluate(test_input_fn, first_model)

    if FLAGS.interactive:
        print("Interactive decoding...")
        vocab = Vocabulary(fname=params["vocab_file"])
        # To decode with MMI, you can pass your MMI model to cmd_decode().
        # lambda_balance controls how the first model's and the MMI model's
        # scores are weighted during decoding.
        decoding.cmd_decode(first_model, vocab, persona=True,
                            mmi_component=mmi_model,
                            lambda_balance=FLAGS.lambda_balance)
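# A toy sketch of how MMI reranking combines scores during beam search, under
# the usual anti-LM formulation: hypotheses t for source s are reranked by
# log p(t|s) + lambda * log p(s|t), with lambda_balance playing the role of
# lambda. The real combination lives inside decoding.cmd_decode; the triples
# below are fabricated for illustration.
def mmi_rerank(hypotheses, lambda_balance):
    # hypotheses: list of (text, forward_logprob, inverse_logprob) triples.
    return sorted(hypotheses,
                  key=lambda h: h[1] + lambda_balance * h[2],
                  reverse=True)

beam = [("i don't know", -1.2, -9.0), ("i love hiking", -2.0, -3.1)]
print(mmi_rerank(beam, lambda_balance=0.5)[0][0])  # the generic reply is demoted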