def _create_greedy_infer_model(self):
  """Creates model for greedy inference testing.

  Returns:
    model: A t2t model.
    features: A map of string to tensor.
  """
  model, features = get_model(transformer.transformer_small())

  out_logits, _ = model(features)
  out_logits = tf.squeeze(out_logits, axis=[2, 3])
  loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
      logits=tf.reshape(out_logits, [-1, VOCAB_SIZE]),
      labels=tf.reshape(features["targets"], [-1]))
  loss = tf.reduce_mean(loss)
  apply_grad = tf.train.AdamOptimizer(0.001).minimize(loss)

  with self.test_session():
    tf.global_variables_initializer().run()
    for _ in range(100):
      apply_grad.run()

  model.set_mode(tf.estimator.ModeKeys.PREDICT)

  return model, features
def testTransformer(self):
  model, features = get_model(transformer.transformer_small())
  logits, _ = model(features)
  with self.test_session() as session:
    session.run(tf.global_variables_initializer())
    res = session.run(logits)
  self.assertEqual(res.shape, (BATCH_SIZE, TARGET_LENGTH, 1, 1, VOCAB_SIZE))
def transformer_ae_small():
  """Set of hyperparameters."""
  hparams = transformer.transformer_small()
  hparams.batch_size = 2048
  hparams.learning_rate_warmup_steps = 4000
  hparams.num_hidden_layers = 3
  hparams.hidden_size = 384
  hparams.filter_size = 2048
  hparams.label_smoothing = 0.0
  hparams.add_hparam("z_size", 16)
  hparams.add_hparam("noise_dev", 0.0)
  hparams.add_hparam("d_mix", 0.5)
  # Bottleneck kinds supported: dense, vae, semhash, gumbel-softmax, vq-vae.
  hparams.add_hparam("bottleneck_kind", "semhash")
  hparams.add_hparam("do_ae", True)
  hparams.add_hparam("do_mask", True)
  hparams.add_hparam("do_refine", False)
  hparams.add_hparam("drop_inputs", False)
  hparams.add_hparam("v_size", 1024 * 64)
  hparams.add_hparam("max_context_length", 64)
  hparams.add_hparam("num_compress_steps", 3)
  hparams.add_hparam("kl_steps", 35000)
  hparams.add_hparam("startup_steps", 10000)
  hparams.add_hparam("kmeans_lr_factor", 0.002)
  hparams.add_hparam("z_dropout", 0.1)
  hparams.add_hparam("is_2d", 0)
  hparams.add_hparam("use_gumbel_softmax", True)
  hparams.add_hparam("softmax_k", 0)
  hparams.add_hparam("decode_autoregressive", True)
  hparams.add_hparam("do_vae", True)
  hparams.add_hparam("bit_vae", True)
  hparams.add_hparam("beta", 0.25)
  hparams.kl_warmup_steps = 150000
  hparams.force_full_predict = True
  return hparams
def testSlowVsFast(self):
  model, features = get_model(transformer.transformer_small())

  decode_length = 3

  out_logits, _ = model(features)
  out_logits = tf.squeeze(out_logits, axis=[2, 3])
  loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
      logits=tf.reshape(out_logits, [-1, VOCAB_SIZE]),
      labels=tf.reshape(features["targets"], [-1]))
  loss = tf.reduce_mean(loss)
  apply_grad = tf.train.AdamOptimizer(0.001).minimize(loss)

  with self.test_session():
    tf.global_variables_initializer().run()
    for _ in range(100):
      apply_grad.run()

  model.set_mode(tf.estimator.ModeKeys.PREDICT)

  with tf.variable_scope(tf.get_variable_scope(), reuse=True):
    greedy_result = model._slow_greedy_infer(
        features, decode_length)["outputs"]
    greedy_result = tf.squeeze(greedy_result, axis=[2, 3])

    fast_result = model._greedy_infer(features, decode_length)["outputs"]

  with self.test_session():
    greedy_res = greedy_result.eval()
    fast_res = fast_result.eval()

  self.assertEqual(fast_res.shape, (BATCH_SIZE, INPUT_LENGTH + decode_length))
  self.assertAllClose(greedy_res, fast_res)
def small_librispeech_model(param_overrides=None):
  hparams = transformer.transformer_small()
  hparams.hidden_size = 8
  hparams.filter_size = 32
  hparams.num_heads = 1
  hparams.layer_prepostprocess_dropout = 0.0
  p_hparams = librispeech.Librispeech().get_hparams(hparams)
  p_hparams.vocab_size["targets"] = VOCAB_SIZE
  hparams.problem_hparams = p_hparams
  model = transformer.Transformer(hparams, problem_hparams=p_hparams)

  if param_overrides is not None:
    # Add or set any provided HParams.
    assert isinstance(param_overrides, dict)
    for param_name in param_overrides:
      if hasattr(hparams, param_name):
        hparams.set_hparam(param_name, param_overrides[param_name])
      else:
        hparams.add_hparam(param_name, param_overrides[param_name])

  # modify for speech
  inputs = np.random.rand(BATCH_SIZE, INPUT_LENGTH, 80, 3).astype("float32")
  targets = np.random.randint(
      VOCAB_SIZE, size=(BATCH_SIZE, TARGET_LENGTH, 1, 1))
  features = {
      "inputs": tf.constant(inputs, dtype=tf.float32, name="inputs"),
      "targets": tf.constant(targets, dtype=tf.int32, name="targets"),
      "target_space_id": tf.constant(1, dtype=tf.int32)
  }
  return model, features
def testSlowVsFastNoInput(self):
  model, features = get_model(
      transformer.transformer_small(), has_input=False)

  decode_length = 3

  out_logits, _ = model(features)
  out_logits = tf.squeeze(out_logits, axis=[2, 3])
  loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
      logits=tf.reshape(out_logits, [-1, VOCAB_SIZE]),
      labels=tf.reshape(features["targets"], [-1]))
  loss = tf.reduce_mean(loss)
  apply_grad = tf.train.AdamOptimizer(0.001).minimize(loss)

  with self.test_session():
    tf.global_variables_initializer().run()
    for _ in range(100):
      apply_grad.run()

  model.set_mode(tf.estimator.ModeKeys.PREDICT)

  with tf.variable_scope(tf.get_variable_scope(), reuse=True):
    slow_result = model._slow_greedy_infer(features, decode_length)["outputs"]
    slow_result = tf.squeeze(slow_result, axis=[2, 3])

    fast_result = model._greedy_infer(features, decode_length)["outputs"]

  with self.test_session():
    slow_res = slow_result.eval()
    fast_res = fast_result.eval()

  self.assertEqual(slow_res.shape, (BATCH_SIZE, decode_length))
  self.assertAllClose(slow_res, fast_res)
def testTransformer(self):
  model, features = self.getModel(transformer.transformer_small())
  logits, _ = model(features)
  with self.test_session() as session:
    session.run(tf.global_variables_initializer())
    res = session.run(logits)
  self.assertEqual(res.shape, (BATCH_SIZE, TARGET_LENGTH, 1, 1, VOCAB_SIZE))
def transformer_ae_small():
  """Set of hyperparameters."""
  hparams = transformer.transformer_small()
  hparams.batch_size = 2048
  hparams.learning_rate_warmup_steps = 4000
  hparams.num_hidden_layers = 3
  hparams.hidden_size = 384
  hparams.filter_size = 2048
  hparams.label_smoothing = 0.0
  hparams.add_hparam("z_size", 16)
  hparams.add_hparam("noise_dev", 1.0)
  hparams.add_hparam("d_mix", 0.5)
  # Bottleneck kinds supported: dense, vae, semhash, gumbel-softmax, vq-vae.
  hparams.add_hparam("bottleneck_kind", "semhash")
  hparams.add_hparam("do_ae", True)
  hparams.add_hparam("do_mask", True)
  hparams.add_hparam("do_refine", True)
  hparams.add_hparam("drop_inputs", False)
  hparams.add_hparam("v_size", 1024 * 64)
  hparams.add_hparam("max_context_length", 64)
  hparams.add_hparam("num_compress_steps", 3)
  hparams.add_hparam("kl_steps", 35000)
  hparams.add_hparam("startup_steps", 10000)
  hparams.add_hparam("kmeans_lr_factor", 0.002)
  hparams.add_hparam("z_dropout", 0.1)
  hparams.add_hparam("is_2d", 0)
  hparams.add_hparam("use_gumbel_softmax", True)
  hparams.add_hparam("softmax_k", 0)
  hparams.add_hparam("decode_autoregressive", True)
  hparams.add_hparam("do_vae", True)
  hparams.add_hparam("bit_vae", True)
  hparams.add_hparam("beta", 0.25)
  hparams.kl_warmup_steps = 150000
  return hparams
def transformer_ae_small():
  """Set of hyperparameters."""
  hparams = transformer.transformer_small()
  hparams.batch_size = 2048
  hparams.learning_rate = 0.2
  hparams.learning_rate_warmup_steps = 4000
  hparams.num_hidden_layers = 3
  hparams.hidden_size = 384
  hparams.filter_size = 2048
  hparams.add_hparam("compress_filter_size", 2048 * 2)
  hparams.label_smoothing = 0.0
  hparams.optimizer = "Adam"  # Can be unstable, maybe try Adam.
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.997  # Needs tuning, try 0.98 to 0.999.
  hparams.add_hparam("z_size", 14)
  hparams.add_hparam("noise_dev", 0.5)
  hparams.add_hparam("d_mix", 0.5)
  # Bottleneck kinds supported: dense, vae, semhash, gumbel-softmax, dvq.
  hparams.add_hparam("bottleneck_kind", "semhash")
  hparams.add_hparam("num_blocks", 1)
  hparams.add_hparam("num_decode_blocks", 1)
  # Reshape method for DVQ: slice, project
  hparams.add_hparam("reshape_method", "slice")
  hparams.add_hparam("trainable_projections", False)
  # Hparams for the Dirichlet process.
  hparams.add_hparam("dp_alpha", 0.5)
  hparams.add_hparam("dp_strength", 0.25)
  hparams.add_hparam("dp_decay", 1.0)
  hparams.add_hparam("unmasked_percentage", 0.1)
  hparams.add_hparam("do_ae", True)
  hparams.add_hparam("do_mask", True)
  hparams.add_hparam("do_refine", False)
  hparams.add_hparam("do_attend_compress", False)
  hparams.add_hparam("do_attend_decompress", True)
  hparams.add_hparam("do_residual_compress", False)
  hparams.add_hparam("drop_inputs", False)
  hparams.add_hparam("v_size", 1024 * 64)
  hparams.add_hparam("max_context_length", 64)
  hparams.add_hparam("num_compress_steps", 3)
  hparams.add_hparam("startup_steps", 10000)
  hparams.add_hparam("mask_startup_steps", 50000)
  hparams.add_hparam("kmeans_lr_factor", 0.002)
  hparams.add_hparam("z_dropout", 0.1)
  hparams.add_hparam("is_2d", 0)
  hparams.add_hparam("softmax_k", 0)
  hparams.add_hparam("decode_autoregressive", True)
  hparams.add_hparam("do_vae", True)
  hparams.add_hparam("bit_vae", True)
  hparams.add_hparam("beta", 0.25)
  hparams.add_hparam("epsilon", 1e-5)
  hparams.add_hparam("decay", 0.999)
  hparams.add_hparam("ema", True)
  hparams.add_hparam("random_top_k", 1)
  hparams.kl_warmup_steps = 150000
  hparams.force_full_predict = True
  # Task params.
  hparams.add_hparam("task", "translate")  # translate or image tasks supported
  return hparams
def get_hparams(self):
  hparams = transformer.transformer_small()
  hparams.add_hparam("prior_type", "affine")
  hparams.add_hparam("factor", 2)  # squeezing factor
  hparams.add_hparam("n_layers_transform_params", 1)
  hparams.add_hparam("n_1x1_heads", N_1X1_HEADS)
  hparams.add_hparam("flow_num_1x1_heads", 4)
  hparams.add_hparam("flow_num_heads", 4)
  hparams.add_hparam("flow_model_d", 64)
  hparams.add_hparam("flow_d_ff", 128)
  hparams.add_hparam("flow_layer_prepostprocess_dropout", 0.0)
  hparams.add_hparam("flow_attention_dropout", 0.0)
  hparams.add_hparam("flow_relu_dropout", 0.0)
  hparams.add_hparam("latent_size", N_CHANNELS)
  hparams.add_hparam("use_weightnorm", True)
  hparams.add_hparam("kl_startup_steps", 2000)
  hparams.add_hparam("affine_scale", "glow")
  hparams.add_hparam("scale_width", 0.999)
  hparams.add_hparam("step_fn", "glow")  # glow / chunting
  hparams.add_hparam("conv_fn", "np")  # np / tf
  hparams.add_hparam("posterior_type", "diagonal_normal")
  hparams.causal_decoder_self_attention = False
  hparams.model_d = model_d
  hparams.weight_dtype = "float32"
  hparams.add_hparam("pos_attn", False)
  return hparams
def transformer_sketch():
  """Basic transformer_sketch hparams."""
  hparams = transformer.transformer_small()
  hparams.num_compress_steps = 4
  hparams.batch_size = 32
  hparams.clip_grad_norm = 2.
  hparams.sampling_method = "random"
  return hparams
def transformer_vae_small():
  """Set of hyperparameters."""
  hparams = transformer.transformer_small()
  hparams.batch_size = 2048
  hparams.learning_rate_warmup_steps = 16000
  hparams.add_hparam("z_size", 128)
  hparams.add_hparam("num_compress_steps", 4)
  hparams.add_hparam("kl_warmup_steps", 60000)
  return hparams
def testBeamVsFast(self):
  model, features = self.getModel(transformer.transformer_small())

  decode_length = 2

  out_logits, _ = model.model_fn(features)
  out_logits = tf.squeeze(out_logits[0], axis=[2, 3])
  loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
      logits=tf.reshape(out_logits, [-1, VOCAB_SIZE]),
      labels=tf.reshape(features["targets"], [-1]))
  loss = tf.reduce_mean(loss)
  apply_grad = tf.train.AdamOptimizer(0.001).minimize(loss)

  with self.test_session():
    tf.global_variables_initializer().run()
    for _ in range(100):
      apply_grad.run()

  model, _ = self.getModel(
      transformer.transformer_small(), mode=tf.estimator.ModeKeys.PREDICT)

  with tf.variable_scope(tf.get_variable_scope(), reuse=True):
    beam_result = model._beam_decode_slow(
        features,
        decode_length,
        beam_size=4,
        top_beams=1,
        last_position_only=True,
        alpha=1.0)
    fast_result = model._beam_decode(
        features,
        decode_length,
        beam_size=4,
        top_beams=1,
        last_position_only=True,
        alpha=1.0)

  with self.test_session():
    beam_res = beam_result.eval()
    fast_res = fast_result.eval()

  self.assertEqual(fast_res.shape, (BATCH_SIZE, INPUT_LENGTH + decode_length))
  self.assertAllClose(beam_res, fast_res)
def imagetransformer_latent_tiny():
  """Tiny set of hparams for a latent image model."""
  hparams = transformer.transformer_small()
  hparams.batch_size = 2
  hparams.num_hidden_layers = 3
  hparams.hidden_size = 16
  hparams.filter_size = 32
  hparams.compress_filter_size = 64
  hparams.ffn_layer = "conv_hidden_relu"
  hparams.layer_prepostprocess_dropout = 0.2
  hparams.layer_preprocess_sequence = "none"
  hparams.layer_postprocess_sequence = "dan"
  hparams.dropout = 0.3
  hparams.pos = "timing"
  hparams.num_encoder_layers = 1
  hparams.num_decoder_layers = 2
  hparams.use_pad_remover = False
  hparams.add_hparam("logit_normalization", True)
  hparams.add_hparam("bottleneck_kind", "dvq")
  hparams.add_hparam("bottleneck_bits", 4)
  hparams.add_hparam("num_residuals", 1)
  hparams.add_hparam("use_gold_targets", False)
  hparams.add_hparam("do_compress_attend", False)
  hparams.add_hparam("do_decompress_attend", False)
  hparams.add_hparam("drop_inputs", False)
  hparams.add_hparam("num_compress_steps", 2)
  hparams.add_hparam("startup_steps", 10000)
  hparams.add_hparam("mask_startup_steps", 50000)
  hparams.add_hparam("latent_dropout", 0.0)
  hparams.add_hparam("decode_autoregressive", False)
  hparams.add_hparam("vq_beta", 0.25)
  hparams.add_hparam("vq_epsilon", 1e-5)
  hparams.add_hparam("vq_decay", 0.999)
  hparams.add_hparam("ema", False)
  hparams.add_hparam("soft_em", True)
  hparams.add_hparam("num_samples", 1)
  hparams.add_hparam("num_latent_layers", 2)
  hparams.add_hparam("num_res_layers", 2)
  hparams.add_hparam("res_kernel_size", 3)
  hparams.add_hparam("num_blocks", 1)
  hparams.add_hparam("reshape_method", "slice")
  hparams.add_hparam("shared_rel", False)
  hparams.add_hparam("block_size", 1)
  hparams.add_hparam("kernel_size", 3)
  hparams.add_hparam("img_len", 8)
  hparams.add_hparam("num_channels", 1)
  hparams.add_hparam("local_and_global_att", False)
  hparams.add_hparam("block_length", 32)
  hparams.add_hparam("block_width", 128)
  hparams.add_hparam("dec_attention_type", cia.AttentionType.LOCAL_1D)
  hparams.add_hparam("latent_attention_type", cia.AttentionType.GLOBAL)
  hparams.add_hparam("block_raster_scan", False)
  hparams.add_hparam("num_latents", 1)
  hparams.add_hparam("q_filter_width", 1)
  hparams.add_hparam("kv_filter_width", 1)
  return hparams
def testTransformer(self, get_model_fn=None, p=None):
  if get_model_fn:
    model, features = get_model_fn(param_overrides=p)
  else:
    model, features = get_model(transformer.transformer_small())
  logits, _ = model(features)
  with self.test_session() as session:
    session.run(tf.global_variables_initializer())
    res = session.run(logits)
  self.assertEqual(res.shape, (BATCH_SIZE, TARGET_LENGTH, 1, 1, VOCAB_SIZE))
def _get_encoder_hparams(self):
  hparams = transformer.transformer_small()
  hparams.add_hparam("encoder_layer_list",
                     layers.ENCODER_LAYERS.get_layer_names())
  hparams.add_hparam(
      "encoder_output_dim_list",
      [32] + [64] * (len(hparams.encoder_layer_list) - 2) + [32])
  hparams.add_hparam(
      "encoder_activation_list",
      ["none"] + ["relu"] * (len(hparams.encoder_layer_list) - 1))
  hparams.add_hparam(
      "encoder_norm_list",
      ["none"] + ["layer_norm"] * (len(hparams.encoder_layer_list) - 1))
  return hparams
def testTransformerWithEncoderDecoderAttentionLoss(self):
  model, features = self.getModel(transformer.transformer_small())
  expected_attention_weights = np.random.random_sample(
      size=(BATCH_SIZE, TARGET_LENGTH, INPUT_LENGTH))
  features["expected_attention_weights"] = tf.constant(
      expected_attention_weights, dtype=tf.float32)
  _, extra_loss = model(features)
  with self.test_session() as session:
    session.run(tf.global_variables_initializer())
    res = session.run(extra_loss["attention_loss"])
  self.assertEqual(res.shape, ())
def transformer_small_sketch():
  """Modified transformer_small."""
  hparams = transformer_small()
  hparams.batch_size = 2048
  hparams.max_length = 784
  hparams.clip_grad_norm = 5.
  hparams.learning_rate_decay_scheme = "noam"
  hparams.learning_rate = 0.1
  hparams.initializer = "orthogonal"
  hparams.sampling_method = "random"
  hparams.learning_rate_warmup_steps = 10000
  return hparams
def transformer_ae_small():
  """Set of hyperparameters."""
  hparams = transformer.transformer_small()
  hparams.batch_size = 2048
  hparams.learning_rate_warmup_steps = 4000
  hparams.add_hparam("z_size", 128)
  hparams.add_hparam("v_size", 1024 * 32)
  hparams.add_hparam("num_compress_steps", 4)
  hparams.add_hparam("kl_warmup_steps", 60000)
  hparams.add_hparam("startup_steps", 30000)
  hparams.add_hparam("kmeans_lr_factor", 0.002)
  hparams.add_hparam("z_dropout", 0.1)
  return hparams
def transformer_ae_small():
  """Set of hyperparameters."""
  hparams = transformer.transformer_small()
  hparams.batch_size = 2048
  hparams.learning_rate_warmup_steps = 4000
  hparams.num_hidden_layers = 3
  hparams.hidden_size = 384
  hparams.filter_size = 2048
  hparams.label_smoothing = 0.0
  hparams.optimizer = "Adafactor"
  hparams.add_hparam("z_size", 16)
  hparams.add_hparam("noise_dev", 0.0)
  hparams.add_hparam("d_mix", 0.5)
  # Bottleneck kinds supported: dense, vae, semhash, gumbel-softmax, vq-vae.
  hparams.add_hparam("bottleneck_kind", "semhash")
  hparams.add_hparam("num_blocks", 1)
  # Reshape method for hierarchical vq-vae: slice, project
  hparams.add_hparam("reshape_method", "slice")
  hparams.add_hparam("trainable_projections", False)
  hparams.add_hparam("unmasked_percentage", 0.3)
  hparams.add_hparam("do_ae", True)
  hparams.add_hparam("do_mask", True)
  hparams.add_hparam("do_refine", False)
  hparams.add_hparam("do_attend_compress", False)
  hparams.add_hparam("do_attend_decompress", True)
  hparams.add_hparam("do_residual_compress", False)
  hparams.add_hparam("drop_inputs", False)
  hparams.add_hparam("v_size", 1024 * 64)
  hparams.add_hparam("max_context_length", 64)
  hparams.add_hparam("num_compress_steps", 3)
  hparams.add_hparam("kl_steps", 35000)
  hparams.add_hparam("startup_steps", 10000)
  hparams.add_hparam("mask_startup_steps", 50000)
  hparams.add_hparam("kmeans_lr_factor", 0.002)
  hparams.add_hparam("z_dropout", 0.1)
  hparams.add_hparam("is_2d", 0)
  hparams.add_hparam("use_gumbel_softmax", True)
  hparams.add_hparam("softmax_k", 0)
  hparams.add_hparam("decode_autoregressive", True)
  hparams.add_hparam("do_vae", True)
  hparams.add_hparam("bit_vae", True)
  hparams.add_hparam("beta", 0.25)
  hparams.add_hparam("epsilon", 1e-5)
  hparams.add_hparam("decay", 0.999)
  hparams.add_hparam("ema", True)
  hparams.add_hparam("random_top_k", 1)
  hparams.kl_warmup_steps = 150000
  hparams.force_full_predict = True
  return hparams
def transformer_adv_small():
  """Set of hyperparameters."""
  hparams = transformer.transformer_small()
  hparams.batch_size = 2048
  hparams.learning_rate_warmup_steps = 4000
  hparams.num_hidden_layers = 3
  hparams.hidden_size = 384
  hparams.filter_size = 2048
  hparams.label_smoothing = 0.0
  hparams.weight_decay = 0.1
  hparams.symbol_modality_skip_top = int(True)
  hparams.add_hparam("num_compress_steps", 2)
  hparams.add_hparam("extra_steps", 0)
  hparams.add_hparam("noise_val", 0.3)
  hparams.add_hparam("delta_max", 2.0)
  return hparams
def build_model():
  global_step = tf.get_variable(
      'global_step', [],
      initializer=tf.constant_initializer(0),
      trainable=False)

  model = getTransformerModel(transformer.transformer_small())

  with tf.name_scope("input"):
    input = tf.placeholder(
        tf.float32, [batch_size, num_steps, num_features], name="input")
    target = tf.placeholder(
        tf.int32, [batch_size, num_steps], name="target")
    input2 = tf.reshape(input, [batch_size, num_steps, num_features, 1])
    target2 = tf.reshape(target, [batch_size, num_steps, 1, 1])
    features = {
        "inputs": input2,
        "targets": target2,
        "target_space_id": tf.constant(1, dtype=tf.int32)
    }

  out_logits, _ = model(features)
  out_logits = tf.squeeze(out_logits, axis=[2, 3])
  loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
      logits=tf.reshape(out_logits, [-1, num_classes]),
      labels=tf.reshape(features["targets"], [-1]))
  loss = tf.reduce_mean(loss)

  last_predicted = tf.split(
      tf.cast(tf.argmax(out_logits, 2), tf.int32), num_steps, 1)[-1]
  last_target = tf.split(target, num_steps, 1)[-1]
  confusion_mat = tf.confusion_matrix(
      tf.reshape(last_target, [batch_size]),
      tf.reshape(last_predicted, [batch_size]),
      num_classes=num_classes,
      name='batch_confusion')
  acc = tf.reduce_mean(
      tf.cast(tf.equal(last_predicted, last_target), tf.float32))

  grad_op = tf.train.AdamOptimizer().minimize(loss, global_step=global_step)

  loss_summary = tf.summary.scalar('cross_entropy', loss)
  acc_summary = tf.summary.scalar('accuracy', acc)
  summary_op = tf.summary.merge_all()

  return (input, target, loss, acc, grad_op, summary_op, global_step,
          confusion_mat)
def transformer_ae_small():
  """Set of hyperparameters."""
  hparams = transformer.transformer_small()
  hparams.batch_size = 2048
  hparams.learning_rate_warmup_steps = 4000
  hparams.add_hparam("z_size", 128)
  hparams.add_hparam("v_size", 1024 * 32)
  hparams.add_hparam("num_compress_steps", 4)
  hparams.add_hparam("kl_warmup_steps", 60000)
  hparams.add_hparam("startup_steps", 30000)
  hparams.add_hparam("kmeans_lr_factor", 0.002)
  hparams.add_hparam("z_dropout", 0.1)
  hparams.add_hparam("is_2d", 0)
  hparams.add_hparam("use_gumbel_softmax", int(True))
  hparams.add_hparam("softmax_k", 4)
  hparams.add_hparam("decode_autoregressive", int(True))
  return hparams
def getModel(self):
  hparams = transformer.transformer_small()
  p_hparams = problem_hparams.test_problem_hparams(
      hparams, VOCAB_SIZE, VOCAB_SIZE)
  hparams.problems = [p_hparams]

  inputs = -1 + np.random.random_integers(
      VOCAB_SIZE, size=(BATCH_SIZE, INPUT_LENGTH, 1, 1))
  targets = -1 + np.random.random_integers(
      VOCAB_SIZE, size=(BATCH_SIZE, TARGET_LENGTH, 1, 1))
  features = {
      "inputs": tf.constant(inputs, dtype=tf.int32),
      "targets": tf.constant(targets, dtype=tf.int32),
      "target_space_id": tf.constant(1, dtype=tf.int32),
  }

  return transformer.Transformer(
      hparams, tf.estimator.ModeKeys.PREDICT, p_hparams), features
def testGreedySlowTPUVsNonTPU(self):
  # Only works with TF 1.8+.
  # Version string can take the following form: "1.9.0-rc0".
  major_str, minor_str, unused_rest = tf.__version__.split(".", 3)
  major, minor = int(major_str), int(minor_str)
  if major < 1 or (major == 1 and minor < 8):
    return

  model, features = get_model(transformer.transformer_small())

  decode_length = 3

  out_logits, _ = model(features)
  out_logits = tf.squeeze(out_logits, axis=[2, 3])
  loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
      logits=tf.reshape(out_logits, [-1, VOCAB_SIZE]),
      labels=tf.reshape(features["targets"], [-1]))
  loss = tf.reduce_mean(loss)
  apply_grad = tf.train.AdamOptimizer(0.001).minimize(loss)

  with self.test_session():
    tf.global_variables_initializer().run()
    for _ in range(100):
      apply_grad.run()

  model.set_mode(tf.estimator.ModeKeys.PREDICT)

  with tf.variable_scope(tf.get_variable_scope(), reuse=True):
    slow_result_non_tpu = model._slow_greedy_infer(
        features, decode_length)["outputs"]
    slow_result_non_tpu = tf.squeeze(slow_result_non_tpu, axis=[2, 3])

    slow_result_tpu = model._slow_greedy_infer_tpu(
        features, decode_length)["outputs"]
    slow_result_tpu = tf.squeeze(slow_result_tpu, axis=[2, 3])

  with self.test_session():
    slow_non_tpu_res = slow_result_non_tpu.eval()
    slow_tpu_res = slow_result_tpu.eval()

  self.assertEqual(slow_tpu_res.shape,
                   (BATCH_SIZE, INPUT_LENGTH + decode_length))
  self.assertAllClose(slow_tpu_res, slow_non_tpu_res)
def testBeamVsFast(self):
  model, features = get_model(transformer.transformer_small())

  decode_length = 2

  out_logits, _ = model(features)
  out_logits = tf.squeeze(out_logits, axis=[2, 3])
  loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
      logits=tf.reshape(out_logits, [-1, VOCAB_SIZE]),
      labels=tf.reshape(features["targets"], [-1]))
  loss = tf.reduce_mean(loss)
  apply_grad = tf.train.AdamOptimizer(0.001).minimize(loss)

  with self.test_session():
    tf.global_variables_initializer().run()
    for _ in range(100):
      apply_grad.run()

  model.set_mode(tf.estimator.ModeKeys.PREDICT)

  with tf.variable_scope(tf.get_variable_scope(), reuse=True):
    beam_result = model._beam_decode_slow(
        features, decode_length, beam_size=4, top_beams=1,
        alpha=1.0)["outputs"]

    fast_result = model._beam_decode(
        features, decode_length, beam_size=4, top_beams=1,
        alpha=1.0)["outputs"]

  with self.test_session():
    beam_res = beam_result.eval()
    fast_res = fast_result.eval()

  self.assertAllClose(beam_res, fast_res)
def transformer_l2_arctic():
  """HParams for training ASR model on L2 Arctic."""
  hparams = transformer_small()
  hparams.max_length = 1240000
  hparams.max_input_seq_length = 1550
  hparams.max_target_seq_length = 350
  hparams.batch_size = 16
  hparams.learning_rate = 0.15
  hparams.daisy_chain_variables = False
  hparams.num_heads = 2
  hparams.ffn_layer = "conv_relu_conv"
  hparams.conv_first_kernel = 9
  hparams.weight_decay = 0
  hparams.layer_prepostprocess_dropout = 0.2
  hparams.relu_dropout = 0.2
  hparams.num_decoder_layers = 1
  hparams.num_encoder_layers = 3
  # hparams.num_hidden_layers = 1
  # hparams.hidden_size = 256
  return hparams
def getModel(self, mode=tf.estimator.ModeKeys.TRAIN):
  hparams = transformer.transformer_small()
  hparams.hidden_size = 8
  hparams.filter_size = 32
  hparams.num_heads = 1
  hparams.layer_prepostprocess_dropout = 0.0

  p_hparams = problem_hparams.test_problem_hparams(VOCAB_SIZE, VOCAB_SIZE)
  hparams.problems = [p_hparams]

  inputs = -1 + np.random.random_integers(
      VOCAB_SIZE, size=(BATCH_SIZE, INPUT_LENGTH, 1, 1))
  targets = -1 + np.random.random_integers(
      VOCAB_SIZE, size=(BATCH_SIZE, TARGET_LENGTH, 1, 1))
  features = {
      "inputs": tf.constant(inputs, dtype=tf.int32),
      "targets": tf.constant(targets, dtype=tf.int32),
      "target_space_id": tf.constant(1, dtype=tf.int32),
  }

  # Build the model in the requested mode.
  return transformer.Transformer(hparams, mode, p_hparams), features
def get_hparams(self):
  hparams = transformer.transformer_small()
  hparams.add_hparam("prior_type", "affine")
  hparams.add_hparam("depths", "12")  # infer n_levels from depths
  hparams.add_hparam("split_plans", "tca")
  hparams.add_hparam("factor", 2)  # squeezing factor
  hparams.add_hparam("n_layers_transform_params", 1)
  hparams.add_hparam("n_layers_multiscale_prior", 3)
  hparams.add_hparam("flow_num_heads", 4)
  hparams.add_hparam("flow_num_1x1_heads", N_1X1_HEADS)
  hparams.add_hparam("flow_hidden_size", 64)
  hparams.add_hparam("flow_filter_size", 128)
  hparams.add_hparam("cond_prior_on_src", True)
  hparams.add_hparam("bottom_prior_std", False)
  hparams.add_hparam("latent_size", N_CHANNELS)
  hparams.add_hparam("scale_width", 0.999)
  hparams.add_hparam("coupling_transform_ratio", 0.5)
  hparams.add_hparam("actnorm_type", "actnorm")
  hparams.add_hparam("actnorm_weightnorm", True)
  hparams.add_hparam("perm_type", "1x1")
  hparams.add_hparam("init_permutation", True)
  hparams.causal_decoder_self_attention = False
  hparams.hidden_size = HIDDEN_SIZE
  return hparams
def transformer_nat_small():
  """Set of hyperparameters."""
  hparams = transformer.transformer_small()
  hparams.batch_size = 2048
  hparams.learning_rate = 0.2
  hparams.learning_rate_warmup_steps = 4000
  hparams.num_hidden_layers = 3
  hparams.hidden_size = 384
  hparams.filter_size = 2048
  hparams.label_smoothing = 0.0
  hparams.force_full_predict = True
  hparams.optimizer = "adam"
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.997
  hparams.add_hparam("bottleneck_kind", "vq")
  hparams.add_hparam("bottleneck_bits", 12)
  hparams.add_hparam("num_compress_steps", 3)
  hparams.add_hparam("beta", 0.25)
  hparams.add_hparam("epsilon", 1e-5)
  hparams.add_hparam("decay", 0.999)
  hparams.add_hparam("num_samples", 10)
  hparams.add_hparam("mask_startup_steps", 50000)
  return hparams
def transformer_nat_small():
  """Set of hyperparameters."""
  hparams = transformer.transformer_small()
  hparams.batch_size = 2048
  hparams.learning_rate = 0.2
  hparams.learning_rate_warmup_steps = 4000
  hparams.num_hidden_layers = 3
  hparams.hidden_size = 384
  hparams.filter_size = 2048
  hparams.label_smoothing = 0.0
  hparams.force_full_predict = True
  hparams.optimizer = "Adam"
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.997
  hparams.add_hparam("bottleneck_kind", "vq")
  hparams.add_hparam("bottleneck_bits", 12)
  hparams.add_hparam("num_compress_steps", 3)
  hparams.add_hparam("beta", 0.25)
  hparams.add_hparam("epsilon", 1e-5)
  hparams.add_hparam("decay", 0.999)
  hparams.add_hparam("num_samples", 10)
  hparams.add_hparam("mask_startup_steps", 50000)
  return hparams
def transformer_nat_small():
  """Set of hyperparameters."""
  hparams = transformer.transformer_small()
  hparams.batch_size = 2048
  hparams.learning_rate = 0.2
  hparams.learning_rate_warmup_steps = 4000
  hparams.num_hidden_layers = 3
  hparams.hidden_size = 384
  hparams.filter_size = 2048
  hparams.label_smoothing = 0.0
  hparams.optimizer = "Adam"  # Can be unstable, maybe try Adam.
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.997  # Needs tuning, try 0.98 to 0.999.
  hparams.add_hparam("bottleneck_kind", "em")
  hparams.add_hparam("bottleneck_bits", 12)
  hparams.add_hparam("num_compress_steps", 3)
  hparams.add_hparam("startup_steps", 10000)
  hparams.add_hparam("mask_startup_steps", 50000)
  hparams.add_hparam("beta", 0.25)
  hparams.add_hparam("epsilon", 1e-5)
  hparams.add_hparam("decay", 0.999)
  hparams.add_hparam("num_samples", 10)
  return hparams
def transformer_ae_small():
  """Set of hyperparameters."""
  hparams = transformer.transformer_small()
  hparams.batch_size = 2048
  hparams.learning_rate = 0.2
  hparams.learning_rate_warmup_steps = 4000
  hparams.num_hidden_layers = 3
  hparams.hidden_size = 384
  hparams.filter_size = 2048
  hparams.add_hparam("compress_filter_size", 2048 * 2)
  hparams.label_smoothing = 0.0
  hparams.optimizer = "Adam"  # Can be unstable, maybe try Adam.
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.997  # Needs tuning, try 0.98 to 0.999.
  hparams.add_hparam("z_size", 14)
  hparams.add_hparam("noise_dev", 0.5)
  hparams.add_hparam("d_mix", 0.5)
  hparams.add_hparam("logit_normalization", True)
  hparams.add_hparam("word_dropout", 0.0)
  # Bottleneck kinds supported: dense, vae, semhash, gumbel-softmax, dvq.
  hparams.add_hparam("bottleneck_kind", "semhash")
  hparams.add_hparam("num_blocks", 1)
  hparams.add_hparam("num_decode_blocks", 1)
  # Add an hparam for the number of residuals.
  hparams.add_hparam("num_residuals", 1)
  # Reshape method for DVQ: slice, project
  hparams.add_hparam("causal", True)
  hparams.add_hparam("reshape_method", "slice")
  hparams.add_hparam("trainable_projections", False)
  hparams.add_hparam("unmasked_percentage", 0.1)
  hparams.add_hparam("do_ae", True)
  hparams.add_hparam("do_mask", True)
  hparams.add_hparam("use_predict_mask", True)
  hparams.add_hparam("do_refine", False)
  hparams.add_hparam("do_attend_compress", False)
  hparams.add_hparam("do_attend_decompress", True)
  hparams.add_hparam("do_residual_compress", False)
  hparams.add_hparam("drop_inputs", False)
  hparams.add_hparam("v_size", 1024 * 64)
  hparams.add_hparam("max_context_length", 64)
  hparams.add_hparam("num_compress_steps", 3)
  hparams.add_hparam("startup_steps", 10000)
  hparams.add_hparam("mask_startup_steps", 50000)
  hparams.add_hparam("z_dropout", 0.1)
  hparams.add_hparam("is_2d", 0)
  hparams.add_hparam("softmax_k", 0)
  hparams.add_hparam("decode_autoregressive", True)
  hparams.add_hparam("do_vae", True)
  hparams.add_hparam("bit_vae", True)
  hparams.add_hparam("beta", 0.25)
  hparams.add_hparam("epsilon", 1e-5)
  hparams.add_hparam("decay", 0.999)
  hparams.add_hparam("ema", True)
  hparams.add_hparam("random_top_k", 1)
  hparams.add_hparam("soft_em", False)
  hparams.add_hparam("num_samples", 10)
  hparams.add_hparam("inv_temp", 1.0)
  hparams.add_hparam("entropy_scale", 0.0)
  hparams.add_hparam("prior_scale", 1.0)
  hparams.add_hparam("do_hard_gumbel_softmax", False)
  hparams.add_hparam("do_iaf", False)
  hparams.add_hparam("approximate_gs_entropy", False)
  hparams.add_hparam("temperature_warmup_steps", 150000)
  hparams.add_hparam("sum_over_latents", False)
  hparams.force_full_predict = True
  # Task params.
  hparams.add_hparam("task", "translate")  # translate or image tasks supported
  return hparams
def test_calculate_branching_model_parameters_transformer(
    self, get_config, expected_hidden_depths):
  tf.reset_default_graph()
  (num_cells, left_inputs, left_layers, left_output_dims, right_inputs,
   right_layers, right_output_dims, combiner_functions,
   final_combiner_function, dummy_activations, dummy_norms, layer_registry,
   is_decoder) = get_config()

  # Get predicted number of parameters.
  (predicted_num_params, output_size, hidden_depths,
   _) = translation_nas_net.calculate_branching_model_parameters(
       encoding_depth=_EMBEDDING_DEPTH,
       left_inputs=left_inputs,
       left_layers=left_layers,
       left_output_dims=left_output_dims,
       right_inputs=right_inputs,
       right_layers=right_layers,
       right_output_dims=right_output_dims,
       combiner_functions=combiner_functions,
       final_combiner_function=final_combiner_function,
       layer_registry=layer_registry,
       num_cells=num_cells,
       encoder_depth=_EMBEDDING_DEPTH)

  # Create model graph.
  input_tensor = tf.zeros([32, _INPUT_LENGTH, _EMBEDDING_DEPTH])
  hparams = transformer.transformer_small()
  if is_decoder:
    nonpadding = None
    mask_future = True
    decoder_self_attention_bias = (
        common_attention.attention_bias_lower_triangle(_INPUT_LENGTH))
    encoder_cell_outputs = [input_tensor] * 6
  else:
    nonpadding = tf.ones([32, _INPUT_LENGTH])
    mask_future = False
    decoder_self_attention_bias = None
    encoder_cell_outputs = None
  translation_nas_net.apply_nas_layers(
      input_tensor=input_tensor,
      left_inputs=left_inputs,
      left_layers=left_layers,
      left_activations=dummy_activations,
      left_output_dims=left_output_dims,
      left_norms=dummy_norms,
      right_inputs=right_inputs,
      right_layers=right_layers,
      right_activations=dummy_activations,
      right_output_dims=right_output_dims,
      right_norms=dummy_norms,
      combiner_functions=combiner_functions,
      final_combiner_function=final_combiner_function,
      num_cells=num_cells,
      nonpadding=nonpadding,
      layer_registry=layer_registry,
      mask_future=mask_future,
      hparams=hparams,
      var_scope="test",
      encoder_decoder_attention_bias=None,
      encoder_cell_outputs=encoder_cell_outputs,
      decoder_self_attention_bias=decoder_self_attention_bias,
      final_layer_norm=False)

  # Count graph variables.
  trainable_variables_list = tf.trainable_variables()
  empirical_num_params = 0
  for variable_tensor in trainable_variables_list:
    empirical_num_params += _list_product(variable_tensor.shape.as_list())

  # Compare.
  self.assertEqual(empirical_num_params, predicted_num_params)
  self.assertEqual(output_size, _EMBEDDING_DEPTH)
  self.assertEqual(hidden_depths, expected_hidden_depths)
def build_model(self):
  # build index table
  index_table = tf.contrib.lookup.index_table_from_file(
      vocabulary_file=self.config.vocab_list,
      num_oov_buckets=0,
      default_value=0)

  # get data iterator
  self.data_iterator = self.data.get_data_iterator(index_table, mode=self.mode)

  # get inputs
  with tf.variable_scope("inputs"):
    # get next batch if there is no fed data
    next_batch = self.data_iterator.get_next()
    self.input_queries = tf.placeholder_with_default(
        next_batch["input_queries"], [None, self.config.max_length],
        name="input_queries")
    self.input_replies = tf.placeholder_with_default(
        next_batch["input_replies"], [None, self.config.max_length],
        name="input_replies")
    self.query_lengths = tf.placeholder_with_default(
        tf.squeeze(next_batch["query_lengths"]), [None],
        name="query_lengths")
    self.reply_lengths = tf.placeholder_with_default(
        tf.squeeze(next_batch["reply_lengths"]), [None],
        name="reply_lengths")

    # get hyperparams
    self.embed_dropout_keep_prob = tf.placeholder(
        tf.float64, name="embed_dropout_keep_prob")
    self.lstm_dropout_keep_prob = tf.placeholder(
        tf.float32, name="lstm_dropout_keep_prob")
    self.dense_dropout_keep_prob = tf.placeholder(
        tf.float32, name="dense_dropout_keep_prob")
    self.num_negative_samples = tf.placeholder(
        tf.int32, name="num_negative_samples")

  with tf.variable_scope("properties"):
    # length properties
    cur_batch_length = tf.shape(self.input_queries)[0]

    # get hparams from tensor2tensor.models.transformer
    hparams = transformer.transformer_small()
    hparams.batch_size = self.config.batch_size
    hparams.learning_rate_decay_steps = 10000
    hparams.learning_rate_minimum = 3e-5

    # learning rate
    lr = learning_rate.learning_rate_schedule(hparams)
    self.learning_rate = lr

  # embedding layer
  with tf.variable_scope("embedding"):
    embeddings = tf.Variable(
        get_embeddings(self.config.vocab_list,
                       self.config.pretrained_embed_dir,
                       self.config.vocab_size, self.config.embed_dim),
        trainable=True,
        name="embeddings")
    embeddings = tf.nn.dropout(
        embeddings,
        keep_prob=self.embed_dropout_keep_prob,
        noise_shape=[tf.shape(embeddings)[0], 1])
    queries_embedded = tf.to_float(
        tf.nn.embedding_lookup(
            embeddings, self.input_queries, name="queries_embedded"))
    replies_embedded = tf.to_float(
        tf.nn.embedding_lookup(
            embeddings, self.input_replies, name="replies_embedded"))
    self.queries_embedded = queries_embedded
    self.replies_embedded = replies_embedded

  # transformer layer
  with tf.variable_scope("transformer"):
    queries_expanded = tf.expand_dims(
        queries_embedded, axis=2, name="queries_expanded")
    replies_expanded = tf.expand_dims(
        replies_embedded, axis=2, name="replies_expanded")

    hparams = transformer.transformer_small()
    hparams.set_hparam("batch_size", self.config.batch_size)
    hparams.set_hparam("hidden_size", self.config.embed_dim)
    encoder = transformer.TransformerEncoder(hparams, mode=self.mode)

    self.queries_encoded = encoder({
        "inputs": queries_expanded,
        "targets": queries_expanded
    })[0]
    self.replies_encoded = encoder({
        "inputs": replies_expanded,
        "targets": replies_expanded
    })[0]

    self.queries_encoded = tf.squeeze(
        tf.reduce_sum(self.queries_encoded, axis=1, keep_dims=True))
    self.replies_encoded = tf.squeeze(
        tf.reduce_sum(self.replies_encoded, axis=1, keep_dims=True))

  with tf.variable_scope("sampling"):
    positive_mask = tf.eye(cur_batch_length)
    negative_mask = make_negative_mask(
        tf.zeros([cur_batch_length, cur_batch_length]),
        method=self.config.negative_sampling,
        num_negative_samples=self.num_negative_samples)
    negative_queries_indices, negative_replies_indices = tf.split(
        tf.where(tf.not_equal(negative_mask, 0)), [1, 1], 1)

    self.distances = tf.matmul(
        self.queries_encoded, self.replies_encoded, transpose_b=True)
    self.distances_flattened = tf.reshape(self.distances, [-1])
    self.positive_distances = tf.gather(
        self.distances_flattened, tf.where(tf.reshape(positive_mask, [-1])))
    self.negative_distances = tf.gather(
        self.distances_flattened, tf.where(tf.reshape(negative_mask, [-1])))

    self.negative_queries_indices = tf.squeeze(negative_queries_indices)
    self.negative_replies_indices = tf.squeeze(negative_replies_indices)

    self.positive_inputs = tf.concat([
        self.queries_encoded, self.positive_distances, self.replies_encoded
    ], 1)
    self.negative_inputs = tf.reshape(
        tf.concat([
            tf.nn.embedding_lookup(self.queries_encoded,
                                   self.negative_queries_indices),
            self.negative_distances,
            tf.nn.embedding_lookup(self.replies_encoded,
                                   self.negative_replies_indices)
        ], 1),
        [tf.shape(negative_queries_indices)[0],
         self.config.embed_dim * 2 + 1])

  with tf.variable_scope("prediction"):
    self.hidden_outputs = tf.layers.dense(
        tf.concat([self.positive_inputs, self.negative_inputs], 0),
        256,
        tf.nn.relu,
        name="hidden_layer")
    self.logits = tf.layers.dense(
        self.hidden_outputs, 2, tf.nn.relu, name="output_layer")

    labels = tf.concat([
        tf.ones([tf.shape(self.positive_inputs)[0]], tf.float64),
        tf.zeros([tf.shape(self.negative_inputs)[0]], tf.float64)
    ], 0)
    self.labels = tf.one_hot(tf.to_int32(labels), 2)

    self.probs = tf.sigmoid(self.logits)
    self.predictions = tf.argmax(self.probs, 1)

  with tf.variable_scope("loss"):
    self.loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits_v2(
            labels=self.labels, logits=self.logits))
    self.train_step = optimize.optimize(self.loss, lr, hparams, use_tpu=False)

  with tf.variable_scope("score"):
    correct_predictions = tf.equal(self.predictions,
                                   tf.argmax(self.labels, 1))
    self.accuracy = tf.reduce_mean(
        tf.cast(correct_predictions, "float"), name="accuracy")
def transformer_ae_small():
  """Set of hyperparameters."""
  hparams = transformer.transformer_small()
  hparams.batch_size = 2048
  hparams.learning_rate = 0.2
  hparams.learning_rate_warmup_steps = 4000
  hparams.num_hidden_layers = 3
  hparams.hidden_size = 384
  hparams.filter_size = 2048
  hparams.add_hparam("compress_filter_size", 2048 * 2)
  hparams.label_smoothing = 0.0
  hparams.optimizer = "Adam"  # Can be unstable, maybe try Adam.
  hparams.optimizer_adam_epsilon = 1e-9
  hparams.optimizer_adam_beta1 = 0.9
  hparams.optimizer_adam_beta2 = 0.997  # Needs tuning, try 0.98 to 0.999.
  hparams.add_hparam("z_size", 14)
  hparams.add_hparam("noise_dev", 0.5)
  hparams.add_hparam("d_mix", 0.5)
  hparams.add_hparam("logit_normalization", True)
  hparams.add_hparam("word_dropout", 0.1)
  # Bottleneck kinds supported: dense, vae, semhash, gumbel-softmax, dvq.
  hparams.add_hparam("bottleneck_kind", "semhash")
  hparams.add_hparam("num_blocks", 1)
  hparams.add_hparam("num_decode_blocks", 1)
  # Add an hparam for the number of residuals.
  hparams.add_hparam("num_residuals", 1)
  # Reshape method for DVQ: slice, project
  hparams.add_hparam("causal", True)
  hparams.add_hparam("reshape_method", "slice")
  hparams.add_hparam("trainable_projections", False)
  hparams.add_hparam("unmasked_percentage", 0.1)
  hparams.add_hparam("do_ae", True)
  hparams.add_hparam("do_mask", True)
  hparams.add_hparam("use_predict_mask", True)
  hparams.add_hparam("do_refine", False)
  hparams.add_hparam("do_attend_compress", False)
  hparams.add_hparam("do_attend_decompress", True)
  hparams.add_hparam("do_residual_compress", False)
  hparams.add_hparam("drop_inputs", False)
  hparams.add_hparam("v_size", 1024 * 64)
  hparams.add_hparam("max_context_length", 64)
  hparams.add_hparam("num_compress_steps", 3)
  hparams.add_hparam("startup_steps", 10000)
  hparams.add_hparam("mask_startup_steps", 50000)
  hparams.add_hparam("z_dropout", 0.1)
  hparams.add_hparam("is_2d", 0)
  hparams.add_hparam("softmax_k", 0)
  hparams.add_hparam("decode_autoregressive", True)
  hparams.add_hparam("do_vae", True)
  hparams.add_hparam("bit_vae", True)
  hparams.add_hparam("beta", 0.25)
  hparams.add_hparam("epsilon", 1e-5)
  hparams.add_hparam("decay", 0.999)
  hparams.add_hparam("ema", True)
  hparams.add_hparam("random_top_k", 1)
  hparams.add_hparam("soft_em", False)
  hparams.add_hparam("num_samples", 10)
  hparams.add_hparam("inv_temp", 1.0)
  hparams.kl_warmup_steps = 150000
  hparams.force_full_predict = True
  # Task params.
  hparams.add_hparam("task", "translate")  # translate or image tasks supported
  return hparams