def load_model(self):
    tf.compat.v1.disable_eager_execution()
    # placeholders
    self.x = tf.compat.v1.placeholder(tf.int32, shape=[self.batch_size, None])
    self.y = tf.compat.v1.placeholder(tf.int32, shape=[self.batch_size, None])
    self.mems_i = [
        tf.compat.v1.placeholder(
            tf.float32, [self.mem_len, self.batch_size, self.d_model])
        for _ in range(self.n_layer)
    ]
    # model
    self.global_step = tf.compat.v1.train.get_or_create_global_step()
    initializer = tf.compat.v1.initializers.random_normal(stddev=0.02, seed=None)
    proj_initializer = tf.compat.v1.initializers.random_normal(stddev=0.01, seed=None)
    with tf.compat.v1.variable_scope(tf.compat.v1.get_variable_scope()):
        xx = tf.transpose(self.x, [1, 0])
        yy = tf.transpose(self.y, [1, 0])
        loss, self.logits, self.new_mem = modules.transformer(
            dec_inp=xx,
            target=yy,
            mems=self.mems_i,
            n_token=self.n_token,
            n_layer=self.n_layer,
            d_model=self.d_model,
            d_embed=self.d_embed,
            n_head=self.n_head,
            d_head=self.d_head,
            d_inner=self.d_ff,
            dropout=self.dropout,
            dropatt=self.dropout,
            initializer=initializer,
            proj_initializer=proj_initializer,
            is_training=self.is_training,
            mem_len=self.mem_len,
            cutoffs=[],
            div_val=-1,
            tie_projs=[],
            same_length=False,
            clamp_len=-1,
            input_perms=None,
            target_perms=None,
            head_target=None,
            untie_r=False,
            proj_same_dim=True)
    self.avg_loss = tf.reduce_mean(loss)
    # vars
    all_vars = tf.compat.v1.trainable_variables()
    grads = tf.gradients(self.avg_loss, all_vars)
    grads_and_vars = list(zip(grads, all_vars))
    all_trainable_vars = tf.reduce_sum(
        [tf.reduce_prod(v.shape) for v in tf.compat.v1.trainable_variables()])
    # optimizer
    decay_lr = tf.compat.v1.train.cosine_decay(
        self.learning_rate,
        global_step=self.global_step,
        decay_steps=400000,
        alpha=0.004)
    optimizer = tf.compat.v1.train.AdamOptimizer(learning_rate=decay_lr)
    self.train_op = optimizer.apply_gradients(grads_and_vars, self.global_step)
    # saver
    self.saver = tf.compat.v1.train.Saver()
    config = tf.compat.v1.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    self.sess = tf.compat.v1.Session(config=config)
    self.saver.restore(self.sess, self.checkpoint_path)
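# A minimal training-loop sketch for the graph built by load_model() above.
# It assumes a `model` object on which load_model() has already been called,
# and a hypothetical get_batches() helper that yields (x, y) int arrays of
# shape [batch_size, seq_len]; everything else uses only the placeholders and
# ops defined in load_model().
import numpy as np

def train_epoch(model, get_batches):
    # Transformer-XL memory starts out empty: one zero block per layer.
    mems = [np.zeros((model.mem_len, model.batch_size, model.d_model),
                     dtype=np.float32) for _ in range(model.n_layer)]
    loss = None
    for x, y in get_batches():
        feed = {model.x: x, model.y: y}
        for m_ph, m in zip(model.mems_i, mems):  # feed each layer's cached memory
            feed[m_ph] = m
        # one optimizer step; new_mem carries the recurrence into the next segment
        _, loss, mems = model.sess.run(
            [model.train_op, model.avg_loss, model.new_mem], feed_dict=feed)
    return loss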
def load_model(self):
    # placeholders
    self.x = tf.compat.v1.placeholder(tf.int32, shape=[self.batch_size, None])
    self.y = tf.compat.v1.placeholder(tf.int32, shape=[self.batch_size, None])
    self.mems_i = [
        tf.compat.v1.placeholder(
            tf.float32, [self.mem_len, self.batch_size, self.d_model])
        for _ in range(self.n_layer)
    ]
    # model
    initializer = tf.compat.v1.initializers.random_normal(stddev=0.02, seed=None)
    proj_initializer = tf.compat.v1.initializers.random_normal(stddev=0.01, seed=None)
    with tf.compat.v1.variable_scope(tf.compat.v1.get_variable_scope()):
        xx = tf.transpose(self.x, [1, 0])
        yy = tf.transpose(self.y, [1, 0])
        loss, self.logits, self.new_mem = modules.transformer(
            dec_inp=xx,
            target=yy,
            mems=self.mems_i,
            n_token=self.n_token,
            n_layer=self.n_layer,
            d_model=self.d_model,
            d_embed=self.d_embed,
            n_head=self.n_head,
            d_head=self.d_head,
            d_inner=self.d_ff,
            dropout=0.0,
            dropatt=0.0,
            initializer=initializer,
            proj_initializer=proj_initializer,
            is_training=False,
            mem_len=self.mem_len,
            cutoffs=[],
            div_val=-1,
            tie_projs=[],
            same_length=False,
            clamp_len=-1,
            input_perms=None,
            target_perms=None,
            head_target=None,
            untie_r=False,
            proj_same_dim=True)
    # restore
    self.saver = tf.compat.v1.train.Saver()
    config = tf.compat.v1.ConfigProto()
    config.gpu_options.allow_growth = True
    self.sess = tf.compat.v1.Session(config=config)
    self.saver.restore(self.sess, self.checkpoint_path)
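# A minimal single-segment inference sketch for the restored graph above,
# assuming a `model` object on which load_model() has been called. `tokens`
# is a hypothetical [batch_size, seq_len] int array; the memory is primed
# with zeros on the first call, and the returned memory is passed back in
# for the next segment.
import numpy as np

def run_segment(model, tokens, mems=None):
    if mems is None:
        mems = [np.zeros((model.mem_len, model.batch_size, model.d_model),
                         dtype=np.float32) for _ in range(model.n_layer)]
    # feed dummy targets defensively; only logits and new_mem are fetched
    feed = {model.x: tokens, model.y: tokens}
    for m_ph, m in zip(model.mems_i, mems):
        feed[m_ph] = m
    logits, new_mems = model.sess.run([model.logits, model.new_mem],
                                      feed_dict=feed)
    # logits come back time-major ([seq_len, batch_size, n_token]) because
    # the inputs were transposed inside load_model()
    return logits, new_mems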
def test_transformer(self):
    in_seq = torch.LongTensor([[1, 2, 3, 4], [8, 7, 6, 5]])
    out_seq = torch.LongTensor([[0, 1, 2], [3, 9, 3]])
    transformer = Transformer(10, 6, 6, 8, 3, 2, 2, 4)
    probs = transformer(in_seq, out_seq)
    # expected shape: (batch, target length, vocabulary size)
    self.assertEqual(probs.shape, torch.Size([2, 3, 10]))
def load_model(self):
    # placeholders
    self.x = tf.compat.v1.placeholder(tf.int32, shape=[self.batch_size, None])
    self.y = tf.compat.v1.placeholder(tf.int32, shape=[self.batch_size, None])
    self.mems_i = [
        tf.compat.v1.placeholder(
            tf.float32, [self.mem_len, self.batch_size, self.d_model])
        for _ in range(self.n_layer)
    ]
    # model
    self.global_step = tf.compat.v1.train.get_or_create_global_step()
    initializer = tf.compat.v1.keras.initializers.glorot_normal()
    proj_initializer = tf.compat.v1.keras.initializers.glorot_normal()
    with tf.compat.v1.variable_scope(tf.compat.v1.get_variable_scope()):
        xx = tf.transpose(self.x, [1, 0])
        yy = tf.transpose(self.y, [1, 0])
        loss, self.logits, self.new_mem = modules.transformer(
            dec_inp=xx,
            target=yy,
            mems=self.mems_i,
            n_token=self.n_token,
            n_layer=self.n_layer,
            d_model=self.d_model,
            d_embed=self.d_embed,
            n_head=self.n_head,
            d_head=self.d_head,
            d_inner=self.d_ff,
            dropout=self.dropout,
            dropatt=self.dropout,
            initializer=initializer,
            proj_initializer=proj_initializer,
            is_training=self.is_training,
            mem_len=self.mem_len,
            rezero=self.rezero,
            cutoffs=[],
            div_val=-1,
            tie_projs=[],
            same_length=False,
            clamp_len=-1,
            input_perms=None,
            target_perms=None,
            head_target=None,
            untie_r=False,
            proj_same_dim=True)
    # average loss over all predicted positions
    self.avg_loss = tf.reduce_mean(loss)
    # vars
    variables = tf.compat.v1.trainable_variables()
    grads = tf.gradients(self.avg_loss, variables)
    grads_and_vars = list(zip(grads, variables))
    # optimizer
    decay_lr = tf.compat.v1.train.cosine_decay(
        self.learning_rate,
        global_step=self.global_step,
        decay_steps=400000,
        alpha=0.004)
    # RAdamOptimizer is assumed to be imported from an external RAdam implementation
    optimizer = RAdamOptimizer(decay_lr)
    # wrap the optimizer with the automatic mixed-precision graph rewrite
    optimizer = tf.compat.v1.mixed_precision.enable_mixed_precision_graph_rewrite(optimizer)
    self.train_op = optimizer.apply_gradients(grads_and_vars, self.global_step)
    # saver
    self.saver = tf.compat.v1.train.Saver()
    config = tf.compat.v1.ConfigProto(allow_soft_placement=True)
    config.gpu_options.allow_growth = True
    config.graph_options.optimizer_options.global_jit_level = (
        tf.compat.v1.OptimizerOptions.ON_1)  # enable XLA JIT compilation
    self.sess = tf.compat.v1.Session(config=config)
    self.saver.restore(self.sess, self.checkpoint_path)