def _testGPipeTransformerFPropPackedInput(self, splits=1):
  """Packed-input FProp must match the plain stack on identical data."""
  batch = 4
  tf.flags.FLAGS.tpu_compatible = True
  with self.session() as sess:
    with tf.variable_scope('transformer_test', reuse=tf.AUTO_REUSE):
      params = self._TransformerParams(splits=splits)
      params.dtype = tf.float32
      params.fprop_dtype = tf.float32
      packed_params = params.Copy()
      packed_params.packed_input = True
      # Two stacks built in the same scope with AUTO_REUSE: one plain,
      # one with packed_input enabled.
      plain_stack = GPipeTransformerStack(params)
      packed_stack = GPipeTransformerStack(packed_params)
      # Prepare inputs; the packed variants fold the batch dimension away
      # so all sequences share a single row.
      inputs, paddings, tgt_inputs, tgt_paddings = self._random_inputs(batch)
      packed_inputs = tf.reshape(inputs, [-1, 1, 2])
      packed_tgt_inputs = tf.reshape(tgt_inputs, [-1, 1, 2])
      packed_paddings = tf.reshape(paddings, [-1, 1])
      packed_tg_paddings = tf.reshape(tgt_paddings, [-1, 1])
      segment_ids = tf.transpose(
          tf.constant([[0, 1, 2, 3, 0, 1, 2, 3]], dtype=tf.float32))
      tgt_segment_id = tf.transpose(
          tf.constant([[0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]],
                      dtype=tf.float32))
      output = plain_stack.FProp(plain_stack.theta, inputs, paddings,
                                 tgt_inputs, tgt_paddings)
      packed_output = packed_stack.FProp(
          packed_stack.theta, packed_inputs, packed_paddings,
          packed_tgt_inputs, packed_tg_paddings, segment_ids, tgt_segment_id)
      packed_output = tf.reshape(packed_output, output.shape)
      tf.global_variables_initializer().run()
      output, packed_output = sess.run([output, packed_output])
      self.assertAllClose(output, packed_output)
def testGPipeTransformerStackTrainTransparentFPropWithEmbeddings(
    self, splits=1, num_micro_batches=1):
  """Transparent stack with embeddings: compares against golden values.

  NOTE(review): another test method with this exact name appears later in
  this file; Python keeps only the last definition, so one of the two is
  silently never run. The duplicates should be given distinct names.
  """
  # time = 2, batch = 4
  batch = 4  # Bug fix: `batch` was used below but never defined (NameError).
  with self.session() as sess:
    params = _TransformerParamsWithEmbeddings(
        splits=splits,
        num_micro_batches=num_micro_batches,
        num_decoder_layers=3,
        num_encoder_layers=1)
    params.is_transparent = True
    params.transparent_merger_dropout_prob = 0.0
    xformer = GPipeTransformerStack(params)
    input_ids, id_paddings, tgt_inputs, tgt_paddings, _, _ = (
        _TransformerRandomInputsIds(batch=batch))
    inputs, paddings, _, _ = _TransformerRandomInputsVecs(batch=batch)
    tf.random.set_seed(1234)
    tf.global_variables_initializer().run()
    enc_outputs = xformer.EncoderFPropDefaultTheta(inputs, paddings)
    # FProp returns a tuple; element [2] is compared against the goldens.
    dec_output = xformer.FProp(xformer.theta, input_ids, id_paddings,
                               tgt_inputs, tgt_paddings)[2]
    enc_out_1 = sess.run(enc_outputs)
    dec_out = sess.run(dec_output)
    self.assertAllClose(
        [[[0.017581, 0.802863, 0.975554, -1.164572]] * batch,
         [[-0.549953, 1.196884, 4.910457, -0.102137]] * batch], enc_out_1)
    self.assertAllClose(
        [[[-1.122128, 1.111972, 4.642949, -2.14831]] * batch,
         [[-1.336919, 1.182709, 4.785938, -2.039246]] * batch,
         [[-1.335168, 1.297679, 4.720459, -2.111006]] * batch], dec_out)
def _testGPipeTransformerDecoderStackFProp(self, splits=1, num_micro_batches=1): batch = 4 tf.flags.FLAGS.tpu_compatible = True with self.session() as sess: params = self._TransformerParams( num_decoder_layers=4, num_encoder_layers=0, splits=splits, num_micro_batches=num_micro_batches) params.dtype = tf.float32 params.fprop_dtype = tf.float32 xformer = GPipeTransformerStack(params) inputs, paddings, tgt_inputs, tgt_paddings = self._random_inputs( batch) output = xformer.FProp(xformer.theta, inputs, paddings, tgt_inputs, tgt_paddings) tf.global_variables_initializer().run() output_val = sess.run(output) self.assertAllCloseAccordingToType( [[[1.03550637, -1.3199079]] * batch, [[-3.36382699, -0.74492991]] * batch, [[-3.36382723, -0.74492997]] * batch], output_val)
def _testGPipeTransformerStackTrainEncoderTransparentFProp(
    self, splits=1, num_micro_batches=1):
  """Encoder-transparent stack: encoder/decoder outputs match goldens."""
  # time = 2, batch = 4
  batch = 4  # Bug fix: `batch` was used below but never defined (NameError).
  with self.session() as sess:
    params = self._TransformerParams(
        splits=splits,
        num_micro_batches=num_micro_batches,
        num_decoder_layers=2,
        num_encoder_layers=2)
    params.is_transparent = True
    params.num_transparent_outputs = 1
    params.transparent_merger_dropout_prob = 0.0
    xformer = GPipeTransformerStack(params)
    inputs, paddings, tgt_inputs, tgt_paddings = self._random_inputs(
        batch=batch)
    py_utils.GetOrCreateGlobalStep()
    tf.set_random_seed(1234)
    tf.global_variables_initializer().run()
    enc_output = xformer.EncoderFPropDefaultTheta(inputs, paddings)
    dec_output = xformer.FProp(xformer.theta, inputs, paddings, tgt_inputs,
                               tgt_paddings)
    enc_out = sess.run(enc_output)
    dec_out = sess.run(dec_output)
    self.assertAllClose(enc_out, [[[-0.118476, 1.031626]] * batch,
                                  [[0.643884, -1.02581167]] * batch])
    self.assertAllClose(dec_out, [[[-2.8764534, 1.00808454]] * batch,
                                  [[1.02129495, -0.78406084]] * batch,
                                  [[1.02129495, -0.78406084]] * batch])
def _testGPipeTransformerStackTrainTransparentFProp(
    self, splits=1, num_micro_batches=1):
  """Transparent stack: the 3 transparent encoder outputs agree and match
  golden values, as does the decoder output."""
  # time = 2, batch = 4
  batch = 4  # Bug fix: `batch` was used below but never defined (NameError).
  with self.session() as sess:
    params = self._TransformerParams(
        splits=splits,
        num_micro_batches=num_micro_batches,
        num_decoder_layers=3,
        num_encoder_layers=1)
    params.is_transparent = True
    params.num_transparent_outputs = 3
    params.transparent_merger_dropout_prob = 0.0
    xformer = GPipeTransformerStack(params)
    inputs, paddings, tgt_inputs, tgt_paddings = self._random_inputs(
        batch=batch)
    py_utils.GetOrCreateGlobalStep()
    tf.set_random_seed(1234)
    tf.global_variables_initializer().run()
    enc_outputs = xformer.EncoderFPropDefaultTheta(inputs, paddings)
    dec_output = xformer.FProp(xformer.theta, inputs, paddings, tgt_inputs,
                               tgt_paddings)
    enc_out_1, enc_out_2, enc_out_3 = sess.run(enc_outputs)
    dec_out = sess.run(dec_output)
    # With merger dropout disabled all transparent outputs are identical.
    self.assertAllClose(enc_out_1, enc_out_2)
    self.assertAllClose(enc_out_2, enc_out_3)
    self.assertAllClose(enc_out_1, [[[-0.27896273, 1.46589136]] * batch,
                                    [[1.03141928, -0.847896]] * batch])
    self.assertAllClose(dec_out, [[[2.926736, -4.090812]] * batch,
                                  [[-1.69508219, 1.75891459]] * batch,
                                  [[-1.6950829, 1.75891507]] * batch])
def _testGPipeTransformerEncoderFPropDefaultTheta(self, splits=1,
                                                  num_micro_batches=1):
  """Encoder FProp with default theta matches golden values."""
  batch = 4
  tf.flags.FLAGS.tpu_compatible = True
  with self.session() as sess:
    params = self._TransformerParams(
        num_decoder_layers=4,
        num_encoder_layers=4,
        splits=splits,
        num_micro_batches=num_micro_batches)
    params.dtype = tf.float32
    params.fprop_dtype = tf.float32
    stack = GPipeTransformerStack(params)
    src_vecs, src_pads, _, _ = self._random_inputs(batch)
    enc_out = stack.EncoderFPropDefaultTheta(src_vecs, src_pads)
    tf.global_variables_initializer().run()
    enc_out_v = sess.run(enc_out)
    self.assertAllCloseAccordingToType(
        [[[0.21085747, 0.60925347]] * batch,
         [[0.21085747, 0.60925347]] * batch], enc_out_v)
def testGPipeTransformerStackTrainTransparentFPropWithEmbeddings(
    self, splits=1, num_micro_batches=1):
  """Transparent stack with embeddings: compares against golden values.

  NOTE(review): a test method with this exact name is also defined earlier
  in this file with different golden values; Python keeps only the last
  definition, so the earlier one is silently never run. The duplicates
  should be given distinct names.
  """
  # time = 2, batch = 4
  batch = 4  # Bug fix: `batch` was used below but never defined (NameError).
  with self.session() as sess:
    params = _TransformerParamsWithEmbeddings(
        splits=splits,
        num_micro_batches=num_micro_batches,
        num_decoder_layers=3,
        num_encoder_layers=1)
    params.is_transparent = True
    params.transparent_merger_dropout_prob = 0.0
    xformer = GPipeTransformerStack(params)
    input_ids, id_paddings, tgt_inputs, tgt_paddings, _, _ = (
        _TransformerRandomInputsIds(batch=batch))
    inputs, paddings, _, _ = _TransformerRandomInputsVecs(batch=batch)
    tf.set_random_seed(1234)
    tf.global_variables_initializer().run()
    enc_outputs = xformer.EncoderFPropDefaultTheta(inputs, paddings)
    # FProp returns a tuple; element [2] is compared against the goldens.
    dec_output = xformer.FProp(xformer.theta, input_ids, id_paddings,
                               tgt_inputs, tgt_paddings)[2]
    enc_out_1 = sess.run(enc_outputs)
    dec_out = sess.run(dec_output)
    self.assertAllClose(
        [[[0.68660116, 0.947429, 0.78953624, -1.20142817]] * batch,
         [[0.57919669, 1.12979364, 4.29336643, 0.45106331]] * batch],
        enc_out_1)
    self.assertAllClose(
        [[[-0.46651918, -1.62957835, 1.15657926, 1.08397353]] * batch,
         [[-0.34674695, -1.65999401, 1.08431196, 1.07384491]] * batch,
         [[-0.41073492, -1.60431314, 1.04607999, 1.08858371]] * batch],
        dec_out)
def testGPipeTransformerDecoderStackFPropWithEmbeddings(
    self, splits=1, num_micro_batches=1):
  """Decoder-only stack with embedding inputs matches golden values."""
  batch = 4
  tf.flags.FLAGS.tpu_compatible = True
  with self.session() as sess:
    params = self._TransformerParamsWithEmbeddings(
        num_decoder_layers=4,
        num_encoder_layers=0,
        splits=splits,
        num_micro_batches=num_micro_batches)
    params.dtype = tf.float32
    stack = GPipeTransformerStack(params)
    src_ids, src_pads, tgt_ids, tgt_pads = self._random_inputs_ids(batch)
    out = stack.FProp(stack.theta, src_ids, src_pads, tgt_ids, tgt_pads)
    tf.global_variables_initializer().run()
    out_v = sess.run(out)
    self.assertAllCloseAccordingToType(
        [[[-2.29650807, 0.25992393, 1.81951356, 1.52897644]] * batch,
         [[-2.14101386, 0.32607365, 1.73413348, 1.51806736]] * batch,
         [[-2.18863297, 0.34420109, 1.65913653, 1.58703828]] * batch], out_v)
def testGPipeTransformerMtModel(self, splits=1, num_micro_batches=1):
  """Smoke test: FProp with a softmax head runs end to end.

  There are no golden values here; the test only prints the resulting
  cross-entropy and logits.
  """
  batch = 4
  tf.flags.FLAGS.tpu_compatible = True
  with self.session() as sess:
    with tf.variable_scope('transformer_test', reuse=tf.AUTO_REUSE):
      params = self._TransformerParamsWithEmbeddings(
          splits=splits,
          num_micro_batches=num_micro_batches,
          num_decoder_layers=2,
          has_softmax=True)
      params.state_dtype = tf.float32
      stack = GPipeTransformerStack(params)
      src_ids, src_pads, tgt_ids, tgt_pads = (
          self._random_inputs_ids(batch=batch))
      labels = tf.ones([3, batch])
      label_weights = tf.ones([3, batch])
      tf.set_random_seed(1234)
      tf.global_variables_initializer().run()
      xent, logits = stack.FProp(stack.theta, src_ids, src_pads, tgt_ids,
                                 tgt_pads, None, None, labels, label_weights)
      xent_out, logits_out = sess.run([xent, logits])
      print('xent_out={}'.format(xent_out))
      print('logits_out={}'.format(logits_out))
def testGPipeTransformerFPropPackedInputWithEmbeddings(self, splits=1):
  """Packed-input FProp with embeddings must match the unpacked stack."""
  batch = 4
  tf.flags.FLAGS.tpu_compatible = True
  with self.session():
    with tf.variable_scope('transformer_test', reuse=tf.AUTO_REUSE):
      params = _TransformerParamsWithEmbeddings(
          splits=splits, num_decoder_layers=2)
      params.dtype = tf.float32
      params.fprop_dtype = tf.float32
      packed_params = params.Copy()
      packed_params.packed_input = True
      # Two stacks built in the same scope with AUTO_REUSE: one plain,
      # one with packed_input enabled.
      plain_stack = GPipeTransformerStack(params)
      packed_stack = GPipeTransformerStack(packed_params)
      # Prepare inputs; the packed variants fold the batch dimension away
      # so all sequences share a single row.
      inputs, paddings, tgt_inputs, tgt_paddings, _, _ = (
          _TransformerRandomInputsIds(batch))
      packed_inputs = tf.reshape(inputs, [-1, 1])
      packed_tgt_inputs = tf.reshape(tgt_inputs, [-1, 1])
      packed_paddings = tf.reshape(paddings, [-1, 1])
      packed_tg_paddings = tf.reshape(tgt_paddings, [-1, 1])
      segment_ids = tf.transpose(
          tf.constant([[0, 1, 2, 3, 0, 1, 2, 3]], dtype=tf.float32))
      tgt_segment_id = tf.transpose(
          tf.constant([[0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3]],
                      dtype=tf.float32))
      segment_pos_id = tf.transpose(
          tf.constant([[0, 0, 0, 0, 1, 1, 1, 1]], dtype=tf.int32))
      tgt_segment_pos_id = tf.transpose(
          tf.constant([[0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2]],
                      dtype=tf.int32))
      # FProp returns a tuple; element [2] is compared between the stacks.
      output = plain_stack.FProp(plain_stack.theta, inputs, paddings,
                                 tgt_inputs, tgt_paddings)[2]
      packed_output = packed_stack.FProp(
          packed_stack.theta, packed_inputs, packed_paddings,
          packed_tgt_inputs, packed_tg_paddings, segment_ids, tgt_segment_id,
          None, None, segment_pos_id, tgt_segment_pos_id)[2]
      packed_output = tf.reshape(packed_output, output.shape)
      self.evaluate(tf.global_variables_initializer())
      output, packed_output = self.evaluate([output, packed_output])
      self.assertAllClose(output, packed_output, rtol=1e-05, atol=1e-05)
def testGPipeTransformerStackTrainEncoderTransparentFPropEval(self):
  """Encoder-transparent stack in eval mode matches golden values."""
  # time = 2, batch = 4
  batch = 4  # Bug fix: `batch` was used below but never defined (NameError).
  with self.session() as sess:
    params = self._TransformerParams(
        num_decoder_layers=3, num_encoder_layers=3)
    params.is_transparent = True
    params.num_transparent_outputs = 1
    params.is_eval = True
    xformer = GPipeTransformerStack(params)
    inputs, paddings, _, _ = self._random_inputs(batch=batch)
    tf.global_variables_initializer().run()
    enc_outputs = xformer.EncoderFPropDefaultTheta(inputs, paddings)
    enc_out = sess.run(enc_outputs)
    self.assertAllClose(enc_out, [[[0.18823329, 0.71548849]] * batch,
                                  [[0.76032472, -0.82791042]] * batch])
def testGPipeTransformerStackTrainTransparentFPropEval(self):
  """Transparent stack in eval mode matches golden values."""
  # time = 2, batch = 4
  batch = 4  # Bug fix: `batch` was used below but never defined (NameError).
  with self.session() as sess:
    params = self._TransformerParams(
        num_decoder_layers=3, num_encoder_layers=1)
    params.is_transparent = True
    params.is_eval = True
    xformer = GPipeTransformerStack(params)
    inputs, paddings, _, _ = self._random_inputs(batch=batch)
    tf.global_variables_initializer().run()
    enc_outputs = xformer.EncoderFPropDefaultTheta(inputs, paddings)
    enc_out = sess.run(enc_outputs)
    self.assertAllClose(
        enc_out, [[[[-0.27896273] * 3, [1.46589136] * 3]] * batch,
                  [[[1.03141928] * 3, [-0.847896] * 3]] * batch])
def testGPipeTransformerStackFPropWithEmbeddings(self, splits=1,
                                                 num_micro_batches=1):
  """Encoder-only FProp through the embedding stack matches goldens."""
  batch = 4
  tf.flags.FLAGS.tpu_compatible = True
  with self.session() as sess:
    params = self._TransformerParamsWithEmbeddings(
        splits=splits, num_micro_batches=num_micro_batches)
    params.dtype = tf.float32
    params.fprop_dtype = tf.float32
    stack = GPipeTransformerStack(params)
    src_ids, src_pads, _, _ = self._random_inputs_ids(batch)
    out = stack.FProp(stack.theta, src_ids, src_pads)
    tf.global_variables_initializer().run()
    out_v = sess.run(out)
    self.assertAllCloseAccordingToType(
        [[[-1.67121327, -1.24759686, 1.41572773, 2.42515182]] * batch,
         [[-1.71240354, -1.1253252, 0.23407015, 3.40547156]] * batch], out_v)