def testStackingStreamStepRightContext(self):
  tf.random.set_seed(2021)
  batch_size, max_seqlen, input_dim, kernel = 2, 16, 8, 3
  left_context, right_context = 6, 3
  num_heads, ffn_dim = 2, 4
  stride = 1
  num_layers = 3
  num_groups = 2

  # Prepares inputs.
  np.random.seed(None)
  inputs = np.random.normal(
      0.1, 1, [batch_size, max_seqlen, input_dim]).astype(np.float32)
  print(f'np.sum(inputs): {np.sum(inputs)}')
  inputs = tf.convert_to_tensor(inputs)

  seqlen = np.random.randint(
      low=max_seqlen // 2, high=max_seqlen + 1, size=(batch_size,),
      dtype=np.int32)
  print(f'seqlen: {seqlen}')
  seqlen = tf.convert_to_tensor(seqlen)
  paddings = py_utils.PaddingsFromLengths(seqlen, max_seqlen)

  p = conformer_layer.ConformerLayer.CommonParams(
      input_dim=input_dim,
      is_causal=True,
      layer_order='conv_before_mhsa',
      atten_num_heads=num_heads,
      atten_left_context=left_context,
      atten_right_context=right_context,
      use_relative_atten=False,
      fflayer_hidden_dim=ffn_dim,
      kernel_size=kernel)
  p.lconv_tpl.conv_norm_layer_tpl = bn_layers.GroupNormLayer.Params().Set(
      num_groups=num_groups, cumulative=True)
  p.params_init = py_utils.WeightInit.Xavier(scale=1.0, seed=0)

  ps = [p.Copy().Set(name=f'base{i}') for i in range(num_layers)]
  layers = [x.Instantiate() for x in ps]

  base_outputs = self._BuildStackingBaseGraph(layers, num_layers, inputs,
                                              paddings)
  outputs = self._BuildStackingStreamGraph(layers, num_layers, inputs,
                                           paddings, stride)

  init_op = tf.global_variables_initializer()
  with self.session(use_gpu=False) as sess:
    sess.run(init_op)
    expected, actual = sess.run([base_outputs, outputs])
    print(f'expected: {repr(expected)}, {expected.shape}')
    print(f'actual: {repr(actual)}, {actual.shape}')
    print(f'np.sum(np.abs(expected)): {np.sum(np.abs(expected))}')
    print(f'np.sum(np.abs(actual)): {np.sum(np.abs(actual))}')
    self.assertAllClose(expected, actual, atol=2e-6, rtol=2e-6)
    self.assertEqual(tuple(expected.shape),
                     (batch_size, max_seqlen, input_dim))
def _GetInputs(self, batch_size, max_seqlen, input_dim, full_seq=False):
  # Prepares inputs.
  np.random.seed(None)
  if self.input_rank == 3:
    inputs = np.random.normal(
        0.5, 1, [batch_size, max_seqlen, input_dim]).astype(np.float32)
  else:
    assert self.input_rank == 4
    inputs = np.random.normal(
        0.5, 1, [batch_size, max_seqlen, 1, input_dim]).astype(np.float32)
  print(f'np.sum(inputs): {np.sum(inputs)}')
  inputs = tf.convert_to_tensor(inputs)

  if not full_seq:
    seqlen = np.random.randint(
        low=max_seqlen // 2, high=max_seqlen + 1, size=(batch_size,),
        dtype=np.int32)
  else:
    seqlen = np.full((batch_size,), max_seqlen, dtype=np.int32)
  print(f'seqlen: {seqlen}')
  seqlen = tf.convert_to_tensor(seqlen)
  paddings = py_utils.PaddingsFromLengths(seqlen, max_seqlen)
  return inputs, paddings
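# Illustrative sketch (not from the original tests): the helpers here assume
# py_utils.PaddingsFromLengths(seqlen, max_seqlen) returns a [batch, max_seqlen]
# float mask that is 0.0 at valid positions (t < seqlen[b]) and 1.0 at padded
# ones. A numpy equivalent under that assumption, with a hypothetical name and
# using the file's numpy import (np):
def _NumpyPaddingsFromLengths(seqlen, max_seqlen):
  # positions: [1, max_seqlen]; broadcasts against seqlen[:, None]: [batch, 1].
  positions = np.arange(max_seqlen)[np.newaxis, :]
  return (positions >= seqlen[:, np.newaxis]).astype(np.float32)

# E.g. seqlen=[2, 4], max_seqlen=4 gives [[0., 0., 1., 1.], [0., 0., 0., 0.]].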
def testStreamStep(self, testonly_skip_norm_layers=False, norm_type='ln'):
  with flagsaver.flagsaver(
      testonly_skip_norm_layers=testonly_skip_norm_layers
  ), cluster_factory.SetEval(True):
    assert norm_type in ('ln', 'gn')
    batch, max_seqlen, input_dim, kernel = 2, 8, 2, 3
    p = conformer_layer.LConvLayer.CommonParams(
        input_dim=input_dim, is_causal=True, kernel_size=kernel)
    if norm_type == 'ln':
      p.conv_norm_layer_tpl = lingvo_layers.LayerNorm.Params()
    else:
      p.conv_norm_layer_tpl = bn_layers.GroupNormLayer.Params().Set(
          num_groups=2, cumulative=True)
    p.name = 'lconv'
    l = p.Instantiate()
    init_op = tf.global_variables_initializer()

    np.random.seed(None)
    inputs = np.random.normal(
        0.1, 0.5, [batch, max_seqlen, input_dim]).astype(np.float32)
    print(f'np.sum(inputs): {np.sum(inputs)}')
    inputs = tf.convert_to_tensor(inputs)

    seqlen = np.random.randint(
        low=1, high=max_seqlen + 1, size=(batch,), dtype=np.int32)
    print(repr(seqlen))
    seqlen = tf.convert_to_tensor(seqlen)
    paddings = py_utils.PaddingsFromLengths(seqlen, max_seqlen)

    base_outputs, _ = l.FProp(l.theta, inputs, paddings)
    base_outputs *= tf.expand_dims(1. - paddings, -1)

    outputs = []
    state = l.zero_state(batch)
    for i in range(max_seqlen):
      output, _, state = l.StreamStep(l.theta, inputs[:, i:(i + 1), :],
                                      paddings[:, i:(i + 1)], state)
      outputs.append(output)
    # [b, t, d]
    outputs = tf.concat(outputs, axis=1)
    outputs *= tf.expand_dims(1. - paddings, -1)

    with self.session(use_gpu=False) as sess:
      sess.run(init_op)
      expected, actual = sess.run([base_outputs, outputs])
      print(repr(expected))
      print(repr(actual))
      print(f'np.sum(np.abs(expected)): {np.sum(np.abs(expected))}')
      print(f'np.sum(np.abs(actual)): {np.sum(np.abs(actual))}')
      self.assertAllClose(expected, actual)
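# Pattern note (added, not in the original file): the streaming tests here all
# follow the same recipe — run FProp over the whole sequence as the reference,
# replay the same inputs one stride at a time through StreamStep while
# threading `state`, zero out padded frames on both paths, and assert the two
# results match numerically.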
def testCausalDepthwiseConv2DLayerStreamStep(self,
                                             testonly_skip_norm_layers=False):
  with flagsaver.flagsaver(
      testonly_skip_norm_layers=testonly_skip_norm_layers):
    batch_size, max_seqlen, channel = 2, 32, 3
    kernel, channel_multiplier = 5, 1
    params = conv_layers.CausalDepthwiseConv2DLayer.Params().Set(
        name='conv',
        filter_stride=[1, 1],
        filter_shape=[kernel, 1, channel, channel_multiplier],
        params_init=py_utils.WeightInit.Gaussian(0.1))
    conv_layer = params.Instantiate()
    init_op = tf.global_variables_initializer()

    np.random.seed(None)
    inputs = np.random.normal(
        0.5, 1, [batch_size, max_seqlen, 1, channel]).astype(np.float32)
    print(f'np.sum(inputs): {np.sum(inputs)}')
    inputs = tf.convert_to_tensor(inputs)

    seqlen = tf.random.uniform([batch_size], minval=1, maxval=max_seqlen + 1,
                               dtype=tf.int32)
    input_padding = py_utils.PaddingsFromLengths(seqlen, max_seqlen)

    base_outputs, _ = conv_layer.FProp(conv_layer.theta, inputs,
                                       input_padding)
    base_outputs *= tf.reshape(1. - input_padding,
                               [batch_size, max_seqlen, 1, 1])

    outputs = []
    state = conv_layer.zero_state(batch_size)
    for i in range(max_seqlen):
      output, _, state = conv_layer.StreamStep(conv_layer.theta,
                                               inputs[:, i:(i + 1), :, :],
                                               input_padding[:, i:(i + 1)],
                                               state)
      outputs.append(output)
    # [b, t, 1, c * channel_multiplier]
    outputs = tf.concat(outputs, axis=1)
    outputs *= tf.reshape(1. - input_padding,
                          [batch_size, max_seqlen, 1, 1])

    with self.session(use_gpu=True) as sess:
      sess.run(init_op)
      expected, actual = sess.run([base_outputs, outputs])
      print(repr(expected))
      print(repr(actual))
      print(f'np.sum(np.abs(expected)): {np.sum(np.abs(expected))}')
      print(f'np.sum(np.abs(actual)): {np.sum(np.abs(actual))}')
      self.assertAllClose(expected, actual)
def _TestLeadingPaddingsHelper(self, stride=1):
  """Tests the leading-paddings case, useful for local atten with right ctx."""
  batch, max_seqlen, channel = 2, 16, 2
  kernel, channel_multiplier = 3, 2
  p = conv_layers.CausalDepthwiseConv2DLayer.Params().Set(
      name='conv',
      filter_stride=[1, 1],
      filter_shape=[kernel, 1, channel, channel_multiplier],
      params_init=py_utils.WeightInit.Gaussian(0.1))
  l = p.Instantiate()
  init_op = tf.global_variables_initializer()

  np.random.seed(None)
  inputs = np.random.normal(
      0.1, 0.5, [batch, max_seqlen, 1, channel]).astype(np.float32)
  print(f'np.sum(inputs): {np.sum(inputs)}')
  inputs_t = tf.convert_to_tensor(inputs)

  # The upper bound is max_seqlen - 1, so every example has some padding.
  seqlen = np.random.randint(
      low=1, high=max_seqlen, size=(batch,), dtype=np.int32)
  print(f'seqlen: {seqlen}')
  paddings = py_utils.PaddingsFromLengths(
      tf.convert_to_tensor(seqlen), max_seqlen)

  # Shifts each example right so that its trailing padding becomes leading.
  shift_inputs = np.array(inputs)
  for i in range(batch):
    shift_inputs[i] = np.roll(shift_inputs[i], max_seqlen - seqlen[i], axis=0)
  shift_inputs_t = tf.convert_to_tensor(shift_inputs)

  # Has the same number of padded tokens per example as `paddings`.
  leading_paddings = 1 - py_utils.PaddingsFromLengths(
      max_seqlen - tf.convert_to_tensor(seqlen), max_seqlen)

  def expand_pad(pad):  # pylint:disable=invalid-name
    return py_utils.AppendDims(pad, 2)

  def stream(l, inputs, paddings):  # pylint:disable=invalid-name
    state = l.zero_state(batch)
    all_outs = []
    for i in range(max_seqlen // stride):
      step_inputs = inputs[:, stride * i:stride * (i + 1)]
      step_paddings = paddings[:, stride * i:stride * (i + 1)]
      output, _, state = l.StreamStep(l.theta, step_inputs, step_paddings,
                                      state)
      all_outs.append(output)
    all_outs = tf.concat(all_outs, axis=1)
    return all_outs * (1. - expand_pad(paddings))

  base_outs = stream(l, inputs_t, paddings)
  actual_outs = stream(l, shift_inputs_t, leading_paddings)

  with self.session(use_gpu=False) as sess:
    sess.run(init_op)
    expected, actual = sess.run([base_outs, actual_outs])
    # Rolls the shifted outputs back so they align with the baseline.
    for i in range(batch):
      actual[i] = np.roll(actual[i], -(max_seqlen - seqlen[i]), axis=0)
    print(f'expected: {repr(expected)}')
    print(f'actual: {repr(actual)}')
    print(f'np.sum(np.abs(expected)): {np.sum(np.abs(expected))}')
    print(f'np.sum(np.abs(actual)): {np.sum(np.abs(actual))}')
    self.assertAllClose(expected, actual)
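# Minimal numpy sketch (added illustration, not part of the test) of the
# roll-based alignment used above: rolling an example right by its padding
# count turns trailing padding into leading padding, and rolling the streamed
# output back by the same amount realigns it with the unshifted baseline.
x = np.array([1., 2., 3., 0., 0.])      # seqlen=3, max_seqlen=5
shifted = np.roll(x, 5 - 3)             # [0., 0., 1., 2., 3.]
restored = np.roll(shifted, -(5 - 3))   # back to [1., 2., 3., 0., 0.]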
def testStreamStep(self,
                   testonly_skip_norm_layers=False,
                   norm_type='ln',
                   num_groups=2,
                   stride=1,
                   layer_order='conv_before_mhsa',
                   has_lconv='depthwise',
                   has_fflayer_start=True,
                   right_context=0):
  assert norm_type in ('ln', 'gn'), norm_type
  with flagsaver.flagsaver(
      testonly_skip_norm_layers=testonly_skip_norm_layers
  ), cluster_factory.SetEval(True):
    batch, max_seqlen, input_dim, kernel = 2, 16, 8, 3
    assert max_seqlen % stride == 0
    if layer_order == 'mhsa':
      kernel = None
    num_heads, left_context, ffn_dim = 2, 3, 4
    p = conformer_layer.ConformerLayer.CommonParams(
        input_dim=input_dim,
        is_causal=True,
        atten_num_heads=num_heads,
        atten_left_context=left_context,
        atten_right_context=right_context,
        use_relative_atten=False,
        fflayer_hidden_dim=ffn_dim,
        kernel_size=kernel,
        layer_order=layer_order)
    if norm_type == 'ln':
      p.lconv_tpl.conv_norm_layer_tpl = lingvo_layers.LayerNorm.Params()
    else:
      p.lconv_tpl.conv_norm_layer_tpl = bn_layers.GroupNormLayer.Params().Set(
          num_groups=num_groups, cumulative=True)
    if not has_lconv:
      p.lconv_tpl = None
    elif has_lconv == 'conv2d':
      p.lconv_tpl.depthwise_conv_tpl = (
          conv_layers_with_time_padding.CausalConv2DLayerWithPadding.Params())
    else:
      assert has_lconv == 'depthwise'
    if not has_fflayer_start:
      p.fflayer_start_tpl = None
    p.name = 'conformer'
    l = p.Instantiate()
    init_op = tf.global_variables_initializer()

    np.random.seed(None)
    inputs = 5 * np.random.normal(
        0.1, 0.5, [batch, max_seqlen, input_dim]).astype(np.float32)
    print(f'np.sum(inputs): {np.sum(inputs)}')
    inputs = tf.convert_to_tensor(inputs)

    seqlen = np.random.randint(
        low=1, high=max_seqlen + 1, size=(batch,), dtype=np.int32)
    print(f'seqlen: {seqlen}')
    seqlen = tf.convert_to_tensor(seqlen)
    paddings = py_utils.PaddingsFromLengths(seqlen, max_seqlen)

    base_output_map = l.FProp(
        l.theta, py_utils.NestedMap(features=inputs, paddings=paddings))
    base_outputs = base_output_map.features
    base_outputs *= tf.expand_dims(1. - paddings, -1)

    outputs = []
    state = l.zero_state(batch)
    # Runs extra all-padding steps at the end to flush the right context.
    for i in range(max_seqlen // stride +
                   int(math.ceil(right_context / stride))):
      if i < max_seqlen // stride:
        step_inputs = inputs[:, stride * i:stride * (i + 1)]
        step_paddings = paddings[:, stride * i:stride * (i + 1)]
      else:
        step_inputs = tf.zeros_like(inputs[:, 0:stride])
        step_paddings = tf.ones_like(paddings[:, 0:stride])
      output, _, state = l.StreamStep(l.theta, step_inputs, step_paddings,
                                      state)
      outputs.append(output)
    outputs = tf.concat(outputs, axis=1)
    # Drops the first `right_context` warm-up frames, then trims to the
    # original length.
    outputs = outputs[:, right_context:][:, :max_seqlen]
    outputs *= tf.reshape(1. - paddings, [batch, max_seqlen, 1])

    with self.session(use_gpu=False) as sess:
      sess.run(init_op)
      expected, actual = sess.run([base_outputs, outputs])
      print(repr(expected))
      print(repr(actual))
      print(f'np.sum(np.abs(expected)): {np.sum(np.abs(expected))}')
      print(f'np.sum(np.abs(actual)): {np.sum(np.abs(actual))}')
      tol = 3.e-6 if testonly_skip_norm_layers else 2.e-5
      self.assertAllClose(expected, actual, atol=tol, rtol=tol)
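# Worked example (added note, not in the original test) of the flush logic
# above: with right_context R and step size `stride`, each StreamStep output
# lags the input by R frames, so the loop runs ceil(R / stride) extra
# all-padding steps to flush the lookahead. For R=3, stride=1, max_seqlen=16
# that is 16 + 3 = 19 steps emitting 19 frames; outputs[:, 3:][:, :16] drops
# the 3 warm-up frames and keeps the 16 that align with the input.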
def testStreamStep(self,
                   testonly_skip_norm_layers=False,
                   norm_type='ln',
                   num_groups=2,
                   stride=1,
                   layer_order='conv_before_mhsa',
                   has_lconv=True,
                   has_fflayer_start=True):
  assert norm_type in ('ln', 'gn'), norm_type
  with flagsaver.flagsaver(
      testonly_skip_norm_layers=testonly_skip_norm_layers
  ), cluster_factory.SetEval(True):
    batch, max_seqlen, input_dim, kernel = 2, 16, 8, 3
    if layer_order == 'mhsa':
      kernel = None
    num_heads, left_context, ffn_dim = 2, 3, 4
    p = conformer_layer.ConformerLayer.CommonParams(
        input_dim=input_dim,
        is_causal=True,
        atten_num_heads=num_heads,
        atten_left_context=left_context,
        atten_right_context=0,
        use_relative_atten=False,
        fflayer_hidden_dim=ffn_dim,
        kernel_size=kernel,
        layer_order=layer_order)
    if norm_type == 'ln':
      p.lconv_tpl.conv_norm_layer_tpl = layers.LayerNorm.Params()
    else:
      p.lconv_tpl.conv_norm_layer_tpl = bn_layers.GroupNormLayer.Params().Set(
          num_groups=num_groups, cumulative=True)
    if not has_lconv:
      p.lconv_tpl = None
    if not has_fflayer_start:
      p.fflayer_start_tpl = None
    p.name = 'conformer'
    l = p.Instantiate()
    init_op = tf.global_variables_initializer()

    np.random.seed(None)
    inputs = 5 * np.random.normal(
        0.1, 0.5, [batch, max_seqlen, input_dim]).astype(np.float32)
    print(f'np.sum(inputs): {np.sum(inputs)}')
    inputs = tf.convert_to_tensor(inputs)

    seqlen = np.random.randint(
        low=1, high=max_seqlen + 1, size=(batch,), dtype=np.int32)
    print(repr(seqlen))
    seqlen = tf.convert_to_tensor(seqlen)
    paddings = py_utils.PaddingsFromLengths(seqlen, max_seqlen)

    base_output_map = l.FProp(
        l.theta, py_utils.NestedMap(features=inputs, paddings=paddings))
    base_outputs = base_output_map.features
    base_outputs *= tf.expand_dims(1. - paddings, -1)

    outputs = []
    state = l.zero_state(batch)
    for i in range(0, max_seqlen, stride):
      output, _, state = l.StreamStep(l.theta, inputs[:, i:(i + stride), :],
                                      paddings[:, i:(i + stride)], state)
      outputs.append(output)
    # [b, t, d]
    outputs = tf.concat(outputs, axis=1)
    outputs *= tf.expand_dims(1. - paddings, -1)

    with self.session(use_gpu=False) as sess:
      sess.run(init_op)
      expected, actual = sess.run([base_outputs, outputs])
      print(repr(expected))
      print(repr(actual))
      print(f'np.sum(np.abs(expected)): {np.sum(np.abs(expected))}')
      print(f'np.sum(np.abs(actual)): {np.sum(np.abs(actual))}')
      tol = 2.e-6 if testonly_skip_norm_layers else 2.e-5
      self.assertAllClose(expected, actual, atol=tol, rtol=tol)