def __init__(self, pooling_type, num_speakers):
    super(Tdnn, self).__init__()
    if pooling_type == "statistics_pooling":
        self.pooling_layer = statistics_pooling()
    self.num_speakers = num_speakers

    # Frame-level layers: conv2d/linear + batchnorm + relu
    self.layer1 = nn.Sequential(
        nn.Conv2d(in_channels=30, out_channels=512, kernel_size=(5, 1)),
        nn.BatchNorm2d(512),
        nn.ReLU())
    self.layer2 = nn.Sequential(
        nn.Conv2d(in_channels=512, out_channels=512, kernel_size=(5, 1)),
        nn.BatchNorm2d(512),
        nn.ReLU())
    self.layer3 = nn.Sequential(
        nn.Conv2d(in_channels=512, out_channels=512, kernel_size=(7, 1)),
        nn.BatchNorm2d(512),
        nn.ReLU())
    self.layer4 = nn.Sequential(
        nn.Linear(in_features=512, out_features=512),
        nn.BatchNorm1d(512),
        nn.ReLU())
    self.layer5 = nn.Sequential(
        nn.Linear(in_features=512, out_features=1500),
        nn.BatchNorm1d(1500),
        nn.ReLU())

    # Utterance-level layers after pooling
    # (3000 = mean + std of the 1500-dim frame-level output)
    self.layer6 = nn.Linear(in_features=3000, out_features=512)
    self.layer6_bn = nn.BatchNorm1d(512)
    self.layer6_act = nn.ReLU()
    self.layer7 = nn.Sequential(
        nn.Linear(in_features=512, out_features=512),
        nn.BatchNorm1d(512),
        nn.ReLU())
    self.softmax_layer = nn.Linear(in_features=512, out_features=num_speakers)
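# NOTE: a minimal sketch of a forward pass consistent with the layers defined
# above; it is an assumption, not the original forward(). It assumes the input
# is [batch, length, 30] and that statistics_pooling takes [b, t, 1500] and
# returns the concatenation of mean and standard deviation over time
# ([b, 3000], which matches the input size of layer6).
def forward(self, x):
    # [b, l, 30] -> [b, 30, l, 1] so Conv2d with kernel (k, 1) slides over time
    x = x.transpose(1, 2).unsqueeze(-1)
    x = self.layer1(x)
    x = self.layer2(x)
    x = self.layer3(x)
    # [b, 512, l', 1] -> [b, l', 512] for the frame-wise linear layers
    x = x.squeeze(-1).transpose(1, 2)
    # Flatten time into the batch so BatchNorm1d sees (N, C) inputs
    b, t, _ = x.shape
    x = x.reshape(b * t, -1)
    x = self.layer4(x)
    x = self.layer5(x)
    x = x.reshape(b, t, -1)
    # [b, t, 1500] -> [b, 3000] (assumed mean + std pooling over time)
    x = self.pooling_layer(x)
    x = self.layer6_act(self.layer6_bn(self.layer6(x)))
    x = self.layer7(x)
    return self.softmax_layer(x)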
def tdnn_svd(features, params, is_training=None, reuse_variables=None, aux_features=None):
    """Build a TDNN network whose layers can be split (factorized) by SVD.

    The structure is similar to Kaldi, but it uses bn+relu rather than relu+bn,
    and no dilation is used, so it has more parameters than the Kaldi x-vector.

    The SVD configuration (svd_params) is loaded inside this function from
    svd_json_path. It indicates which layers are split by SVD and must be
    updated by "update_mid_channels" before the graph is constructed.

    Args:
        features: A tensor with shape [batch, length, dim].
        params: Configuration loaded from a JSON.
        is_training: True if the network is used for training.
        reuse_variables: True if the network has already been built and variables should be reused.
        aux_features: Auxiliary features (e.g. linguistic features or bottleneck features).
    Returns:
        features: The output of the last layer.
        endpoints: An OrderedDict containing the output of every component. The outputs are in
            the order in which they are added to the network, so it is convenient to split the
            network at an output name.
    """
    name = 'tdnn_svd'
    svd_json_path = '/data2/liry/test/tf-kaldi-speaker/model/hello.json'
    svd_params = Params(svd_json_path)
    assert svd_params.updated

    # ReLU is a normal choice while other activation functions are possible.
    relu = tf.nn.relu

    for layer in svd_params.split:
        if svd_params.split[layer] and svd_params.mid_channels[layer] == -1:
            raise AttributeError(
                'Please update the mid_channels of %s before constructing the graph' % layer)

    if "network_relu_type" in params.dict:
        if params.network_relu_type == "prelu":
            relu = prelu
        if params.network_relu_type == "lrelu":
            relu = tf.nn.leaky_relu

    endpoints = OrderedDict()
    with tf.variable_scope(name, reuse=reuse_variables):
        # Convert to [b, 1, l, d]
        features = tf.expand_dims(features, 1)

        # Layer 1: [-2, -1, 0, 1, 2] --> [b, 1, l-4, 32]
        # conv2d + batchnorm + relu
        if svd_params.split['tdnn1_conv']:
            # SVD split: a (1, 5) conv down to the low-rank mid channels,
            # followed by a (1, 1) conv back to the full width.
            features = tf.layers.conv2d(features,
                                        svd_params.mid_channels['tdnn1_conv'],
                                        (1, 5),
                                        activation=None,
                                        kernel_regularizer=tf.contrib.layers.l2_regularizer(
                                            params.weight_l2_regularizer),
                                        name='tdnn1.0_conv',
                                        bias_initializer=tf.zeros_initializer())
            endpoints["tdnn1.0_conv"] = features
            features = tf.layers.conv2d(features,
                                        32,
                                        (1, 1),
                                        activation=None,
                                        kernel_regularizer=tf.contrib.layers.l2_regularizer(
                                            params.weight_l2_regularizer),
                                        name='tdnn1.5_conv')
            endpoints["tdnn1.5_conv"] = features
        else:
            features = tf.layers.conv2d(features,
                                        32,
                                        (1, 5),
                                        activation=None,
                                        kernel_regularizer=tf.contrib.layers.l2_regularizer(
                                            params.weight_l2_regularizer),
                                        name='tdnn1_conv')
            endpoints["tdnn1_conv"] = features
        features = tf.layers.batch_normalization(features,
                                                 momentum=params.batchnorm_momentum,
                                                 training=is_training,
                                                 name="tdnn1_bn")
        endpoints["tdnn1_bn"] = features
        features = relu(features, name='tdnn1_relu')
        endpoints["tdnn1_relu"] = features

        # Layer 2: [-2, -1, 0, 1, 2] --> [b, 1, l-4, 32]
        # conv2d + batchnorm + relu
        # This is slightly different from Kaldi, which uses dilated convolution.
        if svd_params.split['tdnn2_conv']:
            features = tf.layers.conv2d(features,
                                        svd_params.mid_channels['tdnn2_conv'],
                                        (1, 5),
                                        activation=None,
                                        kernel_regularizer=tf.contrib.layers.l2_regularizer(
                                            params.weight_l2_regularizer),
                                        name='tdnn2.0_conv',
                                        bias_initializer=tf.zeros_initializer())
            endpoints["tdnn2.0_conv"] = features
            features = tf.layers.conv2d(features,
                                        32,
                                        (1, 1),
                                        activation=None,
                                        kernel_regularizer=tf.contrib.layers.l2_regularizer(
                                            params.weight_l2_regularizer),
                                        name='tdnn2.5_conv')
            endpoints["tdnn2.5_conv"] = features
        else:
            features = tf.layers.conv2d(features,
                                        32,
                                        (1, 5),
                                        activation=None,
                                        kernel_regularizer=tf.contrib.layers.l2_regularizer(
                                            params.weight_l2_regularizer),
                                        name='tdnn2_conv')
            endpoints["tdnn2_conv"] = features
        features = tf.layers.batch_normalization(features,
                                                 momentum=params.batchnorm_momentum,
                                                 training=is_training,
                                                 name="tdnn2_bn")
        endpoints["tdnn2_bn"] = features
        features = relu(features, name='tdnn2_relu')
        endpoints["tdnn2_relu"] = features

        # Layer 3: [-3, -2, -1, 0, 1, 2, 3] --> [b, 1, l-6, 32]
        # conv2d + batchnorm + relu
        # Still a non-dilated convolution.
        if svd_params.split['tdnn3_conv']:
            features = tf.layers.conv2d(features,
                                        svd_params.mid_channels['tdnn3_conv'],
                                        (1, 7),
                                        activation=None,
                                        kernel_regularizer=tf.contrib.layers.l2_regularizer(
                                            params.weight_l2_regularizer),
                                        name='tdnn3.0_conv',
                                        bias_initializer=tf.zeros_initializer())
            endpoints["tdnn3.0_conv"] = features
            features = tf.layers.conv2d(features,
                                        32,
                                        (1, 1),
                                        activation=None,
                                        kernel_regularizer=tf.contrib.layers.l2_regularizer(
                                            params.weight_l2_regularizer),
                                        name='tdnn3.5_conv')
            endpoints["tdnn3.5_conv"] = features
        else:
            features = tf.layers.conv2d(features,
                                        32,
                                        (1, 7),
                                        activation=None,
                                        kernel_regularizer=tf.contrib.layers.l2_regularizer(
                                            params.weight_l2_regularizer),
                                        name='tdnn3_conv')
            endpoints["tdnn3_conv"] = features
        features = tf.layers.batch_normalization(features,
                                                 momentum=params.batchnorm_momentum,
                                                 training=is_training,
                                                 name="tdnn3_bn")
        endpoints["tdnn3_bn"] = features
        features = relu(features, name='tdnn3_relu')
        endpoints["tdnn3_relu"] = features

        # Convert to [b, l, 32]
        features = tf.squeeze(features, axis=1)
        # The output of the 3rd layer can simply be rank 3.
        endpoints["tdnn3_relu"] = features

        # Layer 4: [b, l, 32] --> [b, l, 512]
        if svd_params.split['tdnn4_dense']:
            features = tf.layers.dense(features,
                                       svd_params.mid_channels['tdnn4_dense'],
                                       activation=None,
                                       kernel_regularizer=tf.contrib.layers.l2_regularizer(
                                           params.weight_l2_regularizer),
                                       name="tdnn4.0_dense",
                                       bias_initializer=tf.zeros_initializer())
            endpoints["tdnn4.0_dense"] = features
            features = tf.layers.dense(features,
                                       512,
                                       activation=None,
                                       kernel_regularizer=tf.contrib.layers.l2_regularizer(
                                           params.weight_l2_regularizer),
                                       name="tdnn4.5_dense")
            endpoints["tdnn4.5_dense"] = features
        else:
            features = tf.layers.dense(features,
                                       512,
                                       activation=None,
                                       kernel_regularizer=tf.contrib.layers.l2_regularizer(
                                           params.weight_l2_regularizer),
                                       name="tdnn4_dense")
            endpoints["tdnn4_dense"] = features
        features = tf.layers.batch_normalization(features,
                                                 momentum=params.batchnorm_momentum,
                                                 training=is_training,
                                                 name="tdnn4_bn")
        endpoints["tdnn4_bn"] = features
        features = relu(features, name='tdnn4_relu')
        endpoints["tdnn4_relu"] = features

        # Layer 5: [b, l, 512] --> [b, l, x]
        if "num_nodes_pooling_layer" not in params.dict:
            # The default number of nodes before pooling
            params.dict["num_nodes_pooling_layer"] = 1500

        if svd_params.split['tdnn5_dense']:
            features = tf.layers.dense(features,
                                       svd_params.mid_channels['tdnn5_dense'],
                                       activation=None,
                                       kernel_regularizer=tf.contrib.layers.l2_regularizer(
                                           params.weight_l2_regularizer),
                                       name="tdnn5.0_dense",
                                       bias_initializer=tf.zeros_initializer())
            endpoints["tdnn5.0_dense"] = features
            features = tf.layers.dense(features,
                                       params.num_nodes_pooling_layer,
                                       activation=None,
                                       kernel_regularizer=tf.contrib.layers.l2_regularizer(
                                           params.weight_l2_regularizer),
                                       name="tdnn5.5_dense")
            endpoints["tdnn5.5_dense"] = features
        else:
            features = tf.layers.dense(features,
                                       params.num_nodes_pooling_layer,
                                       activation=None,
                                       kernel_regularizer=tf.contrib.layers.l2_regularizer(
                                           params.weight_l2_regularizer),
                                       name="tdnn5_dense")
            endpoints["tdnn5_dense"] = features
        features = tf.layers.batch_normalization(features,
                                                 momentum=params.batchnorm_momentum,
                                                 training=is_training,
                                                 name="tdnn5_bn")
        endpoints["tdnn5_bn"] = features
        features = relu(features, name='tdnn5_relu')
        endpoints["tdnn5_relu"] = features

        # Pooling layer
        # If you add a new pooling layer, modify this code.
        # Statistics pooling
        # [b, l, 1500] --> [b, x]
        if params.pooling_type == "statistics_pooling":
            features = statistics_pooling(features, aux_features, endpoints, params, is_training)
        elif params.pooling_type == "self_attention":
            features = self_attention(features, aux_features, endpoints, params, is_training)
        elif params.pooling_type == "ghost_vlad":
            features = ghost_vlad(features, aux_features, endpoints, params, is_training)
        # elif params.pooling_type == "aux_attention":
        #     features = aux_attention(features, aux_features, endpoints, params, is_training)
        else:
            raise NotImplementedError("Pooling type %s is not implemented" % params.pooling_type)
        endpoints['pooling'] = features

        # Utterance-level network
        # Layer 6: [b, 512]
        if svd_params.split['tdnn6_dense']:
            features = tf.layers.dense(features,
                                       svd_params.mid_channels['tdnn6_dense'],
                                       activation=None,
                                       kernel_regularizer=tf.contrib.layers.l2_regularizer(
                                           params.weight_l2_regularizer),
                                       name='tdnn6.0_dense',
                                       bias_initializer=tf.zeros_initializer())
            endpoints['tdnn6.0_dense'] = features
            features = tf.layers.dense(features,
                                       512,
                                       activation=None,
                                       kernel_regularizer=tf.contrib.layers.l2_regularizer(
                                           params.weight_l2_regularizer),
                                       name='tdnn6.5_dense')
            endpoints['tdnn6.5_dense'] = features
        else:
            features = tf.layers.dense(features,
                                       512,
                                       activation=None,
                                       kernel_regularizer=tf.contrib.layers.l2_regularizer(
                                           params.weight_l2_regularizer),
                                       name='tdnn6_dense')
            endpoints['tdnn6_dense'] = features
        features = tf.layers.batch_normalization(features,
                                                 momentum=params.batchnorm_momentum,
                                                 training=is_training,
                                                 name="tdnn6_bn")
        endpoints["tdnn6_bn"] = features
        features = relu(features, name='tdnn6_relu')
        endpoints["tdnn6_relu"] = features

        # Layer 7: [b, x]
        if "num_nodes_last_layer" not in params.dict:
            # The default number of nodes in the last layer
            params.dict["num_nodes_last_layer"] = 512

        if svd_params.split['tdnn7_dense']:
            features = tf.layers.dense(features,
                                       svd_params.mid_channels['tdnn7_dense'],
                                       activation=None,
                                       kernel_regularizer=tf.contrib.layers.l2_regularizer(
                                           params.weight_l2_regularizer),
                                       name='tdnn7.0_dense',
                                       bias_initializer=tf.zeros_initializer())
            endpoints['tdnn7.0_dense'] = features
            features = tf.layers.dense(features,
                                       params.num_nodes_last_layer,
                                       activation=None,
                                       kernel_regularizer=tf.contrib.layers.l2_regularizer(
                                           params.weight_l2_regularizer),
                                       name='tdnn7.5_dense')
            endpoints['tdnn7.5_dense'] = features
        else:
            features = tf.layers.dense(features,
                                       params.num_nodes_last_layer,
                                       activation=None,
                                       kernel_regularizer=tf.contrib.layers.l2_regularizer(
                                           params.weight_l2_regularizer),
                                       name='tdnn7_dense')
            endpoints['tdnn7_dense'] = features

        if "last_layer_no_bn" not in params.dict:
            params.last_layer_no_bn = False

        if not params.last_layer_no_bn:
            features = tf.layers.batch_normalization(features,
                                                     momentum=params.batchnorm_momentum,
                                                     training=is_training,
                                                     name="tdnn7_bn")
            endpoints["tdnn7_bn"] = features

        if "last_layer_linear" not in params.dict:
            params.last_layer_linear = False

        if not params.last_layer_linear:
            # If the last layer is linear, no further activation is needed.
            features = relu(features, name='tdnn7_relu')
            endpoints["tdnn7_relu"] = features

    return features, endpoints
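# The split branches above implement a low-rank factorization: a layer's weight
# matrix W of shape [d_in, d_out] is approximated as W ≈ A @ B, where A has
# mid_channels columns (the "tdnnX.0_*" layer, whose bias is initialized to zero)
# and B maps back to the original width (the "tdnnX.5_*" layer). The helper below
# is a hypothetical NumPy sketch, not part of this repository, of how such a
# factorization and the corresponding mid_channels value could be derived with a
# truncated SVD; "update_mid_channels" is assumed to store the chosen rank in
# svd_params.mid_channels.
import numpy as np

def svd_split_weight(weight, energy_ratio=0.9):
    """Factorize a [d_in, d_out] weight matrix into two low-rank factors.

    Returns (a, b, rank) such that weight ≈ a @ b, where a is [d_in, rank] and
    b is [rank, d_out]. The rank is the smallest value that keeps the given
    fraction of the squared singular-value energy.
    """
    u, s, vt = np.linalg.svd(weight, full_matrices=False)
    energy = np.cumsum(s ** 2) / np.sum(s ** 2)
    rank = int(np.searchsorted(energy, energy_ratio) + 1)
    sqrt_s = np.sqrt(s[:rank])
    a = u[:, :rank] * sqrt_s          # would initialize the "tdnnX.0_*" kernel
    b = sqrt_s[:, None] * vt[:rank]   # would initialize the "tdnnX.5_*" kernel
    return a, b, rank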
def tdnn_distill(features, params, is_training=None, reuse_variables=None, aux_features=None):
    """Build a slimmed TDNN network for distillation.

    The structure is similar to Kaldi, but it uses bn+relu rather than relu+bn,
    and no dilation is used, so it has more parameters than the Kaldi x-vector.

    Args:
        features: A tensor with shape [batch, length, dim].
        params: Configuration loaded from a JSON.
        is_training: True if the network is used for training.
        reuse_variables: True if the network has already been built and variables should be reused.
        aux_features: Auxiliary features (e.g. linguistic features or bottleneck features).
    Returns:
        features: The output of the last layer.
        endpoints: An OrderedDict containing the output of every component. The outputs are in
            the order in which they are added to the network, so it is convenient to split the
            network at an output name.
    """
    # ReLU is a normal choice while other activation functions are possible.
    relu = tf.nn.relu
    if "network_relu_type" in params.dict:
        if params.network_relu_type == "prelu":
            relu = prelu
        if params.network_relu_type == "lrelu":
            relu = tf.nn.leaky_relu

    endpoints = OrderedDict()
    with tf.variable_scope("tdnn", reuse=reuse_variables):
        # Convert to [b, 1, l, d]
        features = tf.expand_dims(features, 1)

        # Layer 1: [-2, -1, 0, 1, 2] --> [b, 1, l-4, 64]
        # conv2d + batchnorm + relu
        features = tf.layers.conv2d(features,
                                    64,
                                    (1, 5),
                                    activation=None,
                                    kernel_regularizer=tf.contrib.layers.l2_regularizer(
                                        params.weight_l2_regularizer),
                                    name='tdnn1_conv')
        endpoints["tdnn1_conv"] = features
        features = tf.layers.batch_normalization(features,
                                                 momentum=params.batchnorm_momentum,
                                                 training=is_training,
                                                 name="tdnn1_bn")
        endpoints["tdnn1_bn"] = features
        features = relu(features, name='tdnn1_relu')
        endpoints["tdnn1_relu"] = features

        # Layer 2: [-2, -1, 0, 1, 2] --> [b, 1, l-4, 64]
        # conv2d + batchnorm + relu
        # This is slightly different from Kaldi, which uses dilated convolution.
        features = tf.layers.conv2d(features,
                                    64,
                                    (1, 5),
                                    activation=None,
                                    kernel_regularizer=tf.contrib.layers.l2_regularizer(
                                        params.weight_l2_regularizer),
                                    name='tdnn2_conv')
        endpoints["tdnn2_conv"] = features
        features = tf.layers.batch_normalization(features,
                                                 momentum=params.batchnorm_momentum,
                                                 training=is_training,
                                                 name="tdnn2_bn")
        endpoints["tdnn2_bn"] = features
        features = relu(features, name='tdnn2_relu')
        endpoints["tdnn2_relu"] = features

        # Layer 3: [-3, -2, -1, 0, 1, 2, 3] --> [b, 1, l-6, 64]
        # conv2d + batchnorm + relu
        # Still a non-dilated convolution.
        features = tf.layers.conv2d(features,
                                    64,
                                    (1, 7),
                                    activation=None,
                                    kernel_regularizer=tf.contrib.layers.l2_regularizer(
                                        params.weight_l2_regularizer),
                                    name='tdnn3_conv')
        endpoints["tdnn3_conv"] = features
        features = tf.layers.batch_normalization(features,
                                                 momentum=params.batchnorm_momentum,
                                                 training=is_training,
                                                 name="tdnn3_bn")
        endpoints["tdnn3_bn"] = features
        features = relu(features, name='tdnn3_relu')
        endpoints["tdnn3_relu"] = features

        # Convert to [b, l, 64]
        features = tf.squeeze(features, axis=1)
        # The output of the 3rd layer can simply be rank 3.
        endpoints["tdnn3_relu"] = features

        # Layer 4: [b, l, 64] --> [b, l, 512]
        features = tf.layers.dense(features,
                                   512,
                                   activation=None,
                                   kernel_regularizer=tf.contrib.layers.l2_regularizer(
                                       params.weight_l2_regularizer),
                                   name="tdnn4_dense")
        endpoints["tdnn4_dense"] = features
        features = tf.layers.batch_normalization(features,
                                                 momentum=params.batchnorm_momentum,
                                                 training=is_training,
                                                 name="tdnn4_bn")
        endpoints["tdnn4_bn"] = features
        features = relu(features, name='tdnn4_relu')
        endpoints["tdnn4_relu"] = features

        # Layer 5: [b, l, 512] --> [b, l, x]
        if "num_nodes_pooling_layer" not in params.dict:
            # The default number of nodes before pooling
            params.dict["num_nodes_pooling_layer"] = 1500

        features = tf.layers.dense(features,
                                   params.num_nodes_pooling_layer,
                                   activation=None,
                                   kernel_regularizer=tf.contrib.layers.l2_regularizer(
                                       params.weight_l2_regularizer),
                                   name="tdnn5_dense")
        endpoints["tdnn5_dense"] = features
        features = tf.layers.batch_normalization(features,
                                                 momentum=params.batchnorm_momentum,
                                                 training=is_training,
                                                 name="tdnn5_bn")
        endpoints["tdnn5_bn"] = features
        features = relu(features, name='tdnn5_relu')
        endpoints["tdnn5_relu"] = features

        # Pooling layer
        # If you add a new pooling layer, modify this code.
        # Statistics pooling
        # [b, l, 1500] --> [b, x]
        if params.pooling_type == "statistics_pooling":
            features = statistics_pooling(features, aux_features, endpoints, params, is_training)
        elif params.pooling_type == "self_attention":
            features = self_attention(features, aux_features, endpoints, params, is_training)
        elif params.pooling_type == "ghost_vlad":
            features = ghost_vlad(features, aux_features, endpoints, params, is_training)
        # elif params.pooling_type == "aux_attention":
        #     features = aux_attention(features, aux_features, endpoints, params, is_training)
        else:
            raise NotImplementedError("Pooling type %s is not implemented" % params.pooling_type)
        endpoints['pooling'] = features

        # Utterance-level network
        # Layer 6: [b, 512]
        features = tf.layers.dense(features,
                                   512,
                                   activation=None,
                                   kernel_regularizer=tf.contrib.layers.l2_regularizer(
                                       params.weight_l2_regularizer),
                                   name='tdnn6_dense')
        endpoints['tdnn6_dense'] = features
        features = tf.layers.batch_normalization(features,
                                                 momentum=params.batchnorm_momentum,
                                                 training=is_training,
                                                 name="tdnn6_bn")
        endpoints["tdnn6_bn"] = features
        features = relu(features, name='tdnn6_relu')
        endpoints["tdnn6_relu"] = features

        # Layer 7: [b, x]
        if "num_nodes_last_layer" not in params.dict:
            # The default number of nodes in the last layer
            params.dict["num_nodes_last_layer"] = 64

        features = tf.layers.dense(features,
                                   params.num_nodes_last_layer,
                                   activation=None,
                                   kernel_regularizer=tf.contrib.layers.l2_regularizer(
                                       params.weight_l2_regularizer),
                                   name='tdnn7_dense')
        endpoints['tdnn7_dense'] = features

        if "last_layer_no_bn" not in params.dict:
            params.last_layer_no_bn = False

        if not params.last_layer_no_bn:
            features = tf.layers.batch_normalization(features,
                                                     momentum=params.batchnorm_momentum,
                                                     training=is_training,
                                                     name="tdnn7_bn")
            endpoints["tdnn7_bn"] = features

        if "last_layer_linear" not in params.dict:
            params.last_layer_linear = False

        if not params.last_layer_linear:
            # If the last layer is linear, no further activation is needed.
            features = relu(features, name='tdnn7_relu')
            endpoints["tdnn7_relu"] = features

    return features, endpoints
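# A minimal usage sketch (TF 1.x graph mode), not part of the original file.
# It assumes "nnet.json" is a hypothetical config path providing at least
# weight_l2_regularizer, batchnorm_momentum and pooling_type, and that the
# acoustic feature dimension is 30 (matching the PyTorch Tdnn above).
if __name__ == "__main__":
    params = Params("nnet.json")  # hypothetical config path
    feats = tf.placeholder(tf.float32, shape=[None, None, 30], name="features")
    outputs, endpoints = tdnn_distill(feats, params, is_training=True)
    # For x-vector extraction, the speaker embedding is usually read from an
    # intermediate endpoint after pooling, e.g. endpoints["tdnn6_dense"].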