def statistics_pooling_v2(features, feat_length, endpoints, params, is_training):
    """Statistics pooling.

    Note that we need to take care of zeros in the variance, since sqrt(0)
    leads to NaN gradients.

    Args:
        features: A tensor with shape [batch, length, dim].
        feat_length: The length of each utterance.
        endpoints: Outputs of different parts of the network.
        params: Network parameters.
        is_training: Used in BN (unused here; kept for a consistent interface).
    :return: Statistics pooling result [mean, stddev] with shape [batch, dim].
    """
    with tf.variable_scope("stat_pooling"):
        feat_shape = shape_list(features)
        frame_index = tf.tile(tf.expand_dims(tf.range(feat_shape[1]), axis=0),
                              [feat_shape[0], 1])
        feat_length = tf.expand_dims(feat_length, axis=1)
        feat_length_new = tf.tile(feat_length, [1, feat_shape[1]])
        mask = tf.expand_dims(tf.to_float(tf.less(frame_index, feat_length_new)), axis=2)
        feat_length = tf.to_float(tf.expand_dims(feat_length, axis=2))

        mean = tf.reduce_sum(features * mask, axis=1, keep_dims=True) / (feat_length + 1e-16)
        variance = tf.reduce_sum(tf.squared_difference(features, mean) * mask,
                                 axis=1, keep_dims=True) / (feat_length + 1e-16)
        mean = tf.squeeze(mean, 1)
        variance = tf.squeeze(variance, 1)

        # Floor the variance before sqrt to avoid NaN.
        mask = tf.to_float(tf.less_equal(variance, VAR2STD_EPSILON))
        variance = (1.0 - mask) * variance + mask * VAR2STD_EPSILON
        stddev = tf.sqrt(variance)
        stat_pooling = tf.concat([mean, stddev], 1, name="concat")

    return stat_pooling
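
# A minimal NumPy sketch (not part of the original graph code) of what
# statistics_pooling_v2 computes: a masked mean and stddev over the valid
# frames of each utterance, with the variance floored before sqrt so that
# all-padding or constant segments do not produce NaN. Values are made up.
def _masked_stats_demo():
    import numpy as np
    eps = 1e-12  # stands in for VAR2STD_EPSILON
    batch, length, dim = 2, 5, 3
    features = np.random.randn(batch, length, dim)
    feat_length = np.array([5, 3])  # the second utterance has 2 padded frames

    mask = (np.arange(length)[None, :] < feat_length[:, None]).astype(float)[:, :, None]
    mean = (features * mask).sum(axis=1) / feat_length[:, None]
    var = (((features - mean[:, None, :]) ** 2) * mask).sum(axis=1) / feat_length[:, None]
    stddev = np.sqrt(np.maximum(var, eps))  # flooring prevents sqrt(0) -> NaN gradient
    return np.concatenate([mean, stddev], axis=1)  # [batch, 2 * dim]
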
def self_attention(features, aux_features, endpoints, params, is_training=None):
    """Self-attention.

    In this implementation, `self` is not accurate because the key and the value
    may come from different nodes. Note that the key should have the same length
    as the value, i.e. no convnet is applied after the key layer, or some
    trimming strategy should be applied before the weighted sum.

    Note: We do not use `features` in this function. The key and the value are
    specified using params and are extracted from endpoints.

    Args:
        features: A tensor with shape [batch, length, dim].
        aux_features: Auxiliary input features with shape [batch, length, dim].
        endpoints: Outputs of different parts of the network. Useful when doing attention.
        params: Parameters for self-attention.
            params.att_key_input: endpoints[params.att_key_input] is used to compute the key.
            params.att_key_num_nodes: #nodes of the network to compute the key.
            params.att_key_network_type: The last layer to compute the key.
                In the intermediate layers, affine+bn+relu is usually applied.
                0: affine
                1: affine + relu
                2: affine + bn + relu
                3: affine + tanh
            params.att_value_input: endpoints[params.att_value_input] is used as the value of the component.
            params.att_value_num_nodes: #nodes of the network to compute the value.
            params.att_value_network_type: The last layer to compute the value (if it exists).
            params.att_apply_nonlinear: Whether a nonlinearity is applied after the attention
                weighted sum (default: false).
            params.att_use_scale: Whether to apply a scaling factor in the key*query operation.
            params.att_num_heads: The number of heads in multi-head attention.
            params.att_split_key: Whether to split the key when multi-head attention is used.
            params.att_penalty_term: The coefficient of the penalty term.
        is_training: Used in BN.
    :return: Attention result, also in the statistics format [weighted_mean, weighted_stddev].
    """
    relu = tf.nn.relu
    if "network_relu_type" in params.dict:
        if params.network_relu_type == "prelu":
            relu = prelu
        if params.network_relu_type == "lrelu":
            relu = tf.nn.leaky_relu

    with tf.variable_scope("attention"):
        value_features = endpoints[params.att_value_input]
        key_features = endpoints[params.att_key_input]

        # Key forward
        if len(params.att_key_num_nodes) > 1:
            for index, num_nodes in enumerate(params.att_key_num_nodes[:-1]):
                # The intermediate layers use affine+bn+relu
                key_features = dense_bn_relu(key_features, num_nodes, endpoints,
                                             params, is_training,
                                             name=("att_key%d" % index))
        # The last layer has different choices
        if params.att_key_network_type == 0:
            key_features = dense(key_features, params.att_key_num_nodes[-1],
                                 endpoints, params, is_training,
                                 name=("att_key%d" % (len(params.att_key_num_nodes) - 1)))
        elif params.att_key_network_type == 1:
            key_features = dense_relu(key_features, params.att_key_num_nodes[-1],
                                      endpoints, params, is_training,
                                      name=("att_key%d" % (len(params.att_key_num_nodes) - 1)))
        elif params.att_key_network_type == 2:
            key_features = dense_bn_relu(key_features, params.att_key_num_nodes[-1],
                                         endpoints, params, is_training,
                                         name=("att_key%d" % (len(params.att_key_num_nodes) - 1)))
        elif params.att_key_network_type == 3:
            key_features = dense_tanh(key_features, params.att_key_num_nodes[-1],
                                      endpoints, params, is_training,
                                      name=("att_key%d" % (len(params.att_key_num_nodes) - 1)))

        # Value forward
        if len(params.att_value_num_nodes) > 0:
            if len(params.att_value_num_nodes) > 1:
                for index, num_nodes in enumerate(params.att_value_num_nodes[:-1]):
                    value_features = dense_bn_relu(value_features, num_nodes,
                                                   endpoints, params, is_training,
                                                   name=("att_value%d" % index))
            if params.att_value_network_type == 0:
                value_features = dense(value_features, params.att_value_num_nodes[-1],
                                       endpoints, params, is_training,
                                       name=("att_value%d" % (len(params.att_value_num_nodes) - 1)))
            elif params.att_value_network_type == 1:
                value_features = dense_relu(value_features, params.att_value_num_nodes[-1],
                                            endpoints, params, is_training,
                                            name=("att_value%d" % (len(params.att_value_num_nodes) - 1)))
            elif params.att_value_network_type == 2:
                value_features = dense_bn_relu(value_features, params.att_value_num_nodes[-1],
                                               endpoints, params, is_training,
                                               name=("att_value%d" % (len(params.att_value_num_nodes) - 1)))
            elif params.att_value_network_type == 3:
                value_features = dense_tanh(value_features, params.att_value_num_nodes[-1],
                                            endpoints, params, is_training,
                                            name=("att_value%d" % (len(params.att_value_num_nodes) - 1)))

        # The last element in att_key_num_nodes and att_value_num_nodes is the
        # dimension of the key and the value. In multi-head attention, they are
        # extended n times.
        n_heads = params.att_num_heads
        assert shape_list(value_features)[2] % n_heads == 0, \
            "The dim of the value must be divisible by the number of heads."
        if params.att_split_key:
            assert shape_list(key_features)[2] % n_heads == 0

        # Split the value and (optionally) the key.
        value_features = split_heads(value_features, n_heads)
        if params.att_split_key:
            key_features = split_heads(key_features, n_heads)
        else:
            key_features = tf.expand_dims(key_features, axis=1)

        val_shape = shape_list(value_features)
        key_shape = shape_list(key_features)
        tf.logging.info(
            "Attention:\n"
            "  The dim of the value: %d, the dim of the key: %d\n"
            "  The layer has %d heads, resulting in the dim of value/key of each head %d/%d.\n"
            "  With weighted mean and stddev, the attention layer results in output with dim %d."
            % (val_shape[1] * val_shape[-1], key_shape[1] * key_shape[-1], n_heads,
               val_shape[-1], key_shape[-1], val_shape[1] * val_shape[-1] * 2))

        # Initialize the query so that the weight for each time step is equal
        # at the beginning.
        # TODO: How to decide the initial value of the query?
        query = tf.get_variable("query", [n_heads, key_shape[-1]], dtype=tf.float32,
                                initializer=tf.initializers.truncated_normal(stddev=0.1))

        if not params.att_split_key:
            query_time_key = tf.einsum('bmld,hd->blh', key_features, query,
                                       name="query_time_key")
        else:
            query_time_key = tf.einsum('bhld,hd->blh', key_features, query,
                                       name="query_time_key")

        if params.att_use_scale:
            query_time_key = query_time_key * tf.rsqrt(tf.to_float(key_shape[-1]))

        # weights is [b, h, l]
        weights = tf.nn.softmax(tf.transpose(query_time_key, [0, 2, 1]), name="weights")
        endpoints["attention_weights"] = weights

        att_mean = tf.einsum('bhld,bhl->bhd', value_features, weights, name="att_mean")
        att_stddev = tf.einsum('bhld,bhl->bhd',
                               tf.squared_difference(value_features,
                                                     tf.expand_dims(att_mean, axis=2)),
                               weights, name="att_stddev")

        att_mean = combine_last_two_dimensions(att_mean)
        att_stddev = combine_last_two_dimensions(att_stddev)

        mask = tf.to_float(tf.less_equal(att_stddev, VAR2STD_EPSILON))
        att_stddev = (1.0 - mask) * att_stddev + mask * VAR2STD_EPSILON
        att_stddev = tf.sqrt(att_stddev)
        att = tf.concat([att_mean, att_stddev], axis=1, name="concat")
        endpoints["att_output_before_nonlinear"] = att

        if params.att_apply_nonlinear:
            att = tf.layers.batch_normalization(att,
                                                momentum=params.batchnorm_momentum,
                                                training=is_training,
                                                name="att_post_bn")
            endpoints["att_post_bn"] = att
            att = relu(att, name='att_post_relu')
            endpoints["att_post_relu"] = att

        # Penalty term when multi-head attention is used.
        penalty = tf.einsum('ijk,ikl->ijl', weights,
                            tf.transpose(weights, [0, 2, 1])) - tf.eye(n_heads, batch_shape=[val_shape[0]])
        # Normalize using the batch size
        penalty = tf.reduce_sum(tf.square(penalty)) / tf.to_float(val_shape[0])
        penalty = params.att_penalty_term * penalty
        tf.add_to_collection("PENALTY", penalty)
        tf.summary.scalar("attention_penalty", penalty)

    return att
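
# A small NumPy sketch (illustration only) of the multi-head penalty used
# above, P = ||W W^T - I||_F^2 averaged over the batch, following
# "A Structured Self-Attentive Sentence Embedding" (https://arxiv.org/abs/1703.03130).
# The penalty is zero when the heads attend to disjoint frames and grows as
# the heads collapse onto the same frames. The weights below are made up.
def _attention_penalty_demo():
    import numpy as np
    # weights: [batch, heads, length]; each row sums to 1 (softmax outputs).
    disjoint = np.array([[[1.0, 0.0, 0.0, 0.0],
                          [0.0, 0.0, 1.0, 0.0]]])   # heads attend to different frames
    identical = np.array([[[0.5, 0.5, 0.0, 0.0],
                           [0.5, 0.5, 0.0, 0.0]]])  # heads collapse onto the same frames
    for w in (disjoint, identical):
        gram = np.einsum('bhl,bkl->bhk', w, w) - np.eye(w.shape[1])
        print((gram ** 2).sum() / w.shape[0])  # 0.0 for disjoint, 1.0 for identical
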
def ghost_vlad(features, aux_features, endpoints, params, is_training):
    """NetVLAD and GhostVLAD.

    See:
        NetVLAD: https://arxiv.org/abs/1511.07247
        GhostVLAD: https://arxiv.org/abs/1810.09951

    Args:
        features: A tensor with shape [batch, length, dim].
        aux_features: Auxiliary input features (unused here).
        endpoints: Outputs of different parts of the network.
        params:
            params.vlad_num_centers: #centers of the NetVLAD.
            params.vlad_num_ghosts: #centers for the ghost clusters.
            params.vlad_key_input: The key used to compute the weights.
            params.vlad_key_num_nodes: #nodes of the network to compute the key.
                An additional layer is applied to obtain the weights.
            params.vlad_value_input: The value to be aggregated.
            params.vlad_value_num_nodes: #nodes of the network to compute the value.
            params.vlad_final_l2_norm: Whether to do the final L2 normalization
                after concatenation.
        is_training: Used in BN.
    :return: The aggregated VLAD representation.
    """
    relu = tf.nn.relu
    if "network_relu_type" in params.dict:
        if params.network_relu_type == "prelu":
            relu = prelu
        if params.network_relu_type == "lrelu":
            relu = tf.nn.leaky_relu

    with tf.variable_scope("vlad"):
        value_features = endpoints[params.vlad_value_input]
        key_features = endpoints[params.vlad_key_input]

        # Value forward -> [b, l, d]
        if len(params.vlad_value_num_nodes) > 0:
            for index, num_nodes in enumerate(params.vlad_value_num_nodes):
                value_features = dense_bn_relu(value_features, num_nodes,
                                               endpoints, params, is_training,
                                               name=("vlad_value%d" % index))

        # Key forward
        if len(params.vlad_key_num_nodes) > 0:
            for index, num_nodes in enumerate(params.vlad_key_num_nodes):
                key_features = dense_bn_relu(key_features, num_nodes,
                                             endpoints, params, is_training,
                                             name=("vlad_key%d" % index))

        # Affine: wx+b -> [b, l, nclusters]
        key_features = tf.layers.dense(
            key_features,
            params.vlad_num_centers + params.vlad_num_ghosts,
            activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
            name="vlad_weight_affine")

        # The weights
        A = tf.nn.softmax(key_features, axis=-1, name="vlad_weights")
        endpoints["vlad_weights"] = A

        # Compute the residual
        cluster = tf.get_variable(
            "vlad_centers",
            [params.vlad_num_centers + params.vlad_num_ghosts, shape_list(value_features)[-1]],
            dtype=tf.float32,
            initializer=tf.contrib.layers.xavier_initializer(),
            regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer))
        res = tf.expand_dims(value_features, axis=2) - cluster
        A = tf.expand_dims(A, axis=-1)
        weighted_res = A * res
        cluster_res = tf.reduce_sum(weighted_res, axis=1)

        tf.logging.info("VLAD is used: %d clusters" % params.vlad_num_centers)
        if params.vlad_num_ghosts > 0:
            tf.logging.info("  %d ghost clusters are added" % params.vlad_num_ghosts)
            # Drop the ghost clusters before normalization.
            cluster_res = cluster_res[:, :params.vlad_num_centers, :]

        cluster_res = tf.nn.l2_normalize(cluster_res, axis=-1)
        output = tf.reshape(cluster_res,
                            [-1, params.vlad_num_centers * shape_list(cluster_res)[-1]])

        if params.vlad_final_l2_norm:
            output = tf.nn.l2_normalize(output, axis=-1)

        endpoints["vlad_value"] = value_features
        endpoints["vlad_key"] = key_features
        endpoints["vlad_centers"] = cluster

    return output
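
# A compact NumPy sketch (illustration only) of the (Ghost)VLAD aggregation
# above: soft-assign each frame to the clusters, aggregate the residuals per
# cluster, drop the ghost clusters, then intra-normalize. Shapes and values
# below are made up.
def _vlad_demo():
    import numpy as np
    b, l, d, k, g = 2, 7, 4, 3, 1          # batch, frames, dim, real clusters, ghosts
    x = np.random.randn(b, l, d)           # value features
    logits = np.random.randn(b, l, k + g)  # output of the key/affine network
    a = np.exp(logits) / np.exp(logits).sum(-1, keepdims=True)    # soft assignment
    centers = np.random.randn(k + g, d)
    res = x[:, :, None, :] - centers[None, None, :, :]            # [b, l, k+g, d]
    vlad = (a[..., None] * res).sum(axis=1)                       # [b, k+g, d]
    vlad = vlad[:, :k, :]                                         # drop the ghost clusters
    vlad /= np.linalg.norm(vlad, axis=-1, keepdims=True) + 1e-12  # intra-normalization
    return vlad.reshape(b, k * d)
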
def self_attention(features, aux_features, endpoints, params, is_training=None):
    """Self-attention.

    Note that the key should have the same length as the value, i.e. no convnet
    is applied after the key layer, or some trimming strategy should be applied
    before the weighted sum. (Refer to linguistic_attention.)

    Args:
        features: A tensor with shape [batch, length, dim].
        aux_features: Auxiliary input features with shape [batch, length, dim].
        endpoints: Outputs of different parts of the network. Useful when doing attention.
        params: Parameters for self-attention.
            params.self_att_key_input: Use endpoints[params.self_att_key_input] to compute the key.
            params.self_att_key_num_nodes: The network to compute the key.
            params.self_att_value_num_nodes: The network to compute the value.
            params.self_att_num_heads: The number of heads in multi-head attention.
            params.self_att_penalty_term: The coefficient of the penalty term.
            The final dimension of the key and the value is decided by
            self_att_key_num_nodes and self_att_value_num_nodes. If multi-head
            attention is used, the value will be split first (the key keeps its
            original dim).
        is_training: Used in BN.
    :return: Attention result, also in the statistics format [weighted_mean, weighted_stddev].
    """
    assert "self_att_key_input" in params.dict
    assert "self_att_key_num_nodes" in params.dict
    assert "self_att_value_num_nodes" in params.dict
    assert "self_att_num_heads" in params.dict
    assert "self_att_penalty_term" in params.dict

    with tf.variable_scope("attention"):
        value_features = features
        key_features = endpoints[params.self_att_key_input]

        if len(params.self_att_key_num_nodes) != 0:
            # According to "A STRUCTURED SELF-ATTENTIVE SENTENCE EMBEDDING",
            # the last layer of the key network is `affine + tanh`.
            if len(params.self_att_key_num_nodes) > 1:
                for index, node in enumerate(params.self_att_key_num_nodes[:-1]):
                    key_features = dense_relu(key_features, node, endpoints, params,
                                              is_training, name=("att_key%d" % index))
            key_features = dense_tanh(key_features, params.self_att_key_num_nodes[-1],
                                      endpoints, params, is_training,
                                      name=("att_key%d" % (len(params.self_att_key_num_nodes) - 1)))

        if len(params.self_att_value_num_nodes) != 0:
            tf.logging.info("Note: Add network to process the value input %s"
                            % value_features.name)
            for index, node in enumerate(params.self_att_value_num_nodes):
                value_features = dense_relu(value_features, node, endpoints, params,
                                            is_training, name=("att_value%d" % index))

        # The last element in self_att_key_num_nodes and self_att_value_num_nodes
        # is the dimension of the key and the value. In multi-head attention,
        # they are extended n times.
        n_heads = params.self_att_num_heads
        assert shape_list(value_features)[2] % n_heads == 0, \
            "The dim of the value must be divisible by the number of heads."

        # Split the value. The key uses the entire key vector (no splitting).
        value_features = split_heads(value_features, n_heads)

        val_shape = shape_list(value_features)
        key_shape = shape_list(key_features)
        tf.logging.info(
            "Attention:\n"
            "  The dim of the value: %d, the dim of the key: %d\n"
            "  The layer has %d heads, resulting in the dim of value of each head %d.\n"
            "  With weighted mean and stddev, the attention layer results in output with dim %d."
            % (val_shape[1] * val_shape[-1], key_shape[-1], n_heads, val_shape[-1],
               val_shape[1] * val_shape[-1] * 2))

        # Initialize the query so that the weight for each time step is equal
        # at the beginning.
        query = tf.get_variable("query", [n_heads, key_shape[-1]], dtype=tf.float32,
                                initializer=tf.initializers.truncated_normal(stddev=0.1))
        query_time_key = tf.einsum('ijl,kl->ijk', key_features, query,
                                   name="query_time_key")
        weights = tf.nn.softmax(tf.transpose(query_time_key, [0, 2, 1]), name="weights")

        att_mean = tf.einsum('bnld,bnl->bnd', value_features, weights, name="att_mean")
        att_stddev = tf.einsum('bnld,bnl->bnd',
                               tf.squared_difference(value_features,
                                                     tf.expand_dims(att_mean, axis=2)),
                               weights, name="att_stddev")

        att_mean = combine_last_two_dimensions(att_mean)
        att_stddev = combine_last_two_dimensions(att_stddev)

        mask = tf.to_float(tf.less_equal(att_stddev, VAR2STD_EPSILON))
        att_stddev = (1.0 - mask) * att_stddev + mask * VAR2STD_EPSILON
        att_stddev = tf.sqrt(att_stddev)
        att = tf.concat([att_mean, att_stddev], 1, name="concat")
        endpoints["attention_weights"] = weights

        # Penalty term
        penalty = tf.einsum('ijk,ikl->ijl', weights,
                            tf.transpose(weights, [0, 2, 1])) - tf.eye(n_heads, batch_shape=[val_shape[0]])
        # Normalize using the batch size
        penalty = tf.reduce_sum(tf.square(penalty)) / tf.to_float(val_shape[0])
        tf.add_to_collection("PENALTY", params.self_att_penalty_term * penalty)
        tf.summary.scalar("attention_penalty", params.self_att_penalty_term * penalty)

        # # Debug
        # # Comment these lines when running the code
        # endpoints["att_query"] = query
        # endpoints["att_key"] = key_features
        # endpoints["att_value"] = value_features

    return att
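
# A hypothetical configuration sketch (parameter names follow the docstring
# above; the concrete endpoint name and values are made up) showing how the
# dimensions interact: the value is projected to 1500 dims and split over 5
# heads (300 per head), while the 500-dim key is shared by all heads, giving
# an output [weighted_mean, weighted_stddev] of 2 * 1500 = 3000 dims.
EXAMPLE_SELF_ATT_PARAMS = {
    "self_att_key_input": "tdnn5_relu",   # hypothetical endpoint name
    "self_att_key_num_nodes": [500],      # last key layer: affine + tanh
    "self_att_value_num_nodes": [1500],   # must be divisible by self_att_num_heads
    "self_att_num_heads": 5,
    "self_att_penalty_term": 0.05,
}
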
def aux_attention(features, aux_features, endpoints, params, is_training=None):
    """Attention using auxiliary features.

    The attention layer has a minor problem: the length of the key may differ
    from the length of the value due to the convnet. The key usually has the
    original feature length while the value is shorter. Since we always use
    fully-connected layers in the key network, the key length stays the same.
    A workaround is to use the center of the key so that the lengths of the key
    and the value match.

    Note: When an auxiliary key is used, the hypothesis is that the length of
    this auxiliary feature is the same as the value.

    Args:
        features: A tensor with shape [batch, length, dim].
        aux_features: A dict. aux_features["aux_feat_name"]: The length is LONGER
            than features!!! The features are processed by the convnet, thus the
            length becomes shorter.
            TODO: How to trim the auxiliary features? Align left or center?
        endpoints: Outputs of different parts of the network.
        params: Parameters for the attention.
            params.att_aux_name: The names of the auxiliary features.
            params.att_key_input: Additional key input besides the auxiliary
                features. If None, only the auxiliary features are used.
            params.att_key_num_nodes: The network to compute the key.
            params.att_value_num_nodes: The network to compute the value.
            params.att_num_heads: The number of heads in multi-head attention.
            params.att_penalty_term: The coefficient of the penalty term.
            The final dimension of the key and the value is decided by
            att_key_num_nodes and att_value_num_nodes. If multi-head attention
            is used, the value will be split first (the key keeps its original dim).
        is_training: Used in BN.
    :return: Attention result in the statistics format [weighted_mean, weighted_stddev].
    """
    assert "att_aux_name" in params.dict
    assert "att_key_input" in params.dict
    assert "att_key_num_nodes" in params.dict
    assert "att_value_num_nodes" in params.dict
    assert "att_num_heads" in params.dict
    assert "att_penalty_term" in params.dict

    with tf.variable_scope("attention"):
        value_features = features
        for aux_name in params.att_aux_name:
            if aux_name not in aux_features:
                sys.exit("The aux features %s is not in aux_features." % aux_name)

        key_features = []
        for aux_name in params.att_aux_name:
            # Center trimming: use the center of the key to match the length of
            # the value. Integer division; this requires the total kernel size
            # to be odd.
            trim_length = (shape_list(aux_features[aux_name])[1]
                           - shape_list(value_features)[1]) // 2
            key_features.append(aux_features[aux_name][:, trim_length:-trim_length, :])
            # # TODO: If the lengths of the key and the value are the same, the
            # # slicing above produces an empty tensor (since -0 == 0), and the
            # # following tf.cond is useful. But the line above looks more neat (What...).
            # key_features = tf.cond(tf.equal(trim_length, 0),
            #                        lambda: aux_features[aux_name],
            #                        lambda: aux_features[aux_name][:, trim_length:-trim_length, :])

        tf.logging.info("Attention using auxiliary features:")
        if params.att_key_input is not None:
            if params.att_key_input not in endpoints:
                sys.exit("You specify the appended key %s, but I cannot find it in the endpoints."
                         % params.att_key_input)
            tf.logging.info("Append %s to the auxiliary features" % params.att_key_input)
            key_features.append(endpoints[params.att_key_input])

        # Concatenate all the features to form the key.
        key_features = tf.concat(key_features, axis=-1, name="key_features")

        if len(params.att_key_num_nodes) != 0:
            # According to "A STRUCTURED SELF-ATTENTIVE SENTENCE EMBEDDING",
            # the last layer of the key network is `affine + tanh`.
            if len(params.att_key_num_nodes) > 1:
                for index, node in enumerate(params.att_key_num_nodes[:-1]):
                    key_features = dense_relu(key_features, node, endpoints, params,
                                              is_training, name=("att_key%d" % index))
            key_features = dense_tanh(key_features, params.att_key_num_nodes[-1],
                                      endpoints, params, is_training,
                                      name=("att_key%d" % (len(params.att_key_num_nodes) - 1)))

        if len(params.att_value_num_nodes) != 0:
            tf.logging.info("Note: Add network to process the value input %s"
                            % value_features.name)
            for index, node in enumerate(params.att_value_num_nodes):
                value_features = dense_relu(value_features, node, endpoints, params,
                                            is_training, name=("att_value%d" % index))

        # The last element in att_key_num_nodes and att_value_num_nodes is the
        # dimension of the key and the value. In multi-head attention, they are
        # extended n times.
        n_heads = params.att_num_heads
        assert shape_list(value_features)[2] % n_heads == 0, \
            "The dim of the value must be divisible by the number of heads."

        # Split the value. The key uses the entire vector.
        value_features = split_heads(value_features, n_heads)

        val_shape = shape_list(value_features)
        key_shape = shape_list(key_features)
        tf.logging.info(
            "  The dim of the value: %d, the dim of the key: %d\n"
            "  The layer has %d heads, resulting in the dim of value of each head %d.\n"
            "  With weighted mean and stddev, the attention layer results in output with dim %d."
            % (val_shape[1] * val_shape[-1], key_shape[-1], n_heads, val_shape[-1],
               val_shape[1] * val_shape[-1] * 2))

        # Initialize the query so that the weight for each time step is equal
        # at the beginning.
        query = tf.get_variable("query", [n_heads, key_shape[-1]], dtype=tf.float32,
                                initializer=tf.initializers.truncated_normal(stddev=0.1))
        query_time_key = tf.einsum('ijl,kl->ijk', key_features, query,
                                   name="query_time_key")
        weights = tf.nn.softmax(tf.transpose(query_time_key, [0, 2, 1]), name="weights")

        att_mean = tf.einsum('bnld,bnl->bnd', value_features, weights, name="att_mean")
        att_stddev = tf.einsum('bnld,bnl->bnd',
                               tf.squared_difference(value_features,
                                                     tf.expand_dims(att_mean, axis=2)),
                               weights, name="att_stddev")

        att_mean = combine_last_two_dimensions(att_mean)
        att_stddev = combine_last_two_dimensions(att_stddev)

        mask = tf.to_float(tf.less_equal(att_stddev, VAR2STD_EPSILON))
        att_stddev = (1.0 - mask) * att_stddev + mask * VAR2STD_EPSILON
        att_stddev = tf.sqrt(att_stddev)
        att = tf.concat([att_mean, att_stddev], 1, name="concat")
        endpoints["attention_weights"] = weights

        # Penalty term
        penalty = tf.einsum('ijk,ikl->ijl', weights,
                            tf.transpose(weights, [0, 2, 1])) - tf.eye(n_heads, batch_shape=[val_shape[0]])
        penalty = tf.reduce_sum(tf.square(penalty)) / tf.to_float(val_shape[0])
        tf.add_to_collection("PENALTY", params.att_penalty_term * penalty)
        tf.summary.scalar("attention_penalty", params.att_penalty_term * penalty)

        # # Debug
        # # Comment these lines when running the code
        # endpoints["att_query"] = query
        # endpoints["att_key"] = key_features
        # endpoints["att_value"] = value_features

    return att
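
# A tiny sketch (illustration only) of the center trimming used above: if the
# value has passed through convolutions with total (even) context C, the key
# is longer by C frames and we drop C // 2 frames on each side. Slicing with
# `length - trim` instead of `-trim` avoids the empty-slice pitfall when
# trim == 0 that the TODO above mentions. The shapes are made up.
def _center_trim_demo():
    import numpy as np
    key = np.random.randn(1, 100, 8)    # original-length auxiliary features
    value_len = 92                      # value shortened by convnets (context = 8)
    trim = (key.shape[1] - value_len) // 2
    trimmed = key[:, trim:key.shape[1] - trim, :]
    assert trimmed.shape[1] == value_len
    return trimmed
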
def build_phone_encoder(features, speaker_labels, feature_length, params,
                        endpoints, reuse_variables, is_training=False):
    """Build the encoder for the phone latent variable.

    Use a TDNN and share the same structure in the lower layers.

    Args:
        features: The input features.
        speaker_labels: The speaker labels (i.e. the speaker indices). May be used in the future.
        feature_length: The length of each feature.
        params: The parameters.
        endpoints: Will be updated during building.
        reuse_variables: If true, reuse the existing variables.
        is_training: Used in batchnorm.
    :return: sampled_zs, mu_zs, logvar_zs
    """
    relu = tf.nn.relu
    if "network_relu_type" in params.dict:
        if params.network_relu_type == "prelu":
            relu = prelu
        if params.network_relu_type == "lrelu":
            relu = tf.nn.leaky_relu

    # # This is moved to the model config file.
    # # Acoustic network params:
    # # Most share 4 layers with the x-vector network.
    # # [-2,2], [-2,2], [-3,3], [0], [-4,0,4]
    # # The last fully-connected layer is appended as the phonetic embedding
    # layer_size = [512, 512, 512, 512, 512]
    # kernel_size = [5, 5, 7, 1, 3]
    # dilation_size = [1, 1, 1, 1, 4]

    num_layers = len(params.phone_kernel_size)
    layer_index = 0

    if params.num_shared_layers > 0:
        # We may share the lower layers of the two tasks.
        # Go through the shared layers between the speaker and phone networks.
        assert params.num_shared_layers < num_layers
        with tf.variable_scope("encoder", reuse=True):
            for i in range(params.num_shared_layers):
                if params.phone_kernel_size[layer_index] > 1:
                    if len(shape_list(features)) == 3:
                        # Add a dummy dim to support 2d conv
                        features = tf.expand_dims(features, axis=1)
                    features = tf.layers.conv2d(
                        features,
                        params.phone_layer_size[layer_index],
                        (1, params.phone_kernel_size[layer_index]),
                        activation=None,
                        dilation_rate=(1, params.phone_dilation_size[layer_index]),
                        kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
                        name='conv%d' % (layer_index + 1))
                elif params.phone_kernel_size[layer_index] == 1:
                    if len(shape_list(features)) == 4:
                        # Remove the dummy dim to apply the dense layer
                        features = tf.squeeze(features, axis=1)
                    features = tf.layers.dense(
                        features,
                        params.phone_layer_size[layer_index],
                        activation=None,
                        kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
                        name="dense%d" % (layer_index + 1))
                features = tf.layers.batch_normalization(
                    features,
                    momentum=params.batchnorm_momentum,
                    training=is_training,
                    name="bn%d" % (layer_index + 1))
                features = relu(features, name='relu%d' % (layer_index + 1))
                layer_index += 1

    with tf.variable_scope("encoder_phone", reuse=reuse_variables):
        # In the unshared part, the endpoints should be updated.
        while layer_index < num_layers:
            if params.phone_kernel_size[layer_index] > 1:
                if len(shape_list(features)) == 3:
                    features = tf.expand_dims(features, axis=1)
                features = tf.layers.conv2d(
                    features,
                    params.phone_layer_size[layer_index],
                    (1, params.phone_kernel_size[layer_index]),
                    activation=None,
                    dilation_rate=(1, params.phone_dilation_size[layer_index]),
                    kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
                    name='phn_conv%d' % (layer_index + 1))
                endpoints["phn_conv%d" % (layer_index + 1)] = features
            elif params.phone_kernel_size[layer_index] == 1:
                if len(shape_list(features)) == 4:
                    features = tf.squeeze(features, axis=1)
                features = tf.layers.dense(
                    features,
                    params.phone_layer_size[layer_index],
                    activation=None,
                    kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
                    name="phn_dense%d" % (layer_index + 1))
                endpoints["phn_dense%d" % (layer_index + 1)] = features
            features = tf.layers.batch_normalization(
                features,
                momentum=params.batchnorm_momentum,
                training=is_training,
                name="phn_bn%d" % (layer_index + 1))
            endpoints["phn_bn%d" % (layer_index + 1)] = features
            features = relu(features, name='phn_relu%d' % (layer_index + 1))
            endpoints["phn_relu%d" % (layer_index + 1)] = features
            layer_index += 1

        # The last layer
        if len(shape_list(features)) == 4:
            features = tf.squeeze(features, axis=1)

        # Similar to the speaker network, we may need to slice the features due
        # to the different contexts of the speaker and phone networks. At this
        # moment, I just make the hypothesis that the phone context is larger,
        # which means there is no need to slice for the phone network.
        if (params.speaker_left_context > params.phone_left_context
                and params.speaker_right_context > params.phone_right_context):
            raise NotImplementedError("This combination of speaker and phone contexts is not supported yet.")
            # features = features[:, params.speaker_left_context - params.phone_left_context:
            #                     params.phone_right_context - params.speaker_right_context, :]

        # # We do not validate the length because this would introduce the
        # # alignment (phn_labels), which is unnecessary when doing phone inference.
        # with tf.control_dependencies([tf.assert_equal(shape_list(features)[1], shape_list(self.phn_labels)[1])]):
        #     features = tf.identity(features)

        if "phone_dim" not in params.dict:
            params.dict["phone_dim"] = 512

        mu = tf.layers.dense(
            features,
            params.phone_dim,
            activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
            name="zp_dense")
        endpoints['zp_mu_dense'] = mu
        mu = tf.layers.batch_normalization(
            mu,
            momentum=params.batchnorm_momentum,
            training=is_training,
            name="zp_bn")
        endpoints['zp_mu_bn'] = mu
        mu = relu(mu, name='zp_mu_relu')
        endpoints['zp_mu_relu'] = mu

    # logvar is fixed to 0 in this version (i.e. unit variance).
    logvar = 0
    # epsilon = tf.random_normal(tf.shape(mu), name='zp_epsilon')
    # sample = mu + tf.exp(0.5 * logvar) * epsilon
    sample = mu

    return sample, mu, logvar
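
# A small helper sketch (not part of the original code) for the context
# bookkeeping discussed above: a TDNN layer with kernel k and dilation d adds
# (k - 1) // 2 * d frames of context on each side, so the example config
# kernel [5, 5, 7, 1, 3] / dilation [1, 1, 1, 1, 4] from the comment yields
# 2 + 2 + 3 + 0 + 4 = 11 frames per side.
def _tdnn_context(kernel_size, dilation_size):
    return sum((k - 1) // 2 * d for k, d in zip(kernel_size, dilation_size))

# _tdnn_context([5, 5, 7, 1, 3], [1, 1, 1, 1, 4]) == 11
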
def build_speaker_encoder(features, phone_labels, feature_length, params,
                          endpoints, reuse_variables, is_training=False):
    """Build the encoder for the speaker latent variable.

    Use the same TDNN network as the x-vector.

    Args:
        features: The input features.
        phone_labels: The phone labels (i.e. the alignment). Will be used in the future.
        feature_length: The length of each feature.
        params: The parameters.
        endpoints: Will be updated during building.
        reuse_variables: If true, reuse the existing variables.
        is_training: Used in batchnorm.
    :return: sampled_zs, mu_zs, logvar_zs
    """
    relu = tf.nn.relu
    if "network_relu_type" in params.dict:
        if params.network_relu_type == "prelu":
            relu = prelu
        if params.network_relu_type == "lrelu":
            relu = tf.nn.leaky_relu

    with tf.variable_scope("encoder", reuse=reuse_variables):
        # Layer 1: [-2,-1,0,1,2] --> [b, 1, l-4, 512]
        # conv2d + batchnorm + relu
        features = tf.layers.conv2d(
            features, 512, (1, 5), activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
            name='conv1')
        endpoints["conv1"] = features
        features = tf.layers.batch_normalization(
            features, momentum=params.batchnorm_momentum, training=is_training, name="bn1")
        endpoints["bn1"] = features
        features = relu(features, name='relu1')
        endpoints["relu1"] = features

        # Layer 2: [-2,-1,0,1,2] --> [b, 1, l-4, 512]
        # conv2d + batchnorm + relu
        # This is slightly different from Kaldi, which uses dilated convolution.
        features = tf.layers.conv2d(
            features, 512, (1, 5), activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
            name='conv2')
        endpoints["conv2"] = features
        features = tf.layers.batch_normalization(
            features, momentum=params.batchnorm_momentum, training=is_training, name="bn2")
        endpoints["bn2"] = features
        features = relu(features, name='relu2')
        endpoints["relu2"] = features

        # Layer 3: [-3,-2,-1,0,1,2,3] --> [b, 1, l-6, 512]
        # conv2d + batchnorm + relu
        # Still a non-dilated one.
        features = tf.layers.conv2d(
            features, 512, (1, 7), activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
            name='conv3')
        endpoints["conv3"] = features
        features = tf.layers.batch_normalization(
            features, momentum=params.batchnorm_momentum, training=is_training, name="bn3")
        endpoints["bn3"] = features
        features = relu(features, name='relu3')
        endpoints["relu3"] = features

        # Convert to [b, l, 512]. The output of the 3rd layer can simply be rank 3.
        features = tf.squeeze(features, axis=1)
        endpoints["relu3"] = features

        # Layer 4: [b, l, 512] --> [b, l, 512]
        features = tf.layers.dense(
            features, 512, activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
            name="dense4")
        endpoints["dense4"] = features
        features = tf.layers.batch_normalization(
            features, momentum=params.batchnorm_momentum, training=is_training, name="bn4")
        endpoints["bn4"] = features
        features = relu(features, name='relu4')
        endpoints["relu4"] = features

        # Layer 5: [b, l, x]
        if "num_nodes_pooling_layer" not in params.dict:
            # The default number of nodes before pooling
            params.dict["num_nodes_pooling_layer"] = 1500
        features = tf.layers.dense(
            features, params.num_nodes_pooling_layer, activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
            name="dense5")
        endpoints["dense5"] = features
        features = tf.layers.batch_normalization(
            features, momentum=params.batchnorm_momentum, training=is_training, name="bn5")
        endpoints["bn5"] = features
        features = relu(features, name='relu5')
        endpoints["relu5"] = features

        # Here we need to slice the features, since the original features were
        # expanded by the larger of the speaker and phone contexts. I make the
        # hypothesis that the phone context is larger, so the speaker network
        # needs the slicing.
        if (params.speaker_left_context < params.phone_left_context
                and params.speaker_right_context < params.phone_right_context):
            features = features[:, params.phone_left_context - params.speaker_left_context:
                                params.speaker_right_context - params.phone_right_context, :]
        else:
            raise NotImplementedError("This combination of speaker and phone contexts is not supported yet.")

        # Make sure we've got the right features.
        with tf.control_dependencies([tf.assert_equal(shape_list(features)[1],
                                                      shape_list(phone_labels)[1])]):
            # Pooling layer
            # The lengths of the utterances may be different. The original
            # pooling uses all the frames, which is not appropriate in this
            # case, so we created a new function (I don't want to change the
            # original one).
            if params.pooling_type == "statistics_pooling":
                features = statistics_pooling_v2(features, feature_length, endpoints,
                                                 params, is_training)
            else:
                raise NotImplementedError("Not implement %s pooling" % params.pooling_type)
            endpoints['pooling'] = features

        # Utterance-level network
        # Layer 6: [b, 512]
        features = tf.layers.dense(
            features, 512, activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
            name='dense6')
        endpoints['dense6'] = features
        features = tf.layers.batch_normalization(
            features, momentum=params.batchnorm_momentum, training=is_training, name="bn6")
        endpoints["bn6"] = features
        features = relu(features, name='relu6')
        endpoints["relu6"] = features

        # Layer 7: [b, x]
        if "speaker_dim" not in params.dict:
            # The default number of nodes in the last layer
            params.dict["speaker_dim"] = 512

        # We need the mean and logvar.
        mu = tf.layers.dense(
            features, params.speaker_dim, activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
            name="zs_dense")
        endpoints['zs_mu_dense'] = mu

        if "spk_last_layer_no_bn" not in params.dict:
            params.spk_last_layer_no_bn = False
        if not params.spk_last_layer_no_bn:
            mu = tf.layers.batch_normalization(
                mu, momentum=params.batchnorm_momentum, training=is_training, name="zs_bn")
            endpoints['zs_mu_bn'] = mu

        if "spk_last_layer_linear" not in params.dict:
            params.spk_last_layer_linear = False
        if not params.spk_last_layer_linear:
            mu = relu(mu, name="zs_mu_relu")
            endpoints['zs_mu_relu'] = mu

    # We do not compute logvar in this version.
    # Set logvar = 0 ==> var = 1
    logvar = 0
    # epsilon = tf.random_normal(tf.shape(mu), name='zs_epsilon')
    # sample = mu + tf.exp(0.5 * logvar) * epsilon
    sample = mu

    return sample, mu, logvar
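
# A minimal NumPy sketch (illustration only) of the reparameterization step
# that is commented out above: with logvar fixed at 0 the posterior is a unit
# variance Gaussian around mu, and the encoder simply returns the mean.
def _reparameterize_demo(mu, logvar):
    import numpy as np
    epsilon = np.random.randn(*mu.shape)
    return mu + np.exp(0.5 * logvar) * epsilon  # equals mu + epsilon when logvar == 0
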
def resnet_18(features, params, is_training=None, reuse_variables=None, aux_features=None):
    """Build a ResNet.

    Modified ResNet-18; the blocks are:
        [3/64, 3/64], [3/128, 3/128], [3/256, 3/256], [3/512, 3/512]
    The default numbers of blocks: [2, 2, 2, 2]
    The last 3 blocks can downsample the features.
    N fully-connected layers are appended to the output of the res blocks.
    There are actually 2 more layers than the standard ResNet implementation.
    The downsampling in ResNet-50 with a 1x1 kernel may lose the frequency resolution.

    About the network parameters (no batchnorm included):
        TDNN: 2.6M (or 4.2M without dilation)
        ETDNN: 4.4M (or 7.6M without dilation)
        Modified FTDNN: 9.2M
        Modified EFTDNN: 32M
        FTDNN: 9.0M
        EFTDNN: 19.8M (much smaller than the modified eftdnn)
        ResNet-18: 13.5M
        ResNet-34: 23.6M
        ResNet-50: 16.1M
        ResNet-101: 28.4M

    Args:
        features: A tensor with shape [batch, length, dim].
        params: Configuration loaded from a JSON.
        is_training: True if the network is used for training.
        reuse_variables: True if the network has been built and variable reuse is enabled.
        aux_features: Auxiliary features (e.g. linguistic features or bottleneck features).
    :return:
        features: The output of the last layer.
        endpoints: An OrderedDict containing the output of every component. The
            outputs are in the order that they are added to the network, so it
            is convenient to split the network by an output name.
    """
    # The strides only affect the last 3 conv blocks.
    time_stride = 2 if params.resnet_time_stride else 1

    # The dimension of the features should be 40.
    assert shape_list(features)[-1] == 40
    tf.logging.info("Build a ResNet-18 network.")

    # ReLU is a normal choice while other activation functions are possible.
    relu = tf.nn.relu
    if "network_relu_type" in params.dict:
        if params.network_relu_type == "prelu":
            relu = prelu
        elif params.network_relu_type == "lrelu":
            relu = tf.nn.leaky_relu

    # The block parameters, default: [2, 2, 2, 2]
    if "resnet_blocks" not in params.dict:
        params.dict["resnet_blocks"] = [2, 2, 2, 2]
    tf.logging.info("The resnet blocks: [%d, %d, %d, %d]",
                    params.resnet_blocks[0], params.resnet_blocks[1],
                    params.resnet_blocks[2], params.resnet_blocks[3])

    endpoints = OrderedDict()
    with tf.variable_scope("resnet_18", reuse=reuse_variables):
        # features: [N, L, F, 1]
        # ndim = shape_list(features)[-1]
        features = tf.expand_dims(features, axis=3)

        # Since we use 40-dim FBanks, the kernel should be smaller.
        # First conv. No strides are applied.
        features = tf.layers.conv2d(
            features, 64, (3, 3), padding='same', activation=None, use_bias=False,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
            name='conv0_1')
        features = tf.layers.batch_normalization(
            features, momentum=params.batchnorm_momentum, training=is_training, name="conv0_bn")
        features = relu(features, name='conv0_relu')
        if params.resnet_maxpooling:
            features = tf.layers.max_pooling2d(features, (3, 3), (1, 1),
                                               padding='same', name='conv0_max')

        # Conv Block 1
        features = conv_block(features, [[3, 3], [3, 3]], [64, 64], [1, 1],
                              params, is_training, relu, "conv1a")
        for i in range(params.resnet_blocks[0] - 1):
            features = identity_block(features, [[3, 3], [3, 3]], [64, 64],
                                      params, is_training, relu, "conv1b_%d" % i)

        # Conv Block 2
        features = conv_block(features, [[3, 3], [3, 3]], [128, 128], [time_stride, 2],
                              params, is_training, relu, "conv2a")
        for i in range(params.resnet_blocks[1] - 1):
            features = identity_block(features, [[3, 3], [3, 3]], [128, 128],
                                      params, is_training, relu, "conv2b_%d" % i)

        # Conv Block 3
        features = conv_block(features, [[3, 3], [3, 3]], [256, 256], [time_stride, 2],
                              params, is_training, relu, "conv3a")
        for i in range(params.resnet_blocks[2] - 1):
            features = identity_block(features, [[3, 3], [3, 3]], [256, 256],
                                      params, is_training, relu, "conv3b_%d" % i)

        # Conv Block 4
        features = conv_block(features, [[3, 3], [3, 3]], [512, 512], [time_stride, 2],
                              params, is_training, relu, "conv4a")
        for i in range(params.resnet_blocks[3] - 1):
            features = identity_block(features, [[3, 3], [3, 3]], [512, 512],
                                      params, is_training, relu, "conv4b_%d" % i)

        # features: [N, L/t, 5, 512]
        # The original ResNet uses average pooling to get [N, 512], which I
        # think eliminates the time resolution. Hence, in this implementation,
        # we first obtain [N, L, 512] via a conv layer and then use dense
        # layers to process the features.
        features = tf.layers.conv2d(
            features, 512, (1, shape_list(features)[2]), activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
            name='conv5')
        features = tf.layers.batch_normalization(
            features, momentum=params.batchnorm_momentum, training=is_training, name="conv5_bn")
        features = relu(features, name='conv5_relu')
        features = tf.squeeze(features, axis=2)

        # FC layers * 2
        features = tf.layers.dense(
            features, 512, activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
            name='dense1')
        features = tf.layers.batch_normalization(
            features, momentum=params.batchnorm_momentum, training=is_training, name="dense1_bn")
        features = relu(features, name='dense1_relu')

        features = tf.layers.dense(
            features, 1500, activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
            name='dense2')
        features = tf.layers.batch_normalization(
            features, momentum=params.batchnorm_momentum, training=is_training, name="dense2_bn")
        features = relu(features, name='dense2_relu')

        # Compute the number of parameters
        num_params = 3*3*64 + (2*3*3*64*64*params.resnet_blocks[0] + 64*64) + \
                     (3*3*64*128 + 3*3*128*128 + 64*128 + 2*3*3*128*128*(params.resnet_blocks[1]-1)) + \
                     (3*3*128*256 + 3*3*256*256 + 128*256 + 2*3*3*256*256*(params.resnet_blocks[2]-1)) + \
                     (3*3*256*512 + 3*3*512*512 + 256*512 + 2*3*3*512*512*(params.resnet_blocks[3]-1)) + \
                     (1*5*512*512 + 512*512 + 512*1500)
        tf.logging.info("The number of parameters of the frame-level network: %d" % num_params)
        num_layers = 4 + 2 * (params.resnet_blocks[0] + params.resnet_blocks[1] +
                              params.resnet_blocks[2] + params.resnet_blocks[3])
        tf.logging.info("The number of layers: %d" % num_layers)

        # Pooling
        features = general_pooling(features, aux_features, endpoints, params, is_training)

        # Utterance-level network
        # Layer 6: [b, 512]
        features = tf.layers.dense(
            features, 512, activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
            name='tdnn6_dense')
        endpoints['tdnn6_dense'] = features
        features = tf.layers.batch_normalization(
            features, momentum=params.batchnorm_momentum, training=is_training, name="tdnn6_bn")
        endpoints["tdnn6_bn"] = features
        features = relu(features, name='tdnn6_relu')
        endpoints["tdnn6_relu"] = features

        # Layer 7: [b, x]
        if "num_nodes_last_layer" not in params.dict:
            # The default number of nodes in the last layer
            params.dict["num_nodes_last_layer"] = 512
        features = tf.layers.dense(
            features, params.num_nodes_last_layer, activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(params.weight_l2_regularizer),
            name='tdnn7_dense')
        endpoints['tdnn7_dense'] = features

        if "last_layer_no_bn" not in params.dict:
            params.last_layer_no_bn = False
        if not params.last_layer_no_bn:
            features = tf.layers.batch_normalization(
                features, momentum=params.batchnorm_momentum, training=is_training, name="tdnn7_bn")
            endpoints["tdnn7_bn"] = features

        if "last_layer_linear" not in params.dict:
            params.last_layer_linear = False
        if not params.last_layer_linear:
            # If the last layer is linear, no further activation is needed.
            features = relu(features, name='tdnn7_relu')
            endpoints["tdnn7_relu"] = features

    return features, endpoints
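
# A quick sanity-check sketch (not part of the original code) that evaluates
# the frame-level parameter formula used in resnet_18 for the default blocks
# [2, 2, 2, 2]; it returns 13,503,040 ~= 13.5M, matching the ResNet-18 figure
# quoted in the docstring (batchnorm excluded, as stated there).
def _resnet18_param_count(blocks=(2, 2, 2, 2)):
    return (3*3*64 + (2*3*3*64*64*blocks[0] + 64*64) +
            (3*3*64*128 + 3*3*128*128 + 64*128 + 2*3*3*128*128*(blocks[1]-1)) +
            (3*3*128*256 + 3*3*256*256 + 128*256 + 2*3*3*256*256*(blocks[2]-1)) +
            (3*3*256*512 + 3*3*512*512 + 256*512 + 2*3*3*512*512*(blocks[3]-1)) +
            (1*5*512*512 + 512*512 + 512*1500))
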