Example #1
def statistics_pooling_v2(features, feat_length, endpoints, params, is_training):
    """Statistics pooling
    Note that we need to take care of the zeros in the variance since the sqrt on 0 will lead to NaN.

    Args:
        features: A tensor with shape [batch, length, dim].
        feat_length: The length of each utterance.
        endpoints: Outputs of different parts of the network.
        params: Parameters (unused in this function).
        is_training: Unused; kept for a consistent pooling interface.
    :return:
        Statistics pooling result [mean, stddev] with shape [batch, dim].
    """
    with tf.variable_scope("stat_pooling"):
        feat_shape = shape_list(features)
        frame_index = tf.tile(tf.expand_dims(tf.range(feat_shape[1]), axis=0), [feat_shape[0], 1])
        feat_length = tf.expand_dims(feat_length, axis=1)
        feat_length_new = tf.tile(feat_length, [1, feat_shape[1]])
        mask = tf.expand_dims(tf.to_float(tf.less(frame_index, feat_length_new)), axis=2)
        feat_length = tf.to_float(tf.expand_dims(feat_length, axis=2))
        mean = tf.reduce_sum(features * mask, axis=1, keep_dims=True) / (feat_length + 1e-16)
        variance = tf.reduce_sum(tf.squared_difference(features, mean) * mask, axis=1, keep_dims=True) / (feat_length + 1e-16)

        mean = tf.squeeze(mean, 1)
        variance = tf.squeeze(variance, 1)

        mask = tf.to_float(tf.less_equal(variance, VAR2STD_EPSILON))
        variance = (1.0 - mask) * variance + mask * VAR2STD_EPSILON
        stddev = tf.sqrt(variance)
        stat_pooling = tf.concat([mean, stddev], 1, name="concat")

    return stat_pooling
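
A minimal NumPy sketch of the same masked mean/stddev computation, useful for sanity-checking the graph above on toy data (the helper below is illustrative and not part of the original code):

import numpy as np

def masked_mean_stddev(features, feat_length, eps=1e-12):
    """features: [batch, length, dim]; feat_length: [batch] valid frame counts."""
    _, length, _ = features.shape
    # mask[i, t, 0] = 1 while frame t is valid for utterance i, else 0
    mask = (np.arange(length)[None, :] < feat_length[:, None]).astype(features.dtype)[:, :, None]
    denom = feat_length[:, None].astype(features.dtype)
    mean = (features * mask).sum(axis=1) / denom
    var = (((features - mean[:, None, :]) ** 2) * mask).sum(axis=1) / denom
    # Floor tiny variances before the sqrt, mirroring the VAR2STD_EPSILON trick above.
    return np.concatenate([mean, np.sqrt(np.maximum(var, eps))], axis=1)

feats = np.random.randn(2, 5, 4).astype(np.float32)
print(masked_mean_stddev(feats, np.array([5, 3])).shape)  # (2, 8)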
Example #2
def self_attention(features,
                   aux_features,
                   endpoints,
                   params,
                   is_training=None):
    """Self-attention.
    In this implementation, `self` is not accurate because the key and value may come from different nodes.
    Note that the key should be the same length with the value, i.e. no convnet is applied after the key layer, or
    some trimming strategy should be applied before the weighted sum.

    Note: We do not use features in this function. The key and value are specified using params
          and are extracted from endpoints.

    Args:
        features: A tensor with shape [batch, length, dim].
        aux_features: Auxiliary input features with shape [batch, length, dim].
        endpoints: Outputs of different parts of the network. Useful when doing attention.
        params: Parameters for self-attention.
            params.att_key_input: endpoints[params.att_key_input] is used to compute the key.
            params.att_key_num_nodes: #nodes of the network to compute the key.
            params.att_key_network_type: The last layer to compute the key.
                                         In the intermediate layers, affine+bn+relu is usually applied
                                         0: affine
                                         1: affine + relu
                                         2: affine + bn + relu
                                         3: affine + tanh
            params.att_value_input: endpoints[params.att_value_input] is used as the value of the component.
            params.att_value_num_nodes: #nodes of the network to compute the value.
            params.att_value_network_type: The type of the last layer to compute the value (if it exists).
            params.att_apply_nonlinear: The nonlinearity is applied after the attention weighted sum (default: false).
            params.att_use_scale: Whether to apply a scaling factor when doing the key*query operation.
            params.att_num_heads: The number of heads in multi-head attention.
            params.att_split_key: Whether to split the key when multi-head attention is used.
            params.att_penalty_term: The coefficient of the penalty term.
        is_training: Used in BN.
    :return:
        The attention result, also in the statistics format [weighted_mean, weighted_stddev].
    """
    relu = tf.nn.relu
    if "network_relu_type" in params.dict:
        if params.network_relu_type == "prelu":
            relu = prelu
        if params.network_relu_type == "lrelu":
            relu = tf.nn.leaky_relu

    with tf.variable_scope("attention"):
        value_features = endpoints[params.att_value_input]
        key_features = endpoints[params.att_key_input]

        # Key forward
        if len(params.att_key_num_nodes) > 1:
            for index, num_nodes in enumerate(params.att_key_num_nodes[:-1]):
                # The intermediate layers use affine+bn+relu
                key_features = dense_bn_relu(key_features,
                                             num_nodes,
                                             endpoints,
                                             params,
                                             is_training,
                                             name=("att_key%d" % index))
        # The last layer has different choices
        if params.att_key_network_type == 0:
            key_features = dense(key_features,
                                 params.att_key_num_nodes[-1],
                                 endpoints,
                                 params,
                                 is_training,
                                 name=("att_key%d" %
                                       (len(params.att_key_num_nodes) - 1)))
        elif params.att_key_network_type == 1:
            key_features = dense_relu(
                key_features,
                params.att_key_num_nodes[-1],
                endpoints,
                params,
                is_training,
                name=("att_key%d" % (len(params.att_key_num_nodes) - 1)))
        elif params.att_key_network_type == 2:
            key_features = dense_bn_relu(
                key_features,
                params.att_key_num_nodes[-1],
                endpoints,
                params,
                is_training,
                name=("att_key%d" % (len(params.att_key_num_nodes) - 1)))
        elif params.att_key_network_type == 3:
            key_features = dense_tanh(
                key_features,
                params.att_key_num_nodes[-1],
                endpoints,
                params,
                is_training,
                name=("att_key%d" % (len(params.att_key_num_nodes) - 1)))

        # Value forward
        if len(params.att_value_num_nodes) > 0:
            if len(params.att_value_num_nodes) > 1:
                for index, num_nodes in enumerate(
                        params.att_value_num_nodes[:-1]):
                    value_features = dense_bn_relu(value_features,
                                                   num_nodes,
                                                   endpoints,
                                                   params,
                                                   is_training,
                                                   name=("att_value%d" %
                                                         index))
            if params.att_value_network_type == 0:
                value_features = dense(
                    value_features,
                    params.att_value_num_nodes[-1],
                    endpoints,
                    params,
                    is_training,
                    name=("att_value%d" %
                          (len(params.att_value_num_nodes) - 1)))
            elif params.att_value_network_type == 1:
                value_features = dense_relu(
                    value_features,
                    params.att_value_num_nodes[-1],
                    endpoints,
                    params,
                    is_training,
                    name=("att_value%d" %
                          (len(params.att_value_num_nodes) - 1)))
            elif params.att_value_network_type == 2:
                value_features = dense_bn_relu(
                    value_features,
                    params.att_value_num_nodes[-1],
                    endpoints,
                    params,
                    is_training,
                    name=("att_value%d" %
                          (len(params.att_value_num_nodes) - 1)))
            elif params.att_value_network_type == 3:
                value_features = dense_tanh(
                    value_features,
                    params.att_value_num_nodes[-1],
                    endpoints,
                    params,
                    is_training,
                    name=("att_value%d" %
                          (len(params.att_value_num_nodes) - 1)))

        # The last element in att_key_num_nodes and att_value_num_nodes
        # is the dimension of the key and the value. In multi-head attention, they are extended n times.
        n_heads = params.att_num_heads
        assert shape_list(
            value_features
        )[2] % n_heads == 0, "The dim of the value must be divisible by the number of heads."
        if params.att_split_key:
            assert shape_list(key_features)[2] % n_heads == 0

        # Split the value and key.
        value_features = split_heads(value_features, n_heads)
        if params.att_split_key:
            key_features = split_heads(key_features, n_heads)
        else:
            key_features = tf.expand_dims(key_features, axis=1)

        val_shape = shape_list(value_features)
        key_shape = shape_list(key_features)

        tf.logging.info(
            "Attention:\n"
            "  The dim of the value: %d, the dim of the key: %d\n"
            "  The layer has %d heads, giving per-head value/key dims of %d/%d.\n"
            "  With weighted mean and stddev, the attention layer outputs dim %d."
            % (val_shape[1] * val_shape[-1], key_shape[1] * key_shape[-1],
               n_heads, val_shape[-1], key_shape[-1],
               val_shape[1] * val_shape[-1] * 2))

        # Initialize the query so that the weights for all time steps are equal at the beginning.
        # TODO: How to decide the initial number of query?
        query = tf.get_variable(
            "query", [n_heads, key_shape[-1]],
            dtype=tf.float32,
            initializer=tf.initializers.truncated_normal(stddev=0.1))

        if not params.att_split_key:
            query_time_key = tf.einsum('bmld, hd->blh',
                                       key_features,
                                       query,
                                       name="query_time_key")
        else:
            query_time_key = tf.einsum('bhld, hd->blh',
                                       key_features,
                                       query,
                                       name="query_time_key")

        if params.att_use_scale:
            query_time_key = query_time_key * tf.rsqrt(
                tf.to_float(key_shape[-1]))

        # weights is [b, h, l]
        weights = tf.nn.softmax(tf.transpose(query_time_key, [0, 2, 1]),
                                name="weights")
        endpoints["attention_weights"] = weights

        att_mean = tf.einsum('bhld,bhl->bhd',
                             value_features,
                             weights,
                             name="att_mean")
        att_stddev = tf.einsum('bhld,bhl->bhd',
                               tf.squared_difference(
                                   value_features,
                                   tf.expand_dims(att_mean, axis=2)),
                               weights,
                               name="att_stddev")
        att_mean = combine_last_two_dimensions(att_mean)
        att_stddev = combine_last_two_dimensions(att_stddev)
        mask = tf.to_float(tf.less_equal(att_stddev, VAR2STD_EPSILON))
        att_stddev = (1.0 - mask) * att_stddev + mask * VAR2STD_EPSILON
        att_stddev = tf.sqrt(att_stddev)
        att = tf.concat([att_mean, att_stddev], axis=1, name="concat")

        endpoints["att_output_before_nonlinear"] = att

        if params.att_apply_nonlinear:
            att = tf.layers.batch_normalization(
                att,
                momentum=params.batchnorm_momentum,
                training=is_training,
                name="att_post_bn")
            endpoints["att_post_bn"] = att
            att = relu(att, name='att_post_relu')
            endpoints["att_post_relu"] = att

        # Penalty term when multi-head attention is used.
        penalty = tf.einsum('ijk,ikl->ijl', weights,
                            tf.transpose(weights, [0, 2, 1])) - tf.eye(
                                n_heads, batch_shape=[val_shape[0]])
        # Normalize using the batch size
        penalty = tf.reduce_sum(tf.square(penalty)) / tf.to_float(val_shape[0])
        penalty = params.att_penalty_term * penalty
        tf.add_to_collection("PENALTY", penalty)
        tf.summary.scalar("attention_penalty", penalty)

    return att
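
For reference, a hedged sketch of the params object this function expects. The real project presumably loads these fields from a JSON config; the wrapper class and the endpoint name "tdnn5_relu" are only illustrative:

class Params(object):
    """Minimal stand-in for the project's config object (attribute + .dict access)."""
    def __init__(self, d):
        self.dict = d
        self.__dict__.update(d)

att_params = Params({
    "att_key_input": "tdnn5_relu",    # endpoint used to compute the key (hypothetical name)
    "att_key_num_nodes": [500, 100],  # key network; the last entry is the key dim
    "att_key_network_type": 3,        # affine + tanh for the last key layer
    "att_value_input": "tdnn5_relu",  # endpoint used as the value
    "att_value_num_nodes": [],        # empty: use the value endpoint as-is
    "att_apply_nonlinear": False,
    "att_use_scale": True,
    "att_num_heads": 4,               # the value dim must be divisible by this
    "att_split_key": False,           # one shared key per head
    "att_penalty_term": 1.0,
})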
Example #3
def ghost_vlad(features, aux_features, endpoints, params, is_training):
    """NetVLAD and GhostVLAD

    See:
        NetVLAD: https://arxiv.org/abs/1511.07247
        GhostVLAD: https://arxiv.org/abs/1810.09951

    Args:
        features: A tensor with shape [batch, length, dim].
        aux_features: Unused in this function.
        endpoints: Outputs of different parts of the network.
        params:
            params.vlad_num_centers: #centers of the NetVLAD.
            params.vlad_num_ghosts: #centers for the ghost clusters
            params.vlad_key_input: The key used to compute the weights
            params.vlad_key_num_nodes: #nodes of the network to compute the key.
                                       An additional layer is applied to obtain the weights.
            params.vlad_value_input: The value to be aggregated
            params.vlad_value_num_nodes: #nodes of the network to compute the value.
            params.vlad_final_l2_norm: Do the final L2 normalization after concatenation.
        is_training: Used in BN.
    :return:
        The (Ghost)VLAD output with shape [batch, vlad_num_centers * dim].
    """
    relu = tf.nn.relu
    if "network_relu_type" in params.dict:
        if params.network_relu_type == "prelu":
            relu = prelu
        if params.network_relu_type == "lrelu":
            relu = tf.nn.leaky_relu

    with tf.variable_scope("vlad"):
        value_features = endpoints[params.vlad_value_input]
        key_features = endpoints[params.vlad_key_input]

        # Value forward -> [b, l, d]
        if len(params.vlad_value_num_nodes) > 0:
            for index, num_nodes in enumerate(params.vlad_value_num_nodes):
                value_features = dense_bn_relu(value_features,
                                               num_nodes,
                                               endpoints,
                                               params,
                                               is_training,
                                               name=("vlad_value%d" % index))

        # Key forward
        if len(params.vlad_key_num_nodes) > 0:
            for index, num_nodes in enumerate(params.vlad_key_num_nodes):
                key_features = dense_bn_relu(key_features,
                                             num_nodes,
                                             endpoints,
                                             params,
                                             is_training,
                                             name=("vlad_key%d" % index))

        # Affine: wx+b -> [b, l, nclusters]
        key_features = tf.layers.dense(
            key_features,
            params.vlad_num_centers + params.vlad_num_ghosts,
            activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(
                params.weight_l2_regularizer),
            name="vlad_weight_affine")

        # The weights
        A = tf.nn.softmax(key_features, axis=-1, name="vlad_weights")
        endpoints["vlad_weights"] = A

        # Compute the residual
        cluster = tf.get_variable(
            "vlad_centers", [
                params.vlad_num_centers + params.vlad_num_ghosts,
                shape_list(value_features)[-1]
            ],
            dtype=tf.float32,
            initializer=tf.contrib.layers.xavier_initializer(),
            regularizer=tf.contrib.layers.l2_regularizer(
                params.weight_l2_regularizer))

        res = tf.expand_dims(value_features, axis=2) - cluster
        A = tf.expand_dims(A, axis=-1)
        weighted_res = A * res
        cluster_res = tf.reduce_sum(weighted_res, axis=1)

        tf.logging.info("VLAD is used: %d clusters" % params.vlad_num_centers)
        if params.vlad_num_ghosts > 0:
            tf.logging.info("  %d ghost clusters is added" %
                            params.vlad_num_ghosts)
            cluster_res = cluster_res[:, :params.vlad_num_centers, :]

        cluster_res = tf.nn.l2_normalize(cluster_res, axis=-1)
        output = tf.reshape(
            cluster_res,
            [-1, params.vlad_num_centers * shape_list(cluster_res)[-1]])
        if params.vlad_final_l2_norm:
            output = tf.nn.l2_normalize(output, axis=-1)

        endpoints["vlad_value"] = value_features
        endpoints["vlad_key"] = key_features
        endpoints["vlad_centers"] = cluster

        return output
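
The core (Ghost)VLAD aggregation rewritten in plain NumPy as a sanity sketch. The random logits stand in for the output of "vlad_weight_affine", and only the math of the TF code above is mirrored:

import numpy as np

def ghost_vlad_numpy(value, logits, centers, num_centers):
    """value: [b, l, d]; logits: [b, l, K+G]; centers: [K+G, d]."""
    a = np.exp(logits - logits.max(axis=-1, keepdims=True))
    a = a / a.sum(axis=-1, keepdims=True)                    # soft assignment [b, l, K+G]
    res = value[:, :, None, :] - centers[None, None, :, :]   # residuals [b, l, K+G, d]
    agg = (a[..., None] * res).sum(axis=1)                   # aggregate over time [b, K+G, d]
    agg = agg[:, :num_centers, :]                            # drop the ghost clusters
    agg = agg / (np.linalg.norm(agg, axis=-1, keepdims=True) + 1e-12)  # intra-normalization
    return agg.reshape(agg.shape[0], -1)                     # [b, K*d]

b, l, d, K, G = 2, 50, 8, 4, 1
out = ghost_vlad_numpy(np.random.randn(b, l, d),
                       np.random.randn(b, l, K + G),
                       np.random.randn(K + G, d), K)
print(out.shape)  # (2, 32)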
Example #4
def self_attention(features,
                   aux_features,
                   endpoints,
                   params,
                   is_training=None):
    """Self-attention.
    Note that the key should be the same length with the value, i.e. no convnet is applied after the key layer, or
    some trimming strategy should be applied before the weighted sum. (Refer to linguistic_attention)

    Args:
        features: A tensor with shape [batch, length, dim].
        aux_features: Auxiliary input features with shape [batch, length, dim].
        endpoints: Outputs of different parts of the network. Useful when doing attention.
        params: Parameters for self-attention.
            params.self_att_key_input: Use endpoints[params.self_att_key_input] to compute the key.
            params.self_att_key_num_nodes: The network to compute the key.
            params.self_att_value_num_nodes: The network to compute the value.
            params.self_att_num_heads: The number of heads in multi-head attention.
            params.self_att_penalty_term: The coefficient of the penalty term.
            The final dimension of the key and the value is decided by self_att_key_num_nodes and self_att_value_num_nodes.
            If multi-head attention is used, the value will be split first (the key remains the original dim).
        is_training: Used in BN.
    :return:
        The attention result, also in the statistics format [weighted_mean, weighted_stddev].
    """
    assert "self_att_key_input" in params.dict
    assert "self_att_key_num_nodes" in params.dict
    assert "self_att_value_num_nodes" in params.dict
    assert "self_att_num_heads" in params.dict
    assert "self_att_penalty_term" in params.dict

    with tf.variable_scope("attention"):
        value_features = features
        key_features = endpoints[params.self_att_key_input]

        if len(params.self_att_key_num_nodes) != 0:
            # According to "A STRUCTURED SELF-ATTENTIVE SENTENCE EMBEDDING",
            # the last layer of the key network is `affine + tanh`.
            if len(params.self_att_key_num_nodes) > 1:
                for index, node in enumerate(
                        params.self_att_key_num_nodes[:-1]):
                    key_features = dense_relu(key_features,
                                              node,
                                              endpoints,
                                              params,
                                              is_training,
                                              name=("att_key%d" % index))
            key_features = dense_tanh(
                key_features,
                params.self_att_key_num_nodes[-1],
                endpoints,
                params,
                is_training,
                name=("att_key%d" % (len(params.self_att_key_num_nodes) - 1)))

        if len(params.self_att_value_num_nodes) != 0:
            tf.logging.info("Note: Add network to process the value input %s" %
                            value_features.name)
            for index, node in enumerate(params.self_att_value_num_nodes):
                value_features = dense_relu(value_features,
                                            node,
                                            endpoints,
                                            params,
                                            is_training,
                                            name=("att_value%d" % index))

        # The last element in self_att_key_num_nodes and self_att_value_num_nodes
        # is the dimension of the key and the value. In multi-head attention, they are extended n times.
        n_heads = params.self_att_num_heads
        assert shape_list(
            value_features
        )[2] % n_heads == 0, "The dim of the value must be divisible by the number of heads."

        # Split the value. The key can use the entire key vector (without splitting).
        value_features = split_heads(value_features, n_heads)
        val_shape = shape_list(value_features)
        key_shape = shape_list(key_features)

        tf.logging.info(
            "Attention:\n"
            "  The dim of the value: %d, the dim of the key: %d\n"
            "  The layer has %d heads, giving a per-head value dim of %d.\n"
            "  With weighted mean and stddev, the attention layer outputs dim %d."
            % (val_shape[1] * val_shape[-1], key_shape[-1], n_heads,
               val_shape[-1], val_shape[1] * val_shape[-1] * 2))

        # Initialize the query so that the weights for all time steps are equal at the beginning.
        query = tf.get_variable(
            "query", [n_heads, key_shape[-1]],
            dtype=tf.float32,
            initializer=tf.initializers.truncated_normal(stddev=0.1))

        query_time_key = tf.einsum('ijl,kl->ijk',
                                   key_features,
                                   query,
                                   name="query_time_key")
        weights = tf.nn.softmax(tf.transpose(query_time_key, [0, 2, 1]),
                                name="weights")

        att_mean = tf.einsum('bnld,bnl->bnd',
                             value_features,
                             weights,
                             name="att_mean")
        att_stddev = tf.einsum('bnld,bnl->bnd',
                               tf.squared_difference(
                                   value_features,
                                   tf.expand_dims(att_mean, axis=2)),
                               weights,
                               name="att_stddev")

        att_mean = combine_last_two_dimensions(att_mean)
        att_stddev = combine_last_two_dimensions(att_stddev)

        mask = tf.to_float(tf.less_equal(att_stddev, VAR2STD_EPSILON))
        att_stddev = (1.0 - mask) * att_stddev + mask * VAR2STD_EPSILON
        att_stddev = tf.sqrt(att_stddev)

        att = tf.concat([att_mean, att_stddev], 1, name="concat")
        endpoints["attention_weights"] = weights

        # Penalty term
        penalty = tf.einsum('ijk,ikl->ijl', weights,
                            tf.transpose(weights, [0, 2, 1])) - tf.eye(
                                n_heads, batch_shape=[val_shape[0]])
        # Normalize using the batch size
        penalty = tf.reduce_sum(tf.square(penalty)) / tf.to_float(val_shape[0])
        tf.add_to_collection("PENALTY", params.self_att_penalty_term * penalty)
        tf.summary.scalar("attention_penalty",
                          params.self_att_penalty_term * penalty)

        # # Debug
        # # Comment lines when running the code
        # endpoints["att_query"] = query
        # endpoints["att_key"] = key_features
        # endpoints["att_value"] = value_features
    return att
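
The penalty above is the Frobenius-norm term ||W W^T - I||_F^2 from "A Structured Self-Attentive Sentence Embedding", averaged over the batch. A quick NumPy check (illustrative values only) confirms it favors sharp, disjoint heads:

import numpy as np

def attention_penalty(weights):
    """weights: [batch, heads, length], each head a softmax over time."""
    gram = np.einsum('bhl,bkl->bhk', weights, weights)     # W W^T per batch item
    eye = np.eye(weights.shape[1])[None]
    return np.square(gram - eye).sum() / weights.shape[0]  # ||W W^T - I||_F^2 / batch

length = 10
uniform = np.full((1, 2, length), 1.0 / length)  # two identical, flat heads
onehot = np.zeros((1, 2, length))
onehot[0, 0, 0] = onehot[0, 1, 5] = 1.0          # two sharp, disjoint heads
print(attention_penalty(uniform), attention_penalty(onehot))  # 1.64 vs 0.0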
Example #5
def aux_attention(features, aux_features, endpoints, params, is_training=None):
    """Attention using auxiliary features.

    The attention layer has a minor problem: the length of the key may differ from the length of the value
    due to the convnet. The key usually keeps the original feature length, while the value is shorter.
    We always use fully-connected layers in the key network, so the key length remains the same.
    A workaround is to take the center of the key so that the key and the value have the same length.

    Note: When an auxiliary key is used, the assumption is that the length of this auxiliary feature is the same as the value.

    Args:
        features: A tensor with shape [batch, length, dim].
        aux_features: A dict.
        aux_features["aux_feat_name"]: The length is LONGER than `features`,
                                    because `features` has been processed by the convnet and is therefore shorter.
        TODO: How to trim the auxiliary features? Align left or center?
        endpoints: Outputs of different parts of the network.
        params: Parameters for self-attention.
            params.att_aux_name: The name of the auxiliary features.
            params.att_aux_key_input: Additional key input except for the auxiliary features.
                                      If None then only the auxiliary features are used.
            params.att_key_num_nodes: The network to compute the key.
            params.att_value_num_nodes: The network to compute the value.
            params.att_num_heads: The number of heads in multi-head attention.
            params.att_penalty_term: The coefficient of the penalty term.
            The final dimension of the key and the value is decided by att_key_num_nodes and att_value_num_nodes.
            If multi-head attention is used, the value will be split first (the key keeps its original dim).
        is_training: Used in BN.
    :return:
    """
    assert "att_aux_name" in params.dict
    assert "att_key_input" in params.dict
    assert "att_key_num_nodes" in params.dict
    assert "att_value_num_nodes" in params.dict
    assert "att_num_heads" in params.dict
    assert "att_penalty_term" in params.dict

    with tf.variable_scope("attention"):
        value_features = features
        for aux_name in params.att_aux_name:
            if aux_name not in aux_features:
                sys.exit("The aux features %s is not in aux_features." %
                         aux_name)

        key_features = []
        for aux_name in params.att_aux_name:
            # Center trimming. Use the center of the key to match the length of the value.
            trim_length = (shape_list(aux_features[aux_name])[1] -
                           shape_list(value_features)[1]) // 2
            # Integer division; this requires the total kernel size to be an odd number.
            key_features.append(
                aux_features[aux_name][:, trim_length:-trim_length, :])

            # # TODO: If the length of the key and the value is the same, the next line is useful.
            # # But the above line looks more neat (What...).
            # key_features = tf.cond(tf.equal(trim_length, 0),
            #                        lambda: aux_features[aux_name],
            #                        lambda: aux_features[aux_name][:, trim_length:-trim_length, :])

        tf.logging.info("Attention using auxiliary features:")
        if params.att_key_input is not None:
            if params.att_key_input not in endpoints:
                sys.exit(
                    "You specify the appended key %s, but I cannot find it in the endpoints."
                    % params.att_key_input)
            tf.logging.info("Append %s to the auxiliary features" %
                            params.att_key_input)
            key_features.append(endpoints[params.att_key_input])

        # Concatenate all the features to the key.
        key_features = tf.concat(key_features, axis=-1, name="key_features")

        if len(params.att_key_num_nodes) != 0:
            # According to "A STRUCTURED SELF-ATTENTIVE SENTENCE EMBEDDING",
            # the last layer of the key network is `affine + tanh`.
            if len(params.att_key_num_nodes) > 1:
                for index, node in enumerate(params.att_key_num_nodes[:-1]):
                    key_features = dense_relu(key_features,
                                              node,
                                              endpoints,
                                              params,
                                              is_training,
                                              name=("att_key%d" % index))
            key_features = dense_tanh(
                key_features,
                params.att_key_num_nodes[-1],
                endpoints,
                params,
                is_training,
                name=("att_key%d" % (len(params.att_key_num_nodes) - 1)))

        if len(params.att_value_num_nodes) != 0:
            tf.logging.info("Note: Add network to process the value input %s" %
                            value_features.name)
            for index, node in enumerate(params.att_value_num_nodes):
                value_features = dense_relu(value_features,
                                            node,
                                            endpoints,
                                            params,
                                            is_training,
                                            name=("att_value%d" % index))

        # The last element in att_key_num_nodes and att_value_num_nodes
        # is the dimension of the key and the value. In multi-head attention, they are extended n times.
        n_heads = params.att_num_heads
        assert shape_list(
            value_features
        )[2] % n_heads == 0, "The dim of the value must be divisible by the number of heads."

        # Split the value. The key can use the entire vector.
        value_features = split_heads(value_features, n_heads)
        val_shape = shape_list(value_features)
        key_shape = shape_list(key_features)

        tf.logging.info(
            "  The dim of the value: %d, the dim of the key: %d\n"
            "  The layer has %d heads, giving a per-head value dim of %d.\n"
            "  With weighted mean and stddev, the attention layer outputs dim %d."
            % (val_shape[1] * val_shape[-1], key_shape[-1], n_heads,
               val_shape[-1], val_shape[1] * val_shape[-1] * 2))

        # Initialize the query so that the weights for all time steps are equal at the beginning.
        query = tf.get_variable(
            "query", [n_heads, key_shape[-1]],
            dtype=tf.float32,
            initializer=tf.initializers.truncated_normal(stddev=0.1))

        query_time_key = tf.einsum('ijl,kl->ijk',
                                   key_features,
                                   query,
                                   name="query_time_key")
        weights = tf.nn.softmax(tf.transpose(query_time_key, [0, 2, 1]),
                                name="weights")

        att_mean = tf.einsum('bnld,bnl->bnd',
                             value_features,
                             weights,
                             name="att_mean")
        att_stddev = tf.einsum('bnld,bnl->bnd',
                               tf.squared_difference(
                                   value_features,
                                   tf.expand_dims(att_mean, axis=2)),
                               weights,
                               name="att_stddev")

        att_mean = combine_last_two_dimensions(att_mean)
        att_stddev = combine_last_two_dimensions(att_stddev)

        mask = tf.to_float(tf.less_equal(att_stddev, VAR2STD_EPSILON))
        att_stddev = (1.0 - mask) * att_stddev + mask * VAR2STD_EPSILON
        att_stddev = tf.sqrt(att_stddev)

        att = tf.concat([att_mean, att_stddev], 1, name="concat")
        endpoints["attention_weights"] = weights

        # Penalty term
        penalty = tf.einsum('ijk,ikl->ijl', weights,
                            tf.transpose(weights, [0, 2, 1])) - tf.eye(
                                n_heads, batch_shape=[val_shape[0]])
        penalty = tf.reduce_sum(tf.square(penalty)) / tf.to_float(val_shape[0])
        tf.add_to_collection("PENALTY", params.att_penalty_term * penalty)
        tf.summary.scalar("attention_penalty",
                          params.att_penalty_term * penalty)

        # # Debug
        # # Comment lines when running the code
        # endpoints["att_query"] = query
        # endpoints["att_key"] = key_features
        # endpoints["att_value"] = value_features
    return att
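
Center trimming in isolation, as a hedged sketch: drop the same number of frames from both ends of the key so it matches the shorter value produced by the convnet. Unlike the slice above, this version also survives a zero trim length:

import numpy as np

def center_trim(key, value_length):
    """Keep the middle `value_length` frames of a longer key sequence."""
    trim = (key.shape[1] - value_length) // 2
    return key[:, trim:key.shape[1] - trim, :] if trim > 0 else key

key = np.random.randn(2, 100, 7)   # e.g. frame-level auxiliary features
print(center_trim(key, 92).shape)  # (2, 92, 7): 4 frames trimmed per side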
Example #6
def build_phone_encoder(features,
                        speaker_labels,
                        feature_length,
                        params,
                        endpoints,
                        reuse_variables,
                        is_training=False):
    """Build encoder for phone latent variable.
    Use the tdnn and share the same structure in the lower layers.

    Args:
        features: the input features.
        speaker_labels: the speaker labels (i.e., the speaker indices); may be used in the future.
        feature_length: the length of each feature.
        params: the parameters.
        endpoints: will be updated during building.
        reuse_variables: if true, reuse the existing variables
        is_training: used in batchnorm.
    :return: sampled_zs, mu_zs, logvar_zs
    """
    relu = tf.nn.relu
    if "network_relu_type" in params.dict:
        if params.network_relu_type == "prelu":
            relu = prelu
        if params.network_relu_type == "lrelu":
            relu = tf.nn.leaky_relu

    # # This is moved to the model config file.
    # # Acoustic network params:
    # # Most share 4 layers with x-vector network.
    # # [-2,2], [-2,2], [-3,3], [0], [-4,0,4]
    # # The last fully-connected layer is appended as the phonetic embedding
    # layer_size = [512, 512, 512, 512, 512]
    # kernel_size = [5, 5, 7, 1, 3]
    # dilation_size = [1, 1, 1, 1, 4]

    num_layers = len(params.phone_kernel_size)
    layer_index = 0
    if params.num_shared_layers > 0:
        # We may share the lower layers of the two tasks.
        # Go through the shared layers between the speaker and phone networks.
        assert params.num_shared_layers < num_layers
        with tf.variable_scope("encoder", reuse=True):
            for i in range(params.num_shared_layers):
                if params.phone_kernel_size[layer_index] > 1:
                    if len(shape_list(features)) == 3:
                        # Add a dummy dim to support 2d conv
                        features = tf.expand_dims(features, axis=1)
                    features = tf.layers.conv2d(
                        features,
                        params.phone_layer_size[layer_index],
                        (1, params.phone_kernel_size[layer_index]),
                        activation=None,
                        dilation_rate=(
                            1, params.phone_dilation_size[layer_index]),
                        kernel_regularizer=tf.contrib.layers.l2_regularizer(
                            params.weight_l2_regularizer),
                        name='conv%d' % (layer_index + 1))
                elif params.phone_kernel_size[layer_index] == 1:
                    if len(shape_list(features)) == 4:
                        # Remove a dummy dim to do dense layer
                        features = tf.squeeze(features, axis=1)
                    features = tf.layers.dense(
                        features,
                        params.phone_layer_size[layer_index],
                        activation=None,
                        kernel_regularizer=tf.contrib.layers.l2_regularizer(
                            params.weight_l2_regularizer),
                        name="dense%d" % (layer_index + 1))

                features = tf.layers.batch_normalization(
                    features,
                    momentum=params.batchnorm_momentum,
                    training=is_training,
                    name="bn%d" % (layer_index + 1))
                features = relu(features, name='relu%d' % (layer_index + 1))
                layer_index += 1

    with tf.variable_scope("encoder_phone", reuse=reuse_variables):
        # In the unshared part, the endpoints should be updated.
        while layer_index < num_layers:
            if params.phone_kernel_size[layer_index] > 1:
                if len(shape_list(features)) == 3:
                    features = tf.expand_dims(features, axis=1)
                features = tf.layers.conv2d(
                    features,
                    params.phone_layer_size[layer_index],
                    (1, params.phone_kernel_size[layer_index]),
                    activation=None,
                    dilation_rate=(1, params.phone_dilation_size[layer_index]),
                    kernel_regularizer=tf.contrib.layers.l2_regularizer(
                        params.weight_l2_regularizer),
                    name='phn_conv%d' % (layer_index + 1))
                endpoints["phn_conv%d" % (layer_index + 1)] = features
            elif params.phone_kernel_size[layer_index] == 1:
                if len(shape_list(features)) == 4:
                    features = tf.squeeze(features, axis=1)
                features = tf.layers.dense(
                    features,
                    params.phone_layer_size[layer_index],
                    activation=None,
                    kernel_regularizer=tf.contrib.layers.l2_regularizer(
                        params.weight_l2_regularizer),
                    name="phn_dense%d" % (layer_index + 1))
                endpoints["phn_dense%d" % (layer_index + 1)] = features

            features = tf.layers.batch_normalization(
                features,
                momentum=params.batchnorm_momentum,
                training=is_training,
                name="phn_bn%d" % (layer_index + 1))
            endpoints["phn_bn%d" % (layer_index + 1)] = features
            features = relu(features, name='phn_relu%d' % (layer_index + 1))
            endpoints["phn_relu%d" % (layer_index + 1)] = features
            layer_index += 1

        # The last layer
        if len(shape_list(features)) == 4:
            features = tf.squeeze(features, axis=1)

        # Similar to the speaker network, we may need to slice the features due to the different contexts of
        # the speaker and phone networks. At this moment, I just assume that the phone context is
        # larger, which means there is no need to slice for the phone network.
        if (params.speaker_left_context > params.phone_left_context
                and params.speaker_right_context > params.phone_right_context):
            raise NotImplementedError(
                "This combination of speaker and phone contexts is not supported yet.")
            # features = features[:, params.speaker_left_context - params.phone_left_context:
            #                        params.phone_right_context - params.speaker_right_context, :]

        # # We do not validate the length because this will introduce the alignment -- phn_labels, which
        # # is unnecessary when doing the phone inference.
        # with tf.control_dependencies([tf.assert_equal(shape_list(features)[1], shape_list(self.phn_labels)[1])]):
        #     features = tf.identity(features)

        if "phone_dim" not in params.dict:
            params.dict["phone_dim"] = 512
        mu = tf.layers.dense(
            features,
            params.phone_dim,
            activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(
                params.weight_l2_regularizer),
            name="zp_dense")
        endpoints['zp_mu_dense'] = mu
        mu = tf.layers.batch_normalization(mu,
                                           momentum=params.batchnorm_momentum,
                                           training=is_training,
                                           name="zp_bn")
        endpoints['zp_mu_bn'] = mu
        mu = relu(mu, name='zp_mu_relu')
        endpoints['zp_mu_relu'] = mu

        logvar = 0
        # epsilon = tf.random_normal(tf.shape(mu), name='zp_epsilon')
        # sample = mu + tf.exp(0.5 * logvar) * epsilon
        sample = mu

    return sample, mu, logvar
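
For orientation, a sketch of the matching entries in the JSON model config. The values repeat the commented-out defaults above and the "phone_dim" fallback in the code; the key names follow the params.* accesses, and anything beyond that is an assumption:

phone_encoder_config = {
    "num_shared_layers": 4,                # lower layers shared with the speaker encoder
    "phone_layer_size": [512, 512, 512, 512, 512],
    "phone_kernel_size": [5, 5, 7, 1, 3],  # contexts [-2,2], [-2,2], [-3,3], [0], [-4,0,4]
    "phone_dilation_size": [1, 1, 1, 1, 4],
    "phone_dim": 512,                      # dim of the phonetic embedding
}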
Example #7
def build_speaker_encoder(features,
                          phone_labels,
                          feature_length,
                          params,
                          endpoints,
                          reuse_variables,
                          is_training=False):
    """Build encoder for speaker latent variable.
    Use the same tdnn network with x-vector.

    Args:
        features: the input features.
        phone_labels: the phone labels (i.e., the alignment); will be used in the future.
        feature_length: the length of each feature.
        params: the parameters.
        endpoints: will be updated during building.
        reuse_variables: if true, reuse the existing variables.
        is_training: used in batchnorm
    :return: sampled_zs, mu_zs, logvar_zs
    """
    relu = tf.nn.relu
    if "network_relu_type" in params.dict:
        if params.network_relu_type == "prelu":
            relu = prelu
        if params.network_relu_type == "lrelu":
            relu = tf.nn.leaky_relu

    with tf.variable_scope("encoder", reuse=reuse_variables):
        # Layer 1: [-2,-1,0,1,2] --> [b, 1, l-4, 512]
        # conv2d + batchnorm + relu
        features = tf.layers.conv2d(
            features,
            512, (1, 5),
            activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(
                params.weight_l2_regularizer),
            name='conv1')
        endpoints["conv1"] = features
        features = tf.layers.batch_normalization(
            features,
            momentum=params.batchnorm_momentum,
            training=is_training,
            name="bn1")
        endpoints["bn1"] = features
        features = relu(features, name='relu1')
        endpoints["relu1"] = features

        # Layer 2: [-2, -1, 0, 1, 2] --> [b ,1, l-4, 512]
        # conv2d + batchnorm + relu
        # This is slightly different from Kaldi, which uses dilated convolution
        features = tf.layers.conv2d(
            features,
            512, (1, 5),
            activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(
                params.weight_l2_regularizer),
            name='conv2')
        endpoints["conv2"] = features
        features = tf.layers.batch_normalization(
            features,
            momentum=params.batchnorm_momentum,
            training=is_training,
            name="bn2")
        endpoints["bn2"] = features
        features = relu(features, name='relu2')
        endpoints["relu2"] = features

        # Layer 3: [-3, -2, -1, 0, 1, 2, 3] --> [b, 1, l-6, 512]
        # conv2d + batchnorm + relu
        # Again, no dilation is used
        features = tf.layers.conv2d(
            features,
            512, (1, 7),
            activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(
                params.weight_l2_regularizer),
            name='conv3')
        endpoints["conv3"] = features
        features = tf.layers.batch_normalization(
            features,
            momentum=params.batchnorm_momentum,
            training=is_training,
            name="bn3")
        endpoints["bn3"] = features
        features = relu(features, name='relu3')
        endpoints["relu3"] = features

        # Convert to [b, l, 512]
        features = tf.squeeze(features, axis=1)
        # The output of the 3rd layer can simply be rank 3.
        endpoints["relu3"] = features

        # Layer 4: [b, l, 512] --> [b, l, 512]
        features = tf.layers.dense(
            features,
            512,
            activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(
                params.weight_l2_regularizer),
            name="dense4")
        endpoints["dense4"] = features
        features = tf.layers.batch_normalization(
            features,
            momentum=params.batchnorm_momentum,
            training=is_training,
            name="bn4")
        endpoints["bn4"] = features
        features = relu(features, name='relu4')
        endpoints["relu4"] = features

        # Layer 5: [b, l, x]
        if "num_nodes_pooling_layer" not in params.dict:
            # The default number of nodes before pooling
            params.dict["num_nodes_pooling_layer"] = 1500

        features = tf.layers.dense(
            features,
            params.num_nodes_pooling_layer,
            activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(
                params.weight_l2_regularizer),
            name="dense5")
        endpoints["dense5"] = features
        features = tf.layers.batch_normalization(
            features,
            momentum=params.batchnorm_momentum,
            training=is_training,
            name="bn5")
        endpoints["bn5"] = features
        features = relu(features, name='relu5')
        endpoints["relu5"] = features

        # Here, we need to slice the features, since the original features were expanded by the larger of the
        # speaker and phone contexts. I assume that the phone context is larger,
        # so the speaker network needs the slicing.
        if (params.speaker_left_context < params.phone_left_context
                and params.speaker_right_context < params.phone_right_context):
            features = features[:, params.phone_left_context - params.speaker_left_context:
                                params.speaker_right_context - params.phone_right_context, :]
        else:
            raise NotImplementedError(
                "This combination of speaker and phone contexts is not supported yet.")

        # Make sure we've got the right feature
        with tf.control_dependencies([
                tf.assert_equal(
                    shape_list(features)[1],
                    shape_list(phone_labels)[1])
        ]):
            # Pooling layer
            # The lengths of the utterances may differ.
            # The original pooling uses all the frames, which is not appropriate in this case,
            # so we create a new function (I don't want to change the original one).
            if params.pooling_type == "statistics_pooling":
                features = statistics_pooling_v2(features, feature_length,
                                                 endpoints, params,
                                                 is_training)
            else:
                raise NotImplementedError("Not implement %s pooling" %
                                          params.pooling_type)
            endpoints['pooling'] = features

        # Utterance-level network
        # Layer 6: [b, 512]
        features = tf.layers.dense(
            features,
            512,
            activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(
                params.weight_l2_regularizer),
            name='dense6')
        endpoints['dense6'] = features
        features = tf.layers.batch_normalization(
            features,
            momentum=params.batchnorm_momentum,
            training=is_training,
            name="bn6")
        endpoints["bn6"] = features
        features = relu(features, name='relu6')
        endpoints["relu6"] = features

        # Layer 7: [b, x]
        if "speaker_dim" not in params.dict:
            # The default number of nodes in the last layer
            params.dict["speaker_dim"] = 512

        # We need mean and logvar.
        mu = tf.layers.dense(
            features,
            params.speaker_dim,
            activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(
                params.weight_l2_regularizer),
            name="zs_dense")
        endpoints['zs_mu_dense'] = mu

        if "spk_last_layer_no_bn" not in params.dict:
            params.spk_last_layer_no_bn = False

        if not params.spk_last_layer_no_bn:
            mu = tf.layers.batch_normalization(
                mu,
                momentum=params.batchnorm_momentum,
                training=is_training,
                name="zs_bn")
            endpoints['zs_mu_bn'] = mu

        if "spk_last_layer_linear" not in params.dict:
            params.spk_last_layer_linear = False

        if not params.spk_last_layer_linear:
            mu = relu(mu, name="zs_mu_relu")
            endpoints['zs_mu_relu'] = mu

        # We do not compute logvar in this version.
        # Set logvar=0 ==> var=1
        logvar = 0

        # epsilon = tf.random_normal(tf.shape(mu), name='zs_epsilon')
        # sample = mu + tf.exp(0.5 * logvar) * epsilon
        sample = mu

    return sample, mu, logvar
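
The context-slicing step above, in isolation: with a larger phone context, the speaker branch drops the extra leading frames, and the negative end index trims the extra trailing ones. The context sizes below are illustrative:

import numpy as np

speaker_left, speaker_right = 8, 8    # illustrative speaker context
phone_left, phone_right = 12, 12      # illustrative (larger) phone context
feats = np.random.randn(2, 100, 1500)
sliced = feats[:, phone_left - speaker_left:speaker_right - phone_right, :]
print(sliced.shape)  # (2, 92, 1500): 4 frames trimmed on each side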
Example #8
def resnet_18(features,
              params,
              is_training=None,
              reuse_variables=None,
              aux_features=None):
    """ Build a ResNet.
        Modified ResNet-18, the blocks are: [3/64, 3/64], [3/128, 3/128], [3/256, 3/256], [3/512, 3/512]
        The default number of blocks: [2, 2, 2, 2]
        The last 3 blocks can downsample the features.
        N fully-connected layers are appended to the output the res blocks.
        There are actually 2 more layers than standard ResNet implementation.

        The downsampling in ResNet-50 with a 1*1 kernel may lose frequency resolution.

        About the network parameters (no batchnorm included):
        TDNN: 2.6M (or 4.2M without dilation)
        ETDNN: 4.4M (or 7.6M without dilation)
        Modified FTDNN: 9.2M
        Modified EFTDNN: 32M
        FTDNN: 9.0M
        EFTDNN: 19.8M (much smaller than modified eftdnn)

        ResNet-18: 13.5M
        ResNet-34: 23.6M
        ResNet-50: 16.1M
        ResNet-101: 28.4M

        Args:
            features: A tensor with shape [batch, length, dim].
            params: Configuration loaded from a JSON.
            is_training: True if the network is used for training.
            reuse_variables: True if the network has been built and enable variable reuse.
            aux_features: Auxiliary features (e.g. linguistic features or bottleneck features).
        :return:
            features: The output of the last layer.
            endpoints: An OrderedDict containing the output of every component. The outputs are in the order in which
                       they are added to the network, so it is convenient to split the network at an output name.
    """
    # The strides only affect the last 3 conv blocks
    time_stride = 2 if params.resnet_time_stride else 1

    # The dimension of the features should be 40
    assert (shape_list(features)[-1] == 40)

    tf.logging.info("Build a ResNet-18 network.")
    # ReLU is the usual choice, though other activation functions are possible.
    relu = tf.nn.relu
    if "network_relu_type" in params.dict:
        if params.network_relu_type == "prelu":
            relu = prelu
        elif params.network_relu_type == "lrelu":
            relu = tf.nn.leaky_relu

    # The block parameters
    # default: [2, 2, 2, 2]
    if "resnet_blocks" not in params.dict:
        params.dict["resnet_blocks"] = [2, 2, 2, 2]
    tf.logging.info("The resnet blocks: [%d, %d, %d, %d]",
                    params.resnet_blocks[0], params.resnet_blocks[1],
                    params.resnet_blocks[2], params.resnet_blocks[3])

    endpoints = OrderedDict()
    with tf.variable_scope("resnet_18", reuse=reuse_variables):
        # features: [N, L, F, 1]
        # ndim = shape_list(features)[-1]
        features = tf.expand_dims(features, axis=3)

        # Since we use 40-dim FBanks, the kernel should be smaller.
        # First conv
        # No strides are applied.
        features = tf.layers.conv2d(
            features,
            64, (3, 3),
            padding='same',
            activation=None,
            use_bias=False,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(
                params.weight_l2_regularizer),
            name='conv0_1')
        features = tf.layers.batch_normalization(
            features,
            momentum=params.batchnorm_momentum,
            training=is_training,
            name="conv0_bn")
        features = relu(features, name='conv0_relu')
        if params.resnet_maxpooling:
            features = tf.layers.max_pooling2d(features, (3, 3), (1, 1),
                                               padding='same',
                                               name='conv0_max')

        # Conv Block 1
        features = conv_block(features, [[3, 3], [3, 3]], [64, 64], [1, 1],
                              params, is_training, relu, "conv1a")
        for i in range(params.resnet_blocks[0] - 1):
            features = identity_block(features, [[3, 3], [3, 3]], [64, 64],
                                      params, is_training, relu,
                                      "conv1b_%d" % i)

        # Conv Block 2
        features = conv_block(features, [[3, 3], [3, 3]], [128, 128],
                              [time_stride, 2], params, is_training, relu,
                              "conv2a")
        for i in range(params.resnet_blocks[1] - 1):
            features = identity_block(features, [[3, 3], [3, 3]], [128, 128],
                                      params, is_training, relu,
                                      "conv2b_%d" % i)

        # Conv Block 3
        features = conv_block(features, [[3, 3], [3, 3]], [256, 256],
                              [time_stride, 2], params, is_training, relu,
                              "conv3a")
        for i in range(params.resnet_blocks[2] - 1):
            features = identity_block(features, [[3, 3], [3, 3]], [256, 256],
                                      params, is_training, relu,
                                      "conv3b_%d" % i)

        # Conv Block 4
        features = conv_block(features, [[3, 3], [3, 3]], [512, 512],
                              [time_stride, 2], params, is_training, relu,
                              "conv4a")
        for i in range(params.resnet_blocks[3] - 1):
            features = identity_block(features, [[3, 3], [3, 3]], [512, 512],
                                      params, is_training, relu,
                                      "conv4b_%d" % i)

        # features: [N, L/t, 5, 512]
        # The original ResNet uses average pooling to get [N, 512], which I think eliminates the time resolution.
        # Hence, in this implementation, we first obtain [N, L, 512] via a conv layer and use dense layers to process the features.
        features = tf.layers.conv2d(
            features,
            512, (1, shape_list(features)[2]),
            activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(
                params.weight_l2_regularizer),
            name='conv5')
        features = tf.layers.batch_normalization(
            features,
            momentum=params.batchnorm_momentum,
            training=is_training,
            name="conv5_bn")
        features = relu(features, name='conv5_relu')
        features = tf.squeeze(features, axis=2)

        # FC layers * 2
        features = tf.layers.dense(
            features,
            512,
            activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(
                params.weight_l2_regularizer),
            name='dense1')
        features = tf.layers.batch_normalization(
            features,
            momentum=params.batchnorm_momentum,
            training=is_training,
            name="dense1_bn")
        features = relu(features, name='dense1_relu')

        features = tf.layers.dense(
            features,
            1500,
            activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(
                params.weight_l2_regularizer),
            name='dense2')
        features = tf.layers.batch_normalization(
            features,
            momentum=params.batchnorm_momentum,
            training=is_training,
            name="dense2_bn")
        features = relu(features, name='dense2_relu')

        # Compute the number of parameters
        num_params = 3*3*64 + (2*3*3*64*64*params.resnet_blocks[0] + 64*64) + \
                     (3*3*64*128 + 3*3*128*128 + 64*128 + 2*3*3*128*128*(params.resnet_blocks[1]-1)) + \
                     (3*3*128*256 + 3*3*256*256 + 128*256 + 2*3*3*256*256*(params.resnet_blocks[2]-1)) + \
                     (3*3*256*512 + 3*3*512*512 + 256*512 + 2*3*3*512*512*(params.resnet_blocks[3]-1)) + \
                     (1*5*512*512 + 512*512 + 512*1500)
        tf.logging.info(
            "The number of parameters of the frame-level network: %d" %
            num_params)
        num_layers = 4 + 2 * (
            params.resnet_blocks[0] + params.resnet_blocks[1] +
            params.resnet_blocks[2] + params.resnet_blocks[3])
        tf.logging.info("The number of layers: %d" % num_layers)

        # Pooling
        features = general_pooling(features, aux_features, endpoints, params,
                                   is_training)

        # Utterance-level network
        # Layer 6: [b, 512]
        features = tf.layers.dense(
            features,
            512,
            activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(
                params.weight_l2_regularizer),
            name='tdnn6_dense')
        endpoints['tdnn6_dense'] = features
        features = tf.layers.batch_normalization(
            features,
            momentum=params.batchnorm_momentum,
            training=is_training,
            name="tdnn6_bn")
        endpoints["tdnn6_bn"] = features
        features = relu(features, name='tdnn6_relu')
        endpoints["tdnn6_relu"] = features

        # Layer 7: [b, x]
        if "num_nodes_last_layer" not in params.dict:
            # The default number of nodes in the last layer
            params.dict["num_nodes_last_layer"] = 512

        features = tf.layers.dense(
            features,
            params.num_nodes_last_layer,
            activation=None,
            kernel_regularizer=tf.contrib.layers.l2_regularizer(
                params.weight_l2_regularizer),
            name='tdnn7_dense')
        endpoints['tdnn7_dense'] = features

        if "last_layer_no_bn" not in params.dict:
            params.last_layer_no_bn = False

        if not params.last_layer_no_bn:
            features = tf.layers.batch_normalization(
                features,
                momentum=params.batchnorm_momentum,
                training=is_training,
                name="tdnn7_bn")
            endpoints["tdnn7_bn"] = features

        if "last_layer_linear" not in params.dict:
            params.last_layer_linear = False

        if not params.last_layer_linear:
            # If the last layer is linear, no further activation is needed.
            features = relu(features, name='tdnn7_relu')
            endpoints["tdnn7_relu"] = features

    return features, endpoints
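
A quick check of the frame-level parameter count logged above, for the default blocks [2, 2, 2, 2]; the formula reproduces the ~13.5M figure quoted in the docstring:

blocks = [2, 2, 2, 2]
num_params = (3*3*64
              + (2*3*3*64*64*blocks[0] + 64*64)
              + (3*3*64*128 + 3*3*128*128 + 64*128 + 2*3*3*128*128*(blocks[1]-1))
              + (3*3*128*256 + 3*3*256*256 + 128*256 + 2*3*3*256*256*(blocks[2]-1))
              + (3*3*256*512 + 3*3*512*512 + 256*512 + 2*3*3*512*512*(blocks[3]-1))
              + (1*5*512*512 + 512*512 + 512*1500))
print(num_params)  # 13503040, i.e. about 13.5M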