示例#1
0
def get_lcmv_vector(atf_vectors, response_vector, noise_psd_matrix):
    """Calculates an LCMV beamforming vector.

    With H the stacked ATF vectors, Phi the noise PSD matrix and g the
    response vector, the steps per frequency bin are:

        1. X = Phi^{-1} H        (one solve per source and bin)
        2. G = H^H X             (K x K matrix)
        3. t = G^{-1} g
        4. w = X t

    :param atf_vectors: Acoustic transfer function vectors for
        each source with shape (targets k, bins f, sensors d)
    :param response_vector: Defines, which sources you are interested in.
        Set it to [1, 0, ..., 0], if you are interested in the first speaker.
        It has the shape (targets,)
    :param noise_psd_matrix: Noise PSD matrix
        with shape (bins f, sensors d, sensors D)
    :return: Set of beamforming vectors with shape (bins f, sensors d)
    """
    response_vector = np.asarray(response_vector)
    # TODO: If it is a list, a list of response_vectors is returned.
    K, F, D = atf_vectors.shape

    assert noise_psd_matrix.shape == (F, D, D), noise_psd_matrix.shape

    # The noise PSD matrix is shared by all sources, so it is expanded along
    # a new leading source axis to match the ATF vectors.
    Phi_inverse_times_H = np.squeeze(stable_solve(
        np.broadcast_to(noise_psd_matrix[None, :, :, :], (K, F, D, D)),
        atf_vectors[:, :, :, None]  # k, f, d, 1
    ), axis=-1)  # k, f, d
    assert Phi_inverse_times_H.shape == (K, F, D), Phi_inverse_times_H.shape

    H_times_Phi_inverse_times_H = np.einsum(
        'k...d,K...d->...kK',
        atf_vectors.conj(),
        Phi_inverse_times_H
    )  # f, k, K

    # Use at least complex64 for the right-hand side, but do not throw away
    # precision of the response vector when the system matrix is wider
    # (e.g. complex128).
    rhs_dtype = np.result_type(
        H_times_Phi_inverse_times_H.dtype, np.complex64)
    response_vector = response_vector[None, :, None].astype(rhs_dtype)
    response_vector = np.repeat(response_vector, F, axis=0)  # f, k, 1
    temp = stable_solve(
        H_times_Phi_inverse_times_H,
        response_vector,  # f, k, 1
    )  # f, k, 1
    beamforming_vector = np.einsum(
        'k...d,...k->...d',
        Phi_inverse_times_H,
        np.squeeze(temp, axis=-1)  # f, k
    )  # f, d

    return beamforming_vector
示例#2
0
def get_wmwf_vector(target_psd_matrix,
                    noise_psd_matrix,
                    reference_channel=None,
                    channel_selection_vector=None,
                    distortion_weight=1.):
    """Speech distortion weighted multichannel Wiener filter.

    This filter is the solution to the optimization problem
    `min E[|h^{H}x - X_{k}|^2] + mu E[|h^{H}n|^2]`.
    I.e. it minimizes the MSE between the filtered signal and the target image
    from channel k. The parameter mu allows for a trade-off between speech
    distortion and noise suppression. For mu = 0, it resembles the MVDR filter.

    Args:
      target_psd_matrix: `Array` of shape (..., frequency, sensor, sensor)
        with the covariance statistics for the target signal.
      noise_psd_matrix: `Array` of shape (..., frequency, sensor, sensor)
        with the covariance statistics for the noise signal.
      reference_channel: Reference channel for minimization. See description
        above. Has no effect if a channel selection vector is provided.
        If None (and no channel selection vector is given), it is determined
        with `get_optimal_reference_channel`.
      channel_selection_vector: A vector of shape (batch, channel) to
        select a weighted "reference" channel for each batch. It is
        multiplied onto the last axis of the filter matrix and that axis is
        summed out.
        NOTE(review): because of the `[..., None, :]` indexing, leading axes
        of this vector line up with the frequency axis of the filter matrix;
        confirm the intended shape for batched inputs.
      distortion_weight: `float` (or array broadcastable against the
        (..., frequency, 1, 1) shaped trace) or 'frequency_dependent' to
        trade-off distortion and suppression. Passing 'frequency_dependent'
        will use a frequency-dependent trade-off factor inspired by the
        Max-SNR criterion.
        See https://arxiv.org/abs/1707.00201 for details.

    Raises:
      AssertionError: If `noise_psd_matrix` is None or the reference channel
        is not a scalar.

    Returns:
      `Array` of shape (..., frequency, sensor) with filter coefficients

    """
    assert noise_psd_matrix is not None

    phi = stable_solve(noise_psd_matrix, target_psd_matrix)
    lambda_ = np.trace(phi, axis1=-1, axis2=-2)[..., None, None]
    # Only compare against the keyword for real strings. Comparing an array
    # valued weight with a string is an elementwise operation whose truth
    # value is ambiguous.
    if (isinstance(distortion_weight, str)
            and distortion_weight == 'frequency_dependent'):
        # Auto PSD of the first sensor, kept as (..., 1, 1) for broadcasting.
        phi_x1x1 = target_psd_matrix[..., 0:1, 0:1]
        distortion_weight = np.sqrt(phi_x1x1 * lambda_)
        filter_ = phi / distortion_weight
    else:
        filter_ = phi / (distortion_weight + lambda_)
    if channel_selection_vector is not None:
        projected = filter_ * channel_selection_vector[..., None, :]
        return np.sum(projected, axis=-1)
    else:
        if reference_channel is None:
            reference_channel = get_optimal_reference_channel(
                filter_, target_psd_matrix, noise_psd_matrix)

        assert np.isscalar(reference_channel), reference_channel
        # Select the column that belongs to the reference channel.
        filter_ = filter_[..., reference_channel]
        return filter_
示例#3
0
def get_lcmv_vector_souden(target_psd_matrix,
                           interference_psd_matrix,
                           noise_psd_matrix,
                           ref_channel=None,
                           eps=None,
                           return_ref_channel=False):
    """
    LCMV beamformer in the formulation of Souden (currently disabled).

    In "A Study of the LCMV and MVDR Noise Reduction Filters" Mehrez Souden
    elaborates an alternative formulation for the LCMV beamformer in the
    appendix for a rank one interference matrix.

    Therefore, this algorithm is only valid, when the interference PSD matrix
    is approximately rank one, or (in other words) only 2 speakers are present
    in total.

    The function currently always raises `NotImplementedError`; the code
    after the `raise` is an unreachable draft that is kept for reference.

    Args:
        target_psd_matrix: Target PSD matrix, passed to `stable_solve`
            together with `noise_psd_matrix` in the draft.
        interference_psd_matrix: Interference PSD matrix, passed to
            `stable_solve` together with `noise_psd_matrix` in the draft.
        noise_psd_matrix: Noise PSD matrix.
        ref_channel: Reference channel. The draft currently overwrites it
            with 0.
        eps: Lower bound for the real part of the normalization `gamma`.
            If None, `np.finfo(gamma.dtype).tiny` is used in the draft.
        return_ref_channel: If True, the draft returns the tuple
            (beamformer, ref_channel) instead of only the beamformer.

    Raises:
        NotImplementedError: Always.

    """
    raise NotImplementedError(
        'This is not yet thoroughly tested. It also misses the response '
        'vector, thus it is unclear, how to select, which speaker to attend '
        'to.')

    # ------------------------------------------------------------------
    # Unreachable draft implementation below.
    # ------------------------------------------------------------------
    phi_in = stable_solve(noise_psd_matrix, interference_psd_matrix)
    phi_xn = stable_solve(noise_psd_matrix, target_psd_matrix)

    D = phi_in.shape[-1]

    # Equation 5, 6
    gamma_in = np.trace(phi_in, axis1=-1, axis2=-2)[..., None, None]
    gamma_xn = np.trace(phi_xn, axis1=-1, axis2=-2)[..., None, None]

    # Can be written in a single einsum call, here separate for clarity
    # Equation 11
    # The trace has to run over the two matrix axes (the last two), exactly
    # as for gamma_in and gamma_xn above.
    gamma = gamma_in * gamma_xn - np.trace(
        np.einsum('...ab,...bc->...ac', phi_in, phi_xn),
        axis1=-1, axis2=-2)[..., None, None]
    # Equivalent single call:
    # gamma = gamma_in * gamma_xn - np.einsum(
    #     '...ab,...ba->...', phi_in, phi_xn)[..., None, None]

    # Identity matrix with singleton leading axes; the index must be a tuple.
    eye = np.eye(D)[(phi_in.ndim - 2) * (None,) + (Ellipsis,)]

    # TODO: Should be determined automatically (per speaker)?
    # NOTE(review): this ignores the `ref_channel` argument.
    ref_channel = 0

    # Equation 51, first fraction
    if eps is None:
        eps = np.finfo(gamma.dtype).tiny
    # NOTE(review): as written only `phi_in` is divided by gamma, not the
    # whole difference. Confirm against Equation 51 before enabling.
    mat = gamma_in * eye - phi_in / np.maximum(gamma.real, eps)

    # Equation 51
    # Faster, when we select the ref_channel before matrix multiplication.
    beamformer = np.einsum('...ab,...bc->...ac', mat, phi_xn)[..., ref_channel]
    # beamformer = np.einsum('...ab,...b->...a', mat, phi_xn[..., ref_channel])

    if return_ref_channel:
        return beamformer, ref_channel
    else:
        return beamformer
示例#4
0
def get_mvdr_vector_souden(target_psd_matrix,
                           noise_psd_matrix,
                           ref_channel=None,
                           eps=None,
                           return_ref_channel=False):
    r"""
    Returns the MVDR beamforming vector described in [Souden2010MVDR].
    The implementation is based on the description of [Erdogan2016MVDR].

    The ref_channel is selected based of an SNR estimate.

    The eps ensures that the SNR estimation for the ref_channel works
    as long target_psd_matrix and noise_psd_matrix do not contain inf or nan.
    Also zero matrices work. The default eps is `np.finfo(dtype).tiny`.

    Note: the frequency dimension is necessary for the ref_channel estimation.
    Note: Currently this function does not support independent dimensions with
          an estimated ref_channel. There is an open point to discuss:
          Should the independent dimension be considered in the SNR estimate
          or not?

    :param target_psd_matrix: Target PSD matrix
        with shape (..., bins, sensors, sensors)
    :param noise_psd_matrix: Noise PSD matrix
        with shape (..., bins, sensors, sensors)
    :param ref_channel: Scalar index of the reference channel. If None, it is
        determined with `get_optimal_reference_channel`.
    :param return_ref_channel: If True, return the tuple
        (beamformer, ref_channel) instead of only the beamformer.
    :param eps: Lower bound for the real part of the normalization (trace).
        If None use `np.finfo(dtype).tiny` of the trace dtype.
    :return: Set of beamforming vectors with shape (..., bins, sensors);
        additionally the used ref_channel, if return_ref_channel is True.

    @article{Souden2010MVDR,
      title={On optimal frequency-domain multichannel linear filtering for noise reduction},
      author={Souden, Mehrez and Benesty, Jacob and Affes, Sofi{\`e}ne},
      journal={IEEE Transactions on audio, speech, and language processing},
      volume={18},
      number={2},
      pages={260--276},
      year={2010},
      publisher={IEEE}
    }
    @inproceedings{Erdogan2016MVDR,
      title={Improved MVDR Beamforming Using Single-Channel Mask Prediction Networks.},
      author={Erdogan, Hakan and Hershey, John R and Watanabe, Shinji and Mandel, Michael I and Le Roux, Jonathan},
      booktitle={Interspeech},
      pages={1981--1985},
      year={2016}
    }

    """
    assert noise_psd_matrix is not None

    phi = stable_solve(noise_psd_matrix, target_psd_matrix)
    # Trace over the two sensor axes, kept as (..., bins, 1, 1) so that it
    # broadcasts against phi.
    lambda_ = np.trace(phi, axis1=-1, axis2=-2)[..., None, None]
    if eps is None:
        eps = np.finfo(lambda_.dtype).tiny
    # Only the real part of the trace is used and it is clipped from below
    # to avoid a division by zero.
    mat = phi / np.maximum(lambda_.real, eps)

    if ref_channel is None:
        ref_channel = get_optimal_reference_channel(mat,
                                                    target_psd_matrix,
                                                    noise_psd_matrix,
                                                    eps=eps)

    assert np.isscalar(ref_channel), ref_channel
    # The beamformer is the column of mat that belongs to the reference
    # channel.
    beamformer = mat[..., ref_channel]

    if return_ref_channel:
        return beamformer, ref_channel
    else:
        return beamformer