def get_lcmv_vector(atf_vectors, response_vector, noise_psd_matrix):
    """Compute an LCMV beamforming vector for every frequency bin.

    :param atf_vectors: Acoustic transfer function vectors, one per source,
        with shape (targets k, bins f, sensors d).
    :param response_vector: Desired response per source with shape
        (targets,). Use [1, 0, ..., 0] to keep only the first speaker.
    :param noise_psd_matrix: Noise PSD matrix with shape
        (bins f, sensors d, sensors D).
    :return: Beamforming vectors with shape (bins f, sensors d).
    """
    response_vector = np.asarray(response_vector)
    # TODO: If it is a list, a list of response_vectors is returned.

    num_targets, num_bins, num_sensors = atf_vectors.shape
    expected_psd_shape = (num_bins, num_sensors, num_sensors)
    assert noise_psd_matrix.shape == expected_psd_shape, \
        noise_psd_matrix.shape

    # Solve against the noise PSD once per target; the PSD matrix is only
    # broadcast (not copied) along the leading target axis.
    noise_psd_per_target = np.broadcast_to(
        noise_psd_matrix[None, ...],
        (num_targets, num_bins, num_sensors, num_sensors),
    )
    atf_columns = atf_vectors[..., None]  # k, f, d, 1
    solved_atf = np.squeeze(
        stable_solve(noise_psd_per_target, atf_columns),
        axis=-1,
    )  # k, f, d
    assert solved_atf.shape == (num_targets, num_bins, num_sensors), \
        solved_atf.shape

    # Small (targets x targets) constraint system per frequency bin.
    constraint_matrix = np.einsum(
        'i...d,j...d->...ij', atf_vectors.conj(), solved_atf
    )  # f, k, K

    # One identical right-hand side column per frequency bin: F, K, 1
    rhs = np.repeat(
        response_vector[None, :, None].astype(np.complex64),
        num_bins,
        axis=0,
    )
    target_weights = np.squeeze(
        stable_solve(constraint_matrix, rhs),
        axis=-1,
    )  # f, k

    # Weighted combination of the solved ATF vectors over the targets.
    return np.einsum('i...d,...i->...d', solved_atf, target_weights)
def get_wmwf_vector(
        target_psd_matrix, noise_psd_matrix, reference_channel=None,
        channel_selection_vector=None, distortion_weight=1.):
    """Speech distortion weighted multichannel Wiener filter.

    This filter is the solution to the optimization problem
    `min E[|h^{H}x - X_{k}|^2] + mu E[|h^{H}n|^2]`.
    I.e. it minimizes the MSE between the filtered signal and the target
    image from channel k. The parameter mu allows for a trade-off between
    speech distortion and noise suppression. For mu = 0, it resembles the
    MVDR filter.

    Args:
        target_psd_matrix: `Array` of shape (..., frequency, sensor, sensor)
            with the covariance statistics for the target signal.
        noise_psd_matrix: `Array` of shape (..., frequency, sensor, sensor)
            with the covariance statistics for the noise signal.
        reference_channel: Reference channel for minimization. See
            description above. Has no effect if a channel selection vector
            is provided. If None, it is obtained from
            `get_optimal_reference_channel`.
        channel_selection_vector: Weights selecting a weighted "reference"
            channel. It is expanded as
            `channel_selection_vector[..., None, :]` and multiplied with the
            filter matrix before summing over the last axis, so it has to
            broadcast accordingly, e.g. shape (sensor,) or
            (..., frequency, sensor).
        distortion_weight: `float` (or an array that broadcasts against the
            trace term, which has two trailing singleton axes) or the string
            'frequency_dependent' to trade-off distortion and suppression.
            Passing 'frequency_dependent' will use a frequency-dependent
            trade-off factor inspired by the Max-SNR criterion.
            See https://arxiv.org/abs/1707.00201 for details.

    Raises:
        AssertionError: If `noise_psd_matrix` is None or the reference
            channel is not a scalar.

    Returns:
        Array with the filter coefficients; the last (sensor) axis of the
        filter matrix is reduced, i.e. shape (..., frequency, sensor) for
        inputs of the documented shape.
    """
    assert noise_psd_matrix is not None

    # NOTE(review): stable_solve presumably returns Phi_NN^{-1} Phi_XX with
    # the same shape as the inputs - confirm against its definition.
    phi = stable_solve(noise_psd_matrix, target_psd_matrix)
    lambda_ = np.trace(phi, axis1=-1, axis2=-2)[..., None, None]

    # Only compare against the keyword when a string was actually passed.
    # Comparing an array-valued weight with a string yields an elementwise
    # result whose truth value is ambiguous.
    use_frequency_dependent_weight = (
        isinstance(distortion_weight, str)
        and distortion_weight == 'frequency_dependent'
    )
    if use_frequency_dependent_weight:
        phi_x1x1 = target_psd_matrix[..., 0:1, 0:1]
        distortion_weight = np.sqrt(phi_x1x1 * lambda_)
        filter_ = phi / distortion_weight
    else:
        filter_ = phi / (distortion_weight + lambda_)

    if channel_selection_vector is not None:
        # Weighted sum over the columns of the filter matrix.
        projected = filter_ * channel_selection_vector[..., None, :]
        return np.sum(projected, axis=-1)

    if reference_channel is None:
        reference_channel = get_optimal_reference_channel(
            filter_, target_psd_matrix, noise_psd_matrix)

    assert np.isscalar(reference_channel), reference_channel
    # Select the column that belongs to the reference channel.
    return filter_[..., reference_channel]
def get_lcmv_vector_souden(
        target_psd_matrix,
        interference_psd_matrix,
        noise_psd_matrix,
        ref_channel=None,
        eps=None,
        return_ref_channel=False
):
    """LCMV beamformer in the formulation of Souden (currently disabled).

    In "A Study of the LCMV and MVDR Noise Reduction Filters" Mehrez Souden
    elaborates an alternative formulation for the LCMV beamformer in the
    appendix for a rank one interference matrix. Therefore, this algorithm
    is only valid, when the interference PSD matrix is approximately rank
    one, or (in other words) only 2 speakers are present in total.

    The function currently always raises `NotImplementedError`; the draft
    implementation below the `raise` is unreachable and kept for reference.

    Args:
        target_psd_matrix: PSD matrix of the target signal.
        interference_psd_matrix: PSD matrix of the interfering signal.
        noise_psd_matrix: PSD matrix of the noise.
        ref_channel: Reference channel. The draft overrides it with 0.
        eps: Lower bound for the denominator. If None, the draft uses the
            smallest positive normal number of the working dtype.
        return_ref_channel: If True, the draft returns the tuple
            (beamformer, ref_channel) instead of only the beamformer.

    Raises:
        NotImplementedError: Always, until the implementation is validated.
    """
    raise NotImplementedError(
        'This is not yet thoroughly tested. It also misses the response '
        'vector, thus it is unclear, how to select, which speaker to attend '
        'to.'
    )

    # ---- Unreachable draft implementation, kept for reference ----
    phi_in = stable_solve(noise_psd_matrix, interference_psd_matrix)
    phi_xn = stable_solve(noise_psd_matrix, target_psd_matrix)

    D = phi_in.shape[-1]

    # Equation 5, 6
    gamma_in = np.trace(phi_in, axis1=-1, axis2=-2)[..., None, None]
    gamma_xn = np.trace(phi_xn, axis1=-1, axis2=-2)[..., None, None]

    # Can be written in a single einsum call, here separate for clarity
    # Equation 11
    # The trace is taken over the two matrix axes, like gamma_in/gamma_xn.
    gamma = gamma_in * gamma_xn - np.trace(
        np.einsum('...ab,...bc->...ac', phi_in, phi_xn),
        axis1=-1, axis2=-2,
    )[..., None, None]
    # Possibly:
    # gamma = gamma_in * gamma_xn - np.einsum('...ab,...ba->...', phi_in, phi_xn)

    # A multidimensional index has to be a tuple, not a list.
    eye = np.eye(D)[tuple((phi_in.ndim - 2) * [None] + [...])]

    # TODO: Should be determined automatically (per speaker)?
    ref_channel = 0

    # Equation 51, first fraction
    if eps is None:
        eps = np.finfo(gamma.dtype).tiny
    # NOTE(review): as written only phi_in is divided by gamma. Confirm
    # against Equation 51 whether the whole difference should be divided.
    mat = gamma_in * eye - phi_in / np.maximum(gamma.real, eps)

    # Equation 51
    # Faster, when we select the ref_channel before matrix multiplication.
    beamformer = np.einsum('...ab,...bc->...ac', mat, phi_xn)[..., ref_channel]
    # beamformer = np.einsum('...ab,...b->...a', mat, phi_xn[..., ref_channel])

    if return_ref_channel:
        return beamformer, ref_channel
    else:
        return beamformer
def get_mvdr_vector_souden(
        target_psd_matrix,
        noise_psd_matrix,
        ref_channel=None,
        eps=None,
        return_ref_channel=False
):
    """Compute the MVDR beamforming vector following [Souden2010MVDR].

    The implementation follows the description in [Erdogan2016MVDR].

    If no ref_channel is given, it is obtained from
    `get_optimal_reference_channel` (according to the original
    documentation this is based on an SNR estimate). The eps is meant to
    keep that estimate working as long as target_psd_matrix and
    noise_psd_matrix contain neither inf nor nan; zero matrices are
    supposed to work as well. The default eps is the smallest positive
    normal number of the working dtype.

    Note: the frequency dimension is necessary for the ref_channel
        estimation.
    Note: Currently this function does not support independent dimensions
        with an estimated ref_channel. There is an open point to discuss:
        Should the independent dimension be considered in the SNR estimate
        or not?

    :param target_psd_matrix: Target PSD matrix
        with shape (..., bins, sensors, sensors)
    :param noise_psd_matrix: Noise PSD matrix
        with shape (..., bins, sensors, sensors)
    :param ref_channel: Index of the reference channel. Estimated if None.
    :param eps: Lower bound for the normalization term.
        If None use the smallest number bigger than zero.
    :param return_ref_channel: If True, return the tuple
        (beamformer, ref_channel) instead of only the beamformer.
    :return: Set of beamforming vectors with shape (bins, sensors)

    References:
        [Souden2010MVDR] M. Souden, J. Benesty and S. Affes, "On optimal
            frequency-domain multichannel linear filtering for noise
            reduction", IEEE Transactions on Audio, Speech, and Language
            Processing, vol. 18, no. 2, pp. 260--276, 2010.
        [Erdogan2016MVDR] H. Erdogan, J. R. Hershey, S. Watanabe,
            M. I. Mandel and J. Le Roux, "Improved MVDR Beamforming Using
            Single-Channel Mask Prediction Networks", Interspeech,
            pp. 1981--1985, 2016.
    """
    assert noise_psd_matrix is not None

    solved = stable_solve(noise_psd_matrix, target_psd_matrix)
    # Trace over the two sensor axes, kept as (..., 1, 1) for broadcasting.
    trace_term = np.trace(solved, axis1=-1, axis2=-2)[..., None, None]

    floor = np.finfo(trace_term.dtype).tiny if eps is None else eps
    # Flooring the real part of the trace avoids a division by zero.
    normalized = solved / np.maximum(trace_term.real, floor)

    if ref_channel is None:
        ref_channel = get_optimal_reference_channel(
            normalized, target_psd_matrix, noise_psd_matrix, eps=floor)

    assert np.isscalar(ref_channel), ref_channel
    # The column of the reference channel is the beamforming vector.
    weights = normalized[..., ref_channel]

    return (weights, ref_channel) if return_ref_channel else weights