def spectrogram_to_image(signal, batch_first=False, color='viridis',
                         origin='lower'):
    """
    Turns a spectrogram into a colorized, log-scaled image.

    The input is normalized by its maximum, limited to a 50 dB range using
    the 20 * log10 convention, mapped to 0..255 and handed to `_colorize`.

    For more details of the output shape, see the tensorboardx docs

    Args:
        signal: Shape (frames, batch [optional], features)
        batch_first: If true, the signal shape is
            (batch [optional], frames, features)
        color: A color map name. The name is forwarded to
            `matplotlib.pyplot.cm.get_cmap` to get the color map.
        origin: Forwarded to `_apply_origin`.

    Returns:
        The result of `_colorize`. The data is transposed (features, frames
        for a two dimensional input) before it is colorized.
    """
    visible_dB = 50

    spec = to_numpy(signal, detach=True)
    # The tiny offset keeps the division defined for an all-zero input.
    peak = np.max(spec) + np.finfo(spec.dtype).tiny
    spec = _remove_batch_axis(spec / peak, batch_first=batch_first)

    # Everything below -visible_dB is clamped, so that log10 never sees
    # problematic small numbers.
    lower_limit = 10 ** (-visible_dB / 20)
    spec = np.maximum(spec, lower_limit)

    # Map [-visible_dB, 0] dB linearly onto [0, 1].
    gain = 20 / visible_dB
    spec = gain * np.log10(spec) + 1

    pixels = (spec * 255).astype(np.uint8)
    oriented = _apply_origin(pixels.T, origin=origin)
    return _colorize(oriented, color)
def audio(signal, sampling_rate: int = 16000, batch_first=False,
          normalize=True):
    """
    Prepares an audio signal (and its sampling rate) for logging.

    Args:
        signal: Shape (samples, batch [optional]). If `batch_first = True`,
            (batch [optional], samples).
        sampling_rate: Sampling rate of the audio signal
        batch_first: If `True`, the optional batch dimension is assumed to be
            the first axis, otherwise the second one.
        normalize: If `True`, the signal is normalized to a max amplitude of
            0.95 to prevent clipping

    Returns:
        A tuple `(signal, sampling_rate)`.
    """
    signal = to_numpy(signal, detach=True)
    signal = _remove_batch_axis(signal, batch_first=batch_first, ndim=1)

    # Normalize so that there is no clipping
    if normalize:
        peak = np.max(np.abs(signal))
        if peak > 0:
            signal = signal / peak
        # Scale out-of-place: when the division above is skipped (silent
        # signal or NaN peak), `signal` may still share memory with the
        # caller's data, and an in-place `*=` would modify that buffer or
        # fail with a casting error for integer dtypes.
        signal = signal * 0.95

    return signal, sampling_rate
def example_to_numpy(example, detach=False):
    """
    Moves a nested structure to numpy. Opposite of example_to_device.

    Dicts, tuples, lists (including namedtuples) and dataclass instances are
    traversed recursively and rebuilt with the same container type. Tensors
    (and objects whose type name contains 'ComplexTensor') are converted with
    `padertorch.utils.to_numpy`. Everything else is returned unchanged.

    Args:
        example: Nested structure to convert.
        detach: Forwarded to `to_numpy` for every converted tensor.

    Returns:
        example where each tensor is converted to numpy
    """
    from padertorch.utils import to_numpy

    if isinstance(example, dict):
        return example.__class__({
            key: example_to_numpy(value, detach=detach)
            for key, value in example.items()
        })
    elif isinstance(example, (tuple, list)):
        converted = [
            example_to_numpy(element, detach=detach) for element in example
        ]
        if isinstance(example, tuple) and hasattr(example, '_fields'):
            # A namedtuple constructor takes one positional argument per
            # field, not a single iterable.
            return example.__class__(*converted)
        return example.__class__(converted)
    elif torch.is_tensor(example) or 'ComplexTensor' in str(type(example)):
        return to_numpy(example, detach=detach)
    elif isinstance(example, np.ndarray):
        return example
    elif hasattr(example, '__dataclass_fields__'):
        return example.__class__(
            **{
                f: example_to_numpy(getattr(example, f), detach=detach)
                for f in example.__dataclass_fields__
            })
    else:
        return example
def mask_to_image(
        mask: _T_input,
        batch_first: bool = False,
        color: Optional[str] = None,
        origin: str = 'lower',
) -> np.ndarray:
    """
    Creates an image from a mask `Tensor` or `ndarray`.

    The mask is scaled by 255, clipped to the range 0..255 and cast to
    `uint8` before it is colorized.

    For more details of the output shape, see the tensorboardx docs

    Args:
        mask: Mask to plot
        batch_first: If `True`, `mask` is expected to have shape
            `(batch [optional], frames, features)`. If `False`, the batch
            axis is assumed to be in the second position, i.e.,
            `(frames, batch [optional], features)`.
        color: A color map name. The name is forwarded to
            `matplotlib.pyplot.cm.get_cmap` to get the color map. If `None`,
            grayscale is used.
        origin: Origin of the plot. Can be `'upper'` or `'lower'`.

    Returns:
        Colorized image with shape (color (1 or 3), features, frames)
    """
    scaled = to_numpy(mask, detach=True) * 255
    pixels = np.clip(scaled, 0, 255).astype(np.uint8)
    pixels = _remove_batch_axis(pixels, batch_first=batch_first)
    oriented = _apply_origin(pixels.T, origin)
    return _colorize(oriented, color)
def convert(value):
    """
    Converts a single value to numpy, reusing earlier conversions.

    `memo` and `detach` are taken from the enclosing scope. A value whose
    `id` is already a key of `memo` is answered from `memo`. Tensors (and
    objects whose type name contains 'ComplexTensor') are converted with
    `to_numpy` and remembered. Any other value is returned unchanged.
    """
    key = id(value)
    if key in memo:
        return memo[key]

    is_tensor = (
        isinstance(value, torch.Tensor)
        or 'ComplexTensor' in str(type(value))
    )
    if not is_tensor:
        return value

    converted = to_numpy(value, detach=detach)
    # Remember the result under the id of the original object.
    memo[key] = converted
    return converted
def spectrogram_to_image(
        signal: _T_input,
        batch_first: bool = False,
        color: str = 'viridis',
        origin: str = 'lower',
        log: bool = True,
        visible_dB: float = 50,
) -> np.ndarray:
    """
    Creates an image from a spectrogram (power).

    Note:
        When the input is the absolute value of the STFT, the value for
        visible_dB is effectively two times larger (i.e. default 100) and
        the image looks more noisy.

    For more details of the output shape, see the tensorboardx docs

    Args:
        signal: Spectrogram to plot.
        batch_first: If `True`, `signal` is expected to have shape
            `(batch [optional], frames, features)`. If `False`, the batch
            axis is assumed to be in the second position, i.e.,
            `(frames, batch [optional], features)`.
        color: A color map name. The name is forwarded to
            `matplotlib.pyplot.cm.get_cmap` to get the color map.
        origin: Origin of the plot. Can be `'upper'` or `'lower'`.
        log: If `True`, the spectrogram is plotted in log domain and shows a
            50dB range. The 50dB can be changed with the argument
            `visible_dB`.
        visible_dB: Only used when `log` is `True`. Specifies how many dB
            will be visible in the plot. Assumes the input is the power of
            the STFT signal, i.e., the abs square of it.

    Returns:
        Colorized image with shape (channels (3), features, frames)
    """
    spec = to_numpy(signal, detach=True)
    # The tiny offset keeps the division defined for an all-zero input.
    scale = np.max(np.abs(spec)) + np.finfo(spec.dtype).tiny
    spec = _remove_batch_axis(spec / scale, batch_first=batch_first)

    if log:
        # Clamp everything below -visible_dB (power convention, hence /10),
        # which also removes problematic small numbers before the log.
        lower_limit = 10 ** (-visible_dB / 10)
        spec = np.maximum(spec, lower_limit)
        # Map [-visible_dB, 0] dB linearly onto [0, 1].
        gain = 10 / visible_dB
        spec = gain * np.log10(spec) + 1

    pixels = (spec * 255).astype(np.uint8)
    oriented = _apply_origin(pixels.T, origin=origin)
    return _colorize(oriented, color)
def stft_to_image(
        signal: _T_input,
        batch_first: bool = False,
        color: str = 'viridis',
        origin: str = 'lower',
        visible_dB: float = 50,
) -> np.ndarray:
    """
    Creates an image from an STFT signal.

    The power (real part squared plus imaginary part squared) of the STFT is
    computed and forwarded to `spectrogram_to_image`.

    For more details of the output shape, see the tensorboardx docs

    Args:
        signal: Shape (frames, batch [optional], features)
        batch_first: If true, the signal shape is
            (batch [optional], frames, features)
        color: A color map name. The name is forwarded to
            `matplotlib.pyplot.cm.get_cmap` to get the color map. If `None`,
            grayscale is used.
        origin: Origin of the plot. Can be `'upper'` or `'lower'`.
        visible_dB: How many decibels are visible in the image.
            Note: `paderbox.visualization.plot.stft` uses
            `visible_dB == 60` internally. So by default it shows 10 dB
            more.

    Returns:
        Colorized image with shape (color (1 or 3), features, frames)

    Small test to see the effect of `visible_dB`:

        >>> visible_dB = 60
        >>> 10 ** (-visible_dB / 20)
        0.001
        >>> data = [1, 0.004, 0.003, 0.001_05, 0.001]
        >>> np.squeeze(stft_to_image(np.array(data)[:, None], color=None))
        array([255,  10,   0,   0,   0], dtype=uint8)
        >>> np.squeeze(stft_to_image(
        ...     np.array(data)[:, None], color=None, visible_dB=60))
        array([255,  51,  40,   1,   0], dtype=uint8)

    """
    stft = to_numpy(signal, detach=True)
    power = stft.real ** 2 + stft.imag ** 2
    return spectrogram_to_image(
        power,
        batch_first=batch_first,
        color=color,
        origin=origin,
        visible_dB=visible_dB,
    )
def spectrogram_to_image(signal, batch_first=False, color='viridis'):
    """
    Turns a spectrogram into a log-scaled image with a 50 dB visible range.

    For more details of the output shape, see the tensorboardx docs

    Args:
        signal: Shape (frames, batch [optional], features)
        batch_first: If true, the signal shape is
            (batch [optional], frames, features)
        color: A color map name. The name is forwarded to
            `matplotlib.pyplot.get_cmap` to get the color map. If `None`, or
            if matplotlib is not installed, a gray-scale image with a single
            channel is produced.

    Returns:
        Shape (channels, features, frames), where the feature axis is
        flipped. A gray image has one channel; a color-mapped image has as
        many channels as the color map emits.
    """
    signal = to_numpy(signal, detach=True)
    if signal.dtype.kind in 'biu':
        # `np.finfo` is only defined for inexact dtypes.
        signal = signal.astype(np.float64)
    # The tiny offset keeps the division defined for an all-zero input.
    signal = signal / (np.max(signal) + np.finfo(signal.dtype).tiny)
    signal = _remove_batch_axis(signal, batch_first=batch_first)

    visible_dB = 50

    # remove problematic small numbers
    floor = 10 ** (-visible_dB / 20)
    signal = np.maximum(signal, floor)

    # Scale such that X dB are visible (i.e. in the range 0 to 1)
    signal = (20 / visible_dB) * np.log10(signal) + 1

    signal = (signal * 255).astype(np.uint8)

    if color is None:
        # gray image
        return signal.transpose(1, 0)[None, ::-1, :]

    def gray_cmap(x):
        # Stand-in color map with a single channel. It appends the channel
        # axis exactly where a matplotlib color map would, so the cached
        # fallback can go through the same transpose/flip as a real one.
        return x[..., None]

    try:
        cmap = _spectrogram_to_image_cmap[color]
    except KeyError:
        try:
            import matplotlib.pyplot as plt
        except ImportError:
            from warnings import warn
            warn('Since matplotlib is not installed, all images are '
                 'switched to grey scale')
            cmap = gray_cmap
        else:
            # `plt.cm.get_cmap` was removed in matplotlib 3.9;
            # `plt.get_cmap` is available in old and new releases.
            cmap = plt.get_cmap(color)
        # Caching also makes sure the warning is only issued once per name.
        _spectrogram_to_image_cmap[color] = cmap

    return cmap(signal).transpose(2, 1, 0)[:, ::-1, :]
def stft_to_image(signal, batch_first=False, color='viridis'):
    """
    Plots the magnitude of an STFT signal via `spectrogram_to_image`.

    For more details of the output shape, see the tensorboardx docs

    Args:
        signal: Shape (frames, batch [optional], features)
        batch_first: If true, the signal shape is
            (batch [optional], frames, features)
        color: Forwarded to `spectrogram_to_image`.

    Returns:
        The return value of `spectrogram_to_image`.
    """
    magnitude = np.abs(to_numpy(signal, detach=True))
    return spectrogram_to_image(
        magnitude,
        batch_first=batch_first,
        color=color,
    )
def mask_to_image(mask, batch_first=False):
    """
    Creates a single channel gray-scale image from a mask.

    The mask is scaled by 255, clipped to 0..255 and cast to `uint8`.

    For more details of the output shape, see the tensorboardx docs

    Args:
        mask: Shape (frames, batch [optional], features)
        batch_first: If true, the mask shape is
            (batch [optional], frames, features)

    Returns:
        Shape(color, features, frames), where the feature axis is flipped.
    """
    scaled = to_numpy(mask, detach=True) * 255
    pixels = np.clip(scaled, 0, 255).astype(np.uint8)
    pixels = _remove_batch_axis(pixels, batch_first=batch_first)

    # Prepend the color axis, swap frames and features, flip the features.
    with_color_axis = pixels[None]
    swapped = with_color_axis.transpose(0, 2, 1)
    return swapped[:, ::-1]
def mask_to_image(mask, batch_first=False, color=None, origin='lower'):
    """
    Creates an image from a mask.

    The mask is scaled by 255, clipped to 0..255, cast to `uint8`,
    transposed and then passed through `_apply_origin` and `_colorize`.

    For more details of the output shape, see the tensorboardx docs

    Args:
        mask: Shape (frames, batch [optional], features)
        batch_first: If true, the mask shape is
            (batch [optional], frames, features)
        color: Forwarded to `_colorize`.
        origin: Forwarded to `_apply_origin`.

    Returns:
        Shape(color, features, frames)
    """
    scaled = to_numpy(mask, detach=True) * 255
    pixels = np.clip(scaled, 0, 255).astype(np.uint8)
    pixels = _remove_batch_axis(pixels, batch_first=batch_first)
    oriented = _apply_origin(pixels.T, origin)
    return _colorize(oriented, color)
def spectrogram_to_image(signal, batch_first=False, color='viridis'):
    """
    Turns a spectrogram into a log-scaled image with a 50 dB visible range.

    For more details of the output shape, see the tensorboardx docs

    Args:
        signal: Shape (frames, batch [optional], features)
        batch_first: If true, the signal shape is
            (batch [optional], frames, features)
        color: A color map name. The name is forwarded to
            `matplotlib.pyplot.get_cmap` to get the color map. If `None`, a
            gray-scale image with a single channel is produced.

    Returns:
        Shape (channels, features, frames), where the feature axis is
        flipped. A gray image has one channel; a color-mapped image has as
        many channels as the color map emits.
    """
    signal = to_numpy(signal, detach=True)
    if signal.dtype.kind in 'biu':
        # `np.finfo` is only defined for inexact dtypes. The division
        # promoted such inputs to float anyway.
        signal = signal.astype(np.float64)
    # The tiny offset avoids a division by zero for an all-zero input.
    signal = signal / (np.max(signal) + np.finfo(signal.dtype).tiny)
    signal = _remove_batch_axis(signal, batch_first=batch_first)

    visible_dB = 50

    # remove problematic small numbers
    floor = 10 ** (-visible_dB / 20)
    signal = np.maximum(signal, floor)

    # Scale such that X dB are visible (i.e. in the range 0 to 1)
    signal = (20 / visible_dB) * np.log10(signal) + 1

    signal = (signal * 255).astype(np.uint8)

    if color is None:
        # gray image
        return signal.transpose(1, 0)[None, ::-1, :]

    try:
        cmap = _spectrogram_to_image_cmap[color]
    except KeyError:
        import matplotlib.pyplot as plt
        # `plt.cm.get_cmap` was removed in matplotlib 3.9; `plt.get_cmap`
        # is available in old and new releases.
        cmap = plt.get_cmap(color)
        _spectrogram_to_image_cmap[color] = cmap

    return cmap(signal).transpose(2, 1, 0)[:, ::-1, :]
def audio(
        signal: _T_input,
        sampling_rate: int = 16000,
        batch_first: bool = False,
        normalize: bool = True,
) -> Tuple[np.ndarray, int]:
    """
    Adds an audio signal to tensorboard.

    Args:
        signal: Time-domain signal with shape (samples, batch [optional]).
            If `batch_first = True`, (batch [optional], samples).
        sampling_rate: Sampling rate of the audio signal
        batch_first: If `True`, `signal` is expected to have shape
            `(batch [optional], samples)`. If `False`, the batch axis is
            assumed to be in the second position, i.e.,
            `(samples, batch [optional])`.
        normalize: If `True`, the signal is normalized to a max amplitude of
            0.95 to prevent clipping.

    Returns:
        A tuple consisting of the signal and the sampling rate. See
        tensorboardX docs for further information on the return type.
    """
    signal = to_numpy(signal, detach=True)
    signal = _remove_batch_axis(signal, batch_first=batch_first, ndim=1)

    # Normalize so that there is no clipping
    if normalize:
        peak = np.max(np.abs(signal))
        if peak > 0:
            signal = signal / peak
        # Scale out-of-place: when the division above is skipped (silent
        # signal or NaN peak), `signal` may still share memory with the
        # caller's data, and an in-place `*=` would modify that buffer or
        # fail with a casting error for integer dtypes.
        signal = signal * 0.95

    return signal, sampling_rate
def pit_loss_from_loss_matrix(
        pair_wise_loss_matrix,
        *,
        reduction='mean',
        algorithm: ['optimal', 'greedy'] = 'optimal',
        return_permutation=False,
):
    """
    Calculates the PIT loss given a pair_wise_loss matrix.

    Args:
        pair_wise_loss_matrix: shape: (K, K)
        reduction: 'mean' or 'sum'
        algorithm: 'optimal' solves the assignment with
            `scipy.optimize.linear_sum_assignment`, 'greedy' uses
            `pb_bss.permutation_alignment._mapping_from_score_matrix`.
        return_permutation: If true, the selected column indices are
            returned as well.

    Returns:
        The reduced loss of the selected entries. If `return_permutation` is
        true, a tuple of this loss and the column indices.

    >>> import numpy as np
    >>> score_matrix = np.array([[11., 10, 0],[4, 5, 10],[6, 0, 5]])
    >>> score_matrix
    array([[11., 10.,  0.],
           [ 4.,  5., 10.],
           [ 6.,  0.,  5.]])
    >>> pair_wise_loss_matrix = torch.tensor(-score_matrix)
    >>> pit_loss_from_loss_matrix(pair_wise_loss_matrix, reduction='sum', algorithm='optimal')
    tensor(-26., dtype=torch.float64)
    >>> pit_loss_from_loss_matrix(pair_wise_loss_matrix, reduction='sum', algorithm='greedy')
    tensor(-21., dtype=torch.float64)

    """
    import scipy.optimize
    from padertorch.utils import to_numpy

    assert len(pair_wise_loss_matrix.shape) == 2, pair_wise_loss_matrix.shape
    assert (
        pair_wise_loss_matrix.shape[-2] == pair_wise_loss_matrix.shape[-1]
    ), pair_wise_loss_matrix.shape
    num_sources = pair_wise_loss_matrix.shape[-1]

    # The assignment is searched on a detached numpy copy, because
    # pair_wise_loss_matrix is expected to require grads. The loss itself is
    # gathered from the original matrix below.
    loss_np = to_numpy(pair_wise_loss_matrix, detach=True)

    if algorithm == 'optimal':
        assignment = scipy.optimize.linear_sum_assignment(loss_np)
        row_ind, col_ind = assignment
    elif algorithm == 'greedy':
        from pb_bss.permutation_alignment import _mapping_from_score_matrix
        # The helper works on scores, hence the sign flip.
        col_ind = _mapping_from_score_matrix(-loss_np, algorithm='greedy')
        row_ind = range(num_sources)
    else:
        raise ValueError(algorithm)

    if reduction not in ('mean', 'sum'):
        raise ValueError(reduction)

    selected = pair_wise_loss_matrix[row_ind, col_ind]
    min_loss = selected.mean() if reduction == 'mean' else selected.sum()

    if not return_permutation:
        return min_loss
    return min_loss, col_ind