def __call__(self, example):
    """Augment ``example`` in place with STFT features.

    Always adds the observation STFT, its float32 magnitude, and the
    frame count.  When both a speech image and a noise image are
    present, additionally computes biased binary mask targets from the
    stacked speech/noise STFTs.

    Returns the same ``example`` dict, augmented.
    """

    def _with_channel_axis(signal):
        # Mono signals get a leading channel axis; 2-D signals pass
        # through unchanged; anything else is rejected.
        if signal.ndim == 1:
            return np.expand_dims(signal, axis=0)
        if signal.ndim == 2:
            return signal
        raise ValueError('Either the signal has ndim 1 or 2', signal.shape)

    observation_stft = self.stft(_with_channel_axis(example[OBSERVATION]))
    example[M_K.OBSERVATION_STFT] = observation_stft
    example[M_K.OBSERVATION_ABS] = np.abs(observation_stft).astype(np.float32)
    example[NUM_FRAMES] = observation_stft.shape[-2]

    if SPEECH_IMAGE in example and NOISE_IMAGE in example:
        clean = self.stft(_with_channel_axis(example[SPEECH_IMAGE]))
        disturbance = self.stft(_with_channel_axis(example[NOISE_IMAGE]))
        # A negative high_cut is interpreted relative to the size of the
        # last STFT axis (Python-style negative indexing).
        high_cut = self.opts.high_cut
        if high_cut < 0:
            high_cut = clean.shape[-1] + high_cut
        speech_mask, noise_mask = biased_binary_mask(
            np.stack([clean, disturbance], axis=0),
            low_cut=self.opts.low_cut,
            high_cut=high_cut,
        )
        example[M_K.SPEECH_MASK_TARGET] = speech_mask.astype(np.float32)
        example[M_K.NOISE_MASK_TARGET] = noise_mask.astype(np.float32)
    return example
def change_example_structure(example):
    """Build the flat network-input dict from a raw example.

    Computes the observation STFT (complex64) and its float32
    magnitude, plus biased binary mask targets derived from the
    speech-image and noise-image STFTs.
    """
    stft = pb.transform.stft
    audio = example[K.AUDIO_DATA]

    obs_stft = stft(audio[K.OBSERVATION]).astype(np.complex64)
    clean_stft = stft(audio[K.SPEECH_IMAGE])
    noise_stft = stft(audio[K.NOISE_IMAGE])

    speech_mask, noise_mask = biased_binary_mask(
        np.stack([clean_stft, noise_stft], axis=0))

    return {
        'observation_stft': obs_stft,
        'observation_abs': np.abs(obs_stft).astype(np.float32),
        'speech_mask_target': speech_mask.astype(np.float32),
        'noise_mask_target': noise_mask.astype(np.float32),
    }
def prepare_data(example):
    """Load audio from ``example['audio_path']`` and build network inputs.

    Applies a 1024/256 STFT to the observation, speech image, and noise
    image, then returns the observation magnitude together with biased
    binary mask targets (all float32).
    """
    stft = pb.transform.STFT(shift=256, size=1024)

    def _load_stft(key):
        # Stack every channel file for this signal, then STFT the
        # resulting multichannel array in one call.
        channels = [
            pb.io.load_audio(path)
            for path in example['audio_path'][key]
        ]
        return stft(np.array(channels))

    audio_data = {
        key: _load_stft(key)
        for key in ('observation', 'speech_image', 'noise_image')
    }

    speech_mask, noise_mask = biased_binary_mask(
        np.stack(
            [audio_data['speech_image'], audio_data['noise_image']],
            axis=0))

    return {
        'observation_abs':
            np.abs(audio_data['observation']).astype(np.float32),
        'speech_mask_target': speech_mask.astype(np.float32),
        'noise_mask_target': noise_mask.astype(np.float32),
    }