import argparse
import os
import random

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import soundfile as sf
import tensorflow as tf

# NOTE: decode_audio, get_np_filterbanks, get_tf_filterbanks, get_tf_spectrum2,
# play_n_rec, load_noise_paths, and cache_noise_data are project-internal
# helpers; the import block above covers only the third-party dependencies
# these functions assume.


def load_test_from_csv(path_file, base_path, sample_1='audio_1', sample_2='audio_2'):
    """
    Function to load an audio verification test set from a csv file
    :param path_file: Path to the csv file listing the trial pairs
    :param base_path: Base folder containing the audio files
    :param sample_1: Column that contains the audio file paths for speaker_1
    :param sample_2: Column that contains the audio file paths for speaker_2
    :return: (list of audio samples, list of audio samples), labels read from the 'label' column
    """
    x1 = []
    x2 = []

    print('Loading testing data')
    # on_bad_lines='skip' replaces the deprecated error_bad_lines/warn_bad_lines
    # flags, which were removed in pandas 2.0
    df = pd.read_csv(path_file, encoding='latin1', on_bad_lines='skip')

    for path_1 in df[sample_1]:
        x1.append(decode_audio(os.path.join(base_path, path_1)).reshape((-1, 1)))
    for path_2 in df[sample_2]:
        x2.append(decode_audio(os.path.join(base_path, path_2)).reshape((-1, 1)))

    y = df['label'].values

    return (x1, x2), y
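# Usage sketch (not part of the repo): the csv layout - columns 'audio_1',
# 'audio_2', and 'label' - and the file paths below are assumptions based on
# the defaults of load_test_from_csv.
def _example_load_test_from_csv():
    (x1, x2), y = load_test_from_csv('./data/test_pairs.csv', './data/voxceleb1')
    # x1 and x2 are lists of (n_samples, 1) float arrays; y holds one label per pair
    print(len(x1), len(x2), y.shape)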
def load_data_raw(base_path, trials_path, n_pairs=10, sample_rate=16000, n_seconds=3, print_interval=100):
    """
    Function to load raw paired audio samples for verification
    :param base_path: Base path to the dataset samples
    :param trials_path: Path to the list of trial pairs
    :param n_pairs: Number of pairs to be loaded (non-positive values load all pairs)
    :param sample_rate: Sample rate of the audio files to be processed
    :param n_seconds: Max number of seconds of an audio sample to be processed
    :param print_interval: Print interval (verbosity)
    :return: (list of audio samples, list of audio samples), labels
    """
    pairs = pd.read_csv(trials_path, names=['target', 'path_1', 'path_2'], delimiter=' ')
    n_real_pairs = n_pairs if n_pairs > 0 else len(pairs['target'])
    y = pairs['target'].values[:n_real_pairs]

    x1 = []
    x2 = []
    for i, (path_1, path_2) in enumerate(zip(pairs['path_1'].values[:n_real_pairs],
                                             pairs['path_2'].values[:n_real_pairs])):
        if (i + 1) % print_interval == 0:
            print('\r> pair %5.0f / %5.0f' % (i + 1, len(y)), end='')
        x1.append(decode_audio(os.path.join(base_path, path_1),
                               tgt_sample_rate=sample_rate).reshape((-1, 1)))
        x2.append(decode_audio(os.path.join(base_path, path_2),
                               tgt_sample_rate=sample_rate).reshape((-1, 1)))

    return (x1, x2), y
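# Usage sketch (assumption): scoring the loaded trial pairs with a hypothetical
# speaker encoder `embed` that maps a (n_samples, 1) waveform to an embedding
# vector. Cosine similarity per pair is the usual verification score fed into
# EER computation.
def _example_score_trials(embed, base_path, trials_path):
    (x1, x2), y = load_data_raw(base_path, trials_path, n_pairs=100)
    scores = []
    for a, b in zip(x1, x2):
        e1, e2 = embed(a), embed(b)
        scores.append(np.dot(e1, e2) / (np.linalg.norm(e1) * np.linalg.norm(e2) + 1e-12))
    return np.array(scores), y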
def main():
    parser = argparse.ArgumentParser(description='Filterbanks functionality testing')

    # Parameters
    parser.add_argument('--audio_path', dest='audio_path',
                        default='/beegfs/mm10572/voxceleb1/test/id10281/Yw8v8055uPc/00001.wav',
                        type=str, action='store', help='Audio path')
    parser.add_argument('--sample_rate', dest='sample_rate', default=16000,
                        type=int, action='store', help='Sample rate audio')
    args = parser.parse_args()

    print('Parameters summary')
    print('>', 'Audio path: {}'.format(args.audio_path))
    print('>', 'Sample rate: {}'.format(args.sample_rate))

    print('Compute spectrum')
    xt = decode_audio(args.audio_path).astype(np.float32)
    print('> signal:', xt.shape)

    sp_np, _, _ = get_np_filterbanks(xt, args.sample_rate)
    print('> numpy spectrum:', sp_np.shape, sp_np.min(), sp_np.max())

    @tf.function
    def forward(signal):
        return get_tf_filterbanks(signal)

    sp_tf = np.squeeze(forward(xt.reshape((1, -1, 1))).numpy())
    print('> tensorflow filterbanks:', sp_tf.shape, sp_tf.min(), sp_tf.max())

    print('Saving filterbanks comparison plot')
    fig, axes = plt.subplots(2, 1)
    fig.set_size_inches((16, 8))
    axes[0].matshow(sp_np, aspect='auto')
    axes[0].set_title('Numpy spectrum')
    axes[1].matshow(sp_tf, aspect='auto')
    axes[1].set_title('Tensorflow filterbanks')
    plt.savefig('./tests/filterbanks_comparison.png')
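# Invocation sketch (script name and paths are hypothetical):
#   python test_filterbanks.py --audio_path ./data/sample.wav --sample_rate 16000
# The script saves ./tests/filterbanks_comparison.png, which places the numpy
# spectrum and the TensorFlow filterbanks side by side for a visual sanity check.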
def data_pipeline_generator_verifier(x, y, classes, sample_rate=16000, n_seconds=2,
                                     input_format='aud', num_fft=512, spec_len=250):
    """
    Function to simulate a signal generator for training a verifier
    :param x: List of audio paths
    :param y: List of users' labels
    :param classes: Number of target classes
    :param sample_rate: Sample rate of the audio files to be processed
    :param n_seconds: Max number of seconds of an audio file to be processed
    :param input_format: Input representation - 'aud' (raw waveform), 'spec' (spectrogram), or 'bank' (filterbanks)
    :param num_fft: FFT size for the spectrogram representation
    :param spec_len: Number of spectrogram frames to keep
    :return: (input, one-hot label)
    """
    indexes = list(range(len(x)))
    random.shuffle(indexes)

    for index in indexes:
        audio = decode_audio(x[index], tgt_sample_rate=sample_rate)

        # Crop a random window of sample_rate*n_seconds samples,
        # zero-padding shorter clips to the target length first
        if len(audio) > sample_rate * n_seconds:
            start_sample = random.choice(range(len(audio) - sample_rate * n_seconds))
        else:
            audio = np.concatenate([audio, np.zeros(sample_rate * n_seconds - len(audio))])
            start_sample = 0
        end_sample = start_sample + sample_rate * n_seconds
        signal = audio[start_sample:end_sample]

        if input_format == 'spec':
            spectrogram = get_tf_spectrum2(signal, spec_len=spec_len, n_fft=num_fft)
            signal = np.expand_dims(spectrogram, axis=2)
        elif input_format == 'bank':
            signal = np.float32(np.expand_dims(signal, axis=(0, 2)))
            signal = np.squeeze(get_tf_filterbanks(signal))
        elif input_format == 'aud':
            signal = np.expand_dims(signal, axis=1)

        label = y[index]
        yield signal, tf.keras.utils.to_categorical(label, num_classes=classes, dtype='float32')
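# Usage sketch (assumption): wrapping the generator in a tf.data.Dataset for
# training. The output signature below matches the 'aud' input format (raw
# waveform of shape (sample_rate*n_seconds, 1)); 'spec' and 'bank' need
# different shapes.
def _example_verifier_dataset(x, y, classes, sample_rate=16000, n_seconds=2, batch=32):
    dataset = tf.data.Dataset.from_generator(
        lambda: data_pipeline_generator_verifier(x, y, classes,
                                                 sample_rate=sample_rate,
                                                 n_seconds=n_seconds),
        output_signature=(
            tf.TensorSpec(shape=(sample_rate * n_seconds, 1), dtype=tf.float32),
            tf.TensorSpec(shape=(classes,), dtype=tf.float32)))
    return dataset.batch(batch).prefetch(tf.data.AUTOTUNE)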
def data_pipeline_generator_gan(x, slice_len, sample_rate=16000):
    """
    Function to simulate a signal generator for training a gan
    :param x: List of audio paths
    :param slice_len: Length of each audio sample
    :param sample_rate: Sample rate of the audio files to be processed
    :return: (signal)
    """
    indexes = list(range(len(x)))
    random.shuffle(indexes)

    for index in indexes:
        audio = decode_audio(x[index], tgt_sample_rate=sample_rate)

        # Zero-pad clips shorter than slice_len so the random crop below is always valid
        if len(audio) <= slice_len:
            audio = np.concatenate([audio, np.zeros(slice_len + 1 - len(audio))])

        start_sample = random.choice(range(len(audio) - slice_len))
        end_sample = start_sample + slice_len
        audio = audio[start_sample:end_sample].reshape((1, -1, 1))

        yield audio
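# Usage sketch (assumption): a batched, repeating dataset for GAN training.
# The generator yields (1, slice_len, 1) arrays, so the leading axis is
# squeezed before batching.
def _example_gan_dataset(x, slice_len, batch=64):
    dataset = tf.data.Dataset.from_generator(
        lambda: data_pipeline_generator_gan(x, slice_len),
        output_signature=tf.TensorSpec(shape=(1, slice_len, 1), dtype=tf.float32))
    return dataset.map(lambda s: tf.squeeze(s, axis=0)).batch(batch).repeat()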
def data_pipeline_generator_mv(x, sample_rate=16000, n_seconds=3):
    """
    Function to simulate a signal generator for training a master voice vocoder
    :param x: List of audio paths
    :param sample_rate: Sample rate of the audio files to be processed
    :param n_seconds: Max number of seconds of an audio file to be processed
    :return: (signal)
    """
    slice_len = sample_rate * n_seconds
    indexes = list(range(len(x)))
    random.shuffle(indexes)

    for index in indexes:
        audio = decode_audio(x[index], tgt_sample_rate=sample_rate)

        # Zero-pad clips shorter than the target length so the random crop below is always valid
        if len(audio) <= slice_len:
            audio = np.concatenate([audio, np.zeros(slice_len + 1 - len(audio))])

        start_sample = random.choice(range(len(audio) - slice_len))
        end_sample = start_sample + slice_len
        audio = audio[start_sample:end_sample].reshape((1, -1, 1))

        yield audio
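# Usage sketch (assumption): drawing a fixed number of seed utterances for
# master voice optimization directly from the generator; n_seeds must not
# exceed len(x).
def _example_collect_mv_seeds(x, n_seeds=10, sample_rate=16000, n_seconds=3):
    gen = data_pipeline_generator_mv(x, sample_rate=sample_rate, n_seconds=n_seconds)
    return [next(gen) for _ in range(n_seeds)]  # each item: (1, sample_rate*n_seconds, 1)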
def main():
    parser = argparse.ArgumentParser(description='Playback functionality testing')

    # Parameters
    parser.add_argument('--audio_path', dest='audio_path',
                        default='/beegfs/mm10572/voxceleb1/test/id10281/Yw8v8055uPc/00001.wav',
                        type=str, action='store', help='Audio path')
    parser.add_argument('--sample_rate', dest='sample_rate', default=16000,
                        type=int, action='store', help='Sample rate audio')
    parser.add_argument('--speaker_flag', dest='speaker_flag', default=0, type=int,
                        choices=[0, 1], action='store', help='Speaker flag')
    parser.add_argument('--room_flag', dest='room_flag', default=0, type=int,
                        choices=[0, 1], action='store', help='Room flag')
    parser.add_argument('--microphone_flag', dest='microphone_flag', default=0, type=int,
                        choices=[0, 1], action='store', help='Microphone flag')
    args = parser.parse_args()

    print('Parameters summary')
    print('>', 'Audio path: {}'.format(args.audio_path))
    print('>', 'Sample rate: {}'.format(args.sample_rate))
    print('>', 'Speaker flag: {}'.format(args.speaker_flag))
    print('>', 'Room flag: {}'.format(args.room_flag))
    print('>', 'Microphone flag: {}'.format(args.microphone_flag))
    impulse_flags = [args.speaker_flag, args.room_flag, args.microphone_flag]

    print('Load impulse response paths')
    noise_paths = load_noise_paths('./data/vs_noise_data')

    print('Cache impulse response data')
    noise_cache = cache_noise_data(noise_paths, sample_rate=args.sample_rate)

    print('Noise samples')
    print('Speaker', noise_cache[noise_paths['speaker'][0]].shape)
    print('Room', noise_cache[noise_paths['room'][0]].shape)
    print('Microphone', noise_cache[noise_paths['microphone'][0]].shape)

    print('Compute playback & recording')
    xt = decode_audio(args.audio_path).reshape((1, -1, 1)).astype(np.float32)
    xn = np.array(impulse_flags, dtype=np.float32).reshape(1, -1)
    print('> signal:', xt.shape)
    print('> impulse_flags:', xn.shape)

    @tf.function
    def forward(signal, impulse_flags):
        return play_n_rec((signal, impulse_flags), noises=noise_paths,
                          cache=noise_cache, noise_strength='random')

    xf = forward(xt, xn).numpy()
    print('> playback signal:', xf.shape)

    print('> data flow in the model:')
    print('>>> original audio: {} -> [{:.2f}, {:.2f}] // {:.1f} s'.format(
        xt.shape, xt.min(), xt.max(), xt.size / args.sample_rate))
    print('>>> p&rec audio: {} -> [{:.2f}, {:.2f}] // {:.1f} s'.format(
        xf.shape, xf.min(), xf.max(), xf.size / args.sample_rate))

    print('Saving playback comparison plot')
    fig, axes = plt.subplots(2, 1)
    fig.set_size_inches((16, 8))
    axes[0].plot(xt.ravel())
    axes[0].set_title('Speech sample')
    axes[1].plot(xf.ravel())
    axes[1].set_title('Playback sample')
    plt.savefig('./tests/playback_comparison.png')

    print('Saving original and playback audio samples')
    sf.write('./tests/original_audio.wav', np.squeeze(xt), args.sample_rate)
    sf.write('./tests/playback_audio.wav', np.squeeze(xf), args.sample_rate)
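# Invocation sketch (script name and paths are hypothetical):
#   python test_playback.py --audio_path ./data/sample.wav --speaker_flag 1 --room_flag 1 --microphone_flag 1
# Each flag toggles one stage of the simulated playback-and-recording chain;
# the script writes a waveform comparison plot and both audio files under ./tests/.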
def load_mv_data(mv_analysis_path, mv_base_path, audio_meta, sample_rate=16000, n_seconds=3, n_templates=10):
    """
    Function to load data for master voice impersonation
    :param mv_analysis_path: File path to master voice analysis metadata
    :param mv_base_path: Base path of the dataset from which master-voice-used audio samples are retrieved
    :param audio_meta: Path to the file with gender information
    :param sample_rate: Sample rate of the audio files to be processed
    :param n_seconds: Max number of seconds of an audio sample to be processed
    :param n_templates: Number of audio samples per user to be loaded
    :return: (list of audio samples, list of labels, list of male user ids, list of female user ids)
    """
    print('Loading master voice data')

    mv_analysis_data = np.load(mv_analysis_path)
    mv_paths = [os.path.join(mv_base_path, path) for path in mv_analysis_data['x_test']]
    mv_labels = mv_analysis_data['y_test']
    print('> found', len(mv_paths), 'paths from', len(np.unique(mv_labels)), 'users')

    data_set_df = pd.read_csv(audio_meta, delimiter=' ')
    gender_map = {k: v for k, v in zip(data_set_df['id'].values, data_set_df['gender'].values)}

    x_mv_test, y_mv_test, male_x_mv_test, female_x_mv_test = [], [], [], []
    samples_per_user = len(mv_paths) // len(np.unique(mv_labels))

    for class_index, _ in enumerate(np.unique(mv_labels)):
        class_paths = random.sample(
            mv_paths[class_index * samples_per_user:(class_index + 1) * samples_per_user],
            n_templates)

        for path in class_paths:
            x_mv_test.append(decode_audio(path.replace('.m4a', '.wav'),
                                          tgt_sample_rate=sample_rate).reshape((-1, 1))[:sample_rate * n_seconds, :])
            y_mv_test.append(class_index)

        # The user id is the third-to-last path component (user/video/file layout)
        if gender_map[class_paths[0].split(os.path.sep)[-3]] == 'm':
            male_x_mv_test.append(class_index)
        else:
            female_x_mv_test.append(class_index)

        print('\r> loaded', (class_index + 1) * n_templates, '/',
              len(np.unique(mv_labels)) * n_templates, 'audio files', end='')

    print()

    return x_mv_test, y_mv_test, male_x_mv_test, female_x_mv_test
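# Usage sketch (assumption): building one enrolment template per user from the
# loaded data with a hypothetical encoder `embed`; averaging the n_templates
# embeddings per user is a common choice.
def _example_build_templates(embed, x_mv_test, y_mv_test):
    templates = {}
    for sample, user in zip(x_mv_test, y_mv_test):
        templates.setdefault(user, []).append(embed(sample))
    return {user: np.mean(embs, axis=0) for user, embs in templates.items()}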