Пример #1
0
def load_test_from_csv(path_file,
                       base_path,
                       sample_1='audio_1',
                       sample_2='audio_2'):
    """
    Function to load a paired audio test set described by a csv file
    :param path_file:       Path to the csv file; besides the two audio columns it must have a 'label' column
    :param base_path:       Base path joined to every audio path read from the csv
    :param sample_1:        Column that contains the audio paths of the first sample of each pair
    :param sample_2:        Column that contains the audio paths of the second sample of each pair
    :return:                (list of audio samples, list of audio samples), labels
    """
    print('Loading testing data')

    # Malformed rows are skipped silently. pandas 1.3 introduced `on_bad_lines`
    # and pandas 2.0 removed `error_bad_lines` / `warn_bad_lines`, so try the
    # current spelling first and fall back to the legacy flags on old pandas.
    try:
        df = pd.read_csv(path_file, encoding='latin1', on_bad_lines='skip')
    except TypeError:
        df = pd.read_csv(path_file,
                         encoding='latin1',
                         error_bad_lines=False,
                         warn_bad_lines=False)

    def _load_column(column):
        # Decode every file listed in `column` and reshape it to (n_samples, 1)
        return [
            decode_audio(os.path.join(base_path, path)).reshape((-1, 1))
            for path in df[column]
        ]

    x1 = _load_column(sample_1)
    x2 = _load_column(sample_2)
    y = np.array(list(df['label']))

    return (x1, x2), y
Пример #2
0
def load_data_raw(base_path,
                  trials_path,
                  n_pairs=10,
                  sample_rate=16000,
                  n_seconds=3,
                  print_interval=100):
    """
    Function to load raw paired audio samples for verification
    :param base_path:       Base path joined to every audio path listed in the trials file
    :param trials_path:     Path to the space-delimited list of trial pairs (target, path_1, path_2)
    :param n_pairs:         Number of pairs to be loaded; a value <= 0 loads every listed pair
    :param sample_rate:     Target sample rate handed to the audio decoder
    :param n_seconds:       Max number of seconds of an audio sample (currently not used by this function)
    :param print_interval:  Number of pairs between two progress prints (verbosity)
    :return:                (list of audio samples, list of audio samples), labels
    """
    trials = pd.read_csv(trials_path,
                         names=['target', 'path_1', 'path_2'],
                         delimiter=' ')

    limit = n_pairs if n_pairs > 0 else len(trials['target'])
    y = trials['target'].values[:limit]
    first_paths = trials['path_1'].values[:limit]
    second_paths = trials['path_2'].values[:limit]
    total = len(y)

    x1, x2 = [], []
    for done, (rel_1, rel_2) in enumerate(zip(first_paths, second_paths), start=1):
        if done % print_interval == 0:
            print('\r> pair %5.0f / %5.0f' % (done, total), end='')

        # Each decoded signal is stored as a column vector of shape (n_samples, 1)
        for bucket, rel_path in ((x1, rel_1), (x2, rel_2)):
            decoded = decode_audio(os.path.join(base_path, rel_path),
                                   tgt_sample_rate=sample_rate)
            bucket.append(decoded.reshape((-1, 1)))

    return (x1, x2), y
Пример #3
0
def main():
    """
    Command line entry point that compares the numpy and tensorflow filterbanks of one audio file.
    It prints the shape and value range of both outputs and saves a two-row comparison plot
    to ./tests/filterbanks_comparison.png
    """
    parser = argparse.ArgumentParser(
        description='Filterbanks functionality testing')

    # Parameters ('dest' and the 'store' action are argparse defaults for these options)
    parser.add_argument(
        '--audio_path',
        type=str,
        default='/beegfs/mm10572/voxceleb1/test/id10281/Yw8v8055uPc/00001.wav',
        help='Audio path')
    parser.add_argument('--sample_rate',
                        type=int,
                        default=16000,
                        help='Sample rate audio')

    args = parser.parse_args()

    print('Parameters summary')
    for summary in ('Audio path: {}'.format(args.audio_path),
                    'Sample rate: {}'.format(args.sample_rate)):
        print('>', summary)

    print('Compute spectrum')
    decoded = decode_audio(args.audio_path)
    xt = decoded.astype(np.float32)
    print('> signal:', xt.shape)

    # Only the first of the three values returned by the numpy implementation is used
    np_spectrum, _, _ = get_np_filterbanks(xt, args.sample_rate)
    print('> numpy spectrum:', np_spectrum.shape, np_spectrum.min(),
          np_spectrum.max())

    @tf.function
    def forward(signal):
        return get_tf_filterbanks(signal)

    # The tensorflow path is fed a (1, n_samples, 1) array
    batched_signal = xt.reshape((1, -1, 1))
    tf_banks = np.squeeze(forward(batched_signal).numpy())
    print('> tensorflow filterbanks:', tf_banks.shape, tf_banks.min(),
          tf_banks.max())

    print('Saving filterbanks comparison plot')

    fig, axes = plt.subplots(2, 1)
    fig.set_size_inches((16, 8))

    panels = ((np_spectrum, 'Numpy spectrum'),
              (tf_banks, 'Tensorflow filterbanks'))
    for axis, (image, title) in zip(axes, panels):
        axis.matshow(image, aspect="auto")
        axis.set_title(title)

    plt.savefig('./tests/filterbanks_comparison.png')
Пример #4
0
def data_pipeline_generator_verifier(x,
                                     y,
                                     classes,
                                     sample_rate=16000,
                                     n_seconds=2,
                                     input_format='aud',
                                     num_fft=512,
                                     spec_len=250):
    """
    Function to simulate a (signal, label) generator for training a verifier
    :param x:            List of audio paths
    :param y:            List of users' labels
    :param classes:      Number of target classes
    :param sample_rate:  Sample rate of the audio files to be processed
    :param n_seconds:    Number of seconds of each yielded audio slice; files that are not longer are zero-padded
    :param input_format: 'aud' yields the raw slice with a trailing axis, 'spec' yields the output of
                         get_tf_spectrum2 with a trailing axis, 'bank' yields the squeezed output of
                         get_tf_filterbanks; any other value yields the raw 1-D slice
    :param num_fft:      Value passed as n_fft to get_tf_spectrum2 ('spec' format only)
    :param spec_len:     Value passed as spec_len to get_tf_spectrum2 ('spec' format only)
    :return:             signal, one-hot label
    """
    n_samples = sample_rate * n_seconds

    indexes = list(range(len(x)))
    print(len(x))
    random.shuffle(indexes)

    for index in indexes:
        audio = decode_audio(x[index], tgt_sample_rate=sample_rate)
        if len(audio) > n_samples:
            # Long enough: take a random window of n_samples
            start_sample = random.choice(range(len(audio) - n_samples))
        else:
            # Too short (or exact): right-pad with zeros up to n_samples
            bucket = np.zeros(n_samples - len(audio))
            audio = np.concatenate([audio, bucket])
            start_sample = 0
        end_sample = start_sample + n_samples

        signal = audio[start_sample:end_sample]
        if input_format == 'spec':
            spectrogram = get_tf_spectrum2(signal, spec_len=spec_len, n_fft=num_fft)
            signal = np.expand_dims(spectrogram, axis=2)
        elif input_format == 'bank':
            # The filterbank helper is fed a float32 array of shape (1, n_samples, 1)
            signal = np.float32(np.expand_dims(signal, axis=[0, 2]))
            signal = np.squeeze(get_tf_filterbanks(signal))
        elif input_format == 'aud':
            signal = np.expand_dims(signal, axis=1)

        label = y[index]
        yield signal, tf.keras.utils.to_categorical(label, num_classes=classes, dtype='float32')

    # The generator ends by falling off the end: an explicit `raise StopIteration()`
    # is converted into RuntimeError inside generators since Python 3.7 (PEP 479).
Пример #5
0
def data_pipeline_generator_gan(x, slice_len, sample_rate=16000):
    """
    Function to simulate a signal generator for training a gan
    :param x:           List of audio paths
    :param slice_len:   Length (in samples) of each yielded audio slice; files that are not longer are zero-padded
    :param sample_rate: Sample rate of the audio files to be processed
    :return:            (signal) reshaped to (1, slice_len, 1)
    """
    indexes = list(range(len(x)))
    random.shuffle(indexes)

    for index in indexes:
        audio = decode_audio(x[index], tgt_sample_rate=sample_rate)
        if len(audio) > slice_len:
            # Long enough: take a random window of slice_len samples
            start_sample = random.choice(range(len(audio) - slice_len))
        else:
            # random.choice on an empty range raises IndexError, so audio that is
            # not longer than the slice is right-padded with zeros instead
            # (assumes decode_audio returns a 1-D numpy array - TODO confirm)
            padding = np.zeros(slice_len - len(audio), dtype=audio.dtype)
            audio = np.concatenate([audio, padding])
            start_sample = 0
        end_sample = start_sample + slice_len
        yield audio[start_sample:end_sample].reshape((1, -1, 1))

    # The generator ends by falling off the end: an explicit `raise StopIteration()`
    # is converted into RuntimeError inside generators since Python 3.7 (PEP 479).
Пример #6
0
def data_pipeline_generator_mv(x, sample_rate=16000, n_seconds=3):
    """
    Function to simulate a signal generator for training a master voice vocoder
    :param x:           List of audio paths
    :param sample_rate: Sample rate of the audio files to be processed
    :param n_seconds:   Number of seconds of each yielded audio slice; files that are not longer are zero-padded
    :return:            (Signal) reshaped to (1, sample_rate * n_seconds, 1)
    """
    n_samples = sample_rate * n_seconds

    indexes = list(range(len(x)))
    random.shuffle(indexes)

    for index in indexes:
        audio = decode_audio(x[index], tgt_sample_rate=sample_rate)
        if len(audio) > n_samples:
            # Long enough: take a random window of n_samples
            start_sample = random.choice(range(len(audio) - n_samples))
        else:
            # random.choice on an empty range raises IndexError, so audio that is
            # not longer than the slice is right-padded with zeros instead
            # (assumes decode_audio returns a 1-D numpy array - TODO confirm)
            padding = np.zeros(n_samples - len(audio), dtype=audio.dtype)
            audio = np.concatenate([audio, padding])
            start_sample = 0
        end_sample = start_sample + n_samples
        yield audio[start_sample:end_sample].reshape((1, -1, 1))

    # The generator ends by falling off the end: an explicit `raise StopIteration()`
    # is converted into RuntimeError inside generators since Python 3.7 (PEP 479).
Пример #7
0
def main():
    """
    Command line entry point that runs one audio file through the playback-and-recording simulation.
    It loads and caches the noise data found under ./data/vs_noise_data, applies play_n_rec with the
    requested speaker / room / microphone flags, prints a summary of the signals, and saves a
    comparison plot plus both audio files under ./tests
    """
    parser = argparse.ArgumentParser(
        description='Playback functionality testing')

    # Parameters ('dest' and the 'store' action are argparse defaults for these options)
    parser.add_argument(
        '--audio_path',
        type=str,
        default='/beegfs/mm10572/voxceleb1/test/id10281/Yw8v8055uPc/00001.wav',
        help='Audio path')
    parser.add_argument('--sample_rate',
                        type=int,
                        default=16000,
                        help='Sample rate audio')
    flag_options = (('--speaker_flag', 'Speaker flag'),
                    ('--room_flag', 'Room flag'),
                    ('--microphone_flag', 'Microphone flag'))
    for option, description in flag_options:
        parser.add_argument(option,
                            type=int,
                            default=0,
                            choices=[0, 1],
                            help=description)

    args = parser.parse_args()
    impulse_flags = [args.speaker_flag, args.room_flag, args.microphone_flag]

    print('Parameters summary')
    print('>', 'Audio path: {}'.format(args.audio_path))
    print('>', 'Sample rate: {}'.format(args.sample_rate))
    for (_, description), flag in zip(flag_options, impulse_flags):
        print('>', '{}: {}'.format(description, flag))

    print('Load impulse response paths')
    noise_paths = load_noise_paths('./data/vs_noise_data')

    print('Cache impulse response data')
    noise_cache = cache_noise_data(noise_paths, sample_rate=args.sample_rate)

    print('Noise samples')
    for title, noise_kind in (('Speaker', 'speaker'),
                              ('Room', 'room'),
                              ('Microphone', 'microphone')):
        # Show the shape of the first cached sample of each noise kind
        first_path = noise_paths[noise_kind][0]
        print(title, noise_cache[first_path].shape)

    print('Compute playback & recording')
    decoded = decode_audio(args.audio_path)
    xt = decoded.reshape((1, -1, 1)).astype(np.float32)
    xn = np.array(impulse_flags, dtype=np.float32).reshape(1, -1)

    print('> signal:', xt.shape)
    print('> impulse_flags:', xn.shape)

    @tf.function
    def forward(signal, impulse_flags):
        return play_n_rec((signal, impulse_flags),
                          noises=noise_paths,
                          cache=noise_cache,
                          noise_strength='random')

    xf = forward(xt, xn).numpy()

    print('> playback signal:', xf.shape)

    print('> data flow in the model:')
    reports = (
        ('>>> original audio: {} -> [{:.2f}, {:.2f}] // {:.1f} s', xt),
        ('>>> p&rec audio: {} -> [{:.2f}, {:.2f}] // {:.1f} s', xf),
    )
    for template, wave in reports:
        print(template.format(wave.shape, wave.min(), wave.max(),
                              wave.size / args.sample_rate))

    print('Saving playback comparison plot')

    fig, axes = plt.subplots(2, 1)
    fig.set_size_inches((16, 8))

    for axis, wave, title in zip(axes, (xt, xf),
                                 ('Speech sample', 'Playback sample')):
        axis.plot(wave.ravel())
        axis.set_title(title)

    plt.savefig('./tests/playback_comparison.png')

    print('Saving original and playback audio samples')
    for out_path, wave in (('./tests/original_audio.wav', xt),
                           ('./tests/playback_audio.wav', xf)):
        sf.write(out_path, np.squeeze(wave), args.sample_rate)
Пример #8
0
def load_mv_data(mv_analysis_path,
                 mv_base_path,
                 audio_meta,
                 sample_rate=16000,
                 n_seconds=3,
                 n_templates=10):
    """
    Function to load data for master voice impersonation
    :param mv_analysis_path:    File path to master voice analysis metadata (read with np.load; keys 'x_test' and 'y_test')
    :param mv_base_path:        Base path of the dataset from which master-voice-used audio samples are retrieved
    :param audio_meta:          Path to the space-delimited file with gender information (columns 'id' and 'gender')
    :param sample_rate:         Sample rate of the audio files to be processed
    :param n_seconds:           Max number of seconds of an audio sample to be processed
    :param n_templates:         Number of audio samples per user to be loaded
    :return:                    (list of audio samples, list of class indexes, list of male class indexes, list of female class indexes)
    """
    print('Loading master voice data')

    mv_analysis_data = np.load(mv_analysis_path)
    mv_paths = [
        os.path.join(mv_base_path, path) for path in mv_analysis_data['x_test']
    ]
    mv_labels = mv_analysis_data['y_test']

    # Computed once: the original re-ran np.unique on every loop iteration
    n_users = len(np.unique(mv_labels))
    print('> found', len(mv_paths), 'paths from', n_users, 'users')

    data_set_df = pd.read_csv(audio_meta, delimiter=' ')
    gender_map = dict(
        zip(data_set_df['id'].values, data_set_df['gender'].values))

    x_mv_test, y_mv_test, male_x_mv_test, female_x_mv_test = [], [], [], []
    samples_per_user = len(mv_paths) // n_users
    max_samples = sample_rate * n_seconds

    for class_index in range(n_users):
        # NOTE(review): the slicing assumes mv_paths is grouped by user with the
        # same number of paths per user - confirm against the analysis file
        first = class_index * samples_per_user
        class_paths = random.sample(mv_paths[first:first + samples_per_user],
                                    n_templates)

        for path in class_paths:
            audio = decode_audio(path.replace('.m4a', '.wav'),
                                 tgt_sample_rate=sample_rate)
            # Column vector truncated to at most n_seconds of audio
            x_mv_test.append(audio.reshape((-1, 1))[:max_samples, :])
            y_mv_test.append(class_index)

        # The key looked up in the gender map is the third-to-last path component
        user_id = class_paths[0].split(os.path.sep)[-3]
        if gender_map[user_id] == 'm':
            male_x_mv_test.append(class_index)
        else:
            female_x_mv_test.append(class_index)

        print('\r> loaded', (class_index + 1) * n_templates,
              '/',
              n_users * n_templates,
              'audio files',
              end='')

    print()

    return x_mv_test, y_mv_test, male_x_mv_test, female_x_mv_test