# Example no. 1
def wavs_to_vec(wavs, iters=5000):
    """Compute a single speaker-embedding vector from a list of wav file paths.

    Repeatedly (``iters`` times) picks the next audio (round-robin over the
    sorted file list), crops a random sub-segment from it, extracts MFCC
    features, normalizes them globally, runs them through ``model``, and
    averages the per-frame embeddings into one fixed-size vector.

    :param wavs: list of wav file paths (must be non-empty).
    :param iters: number of random crops to sample across all files.
    :return: 1-D numpy array — the mean of the stacked frame embeddings.
    :raises ValueError: if ``wavs`` is empty or no crop produced any features.
    """
    if not wavs:
        # Previously this failed later with ZeroDivisionError on the modulo.
        raise ValueError('wavs must contain at least one file path')
    wavs = sorted(wavs)
    voice_audios = [get_voice_from_file(wav) for wav in wavs]

    features = []
    for i in range(iters):
        # Round-robin over the audios so every file contributes crops.
        voice_audio = voice_audios[i % len(voice_audios)]

        # Two uniform cut points define a random sub-segment of the signal.
        cuts = np.random.uniform(low=1, high=len(voice_audio), size=2)
        signal_to_process = voice_audio[int(min(cuts)):int(max(cuts))]
        features_for_single = get_mfcc_features_390(signal_to_process,
                                                    c.AUDIO.SAMPLE_RATE,
                                                    max_frames=None)
        if len(features_for_single) == 0:
            # Crop was too short to yield any frames; skip it.
            print(f'0 length features for {wavs[i % len(voice_audios)]}')
        else:
            features.append(features_for_single)

    if not features:
        # Previously this surfaced as an obscure np.vstack error.
        raise ValueError('no usable features could be extracted from the given wavs')

    # Normalize with a global mean/std computed across all feature blocks.
    mean = np.mean([np.mean(t) for t in features])
    std = np.mean([np.std(t) for t in features])
    features = normalize(features, mean, std)

    stacked_embeddings = model.predict(np.vstack(features))[0]

    logger.info('Checking that L2 norm is 1.')
    logger.info(np.mean(np.linalg.norm(stacked_embeddings, axis=1)))

    # Collapse per-frame embeddings into a single speaker vector.
    embeddings = stacked_embeddings.mean(axis=0)
    return embeddings
# Example no. 2
def generate_features(audio_entities, max_count, progress_bar=False):
    """Extract MFCC features from ``max_count`` random crops of random audios.

    :param audio_entities: sequence of dicts carrying an 'audio_voice_only' signal.
    :param max_count: number of random crops to attempt.
    :param progress_bar: if True, wrap the loop in a tqdm progress bar.
    :return: list of feature arrays; crops that yield no frames are skipped.
    """
    extracted = []
    iterator = range(max_count)
    if progress_bar:
        from tqdm import tqdm
        iterator = tqdm(iterator)
    for _ in iterator:
        entity = np.random.choice(audio_entities)
        signal = entity['audio_voice_only']
        # Two uniform cut points pick a random sub-segment of the signal.
        bounds = np.random.uniform(low=1, high=len(signal), size=2)
        lo, hi = int(min(bounds)), int(max(bounds))
        feats = get_mfcc_features_390(signal[lo:hi],
                                      c.AUDIO.SAMPLE_RATE,
                                      max_frames=None)
        if len(feats) > 0:
            extracted.append(feats)
    return extracted
def get_feat_from_audio(audio_reader, sr, norm_data, speaker):
    """Extract MFCC features and normalize them with the speaker's training stats.

    :param audio_reader: audio signal/reader passed through to the MFCC extractor.
    :param sr: sample rate of the audio.
    :param norm_data: mapping of speaker -> dict with 'mean_train' and 'std_train'.
    :param speaker: key selecting which speaker's normalization stats to use.
    :return: normalized feature array.
    """
    stats = norm_data[speaker]
    raw_features = get_mfcc_features_390(audio_reader, sr, max_frames=None)
    return normalize(raw_features, stats['mean_train'], stats['std_train'])