def wavs_to_vec(wavs, iters=5000):
    """Compute a single speaker embedding vector from a list of wav file paths.

    Randomly samples `iters` sub-segments from the voiced portion of the input
    wavs (cycling through the files), extracts MFCC features for each segment,
    mean/std-normalizes the whole feature set, runs the embedding model, and
    returns the mean of the resulting per-frame embeddings.

    Args:
        wavs: iterable of wav file paths.
        iters: number of random segments to sample (default 5000).

    Returns:
        1-D numpy array: the mean embedding over all sampled frames.

    Raises:
        ValueError: if no segment yielded any features.
    """
    wavs = sorted(wavs)  # deterministic ordering of input files
    voice_audios = [get_voice_from_file(wav) for wav in wavs]
    features = []
    # Cycle through the audios, sampling one random sub-segment per iteration.
    for i in range(iters):
        voice_audio = voice_audios[i % len(voice_audios)]
        # Pick two random cut points; keep the signal between them.
        cuts = np.random.uniform(low=1, high=len(voice_audio), size=2)
        signal_to_process = voice_audio[int(min(cuts)):int(max(cuts))]
        features_for_single = get_mfcc_features_390(signal_to_process, c.AUDIO.SAMPLE_RATE, max_frames=None)
        if len(features_for_single) == 0:
            # Segment was too short to produce any MFCC frames; skip it.
            print(f'0 length features for {wavs[i % len(voice_audios)]}')
        else:
            features.append(features_for_single)
    # Guard: without this, np.vstack([]) below fails with an opaque error.
    if not features:
        raise ValueError('No features could be extracted from the given wavs.')
    # Global scalar mean/std normalization across all sampled feature matrices.
    mean = np.mean([np.mean(t) for t in features])
    std = np.mean([np.std(t) for t in features])
    features = normalize(features, mean, std)
    stacked_embeddings = model.predict(np.vstack(features))[0]
    logger.info('Checking that L2 norm is 1.')
    logger.info(np.mean(np.linalg.norm(stacked_embeddings, axis=1)))
    # Average per-frame embeddings into one speaker-level vector.
    embeddings = stacked_embeddings.mean(axis=0)
    return embeddings
def generate_features(audio_entities, max_count, progress_bar=False):
    """Sample `max_count` random sub-segments and return their MFCC features.

    Each iteration picks a random audio entity, cuts a random span out of its
    voice-only signal, and keeps the resulting feature matrix when non-empty.

    Args:
        audio_entities: sequence of dicts with an 'audio_voice_only' signal.
        max_count: number of random segments to sample.
        progress_bar: when True, wrap the loop in a tqdm progress bar.

    Returns:
        list of feature matrices (one per non-empty sampled segment).
    """
    iterations = range(max_count)
    if progress_bar:
        from tqdm import tqdm
        iterations = tqdm(iterations)
    collected = []
    for _ in iterations:
        entity = np.random.choice(audio_entities)
        signal = entity['audio_voice_only']
        # Two random cut points delimit the segment to featurize.
        bounds = np.random.uniform(low=1, high=len(signal), size=2)
        segment = signal[int(bounds.min()):int(bounds.max())]
        feats = get_mfcc_features_390(segment, c.AUDIO.SAMPLE_RATE, max_frames=None)
        if len(feats) > 0:  # drop segments too short to yield any frames
            collected.append(feats)
    return collected
def get_feat_from_audio(audio_reader, sr, norm_data, speaker):
    """Extract MFCC features from an audio signal and normalize them using
    the given speaker's stored training mean and std.

    Args:
        audio_reader: audio signal passed straight to the feature extractor.
        sr: sample rate of the signal.
        norm_data: mapping of speaker -> {'mean_train': ..., 'std_train': ...}.
        speaker: key into norm_data selecting the normalization statistics.

    Returns:
        The normalized feature matrix.
    """
    raw = get_mfcc_features_390(audio_reader, sr, max_frames=None)
    stats = norm_data[speaker]
    return normalize(raw, stats['mean_train'], stats['std_train'])