def get_speaker_similarity_dict_and_wav_splits(file_name):
    print('Processing voices for file:', file_name)
    fpath = os.fspath(file_name)
    wav = preprocess_wav(fpath_or_wav=fpath)

    speaker_names = ['Phreak', 'Other']
    segments = [[0, 25], [75, 90]]
    encoder = VoiceEncoder('cpu')
    speaker_wavs = [
        wav[int(s[0] * sampling_rate):int(s[1] * sampling_rate)]
        for s in segments
    ]

    print("Running the continuous embedding on cpu, this might take a while...")
    _, cont_embeds, wav_splits = encoder.embed_utterance(wav,
                                                         return_partials=True,
                                                         rate=16)
    speaker_embeds = [
        encoder.embed_utterance(speaker_wav) for speaker_wav in speaker_wavs
    ]
    similarity_dict = {
        name: cont_embeds @ speaker_embed
        for name, speaker_embed in zip(speaker_names, speaker_embeds)
    }
    return similarity_dict, wav_splits
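# Example usage (a minimal sketch, not part of the original snippet): turn the
# similarity dict returned above into a per-slice speaker label. Assumes the
# resemblyzer imports used by the function (preprocess_wav, VoiceEncoder,
# sampling_rate) and a hypothetical input file 'interview.wav'.
if __name__ == '__main__':
    similarity_dict, wav_splits = get_speaker_similarity_dict_and_wav_splits('interview.wav')
    for i, split in enumerate(wav_splits):
        # Pick the reference speaker with the highest similarity for this slice.
        best_name = max(similarity_dict, key=lambda name: similarity_dict[name][i])
        print('%.2fs - %.2fs: %s' % (split.start / sampling_rate,
                                     split.stop / sampling_rate, best_name))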
class VoiceIDSys:
    def __init__(self):
        # A user dictionary with name:centroid
        self.users = {}
        self.encoder = VoiceEncoder()
        sd.default.device = 'Microphone (GENERAL WEBCAM)'

    def save_sounds(self, duration=10, fs=44100):
        print('recording now!')
        sample = sd.rec(int(duration * fs), samplerate=fs, channels=2)
        sd.wait()
        print('recording done!')
        path = 'output.wav'
        write(path, fs, sample)
        return sample, path

    def generate_voice_profile(self, data_path):
        embeds = []
        original_dir = os.getcwd()
        os.chdir(data_path)
        for file in os.listdir('.'):
            fpath = Path(os.getcwd() + '\\' + file)
            wav = preprocess_wav(fpath)
            embed = self.encoder.embed_utterance(wav)
            embeds.append(embed)
        centroid = np.array(embeds).mean(axis=0)
        # Return to the original working directory
        os.chdir(original_dir)
        return centroid

    def add_user(self, name, centroid):
        self.users[name] = centroid

    def id_subject(self, voicepath, th=0.45):
        fpath = Path(voicepath)
        wav = preprocess_wav(fpath)
        embedding = self.encoder.embed_utterance(wav)
        diff_list = []
        for centroid in self.users.values():
            diff_list.append(centroid - embedding)
        norms = np.linalg.norm(diff_list, axis=1)
        print(norms)
        m = min(norms)
        if m > th:
            print('Unauthorized')
            print(m)
            return False
        else:
            user = list(self.users.keys())[np.argmin(norms)]
            print('Hello', user, '!')
            return True
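# Example usage of VoiceIDSys (a minimal sketch under assumptions: the
# enrollment folder 'data\\alice' is hypothetical, the class joins paths
# Windows-style, and the microphone device set in __init__ must exist).
if __name__ == '__main__':
    system = VoiceIDSys()
    centroid = system.generate_voice_profile('data\\alice')  # enroll from a folder of wavs
    system.add_user('Alice', centroid)
    _, recording_path = system.save_sounds(duration=5)        # record a probe utterance
    system.id_subject(recording_path)                         # prints a greeting or 'Unauthorized'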
def load_speaker_embeds(args):
    encoder = VoiceEncoder()
    speakers_dir = '{0}/{1}/{2}/'.format(args.media, args.name, args.speakers)
    speaker_embeds_list = []
    if os.path.exists(speakers_dir):
        speakers_dir_subfolders = [
            f.path for f in os.scandir(speakers_dir) if f.is_dir()
        ]
        for speakers_dir_subfolder in speakers_dir_subfolders:
            speaker_embeds = []
            wav_file_list = list(
                enumerate(glob.glob("{}/*.wav".format(speakers_dir_subfolder))))
            for index, wav_file in wav_file_list:
                wav = AudioSegment.from_wav(wav_file)
                librosa_npy = audiosegment_to_librosawav(wav)
                librosa_wav = preprocess_wav(librosa_npy)
                current_embed = encoder.embed_utterance(librosa_wav)
                speaker_embeds.append(current_embed)
            if len(speaker_embeds) > 0:
                dirname = os.path.basename(speakers_dir_subfolder)
                speaker_embeds_list.append((dirname, speaker_embeds))
    return speaker_embeds_list
def several_speakers_identification(self,
                                    path,
                                    min_duration=3,
                                    return_splits=False,
                                    export=False,
                                    recognition=False,
                                    language='en-En'):
    self.min_duration = min_duration
    self.path = path
    wav = preprocess_wav(path)
    sf.write(self.wav, wav, 16000, subtype='PCM_24')
    encoder = VoiceEncoder()
    _, embed, slices = encoder.embed_utterance(wav,
                                               return_partials=True,
                                               rate=1)
    np.set_printoptions(suppress=True)
    for i in range(len(embed)):
        self.add_speaker(embed[i])
    # for i in range(len(self.timing)):
    #     print(i, self.timing[i])
    self.clear()
    print('Found %d speakers' % self.speakers_number)
    for i in range(self.speakers_number):
        print('Speaker ' + str(i) + ': ' + str(len(self.speakers[i])) + 's')
    self.splits = self.get_splits()
    if recognition or export:
        paths = ExportAudio.export(self.splits, self.wav)
        if recognition:
            self.recognize_audio(language, paths, export)
    if return_splits:
        return self.speakers_number, self.splits
    return self.speakers_number
def load_data(from_path=None, ckpt_path=None, data_path=None, save_path=None):
    if from_path is None:
        if ckpt_path is None:
            raise Exception('No checkpoint path provided')
        from resemblyzer import preprocess_wav, VoiceEncoder
        from tqdm import tqdm

        device = torch.device('cuda')
        encoder = VoiceEncoder(device=device, loss_device=device)
        encoder.load_ckpt(ckpt_path, device=device)
        encoder.eval()

        wav_fpaths = list(Path(data_path).glob("**/*.flac"))

        # Preprocess and save encoded utterance and label to list
        X = []
        y = []
        for wav_fpath in tqdm(wav_fpaths):
            wav = preprocess_wav(wav_fpath)
            X.append(encoder.embed_utterance(wav).cpu().numpy())
            y.append(wav_fpath.parent.parent.stem)

        # Save for testing
        if save_path is not None:
            np.save(Path(save_path, 'embeds.npy'), X)
            np.save(Path(save_path, 'labels.npy'), y)
        else:
            raise Exception('No save_path provided')
    else:
        X = np.load(Path(from_path, 'embeds.npy'), allow_pickle=True)
        y = np.load(Path(from_path, 'labels.npy'), allow_pickle=True)
    return X, y
def isolate_voice(audio_file_path: Path, embed_path: Path, params_path: Path,
                  output_path: Path):
    """
    Load the speaker embeds from a pickle file and keep a second of audio only
    if its similarity to a speaker exceeds the cutoff threshold; if both
    speakers exceed it, the second goes to the more similar one.

    Args:
        audio_file_path: complete input wav file from which Rick's voice will be isolated
        embed_path: pickle file holding the reference speaker embeddings
        params_path: params file providing "cutoff_threshold" and "wav_bitrate"
        output_path: directory in which rick.wav and morty.wav are written
    """
    params = load_params(params_path)
    cutoff_threshold = params["cutoff_threshold"]
    sampling_rate = params["wav_bitrate"]

    print("preprocessing")
    file_wav = preprocess_wav(audio_file_path)
    print("input file shape ", file_wav.shape, "\n", file_wav[:10])
    print("file preprocessed")

    encoder = VoiceEncoder("cpu")
    print("model loaded")

    speaker_names = ["Rick", "Morty"]
    _, file_embeds, wav_splits = encoder.embed_utterance(file_wav,
                                                         return_partials=True,
                                                         rate=1)
    print("file encoded")

    speaker_embeds = pickle.load(open(embed_path, "rb"))
    similarity_dict = {
        name: file_embeds @ speaker_embed
        for name, speaker_embed in zip(speaker_names, speaker_embeds)
    }
    print("similarity dict is\n", similarity_dict)
    pickle.dump(similarity_dict, open("./similarity.pkl", "wb"))

    # Find the speaker with the greater similarity above the cutoff, then
    # append that second of audio to that speaker's file.
    current_second = 0
    rick_wav = []
    rick_seconds = []
    morty_wav = []
    morty_seconds = []
    for rick_value, morty_value in zip(similarity_dict["Rick"],
                                       similarity_dict["Morty"]):
        print(current_second, rick_value, morty_value)
        if rick_value > morty_value and rick_value > cutoff_threshold:
            rick_wav.append(file_wav[current_second * sampling_rate:
                                     (current_second + 1) * sampling_rate])
            rick_seconds.append(current_second)
            print("append rick")
        elif morty_value > rick_value and morty_value > cutoff_threshold:
            morty_wav.append(file_wav[current_second * sampling_rate:
                                      (current_second + 1) * sampling_rate])
            morty_seconds.append(current_second)
            print("append morty")
        else:
            print("skipping")
        current_second += 1

    rick_wav = [item for sublist in rick_wav for item in sublist]
    morty_wav = [item for sublist in morty_wav for item in sublist]
    save_wav(np.array(rick_wav), output_path.joinpath("rick.wav"), sampling_rate)
    save_wav(np.array(morty_wav), output_path.joinpath("morty.wav"), sampling_rate)
    return rick_seconds, morty_seconds
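# Example invocation of isolate_voice (a sketch; all paths below are
# hypothetical, and the params file is assumed to contain the
# "cutoff_threshold" and "wav_bitrate" keys that load_params() reads above).
if __name__ == '__main__':
    rick_secs, morty_secs = isolate_voice(
        audio_file_path=Path('episode.wav'),
        embed_path=Path('speaker_embeds.pkl'),
        params_path=Path('params.yaml'),
        output_path=Path('out'),
    )
    print('Rick speaks during seconds:', rick_secs)
    print('Morty speaks during seconds:', morty_secs)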
def process(wav_fpath):
    wav = preprocess_wav(wav_fpath)
    encoder = VoiceEncoder("cpu")
    _, cont_embeds, wav_splits = encoder.embed_utterance(wav,
                                                         return_partials=True,
                                                         rate=16)
    # Output denoised wave after removing the pauses
    write('DenoisedInputFiles/DenoisedSignal.wav', 16000, wav)
    return cont_embeds, wav_splits
def oneDictorIdentification(self, cSample, mainFile):
    print('[-i] --> Identify Dictor')
    avg1 = 0.0
    avg2 = 0.0

    fpath = Path(cSample)
    wav = preprocess_wav(fpath)
    encoder = VoiceEncoder()
    embed = encoder.embed_utterance(wav)
    np.set_printoptions(precision=3, suppress=True)
    embedNew = []
    for i in embed:
        if i != 0.0:
            embedNew.append(i)
    for s in embedNew:
        avg1 = avg1 + s

    fpath = Path(mainFile)
    wav = preprocess_wav(fpath)
    encoder = VoiceEncoder()
    embed = encoder.embed_utterance(wav)
    np.set_printoptions(precision=3, suppress=True)
    embedNew2 = []
    for i in embed:
        if i != 0.0:
            embedNew2.append(i)
    for s in embedNew2:
        avg2 = avg2 + s

    self.result = abs((avg2 / len(embedNew2)) - (avg1 / len(embedNew)))
    print(self.result)
    if self.result < 0.002:
        print("Match!")
        # print("\033[33m\033[1m {}".format("Match!"))
        return 1
    else:
        print("These are different voices")
        # print("\033[33m\033[1m {}".format("These are different voices"))
        return 0
class Voice:
    def __init__(self):
        self.encoder = VoiceEncoder()
        self.database = {}

    def add_data(self, v, name):
        '''
        v: wav file path
        name: str
        '''
        if name in self.database:
            print("person already exists")
            return False
        wav = preprocess_wav(Path(v))
        self.database[name] = self.encoder.embed_utterance(wav)
        return True

    def voice_com(self, v1):
        '''
        v1: wav file path
        return True if speaker in database
        '''
        if len(self.database.values()) == 0:
            print("Your data not in our database.")
            return False
        wav = preprocess_wav(Path(v1))

        # ## method 1
        # embed1 = self.encoder.embed_speaker(wav1)
        # embed2 = self.encoder.embed_speaker(wav2)
        # sims1 = np.inner(embed1, embed2)  # bigger 0.85

        ## method 2
        embed = self.encoder.embed_utterance(wav)
        for dk in self.database.keys():
            sims = embed @ self.database[dk]  # bigger 0.75
            if sims > 0.75:
                print("welcome {}!".format(dk))
                return True
        print("Your data not in our database.")
        return False
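# Example usage of the Voice class (a minimal sketch; the wav paths are
# hypothetical and the 0.75 acceptance threshold is the one hard-coded above).
if __name__ == '__main__':
    db = Voice()
    db.add_data('enroll/alice.wav', 'alice')
    db.add_data('enroll/bob.wav', 'bob')
    db.voice_com('probe/unknown.wav')  # prints "welcome <name>!" or a rejection message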
def vectorExtract(audio1_path, audio2_path):
    wav1 = preprocess_wav(Path(audio1_path))
    wav2 = preprocess_wav(Path(audio2_path))
    encoder = VoiceEncoder()
    embed1 = encoder.embed_utterance(wav1)
    embed2 = encoder.embed_utterance(wav2)
    return numpy.concatenate([embed1, embed2])
def process(wav_fpath):
    wav = preprocess_wav(wav_fpath)
    encoder = VoiceEncoder()
    _, cont_embeds, wav_splits = encoder.embed_utterance(wav,
                                                         return_partials=True,
                                                         rate=16)
    # Output denoised wave after removing the pauses
    write('Denoise/Denoise_commercial_mono.wav', 16000, wav)
    return cont_embeds, wav_splits
def simVoice(audio1_path, audio2_path):
    wav1 = preprocess_wav(Path(audio1_path))
    wav2 = preprocess_wav(Path(audio2_path))
    encoder = VoiceEncoder()
    embed1 = encoder.embed_utterance(wav1)
    embed2 = encoder.embed_utterance(wav2)
    # Cosine similarity; equivalent to np.inner(embed1, embed2)
    return dot(embed1, embed2) / (norm(embed1) * norm(embed2))
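# Example usage of simVoice (a sketch): the score is a cosine similarity in
# [-1, 1]. The 0.75 acceptance threshold below is an assumption for
# illustration, not taken from the original code, and the wav paths are
# hypothetical.
if __name__ == '__main__':
    score = simVoice('speaker_a.wav', 'speaker_b.wav')
    print('similarity: %.3f ->' % score,
          'same speaker' if score > 0.75 else 'different speakers')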
def extract(data_dirs, output_dir):
    """Extract embedding by resemblyzer."""
    encoder = VoiceEncoder()
    data = {}
    for data_dir in tqdm(data_dirs, position=0):
        file_list = librosa.util.find_files(data_dir)
        for file_path in tqdm(file_list, position=1, leave=False):
            wav = preprocess_wav(file_path)
            embedding = encoder.embed_utterance(wav)
            wav_name = splitext(basename(file_path))[0]
            data[wav_name] = embedding
    joblib.dump(data, f"{output_dir}.pkl")
def get_spk_emb(audio_file_dir, segment_len=960000):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    resemblyzer_encoder = VoiceEncoder(device=device)
    wav = preprocess_wav(audio_file_dir)
    l = len(wav) // segment_len  # segment_len = 16000 * 60
    l = np.max([1, l])
    all_embeds = []
    for i in range(l):
        mean_embeds, cont_embeds, wav_splits = resemblyzer_encoder.embed_utterance(
            wav[segment_len * i:segment_len * (i + 1)],
            return_partials=True,
            rate=2)
        all_embeds.append(mean_embeds)
    all_embeds = np.array(all_embeds)
    mean_embed = np.mean(all_embeds, axis=0)
    return mean_embed, all_embeds
def get_top_similar(file, top):
    top_names = []
    wav = preprocess_wav(file)
    encoder = VoiceEncoder()
    embed = encoder.embed_utterance(wav)
    cs = cosine_similarity(X=embeded_voices, Y=embed.reshape(1, -1))
    cs_sorted = np.argsort(cs, axis=0)[::-1][:, 0]
    top_similarity = cs[cs_sorted[:top]]
    top_ids = ids[cs_sorted[:top]]
    for each in top_ids:
        top_names.append(mydict[each])
    return top_names, top_similarity, top_ids
class VQVCModel(BaseModel):
    def __init__(self, params):
        super().__init__(params)
        self.encoder = Encoder(params)
        self.decoder = Decoder(params)
        self.speaker_encoder = VoiceEncoder()
        self.freeze(self.speaker_encoder)

    def forward(self, wavs, mels):
        emb = self._make_speaker_vectors(wavs, mels.device)
        q_afters, diff = self.encoder(mels)
        dec = self.decoder(q_afters, emb)
        return dec, diff

    def inference(self, src_path: str, tgt_path: str):
        wav_src, wav_tgt, mel_src = self._preprocess(src_path, tgt_path)
        emb = self._make_speaker_vectors([wav_tgt], mel_src.device)
        q_afters, _ = self.encoder(mel_src)
        dec = self.decoder(q_afters, emb)
        wav = self._mel_to_wav(dec)
        return wav

    def _make_speaker_vectors(self, wavs, device):
        c = [self.speaker_encoder.embed_utterance(x) for x in wavs]
        c = torch.tensor(c, dtype=torch.float, device=device)
        return c

    def _preprocess(self, src_path: str, tgt_path: str):
        wav_src, mel_src = get_wav_mel(src_path)
        wav_tgt, _ = get_wav_mel(tgt_path)
        mel_src = self._preprocess_mel(mel_src)
        return wav_src, wav_tgt, mel_src

    def _preprocess_mel(self, mel):
        if self.is_normalize:
            mel = normalize(mel)
        mel = self._adjust_length(mel, freq=4)
        mel = self.unsqueeze_for_input(mel)
        return mel
def extract_audio_embedding(paths, dest_path):
    """
    Extract an audio embedding from each wav file in paths.

    :param paths: list of paths to wav audio files
    :param dest_path: output path in which to save the df.pickle file
    :return:
    """
    # Define the encoder for audio. This is a demanding task, but CUDA does not
    # support multiprocessing from Python.
    encoder = VoiceEncoder()
    for path in tqdm(paths):
        filename = os.path.basename(path).split('.')[0]
        # Extract wav features
        wav = preprocess_wav(path)
        # Actually create the audio embedding
        embed = encoder.embed_utterance(wav)
        df = pd.DataFrame(columns=['filename', 'audio_embedding'])
        # Put info inside the dataframe. Keep the name as "filename.mp4" so we
        # stay compatible with the video embeddings.
        df.loc[0] = [filename + '.mp4', embed]
        df.to_pickle(os.path.join(dest_path, filename + '.csv'))
def extract(data_dirs, output_dir):
    """Extract embedding by resemblyzer."""
    encoder = VoiceEncoder()
    os.makedirs(output_dir, exist_ok=True)
    for data_dir in tqdm(data_dirs, position=0):
        speaker_list = [
            speaker_name for speaker_name in os.listdir(data_dir)
            if os.path.isdir(join_path(data_dir, speaker_name))
        ]
        for speaker_name in tqdm(speaker_list, position=1, leave=False):
            data = []
            file_list = librosa.util.find_files(join_path(data_dir, speaker_name))
            for file_path in tqdm(file_list, position=2, leave=False):
                wav = preprocess_wav(file_path)
                embedding = encoder.embed_utterance(wav)
                wav_name = splitext(basename(file_path))[0]
                data.append({"filename": wav_name, "embedding": embedding})
            if len(data) == 0:
                continue
            joblib.dump(data, join_path(output_dir, f"{speaker_name}.pkl"))
class Predictor():
    def __init__(self,
                 clf_ckpt_path='exp/clv/mlp/mlp_best_val_loss.pt',
                 enc_ckpt_path='ckpt/pretrained.pt',
                 device=torch.device('cuda'),
                 num_class=381,
                 verbose=False):
        start = timer()
        self.encoder = VoiceEncoder(device=device, loss_device=device)
        self.encoder.load_ckpt(enc_ckpt_path, device)
        self.encoder.eval()
        self.classifier = MLP(num_class=num_class)
        self.classifier.load_ckpt(clf_ckpt_path, device)
        self.classifier.eval()
        if verbose:
            print(f'Encoder and classifier models loaded successfully in {timer() - start}s')

    def preprocess(self, f):
        """
        Applies preprocessing operations to a waveform either on disk or in
        memory, so that the waveform is resampled to match the data hyperparameters.

        :param f: either a filepath to an audio file or the waveform as a numpy array of floats.
        """
        return preprocess_wav(f)

    def predict(self, audio, topk=2):
        """
        Predict the top_k classes with the highest probabilities.

        :param audio: preprocessed waveform.
        :param topk: keep the topk classes with the highest probabilities.
        """
        embed = self.encoder.embed_utterance(audio)
        inp = embed.unsqueeze(dim=0)
        top_probs, top_classes = self.classifier.predict(inp, topk=topk)
        return top_probs, top_classes
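# Example usage of Predictor (a sketch; it relies on the default checkpoint
# paths above, 'probe.wav' is a hypothetical input file, and passing a CPU
# device is an assumption for machines without CUDA).
if __name__ == '__main__':
    predictor = Predictor(device=torch.device('cpu'), verbose=True)
    wav = predictor.preprocess('probe.wav')
    probs, classes = predictor.predict(wav, topk=2)
    print('top classes:', classes, 'with probabilities:', probs)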
def get_speaker_segments(args, audio_file, segments_file):
    if os.path.exists(segments_file):
        with open(segments_file) as json_file:
            json_data = json.load(json_file)
            return [(
                monologue["speaker"]["id"],
                monologue["start"] * 1000,
                monologue["end"] * 1000,
            ) for monologue in json_data["monologues"]]
    else:
        from resemblyzer import preprocess_wav, VoiceEncoder

        encoder = VoiceEncoder()
        speaker_embeds = []
        segments = get_segments(args, audio_file, segments_file)
        speaker_segments = []
        for start, end in segments:
            clip = audio_file[start:end]
            segment_npy = audiosegment_to_librosawav(clip)
            segment_wav = preprocess_wav(segment_npy)
            current_embed = encoder.embed_utterance(segment_wav)
            is_any_similar = False
            min_similarity = 0.85
            name_id = len(speaker_embeds)
            for index, speaker_embed in enumerate(speaker_embeds):
                similarity = current_embed @ speaker_embed
                if similarity > min_similarity:
                    min_similarity = similarity
                    name_id = index
                    is_any_similar = True
            if not is_any_similar:
                speaker_embeds.append(current_embed)
            speaker_segments.append((name_id, [start, end]))
        return speaker_segments
def clustering(wav):
    encoder = VoiceEncoder("cpu")
    _, cont_embeds, wav_splits = encoder.embed_utterance(
        wav, return_partials=True, rate=16)  # create d-vectors

    clusterer = SpectralClusterer(min_clusters=2,
                                  max_clusters=100,
                                  p_percentile=0.90,
                                  gaussian_blur_sigma=1)
    labels = clusterer.predict(cont_embeds)

    times = [((s.start + s.stop) / 2) / sampling_rate for s in wav_splits]
    labelling = []
    start_time = 0
    for i, time in enumerate(times):
        if i > 0 and labels[i] != labels[i - 1]:
            temp = ['speaker ' + str(labels[i - 1]), start_time, time]
            labelling.append(tuple(temp))
            start_time = time
        if i == len(times) - 1:
            temp = ['speaker ' + str(labels[i]), start_time, time]
            labelling.append(tuple(temp))
    return labelling  # list of (speaker, start, end) tuples
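# Example usage of clustering (a sketch): print the diarization segments it
# returns. Assumes the resemblyzer preprocess_wav / sampling_rate imports used
# above and a hypothetical recording 'meeting.wav'.
if __name__ == '__main__':
    wav = preprocess_wav('meeting.wav')
    for speaker, start, end in clustering(wav):
        print('%s: %.1fs - %.1fs' % (speaker, start, end))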
def get_embedding_GE2E(filename):
    wav, _ = librosa.load(str(filename), sr=22050)
    encoder = VoiceEncoder(device="cpu")
    emb = encoder.embed_utterance(wav)
    return emb
speaker_wavs = [
    wav[int(s[0] * sampling_rate):int(s[1] * sampling_rate)] for s in segments
]

## Compare speaker embeds to the continuous embedding of the interview
# Derive a continuous embedding of the interview. We put a rate of 16, meaning that an
# embedding is generated every 0.0625 seconds. It is good to have a higher rate for speaker
# diarization, but it is not so useful for when you only need a summary embedding of the
# entire utterance. A rate of 2 would have been enough, but 16 is nice for the sake of the
# demonstration.
# We'll exceptionally force to run this on CPU, because it uses a lot of RAM and most GPUs
# won't have enough. There's a speed drawback, but it remains reasonable.
encoder = VoiceEncoder("cpu")
print("Running the continuous embedding on cpu, this might take a while...")
_, cont_embeds, wav_splits = encoder.embed_utterance(wav,
                                                     return_partials=True,
                                                     rate=16)

# Get the continuous similarity for every speaker. It amounts to a dot product between the
# embedding of the speaker and the continuous embedding of the interview
speaker_embeds = [
    encoder.embed_utterance(speaker_wav) for speaker_wav in speaker_wavs
]
similarity_dict = {
    name: cont_embeds @ speaker_embed
    for name, speaker_embed in zip(speaker_names, speaker_embeds)
}

## Run the interactive demo
interactive_diarization(similarity_dict, wav, wav_splits)
# We'll use a smaller version of the dataset LibriSpeech test-other to run our examples. This
# smaller dataset contains 10 speakers with 10 utterances each. N.B. "wav" in variable names stands
# for "waveform" and not the wav file extension.
wav_fpaths = list(Path("audio_data", "librispeech_test-other").glob("**/*.flac"))

# Group the wavs per speaker and load them using the preprocessing function provided with
# resemblyzer to load wavs in memory. It normalizes the volume, trims long silences and resamples
# the wav to the correct sampling rate.
speaker_wavs = {speaker: list(map(preprocess_wav, wav_fpaths)) for speaker, wav_fpaths in
                groupby(tqdm(wav_fpaths, "Preprocessing wavs", len(wav_fpaths), unit="wavs"),
                        lambda wav_fpath: wav_fpath.parent.stem)}


## Similarity between two utterances from each speaker
# Embed two utterances A and B for each speaker
embeds_a = np.array([encoder.embed_utterance(wavs[0]) for wavs in speaker_wavs.values()])
embeds_b = np.array([encoder.embed_utterance(wavs[1]) for wavs in speaker_wavs.values()])

# Each array is of shape (num_speakers, embed_size) which should be (10, 256) if you haven't
# changed anything.
print("Shape of embeddings: %s" % str(embeds_a.shape))

# Compute the similarity matrix. The similarity of two embeddings is simply their dot
# product, because the similarity metric is the cosine similarity and the embeddings are
# already L2-normed.
# Short version:
utt_sim_matrix = np.inner(embeds_a, embeds_b)
# Long, detailed version:
utt_sim_matrix2 = np.zeros((len(embeds_a), len(embeds_b)))
for i in range(len(embeds_a)):
    for j in range(len(embeds_b)):
        # The @ notation is exactly equivalent to np.dot(embeds_a[i], embeds_b[j])
        utt_sim_matrix2[i, j] = embeds_a[i] @ embeds_b[j]
# from matplotlib import cm
# from time import sleep, perf_counter as timer
# from umap import UMAP
# import matplotlib.pyplot as plt
sys.path.append("Resemblyzer")
from resemblyzer import preprocess_wav, VoiceEncoder, sampling_rate  # noqa

# %%
# Load file
wav = preprocess_wav("Resemblyzer/audio_data/X2zqiX6yL3I.mp3")

# %%
# Audio features
encoder = VoiceEncoder("cpu")
_, cont_embeds, wav_splits = encoder.embed_utterance(wav,
                                                     return_partials=True,
                                                     rate=5)

# %%
# Load UIS-RNN model
sys.argv = ['dummy']
model_args, training_args, inference_args = uisrnn.parse_arguments()
model = uisrnn.UISRNN(model_args)
model.load('uis-rnn/saved_model.uisrnn')

# %%
# Testing
test_sequence = cont_embeds.astype(float)
predictions = model.predict(test_sequence, inference_args)

# %%
class AutoVCModel(BaseModel):
    def __init__(self, params):
        super().__init__(params)
        self.encoder = Encoder(params.model.dim_neck, params.speaker_emb_dim,
                               params.model.freq)
        self.decoder = Decoder(params.model.dim_neck, params.speaker_emb_dim,
                               params.model.dim_pre)
        self.postnet = Postnet()
        self.style_encoder = VoiceEncoder()
        self.freeze(self.style_encoder)

    def forward(self, wavs, mels):
        c_src = self._make_speaker_vectors(wavs, mels.size(-1), mels.device)
        codes, mel_outputs, mel_outputs_postnet = self._forward(mels, c_src)
        return (
            mel_outputs,  # decoder output
            mel_outputs_postnet,  # postnet output
            torch.cat(codes, dim=-1),  # encoder output
            torch.cat(self.encoder(mel_outputs_postnet, c_src), dim=-1)  # encoder output using postnet output
        )

    def inference(self, src_path: str, tgt_path: str):
        wav_src, wav_tgt, mel_src = self._preprocess(src_path, tgt_path)
        c_src = self._make_speaker_vectors([wav_src], mel_src.size(-1), mel_src.device)
        c_tgt = self._make_speaker_vectors([wav_tgt], mel_src.size(-1), mel_src.device)
        _, _, mel_outputs_postnet = self._forward(mel_src, c_src, c_tgt)
        wav = self._mel_to_wav(mel_outputs_postnet)
        return wav

    def _forward(self, mels, c_src, c_tgt=None):
        codes = self.encoder(mels, c_src)

        # almost equivalent to torch.nn.functional.interpolate
        code_exp = torch.cat([
            c.unsqueeze(-1).expand(-1, -1, mels.size(-1) // len(codes))
            for c in codes
        ], dim=-1)

        # (Batch, Mel-bin, Time) => (Batch, Time, Mel-bin) for LSTM
        decoder_input = torch.cat(
            (code_exp, c_src if c_tgt is None else c_tgt), dim=1).transpose(1, 2)

        mel_outputs = self.decoder(decoder_input)
        mel_outputs_postnet = self.postnet(mel_outputs)
        mel_outputs_postnet = mel_outputs + mel_outputs_postnet
        return codes, mel_outputs, mel_outputs_postnet

    def _make_speaker_vectors(self, wavs, time_size, device):
        c = [self.style_encoder.embed_utterance(x) for x in wavs]
        c = torch.tensor(c, dtype=torch.float, device=device)
        c = c[:, :, None].expand(-1, -1, time_size)
        return c

    def _preprocess(self, src_path: str, tgt_path: str):
        wav_src, mel_src = get_wav_mel(src_path)
        wav_tgt, _ = get_wav_mel(tgt_path)
        mel_src = self._preprocess_mel(mel_src)
        return wav_src, wav_tgt, mel_src

    def _preprocess_mel(self, mel):
        if self.is_normalize:
            mel = normalize(mel)
        mel = self._adjust_length(mel, self.freq)
        mel = self.unsqueeze_for_input(mel)
        return mel
def audio_analyze():
    delimiter = utils.get_delimiter()
    data_directory = utils.get_data_dir()
    audio_directory = data_directory + delimiter + 'audio'
    AUDIO_FILE = data_directory + delimiter + 'students-output-audio.wav'

    myaudio = AudioSegment.from_file(AUDIO_FILE, "wav")
    chunk_length_ms = 10000  # pydub calculates in milliseconds
    chunks = make_chunks(myaudio, chunk_length_ms)  # Make chunks from the audio file

    # Remove old chunks
    for filename in glob.glob(audio_directory + delimiter + 'chunk*'):
        os.remove(filename)

    # Create new chunks
    for i, chunk in enumerate(chunks):
        chunk_name = audio_directory + delimiter + "chunk{0}.wav".format(i)
        chunk.export(chunk_name, format="wav")

    with open(data_directory + delimiter + 'audio_wpm_csv.csv', mode='w',
              newline='') as audio_wpm_csv:
        audio_wpm_csv_writer = csv.writer(audio_wpm_csv,
                                          delimiter=',',
                                          quotechar='"',
                                          quoting=csv.QUOTE_MINIMAL)
        audio_wpm_csv_writer.writerow(['Second', 'WPM', 'Text'])

    ## ------------------ resemblyzer ------------------ ##
    ## Open source project from https://github.com/resemble-ai/Resemblyzer
    ## Get reference audios
    wav = preprocess_wav(AUDIO_FILE)

    # Cut some segments from single speakers as reference audio.
    # Speaker times are in seconds [beginning, end].
    # Can diarize multiple speakers (e.g. students and professor).
    # Segments and speaker names are ordered to match.
    segments = [[1, 5]]
    speaker_names = ["Professor"]
    # This assumes the speaker portion was appended to the beginning of the audio
    # file, but it could also be passed in a separate file.
    speaker_wavs = [wav[int(s[0] * 16000):int(s[1] * 16000)] for s in segments]

    # A rate of 16 = an embedding every 0.0625 seconds. A higher rate is better
    # for speaker diarization, but not so useful when you only need a summary.
    # Forcing this on CPU, because it uses a lot of RAM and most GPUs won't have
    # enough. There's a speed drawback, but it remains reasonable.
    # The rate also determines how many floats will populate the array.
    # For example, a 14-second audio file would give ~239 readings at 0.0625 s.
    encoder = VoiceEncoder("cpu")
    logger.info("Continuous embedding running on cpu...")
    _, cont_embeds, wav_splits = encoder.embed_utterance(wav,
                                                         return_partials=True,
                                                         rate=16)

    # Get the continuous similarity for every speaker. This is a dot product between
    # the embedding of the speaker and the continuous embedding of the whole audio file.
    speaker_embeds = [
        encoder.embed_utterance(speaker_wav) for speaker_wav in speaker_wavs
    ]
    similarity_dict = {
        name: cont_embeds @ speaker_embed
        for name, speaker_embed in zip(speaker_names, speaker_embeds)
    }

    # Produce 'audio_diarize_csv.csv'
    diarize_file = data_directory + delimiter + 'audio_diarize_csv.csv'
    populate_speaker(diarize_file, similarity_dict)

    ## Run the interactive demo
    # interactive_diarization(similarity_dict, wav, wav_splits)
    ## ------------------ end resemblyzer ------------------ ##

    now = datetime.now()

    ############ Testing Sphinx ##########
    i = 0
    second_count = 10
    logger.info("Sphinx recognizer with chunks")
    for chunk in chunks:
        filename = audio_directory + delimiter + 'chunk' + str(i) + '.wav'
        logger.info("Processing chunk...")
        file = filename
        r = sr.Recognizer()
        with sr.AudioFile(file) as source:
            # r.adjust_for_ambient_noise(source)
            audio_listened = r.record(source)
            try:
                rec = r.recognize_sphinx(audio_listened)
                logger.info(rec)
                word_count = str(rec).split()
                with open(data_directory + delimiter + 'audio_wpm_csv.csv',
                          mode='a', newline='') as audio_wpm_csv:
                    audio_wpm_csv_writer = csv.writer(audio_wpm_csv,
                                                      delimiter=',',
                                                      quotechar='"',
                                                      quoting=csv.QUOTE_MINIMAL)
                    audio_wpm_csv_writer.writerow(
                        [str(second_count), len(word_count), rec])
            except sr.UnknownValueError:
                logger.info("Sphinx could not understand audio")
            except sr.RequestError as e:
                logger.info("Sphinx error")
            except:
                rec = r.recognize_sphinx(audio_listened, show_all=True)
                word_count = str(rec).split()
                with open(data_directory + delimiter + 'audio_wpm_csv.csv',
                          mode='a', newline='') as audio_wpm_csv:
                    audio_wpm_csv_writer = csv.writer(audio_wpm_csv,
                                                      delimiter=',',
                                                      quotechar='"',
                                                      quoting=csv.QUOTE_MINIMAL)
                    audio_wpm_csv_writer.writerow(
                        [str(second_count), len(word_count), rec])
        i += 1
        second_count += 10
    ######### End Sphinx test ##########

    # Print time delta without decimals
    process_duration = str(datetime.now() - now).split('.')[0]
    duration = 'Sphinx took ' + process_duration + ' to process'
    logger.info(duration)
wav_fpaths = list(
    Path("audio_data", "librispeech_test-other").glob("**/*.flac"))

# Group the wavs per speaker and load them using the preprocessing function provided with
# resemblyzer to load wavs in memory. It normalizes the volume, trims long silences and resamples
# the wav to the correct sampling rate.
speaker_wavs = {
    speaker: list(map(preprocess_wav, wav_fpaths))
    for speaker, wav_fpaths in groupby(
        tqdm(wav_fpaths, "Preprocessing wavs", len(wav_fpaths), unit="wavs"),
        lambda wav_fpath: wav_fpath.parent.stem)
}

## Similarity between two utterances from each speaker
# Embed two utterances A and B for each speaker
embeds_a = np.array(
    [encoder.embed_utterance(wavs[0]) for wavs in speaker_wavs.values()])
embeds_b = np.array(
    [encoder.embed_utterance(wavs[1]) for wavs in speaker_wavs.values()])

# Each array is of shape (num_speakers, embed_size) which should be (10, 256) if you haven't
# changed anything.
print("Shape of embeddings: %s" % str(embeds_a.shape))

# Compute the similarity matrix. The similarity of two embeddings is simply their dot
# product, because the similarity metric is the cosine similarity and the embeddings are
# already L2-normed.
# Short version:
utt_sim_matrix = np.inner(embeds_a, embeds_b)
# Long, detailed version:
utt_sim_matrix2 = np.zeros((len(embeds_a), len(embeds_b)))
for i in range(len(embeds_a)):
    for j in range(len(embeds_b)):
        # The @ notation is equivalent to np.dot(embeds_a[i], embeds_b[j])
        utt_sim_matrix2[i, j] = embeds_a[i] @ embeds_b[j]
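# Optional follow-up (a sketch, not part of the original excerpt): resemblyzer
# also exposes embed_speaker(), which averages utterance embeddings into one
# L2-normalized speaker embedding. Comparing speaker-level embeddings usually
# yields a cleaner similarity matrix than single utterances.
spk_embeds = np.array([
    encoder.embed_speaker(wavs[:len(wavs) // 2])
    for wavs in speaker_wavs.values()
])
spk_sim_matrix = np.inner(spk_embeds, spk_embeds)
print("Speaker-level similarity matrix shape: %s" % str(spk_sim_matrix.shape))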
basename = os.path.basename(wav_path).split('.wav')[0]
idx = people[int(basename[-7:-4])]
wav = audio.load_wav(wav_path)
wav = wav / np.abs(wav).max() * hparams.hparams.rescaling_max
mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T
tmp = mel_spectrogram
result = np.zeros((WAV_LEN, 80))
result[:min(tmp.shape[0], WAV_LEN), :tmp.shape[1]] = tmp[:min(tmp.shape[0], WAV_LEN), :tmp.shape[1]]
mels[idx].append(result)

obj = preprocess_wav(wav_path)
emb = encoder.embed_utterance(obj)
style_list[idx].append(emb)

with open(os.path.join(write_path, 'data.pkl'), 'wb') as handle:
    pickle.dump(mels, handle)
print("finish 'data.pkl' !!!")
'''
for idx in style_list:
    for s in style_list[idx]:
        for i in range(256):
            style[idx][i] += s[i]
    for i in range(256):
        style[idx][i] = style[idx][i] / len(style_list[idx])
'''
import os
from resemblyzer import VoiceEncoder, preprocess_wav
from pathlib import Path
import numpy as np
from sklearn.linear_model import Perceptron

os.chdir('C:\\Users\\nrdas\\Downloads\\voiceID\\data\\nitish')
embeds = []
encoder = VoiceEncoder()
for file in os.listdir('.'):
    fpath = Path(os.getcwd() + '\\' + file)
    wav = preprocess_wav(fpath)
    embed = encoder.embed_utterance(wav)
    embeds.append(embed)

embeds2 = []
os.chdir('C:\\Users\\nrdas\\Downloads\\voiceID\\data\\unauthorized')
for file in os.listdir('.'):
    fpath = Path(os.getcwd() + '\\' + file)
    wav = preprocess_wav(fpath)
    embed = encoder.embed_utterance(wav)
    embeds2.append(embed)

centroid = np.array(embeds).mean(axis=0)
diff_dists = embeds2 - centroid
sim_dists = embeds - centroid
sim_dists_norm = np.linalg.norm(sim_dists, axis=1)
diff_dists_norm = np.linalg.norm(diff_dists, axis=1)
print(sim_dists_norm)
print(diff_dists_norm)