# https://medium.com/saarthi-ai/who-spoke-when-build-your-own-speaker-diarization-module-from-scratch-e7d725ee279 from resemblyzer import preprocess_wav, VoiceEncoder from pathlib import Path # D Vector Clusteroring from spectralcluster import SpectralClusterer # give the file path to your audio file audio_file_path = '****************************************' wav_fpath = Path(audio_file_path) wav = preprocess_wav(wav_fpath) encoder = VoiceEncoder("cpu") _, cont_embeds, wav_splits = encoder.embed_utterance(wav, return_partials=True, rate=16) print(cont_embeds.shape) #################################################################################################### # D vector clustering clusterer = SpectralClusterer( min_clusters=2, max_clusters=100, p_percentile=0.90, gaussian_blur_sigma=1) labels = clusterer.predict(cont_embeds)
from data.wav_folder import read_wav
import os
from resemblyzer import preprocess_wav, VoiceEncoder
import numpy as np

# Folders holding the converted audio of the two models and the target speaker's recordings.
NCE_dir = '/com_space/zhaohang/CUT/checkpoints/voice_CUT_replicate_many_p256/converted_sound_independent_MM/'
cyclegan_dir = '/com_space/zhaohang/CUT/checkpoints/cyclegan_replicate_many_p256/converted_sound_independent_MM/'
target_dir = '/com_space/zhaohang/CUT/datasets/voice/p256/'
# Sample rate passed to read_wav for every file.
sampling_rate = 24000

encoder = VoiceEncoder()

# Load the converted waveforms of both models into two parallel lists.
# NOTE(review): zip() stops at the shorter directory listing and os.listdir() does not
# guarantee any order, so the two lists are only index-aligned if both folders list the
# same file names in the same order -- confirm this is intended.
NCE_list, cyclegan_list = [], []
for NCE_path, cyclegan_path in zip(os.listdir(NCE_dir), os.listdir(cyclegan_dir)):
    NCE_path = os.path.join(NCE_dir, NCE_path)
    cyclegan_path = os.path.join(cyclegan_dir, cyclegan_path)
    # read_wav returns a pair; only the first element is kept.
    NCE_wav, _ = read_wav(NCE_path, sr=sampling_rate)
    cyclegan_wav, _ = read_wav(cyclegan_path, sr=sampling_rate)
    NCE_list.append(NCE_wav)
    cyclegan_list.append(cyclegan_wav)

# Load every file of the target folder.
target_list = []
for target_path in os.listdir(target_dir):
    target_path = os.path.join(target_dir, target_path)
    target_wav, _ = read_wav(target_path, sr=sampling_rate)
    target_list.append(target_wav)
def load_model(root, device):
    """Load the pretrained Resemblyzer voice encoder.

    :param root: Not used by this loader.
        NOTE(review): presumably kept so the signature matches other model
        loaders -- confirm against the callers.
    :param device: Device to create the encoder on (a device name or a
        ``torch.device``); ``None`` lets ``VoiceEncoder`` choose by itself.
    :return: The constructed ``VoiceEncoder``.
    """
    # The requested device used to be silently ignored; hand it to the
    # constructor so the encoder is created where the caller asked for it.
    model = VoiceEncoder(device)
    return model
def __init__(self):
    """Create the voice encoder and start with an empty database."""
    # Encoder used by this object to embed audio.
    self.encoder = VoiceEncoder()
    # Starts empty; it is filled elsewhere.
    self.database = dict()
# take 6 segments of real speech as ground truth reference and compare those against the 12
# remaining. Those segments are selected at random, so will run into different results every time
# you run the script, but they should be more or less consistent.
# Using the voice of Donald Trump is merely a matter of convenience, as several fake speeches
# with his voice were already put up on youtube. This choice was not politically motivated.


## Load and preprocess the audio
data_dir = Path("audio_data", "donald_trump")
wav_fpaths = list(data_dir.glob("**/*.mp3"))
wavs = [preprocess_wav(wav_fpath) for wav_fpath in
        tqdm(wav_fpaths, "Preprocessing wavs", len(wav_fpaths), unit=" utterances")]

## Compute the embeddings
encoder = VoiceEncoder()
embeds = np.array([encoder.embed_utterance(wav) for wav in wavs])
# The parent directory name is the speaker label and the file stem is the utterance name.
speakers = np.array([fpath.parent.name for fpath in wav_fpaths])
names = np.array([fpath.stem for fpath in wav_fpaths])

# Take 6 real embeddings at random, and leave the 6 others for testing
gt_indices = np.random.choice(*np.where(speakers == "real"), 6, replace=False)
# The builtin ``bool`` is used because the ``np.bool`` alias was removed from NumPy (1.24).
mask = np.zeros(len(embeds), dtype=bool)
mask[gt_indices] = True
gt_embeds = embeds[mask]
gt_names = names[mask]
gt_speakers = speakers[mask]
# Everything that was not picked as ground truth becomes the test set.
embeds, speakers, names = embeds[~mask], speakers[~mask], names[~mask]
def encode_each_batch(source_dir, source_dir_pattern, embed_file_path):
    """Embed one batch of audio files and save the embeddings.

    :param source_dir: Passed to ``audio_paths`` to locate the audio files.
    :param source_dir_pattern: Passed to ``audio_paths`` together with ``source_dir``.
    :param embed_file_path: Destination handed to ``save_embeddings``.
    """
    # The model is created before the files are listed, as in the original order.
    voice_encoder = VoiceEncoder()
    batch_paths = audio_paths(source_dir, source_dir_pattern)
    file_count = len(batch_paths)
    print('Total number of files: {}'.format(file_count))
    # ``encoder`` takes the model through its ``vocoder`` keyword.
    batch_embeddings = encoder(file_paths=batch_paths, vocoder=voice_encoder)
    save_embeddings(embed_file_path, batch_embeddings, batch_paths)
from resemblyzer import preprocess_wav, VoiceEncoder from itertools import groupby from pathlib import Path import matplotlib.pyplot as plt import numpy as np # The neural network will automatically use CUDA if it's available on your machine, otherwise it # will use the CPU. You can enforce a device of your choice by passing its name as argument to the # constructor. The model might take a few seconds to load with CUDA, but it then executes very # quickly. encoder = VoiceEncoder() # We'll use a smaller version of the dataset LibriSpeech test-other to run our examples. This # smaller dataset contains 10 speakers with 10 utterances each. N.B. "wav" in variable names stands # for "waveform" and not the wav file extension. wav_fpaths = list(Path("librispeech_test-other").glob("**/*.flac")) # Group the wavs per speaker and load them using the preprocessing function provided with # resemblyzer to load wavs in memory. It normalizes the volume, trims long silences and resamples # the wav to the correct sampling rate. spearker_wavs = { speaker: list(map(preprocess_wav, wav_fpaths)) for speaker, wav_fpaths in groupby(wav_fpaths, lambda wav_fpath: wav_fpath.parent.stem) } def demo_similarity_matrix(): ## Similarity between two utterances from each speaker # Embed two utterances A and B for each speaker embeds_a = np.array(
def get_full_translation(audio_file_name):
    """Diarize an audio file, transcribe each speaker turn and render the result.

    The file is converted to wav when it is an mp3, split into speaker turns
    with Resemblyzer partial embeddings and spectral clustering, each turn is
    exported to ``audio_files/split/SplitAudio_<k>.wav`` and sent to Google
    speech recognition.

    :param audio_file_name: Path to the input audio file (mp3 or wav).
    :return: The rendered ``translation.html`` template, whose ``translation``
        variable is a list of ``"Speaker <label>: <text>"`` strings.
    """
    import os  # local import: the file's import block is outside this block

    split_dir = 'audio_files/split'

    # splitext only strips the final extension, so dots elsewhere in the path
    # (e.g. "./uploads/a.b.mp3") no longer confuse the conversion.
    base_name, extension = os.path.splitext(audio_file_name)
    wav_fpath = f"{base_name}.wav"
    if extension.lower() == '.mp3':
        # The conversion helper used to be defined but never called, so an mp3
        # input only worked when a matching wav already existed.
        AudioSegment.from_mp3(audio_file_name).export(wav_fpath, format="wav")

    # Load the audio through Resemblyzer's preprocessing helper.
    wav = preprocess_wav(wav_fpath)
    # Create the voice encoder on the CPU.
    encoder = VoiceEncoder("cpu")
    _, cont_embeds, wav_splits = encoder.embed_utterance(wav, return_partials=True, rate=16)
    # Cluster the partial embeddings: one label per partial window.
    clusterer = SpectralClusterer(min_clusters=2,
                                  max_clusters=100,
                                  p_percentile=0.90,
                                  gaussian_blur_sigma=1)
    labels = clusterer.predict(cont_embeds)

    def create_labelling(labels, wav_splits):
        """Merge consecutive equal labels into (label, start, end) tuples in seconds."""
        # The centre of every window is used as its timestamp.
        times = [((s.start + s.stop) / 2) / sampling_rate for s in wav_splits]
        labelling = []
        start_time = 0
        last_index = len(times) - 1
        for i, time in enumerate(times):
            if i > 0 and labels[i] != labels[i - 1]:
                # Speaker change: close the turn that just ended.
                labelling.append((str(labels[i - 1]), start_time, time))
                start_time = time
            if i == last_index:
                # Close the final turn.
                labelling.append((str(labels[i]), start_time, time))
        return labelling

    labelling = create_labelling(labels, wav_splits)
    print(labelling)

    def split_audio(file_path):
        """Export every speaker turn to its own wav file and return how many were written."""
        os.makedirs(split_dir, exist_ok=True)
        # Read the recording once instead of once per segment.
        recording = AudioSegment.from_wav(file_path)
        count = 0
        for _, start_s, end_s in labelling:
            # ``n = +1`` used to pin the counter at 1, so every segment
            # overwrote SplitAudio_1.wav.
            count += 1
            # Convert to milliseconds before truncating; ``int(x) * 1000``
            # rounded every boundary down to a whole second.
            start_ms = int(start_s * 1000)
            end_ms = int(end_s * 1000)
            recording[start_ms:end_ms].export(
                f'{split_dir}/SplitAudio_{count}.wav', format="wav")
        return count

    # Split the wav that was actually analysed (the input itself may be an mp3).
    n = split_audio(wav_fpath)

    def get_translation():
        """Transcribe every exported segment, tagged with its speaker label."""
        entire_text = []
        for num in range(n):
            r = sr.Recognizer()
            harvard = sr.AudioFile(f'{split_dir}/SplitAudio_{num + 1}.wav')
            with harvard as source:
                # NOTE(review): adjust_for_ambient_noise reads from the source
                # before record() does; check whether the start of each
                # segment is lost to the transcription.
                r.adjust_for_ambient_noise(source)
                r.enable_separate_recognition_per_channel = True
                audio = r.record(source)
            text = r.recognize_google(audio, language='en-US')
            # Tag the text with the diarization label of this segment; the
            # total segment count used to be printed instead.
            speaker = labelling[num][0]
            entire_text.append(f'Speaker {speaker}: ' + str(text))
        return entire_text

    translation = get_translation()
    return render_template('translation.html', translation=translation)
from resemblyzer import VoiceEncoder, preprocess_wav
from pathlib import Path
import numpy as np
import os

# Re-embed every wav under data_dir that is shorter than MIN_SECONDS after
# preprocessing: the waveform is repeated until it is long enough and the new
# embedding is written next to the wav as "<name>_embed.npy".
# NOTE(review): the original layout was lost; this assumes that only the
# re-processed (short) files are embedded and saved -- confirm.

data_dir = "/media/user/data/complete_synth/"
SAMPLE_RATE = 16000  # samples per second assumed for the preprocessed wav
MIN_SECONDS = 3.00   # shorter utterances are padded by repetition

encoder = VoiceEncoder()
counter = 0
end_this_shit = False  # not used in the visible code; kept for compatibility

for path, subdirs, files in os.walk(data_dir):
    for name in files:
        # Match the real extension instead of the last three characters.
        if not name.endswith(".wav"):
            continue
        full_file_path = os.path.join(path, name)
        wav = preprocess_wav(Path(full_file_path))
        if len(wav) == 0:
            # Doubling an empty waveform never reaches the minimum length, so
            # the padding loop below would spin forever; skip such files.
            print("Skipped (empty after preprocessing): " + full_file_path)
            continue
        should_reproces = False
        while len(wav) / SAMPLE_RATE < MIN_SECONDS:
            wav = np.concatenate((wav, wav))
            should_reproces = True
        if should_reproces:
            embed = encoder.embed_utterance(wav)
            counter += 1
            np.save(os.path.join(path, name[:-4] + "_embed.npy"), embed)
            print("Processed: " + str(counter))

#fpath = Path("/home/user/data/complete_synth/nst_synth/3468.wav")
#wav = preprocess_wav(fpath)
#encoder = VoiceEncoder()
from resemblyzer import preprocess_wav, VoiceEncoder from flask import Flask, request from pathlib import Path import numpy as np import os app = Flask(__name__) app.config['UPLOAD_FOLDER'] = "utils/" # wav_fpath1 = Path("utils/", "voice1.mp3") # wav_fpath2 = Path("utils/", "voice2.mp3") # wav2 = preprocess_wav(wav_fpath2) encoder = VoiceEncoder("cpu") @app.route("/verify", methods=["POST"]) def main(): file_var1 = request.files["audio1"] file_var2 = request.files["audio2"] print(file_var1) print(file_var2) file_extension1 = file_var1.filename.split(".")[-1] file_extension2 = file_var2.filename.split(".")[-1] if not file_extension1 in ["mp3", "wav"
def mp3_to_wav(audio_file_name):
    """Convert an mp3 file to a wav file stored next to it.

    :param audio_file_name: Path of the input audio file.
    :return: ``Path`` of the wav written for an mp3 input, otherwise the input
        path unchanged (as a ``Path``).
    """
    source_path = Path(audio_file_name)
    # Use the real suffix: ``split('.')[1]`` picked the wrong piece for paths
    # such as "./audio/clip.mp3" or "my.clip.mp3".
    if source_path.suffix.lower() != '.mp3':
        return source_path
    wav_path = source_path.with_suffix('.wav')
    sound = AudioSegment.from_mp3(str(source_path))
    sound.export(str(wav_path), format="wav")
    return wav_path


# Use the converted wav when the input was an mp3; the converted path used to
# be thrown away and the original file was loaded instead.
wav_fpath = mp3_to_wav(audio_file_path)

# Load the audio through Resemblyzer's preprocessing helper.
wav = preprocess_wav(wav_fpath)
# Create the voice encoder on the CPU.
encoder = VoiceEncoder("cpu")
_, cont_embeds, wav_splits = encoder.embed_utterance(wav, return_partials=True, rate=16)
# Cluster the partial embeddings.
clusterer = SpectralClusterer(min_clusters=2,
                              max_clusters=100,
                              p_percentile=0.90,
                              gaussian_blur_sigma=1)
# One speaker label per partial embedding.
labels = clusterer.predict(cont_embeds)
def __init__(self):
    """Load the VoiceEncoder model when the AudioFeaturizer is created."""
    #: VoiceEncoder: model used to obtain the d-vector of an audio record.
    self.encoder = VoiceEncoder()
class AudioFeaturizer():
    """AudioFeaturizer allows to get features from audio records.

    This class allows to read an audio record from a file or by URL, get
    features from the record and compare different sets of features to get
    their similarity.

    :param names_to_norm: Names of features that should be normalized.
    :type names_to_norm: list
    :param path_to_temp_file: File name of temporary saved audio record.
    :type path_to_temp_file: str
    :param feature_len: Default length of feature vector.
    :type feature_len: int
    :param d_weight: Weight for d-vector feature, which is used in process of comparison.
    :type d_weight: float
    :param lfcc_weight: Weight for lfcc feature, which is used in process of comparison.
    :type lfcc_weight: float
    :param pncc_weight: Weight for pncc feature, which is used in process of comparison.
    :type pncc_weight: float
    :param mfcc_weight: Weight for mfcc feature, which is used in process of comparison.
    :type mfcc_weight: float
    """

    names_to_norm = ['lfcc', 'mfcc', 'pncc']
    path_to_temp_file = './temp.wav'
    feature_len = 384
    d_weight = 4.0
    lfcc_weight = 1.2
    pncc_weight = 0.5
    mfcc_weight = 0.5

    def __init__(self):
        """At initialization level AudioFeaturizer loads VoiceEncoder model."""
        #: VoiceEncoder: VoiceEncoder that used to get d-vector from audio record.
        self.encoder = VoiceEncoder()

    def read_file(self, file_name, sample_rate=16000):
        """Read audio record as file.

        :param file_name: Path to the audio file.
        :type file_name: str
        :param sample_rate: Sample rate for audio record.
        :type sample_rate: int
        :return: Record and sample rate
        """
        record, sample_rate = librosa.load(file_name, sr=sample_rate)
        return record, sample_rate

    def read_file_by_url(self, url, sample_rate=16000):
        """Read audio record by URL.

        The content is downloaded into a uniquely named temporary wav file in
        the current directory, loaded, and the file is removed again.

        :param url: URL to the audio file.
        :type url: str
        :param sample_rate: Sample rate for audio record.
        :type sample_rate: int
        :return: Record and sample rate
        """
        # Close the HTTP response as soon as the payload has been read.
        with urlopen(url) as response:
            payload = response.read()
        temp_path = f'./{str(uuid.uuid4())}.wav'
        Path(temp_path).write_bytes(payload)
        try:
            record, sample_rate = librosa.load(temp_path, sr=sample_rate)
        finally:
            # Remove the temporary file even when loading fails.
            Path(temp_path).unlink()
        return record, sample_rate

    def norm_dim(self, features):
        """Reduce a 2-dimensional feature array to a 1-dimensional vector.

        The array is transposed and averaged along its first axis, which for
        an input of shape ``(a, b)`` gives a vector of length ``a``.

        :param features: Feature array.
        :type features: numpy.ndarray
        :return: Normalized features
        """
        return np.mean(features.T, axis=0)

    def get_d_vector(self, record, sample_rate):
        """Get d-vector feature from audio record.

        :param record: Record object to get feature from.
        :type record: object
        :param sample_rate: Sample rate for audio record (not used here).
        :type sample_rate: int
        :return: D-vector feature vector
        """
        wav = preprocess_wav(record)
        embed = self.encoder.embed_utterance(wav)
        # NOTE(review): this changes NumPy's global print options on every
        # call; kept because callers may rely on it.
        np.set_printoptions(precision=3, suppress=True)
        return embed

    def get_lfcc(self, record, sample_rate):
        """Get LFCC feature from audio record.

        `LFCC paper <http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.63.8029&rep=rep1&type=pdf>`_

        :param record: Record object to get feature from.
        :type record: object
        :param sample_rate: Sample rate for audio record.
        :type sample_rate: int
        :return: LFCC feature vector
        """
        return lfcc(record, fs=sample_rate)

    def get_mfcc(self, record, sample_rate):
        """Get MFCC feature from audio record.

        `MFCC paper <https://arxiv.org/pdf/1003.4083.pdf>`_

        :param record: Record object to get feature from.
        :type record: object
        :param sample_rate: Sample rate for audio record.
        :type sample_rate: int
        :return: MFCC feature vector
        """
        return mfcc(record, fs=sample_rate)

    def get_pncc(self, record, sample_rate):
        """Get PNCC feature from audio record.

        `PNCC paper <http://www.cs.cmu.edu/~robust/Papers/OnlinePNCC_V25.pdf>`_

        :param record: Record object to get feature from.
        :type record: object
        :param sample_rate: Sample rate for audio record.
        :type sample_rate: int
        :return: PNCC feature vector
        """
        return pncc(record, fs=sample_rate)

    def get_all_features(self, record, sample_rate=16000, normalize_dim=False):
        """Get all list of features from audio record.

        :param record: Record object to get feature from.
        :type record: object
        :param sample_rate: Sample rate for audio record.
        :type sample_rate: int
        :param normalize_dim: Normalize vectors or not.
        :type normalize_dim: bool
        :return: Dictionary of all features
        """
        all_features = {
            'lfcc': self.get_lfcc(record, sample_rate),
            'mfcc': self.get_mfcc(record, sample_rate),
            'pncc': self.get_pncc(record, sample_rate),
            'd_vector': self.get_d_vector(record, sample_rate)
        }
        if normalize_dim:
            # Only the features named in names_to_norm are reduced.
            for feature_name in all_features:
                if self.is_in_norm_list(feature_name):
                    all_features[feature_name] = self.norm_dim(all_features[feature_name])
        return all_features

    def _limit_features(self, all_features):
        """Cut every feature named in names_to_norm to feature_len items (in place)."""
        for feature_name in all_features:
            if feature_name in self.names_to_norm:
                all_features[feature_name] = all_features[feature_name][:self.feature_len]
        return all_features

    def get_all_features_limited(self, record, sample_rate=16000):
        """Get all list of features from audio record and normalize by default dimension and length.

        :param record: Record object to get feature from.
        :type record: object
        :param sample_rate: Sample rate for audio record.
        :type sample_rate: int
        :return: Dictionary of all features
        """
        all_features = self.get_all_features(record, sample_rate, normalize_dim=True)
        return self._limit_features(all_features)

    def get_all_features_mean_limited(self, record, sample_rate=16000, chunk_len=5):
        """Get features averaged over consecutive chunks of the record.

        The record is cut into full chunks of ``chunk_len`` seconds (a trailing
        partial chunk is ignored), the normalized features of every chunk are
        computed and averaged per feature, and the features named in
        ``names_to_norm`` are cut to ``feature_len`` items. A record shorter
        than one chunk is processed as a whole instead.

        :param record: Record object to get feature from.
        :type record: object
        :param sample_rate: Sample rate for audio record.
        :type sample_rate: int
        :param chunk_len: Length of one chunk in seconds.
        :type chunk_len: int
        :return: Dictionary of all features
        """
        chunks_amount = math.floor(len(record)/sample_rate/chunk_len)
        if chunks_amount == 0:
            # Nothing to average: use the whole record.
            return self.get_all_features_limited(record, sample_rate)

        chunk_size = sample_rate*chunk_len
        features_sets = {}
        for i in range(chunks_amount):
            record_chunk = record[i*chunk_size:(i+1)*chunk_size]
            chunk_all_features = self.get_all_features(record_chunk, sample_rate, normalize_dim=True)
            for feature_name, feature in chunk_all_features.items():
                features_sets.setdefault(feature_name, []).append(feature)

        # The per-chunk means are the result. They used to be overwritten by
        # the features of the whole record right after being computed.
        all_features = {
            feature_name: np.mean(chunk_features, axis=0)
            for feature_name, chunk_features in features_sets.items()
        }
        return self._limit_features(all_features)

    def is_in_norm_list(self, feature_name):
        """Check feature name in default normalization list.

        :param feature_name: Name of the feature.
        :type feature_name: str
        :return: True or False for feature name in default normalize list.
        """
        return feature_name in self.names_to_norm

    def visualize_spectrogram(self, record, sample_rate):
        """Build plot of spectrogram for audio record.

        :param record: Record object.
        :type record: object
        :param sample_rate: Sample rate for audio record.
        :type sample_rate: int
        """
        vis.spectogram(record, sample_rate)

    def visualize_features(self, features, feature_index='feature', frame_index='frame index', normalize_dim=False):
        """Build plot of features for audio record.

        :param features: Feature vector.
        :type features: list
        :param feature_index: Feature name index.
        :type feature_index: str
        :param frame_index: Frame name index.
        :type frame_index: str
        :param normalize_dim: Normalize vectors or not.
        :type normalize_dim: bool
        """
        if normalize_dim:
            vis.plot(features, feature_index, frame_index)
        else:
            vis.visualize_features(features, feature_index, frame_index)

    def cosine_similarity(self, x, y):
        """Compare two feature lists by cosine distance.

        :param x: First feature vector.
        :type x: list
        :param y: Second feature vector.
        :type y: list
        :return: One minus the cosine distance between the two vectors.
        """
        return 1 - spatial.distance.cosine(x, y)

    def compare_two_features_sets(self, features_1, features_2):
        """Compare two feature sets.

        :param features_1: First feature set.
        :type features_1: dict
        :param features_2: Second feature set.
        :type features_2: dict
        :return: Float number which shows similarity between two feature sets
            (weighted sum of the per-feature cosine similarities).
        """
        d_sim = self.cosine_similarity(features_1['d_vector'], features_2['d_vector'])
        lfcc_sim = self.cosine_similarity(features_1['lfcc'], features_2['lfcc'])
        pncc_sim = self.cosine_similarity(features_1['pncc'], features_2['pncc'])
        mfcc_sim = self.cosine_similarity(features_1['mfcc'], features_2['mfcc'])
        sum_sim = (d_sim*self.d_weight + lfcc_sim*self.lfcc_weight
                   + pncc_sim*self.pncc_weight + mfcc_sim*self.mfcc_weight)
        # TODO: scale sim based on weights
        return sum_sim

    def features_to_json_serializable(self, all_features):
        """Format feature set to json-serializable.

        The given dictionary is modified in place and returned.

        :param all_features: Feature set.
        :type all_features: dict
        :return: Json-serializable feature set dictionary.
        """
        for feature_name in all_features:
            all_features[feature_name] = all_features[feature_name].tolist()
        return all_features
class AutoVCAlphaModel(BaseModel):
    """Voice-conversion model: content encoder, decoder and postnet, conditioned
    on speaker vectors produced by a frozen ``VoiceEncoder``."""

    def __init__(self, params):
        """Build the sub-networks from ``params`` and freeze the style encoder."""
        super().__init__(params)
        self.encoder = ContentEncoder(params)
        self.decoder = Decoder(params)
        self.postnet = PostNet(params)
        self.style_encoder = VoiceEncoder()
        self.freeze(self.style_encoder)

    def forward(self, wavs, mels):
        """Reconstruct ``mels`` conditioned on the speakers of ``wavs``.

        :return: Tuple ``(mel_outputs, mel_outputs_postnet, codes,
            codes of the postnet output, q_loss)``.
        """
        time_size = mels.size(-1)
        speaker_src = self._make_speaker_vectors(wavs, time_size, mels.device)
        codes, mel_outputs, mel_outputs_postnet, q_loss = self._forward(mels, speaker_src)
        # Encode the reconstruction again; only the first element of the encoder output is returned.
        recon_codes = self.encoder(mel_outputs_postnet)[0]
        return mel_outputs, mel_outputs_postnet, codes, recon_codes, q_loss

    def inference(self, src_path: str, tgt_path: str):
        """Convert the audio at ``src_path`` using the speaker of ``tgt_path`` and return the wav."""
        wav_src, wav_tgt, mel_src = self._preprocess(src_path, tgt_path)
        time_size = mel_src.size(-1)
        device = mel_src.device
        speaker_src = self._make_speaker_vectors([wav_src], time_size, device)
        speaker_tgt = self._make_speaker_vectors([wav_tgt], time_size, device)
        outputs = self._forward(mel_src, speaker_src, speaker_tgt)
        # The third element is the postnet-refined mel.
        converted_mel = outputs[2]
        return self._mel_to_wav(converted_mel)

    def _forward(self, mels, c_src, c_tgt=None):
        """Encode ``mels`` and decode with the target speaker vectors (source ones if no target)."""
        codes, q_loss = self.encoder(mels)
        if c_tgt is None:
            speaker = c_src
        else:
            speaker = c_tgt
        decoder_input = torch.cat((codes, speaker), dim=1)
        mel_outputs = self.decoder(decoder_input)
        # The postnet output is added to the decoder output.
        residual = self.postnet(mel_outputs)
        mel_outputs_postnet = mel_outputs + residual
        return codes, mel_outputs, mel_outputs_postnet, q_loss

    def _make_speaker_vectors(self, wavs, time_size, device):
        """Embed every wav and expand the embeddings along a new last axis of size ``time_size``."""
        embeds = []
        for wav in wavs:
            embeds.append(self.style_encoder.embed_utterance(wav))
        vectors = torch.tensor(embeds, dtype=torch.float, device=device)
        return vectors.unsqueeze(-1).expand(-1, -1, time_size)

    def _preprocess(self, src_path: str, tgt_path: str):
        """Load both files; return the source wav, the target wav and the prepared source mel."""
        wav_src, mel_src = get_wav_mel(src_path)
        wav_tgt = get_wav_mel(tgt_path)[0]
        return wav_src, wav_tgt, self._preprocess_mel(mel_src)

    def _preprocess_mel(self, mel):
        """Normalize ``mel`` when ``is_normalize`` is set, then pass it through ``unsqueeze_for_input``."""
        if self.is_normalize:
            mel = normalize(mel)
        return self.unsqueeze_for_input(mel)
def fingerprint_from_waveform(wav):
    """Return the voice embedding (fingerprint) of a waveform.

    :param wav: Waveform passed to ``VoiceEncoder.embed_utterance``.
    :return: The value returned by ``embed_utterance`` for ``wav``.
    """
    # Build the encoder only once and keep it on the function object; creating
    # a new VoiceEncoder for every call repeats the model set-up each time.
    encoder = getattr(fingerprint_from_waveform, "_encoder", None)
    if encoder is None:
        encoder = VoiceEncoder()
        fingerprint_from_waveform._encoder = encoder
    return encoder.embed_utterance(wav)
wav = preprocess_wav(wav_fpath)

# Cut some segments from single speakers as reference audio
segments = [[0, 5.5], [6.5, 12], [17, 25]]
speaker_names = ["Kyle Gass", "Sean Evans", "Jack Black"]
# Convert both bounds to sample indices before truncating. The end bound used to be
# truncated to whole seconds first (int(5.5) * sampling_rate), which cut the first
# reference segment half a second short.
speaker_wavs = [wav[int(start * sampling_rate):int(end * sampling_rate)]
                for start, end in segments]


## Compare speaker embeds to the continuous embedding of the interview
# Derive a continuous embedding of the interview. We put a rate of 16, meaning that an
# embedding is generated every 0.0625 seconds. It is good to have a higher rate for speaker
# diarization, but it is not so useful for when you only need a summary embedding of the
# entire utterance. A rate of 2 would have been enough, but 16 is nice for the sake of the
# demonstration.
# We'll exceptionally force to run this on CPU, because it uses a lot of RAM and most GPUs
# won't have enough. There's a speed drawback, but it remains reasonable.
encoder = VoiceEncoder("cpu")
print("Running the continuous embedding on cpu, this might take a while...")
_, cont_embeds, wav_splits = encoder.embed_utterance(wav, return_partials=True, rate=16)

# Get the continuous similarity for every speaker. It amounts to a dot product between the
# embedding of the speaker and the continuous embedding of the interview
speaker_embeds = [encoder.embed_utterance(speaker_wav) for speaker_wav in speaker_wavs]
similarity_dict = {name: cont_embeds @ speaker_embed for name, speaker_embed in
                   zip(speaker_names, speaker_embeds)}


## Run the interactive demo
interactive_diarization(similarity_dict, wav, wav_splits)
def fingerprint_from_file(filepath, segment=None, sampling_rate=16000):
    """Return the voice embedding (fingerprint) of an audio file.

    :param filepath: Path of the audio file.
    :param segment: Optional ``(start, end)`` pair in seconds; when given,
        only that part of the preprocessed waveform is embedded.
    :param sampling_rate: Samples per second used to turn the segment bounds
        into sample indices.
    :return: The value returned by ``VoiceEncoder.embed_utterance``.
    """
    fpath = Path(filepath)
    wav = preprocess_wav(fpath)
    if segment:
        # Scale both bounds to samples before truncating; the end bound used
        # to be truncated to whole seconds first, dropping its fractional part.
        first_sample = int(segment[0] * sampling_rate)
        last_sample = int(segment[1] * sampling_rate)
        wav = wav[first_sample:last_sample]
    return VoiceEncoder().embed_utterance(wav)
def load_asv_model(device):
    """Create the Resemblyzer voice encoder on the requested device.

    :param device: Device name or ``torch.device`` handed to the
        ``VoiceEncoder`` constructor.
    :return: The constructed ``VoiceEncoder``.
    """
    # Pass the device to the constructor instead of calling ``.to(device)``
    # afterwards: the encoder keeps its own record of the device it was created
    # on, and moving only the weights leaves that record out of sync.
    model = VoiceEncoder(device)
    return model
import pickle
import glob
import numpy as np
from resemblyzer import VoiceEncoder
import os

# The encoder is explicitly created on the CPU.
encoder = VoiceEncoder('cpu')
# Maps a name to the list of embeddings collected for it (filled by preprocess()).
data = {}
# Maps every sub-directory name of `path` to a consecutive integer id.
classes = {}
i = 0
path = 'data'
for name in os.listdir(path):
    if os.path.isdir(os.path.join(path, name)):
        classes[name] = i
        i += 1


def preprocess(name, embedding):
    """Embed one item and store the result under ``name`` in the global ``data``.

    :param name: Key under which the embedding is collected.
    :param embedding: Input handed to ``encoder.embed_utterance``.
        NOTE(review): despite its name this is presumably a waveform -- confirm.
    """
    global data
    embedded = encoder.embed_utterance(embedding)
    # Stored as a single row of 256 values.
    embedded = np.array(embedded).reshape(1, 256)
    if name not in data:
        data[name] = [embedded]
    else:
        data[name].append(embedded)


# The part of the file name before the first underscore is used as the name.
for filepath in glob.glob('*.pickle'):
    name = filepath.split('_')[0]
    # NOTE(review): pickle.load executes arbitrary code from the file; only
    # load pickles from a trusted source.
    with open(filepath, 'rb') as handle:
        embedding = pickle.load(handle)
"%.1fGb total memory.\n" % (torch.cuda.device_count(), device_id, gpu_properties.name, gpu_properties.major, gpu_properties.minor, gpu_properties.total_memory / 1e9)) ## Load the models one by one. print("Preparing the synthesizer and the vocoder...") synthesizer = Synthesizer(synth_path.joinpath("taco_pretrained"), low_mem=False) vocoder.load_model(vocoder_path) print("Loading encoder from resemblyzer") encoder = VoiceEncoder() # Get the reference audio repo path speaker = 'SAM' repo_fpath = Path('../SOURCE_AUDIO',speaker) wav_fpaths = list(repo_fpath.glob(speaker+"*")) print('PAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATHS') print(repo_fpath) print(wav_fpaths) wavs = np.array(list(map(preprocess_wav, tqdm(wav_fpaths, "Preprocessing wavs", len(wav_fpaths))))) speaker_embedding = encoder.embed_speaker(wavs) text = str(np.loadtxt('../test_sentence.txt', dtype='str', delimiter = '&')) texts = [text]
from itertools import groupby
from pathlib import Path
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np


# The demos are ordered so as to make the explanations in the comments consistent. If you only
# care about running the code, then you don't have to follow that order.

# The neural network will automatically use CUDA if it's available on your machine, otherwise it
# will use the CPU. You can enforce a device of your choice by passing its name as argument to the
# constructor. The model might take a few seconds to load with CUDA, but it then executes very
# quickly.
encoder = VoiceEncoder()

# We'll use a smaller version of the dataset LibriSpeech test-other to run our examples. This
# smaller dataset contains 10 speakers with 10 utterances each. N.B. "wav" in variable names stands
# for "waveform" and not the wav file extension.
wav_fpaths = list(Path("audio_data", "librispeech_test-other").glob("**/*.flac"))

# Group the wavs per speaker and load them using the preprocessing function provided with
# resemblyzer to load wavs in memory. It normalizes the volume, trims long silences and resamples
# the wav to the correct sampling rate.
# The name of each file's parent directory is used as the speaker key.
# NOTE(review): itertools.groupby only merges consecutive items, so this relies on the glob
# listing all files of one directory next to each other -- confirm, or sort by the key first.
speaker_wavs = {speaker: list(map(preprocess_wav, wav_fpaths)) for speaker, wav_fpaths in
                groupby(tqdm(wav_fpaths, "Preprocessing wavs", len(wav_fpaths), unit="wavs"),
                        lambda wav_fpath: wav_fpath.parent.stem)}


## Similarity between two utterances from each speaker
# Embed two utterances A and B for each speaker
"--num", help="Num of speakers(optional)", default=None, type=int, ) parser.add_argument("--interactive", dest="interactive", action="store_true") parser.add_argument("--no-interactive", dest="interactive", action="store_false") parser.set_defaults(interactive=True) args = parser.parse_args() audio_path = Path(args.audio) wav = preprocess_wav(audio_path) encoder = VoiceEncoder() if args.rate <= 4 else VoiceEncoder("cpu") # encoder = VoiceEncoder("cpu") _, cond_emd, wav_splits = encoder.embed_utterance( wav, return_partials=True, rate=args.rate, ) clusterer = SpectralClusterer( min_clusters=args.num, p_percentile=0.91, ) labels = clusterer.predict(cond_emd)