Example #1
def convert_two(model, uttr_org, uttr_trg):
    spect_vc = []
    #spect_vc.append( ("uttr_org", uttr_org) )
    #spect_vc.append( ("uttr_trg", uttr_trg) )

    uttr_trg, _ = pad_seq(uttr_trg, 32)
    uttr_org, _ = pad_seq(uttr_org, 32)
    trg_enc = preprocess_wav("./demo/data/22618_01.wav")
    trg_enc = encoder.embed_utterance(trg_enc)

    org_enc = preprocess_wav("./demo/data/22617_01.wav")
    org_enc = encoder.embed_utterance(org_enc)

    uttr_trg = torch.FloatTensor(uttr_trg).to(device).double().unsqueeze(0)
    uttr_org = torch.FloatTensor(uttr_org).to(device).double().unsqueeze(0)
    org_enc = torch.FloatTensor(org_enc).to(device).double().unsqueeze(0)
    trg_enc = torch.FloatTensor(trg_enc).to(device).double().unsqueeze(0)

    with torch.no_grad():
        _, x_identic_psnt, _ = model(uttr_org, org_enc, trg_enc)

    res = x_identic_psnt[0, 0, :, :].cpu().numpy()
    spect_vc.append(('fin_conversion', res))

    return spect_vc
Example #2
def vectorExtract(audio1_path, audio2_path):
    wav1 = preprocess_wav(Path(audio1_path))
    wav2 = preprocess_wav(Path(audio2_path))

    encoder = VoiceEncoder()

    embed1 = encoder.embed_utterance(wav1)
    embed2 = encoder.embed_utterance(wav2)

    return numpy.concatenate([embed1, embed2])
Example #3
def simVoice(audio1_path, audio2_path):
    wav1 = preprocess_wav(Path(audio1_path))
    wav2 = preprocess_wav(Path(audio2_path))

    encoder = VoiceEncoder()

    embed1 = encoder.embed_utterance(wav1)
    embed2 = encoder.embed_utterance(wav2)

    return dot(embed1, embed2) / (norm(embed1) * norm(embed2))  # np.inner(embed1, embed2)
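A minimal usage sketch for simVoice, assuming the imports it relies on (numpy's dot / norm, pathlib.Path, and resemblyzer's preprocess_wav / VoiceEncoder) are present in the same module; the file names below are placeholders. Since resemblyzer utterance embeddings are L2-normalized, the dot product by itself already equals the cosine similarity.

# Hypothetical call; "alice.wav" and "bob.wav" are placeholder recordings.
score = simVoice("alice.wav", "bob.wav")
print("speaker similarity:", round(float(score), 3))  # close to 1.0 for the same speaker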
Example #4
def get_metrics(user1, user2, inferencer):
    phrases1 = {}
    phrases2 = {}
    with open(f'{ROOT_PATH}/{user1}/metadata.csv', 'r') as ttf:
        for line in ttf:
            cols = line.split('|')
            wav_file = os.path.join(ROOT_PATH, user1, cols[0])
            if not wav_file.endswith('.wav'):
                wav_file += '.wav'
            text = cols[1].strip()
            text = text.strip(',')
            phrases1[text] = wav_file

    with open(f'{ROOT_PATH}/{user2}/metadata.csv', 'r') as ttf:
        for line in ttf:
            cols = line.split('|')
            wav_file = os.path.join(ROOT_PATH, user2, cols[0])
            if not wav_file.endswith('.wav'):
                wav_file += '.wav'
            text = cols[1].strip()
            text = text.strip(',')
            phrases2[text] = wav_file

    inter = set(phrases1.keys()).intersection(phrases2.keys())
    inter = list(inter)
    inter = np.random.choice(inter, 10)

    wavs_1 = []
    wavs_2 = []
    generated_1 = []
    generated_2 = []

    for phr in inter:
        wavs_1.append(res.preprocess_wav(phrases1[phr]))
        wavs_2.append(res.preprocess_wav(phrases2[phr]))
        generated_1.append(inferencer.get_json_output(user1, phr, OUT_PATH, save_wavs=False))
        generated_2.append(inferencer.get_json_output(user2, phr, OUT_PATH, save_wavs=False))

    generated_1 = list(map(res.preprocess_wav, generated_1))
    generated_2 = list(map(res.preprocess_wav, generated_2))

    o_embed_1 = encoder.embed_speaker(wavs_1)
    o_embed_2 = encoder.embed_speaker(wavs_2)

    g_embed_1 = encoder.embed_speaker(generated_1)
    g_embed_2 = encoder.embed_speaker(generated_2)

    return np.inner(o_embed_1, o_embed_2),  np.linalg.norm(o_embed_1 - o_embed_2), \
            np.inner(g_embed_1, o_embed_1), np.linalg.norm(o_embed_1 - g_embed_1), \
            np.inner(g_embed_2, o_embed_2), np.linalg.norm(o_embed_2 - g_embed_2)
Example #5
def load_data(from_path=None, ckpt_path=None, data_path=None, save_path=None):
    if from_path is None:
        if ckpt_path is None:
            raise Exception('No checkpoint path provided')

        from resemblyzer import preprocess_wav, VoiceEncoder
        from tqdm import tqdm

        device = torch.device('cuda')
        encoder = VoiceEncoder(device=device, loss_device=device)
        encoder.load_ckpt(ckpt_path, device=device)
        encoder.eval()
        wav_fpaths = list(Path(data_path).glob("**/*.flac"))

        # Preprocess and save encoded utterance and label to list
        X = []
        y = []
        for wav_fpath in tqdm(wav_fpaths):
            wav = preprocess_wav(wav_fpath)
            X.append(encoder.embed_utterance(wav).cpu().numpy())
            y.append(wav_fpath.parent.parent.stem)

        # Save for testing
        if save_path is not None:
            np.save(Path(save_path, 'embeds.npy'), X)
            np.save(Path(save_path, 'labels.npy'), y)
        else:
            raise Exception('No save_path provided')
    else:
        X = np.load(Path(from_path, 'embeds.npy'), allow_pickle=True)
        y = np.load(Path(from_path, 'labels.npy'), allow_pickle=True)
    return X, y
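A small follow-on sketch, assuming scikit-learn is available and that embeds.npy / labels.npy were produced by the function above; the directory name is a placeholder.

# Hypothetical speaker classifier trained on the saved resemblyzer embeddings.
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = load_data(from_path='embeds_dir')  # placeholder directory containing embeds.npy / labels.npy
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print('accuracy:', clf.score(X_test, y_test))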
Example #6
def make_one_dataset(filename, parameters, total, display=False):
    global finish
    sub_filename = filename.strip().split('/')[-1]
    speaker_id, utt_id = re.match(
        r'p(\d+)_(\d+)\.wav',
        sub_filename).groups()  # format: p{speaker}_{sid}.wav
    #mel_spec, lin_spec = get_spectrograms(filename)
    mel_spec, lin_spec, mfcc, f0, audio = wav2spectrogram(filename,
                                                          parameters,
                                                          display=display)

    wav = preprocess_wav(Path(filename))
    d_mel = d_wav2spec(wav)

    print(
        '[Processor] - processing {}/{} s{}-{} | d_mel: {} | mel:{} | lin:{} | audio:{}'
        .format(finish * WORKERS, total, speaker_id, utt_id, d_mel.shape,
                mel_spec.shape, lin_spec.shape, audio.shape),
        end='\r')
    result = {}
    result['speaker_id'] = speaker_id
    result['utt_id'] = utt_id
    result['d_mel_spec'] = d_mel
    result['mel_spec'] = mel_spec
    result['lin_spec'] = lin_spec
    result['mfcc'] = mfcc
    result['f0'] = f0
    result['audio'] = audio
    finish += 1
    return result
Example #7
    def voice_com(self,v1):
        '''
        v1: wav file path

        return True if speaker in database
        '''
        if len(self.database.values()) == 0:
            print("Your data not in our database.")
            return False
        wav = preprocess_wav(Path(v1))

        # ## method 1
        # embed1 = self.encoder.embed_speaker(wav1)
        # embed2 = self.encoder.embed_speaker(wav2)
        # sims1 = np.inner(embed1,embed2) # bigger 0.85

        ## method 2
        embed = self.encoder.embed_utterance(wav)

        for dk in self.database.keys():
            sims = embed @ self.database[dk] # bigger 0.75
            if sims > 0.75:
                print("welcome {}!".format(dk))
                return True
        print("Your data not in our database.")
        return False
Example #8
def get_speaker_similarity_dict_and_wav_splits(file_name):
    print('Processing voices for file:', file_name)
    fpath = os.fspath(file_name)
    wav = preprocess_wav(fpath_or_wav=fpath)

    speaker_names = ['Phreak', 'Other']
    segments = [[0, 25], [75, 90]]
    encoder = VoiceEncoder('cpu')
    speaker_wavs = [
        wav[int(s[0] * sampling_rate):int(s[1] * sampling_rate)]
        for s in segments
    ]
    print(
        "Running the continuous embedding on cpu, this might take a while...")
    _, cont_embeds, wav_splits = encoder.embed_utterance(wav,
                                                         return_partials=True,
                                                         rate=16)

    speaker_embeds = [
        encoder.embed_utterance(speaker_wav) for speaker_wav in speaker_wavs
    ]
    similarity_dict = {
        name: cont_embeds @ speaker_embed
        for name, speaker_embed in zip(speaker_names, speaker_embeds)
    }

    return similarity_dict, wav_splits
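A follow-on sketch showing one way the returned similarity_dict and wav_splits (sample-index slices) could be turned into a per-window speaker label by picking the most similar reference speaker; the file name and the 0.75 acceptance threshold are assumptions, and sampling_rate is the same module-level constant used above.

# Hypothetical post-processing of the similarity curves; threshold and file name are assumptions.
similarity_dict, wav_splits = get_speaker_similarity_dict_and_wav_splits('episode.wav')
for i, split in enumerate(wav_splits):
    name, sim = max(((n, s[i]) for n, s in similarity_dict.items()), key=lambda t: t[1])
    label = name if sim > 0.75 else 'unknown'
    print('%5.1fs - %5.1fs: %s' % (split.start / sampling_rate, split.stop / sampling_rate, label))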
Example #9
def load_speaker_embeds(args):
    encoder = VoiceEncoder()

    speakers_dir = '{0}/{1}/{2}/'.format(args.media, args.name, args.speakers)
    speaker_embeds_list = []
    if os.path.exists(speakers_dir):
        speakers_dir_subfolders = [
            f.path for f in os.scandir(speakers_dir) if f.is_dir()
        ]
        for speakers_dir_subfolder in speakers_dir_subfolders:
            speaker_embeds = []
            wav_file_list = list(
                enumerate(glob.glob(
                    "{}/*.wav".format(speakers_dir_subfolder))))
            for index, wav_file in wav_file_list:
                wav = AudioSegment.from_wav(wav_file)
                librosa_npy = audiosegment_to_librosawav(wav)
                librosa_wav = preprocess_wav(librosa_npy)
                current_embed = encoder.embed_utterance(librosa_wav)
                speaker_embeds.append(current_embed)
            if len(speaker_embeds) > 0:
                dirname = os.path.basename(speakers_dir_subfolder)
                speaker_embeds_list.append((
                    dirname,
                    speaker_embeds,
                ))
    return speaker_embeds_list
Example #10
def speaker_diarization(**kwargs):
    if 'wav' in kwargs:
        wav = kwargs['wav']
    elif 'filepath' in kwargs:
        wav = preprocess_wav(kwargs['filepath'])

    avg_embed, cont_embeds, wav_splits = VoiceEncoder().embed_utterance(
        wav, return_partials=True, rate=16)

    fig, ax = plt.subplots(figsize=(6, 6))
    wav_seconds = len(wav) / hparams.sampling_rate
    timesteps = np.arange(0, wav_seconds, wav_seconds / len(cont_embeds))

    if 'speaker_embed' in kwargs:
        # compare utterance embeddings with speaker embeddings
        similarity = cont_embeds @ kwargs['speaker_embed']
        dummy_similarity = cont_embeds @ create_dummy_speaker()
        ax.plot(timesteps, similarity, 'g')
        ax.plot(timesteps, dummy_similarity, 'k--')
    else:
        # cluster utterance embeddings using Spectral Clustering
        spectral = SpectralClustering(n_clusters=2).fit_predict(cont_embeds)
        ax.plot(timesteps, spectral)

    plt.show()
Example #11
    def several_speakers_identification(self,
                                        path,
                                        min_duration=3,
                                        return_splits=False,
                                        export=False,
                                        recognition=False,
                                        language='en-En'):
        self.min_duration = min_duration
        self.path = path
        wav = preprocess_wav(path)
        sf.write(self.wav, wav, 16000, subtype='PCM_24')
        encoder = VoiceEncoder()
        _, embed, slices = encoder.embed_utterance(wav,
                                                   return_partials=True,
                                                   rate=1)
        np.set_printoptions(suppress=True)
        for i in range(len(embed)):
            self.add_speaker(embed[i])
        # for i in range(len(self.timing)):
        #     print(i, self.timing[i])
        self.clear()
        print('Found %d speakers' % self.speakers_number)
        for i in range(self.speakers_number):
            print('Speaker ' + str(i) + ': ' + str(len(self.speakers[i])) +
                  's')
        self.splits = self.get_splits()
        if recognition or export:
            paths = ExportAudio.export(self.splits, self.wav)
            if recognition:
                self.recognize_audio(language, paths, export)
        if return_splits:
            return self.speakers_number, self.splits

        return self.speakers_number
Example #12
def fingerprint_from_file(filepath, segment=None, sampling_rate=16000):
    fpath = Path(filepath)
    wav = preprocess_wav(fpath)
    if segment:
        wav = wav[int(segment[0] * sampling_rate):int(segment[1] * sampling_rate)]
    return VoiceEncoder().embed_utterance(wav)
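Because the returned embedding is an L2-normalized 256-dimensional vector, two fingerprints can be compared with a plain dot product; a minimal sketch with placeholder file names and segments.

# Hypothetical comparison of two fingerprints; file names and segments are placeholders.
import numpy as np

fp_a = fingerprint_from_file('call_a.wav', segment=(0, 10))
fp_b = fingerprint_from_file('call_b.wav', segment=(5, 15))
print('same-speaker score:', float(np.inner(fp_a, fp_b)))  # closer to 1.0 for the same voice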
Example #13
def isolate_voice(audio_file_path: Path, embed_path: Path, params_path: Path, output_path: Path):
    """
    load speaker embeds from pickle
    take voice out only if value is > thresh and take greater if both > thresh
    
    Args:
        file_path: input complete wav file path from which rick's voice will be taken out
        cutoff_thresh: voice if value above this is taken
    """
    params = load_params(params_path)
    cutoff_threshold = params["cutoff_threshold"]
    sampling_rate = params["wav_bitrate"]

    print("preprocessing")
    file_wav = preprocess_wav(audio_file_path)
    print("input file shape ", file_wav.shape, "\n", file_wav[:10])
    print("file preprocessed")
    encoder = VoiceEncoder("cpu")
    print("model loaded")
    speaker_names = ["Rick", "Morty"]

    _, file_embeds, wav_splits = encoder.embed_utterance(file_wav, return_partials=True, rate=1)
    print("file encoded")
    speaker_embeds = pickle.load(open(embed_path, "rb"))

    similarity_dict = {name: file_embeds @ speaker_embed for name, speaker_embed in zip(speaker_names, speaker_embeds)}
    print("similatrity dict is\n", similarity_dict)
    pickle.dump(similarity_dict, open("./similarity.pkl", "wb"))

    #find greater in both then cutoff -> take that second append it to that file
    current_second = 0
    rick_wav = []
    rick_seconds = []
    morty_wav = []
    morty_seconds = []

    for rick_value, morty_value in zip(similarity_dict["Rick"], similarity_dict["Morty"]):
        print(current_second, rick_value, morty_value)
        if rick_value > morty_value and rick_value > cutoff_threshold:
            rick_wav.append(file_wav[current_second * sampling_rate : (current_second+1) * sampling_rate])
            rick_seconds.append(current_second)
            print("append rick")

        elif morty_value > rick_value and morty_value > cutoff_threshold:
            morty_wav.append(file_wav[current_second * sampling_rate: (current_second+1) * sampling_rate])
            morty_seconds.append(current_second)
            print("append morty")

        else:
            print("skipping")

        current_second += 1

    rick_wav = [item for sublist in rick_wav for item in sublist]
    morty_wav = [item for sublist in morty_wav for item in sublist]
    
    save_wav(np.array(rick_wav), output_path.joinpath("rick.wav"), sampling_rate)
    save_wav(np.array(morty_wav), output_path.joinpath("morty.wav"), sampling_rate)

    return rick_seconds, morty_seconds
Example #14
    def preprocess(self, f):
        """
        Applies preprocessing operations to a waveform either on disk or in memory such that
        The waveform will be resampled to match the data hyperparameters.

        :param f: either a filepath to an audio file or the waveform as a numpy array of floats.
        """
        return preprocess_wav(f)
Example #15
def process(wav_fpath):
    wav = preprocess_wav(wav_fpath)
    encoder = VoiceEncoder("cpu")
    _, cont_embeds, wav_splits = encoder.embed_utterance(wav, return_partials=True, rate=16)

    # Output denoised wave after removing the pauses
    write('DenoisedInputFiles/DenoisedSignal.wav', 16000, wav)
    return cont_embeds, wav_splits
Example #16
    def oneDictorIdentification(self, cSample, mainFile):
        print('[-i] --> Identify Dictor')
        avg1 = 0.0
        avg2 = 0.0

        fpath = Path(cSample)
        wav = preprocess_wav(fpath)

        encoder = VoiceEncoder()
        embed = encoder.embed_utterance(wav)
        np.set_printoptions(precision=3, suppress=True)
        embedNew = []

        for i in embed:
            if i != 0.0:
                embedNew.append(i)

        for s in embedNew:
            avg1 = avg1 + s

        fpath = Path(mainFile)
        wav = preprocess_wav(fpath)

        encoder = VoiceEncoder()
        embed = encoder.embed_utterance(wav)
        np.set_printoptions(precision=3, suppress=True)
        embedNew2 = []

        for i in embed:
            if i != 0.0:
                embedNew2.append(i)

        for s in embedNew2:
            avg2 = avg2 + s

        self.result = abs((avg2 / len(embedNew2)) - (avg1 / len(embedNew)))
        print(self.result)
        if (self.result < 0.002):
            print("Match!")
            # print("\033[33m\033[1m {}".format("Match!"))
            return 1
        else:
            print("These are different voices")
            # print("\033[33m\033[1m {}".format("These are different voices"))
            return 0
Example #17
def process(wav_fpath):
    wav = preprocess_wav(wav_fpath)
    encoder = VoiceEncoder()
    _, cont_embeds, wav_splits = encoder.embed_utterance(wav,
                                                         return_partials=True,
                                                         rate=16)

    # Output denoised wave after removing the pauses
    write('Denoise/Denoise_commercial_mono.wav', 16000, wav)
    return cont_embeds, wav_splits
Example #18
def calculate_score(model, data_dir, output_dir, target_dir, threshold_path,
                    **kwargs):
    """Calculate score"""

    data_dir = Path(data_dir)
    target_dir = Path(target_dir)

    if output_dir is None:
        output_dir = data_dir
    else:
        output_dir = Path(output_dir)
    output_dir.parent.mkdir(parents=True, exist_ok=True)
    output_path = Path(output_dir) / "evaluation_score.txt"

    metadata_path = data_dir / "metadata.json"
    metadata = json.load(metadata_path.open())

    thresholds = yaml.safe_load(Path(threshold_path).open())
    threshold = thresholds[metadata["target_corpus"]]

    n_accept = 0
    for pair in tqdm(metadata["pairs"]):
        wav = preprocess_wav(data_dir / pair["converted"])
        source_emb = model.embed_utterance(wav)

        targets = [target_dir / tgt_utt for tgt_utt in pair["tgt_utts"]]
        target_emb = model.embed_speaker(
            [preprocess_wav(target) for target in targets])

        cosine_similarity = (np.inner(source_emb, target_emb) /
                             np.linalg.norm(source_emb) /
                             np.linalg.norm(target_emb))

        if cosine_similarity > threshold:
            n_accept += 1

    svar = n_accept / len(metadata["pairs"])
    print(f"[INFO]: Speaker verification accept rate: {svar}")
    print(
        f"Speaker verification accept rate: {svar}",
        file=output_path.open("a"),
    )
Example #19
def encoder(file_paths, vocoder):
    print('Number of files in batch: {}'.format(len(file_paths)))

    # processed_wavs = Parallel(n_jobs=-1)(delayed(preprocess_wav)(i) for i in tqdm(file_paths))
    processed_wavs = [preprocess_wav(i) for i in tqdm(file_paths)]

    # encodings = Parallel(n_jobs=-1)(delayed(vocoder.embed_utterance)(i) for i in tqdm(processed_wavs))
    encodings = [vocoder.embed_utterance(i) for i in tqdm(processed_wavs)]
    print('Creating embeddings')
    encodings = np.array(encodings)
    return encodings
Example #20
def compute_embed(files, encoder):
    emb = []

    files = random.sample(files, min(len(files), 20))
    for f in files:
        wav = preprocess_wav(f)
        e = encoder.embed_utterance(wav)
        emb.append(e)
    emb = np.array(emb)
    emb = emb.mean(axis=0)
    return emb
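For reference, resemblyzer's VoiceEncoder.embed_speaker performs essentially the same averaging as compute_embed, except it also L2-normalizes the mean vector; a minimal sketch of the equivalent call, reusing the same files / encoder arguments:

# Rough equivalent using the built-in speaker-level embedding, which additionally
# L2-normalizes the averaged vector.
wavs = [preprocess_wav(f) for f in random.sample(files, min(len(files), 20))]
speaker_emb = encoder.embed_speaker(wavs)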
Example #21
    def generate_voice_profile(self, data_path):
        embeds = []
        original_dir = os.getcwd()
        os.chdir(data_path)
        for file in os.listdir('.'):
            fpath = Path(os.getcwd() + '\\' + file)
            wav = preprocess_wav(fpath)
            embed = self.encoder.embed_utterance(wav)
            embeds.append(embed)
        centroid = np.array(embeds).mean(axis=0)
        os.chdir(original_dir)  # restore the working directory changed above
        return centroid
Example #22
def embed_speaker_librispeech(speaker_path: Path, hp: Map):
    """
    Create an embedding of a speaker directory using `resemblyzer`.
    :param speaker_path: path to speaker directory
    :param hp: hyperparameters object
    :return: speaker embedding
    """
    flacs = list(speaker_path.rglob("*.flac"))
    flacs = random.sample(flacs, hp.n_samples)
    flacs = [resemblyzer.preprocess_wav(flac) for flac in flacs]
    return voice_encoder.embed_speaker(flacs)
Example #23
    def add_data(self,v,name):
        '''
        v: wav file path
        name : str
        '''
        if name in self.database:
            print("person exist")
            return False
        wav = preprocess_wav(Path(v))
        self.database[name] = self.encoder.embed_utterance(wav)

        return True
Example #24
def main():

    file_var1 = request.files["audio1"]
    file_var2 = request.files["audio2"]

    print(file_var1)
    print(file_var2)

    file_extension1 = file_var1.filename.split(".")[-1]
    file_extension2 = file_var2.filename.split(".")[-1]

    if file_extension1 not in ["mp3", "wav"] or file_extension2 not in ["mp3", "wav"]:
        return {"Error": "Invalid format. Use mp3 or wav."}

    file_name1 = "voice1.{}".format(file_extension1)
    file_name2 = "voice2.{}".format(file_extension2)

    wav_fpath1 = os.path.join(app.config['UPLOAD_FOLDER'], file_name1)
    wav_fpath2 = os.path.join(app.config['UPLOAD_FOLDER'], file_name2)

    file_var1.save(wav_fpath1)
    file_var2.save(wav_fpath2)

    del file_var1
    del file_var2

    wav1 = preprocess_wav(wav_fpath1)
    wav2 = preprocess_wav(wav_fpath2)

    os.remove(wav_fpath1)
    os.remove(wav_fpath2)

    speaker_wavs = [wav1, wav2]

    speaker_embeds = [
        encoder.embed_utterance(speaker_wav) for speaker_wav in speaker_wavs
    ]
    result = speaker_embeds[0] @ speaker_embeds[1]
    return {"Exito": np.float64(result)}
Example #25
def augment_data():
    noise_names = []
    for noise_path in os.listdir('./noise'):
        noise_name = noise_path.split('.')[0]
        noise = f'./noise/{noise_path}'
        for username in os.listdir('./data'):
            data_encode = []
            for user_file in os.listdir(f'./data/{username}'):
                y, sr = librosa.load(f'./data/{username}/{user_file}',
                                     sr=16000)
                encoded_data = resemblyzer.preprocess_wav(
                    f'./data/{username}/{user_file}')
                data_encode.append(encoded_data)
                for i in range(3):
                    choice = i
                    print(choice)

                    if choice == 1:
                        aug = pitch(mix_bg(y, noise), sr, 0.2)
                    elif choice == 2:
                        aug = speed(mix_bg(y, noise), 1.2)
                    else:
                        aug = mix_bg(y, noise)

                    if not os.path.exists(f'./augmented_data/{username}'):
                        os.mkdir(f'./augmented_data/{username}')

                    librosa.output.write_wav(
                        f'./augmented_data/{username}/{noise_name}_{i}.wav',
                        aug, sr)
                    encoded_data = resemblyzer.preprocess_wav(
                        f'./augmented_data/{username}/{noise_name}_{i}.wav')
                    data_encode.append(encoded_data)
                    with open(f'data/{username}_encoded_wav.pickle',
                              'wb') as handle:
                        pickle.dump(data_encode,
                                    handle,
                                    protocol=pickle.HIGHEST_PROTOCOL)
Example #26
def extract(data_dirs, output_dir):
    """Extract embedding by resemblyzer."""
    encoder = VoiceEncoder()

    data = {}
    for data_dir in tqdm(data_dirs, position=0):
        file_list = librosa.util.find_files(data_dir)
        for file_path in tqdm(file_list, position=1, leave=False):
            wav = preprocess_wav(file_path)
            embedding = encoder.embed_utterance(wav)
            wav_name = splitext(basename(file_path))[0]
            data[wav_name] = embedding

    joblib.dump(data, f"{output_dir}.pkl")
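A short read-back sketch for the dumped embeddings; the pickle name is a placeholder matching an output_dir of "vctk_embeds".

# Hypothetical read-back of the dumped embedding dictionary.
import joblib

embeddings = joblib.load('vctk_embeds.pkl')
print(len(embeddings), 'utterances;', next(iter(embeddings.values())).shape)  # each value is a 256-d vector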
Example #27
def voiceRun(frames):
    p = pyaudio.PyAudio()
    wf = wave.open('check.wav', 'wb')
    wf.setnchannels(1)
    wf.setsampwidth(p.get_sample_size(pyaudio.paInt16))
    wf.setframerate(16000)
    wf.writeframes(b''.join(frames))
    wf.writeframes(b''.join(frames))
    wf.close()
    wav = preprocess_wav('check.wav')
    embed = encoder.embed_utterance(wav)
    embed = np.array(embed).reshape(-1, 1, 256)
    res1 = voiceModel.predict(embed)
    res1 = res1.flatten()
    return res1.tolist()
Example #28
def get_name_id(args, encoder, speaker_embeds_list, audio_segment):
    segment_npy = audiosegment_to_librosawav(audio_segment)
    segment_wav = preprocess_wav(segment_npy)
    current_embed = encoder.embed_utterance(segment_wav)

    min_similarity = args.min_similarity
    name_id = ''
    for speaker_id, speaker_embeds in speaker_embeds_list:
        for speaker_embed in speaker_embeds:
            similarity = current_embed @ speaker_embed

            if similarity > min_similarity:
                min_similarity = similarity
                name_id = speaker_id
    return name_id
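A sketch tying this together with load_speaker_embeds from example #9 and pydub's AudioSegment, assuming the same args fields (media / name / speakers / min_similarity) and the audiosegment_to_librosawav helper used above; the paths and the 5-second window are placeholders.

# Hypothetical driver; args fields, paths and the 5 s window size are assumptions.
from types import SimpleNamespace
from pydub import AudioSegment

args = SimpleNamespace(media='media', name='show01', speakers='speakers', min_similarity=0.75)
encoder = VoiceEncoder()
speaker_embeds_list = load_speaker_embeds(args)

audio = AudioSegment.from_wav('media/show01/episode.wav')
for start_ms in range(0, len(audio), 5000):
    segment = audio[start_ms:start_ms + 5000]
    name_id = get_name_id(args, encoder, speaker_embeds_list, segment)
    print(start_ms // 1000, 's:', name_id or 'unknown')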
Example #29
def get_spk_emb(audio_file_dir, segment_len=960000):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    resemblyzer_encoder = VoiceEncoder(device=device)

    wav = preprocess_wav(audio_file_dir)
    l = len(wav) // segment_len # segment_len = 16000 * 60
    l = np.max([1, l])
    all_embeds = []
    for i in range(l):
        mean_embeds, cont_embeds, wav_splits = resemblyzer_encoder.embed_utterance(
            wav[segment_len * i:segment_len* (i + 1)], return_partials=True, rate=2)
        all_embeds.append(mean_embeds)
    all_embeds = np.array(all_embeds)
    mean_embed = np.mean(all_embeds, axis=0)

    return mean_embed, all_embeds
Example #30
    def get_d_vector(self, record, sample_rate):
        """Get d-vector feature from audio record.
            
        Args:
        :param record: Record object to get feature from.
        :type record: object
        :param sample_rate: Sample rate for audio record.
        :type sample_rate: int

        :return: D-vector feature vector
        """
        wav = preprocess_wav(record)

        embed = self.encoder.embed_utterance(wav)
        np.set_printoptions(precision=3, suppress=True)
        return embed