# https://medium.com/saarthi-ai/who-spoke-when-build-your-own-speaker-diarization-module-from-scratch-e7d725ee279
from resemblyzer import preprocess_wav, VoiceEncoder
from pathlib import Path

# D-vector clustering
from spectralcluster import SpectralClusterer

# give the file path to your audio file
audio_file_path = '****************************************'
wav_fpath = Path(audio_file_path)

wav = preprocess_wav(wav_fpath)

encoder = VoiceEncoder("cpu")
_, cont_embeds, wav_splits = encoder.embed_utterance(wav, return_partials=True, rate=16)

print(cont_embeds.shape)

####################################################################################################
# D vector clustering


clusterer = SpectralClusterer(
    min_clusters=2,
    max_clusters=100,
    p_percentile=0.90,
    gaussian_blur_sigma=1)

labels = clusterer.predict(cont_embeds)
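# Hedged sketch, not part of the original snippet: one way to turn the frame-level
# cluster labels back into "who spoke when" segments, using the wav_splits returned
# by embed_utterance and resemblyzer's 16 kHz sampling rate.
from resemblyzer import sampling_rate

segments, seg_start = [], 0.0
for i, split in enumerate(wav_splits):
    t = (split.start + split.stop) / 2 / sampling_rate
    if i > 0 and labels[i] != labels[i - 1]:
        segments.append((int(labels[i - 1]), seg_start, t))
        seg_start = t
segments.append((int(labels[-1]), seg_start, len(wav) / sampling_rate))
for speaker, start, end in segments:
    print(f"speaker {speaker}: {start:6.2f}s - {end:6.2f}s")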
Example #2
from data.wav_folder import read_wav
import os
from resemblyzer import preprocess_wav, VoiceEncoder
import numpy as np

NCE_dir = '/com_space/zhaohang/CUT/checkpoints/voice_CUT_replicate_many_p256/converted_sound_independent_MM/'
cyclegan_dir = '/com_space/zhaohang/CUT/checkpoints/cyclegan_replicate_many_p256/converted_sound_independent_MM/'
target_dir = '/com_space/zhaohang/CUT/datasets/voice/p256/'

sampling_rate = 24000
encoder = VoiceEncoder()

NCE_list, cyclegan_list = [], []
# Sort both listings so the converted files from the two models pair up deterministically
for NCE_path, cyclegan_path in zip(sorted(os.listdir(NCE_dir)),
                                   sorted(os.listdir(cyclegan_dir))):

    NCE_path = os.path.join(NCE_dir, NCE_path)
    cyclegan_path = os.path.join(cyclegan_dir, cyclegan_path)

    NCE_wav, _ = read_wav(NCE_path, sr=sampling_rate)
    cyclegan_wav, _ = read_wav(cyclegan_path, sr=sampling_rate)

    NCE_list.append(NCE_wav)
    cyclegan_list.append(cyclegan_wav)

target_list = []
for target_path in os.listdir(target_dir):
    target_path = os.path.join(target_dir, target_path)
    target_wav, _ = read_wav(target_path, sr=sampling_rate)
    target_list.append(target_wav)
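# Hedged sketch (an assumption about how these lists are meant to be used, not part of the
# original snippet): embed each converted wav and the target speaker with Resemblyzer and
# take the dot product of the L2-normalized embeddings as a speaker-similarity score.
target_embed = encoder.embed_speaker([preprocess_wav(w, source_sr=sampling_rate) for w in target_list])
NCE_scores = [encoder.embed_utterance(preprocess_wav(w, source_sr=sampling_rate)) @ target_embed
              for w in NCE_list]
cyclegan_scores = [encoder.embed_utterance(preprocess_wav(w, source_sr=sampling_rate)) @ target_embed
                   for w in cyclegan_list]
print("NCE mean speaker similarity:     ", np.mean(NCE_scores))
print("CycleGAN mean speaker similarity:", np.mean(cyclegan_scores))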
def load_model(root, device):
    """Load model"""

    model = VoiceEncoder()

    return model
Example #4
 def __init__(self):
     self.encoder = VoiceEncoder()
     self.database = {}
# take 6 segments of real speech as ground truth reference and compare those against the 12
# remaining. Those segments are selected at random, so you will run into different results every
# time you run the script, but they should be more or less consistent.
# Using the voice of Donald Trump is merely a matter of convenience, as several fake speeches
# with his voice were already put up on youtube. This choice was not politically motivated.


## Load and preprocess the audio
data_dir = Path("audio_data", "donald_trump")
wav_fpaths = list(data_dir.glob("**/*.mp3"))
wavs = [preprocess_wav(wav_fpath) for wav_fpath in \
        tqdm(wav_fpaths, "Preprocessing wavs", len(wav_fpaths), unit=" utterances")]


## Compute the embeddings
encoder = VoiceEncoder()
embeds = np.array([encoder.embed_utterance(wav) for wav in wavs])
speakers = np.array([fpath.parent.name for fpath in wav_fpaths])
names = np.array([fpath.stem for fpath in wav_fpaths])


# Take 6 real embeddings at random, and leave the 6 others for testing
gt_indices = np.random.choice(*np.where(speakers == "real"), 6, replace=False) 
mask = np.zeros(len(embeds), dtype=bool)
mask[gt_indices] = True
gt_embeds = embeds[mask]
gt_names = names[mask]
gt_speakers = speakers[mask]
embeds, speakers, names = embeds[~mask], speakers[~mask], names[~mask]
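# Hedged sketch (assumption): score the remaining utterances against the centroid of the
# ground-truth embeddings; real speech should come out more similar than the fakes.
gt_centroid = gt_embeds.mean(axis=0)
gt_centroid /= np.linalg.norm(gt_centroid)   # individual embeddings are already L2-normalized
scores = embeds @ gt_centroid                # cosine similarity per utterance
for name, speaker, score in zip(names, speakers, scores):
    print(f"{name} ({speaker}): {score:.3f}")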

def encode_each_batch(source_dir, source_dir_pattern, embed_file_path):
    vocoder = VoiceEncoder()
    file_paths = audio_paths(source_dir, source_dir_pattern)
    print('Total number of files: {}'.format(len(file_paths)))
    embeddings = encoder(file_paths=file_paths, vocoder=vocoder)
    save_embeddings(embed_file_path, embeddings, file_paths)
Example #7
from resemblyzer import preprocess_wav, VoiceEncoder
from itertools import groupby
from pathlib import Path
import matplotlib.pyplot as plt
import numpy as np

# The neural network will automatically use CUDA if it's available on your machine, otherwise it
# will use the CPU. You can enforce a device of your choice by passing its name as argument to the
# constructor. The model might take a few seconds to load with CUDA, but it then executes very
# quickly.
encoder = VoiceEncoder()

# We'll use a smaller version of the dataset LibriSpeech test-other to run our examples. This
# smaller dataset contains 10 speakers with 10 utterances each. N.B. "wav" in variable names stands
# for "waveform" and not the wav file extension.
wav_fpaths = list(Path("librispeech_test-other").glob("**/*.flac"))
# Group the wavs per speaker and load them using the preprocessing function provided with
# resemblyzer to load wavs in memory. It normalizes the volume, trims long silences and resamples
# the wav to the correct sampling rate.
speaker_wavs = {
    speaker: list(map(preprocess_wav, wav_fpaths))
    for speaker, wav_fpaths in groupby(wav_fpaths,
                                       lambda wav_fpath: wav_fpath.parent.stem)
}


def demo_similarity_matrix():

    ## Similarity between two utterances from each speaker
    # Embed two utterances A and B for each speaker
    embeds_a = np.array(
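    # A minimal sketch (not the original demo code, which is cut off above) of the
    # similarity matrix this demo builds, assuming two utterances per speaker:
    embeds_a = np.array([encoder.embed_utterance(wavs[0]) for wavs in speaker_wavs.values()])
    embeds_b = np.array([encoder.embed_utterance(wavs[1]) for wavs in speaker_wavs.values()])
    # Embeddings are L2-normalized, so a dot product gives the cosine similarity
    sim_matrix = embeds_a @ embeds_b.T
    plt.imshow(sim_matrix, vmin=0, vmax=1)
    plt.xticks(range(len(speaker_wavs)), speaker_wavs.keys(), rotation=90)
    plt.yticks(range(len(speaker_wavs)), speaker_wavs.keys())
    plt.colorbar()
    plt.show()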
Example #8
def get_full_translation(audio_file_name):
    def mp3_to_wav(file_name):
        # Convert an mp3 input to wav so the rest of the pipeline can read it
        if file_name.rsplit('.', 1)[-1] == 'mp3':
            sound = AudioSegment.from_mp3(file_name)
            sound.export(f"{file_name.rsplit('.', 1)[0]}.wav", format="wav")

    mp3_to_wav(audio_file_name)
    wav_fpath = f"{audio_file_name.rsplit('.', 1)[0]}.wav"

    #preprocess the wav: normalize the volume, trim long silences and resample to 16 kHz
    wav = preprocess_wav(wav_fpath)
    #create a voice encoder object that runs on the CPU
    encoder = VoiceEncoder("cpu")
    _, cont_embeds, wav_splits = encoder.embed_utterance(wav,
                                                         return_partials=True,
                                                         rate=16)

    #create a cluster object
    clusterer = SpectralClusterer(min_clusters=2,
                                  max_clusters=100,
                                  p_percentile=0.90,
                                  gaussian_blur_sigma=1)

    #label which speaker is talking at each point in time
    labels = clusterer.predict(cont_embeds)

    def create_labelling(labels, wav_splits):
        times = [((s.start + s.stop) / 2) / sampling_rate for s in wav_splits]
        labelling = []
        start_time = 0

        for i, time in enumerate(times):
            if i > 0 and labels[i] != labels[i - 1]:
                temp = [str(labels[i - 1]), start_time, time]
                labelling.append(tuple(temp))
                start_time = time
            if i == len(times) - 1:
                temp = [str(labels[i]), start_time, time]
                labelling.append(tuple(temp))

        return labelling

    labelling = create_labelling(labels, wav_splits)
    print(labelling)

    #split the audio into one wav clip per diarized segment

    def split_audio(file_path):
        n = 0
        for i in labelling:
            n += 1
            #convert the segment boundaries from seconds to integer milliseconds
            start = int(i[1] * 1000)
            end = int(i[2] * 1000)
            #finds the audio file as a wav
            newAud = AudioSegment.from_wav(file_path)
            #creates a new audio based on the values given by seconds
            newAudio = newAud[start:end]
            newAudio.export(
                f'audio_files/split/SplitAudio_{n}.wav',
                format="wav")  #Exports to a wav file in the current path.

        return n

    n = split_audio(audio_file_name)

    def get_translation():
        entire_text = []
        for num in range(n):
            r = sr.Recognizer()

            harvard = sr.AudioFile(f'audio_files/split/SplitAudio_{num+1}.wav')
            with harvard as source:
                r.adjust_for_ambient_noise(source)
                r.enable_separate_recognition_per_channel = True

                audio = r.record(source)
                text = r.recognize_google(audio, language='en-US')

                text = f'Speaker {labelling[num][0]}: ' + str(text)
                entire_text.append(text)
        return entire_text

    translation = get_translation()

    return render_template('translation.html', translation=translation)
Example #9
from resemblyzer import VoiceEncoder, preprocess_wav
from pathlib import Path
import numpy as np
import os

data_dir = "/media/user/data/complete_synth/"
encoder = VoiceEncoder()
counter = 0
for path, subdirs, files in os.walk(data_dir):
    for name in files:
        extension = name[-3:]

        if extension == "wav":
            full_file_path = path + "/" + name
            wav = preprocess_wav(Path(full_file_path))
            should_reprocess = False
            # Repeat very short clips until they are at least 3 s long (16 kHz audio)
            while len(wav) / 16000 < 3.00:
                wav = np.concatenate((wav, wav))
                should_reprocess = True
            if should_reprocess:
                embed = encoder.embed_utterance(wav)
                counter += 1
                np.save(path + "/" + name[:-4] + "_embed.npy", embed)

    print("Processed: " + str(counter))

#fpath = Path("/home/user/data/complete_synth/nst_synth/3468.wav")
#wav = preprocess_wav(fpath)

#encoder = VoiceEncoder()
Example #10
from resemblyzer import preprocess_wav, VoiceEncoder
from flask import Flask, request
from pathlib import Path
import numpy as np
import os

app = Flask(__name__)

app.config['UPLOAD_FOLDER'] = "utils/"

# wav_fpath1 = Path("utils/", "voice1.mp3")
# wav_fpath2 = Path("utils/", "voice2.mp3")

# wav2 = preprocess_wav(wav_fpath2)

encoder = VoiceEncoder("cpu")


@app.route("/verify", methods=["POST"])
def main():

    file_var1 = request.files["audio1"]
    file_var2 = request.files["audio2"]

    print(file_var1)
    print(file_var2)

    file_extension1 = file_var1.filename.split(".")[-1]
    file_extension2 = file_var2.filename.split(".")[-1]

    if not file_extension1 in ["mp3", "wav"
wav_fpath = Path(audio_file_path)


def mp3_to_wav(audio_file_name):
    # Convert an mp3 input to wav so preprocess_wav can read it
    if audio_file_name.rsplit('.', 1)[-1] == 'mp3':
        sound = AudioSegment.from_mp3(audio_file_name)
        audio_file_name = audio_file_name.rsplit('.', 1)[0] + '.wav'
        sound.export(audio_file_name, format="wav")


mp3_to_wav(audio_file_path)

#preprocess the wav: normalize the volume, trim long silences and resample to 16 kHz
wav = preprocess_wav(wav_fpath)
#create a voice encoder object that runs on the CPU
encoder = VoiceEncoder("cpu")
_, cont_embeds, wav_splits = encoder.embed_utterance(wav,
                                                     return_partials=True,
                                                     rate=16)

#create a cluster object
clusterer = SpectralClusterer(min_clusters=2,
                              max_clusters=100,
                              p_percentile=0.90,
                              gaussian_blur_sigma=1)

#label which speaker is talking at each point in time
labels = clusterer.predict(cont_embeds)

class AudioFeaturizer:
    """AudioFeaturizer allows to get features from audio records.

    This class allows to read audio record as file or by URL, get features
    from audio record and compare different sets of features to get similarity. 

    :param names_to_norm: Names of features that should be normalized.
    :type names_to_norm: list
    :param path_to_temp_file: File name of temporary saved audio record.
    :type path_to_temp_file: str
    :param feature_len: Default length of feature vector.
    :type feature_len: int
    :param d_weight: Weight for d-vector feature, which is used in process of comparison.
    :type d_weight: float
    :param lfcc_weight: Weight for lfcc feature, which is used in process of comparison.
    :type lfcc_weight: float
    :param pncc_weight: Weight for pncc feature, which is used in process of comparison.
    :type pncc_weight: float
    :param mfcc_weight: Weight for mfcc feature, which is used in process of comparison.
    :type mfcc_weight: float
    """

    names_to_norm = ['lfcc', 'mfcc', 'pncc']
    path_to_temp_file = './temp.wav'
    feature_len = 384
    d_weight = 4.0
    lfcc_weight = 1.2
    pncc_weight = 0.5
    mfcc_weight = 0.5

    def __init__(self):
        """At initialization level AudioFeaturizer loads VoiceEncoder model.
        """
        #: VoiceEncoder: VoiceEncoder that used to get d-vector from audio record.
        self.encoder = VoiceEncoder()
    
    def read_file(self, file_name, sample_rate=16000):
        """Read audio record as file.
            
        :param file_name: Path to the audio file.
        :type file_name: str
        :param sample_rate: Sample rate for audio record.
        :type sample_rate: int

        :return: Record and sample rate
        """
        record, sample_rate = librosa.load(file_name, sr=sample_rate)
        return record, sample_rate

    def read_file_by_url(self, url, sample_rate=16000):
        """Read audio record by URL.
            
        :param url: URL to the audio file.
        :type url: str
        :param sample_rate: Sample rate for audio record.
        :type sample_rate: int

        :return: Record and sample rate
        """
        z = io.BytesIO(urlopen(url).read())
        temp_path = f'./{str(uuid.uuid4())}.wav'
        Path(temp_path).write_bytes(z.getbuffer())
        record, sample_rate = librosa.load(temp_path, sr=sample_rate)
        Path(temp_path).unlink()
        return record, sample_rate
    
    def norm_dim(self, features):
        """Normalize feature vector to 1-dimensional with default lenght.
            
        :param features: Feature vector.
        :type features: list

        :return: Normalized features
        """
        return np.mean(features.T, axis=0)
    
    def get_d_vector(self, record, sample_rate):
        """Get d-vector feature from audio record.
            
        :param record: Record object to get feature from.
        :type record: object
        :param sample_rate: Sample rate for audio record.
        :type sample_rate: int

        :return: D-vector feature vector
        """
        wav = preprocess_wav(record)

        embed = self.encoder.embed_utterance(wav)
        np.set_printoptions(precision=3, suppress=True)
        return embed
    
    def get_lfcc(self, record, sample_rate):
        """Get LFCC feature from audio record.
            
        `LFCC paper <http://citeseerx.ist.psu.edu/viewdoc/download?doi=10.1.1.63.8029&rep=rep1&type=pdf>`_
        
        :param record: Record object to get feature from.
        :type record: object
        :param sample_rate: Sample rate for audio record.
        :type sample_rate: int

        :return: LFCC feature vector
        """
        return lfcc(record, fs=sample_rate)
        
    def get_mfcc(self, record, sample_rate):
        """Get MFCC feature from audio record.
            
        `MFCC paper <https://arxiv.org/pdf/1003.4083.pdf>`_
        
        :param record: Record object to get feature from.
        :type record: object
        :param sample_rate: Sample rate for audio record.
        :type sample_rate: int

        :return: MFCC feature vector
        """
        return mfcc(record, fs=sample_rate)
    
    def get_pncc(self, record, sample_rate):
        """Get PNCC feature from audio record.
            
        `PNCC paper <http://www.cs.cmu.edu/~robust/Papers/OnlinePNCC_V25.pdf>`_
        
        :param record: Record object to get feature from.
        :type record: object
        :param sample_rate: Sample rate for audio record.
        :type sample_rate: int

        :return: PNCC feature vector
        """
        return pncc(record, fs=sample_rate)
    
    def get_all_features(self, record, sample_rate=16000, normalize_dim=False):
        """Get all list of features from audio record.
        
        :param record: Record object to get feature from.
        :type record: object
        :param sample_rate: Sample rate for audio record.
        :type sample_rate: int
        :param normalize_dim: Normalize vectors or not.
        :type normalize_dim: bool

        :return: Dictionary of all features
        """
        all_features = {
            'lfcc': self.get_lfcc(record, sample_rate),
            'mfcc': self.get_mfcc(record, sample_rate),
            'pncc': self.get_pncc(record, sample_rate),
            'd_vector': self.get_d_vector(record, sample_rate)
        }
        
        if normalize_dim:
            for feature_name in all_features.keys():
                if self.is_in_norm_list(feature_name):
                    all_features[feature_name] = self.norm_dim(all_features[feature_name])
        
        return all_features

    def get_all_features_limited(self, record, sample_rate=16000):
        """Get all list of features from audio record and normalize by 
        default dimension and lenght.
        
        :param record: Record object to get feature from.
        :type record: object
        :param sample_rate: Sample rate for audio record.
        :type sample_rate: int

        :return: Dictionary of all features
        """
        all_features = self.get_all_features(record, sample_rate, normalize_dim=True)
        for feature_name in all_features:
            if feature_name in self.names_to_norm:
                all_features[feature_name] = all_features[feature_name][:self.feature_len]
        return all_features

    def get_all_features_mean_limited(self, record, sample_rate=16000, chunk_len=5):
        """Get all features averaged over fixed-length chunks of the record,
        normalized to the default dimension and length.

        :param record: Record object to get features from.
        :type record: object
        :param sample_rate: Sample rate for audio record.
        :type sample_rate: int
        :param chunk_len: Chunk length in seconds.
        :type chunk_len: int

        :return: Dictionary of all features
        """
        chunks_amount = math.floor(len(record)/sample_rate/chunk_len)
        features_sets = {
            'd_vector': [],
            'mfcc': [],
            'lfcc': [],
            'pncc': []
        }
        for i in range(chunks_amount):
            record_chunk = record[i*sample_rate*chunk_len:(i+1)*sample_rate*chunk_len]
            chunk_all_features = self.get_all_features(record_chunk, sample_rate, normalize_dim=True)
            for feature_name in chunk_all_features:
                features_sets[feature_name].append(chunk_all_features[feature_name])
        # Average the per-chunk features, then trim to the default feature length
        all_features = {}
        for feature_name in features_sets:
            all_features[feature_name] = np.mean(features_sets[feature_name], axis=0)
        for feature_name in all_features:
            if feature_name in self.names_to_norm:
                all_features[feature_name] = all_features[feature_name][:self.feature_len]
        return all_features
    
    def is_in_norm_list(self, feature_name):
        """Check feature name in default normaliation list.
        
        :param feature_name: Name of the feature.
        :type feature_name: str

        :return: True or False for feature name in default normalize list.
        """
        return any([name == feature_name for name in self.names_to_norm])
    
    def visualize_spectrogram(self, record, sample_rate):
        """Build plot of spectrogram for audio record.
        
        :param record: Record object.
        :type record: object
        :param sample_rate: Sample rate for audio record.
        :type sample_rate: int
        """
        vis.spectogram(record, sample_rate)
        
    def visualize_features(self, 
                           features, 
                           feature_index='feature', 
                           frame_index='frame index', 
                           normalize_dim=False):
        """Build plot of features for audio record.
        
        :param features: Feature vector.
        :type features: list
        :param feature_index: Feature name index.
        :type feature_index: str
        :param frame_index: Frame name index.
        :type frame_index: str
        :param normalize_dim: Normalize vectors or not.
        :type normalize_dim: bool
        """
        if normalize_dim:
            vis.plot(features, feature_index, frame_index)
        else:
            vis.visualize_features(features, feature_index, frame_index)

    def cosine_similarity(self, x, y):
        """Compare two feature lists by cosine distance.
        
        :param x: First feature vector.
        :type x: list 
        :param y: Second feature vector.
        :type y: list

        :return: Float number from -1 to 1 which shows similarity between the two vectors.
        """
        return 1 - spatial.distance.cosine(x, y)

    def compare_two_features_sets(self, features_1, features_2):
        """Compare two feature sets.
        
        :param features_1: First feature set.
        :type features_1: dict
        :param features_2: Second feature set.
        :type features_2: dict

        :return: Float number which shows similarity between two feature sets.
        """
        d_sim = self.cosine_similarity(features_1['d_vector'], features_2['d_vector'])
        lfcc_sim = self.cosine_similarity(features_1['lfcc'], features_2['lfcc'])
        pncc_sim = self.cosine_similarity(features_1['pncc'], features_2['pncc'])
        mfcc_sim = self.cosine_similarity(features_1['mfcc'], features_2['mfcc'])

        sum_sim = d_sim*self.d_weight + lfcc_sim*self.lfcc_weight + pncc_sim*self.pncc_weight + mfcc_sim*self.mfcc_weight
        # TODO: scale sim based on weights
        return sum_sim

    def features_to_json_serializable(self, all_features):
        """Format feature set to json-serializable.
        
        :param all_features: Feature set.
        :type all_features: dict

        :return: Json-serializable feature set dictionary.
        """
        for feature_name in all_features:
            all_features[feature_name] = all_features[feature_name].tolist()
        return all_features
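# Hedged usage sketch (the file names below are placeholders, not from the original code):
# read two recordings, extract the length-limited feature sets and compare them.
featurizer = AudioFeaturizer()
record_1, sr = featurizer.read_file('speaker_a.wav')
record_2, sr = featurizer.read_file('speaker_b.wav')
features_1 = featurizer.get_all_features_limited(record_1, sr)
features_2 = featurizer.get_all_features_limited(record_2, sr)
print(featurizer.compare_two_features_sets(features_1, features_2))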
        
Example #14
class AutoVCAlphaModel(BaseModel):
    def __init__(self, params):
        super().__init__(params)

        self.encoder = ContentEncoder(params)
        self.decoder = Decoder(params)
        self.postnet = PostNet(params)

        self.style_encoder = VoiceEncoder()
        self.freeze(self.style_encoder)

    def forward(self, wavs, mels):
        c_src = self._make_speaker_vectors(wavs, mels.size(-1), mels.device)

        codes, mel_outputs, mel_outputs_postnet, q_loss = self._forward(mels, c_src)

        return (
            mel_outputs,
            mel_outputs_postnet,
            codes,
            self.encoder(mel_outputs_postnet)[0],
            q_loss
        )

    def inference(self, src_path: str, tgt_path: str):
        wav_src, wav_tgt, mel_src = self._preprocess(src_path, tgt_path)

        c_src = self._make_speaker_vectors([wav_src], mel_src.size(-1), mel_src.device)
        c_tgt = self._make_speaker_vectors([wav_tgt], mel_src.size(-1), mel_src.device)

        _, _, mel_outputs_postnet, _ = self._forward(mel_src, c_src, c_tgt)

        wav = self._mel_to_wav(mel_outputs_postnet)
        return wav

    def _forward(self, mels, c_src, c_tgt=None):
        codes, q_loss = self.encoder(mels)

        decoder_input = torch.cat((codes, c_src if c_tgt is None else c_tgt), dim=1)

        mel_outputs = self.decoder(decoder_input)

        mel_outputs_postnet = self.postnet(mel_outputs)
        mel_outputs_postnet = mel_outputs + mel_outputs_postnet

        return codes, mel_outputs, mel_outputs_postnet, q_loss

    def _make_speaker_vectors(self, wavs, time_size, device):
        c = [self.style_encoder.embed_utterance(x) for x in wavs]
        c = torch.tensor(c, dtype=torch.float, device=device)
        c = c[:, :, None].expand(-1, -1, time_size)
        return c

    def _preprocess(self, src_path: str, tgt_path: str):
        wav_src, mel_src = get_wav_mel(src_path)
        wav_tgt, _ = get_wav_mel(tgt_path)
        mel_src = self._preprocess_mel(mel_src)
        return wav_src, wav_tgt, mel_src

    def _preprocess_mel(self, mel):
        if self.is_normalize:
            mel = normalize(mel)
        mel = self.unsqueeze_for_input(mel)
        return mel
Example #15
def fingerprint_from_waveform(wav):
    return VoiceEncoder().embed_utterance(wav)
wav = preprocess_wav(wav_fpath)

# Cut some segments from single speakers as reference audio
segments = [[0, 5.5], [6.5, 12], [17, 25]]
speaker_names = ["Kyle Gass", "Sean Evans", "Jack Black"]
speaker_wavs = [wav[int(s[0] * sampling_rate):int(s[1] * sampling_rate)] for s in segments]
  
    
## Compare speaker embeds to the continuous embedding of the interview
# Derive a continuous embedding of the interview. We set a rate of 16, meaning that an
# embedding is generated every 0.0625 seconds. A high rate is good for speaker
# diarization, but not as useful when you only need a summary embedding of the entire
# utterance. A rate of 2 would have been enough, but 16 is nice for the sake of the
# demonstration.
# We'll exceptionally force this to run on the CPU, because it uses a lot of RAM and most
# GPUs won't have enough. There's a speed drawback, but it remains reasonable.
encoder = VoiceEncoder("cpu")
print("Running the continuous embedding on cpu, this might take a while...")
_, cont_embeds, wav_splits = encoder.embed_utterance(wav, return_partials=True, rate=16)


# Get the continuous similarity for every speaker. It amounts to a dot product between the 
# embedding of the speaker and the continuous embedding of the interview
speaker_embeds = [encoder.embed_utterance(speaker_wav) for speaker_wav in speaker_wavs]
similarity_dict = {name: cont_embeds @ speaker_embed for name, speaker_embed in 
                   zip(speaker_names, speaker_embeds)}


## Run the interactive demo
interactive_diarization(similarity_dict, wav, wav_splits)
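# Hedged sketch (assumption, not in the original demo): a non-interactive way to read the
# result is to pick, for a given moment, the reference speaker with the highest similarity.
i = len(wav_splits) // 2                                            # middle of the interview
t = (wav_splits[i].start + wav_splits[i].stop) / 2 / sampling_rate
likely = max(similarity_dict, key=lambda name: similarity_dict[name][i])
print(f"At {t:.1f}s the closest reference speaker is {likely}")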
Example #17
def fingerprint_from_file(filepath, segment=None, sampling_rate=16000):
    fpath = Path(filepath)
    wav = preprocess_wav(fpath)
    if segment:
        wav = wav[int(segment[0] * sampling_rate):int(segment[1] * sampling_rate)]
    return VoiceEncoder().embed_utterance(wav)
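# Hedged usage sketch (paths are placeholders): the fingerprints are L2-normalized
# d-vectors, so their dot product is a cosine similarity close to 1 for the same voice.
fp_a = fingerprint_from_file("speaker_a.wav")
fp_b = fingerprint_from_file("speaker_b.wav", segment=(0, 10))
similarity = float(fp_a @ fp_b)
print(similarity)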
Example #18
def load_asv_model(device):
    model = VoiceEncoder().to(device)
    return model
Example #19
import pickle
import glob
import numpy as np
from resemblyzer import VoiceEncoder
import os

encoder = VoiceEncoder('cpu')
data = {}
classes = {}
i = 0
path = 'data'
for name in os.listdir(path):
    if os.path.isdir(os.path.join(path, name)):
        classes[name] = i
        i += 1


def preprocess(name, embedding):
    global data
    embedded = encoder.embed_utterance(embedding)
    embedded = np.array(embedded).reshape(1, 256)
    if name not in data:
        data[name] = [embedded]
    else:
        data[name].append(embedded)


for filepath in glob.glob('*.pickle'):
    name = filepath.split('_')[0]
    with open(filepath, 'rb') as handle:
        embedding = pickle.load(handle)
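    # Hedged completion (assumption): feed each loaded array to preprocess() so the
    # per-speaker embedding table defined above gets filled.
    preprocess(name, embedding)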
Example #20
          "%.1fGb total memory.\n" % 
          (torch.cuda.device_count(),
           device_id,
           gpu_properties.name,
           gpu_properties.major,
           gpu_properties.minor,
           gpu_properties.total_memory / 1e9))
    

    ## Load the models one by one.
    print("Preparing the synthesizer and the vocoder...")
    synthesizer = Synthesizer(synth_path.joinpath("taco_pretrained"), low_mem=False)
    vocoder.load_model(vocoder_path)
    
    print("Loading encoder from resemblyzer")
    encoder = VoiceEncoder()
    
    # Get the reference audio repo path
    speaker = 'SAM'
    repo_fpath = Path('../SOURCE_AUDIO',speaker)
    wav_fpaths = list(repo_fpath.glob(speaker+"*"))
    print('PAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAATHS')
    print(repo_fpath)
    print(wav_fpaths)
    
    wavs = np.array(list(map(preprocess_wav, tqdm(wav_fpaths, "Preprocessing wavs", len(wav_fpaths)))))
    speaker_embedding = encoder.embed_speaker(wavs)
    
    text = str(np.loadtxt('../test_sentence.txt', dtype='str', delimiter = '&'))
   
    texts = [text]
Example #21
from itertools import groupby
from pathlib import Path
from tqdm import tqdm
import matplotlib.pyplot as plt
import numpy as np


# The demos are ordered so as to make the explanations in the comments consistent. If you only 
# care about running the code, then you don't have to follow that order.


# The neural network will automatically use CUDA if it's available on your machine, otherwise it 
# will use the CPU. You can enforce a device of your choice by passing its name as argument to the 
# constructor. The model might take a few seconds to load with CUDA, but it then executes very 
# quickly.
encoder = VoiceEncoder()

# We'll use a smaller version of the dataset LibriSpeech test-other to run our examples. This 
# smaller dataset contains 10 speakers with 10 utterances each. N.B. "wav" in variable names stands
# for "waveform" and not the wav file extension.
wav_fpaths = list(Path("audio_data", "librispeech_test-other").glob("**/*.flac"))
# Group the wavs per speaker and load them using the preprocessing function provided with 
# resemblyzer to load wavs in memory. It normalizes the volume, trims long silences and resamples 
# the wav to the correct sampling rate.
speaker_wavs = {speaker: list(map(preprocess_wav, wav_fpaths)) for speaker, wav_fpaths in
                groupby(tqdm(wav_fpaths, "Preprocessing wavs", len(wav_fpaths), unit="wavs"), 
                        lambda wav_fpath: wav_fpath.parent.stem)}


## Similarity between two utterances from each speaker
# Embed two utterances A and B for each speaker
Example #22
    "--num",
    help="Num of speakers(optional)",
    default=None,
    type=int,
)

parser.add_argument("--interactive", dest="interactive", action="store_true")
parser.add_argument("--no-interactive", dest="interactive", action="store_false")
parser.set_defaults(interactive=True)

args = parser.parse_args()

audio_path = Path(args.audio)
wav = preprocess_wav(audio_path)

encoder = VoiceEncoder() if args.rate <= 4 else VoiceEncoder("cpu")

# encoder = VoiceEncoder("cpu")
_, cond_emd, wav_splits = encoder.embed_utterance(
    wav,
    return_partials=True,
    rate=args.rate,
)

clusterer = SpectralClusterer(
    min_clusters=args.num,
    p_percentile=0.91,
)


labels = clusterer.predict(cond_emd)