def get_speaker_similarity_dict_and_wav_splits(file_name):
    print('Processing voices for file:', file_name)
    fpath = os.fspath(file_name)
    wav = preprocess_wav(fpath_or_wav=fpath)

    speaker_names = ['Phreak', 'Other']
    segments = [[0, 25], [75, 90]]
    encoder = VoiceEncoder('cpu')
    speaker_wavs = [
        wav[int(s[0] * sampling_rate):int(s[1] * sampling_rate)]
        for s in segments
    ]
    print(
        "Running the continuous embedding on cpu, this might take a while...")
    _, cont_embeds, wav_splits = encoder.embed_utterance(wav,
                                                         return_partials=True,
                                                         rate=16)

    speaker_embeds = [
        encoder.embed_utterance(speaker_wav) for speaker_wav in speaker_wavs
    ]
    similarity_dict = {
        name: cont_embeds @ speaker_embed
        for name, speaker_embed in zip(speaker_names, speaker_embeds)
    }

    return similarity_dict, wav_splits
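
A minimal usage sketch, not part of the original example: it turns the returned similarity curves into a per-slice speaker decision. The file name "interview.wav" and the 0.75 acceptance threshold are illustrative assumptions.

import numpy as np
from resemblyzer import sampling_rate

similarity_dict, wav_splits = get_speaker_similarity_dict_and_wav_splits("interview.wav")
names = list(similarity_dict.keys())
similarities = np.stack([similarity_dict[name] for name in names])  # (n_speakers, n_slices)

for wav_slice, column in zip(wav_splits, similarities.T):
    best = int(np.argmax(column))
    label = names[best] if column[best] > 0.75 else "unknown"  # 0.75 is an assumed threshold
    print("%6.2fs  %s" % (wav_slice.start / sampling_rate, label))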
Example #2
class VoiceIDSys:
    def __init__(self):
        # A user dictionary with name:centroid
        self.users = {}
        self.encoder = VoiceEncoder()
        sd.default.device = 'Microphone (GENERAL WEBCAM)'

    def save_sounds(self, duration=10, fs=44100):
        print('recording now!')
        sample = sd.rec(int(duration * fs), samplerate=fs, channels=2)
        sd.wait()
        print('recording done!')
        path = 'output.wav'
        write(path, fs, sample)
        return sample, path

    def generate_voice_profile(self, data_path):
        embeds = []
        start_dir = os.getcwd()
        os.chdir(data_path)
        for file in os.listdir('.'):
            fpath = Path(os.getcwd() + '\\' + file)
            wav = preprocess_wav(fpath)
            embed = self.encoder.embed_utterance(wav)
            embeds.append(embed)
        centroid = np.array(embeds).mean(axis=0)
        os.chdir(start_dir)  # restore the previous working directory
        return centroid

    def add_user(self, name, centroid):
        self.users[name] = centroid

    def id_subject(self, voicepath, th=0.45):
        fpath = Path(voicepath)
        wav = preprocess_wav(fpath)
        embedding = self.encoder.embed_utterance(wav)
        diff_list = []
        for i in self.users.values():
            diff_list.append(i - embedding)
        norms = np.linalg.norm(diff_list, axis=1)
        print(norms)
        m = min(norms)
        if m > th:
            print('Unauthorized')
            print(m)
            return False
        else:
            user = list(self.users.keys())[np.argmin(norms)]
            print('Hello', user, '!')
            return True
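
A hypothetical end-to-end use of the VoiceIDSys class above; the enrolment folder, user name and recording duration are placeholders, and the microphone named in __init__ must exist on the machine.

system = VoiceIDSys()

# Enroll a user from a folder of short WAV recordings of their voice.
centroid = system.generate_voice_profile(r'C:\voice_data\alice')
system.add_user('alice', centroid)

# Record a fresh sample and check it against the enrolled users.
_, sample_path = system.save_sounds(duration=5)
system.id_subject(sample_path, th=0.45)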
Example #3
def load_speaker_embeds(args):
    encoder = VoiceEncoder()

    speakers_dir = '{0}/{1}/{2}/'.format(args.media, args.name, args.speakers)
    speaker_embeds_list = []
    if os.path.exists(speakers_dir):
        speakers_dir_subfolders = [
            f.path for f in os.scandir(speakers_dir) if f.is_dir()
        ]
        for speakers_dir_subfolder in speakers_dir_subfolders:
            speaker_embeds = []
            wav_file_list = list(
                enumerate(glob.glob(
                    "{}/*.wav".format(speakers_dir_subfolder))))
            for index, wav_file in wav_file_list:
                wav = AudioSegment.from_wav(wav_file)
                librosa_npy = audiosegment_to_librosawav(wav)
                librosa_wav = preprocess_wav(librosa_npy)
                current_embed = encoder.embed_utterance(librosa_wav)
                speaker_embeds.append(current_embed)
            if len(speaker_embeds) > 0:
                dirname = os.path.basename(speakers_dir_subfolder)
                speaker_embeds_list.append((
                    dirname,
                    speaker_embeds,
                ))
    return speaker_embeds_list
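
A small follow-up sketch, not in the original: collapse each speaker's utterance embeddings returned above into a single L2-normalised centroid, the usual form of a reference embedding for later cosine comparisons.

import numpy as np

def build_speaker_centroids(speaker_embeds_list):
    # speaker_embeds_list is the list of (dirname, [embeddings]) tuples returned above.
    centroids = {}
    for dirname, embeds in speaker_embeds_list:
        centroid = np.mean(np.stack(embeds), axis=0)
        centroids[dirname] = centroid / np.linalg.norm(centroid)
    return centroids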
Example #4
    def several_speakers_identification(self,
                                        path,
                                        min_duration=3,
                                        return_splits=False,
                                        export=False,
                                        recognition=False,
                                        language='en-En'):
        self.min_duration = min_duration
        self.path = path
        wav = preprocess_wav(path)
        sf.write(self.wav, wav, 16000, subtype='PCM_24')
        encoder = VoiceEncoder()
        _, embed, slices = encoder.embed_utterance(wav,
                                                   return_partials=True,
                                                   rate=1)
        np.set_printoptions(suppress=True)
        for i in range(len(embed)):
            self.add_speaker(embed[i])
        # for i in range(len(self.timing)):
        #     print(i, self.timing[i])
        self.clear()
        print('Found %d speakers' % self.speakers_number)
        for i in range(self.speakers_number):
            print('Speaker ' + str(i) + ': ' + str(len(self.speakers[i])) +
                  's')
        self.splits = self.get_splits()
        if recognition or export:
            paths = ExportAudio.export(self.splits, self.wav)
            if recognition:
                self.recognize_audio(language, paths, export)
        if return_splits:
            return self.speakers_number, self.splits

        return self.speakers_number
Example #5
def load_data(from_path=None, ckpt_path=None, data_path=None, save_path=None):
    if from_path is None:
        if ckpt_path is None:
            raise Exception('No checkpoint path provided')

        from resemblyzer import preprocess_wav, VoiceEncoder
        from tqdm import tqdm

        device = torch.device('cuda')
        encoder = VoiceEncoder(device=device, loss_device=device)
        encoder.load_ckpt(ckpt_path, device=device)
        encoder.eval()
        wav_fpaths = list(Path(data_path).glob("**/*.flac"))

        # Preprocess and save encoded utterance and label to list
        X = []
        y = []
        for wav_fpath in tqdm(wav_fpaths):
            wav = preprocess_wav(wav_fpath)
            X.append(encoder.embed_utterance(wav).cpu().numpy())
            y.append(wav_fpath.parent.parent.stem)

        # Save for testing
        if save_path is not None:
            np.save(Path(save_path, 'embeds.npy'), X)
            np.save(Path(save_path, 'labels.npy'), y)
        else:
            raise Exception('No save_path provided')
    else:
        X = np.load(Path(from_path, 'embeds.npy'), allow_pickle=True)
        y = np.load(Path(from_path, 'labels.npy'), allow_pickle=True)
    return X, y
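
A hypothetical consumer of load_data: fit a simple classifier on the cached embeddings. The 'cache/' path is a placeholder and scikit-learn is an extra dependency, not implied by the original code.

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

X, y = load_data(from_path='cache/')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
clf = LogisticRegression(max_iter=1000).fit(X_train, y_train)
print('held-out accuracy:', clf.score(X_test, y_test))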
Example #6
def isolate_voice(audio_file_path: Path, embed_path: Path, params_path: Path, output_path: Path):
    """
    load speaker embeds from pickle
    take voice out only if value is > thresh and take greater if both > thresh
    
    Args:
        file_path: input complete wav file path from which rick's voice will be taken out
        cutoff_thresh: voice if value above this is taken
    """
    params = load_params(params_path)
    cutoff_threshold = params["cutoff_threshold"]
    sampling_rate = params["wav_bitrate"]

    print("preprocessing")
    file_wav = preprocess_wav(audio_file_path)
    print("input file shape", file_wav.shape, "\n", file_wav[:10])
    print("file preprocessed")
    encoder = VoiceEncoder("cpu")
    print("model loaded")
    speaker_names = ["Rick", "Morty"]

    _, file_embeds, wav_splits = encoder.embed_utterance(file_wav, return_partials=True, rate=1)
    print("file encoded")
    speaker_embeds = pickle.load(open(embed_path, "rb"))

    similarity_dict = {name: file_embeds @ speaker_embed for name, speaker_embed in zip(speaker_names, speaker_embeds)}
    print("similatrity dict is\n", similarity_dict)
    pickle.dump(similarity_dict, open("./similarity.pkl", "wb"))

    #find greater in both then cutoff -> take that second append it to that file
    current_second = 0
    rick_wav = []
    rick_seconds = []
    morty_wav = []
    morty_seconds = []

    for rick_value, morty_value in zip(similarity_dict["Rick"], similarity_dict["Morty"]):
        print(current_second, rick_value, morty_value)
        if rick_value > morty_value and rick_value > cutoff_threshold:
            rick_wav.append(file_wav[current_second * sampling_rate : (current_second+1) * sampling_rate])
            rick_seconds.append(current_second)
            print("append rick")

        elif morty_value > rick_value and morty_value > cutoff_threshold:
            morty_wav.append(file_wav[current_second * sampling_rate: (current_second+1) * sampling_rate])
            morty_seconds.append(current_second)
            print("append morty")

        else:
            print("skipping")

        current_second += 1

    rick_wav = [item for sublist in rick_wav for item in sublist]
    morty_wav = [item for sublist in morty_wav for item in sublist]
    
    save_wav(np.array(rick_wav), output_path.joinpath("rick.wav"), sampling_rate)
    save_wav(np.array(morty_wav), output_path.joinpath("morty.wav"), sampling_rate)

    return rick_seconds, morty_seconds
Example #7
def process(wav_fpath):
    wav = preprocess_wav(wav_fpath)
    encoder = VoiceEncoder("cpu")
    _, cont_embeds, wav_splits = encoder.embed_utterance(wav, return_partials=True, rate=16)

    # Output denoised wave after removing the pauses
    write('DenoisedInputFiles/DenoisedSignal.wav', 16000, wav)
    return cont_embeds, wav_splits
Example #8
    def oneDictorIdentification(self, cSample, mainFile):
        print('[-i] --> Identify Dictor')
        avg1 = 0.0
        avg2 = 0.0

        fpath = Path(cSample)
        wav = preprocess_wav(fpath)

        encoder = VoiceEncoder()
        embed = encoder.embed_utterance(wav)
        np.set_printoptions(precision=3, suppress=True)
        embedNew = []

        for i in embed:
            if i != 0.0:
                embedNew.append(i)

        for s in embedNew:
            avg1 = avg1 + s

        fpath = Path(mainFile)
        wav = preprocess_wav(fpath)

        encoder = VoiceEncoder()
        embed = encoder.embed_utterance(wav)
        np.set_printoptions(precision=3, suppress=True)
        embedNew2 = []

        for i in embed:
            if i != 0.0:
                embedNew2.append(i)

        for s in embedNew2:
            avg2 = avg2 + s

        self.result = abs((avg2 / len(embedNew2)) - (avg1 / len(embedNew)))
        print(self.result)
        if (self.result < 0.002):
            print("Match!")
            # print("\033[33m\033[1m {}".format("Match!"))
            return 1
        else:
            print("These are different voices")
            # print("\033[33m\033[1m {}".format("These are different voices"))
            return 0
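
The comparison above averages raw embedding components. Shown purely as an illustrative alternative, not the original author's method: since GE2E embeddings are L2-normalised, their dot product is the cosine similarity, and thresholding it is the more conventional check. The 0.80 threshold is an assumption to tune.

def dictor_cosine_similarity(sample_path, main_path, threshold=0.80):
    encoder = VoiceEncoder()
    embed_a = encoder.embed_utterance(preprocess_wav(Path(sample_path)))
    embed_b = encoder.embed_utterance(preprocess_wav(Path(main_path)))
    similarity = float(embed_a @ embed_b)  # cosine similarity of L2-normed embeddings
    return similarity, similarity > threshold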
Example #9
class Voice:
    def __init__(self):
        self.encoder = VoiceEncoder()
        self.database = {}
    def add_data(self,v,name):
        '''
        v: wav file path
        name : str
        '''
        if name in self.database:
            print("person exist")
            return False
        wav = preprocess_wav(Path(v))
        self.database[name] = self.encoder.embed_utterance(wav)

        return True


    def voice_com(self,v1):
        '''
        v1: wav file path

        return True if speaker in database
        '''
        if len(self.database.values()) == 0:
            print("Your data not in our database.")
            return False
        wav = preprocess_wav(Path(v1))

        # ## method 1
        # embed1 = self.encoder.embed_speaker(wav1)
        # embed2 = self.encoder.embed_speaker(wav2)
        # sims1 = np.inner(embed1,embed2) # bigger 0.85

        ## method 2
        embed = self.encoder.embed_utterance(wav)

        for dk in self.database.keys():
            sims = embed @ self.database[dk] # bigger 0.75
            if sims > 0.75:
                print("welcome {}!".format(dk))
                return True
        print("Your data not in our database.")
        return False
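
Hypothetical usage of the Voice class; the wav paths and the user name are placeholders.

v = Voice()
v.add_data("enroll_alice.wav", "alice")
v.voice_com("unknown_caller.wav")  # prints "welcome alice!" if the similarity exceeds 0.75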
Example #10
def vectorExtract(audio1_path, audio2_path):
    wav1 = preprocess_wav(Path(audio1_path))
    wav2 = preprocess_wav(Path(audio2_path))

    encoder = VoiceEncoder()

    embed1 = encoder.embed_utterance(wav1)
    embed2 = encoder.embed_utterance(wav2)

    return numpy.concatenate([embed1, embed2])
Example #11
def process(wav_fpath):
    wav = preprocess_wav(wav_fpath)
    encoder = VoiceEncoder()
    _, cont_embeds, wav_splits = encoder.embed_utterance(wav,
                                                         return_partials=True,
                                                         rate=16)

    # Output denoised wave after removing the pauses
    write('Denoise/Denoise_commercial_mono.wav', 16000, wav)
    return cont_embeds, wav_splits
Example #12
def simVoice(audio1_path, audio2_path):
    wav1 = preprocess_wav(Path(audio1_path))
    wav2 = preprocess_wav(Path(audio2_path))

    encoder = VoiceEncoder()

    embed1 = encoder.embed_utterance(wav1)
    embed2 = encoder.embed_utterance(wav2)

    return dot(embed1, embed2) / (norm(embed1) * norm(embed2))  # np.inner(embed1, embed2)
Example #13
def extract(data_dirs, output_dir):
    """Extract embedding by resemblyzer."""
    encoder = VoiceEncoder()

    data = {}
    for data_dir in tqdm(data_dirs, position=0):
        file_list = librosa.util.find_files(data_dir)
        for file_path in tqdm(file_list, position=1, leave=False):
            wav = preprocess_wav(file_path)
            embedding = encoder.embed_utterance(wav)
            wav_name = splitext(basename(file_path))[0]
            data[wav_name] = embedding

    joblib.dump(data, f"{output_dir}.pkl")
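
A quick read-back of the dump above, assuming the same output_dir prefix was used (the "embeddings" prefix here is a placeholder).

import joblib

data = joblib.load("embeddings.pkl")  # {wav_name: 256-dim embedding}
print(len(data), "embeddings loaded")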
Example #14
def get_spk_emb(audio_file_dir, segment_len=960000):
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    resemblyzer_encoder = VoiceEncoder(device=device)

    wav = preprocess_wav(audio_file_dir)
    n_segments = max(1, len(wav) // segment_len)  # segment_len = 16000 * 60, i.e. one minute at 16 kHz
    all_embeds = []
    for i in range(n_segments):
        mean_embeds, cont_embeds, wav_splits = resemblyzer_encoder.embed_utterance(
            wav[segment_len * i:segment_len* (i + 1)], return_partials=True, rate=2)
        all_embeds.append(mean_embeds)
    all_embeds = np.array(all_embeds)
    mean_embed = np.mean(all_embeds, axis=0)

    return mean_embed, all_embeds
Example #15
def get_top_similar(file, top):
    top_names = []

    wav = preprocess_wav(file)
    encoder = VoiceEncoder()
    embed = encoder.embed_utterance(wav)

    cs = cosine_similarity(X=embeded_voices, Y=embed.reshape(1, -1))
    cs_sorted = np.argsort(cs, axis=0)[::-1][:, 0]
    top_similarity = cs[cs_sorted[:top]]
    top_ids = ids[cs_sorted[:top]]

    for each in top_ids:
        top_names.append(mydict[each])

    return top_names, top_similarity, top_ids
Example #16
class VQVCModel(BaseModel):
    def __init__(self, params):
        super().__init__(params)

        self.encoder = Encoder(params)
        self.decoder = Decoder(params)

        self.speaker_encoder = VoiceEncoder()
        self.freeze(self.speaker_encoder)

    def forward(self, wavs, mels):
        emb = self._make_speaker_vectors(wavs, mels.device)
        q_afters, diff = self.encoder(mels)
        dec = self.decoder(q_afters, emb)
        return dec, diff

    def inference(self, src_path: str, tgt_path: str):
        wav_src, wav_tgt, mel_src = self._preprocess(src_path, tgt_path)

        emb = self._make_speaker_vectors([wav_tgt], mel_src.device)

        q_afters, _ = self.encoder(mel_src)
        dec = self.decoder(q_afters, emb)

        wav = self._mel_to_wav(dec)
        return wav

    def _make_speaker_vectors(self, wavs, device):
        c = [self.speaker_encoder.embed_utterance(x) for x in wavs]
        c = torch.tensor(c, dtype=torch.float, device=device)
        return c

    def _preprocess(self, src_path: str, tgt_path: str):
        wav_src, mel_src = get_wav_mel(src_path)
        wav_tgt, _ = get_wav_mel(tgt_path)
        mel_src = self._preprocess_mel(mel_src)
        return wav_src, wav_tgt, mel_src

    def _preprocess_mel(self, mel):
        if self.is_normalize:
            mel = normalize(mel)
        mel = self._adjust_length(mel, freq=4)
        mel = self.unsqueeze_for_input(mel)
        return mel
Example #17
def extract_audio_embedding(paths, dest_path):
    """
    Extract audio embedding from each wav file in paths.
    :param paths: list of paths to wav audio files
    :param dest_path: output path in which save the df.pickle file
    :return:
    """
    # Define the audio encoder. This is a demanding task, but it is kept in a single process because CUDA contexts do not combine well with Python multiprocessing.
    encoder = VoiceEncoder()
    for path in tqdm(paths):
        filename = os.path.basename(path).split('.')[0]
        # Extract wav features
        wav = preprocess_wav(path)
        # Actually creates audio embedding
        embed = encoder.embed_utterance(wav)
        df = pd.DataFrame(columns=['filename', 'audio_embedding'])
        # Put info inside dataframe. Keep the name as "filename.mp4" so we keep compatibility with video embeddings
        df.loc[0] = [filename + '.mp4', embed]
        df.to_pickle(os.path.join(dest_path, filename + '.csv'))
Example #18
def extract(data_dirs, output_dir):
    """Extract embedding by resemblyzer."""
    encoder = VoiceEncoder()
    os.makedirs(output_dir, exist_ok=True)

    for data_dir in tqdm(data_dirs, position=0):
        speaker_list = [
            speaker_name for speaker_name in os.listdir(data_dir)
            if os.path.isdir(join_path(data_dir, speaker_name))
        ]
        for speaker_name in tqdm(speaker_list, position=1, leave=False):
            data = []
            file_list = librosa.util.find_files(
                join_path(data_dir, speaker_name))
            for file_path in tqdm(file_list, position=2, leave=False):
                wav = preprocess_wav(file_path)
                embedding = encoder.embed_utterance(wav)
                wav_name = splitext(basename(file_path))[0]
                data.append({"filename": wav_name, "embedding": embedding})
            if len(data) == 0:
                continue
            joblib.dump(data, join_path(output_dir, f"{speaker_name}.pkl"))
Example #19
class Predictor():
    def __init__(self,
                 clf_ckpt_path='exp/clv/mlp/mlp_best_val_loss.pt',
                 enc_ckpt_path='ckpt/pretrained.pt',
                 device=torch.device('cuda'),
                 num_class=381,
                 verbose=False):
        start = timer()
        self.encoder = VoiceEncoder(device=device, loss_device=device)
        self.encoder.load_ckpt(enc_ckpt_path, device)
        self.encoder.eval()
        self.classifier = MLP(num_class=num_class)
        self.classifier.load_ckpt(clf_ckpt_path, device)
        self.classifier.eval()
        if verbose:
            print(
                f'Encoder and classifier models loaded successfully in {timer() - start}s'
            )

    def preprocess(self, f):
        """
        Applies the preprocessing operations used in training the speaker encoder to a waveform
        either on disk or in memory. The waveform will be resampled to match the data hyperparameters.

        :param f: either a filepath to an audio file or the waveform as a numpy array of floats.
        """
        return preprocess_wav(f)

    def predict(self, audio, topk=2):
        """
        Predict the topk classes with the highest probabilities.

        :param audio: preprocessed waveform.
        :param topk: Keep topk classes with highest probabilities.
        """
        embed = self.encoder.embed_utterance(audio)
        inp = embed.unsqueeze(dim=0)
        top_probs, top_classes = self.classifier.predict(inp, topk=topk)
        return top_probs, top_classes
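
Hypothetical usage of Predictor; the default checkpoint paths in the constructor and the audio path below are assumptions tied to the original project layout.

predictor = Predictor(verbose=True)
audio = predictor.preprocess('some_utterance.flac')
top_probs, top_classes = predictor.predict(audio, topk=5)
print(top_classes, top_probs)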
Example #20
def get_speaker_segments(args, audio_file, segments_file):
    if os.path.exists(segments_file):
        with open(segments_file) as json_file:
            json_data = json.load(json_file)
            return [(
                monologue["speaker"]["id"],
                monologue["start"] * 1000,
                monologue["end"] * 1000,
            ) for monologue in json_data["monologues"]]
    else:
        from resemblyzer import preprocess_wav, VoiceEncoder
        encoder = VoiceEncoder()
        speaker_embeds = []

        segments = get_segments(args, audio_file, segments_file)

        speaker_segments = []
        for start, end in segments:
            clip = audio_file[start:end]
            segment_npy = audiosegment_to_librosawav(clip)
            segment_wav = preprocess_wav(segment_npy)
            current_embed = encoder.embed_utterance(segment_wav)
            is_any_similar = False

            min_similarity = 0.85
            name_id = len(speaker_embeds)
            for index, speaker_embed in enumerate(speaker_embeds):
                similarity = current_embed @ speaker_embed

                if similarity > min_similarity:
                    min_similarity = similarity
                    name_id = index
                    is_any_similar = True

            if not is_any_similar:
                speaker_embeds.append(current_embed)
            speaker_segments.append((name_id, [start, end]))

        return speaker_segments
Example #21
def clustering(wav):
    encoder = VoiceEncoder("cpu")
    _, cont_embeds, wav_splits = encoder.embed_utterance(
        wav, return_partials=True, rate=16)  #create d-vectors
    clusterer = SpectralClusterer(min_clusters=2,
                                  max_clusters=100,
                                  p_percentile=0.90,
                                  gaussian_blur_sigma=1)

    labels = clusterer.predict(cont_embeds)
    times = [((s.start + s.stop) / 2) / sampling_rate for s in wav_splits]
    labelling = []
    start_time = 0

    for i, time in enumerate(times):
        if i > 0 and labels[i] != labels[i - 1]:
            temp = ['speaker ' + str(labels[i - 1]), start_time, time]
            labelling.append(tuple(temp))
            start_time = time
        if i == len(times) - 1:
            temp = ['speaker ' + str(labels[i]), start_time, time]
            labelling.append(tuple(temp))
    return labelling  #str
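
A minimal usage sketch for clustering(); "meeting.wav" is a placeholder, and sampling_rate is assumed to be imported from resemblyzer in the surrounding module.

from resemblyzer import preprocess_wav

wav = preprocess_wav("meeting.wav")
for speaker, start, end in clustering(wav):
    print("%s: %.1fs - %.1fs" % (speaker, start, end))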
Example #22
def get_embedding_GE2E(filename):

    # Note: resemblyzer's encoder expects 16 kHz audio; loading at sr=22050 is kept as in
    # the original, but preprocess_wav(filename) would resample to the expected rate.
    wav, _ = librosa.load(str(filename), sr=22050)
    encoder = VoiceEncoder(device="cpu")
    emb = encoder.embed_utterance(wav)
    return emb
Example #23
speaker_wavs = [
    wav[int(s[0] * sampling_rate):int(s[1] * sampling_rate)] for s in segments
]

## Compare speaker embeds to the continuous embedding of the interview
# Derive a continuous embedding of the interview. We put a rate of 16, meaning that an
# embedding is generated every 0.0625 seconds. It is good to have a higher rate for speaker
# diarization, but it is not so useful for when you only need a summary embedding of the
# entire utterance. A rate of 2 would have been enough, but 16 is nice for the sake of the
# demonstration.
# We'll exceptionally force to run this on CPU, because it uses a lot of RAM and most GPUs
# won't have enough. There's a speed drawback, but it remains reasonable.
encoder = VoiceEncoder("cpu")
print("Running the continuous embedding on cpu, this might take a while...")
_, cont_embeds, wav_splits = encoder.embed_utterance(wav,
                                                     return_partials=True,
                                                     rate=16)

# Get the continuous similarity for every speaker. It amounts to a dot product between the
# embedding of the speaker and the continuous embedding of the interview
speaker_embeds = [
    encoder.embed_utterance(speaker_wav) for speaker_wav in speaker_wavs
]
similarity_dict = {
    name: cont_embeds @ speaker_embed
    for name, speaker_embed in zip(speaker_names, speaker_embeds)
}

## Run the interactive demo
interactive_diarization(similarity_dict, wav, wav_splits)
Example #24
# We'll use a smaller version of the dataset LibriSpeech test-other to run our examples. This 
# smaller dataset contains 10 speakers with 10 utterances each. N.B. "wav" in variable names stands
# for "waveform" and not the wav file extension.
wav_fpaths = list(Path("audio_data", "librispeech_test-other").glob("**/*.flac"))
# Group the wavs per speaker and load them using the preprocessing function provided with 
# resemblyzer to load wavs in memory. It normalizes the volume, trims long silences and resamples 
# the wav to the correct sampling rate.
speaker_wavs = {speaker: list(map(preprocess_wav, wav_fpaths)) for speaker, wav_fpaths in
                groupby(tqdm(wav_fpaths, "Preprocessing wavs", len(wav_fpaths), unit="wavs"), 
                        lambda wav_fpath: wav_fpath.parent.stem)}


## Similarity between two utterances from each speaker
# Embed two utterances A and B for each speaker
embeds_a = np.array([encoder.embed_utterance(wavs[0]) for wavs in speaker_wavs.values()])
embeds_b = np.array([encoder.embed_utterance(wavs[1]) for wavs in speaker_wavs.values()])
# Each array is of shape (num_speakers, embed_size) which should be (10, 256) if you haven't 
# changed anything.
print("Shape of embeddings: %s" % str(embeds_a.shape))

# Compute the similarity matrix. The similarity of two embeddings is simply their dot 
# product, because the similarity metric is the cosine similarity and the embeddings are 
# already L2-normed.
# Short version:
utt_sim_matrix = np.inner(embeds_a, embeds_b)
# Long, detailed version:
utt_sim_matrix2 = np.zeros((len(embeds_a), len(embeds_b)))
for i in range(len(embeds_a)):
    for j in range(len(embeds_b)):
        # The @ notation is exactly equivalent to np.dot(embeds_a[i], embeds_b[j])
        utt_sim_matrix2[i, j] = embeds_a[i] @ embeds_b[j]
Example #25
# from matplotlib import cm
# from time import sleep, perf_counter as timer
# from umap import UMAP
# import matplotlib.pyplot as plt

sys.path.append("Resemblyzer")
from resemblyzer import preprocess_wav, VoiceEncoder, sampling_rate  # noqa

# %%
# Load file
wav = preprocess_wav("Resemblyzer/audio_data/X2zqiX6yL3I.mp3")

# %%
# Audio features
encoder = VoiceEncoder("cpu")
_, cont_embeds, wav_splits = encoder.embed_utterance(wav, return_partials=True, rate=5)

# %%
# Load UIS-RNN model
sys.argv = ['dummy']
model_args, training_args, inference_args = uisrnn.parse_arguments()
model = uisrnn.UISRNN(model_args)
model.load('uis-rnn/saved_model.uisrnn')

# %%
# Testing
test_sequence = cont_embeds.astype(float)
predictions = model.predict(test_sequence, inference_args)

# %%
Example #26
class AutoVCModel(BaseModel):
    def __init__(self, params):
        super().__init__(params)

        self.encoder = Encoder(params.model.dim_neck, params.speaker_emb_dim,
                               params.model.freq)
        self.decoder = Decoder(params.model.dim_neck, params.speaker_emb_dim,
                               params.model.dim_pre)
        self.postnet = Postnet()

        self.style_encoder = VoiceEncoder()
        self.freeze(self.style_encoder)

    def forward(self, wavs, mels):
        c_src = self._make_speaker_vectors(wavs, mels.size(-1), mels.device)

        codes, mel_outputs, mel_outputs_postnet = self._forward(mels, c_src)

        return (
            mel_outputs,  # decoder output
            mel_outputs_postnet,  # postnet output
            torch.cat(codes, dim=-1),  # encoder output
            torch.cat(self.encoder(mel_outputs_postnet, c_src),
                      dim=-1)  # encoder output using postnet output
        )

    def inference(self, src_path: str, tgt_path: str):
        wav_src, wav_tgt, mel_src = self._preprocess(src_path, tgt_path)

        c_src = self._make_speaker_vectors([wav_src], mel_src.size(-1),
                                           mel_src.device)
        c_tgt = self._make_speaker_vectors([wav_tgt], mel_src.size(-1),
                                           mel_src.device)

        _, _, mel_outputs_postnet = self._forward(mel_src, c_src, c_tgt)

        wav = self._mel_to_wav(mel_outputs_postnet)
        return wav

    def _forward(self, mels, c_src, c_tgt=None):
        codes = self.encoder(mels, c_src)
        # almost equivalent to torch.nn.functional.interpolate
        code_exp = torch.cat([
            c.unsqueeze(-1).expand(-1, -1,
                                   mels.size(-1) // len(codes)) for c in codes
        ],
                             dim=-1)

        # (Batch, Mel-bin, Time) => (Batch, Time, Mel-bin) for LSTM
        decoder_input = torch.cat(
            (code_exp, c_src if c_tgt is None else c_tgt),
            dim=1).transpose(1, 2)

        mel_outputs = self.decoder(decoder_input)

        mel_outputs_postnet = self.postnet(mel_outputs)
        mel_outputs_postnet = mel_outputs + mel_outputs_postnet

        return codes, mel_outputs, mel_outputs_postnet

    def _make_speaker_vectors(self, wavs, time_size, device):
        c = [self.style_encoder.embed_utterance(x) for x in wavs]
        c = torch.tensor(c, dtype=torch.float, device=device)
        c = c[:, :, None].expand(-1, -1, time_size)
        return c

    def _preprocess(self, src_path: str, tgt_path: str):
        wav_src, mel_src = get_wav_mel(src_path)
        wav_tgt, _ = get_wav_mel(tgt_path)
        mel_src = self._preprocess_mel(mel_src)
        return wav_src, wav_tgt, mel_src

    def _preprocess_mel(self, mel):
        if self.is_normalize:
            mel = normalize(mel)
        mel = self._adjust_length(mel, self.freq)
        mel = self.unsqueeze_for_input(mel)
        return mel
Example #27
def audio_analyze():

    delimiter = utils.get_delimiter()
    data_directory = utils.get_data_dir()
    audio_directory = data_directory + delimiter + 'audio'

    AUDIO_FILE = data_directory + delimiter + 'students-output-audio.wav'

    myaudio = AudioSegment.from_file(AUDIO_FILE, "wav")
    chunk_length_ms = 10000  # pydub works in milliseconds
    chunks = make_chunks(myaudio,
                         chunk_length_ms)  # Split the audio file into chunks

    # Remove old chunks
    for filename in glob.glob(audio_directory + delimiter + 'chunk*'):
        os.remove(filename)

    # Create new chunks
    for i, chunk in enumerate(chunks):
        chunk_name = audio_directory + delimiter + "chunk{0}.wav".format(i)
        chunk.export(chunk_name, format="wav")

    with open(data_directory + delimiter + 'audio_wpm_csv.csv',
              mode='w',
              newline='') as audio_wpm_csv:
        audio_wpm_csv_writer = csv.writer(audio_wpm_csv,
                                          delimiter=',',
                                          quotechar='"',
                                          quoting=csv.QUOTE_MINIMAL)
        audio_wpm_csv_writer.writerow(['Second', 'WPM', 'Text'])

    ##------------------resemblyzer---------#####
    ## Open source project from https://github.com/resemble-ai/Resemblyzer

    ## Get reference audios
    wav = preprocess_wav(AUDIO_FILE)

    # Cut some segments from single speakers as reference audio.
    # Speaker times are in seconds: [beginning, end].
    # Multiple speakers can be diarized (e.g. students and professor).
    # Segments and speaker names must be listed in matching order.
    segments = [[1, 5]]
    speaker_names = ["Professor"]

    # This assumes the speaker portion was appended to the beginning of the audio
    # file, but it could also be passed in as a separate file.
    speaker_wavs = [wav[int(s[0] * 16000):int(s[1] * 16000)] for s in segments]

    # A rate of 16 means an embedding every 0.0625 seconds.
    # A higher rate is better for speaker diarization, but not so useful when you only
    # need a summary embedding of the whole utterance.
    # This is forced onto the CPU because it uses a lot of RAM and most GPUs won't have
    # enough. There's a speed drawback, but it remains reasonable.
    # The rate also determines how many partial embeddings populate the array.
    # For example, a 14-second audio file gave ~239 readings at 0.0625 s each.
    encoder = VoiceEncoder("cpu")
    logger.info("Continuous embedding running on cpu...")
    _, cont_embeds, wav_splits = encoder.embed_utterance(wav,
                                                         return_partials=True,
                                                         rate=16)

    # Get the continuous similarity for every speaker. This is a dot product between the
    # embedding of the speaker and the continuous embedding of the whole audio file
    speaker_embeds = [
        encoder.embed_utterance(speaker_wav) for speaker_wav in speaker_wavs
    ]
    similarity_dict = {
        name: cont_embeds @ speaker_embed
        for name, speaker_embed in zip(speaker_names, speaker_embeds)
    }

    # Produce 'audio_diarize_csv.csv'
    diarize_file = data_directory + delimiter + 'audio_diarize_csv.csv'

    populate_speaker(diarize_file, similarity_dict)

    ## Run the interactive demo
    # interactive_diarization(similarity_dict, wav, wav_splits)
    ##--------------------- end resemblyzer----------###

    now = datetime.now()

    ############ Testing Sphinx ##########
    i = 0
    second_count = 10
    logger.info("Sphinx recognizer with chunks")
    for chunk in chunks:
        filename = audio_directory + delimiter + 'chunk' + str(i) + '.wav'
        logger.info("Processing chunk...")
        file = filename
        r = sr.Recognizer()
        with sr.AudioFile(file) as source:
            #r.adjust_for_ambient_noise(source)
            audio_listened = r.record(source)
        try:
            rec = r.recognize_sphinx(audio_listened)
            logger.info(rec)
            word_count = str(rec).split()
            with open(data_directory + delimiter + 'audio_wpm_csv.csv',
                      mode='a',
                      newline='') as audio_wpm_csv:
                audio_wpm_csv_writer = csv.writer(audio_wpm_csv,
                                                  delimiter=',',
                                                  quotechar='"',
                                                  quoting=csv.QUOTE_MINIMAL)
                audio_wpm_csv_writer.writerow(
                    [str(second_count),
                     len(word_count), rec])
        except sr.UnknownValueError:
            logger.info("Sphinx could not understand audio")
        except sr.RequestError as e:
            logger.info("Sphinx error: {}".format(e))
        except:
            rec = r.recognize_sphinx(audio_listened, show_all=True)
            word_count = str(rec).split()
            with open(data_directory + delimiter + 'audio_wpm_csv.csv',
                      mode='a',
                      newline='') as audio_wpm_csv:
                audio_wpm_csv_writer = csv.writer(audio_wpm_csv,
                                                  delimiter=',',
                                                  quotechar='"',
                                                  quoting=csv.QUOTE_MINIMAL)
                audio_wpm_csv_writer.writerow(
                    [str(second_count),
                     len(word_count), rec])
        i += 1
        second_count += 10

    ######### End sphinx test ##########

    # Print time delta without decimals
    process_duration = str(datetime.now() - now).split('.')[0]
    duration = 'Sphinx took ' + process_duration + ' to process'
    logger.info(duration)
Example #28
wav_fpaths = list(
    Path("audio_data", "librispeech_test-other").glob("**/*.flac"))
# Group the wavs per speaker and load them using the preprocessing function provided with
# resemblyzer to load wavs in memory. It normalizes the volume, trims long silences and resamples
# the wav to the correct sampling rate.
speaker_wavs = {
    speaker: list(map(preprocess_wav, wav_fpaths))
    for speaker, wav_fpaths in groupby(
        tqdm(wav_fpaths, "Preprocessing wavs", len(wav_fpaths), unit="wavs"),
        lambda wav_fpath: wav_fpath.parent.stem)
}

## Similarity between two utterances from each speaker
# Embed two utterances A and B for each speaker
embeds_a = np.array(
    [encoder.embed_utterance(wavs[0]) for wavs in speaker_wavs.values()])
embeds_b = np.array(
    [encoder.embed_utterance(wavs[1]) for wavs in speaker_wavs.values()])
# Each array is of shape (num_speakers, embed_size) which should be (10, 256) if you haven't
# changed anything.
print("Shape of embeddings: %s" % str(embeds_a.shape))

# Compute the similarity matrix. The similarity of two embeddings is simply their dot
# product, because the similarity metric is the cosine similarity and the embeddings are
# already L2-normed.
# Short version:
utt_sim_matrix = np.inner(embeds_a, embeds_b)
# Long, detailed version:
utt_sim_matrix2 = np.zeros((len(embeds_a), len(embeds_b)))
for i in range(len(embeds_a)):
    for j in range(len(embeds_b)):
        # The @ operator is the dot product of the two L2-normed embeddings, i.e. their cosine similarity
        utt_sim_matrix2[i, j] = embeds_a[i] @ embeds_b[j]
Example #29
    basename = os.path.basename(wav_path).split('.wav')[0]
    idx = people[int(basename[-7:-4])]
    wav = audio.load_wav(wav_path)
    wav = wav / np.abs(wav).max() * hparams.hparams.rescaling_max

    mel_spectrogram = audio.melspectrogram(wav).astype(np.float32).T

    tmp = mel_spectrogram
    result = np.zeros((WAV_LEN, 80))
    result[:min(tmp.shape[0], WAV_LEN), :tmp.shape[1]] = \
        tmp[:min(tmp.shape[0], WAV_LEN), :tmp.shape[1]]

    mels[idx].append(result)

    obj = preprocess_wav(wav_path)
    emb = encoder.embed_utterance(obj)
    style_list[idx].append(emb)

with open(os.path.join(write_path, 'data.pkl'), 'wb') as handle:
    pickle.dump(mels, handle)

print("finish 'data.pkl' !!!")
'''
for idx in style_list:
	for s in style_list[idx]:
		for i in range(256):
			style[idx][i] += s[i]
	for i in range(256):
		style[idx][i] = style[idx][i] / len(style_list[idx])
'''
Example #30
import os
from resemblyzer import VoiceEncoder, preprocess_wav
from pathlib import Path
import numpy as np
from sklearn.linear_model import Perceptron

os.chdir('C:\\Users\\nrdas\\Downloads\\voiceID\\data\\nitish')
embeds = []
encoder = VoiceEncoder()

for file in os.listdir('.'):
    fpath = Path(os.getcwd() + '\\' + file)
    wav = preprocess_wav(fpath)
    embed = encoder.embed_utterance(wav)
    embeds.append(embed)

embeds2 = []
os.chdir('C:\\Users\\nrdas\\Downloads\\voiceID\\data\\unauthorized')
for file in os.listdir('.'):
    fpath = Path(os.getcwd() + '\\' + file)
    wav = preprocess_wav(fpath)
    embed = encoder.embed_utterance(wav)
    embeds2.append(embed)

centroid = np.array(embeds).mean(axis=0)
diff_dists = embeds2 - centroid
sim_dists = embeds - centroid
sim_dists_norm = np.linalg.norm(sim_dists, axis=1)
diff_dists_norm = np.linalg.norm(diff_dists, axis=1)
print(sim_dists_norm)
print(diff_dists_norm)
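
An illustrative follow-up, not in the original script: reuse the centroid to accept or reject a new recording. 'new_sample.wav' and the 0.45 distance threshold are assumptions to tune.

new_embed = encoder.embed_utterance(preprocess_wav(Path('new_sample.wav')))
distance = np.linalg.norm(new_embed - centroid)
print('authorized' if distance < 0.45 else 'unauthorized', distance)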