예제 #1
0
    def __init__(self, working_dir: str, max_length: int, model: DeepSpeakerModel):
        """Initialise the batcher state and warm up the triplet history.

        Args:
            working_dir: Directory passed to ``Audio`` as its cache dir and
                used as the parent of the ``debug_batcher.json`` file path.
            max_length: Stored as ``self.max_length``; not otherwise used in
                this constructor.
            model: Stored as ``self.model``; not otherwise used in this
                constructor.
        """
        self.working_dir = working_dir
        self.audio = Audio(cache_dir=working_dir)
        logger.info(f'Picking audio from {working_dir}.')
        # Train / test lookups built from the same Audio object.
        # Presumably speaker id -> utterances — TODO confirm against train_test_sp_to_utt.
        self.sp_to_utt_train = train_test_sp_to_utt(self.audio, is_test=False)
        self.sp_to_utt_test = train_test_sp_to_utt(self.audio, is_test=True)
        self.max_length = max_length
        self.model = model
        self.nb_per_speaker = 2
        self.nb_speakers = 640
        self.history_length = 4
        self.history_every = 100  # batches.
        # Capacity of the history buffers below.
        self.total_history_length = self.nb_speakers * self.nb_per_speaker * self.history_length  # 640 * 2 * 4 = 5,120
        # Starts empty; it is not updated anywhere in this constructor.
        self.metadata_train_speakers = Counter()
        self.metadata_output_file = os.path.join(self.working_dir, 'debug_batcher.json')

        # Bounded buffers: a deque with maxlen discards its oldest entries
        # once total_history_length items are held.
        self.history_embeddings_train = deque(maxlen=self.total_history_length)
        self.history_utterances_train = deque(maxlen=self.total_history_length)
        self.history_model_inputs_train = deque(maxlen=self.total_history_length)

        # Left unset here; presumably populated by update_triplets_history() — TODO confirm.
        self.history_embeddings = None
        self.history_utterances = None
        self.history_model_inputs = None

        self.batch_count = 0
        # Call update_triplets_history() history_length times so the history
        # is initialised before the batcher is used.
        for _ in tqdm(range(self.history_length), desc='Initializing the batcher'):  # init history.
            self.update_triplets_history()
예제 #2
0
    def generate_per_phase(self,
                           max_length=NUM_FRAMES,
                           num_per_speaker=3000,
                           is_test=False):
        """Build the three Keras-format arrays for one phase (train or test).

        For every speaker in ``self.audio.speaker_ids``, ``num_per_speaker``
        utterance files are drawn with replacement and handed to
        ``self.load_into_mat`` together with the output arrays and the target
        row index.

        Args:
            max_length: Size of the second axis of ``kx``; also forwarded to
                ``load_into_mat``.
            num_per_speaker: Number of rows reserved (and files sampled) per
                speaker.
            is_test: Forwarded to ``train_test_sp_to_utt`` to pick the phase.

        Returns:
            Tuple ``(kx, ky, kg)`` of float32 arrays with
            ``len(speaker_ids) * num_per_speaker`` rows each: ``kx`` has shape
            ``(rows, max_length, NUM_FBANKS, 1)``; ``ky`` and ``kg`` have
            shape ``(rows, 1)``.
        """
        # train OR test.
        total_rows = len(self.audio.speaker_ids) * num_per_speaker
        utterances_by_speaker = train_test_sp_to_utt(self.audio, is_test)

        # All outputs start zero-filled; load_into_mat receives them along
        # with the row index (presumably to fill that row — TODO confirm).
        kx = np.zeros((total_rows, max_length, NUM_FBANKS, 1), dtype=np.float32)
        ky = np.zeros((total_rows, 1), dtype=np.float32)
        kg = np.zeros((total_rows, 1), dtype=np.float32)

        desc = f'Converting to Keras format [{"test" if is_test else "train"}]'
        progress = tqdm(self.audio.speaker_ids, desc=desc)
        for speaker_index, speaker_id in enumerate(progress):
            # Each speaker owns a contiguous run of num_per_speaker rows.
            first_row = speaker_index * num_per_speaker
            sampled_files = np.random.choice(utterances_by_speaker[speaker_id],
                                             size=num_per_speaker,
                                             replace=True)
            for offset, utterance_file in enumerate(sampled_files):
                self.load_into_mat(utterance_file, self.categorical_speakers,
                                   speaker_id, max_length, kx, ky, kg,
                                   first_row + offset)
        return kx, ky, kg
    chunk_size in ms

    iterate over chunks until you find the first one with sound
    '''
    trim_ms = 0  # ms

    assert chunk_size > 0  # to avoid infinite loop
    while sound[trim_ms:trim_ms + chunk_size].dBFS < silence_threshold and trim_ms < len(sound):
        trim_ms += chunk_size

    return trim_ms


# Resolve the deep-speaker working directory: the WORKING_DIR environment
# variable wins; otherwise fall back to ~/.deep-speaker-wd.
# expanduser('~') is used rather than os.getenv('HOME') because HOME can be
# unset (e.g. on Windows), in which case os.path.join(None, ...) raises
# TypeError.
deep_speaker_root = os.getenv('WORKING_DIR') or os.path.join(os.path.expanduser('~'), '.deep-speaker-wd')
audio = Audio(cache_dir=os.path.join(deep_speaker_root, 'triplet-training'))
sp_to_utt_test = train_test_sp_to_utt(audio, is_test=True)
sp_to_utt_train = train_test_sp_to_utt(audio, is_test=False)

# Extend each speaker's test entries with that speaker's train entries from
# index 5 onward (the first five train entries are left out).
for speaker in audio.speaker_ids:
    sp_to_utt_test[speaker] += sp_to_utt_train[speaker][5:]

print("No. audio: %d" % sum(len(sp_to_utt_test[s]) for s in sp_to_utt_test))

# Output directory, relative to the current working directory; created if
# it does not exist yet.
root = os.path.join('samples', 'librispeech')
os.makedirs(root, exist_ok=True)
outputs = []


def load_transcripts():
    transcripts = dict()
    for transcript_path in tqdm(glob.glob(os.path.join(deep_speaker_root, 'LibriSpeech/**/**/**/*.txt'))):
import glob
import os
import re
import subprocess

from pydub import AudioSegment
from tqdm import tqdm

from audio import Audio
from utils import train_test_sp_to_utt

# Root of the deep-speaker working directory.
# NOTE(review): hard-coded, user-specific absolute path — confirm or adjust
# before running on another machine.
deep_speaker_root = '/home/trungvd/.deep-speaker-wd'
# Audio helper using the 'triplet-training' directory under the root as cache.
audio = Audio(cache_dir=os.path.join(deep_speaker_root, 'triplet-training'))
# Test-phase lookup; presumably speaker id -> utterances — TODO confirm
# against utils.train_test_sp_to_utt.
sp_to_utt_test = train_test_sp_to_utt(audio, is_test=True)

# Output directory; created at module execution time if missing.
# NOTE(review): also a hard-coded absolute path.
root = '/home/trungvd/repos/speech-reconstruction/samples/librispeech'
os.makedirs(root, exist_ok=True)
# Starts empty; presumably filled by code further down the file.
outputs = []


def load_transcripts():
    """Collect transcript lines from every matching ``.txt`` file into a dict.

    Files are found with the glob pattern
    ``<deep_speaker_root>/LibriSpeech/**/**/**/*.txt``. Each non-blank line is
    split at its first space: the part before the space becomes the key and
    the remainder becomes the value. Later occurrences of a key overwrite
    earlier ones.

    Returns:
        dict mapping the first space-delimited token of each line to the rest
        of that line.

    Raises:
        ValueError: If a non-blank line contains no space (malformed entry).
    """
    transcripts = dict()
    # Build the pattern from the module-level root rather than repeating the
    # literal path, so the two cannot drift apart.
    pattern = os.path.join(deep_speaker_root, 'LibriSpeech/**/**/**/*.txt')
    for transcript_path in tqdm(glob.glob(pattern)):
        with open(transcript_path, 'r') as f:
            lines = f.read().strip().split('\n')
        # The file is closed here; parsing only needs the text.
        for line in lines:
            # An empty file yields [''] and blank lines yield '' — neither
            # can be unpacked into (id, text), so skip them instead of
            # crashing.
            if not line.strip():
                continue
            ids, trans = line.split(' ', 1)
            transcripts[ids] = trans
    return transcripts