def __init__(self, working_dir: str, max_length: int, model: DeepSpeakerModel):
    self.working_dir = working_dir
    self.audio = Audio(cache_dir=working_dir)
    logger.info(f'Picking audio from {working_dir}.')
    self.sp_to_utt_train = train_test_sp_to_utt(self.audio, is_test=False)
    self.sp_to_utt_test = train_test_sp_to_utt(self.audio, is_test=True)
    self.max_length = max_length
    self.model = model
    self.nb_per_speaker = 2
    self.nb_speakers = 640
    self.history_length = 4
    self.history_every = 100  # batches.
    self.total_history_length = self.nb_speakers * self.nb_per_speaker * self.history_length  # 640 * 2 * 4 = 5,120.
    self.metadata_train_speakers = Counter()
    self.metadata_output_file = os.path.join(self.working_dir, 'debug_batcher.json')
    self.history_embeddings_train = deque(maxlen=self.total_history_length)
    self.history_utterances_train = deque(maxlen=self.total_history_length)
    self.history_model_inputs_train = deque(maxlen=self.total_history_length)
    self.history_embeddings = None
    self.history_utterances = None
    self.history_model_inputs = None
    self.batch_count = 0
    for _ in tqdm(range(self.history_length), desc='Initializing the batcher'):  # init history.
        self.update_triplets_history()
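# A minimal usage sketch for the constructor above. It assumes the method belongs
# to a LazyTripletBatcher-style class and that DeepSpeakerModel / NUM_FRAMES are
# importable from the surrounding project; the class name, module paths, and the
# working-dir path are all assumptions, not confirmed by this excerpt.
from constants import NUM_FRAMES
from conv_models import DeepSpeakerModel

model = DeepSpeakerModel()
batcher = LazyTripletBatcher(working_dir='/path/to/working-dir',  # placeholder path
                             max_length=NUM_FRAMES,
                             model=model)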
def generate_per_phase(self, max_length=NUM_FRAMES, num_per_speaker=3000, is_test=False):
    # train OR test.
    num_speakers = len(self.audio.speaker_ids)
    sp_to_utt = train_test_sp_to_utt(self.audio, is_test)

    # 64 fbanks, 1 channel, float32.
    kx = np.zeros((num_speakers * num_per_speaker, max_length, NUM_FBANKS, 1), dtype=np.float32)
    ky = np.zeros((num_speakers * num_per_speaker, 1), dtype=np.float32)
    kg = np.zeros((num_speakers * num_per_speaker, 1), dtype=np.float32)

    desc = f'Converting to Keras format [{"test" if is_test else "train"}]'
    for i, speaker_id in enumerate(tqdm(self.audio.speaker_ids, desc=desc)):
        utterances_files = sp_to_utt[speaker_id]
        for j, utterance_file in enumerate(np.random.choice(utterances_files, size=num_per_speaker, replace=True)):
            self.load_into_mat(utterance_file, self.categorical_speakers, speaker_id, max_length,
                               kx, ky, kg, i * num_per_speaker + j)
    return kx, ky, kg
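# Sketch: materializing one phase's worth of Keras-ready arrays with the method
# above. 'converter' stands for whatever object carries this method (this excerpt
# does not show the enclosing class, so the receiver name is an assumption);
# kx holds float32 fbank features, while ky and kg hold one value per sample.
kx, ky, kg = converter.generate_per_phase(max_length=NUM_FRAMES,
                                          num_per_speaker=3000,
                                          is_test=True)
# kx.shape == (num_speakers * num_per_speaker, NUM_FRAMES, NUM_FBANKS, 1)
print(kx.shape, ky.shape, kg.shape)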
def detect_leading_silence(sound, silence_threshold=-50.0, chunk_size=10):
    '''
    sound is a pydub.AudioSegment
    silence_threshold in dB
    chunk_size in ms
    iterate over chunks until you find the first one with sound
    '''
    trim_ms = 0  # ms
    assert chunk_size > 0  # to avoid infinite loop
    while sound[trim_ms:trim_ms + chunk_size].dBFS < silence_threshold and trim_ms < len(sound):
        trim_ms += chunk_size
    return trim_ms


deep_speaker_root = os.getenv('WORKING_DIR') or os.path.join(os.getenv('HOME'), '.deep-speaker-wd')
audio = Audio(cache_dir=os.path.join(deep_speaker_root, 'triplet-training'))
sp_to_utt_test = train_test_sp_to_utt(audio, is_test=True)
sp_to_utt_train = train_test_sp_to_utt(audio, is_test=False)
for speaker in audio.speaker_ids:
    sp_to_utt_test[speaker] += sp_to_utt_train[speaker][5:]
print("No. audio: %d" % sum(len(sp_to_utt_test[s]) for s in sp_to_utt_test))

root = os.path.join('samples', 'librispeech')
os.makedirs(root, exist_ok=True)
outputs = []


def load_transcripts():
    transcripts = dict()
    for transcript_path in tqdm(glob.glob(os.path.join(deep_speaker_root, 'LibriSpeech/**/**/**/*.txt'))):
        with open(transcript_path, 'r') as f:
            lines = f.read().strip().split('\n')
        for line in lines:
            ids, trans = line.split(' ', 1)
            transcripts[ids] = trans
    return transcripts
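# Sketch: trimming leading silence with the detector above. The file names are
# placeholders; AudioSegment comes from pydub (imported in the next block).
sound = AudioSegment.from_file('utterance.wav')
trimmed = sound[detect_leading_silence(sound):]
trimmed.export('utterance_trimmed.wav', format='wav')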
import glob
import os
import re
import subprocess

from pydub import AudioSegment
from tqdm import tqdm

from audio import Audio
from utils import train_test_sp_to_utt

deep_speaker_root = '/home/trungvd/.deep-speaker-wd'
audio = Audio(cache_dir=os.path.join(deep_speaker_root, 'triplet-training'))
sp_to_utt_test = train_test_sp_to_utt(audio, is_test=True)

root = '/home/trungvd/repos/speech-reconstruction/samples/librispeech'
os.makedirs(root, exist_ok=True)
outputs = []


def load_transcripts():
    transcripts = dict()
    for transcript_path in tqdm(glob.glob(os.path.join(deep_speaker_root, 'LibriSpeech/**/**/**/*.txt'))):
        with open(transcript_path, 'r') as f:
            lines = f.read().strip().split('\n')
        for line in lines:
            ids, trans = line.split(' ', 1)
            transcripts[ids] = trans
    return transcripts
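# Sketch: each LibriSpeech transcript file holds lines of the form
# '<speaker>-<chapter>-<utterance> TEXT', so load_transcripts() returns a dict
# mapping utterance ids to their transcript text. The id below is illustrative,
# not a key guaranteed to exist in any given download.
transcripts = load_transcripts()
print(transcripts.get('1089-134686-0000', '<not found>'))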