def test_from_file(tmp_path: Path):
    """TokenIDConverter should load its vocabulary from a token file on disk."""
    token_file = tmp_path / "tokens.txt"
    token_file.write_text("a\nb\nc\n<unk>\n")
    converter = TokenIDConverter(token_file)
    assert converter.tokens2ids("abc") == [0, 1, 2]
class CommonPreprocessor(AbsPreprocessor):
    """Preprocessor that converts the text entry of an example to token ids.

    When ``token_type`` is given, ``data[text_name]`` is tokenized and
    replaced by an ``np.int64`` id array; otherwise the example passes
    through unchanged.  The speech entry is currently left untouched.
    """

    def __init__(
        self,
        train: bool,
        token_type: str = None,
        token_list: Union[Path, str, Iterable[str]] = None,
        bpemodel: Union[Path, str, Iterable[str]] = None,
        unk_symbol: str = "<unk>",
        space_symbol: str = "<space>",
        non_linguistic_symbols: Union[Path, str, Iterable[str]] = None,
        delimiter: str = None,
        speech_name: str = "speech",
        text_name: str = "text",
    ):
        super().__init__(train)
        self.train = train
        self.speech_name = speech_name
        self.text_name = text_name

        if token_type is None:
            # No tokenization requested: leave text fields as-is.
            self.tokenizer = None
            self.token_id_converter = None
            return

        if token_list is None:
            raise ValueError("token_list is required if token_type is not None")
        self.tokenizer = build_tokenizer(
            token_type=token_type,
            bpemodel=bpemodel,
            delimiter=delimiter,
            space_symbol=space_symbol,
            non_linguistic_symbols=non_linguistic_symbols,
        )
        self.token_id_converter = TokenIDConverter(
            token_list=token_list,
            unk_symbol=unk_symbol,
        )

    def __call__(
        self, uid: str, data: Dict[str, Union[str, np.ndarray]]
    ) -> Dict[str, np.ndarray]:
        """Tokenize ``data[text_name]`` in place and return the mapping."""
        assert check_argument_types()

        if self.speech_name in data:
            # Placeholder — candidates for future processing:
            # STFT, Fbank, CMVN, data augmentation.
            pass

        if self.tokenizer is not None and self.text_name in data:
            raw_text = data[self.text_name]
            token_seq = self.tokenizer.text2tokens(raw_text)
            ids = self.token_id_converter.tokens2ids(token_seq)
            data[self.text_name] = np.array(ids, dtype=np.int64)

        assert check_return_type(data)
        return data
# NOTE(review): this class re-declares CommonPreprocessor (also defined earlier
# in this file) and therefore shadows the simpler variant — confirm whether
# both definitions are intentionally present.
class CommonPreprocessor(AbsPreprocessor):
    """Preprocessor with text tokenization and speech augmentation.

    Text handling: cleans, tokenizes and converts ``data[text_name]`` to an
    ``np.int64`` id array when ``token_type`` is given.

    Speech handling (training only): optionally convolves a random RIR,
    adds a random noise sample at a random SNR drawn from
    ``noise_db_range``, and rescales to avoid clipping.  Optionally applies
    volume normalization.
    """

    def __init__(
        self,
        train: bool,
        token_type: str = None,
        token_list: Union[Path, str, Iterable[str]] = None,
        bpemodel: Union[Path, str, Iterable[str]] = None,
        text_cleaner: Collection[str] = None,
        g2p_type: str = None,
        unk_symbol: str = "<unk>",
        space_symbol: str = "<space>",
        non_linguistic_symbols: Union[Path, str, Iterable[str]] = None,
        delimiter: str = None,
        rir_scp: str = None,
        rir_apply_prob: float = 1.0,
        noise_scp: str = None,
        noise_apply_prob: float = 1.0,
        noise_db_range: str = "3_10",
        speech_volume_normalize: float = None,
        speech_name: str = "speech",
        text_name: str = "text",
    ):
        super().__init__(train)
        self.train = train
        self.speech_name = speech_name
        self.text_name = text_name
        self.speech_volume_normalize = speech_volume_normalize
        self.rir_apply_prob = rir_apply_prob
        self.noise_apply_prob = noise_apply_prob

        if token_type is not None:
            if token_list is None:
                raise ValueError(
                    "token_list is required if token_type is not None")
            self.text_cleaner = TextCleaner(text_cleaner)
            self.tokenizer = build_tokenizer(
                token_type=token_type,
                bpemodel=bpemodel,
                delimiter=delimiter,
                space_symbol=space_symbol,
                non_linguistic_symbols=non_linguistic_symbols,
                g2p_type=g2p_type,
            )
            self.token_id_converter = TokenIDConverter(
                token_list=token_list,
                unk_symbol=unk_symbol,
            )
        else:
            self.text_cleaner = None
            self.tokenizer = None
            self.token_id_converter = None

        # RIR list: each scp line is either "<path>" or "<utt-id> <path>".
        self.rirs = self._read_scp(rir_scp) if train and rir_scp is not None else None

        if train and noise_scp is not None:
            self.noises = self._read_scp(noise_scp)
            sps = noise_db_range.split("_")
            if len(sps) == 1:
                # BUG FIX: original unpacked a single float into two names
                # (TypeError at runtime); a single value means low == high.
                self.noise_db_low = self.noise_db_high = float(sps[0])
            elif len(sps) == 2:
                self.noise_db_low, self.noise_db_high = float(sps[0]), float(
                    sps[1])
            else:
                # BUG FIX: original lacked the f-prefix, so the placeholder
                # was never interpolated into the message.
                raise ValueError(
                    f"Format error: '{noise_db_range}' e.g. -3_4 -> [-3db,4db]")
        else:
            self.noises = None

    @staticmethod
    def _read_scp(scp_path: str) -> List[str]:
        """Read an scp file, returning the path column.

        Lines are either "<path>" or "<utt-id> <path>"; the last field wins.
        """
        paths = []
        with open(scp_path, "r", encoding="utf-8") as f:
            for line in f:
                sps = line.strip().split(None, 1)
                if len(sps) == 1:
                    paths.append(sps[0])
                else:
                    paths.append(sps[1])
        return paths

    def __call__(
            self, uid: str,
            data: Dict[str, Union[str, np.ndarray]]) -> Dict[str, np.ndarray]:
        assert check_argument_types()

        if self.speech_name in data:
            # NOTE(review): augmentation requires BOTH rirs and noises here,
            # yet each step below re-checks its own list independently —
            # confirm whether this should be `or` so one of the two can be
            # applied alone.
            if self.train and self.rirs is not None and self.noises is not None:
                speech = data[self.speech_name]
                nsamples = len(speech)

                # Normalize to (Nmic, Time) layout for the augmentation steps.
                if speech.ndim == 1:
                    speech = speech[None, :]
                else:
                    speech = speech.T
                # Reference power measured on the non-silence region only.
                power = (speech[detect_non_silence(speech)]**2).mean()

                # 1. Convolve RIR
                if self.rirs is not None and self.rir_apply_prob >= np.random.random(
                ):
                    rir_path = np.random.choice(self.rirs)
                    if rir_path is not None:
                        rir, _ = soundfile.read(rir_path,
                                                dtype=np.float64,
                                                always_2d=True)

                        # rir: (Nmic, Time)
                        rir = rir.T

                        # Truncate to the input length so the signal length
                        # is unchanged by the convolution.
                        speech = scipy.signal.convolve(
                            speech, rir, mode="full")[:, :speech.shape[1]]
                        # Restore the original non-silence power.
                        power2 = (speech[detect_non_silence(speech)]**2).mean()
                        speech = np.sqrt(power / max(power2, 1e-10)) * speech

                # 2. Add Noise
                if (self.noises is not None
                        and self.noise_apply_prob >= np.random.random()):
                    noise_path = np.random.choice(self.noises)
                    if noise_path is not None:
                        noise_db = np.random.uniform(self.noise_db_low,
                                                     self.noise_db_high)
                        with soundfile.SoundFile(noise_path) as f:
                            if f.frames == nsamples:
                                noise = f.read(dtype=np.float64,
                                               always_2d=True)
                            elif f.frames < nsamples:
                                # Noise shorter than speech: place it at a
                                # random offset and wrap-pad to full length.
                                offset = np.random.randint(
                                    0, nsamples - f.frames)
                                # noise: (Time, Nmic)
                                noise = f.read(dtype=np.float64,
                                               always_2d=True)
                                noise = np.pad(
                                    noise,
                                    [(offset, nsamples - f.frames - offset),
                                     (0, 0)],
                                    mode="wrap",
                                )
                            else:
                                # Noise longer than speech: read a random
                                # window of exactly nsamples frames.
                                offset = np.random.randint(
                                    0, f.frames - nsamples)
                                f.seek(offset)
                                # noise: (Time, Nmic)
                                noise = f.read(nsamples,
                                               dtype=np.float64,
                                               always_2d=True)
                                if len(noise) != nsamples:
                                    raise RuntimeError(
                                        f"Something wrong: {noise_path}")
                        # noise: (Nmic, Time)
                        noise = noise.T

                        noise_power = (noise**2).mean()
                        # Scale so the speech/noise power ratio matches the
                        # sampled SNR (noise_db).
                        scale = (10**(-noise_db / 20) * np.sqrt(power) /
                                 np.sqrt(max(noise_power, 1e-10)))
                        speech = speech + scale * noise

                speech = speech.T
                # Peak-normalize only if augmentation pushed |x| above 1.0.
                ma = np.max(np.abs(speech))
                if ma > 1.0:
                    speech /= ma
                data[self.speech_name] = speech

            if self.speech_volume_normalize is not None:
                speech = data[self.speech_name]
                # NOTE(review): divides by the peak — an all-zero signal
                # would divide by zero here; confirm inputs are non-silent.
                ma = np.max(np.abs(speech))
                data[self.speech_name] = (speech *
                                          self.speech_volume_normalize / ma)

        if self.text_name in data and self.tokenizer is not None:
            text = data[self.text_name]
            text = self.text_cleaner(text)
            tokens = self.tokenizer.text2tokens(text)
            text_ints = self.token_id_converter.tokens2ids(tokens)
            data[self.text_name] = np.array(text_ints, dtype=np.int64)
        assert check_return_type(data)
        return data
def test_tokens2ids():
    """Known tokens map to their indices in the supplied token list."""
    vocab = ["a", "b", "c", "<unk>"]
    converter = TokenIDConverter(vocab)
    assert converter.tokens2ids("abc") == [0, 1, 2]