def create_datapoints(transformer: sox.Transformer, writers: Writers,
                      grid: pathlib.Path, audio: pathlib.Path):
    """Creates datapoints from a TextGrid.

    Looks up the .wav file matching *grid* under *audio*, resamples it with
    *transformer*, then writes one padded audio snippet per TextGrid interval
    whose label is tracked in ``writers.word_counts``.

    Args:
        transformer: Configured sox transformer; its ``output_format["rate"]``
            is used to convert interval times to sample indices.
        writers: Sink with a ``word_counts`` membership set and a ``write``
            method taking word, sample_rate and audio.
        grid: Path to the TextGrid file; its parent directory name selects the
            speaker subdirectory under *audio*.
        audio: Root directory of the audio files.
    """
    audio_file = audio / grid.parts[-2] / f"{grid.stem}.wav"
    if not audio_file.is_file():
        print(f"File not found: {audio_file}")
        return
    # Hoist the loop-invariant rate lookup out of the interval loop.
    rate = transformer.output_format["rate"]
    resampled_audio = transformer.build_array(input_filepath=str(audio_file))
    tg = textgrid.TextGrid.fromFile(grid)
    for interval in tg[0]:
        text = interval.mark
        if text not in writers.word_counts:
            continue
        # Pad 100 ms on each side, clamped to the bounds of the audio.
        start_sample = int(max((interval.minTime - 0.1) * rate, 0))
        end_sample = int(min((interval.maxTime + 0.1) * rate,
                             resampled_audio.size))
        writers.write(word=text, sample_rate=rate,
                      audio=resampled_audio[start_sample:end_sample])
def preprocess_wav(cls, fpath: Union[str, Path]) -> np.ndarray:
    """Load, resample, normalize and trim a waveform.

    The file at *fpath* is normalized, stripped of silence, and converted to
    16-bit mono at ``cls.sample_rate``, then rescaled to floats in [-1, 1).

    Args:
        cls: Class providing the target ``sample_rate``.
        fpath: Path of the audio file to load.

    Returns:
        The processed waveform as a float32 numpy array.
    """
    tfm = Transformer()
    tfm.norm()
    tfm.silence(silence_threshold=1, min_silence_duration=0.1)
    tfm.set_output_format(rate=cls.sample_rate, bits=16, channels=1)
    samples = tfm.build_array(input_filepath=str(fpath))
    # 16-bit PCM integers -> floats in [-1, 1).
    scaled = samples / (2 ** 15)
    return scaled.astype(np.float32)
def sample_triplet(transformer: sox.Transformer, utterances: Utterances):
    """Samples an (anchor, positive, negative) utterance triplet.

    Picks two different recordings of one word (anchor, positive) and one
    recording of a different word (negative), loading each file's audio with
    *transformer*.

    Args:
        transformer: Configured sox transformer used to load each file.
        utterances: Provides ``words`` (list of words) and ``word_files``
            (mapping from word to its recording paths).

    Returns:
        Three ``(audio, word)`` pairs: anchor, positive and negative.

    NOTE(review): the rejection loops never terminate if ``utterances.words``
    has a single word, or if the anchor word has a single recording — the
    caller presumably guarantees at least two of each; confirm.
    """
    # The original annotated the return type with a tuple *expression*
    # ``((np.ndarray, str), ...)``, which is not a valid type hint; it has
    # been moved into the docstring above.
    anchor = random.choice(utterances.words)
    negative = random.choice(utterances.words)
    while negative == anchor:
        negative = random.choice(utterances.words)
    anchor_file = random.choice(utterances.word_files[anchor])
    positive_file = random.choice(utterances.word_files[anchor])
    while positive_file == anchor_file:
        positive_file = random.choice(utterances.word_files[anchor])
    negative_file = random.choice(utterances.word_files[negative])
    # Each file lives in a directory named after its word.
    anchor_word = anchor_file.parent.stem
    anchor_audio = transformer.build_array(input_filepath=str(anchor_file))
    positive_word = positive_file.parent.stem
    positive_audio = transformer.build_array(input_filepath=str(positive_file))
    negative_word = negative_file.parent.stem
    negative_audio = transformer.build_array(input_filepath=str(negative_file))
    return ((anchor_audio, anchor_word),
            (positive_audio, positive_word),
            (negative_audio, negative_word))
def loadFile(data, max_timestep):
    """Load an audio file as a normalized float tensor, capped in length.

    The file is normalized and resampled to 16 kHz 16-bit mono, scaled to
    floats in [-1, 1), and truncated to its first *max_timestep* samples when
    longer.

    Args:
        data: Path of the audio file to load.
        max_timestep: Maximum number of samples to keep.

    Returns:
        A ``(wav, length)`` pair: the float tensor and its length as a long
        tensor.
    """
    tfm = Transformer()
    tfm.norm()
    tfm.set_output_format(rate=16000, bits=16, channels=1)
    samples = tfm.build_array(input_filepath=str(data))
    # 16-bit PCM integers -> floats in [-1, 1).
    wav = torch.tensor(samples / (2 ** 15)).float()
    # Keep at most the first max_timestep samples.
    length = min(len(wav), max_timestep)
    wav = wav[:length]
    return wav, torch.tensor(length).long()
def loadFile_thread_exec(data):
    """Load a batch of audio files as normalized float tensors.

    Each file is normalized, silence-trimmed, and resampled to 16 kHz 16-bit
    mono, scaled to floats in [-1, 1), and — when longer than the module-level
    ``max_timestep`` — cropped to a random window of that length.

    Args:
        data: Sequence of audio file paths.

    Returns:
        A ``(wavs, lengths)`` pair of parallel lists: the float tensors and
        their lengths as long tensors.

    NOTE(review): reads the module-level global ``max_timestep``; consider
    passing it as a parameter as ``loadFile`` does.
    """
    # The transformer configuration is identical for every file, so build it
    # once instead of once per loop iteration.
    tfm = Transformer()
    tfm.norm()
    tfm.silence(silence_threshold=1, min_silence_duration=0.1)
    tfm.set_output_format(rate=16000, bits=16, channels=1)
    wavs = []
    lengths = []
    for full_path in data:
        samples = tfm.build_array(input_filepath=str(full_path))
        # 16-bit PCM integers -> floats in [-1, 1).
        wav = torch.tensor(samples / (2 ** 15)).float()
        length = len(wav)
        if length > max_timestep:
            # Random crop so long clips contribute varied windows.
            start = random.randint(0, int(length - max_timestep))
            wav = wav[start:start + max_timestep]
            length = max_timestep
        wavs.append(wav)
        lengths.append(torch.tensor(length).long())
    return wavs, lengths