import pathlib

import sox
import textgrid


# "Writers" is a project-specific helper: it exposes a word_counts mapping
# and a write() method, both used below.
def create_datapoints(transformer: sox.Transformer, writers: Writers,
                      grid: pathlib.Path, audio: pathlib.Path):
    """Create datapoints from a TextGrid."""
    # The wav is expected to mirror the grid layout: <audio>/<speaker>/<name>.wav.
    audio_file = audio / grid.parts[-2] / f"{grid.stem}.wav"

    if audio_file.is_file():
        resampled_audio = transformer.build_array(
            input_filepath=str(audio_file))

        # Iterate over the intervals of the TextGrid's first tier.
        tg = textgrid.TextGrid.fromFile(grid)
        for interval in tg[0]:
            start_time = interval.minTime
            end_time = interval.maxTime
            text = interval.mark

            if text in writers.word_counts:
                # Cut the word out with 100 ms of context on each side,
                # clamped to the bounds of the recording.
                start_sample = int(
                    max((start_time - 0.1) * transformer.output_format["rate"],
                        0))
                end_sample = int(
                    min((end_time + 0.1) * transformer.output_format["rate"],
                        resampled_audio.size))

                utterance = resampled_audio[start_sample:end_sample]

                writers.write(word=text,
                              sample_rate=transformer.output_format["rate"],
                              audio=utterance)

    else:
        print(f"File not found: {audio_file}")
Example #2
from pathlib import Path
from typing import Union

import numpy as np
from sox import Transformer


def preprocess_wav(cls, fpath: Union[str, Path]) -> np.ndarray:
    """Load, resample, normalize and trim a waveform."""
    transformer = Transformer()
    transformer.norm()  # normalize the level
    transformer.silence(silence_threshold=1, min_silence_duration=0.1)  # trim silence
    transformer.set_output_format(rate=cls.sample_rate, bits=16, channels=1)
    wav = transformer.build_array(input_filepath=str(fpath))
    wav = wav / (2**15)  # scale 16-bit integers to [-1, 1)
    return wav.astype(np.float32)
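Since the method takes cls and reads cls.sample_rate, it presumably lives on a class. A minimal sketch of such a host class and a call site; the class name, the rate, and the input file are all assumed:

class Preprocessor:
    sample_rate = 16000  # assumed target rate
    preprocess_wav = classmethod(preprocess_wav)  # attach the function above


wav = Preprocessor.preprocess_wav("speech.wav")  # hypothetical input file
print(wav.dtype, wav.shape)  # float32 samples scaled to roughly [-1, 1)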
Example #3
import random
from typing import Tuple

import numpy as np
import sox


def sample_triplet(
    transformer: sox.Transformer, utterances: Utterances
) -> Tuple[Tuple[np.ndarray, str],
           Tuple[np.ndarray, str],
           Tuple[np.ndarray, str]]:
    """Sample an (anchor, positive, negative) word triplet."""
    # Anchor word, plus a different word for the negative.
    anchor = random.choice(utterances.words)

    negative = random.choice(utterances.words)
    while negative == anchor:
        negative = random.choice(utterances.words)

    anchor_file = random.choice(utterances.word_files[anchor])

    # Positive: a different recording of the anchor word.
    positive_file = random.choice(utterances.word_files[anchor])
    while positive_file == anchor_file:
        positive_file = random.choice(utterances.word_files[anchor])

    negative_file = random.choice(utterances.word_files[negative])

    # print("anchor file", anchor_file)
    # print("positive file", positive_file)
    # print("negative file", negative_file)

    # Each word is recovered from its parent directory name; the audio is
    # resampled through the shared transformer.
    anchor_word = anchor_file.parent.stem
    anchor_audio = transformer.build_array(input_filepath=str(anchor_file))

    positive_word = positive_file.parent.stem
    positive_audio = transformer.build_array(input_filepath=str(positive_file))

    negative_word = negative_file.parent.stem
    negative_audio = transformer.build_array(input_filepath=str(negative_file))

    #print("anchor-word", anchor_word)
    #print("positive-word", positive_word)
    #print("negative-word", negative_word)

    return ((anchor_audio, anchor_word),
            (positive_audio, positive_word),
            (negative_audio, negative_word))
Example #4
File: dataset.py, Project: s3prl/s3prl
import torch
from sox import Transformer


def loadFile(data, max_timestep):
    """Load a wav as a float tensor, truncated to max_timestep samples."""
    transformer = Transformer()
    transformer.norm()
    # transformer.silence(silence_threshold=1, min_silence_duration=0.1)
    transformer.set_output_format(rate=16000, bits=16, channels=1)
    wav = transformer.build_array(input_filepath=str(data))
    wav = torch.tensor(wav / (2**15)).float()  # scale 16-bit integers to [-1, 1)
    length = len(wav)
    if length > max_timestep:
        # Keep only the first max_timestep samples.
        wav = wav[:max_timestep]
        length = max_timestep
    length = torch.tensor(length).long()

    return wav, length
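Because every clip is truncated to at most max_timestep samples, the returned lengths plus torch's pad_sequence are enough to build a dense batch. A sketch, with the file paths and the ten-second cap assumed:

import torch
from torch.nn.utils.rnn import pad_sequence

paths = ["a.wav", "b.wav"]  # hypothetical files
max_timestep = 16000 * 10   # assumed: ten seconds at 16 kHz
wavs, lengths = zip(*(loadFile(p, max_timestep) for p in paths))
batch = pad_sequence(list(wavs), batch_first=True)  # (B, T), zero-padded
lengths = torch.stack(lengths)                      # true lengths before padding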
Example #5
File: dataset.py, Project: s3prl/s3prl
import random

import torch
from sox import Transformer


def loadFile_thread_exec(data):
    """Load a chunk of wav paths; crops a random max_timestep window."""
    wavs = []
    lengths = []
    for fullPath in data:
        transformer = Transformer()
        transformer.norm()
        transformer.silence(silence_threshold=1, min_silence_duration=0.1)
        transformer.set_output_format(rate=16000, bits=16, channels=1)
        wav = transformer.build_array(input_filepath=str(fullPath))
        wav = torch.tensor(wav / (2**15)).float()
        length = len(wav)
        if length > max_timestep:  # max_timestep: module-level global
            # Crop a random max_timestep-sample window.
            start = random.randint(0, length - max_timestep)
            wav = wav[start:start + max_timestep]
            length = max_timestep
        wavs.append(wav)
        lengths.append(torch.tensor(length).long())
    return wavs, lengths
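The _thread_exec suffix suggests the caller splits the file list into chunks and fans them out to worker threads. A sketch of such a caller; the chunk size, pool size, paths, and the value bound to the module-level max_timestep global are all assumptions:

from concurrent.futures import ThreadPoolExecutor

max_timestep = 16000 * 10  # global read inside loadFile_thread_exec (assumed value)
paths = ["a.wav", "b.wav", "c.wav", "d.wav"]  # hypothetical files
chunks = [paths[i:i + 2] for i in range(0, len(paths), 2)]

with ThreadPoolExecutor(max_workers=2) as pool:
    results = list(pool.map(loadFile_thread_exec, chunks))

wavs = [w for ws, _ in results for w in ws]
lengths = [n for _, ns in results for n in ns]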