示例#1
0
def convert_chapter(path: pathlib.Path, sink: pathlib.Path, prefix: str,
                    transformer: sox.Transformer):
    """Convert one chapter's FLAC utterances to WAV files plus label files.

    The transcription file returned by ``locate_transcriptions(path)`` is read
    line by line. Each non-empty line is split at its first space into an
    utterance id and the utterance text. For every utterance,
    ``<path>/<id>.flac`` is converted to ``<sink>/<prefix>-<id>.wav`` and the
    text is written to ``<sink>/<prefix>-<id>.lab``.

    Args:
        path: Directory holding the chapter's FLAC files.
        sink: Output directory; created when it does not exist yet.
        prefix: String prepended to every output file name.
        transformer: Transformer used to build each output audio file.
    """
    transcription_path = locate_transcriptions(path)

    if not sink.is_dir():
        os.makedirs(sink)

    if not transcription_path:
        return

    with open(str(transcription_path), "r") as transcription_file:
        for line in transcription_file:
            line = line.strip()
            if not line:
                # Blank lines carry no utterance id, so there is nothing to
                # convert for them.
                continue

            # partition() keeps the complete id when a line has no text,
            # whereas slicing at find() == -1 would drop its last character.
            file_name, _, label = line.partition(" ")
            file_name = file_name.strip()
            label = label.strip()

            audio_input_file = path / (file_name + ".flac")
            audio_output_file = sink / f"{prefix}-{file_name}.wav"
            label_output_file = sink / f"{prefix}-{file_name}.lab"

            transformer.build_file(input_filepath=str(audio_input_file),
                                   output_filepath=str(audio_output_file))

            with open(str(label_output_file), "w") as label_file:
                label_file.write(label)
示例#2
0
def convert_audio_and_split_transcript(input_dir, source_name, target_name,
                                       output_dir, output_file):
    """Convert FLAC audio to WAV and write a tab-separated transcript index.

    Every ``*.trans.txt`` file below ``<input_dir>/<source_name>`` is read;
    each line is split at its first space into a sequence id and a transcript.
    The matching FLAC file is converted to ``<input_dir>/<target_name>/<id>.wav``
    unless that WAV already exists.

    Args:
        input_dir: Directory containing the source and target sub-directories.
        source_name: Name of the sub-directory holding the source dataset.
        target_name: Name of the sub-directory receiving the WAV files.
        output_dir: Directory in which the index file is written.
        output_file: File name of the index file.
    """
    print(f"Pre-processing audio and transcript for {source_name}")
    source_dir = os.path.join(input_dir, source_name)
    target_dir = os.path.join(input_dir, target_name)

    if not tf.io.gfile.exists(target_dir):
        tf.io.gfile.makedirs(target_dir)

    converter = Transformer()
    rows = []
    for folder, _, names in tf.io.gfile.walk(source_dir):
        for trans_name in fnmatch.filter(names, "*.trans.txt"):
            trans_path = os.path.join(folder, trans_name)
            with codecs.open(trans_path, "r", "utf-8") as handle:
                for entry in handle:
                    utt_id, text = entry.split(" ", 1)
                    # Reduce the text to plain lower-case ASCII: decompose,
                    # drop what cannot be encoded, then decode back to str.
                    ascii_bytes = unicodedata.normalize("NFKD", text).encode(
                        "ascii", "ignore")
                    text = ascii_bytes.decode("ascii", "ignore").strip().lower()

                    source_audio = os.path.join(folder, utt_id + ".flac")
                    target_audio = os.path.join(target_dir, utt_id + ".wav")
                    if not tf.io.gfile.exists(target_audio):
                        converter.build(source_audio, target_audio)
                    size_in_bytes = os.path.getsize(target_audio)

                    rows.append(
                        (os.path.abspath(target_audio), size_in_bytes, text))

    csv_file_path = os.path.join(output_dir, output_file)
    frame = pandas.DataFrame(
        data=rows, columns=["wav_filename", "wav_filesize", "transcript"])
    frame.to_csv(csv_file_path, index=False, sep="\t")
    print(f"Successfully generated csv file {csv_file_path}")
示例#3
0
def convert_cv(transformer: sox.Transformer, max_per_speaker: int,
               tsv: pathlib.Path, clips: pathlib.Path, sink: pathlib.Path,
               prefix: str):
    """Convert clips listed in a tab-separated metadata file, per speaker.

    Rows are grouped by ``client_id``; only the last ``max_per_speaker`` rows
    of each group are used. For each row, the normalized sentence is written
    to ``<sink>/<client>/<prefix>-<name>.lab`` and the clip is converted to
    ``<sink>/<client>/<prefix>-<name>.wav``, where ``<name>`` is the part of
    the ``path`` column before its first dot. A failure on one row is printed
    and the loop continues with the next row.

    Args:
        transformer: Transformer used to build each output audio file.
        max_per_speaker: Number of trailing rows kept per speaker.
        tsv: Path of the tab-separated metadata file.
        clips: Directory containing the source clips.
        sink: Root output directory; one sub-directory is made per speaker.
        prefix: String prepended to every output file name.
    """
    metadata = pd.read_csv(tsv, delimiter="\t")

    for client, rows in tqdm(metadata.groupby(by="client_id")):
        rows = rows.tail(max_per_speaker)

        speaker_sink = sink / client
        if not speaker_sink.is_dir():
            os.makedirs(speaker_sink)

        for audio, transcription in zip(rows["path"], rows["sentence"]):
            try:
                base_name = audio.split(".")[0]
                source_clip = clips / audio
                target_stem = f"{prefix}-{base_name}"
                wav_target = speaker_sink / f"{target_stem}.wav"
                lab_target = speaker_sink / f"{target_stem}.lab"

                # The label is written first, then the audio is converted.
                with open(str(lab_target), "w") as label_file:
                    transcription = normalize_transcription(transcription)
                    label_file.write(transcription)

                transformer.build_file(input_filepath=str(source_clip),
                                       output_filepath=str(wav_target))
            except Exception as e:
                print(
                    f"Failed to convert audio {audio} with sentence {transcription} reason: {str(e)}"
                )
示例#4
0
def convert_audio_and_split_transcript(input_dir,
                                       source_name,
                                       target_name,
                                       output_dir,
                                       output_file):
    """Convert FLAC files to WAV and build a tab-separated transcript index.

    Args:
        input_dir: directory that holds the input dataset.
        source_name: name of the dataset sub-directory, e.g. test-clean
        target_name: name of the sub-directory for the generated WAV files,
                 e.g. test-clean-wav
        output_dir: directory in which the generated csv file is placed.
        output_file: name of the generated csv file, e.g. test-clean.csv
    """
    logging.info("Processing audio and transcript for %s" % source_name)
    source_dir = os.path.join(input_dir, source_name)
    target_dir = os.path.join(input_dir, target_name)

    if not gfile.Exists(target_dir):
        gfile.MakeDirs(target_dir)

    converter = Transformer()
    rows = []
    # One pass: convert each FLAC file and collect its row for the csv.
    for folder, _, names in gfile.Walk(source_dir):
        for trans_name in fnmatch.filter(names, "*.trans.txt"):
            trans_path = os.path.join(folder, trans_name)
            with codecs.open(trans_path, "r", "utf-8") as handle:
                for entry in handle:
                    utt_id, text = entry.split(" ", 1)
                    # encode() yields bytes, so decode() is needed to get a
                    # str back after dropping the non-ASCII characters.
                    decomposed = unicodedata.normalize("NFKD", text)
                    ascii_text = decomposed.encode("ascii", "ignore").decode(
                        "ascii", "ignore")
                    text = ascii_text.strip().lower()

                    source_audio = os.path.join(folder, utt_id + ".flac")
                    target_audio = os.path.join(target_dir, utt_id + ".wav")
                    if not gfile.Exists(target_audio):
                        converter.build(source_audio, target_audio)
                    audio_length = get_wave_file_length(target_audio)

                    rows.append(
                        (os.path.abspath(target_audio), audio_length, text))

    # The csv has three columns:
    # "wav_filename", "wav_length_ms", "transcript".
    csv_file_path = os.path.join(output_dir, output_file)
    frame = pandas.DataFrame(
        data=rows, columns=["wav_filename", "wav_length_ms", "transcript"]
    )
    frame.to_csv(csv_file_path, index=False, sep="\t")
    logging.info("Successfully generated csv file {}".format(csv_file_path))
示例#5
0
def extract_audio():
    """Extract a mono 44.1 kHz WAV file for every video under ``output_dir``.

    For each video the file is remuxed with ``mkvmerge``, and the first
    timestamp of track 0 (treated as video) and of track 1 (treated as audio)
    is read with ``mkvextract``. Their difference is treated as milliseconds
    and used to pad (audio starts later) or trim (audio starts earlier) the
    audio that ``ffmpeg`` extracts. The result is written next to the video
    with a ``.wav`` extension, and the intermediate files are removed.

    External tools are invoked with argument lists instead of shell strings,
    so paths containing spaces or shell metacharacters are passed unchanged.
    """
    all_videos = find_all_video_files(output_dir)
    for video in tqdm(all_videos):
        video_dir = os.path.dirname(video)
        mkvfile = os.path.join(video_dir, 'temp.mkv')
        video_ts_file = os.path.join(video_dir, 'video_ts.txt')
        audio_ts_file = os.path.join(video_dir, 'audio_ts.txt')

        subprocess.call(['mkvmerge', '-o', mkvfile, video])
        subprocess.call(
            ['mkvextract', mkvfile, 'timestamps_v2', '0:' + video_ts_file])
        subprocess.call(
            ['mkvextract', mkvfile, 'timestamps_v2', '1:' + audio_ts_file])

        with open(video_ts_file, 'r') as f:
            f.readline()  # skip header
            video_start = f.readline()
        with open(audio_ts_file, 'r') as f:
            f.readline()  # skip header
            audio_start = f.readline()
        offset_ms = int(audio_start) - int(video_start)

        # extract audio
        audio_tmp = os.path.join(video_dir, 'temp.wav')
        subprocess.call(
            ['ffmpeg', '-i', video, '-ar', '44100', '-ac', '1', '-y',
             audio_tmp])

        # use the offset to pad the audio with zeros, or trim the audio
        audio_name = os.path.splitext(video)[0] + '.wav'
        tfm = Transformer()
        if offset_ms >= 0:
            tfm.pad(start_duration=offset_ms / 1000)
        else:
            tfm.trim(start_time=-offset_ms / 1000)
        tfm.build(audio_tmp, audio_name)

        for leftover in (mkvfile, audio_tmp, video_ts_file, audio_ts_file):
            os.remove(leftover)
示例#6
0
 def preprocess_wav(cls, fpath: Union[str, Path]) -> np.ndarray:
     """Load a waveform through sox and return it as float32 samples.

     The ``norm`` and ``silence`` effects are applied and the output is
     requested as 16-bit, single-channel audio at ``cls.sample_rate``. The
     resulting array is divided by 2**15 before the float32 conversion.
     """
     output_format = dict(rate=cls.sample_rate, bits=16, channels=1)

     tfm = Transformer()
     tfm.norm()
     tfm.silence(silence_threshold=1, min_silence_duration=0.1)
     tfm.set_output_format(**output_format)

     samples = tfm.build_array(input_filepath=str(fpath))
     scaled = samples / (2**15)
     return scaled.astype(np.float32)
def create_datapoints(transformer: sox.Transformer, writers: Writers,
                      grid: pathlib.Path, audio: pathlib.Path):
    """Write one datapoint per known word interval of a TextGrid.

    The audio file is looked up as ``<audio>/<parent dir of grid>/<stem>.wav``.
    When it is missing, a message is printed and nothing else happens.
    Otherwise every interval of the first tier whose mark is present in
    ``writers.word_counts`` is cut from the audio, widened by 0.1 s on both
    sides and clamped to the array bounds, and handed to ``writers.write``.
    """
    audio_file = audio / grid.parts[-2] / f"{grid.stem}.wav"

    if not audio_file.is_file():
        print(f"File not found: {audio_file}")
        return

    samples = transformer.build_array(input_filepath=str(audio_file))

    tiers = textgrid.TextGrid.fromFile(grid)
    for interval in tiers[0]:
        word = interval.mark
        if word not in writers.word_counts:
            continue

        rate = transformer.output_format["rate"]
        first = int(max((interval.minTime - 0.1) * rate, 0))
        last = int(min((interval.maxTime + 0.1) * rate, samples.size))

        writers.write(word=word, sample_rate=rate, audio=samples[first:last])
示例#8
0
def __process_transcript(file_path: str, dst_folder: str):
    """
    Converts flac files to wav from a given transcript, capturing the metadata.

    Each transcript line is split at its first space into an utterance id and
    its text. Existing wav files are reused; the duration is queried with
    ``soxi -D``, invoked with an argument list so that paths containing spaces
    or shell metacharacters are passed unchanged.

    Args:
        file_path: path to a source transcript  with flac sources
        dst_folder: path where wav files will be stored
    Returns:
        a list of metadata entries for processed files.
    Raises:
        ValueError: if a transcript line contains no space.
    """
    entries = []
    root = os.path.dirname(file_path)
    with open(file_path, encoding="utf-8") as fin:
        for line in fin:
            split_at = line.index(" ")
            utt_id, text = line[:split_at], line[split_at + 1:]
            transcript_text = text.lower().strip()

            # Convert FLAC file to WAV
            flac_file = os.path.join(root, utt_id + ".flac")
            wav_file = os.path.join(dst_folder, utt_id + ".wav")
            if not os.path.exists(wav_file):
                Transformer().build(flac_file, wav_file)
            # check duration
            duration = subprocess.check_output(["soxi", "-D", wav_file])

            entries.append({
                'audio_filepath': os.path.abspath(wav_file),
                'duration': float(duration),
                'text': transcript_text,
            })
    return entries
示例#9
0
 def convert(self):
     """Convert every mp3 below ``self.mp3_directory`` to a wav file.

     Files whose wav counterpart already exists are skipped.

     Return:
       wav_directory (os.path): The directory returned by ``_pre_convert``, into which the wav's are written
     """
     wav_directory = self._pre_convert()
     for mp3_filename in self.mp3_directory.glob('**/*.mp3'):
         stem = os.path.splitext(os.path.basename(mp3_filename))[0]
         wav_filename = path.join(wav_directory, stem + ".wav")
         if path.exists(wav_filename):
             _logger.debug("Already converted mp3 file %s to wav file %s" % (mp3_filename, wav_filename))
             continue

         _logger.debug("Converting mp3 file %s to wav file %s" % (mp3_filename, wav_filename))
         transformer = Transformer()
         transformer.convert(samplerate=SAMPLE_RATE, n_channels=N_CHANNELS, bitdepth=BITDEPTH)
         transformer.build(str(mp3_filename), str(wav_filename))
     return wav_directory
def _convert_audio_and_split_sentences(extracted_dir, data_set, dest_dir):
    """Convert a dataset's FLAC files to WAV and collect their transcripts.

    Every ``*.trans.txt`` file below ``<extracted_dir>/<data_set>`` is read.
    Each line is split at its first space into a sequence id and a transcript,
    which is reduced to lower-case ASCII. The matching FLAC file is converted
    to ``<extracted_dir>/<dest_dir>/<id>.wav`` unless that WAV already exists.

    WAV files left over from an earlier run are listed as well, so the result
    does not depend on whether the conversion ran before.

    Returns:
        A DataFrame with the columns "wav_filename", "wav_filesize" and
        "transcript", one row per utterance whose WAV file is available.
    """
    source_dir = os.path.join(extracted_dir, data_set)
    target_dir = os.path.join(extracted_dir, dest_dir)

    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    files = []
    for root, dirnames, filenames in os.walk(source_dir):
        for filename in fnmatch.filter(filenames, '*.trans.txt'):
            trans_filename = os.path.join(root, filename)
            with codecs.open(trans_filename, "r", "utf-8") as fin:
                for line in fin:
                    # Parse each segment line
                    first_space = line.find(" ")
                    seqid, transcript = line[:first_space], line[first_space+1:]

                    transcript = unicodedata.normalize("NFKD", transcript)  \
                                            .encode("ascii", "ignore")      \
                                            .decode("ascii", "ignore")

                    transcript = transcript.lower().strip()

                    # Convert corresponding FLAC to a WAV
                    flac_file = os.path.join(root, seqid + ".flac")
                    wav_file = os.path.join(target_dir, seqid + ".wav")
                    try:
                        if not os.path.exists(wav_file):
                            Transformer().build(flac_file, wav_file)
                        wav_filesize = os.path.getsize(wav_file)
                    except OSError:
                        print("Could not find file:", wav_file, flac_file)
                        continue

                    files.append((os.path.abspath(wav_file), wav_filesize, transcript))

    return pandas.DataFrame(data=files, columns=["wav_filename", "wav_filesize", "transcript"])
示例#11
0
def _maybe_convert_wav(data_dir, extracted_data, converted_data):
    """Convert all FLAC files of a dataset to WAV, once.

    Nothing is done when ``<data_dir>/<converted_data>`` already exists.
    Otherwise that directory is created, every ``*.flac`` file found below
    ``<data_dir>/<extracted_data>`` is converted into it, and each source FLAC
    file is deleted after its conversion.
    """
    source_dir = os.path.join(data_dir, extracted_data)
    target_dir = os.path.join(data_dir, converted_data)

    # An existing target directory means the conversion is skipped.
    if gfile.Exists(target_dir):
        return

    os.makedirs(target_dir)

    for folder, _subdirs, names in os.walk(source_dir):
        for flac_name in fnmatch.filter(names, '*.flac'):
            flac_file = os.path.join(folder, flac_name)
            wav_name = os.path.splitext(os.path.basename(flac_file))[0] + ".wav"
            wav_file = os.path.join(target_dir, wav_name)
            Transformer().build(flac_file, wav_file)
            os.remove(flac_file)
示例#12
0
def main():
    """Open the page, convert the audio file to wav and submit the answer.

    The browser is driven through ``automatePage`` with the profile obtained
    from the scraped proxy pool. ``audio.mp3`` is then converted to
    ``audio.wav``, and the answer obtained from ``getAnswer`` is submitted.
    """
    fileName = "audio"
    sx = Transformer()
    proxyPool = scraper()
    prefs = getProfile(proxyPool)
    urlAddr, inputs = getInputs()
    browser = automatePage(fireFoxPath=FIREFOX_PATH,
                           prefs=prefs,
                           address=urlAddr,
                           inputList=inputs)

    ##########################
    ### Convert Audio File ###
    ##########################
    # print() with one argument behaves the same on Python 2 and Python 3;
    # the former print statement was a syntax error on Python 3.
    print("Converting Audio File")
    sx.build(fileName + ".mp3", fileName + ".wav")

    answer = getAnswer(fileName)

    submitAnswer(browser, answer)
示例#13
0
    def process(x):
        """Convert one clip to wav and return (wav path, duration, text).

        ``x`` is a ``(file_path, text)`` pair. The text is lower-cased and
        stripped. The clip is resampled to ``args.sample_rate`` with
        ``args.n_channels`` channels and written to ``wav_dir``.
        """
        file_path, text = x
        stem, _extension = os.path.splitext(os.path.basename(file_path))
        text = text.lower().strip()
        source_audio = os.path.join(audio_clips_path, file_path)
        output_wav_path = os.path.join(wav_dir, stem + '.wav')

        converter = Transformer()
        converter.rate(samplerate=args.sample_rate)
        converter.channels(n_channels=args.n_channels)
        converter.build(input_filepath=source_audio,
                        output_filepath=output_wav_path)

        return output_wav_path, sox.file_info.duration(output_wav_path), text
示例#14
0
 def __init__(self, fname):
     '''
         opens .wav audio file with fname

         Reads the wave parameters and all frames, and stores the frames both
         as raw bytes (self.content) and as a numpy array (self.samples)
         whose dtype is looked up in ``types`` by sample width.

         Raises FileNotFoundError (the original error, with its message) when
         the file cannot be found.
     '''
     self.wav_fname = fname
     try:
         with wave.open(fname, mode="r") as wav:
             (self.nchannels, self.sampwidth, self.framerate, self.nframes,
              self.comptype, self.compname) = wav.getparams()
             self.duration = self.nframes / self.framerate
             self.peak = 256**self.sampwidth / 2
             self.content = wav.readframes(self.nframes)
             # np.fromstring is deprecated for binary data; frombuffer gives
             # a read-only view, so copy to keep the array writable.
             self.samples = np.frombuffer(
                 self.content, dtype=types[self.sampwidth]).copy()
             self.tsm = Transformer()
     except FileNotFoundError as err:
         print(err)
         print("Try Audio.reload_file function")
         # Re-raise the caught error so its message and traceback survive.
         raise
示例#15
0
def _convert_audio_and_split_sentences(extracted_dir, data_set, dest_dir):
    """Convert a dataset's FLAC files to WAV and collect their transcripts.

    Every ``*.trans.txt`` file below ``<extracted_dir>/<data_set>`` is read.
    Each line is split at its first space into a sequence id and a transcript,
    which is reduced to lower-case ASCII. The matching FLAC file is converted
    to ``<extracted_dir>/<dest_dir>/<id>.wav`` unless that WAV already exists,
    and the FLAC file is deleted afterwards.

    Returns:
        A DataFrame with the columns "wav_filename", "wav_filesize" and
        "transcript", one row per transcript line.
    """
    source_dir = os.path.join(extracted_dir, data_set)
    target_dir = os.path.join(extracted_dir, dest_dir)

    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    # Loop over the transcription files.
    #
    # The format for each file 1-2.trans.txt is:
    # 1-2-0 transcription of 1-2-0.flac
    # 1-2-1 transcription of 1-2-1.flac
    # ...
    #
    # The corresponding FLACs are converted to WAV in the same pass.
    files = []
    for root, dirnames, filenames in os.walk(source_dir):
        for filename in fnmatch.filter(filenames, '*.trans.txt'):
            trans_filename = os.path.join(root, filename)
            with codecs.open(trans_filename, "r", "utf-8") as fin:
                for line in fin:
                    # Parse each segment line
                    first_space = line.find(" ")
                    seqid, transcript = line[:first_space], line[first_space +
                                                                 1:]

                    # encode() returns bytes on Python 3, so decode() is
                    # needed to get a str back after dropping non-ASCII.
                    transcript = unicodedata.normalize("NFKD", transcript) \
                        .encode("ascii", "ignore")   \
                        .decode("ascii", "ignore")

                    transcript = transcript.lower().strip()

                    # Convert corresponding FLAC to a WAV
                    flac_file = os.path.join(root, seqid + ".flac")
                    wav_file = os.path.join(target_dir, seqid + ".wav")
                    if not os.path.exists(wav_file):
                        Transformer().build(flac_file, wav_file)
                    wav_filesize = os.path.getsize(wav_file)

                    files.append(
                        (os.path.abspath(wav_file), wav_filesize, transcript))
                    # On a re-run the WAV exists but the FLAC was already
                    # deleted; removing it unconditionally would crash.
                    if os.path.exists(flac_file):
                        os.remove(flac_file)

    return pandas.DataFrame(
        data=files, columns=["wav_filename", "wav_filesize", "transcript"])
示例#16
0
def generate_wavs(data_dir):
    """Export the mp3 files of ``data_dir`` as wav and resample them.

    Steps, in order:
      1. every ``*.mp3`` in ``data_dir`` is exported as wav into the "wavs"
         folder, named after characters [-10:-4] of the mp3 path;
      2. every wav in ``data_dir + '/wavs/'`` is converted to ``sample_rate``
         and written next to it with a "k16.wav" suffix;
      3. the wav files without that suffix are deleted;
      4. "k16" is removed from the remaining paths by renaming.
    """
    print("hello")
    for mp3_file in ProgressBar()(glob(path.join(data_dir, '*.mp3'))):
        sound = pydub.AudioSegment.from_mp3(mp3_file)
        short_name = mp3_file[-10:-4]
        target = path.splitext(data_dir)[0] + "/wavs/" + short_name + ".wav"
        sound.export(target, format="wav")

    wav_dir = data_dir + '/wavs/'
    wav_pattern = path.join(wav_dir, '*.wav')

    # change audio file to 16k sample rate
    for wav_file in ProgressBar()(glob(wav_pattern)):
        resampled = path.splitext(wav_file)[0] + "k16.wav"
        transformer = Transformer()
        transformer.convert(samplerate=sample_rate)
        transformer.build(wav_file, resampled)

    # remove old files
    for item in ProgressBar()(glob(wav_pattern)):
        if not item.endswith("k16.wav"):
            os.remove(item)

    # rename files to remove k16
    for item in ProgressBar()(glob(wav_pattern)):
        os.rename(item, item.replace('k16', ''))
    print("end")
示例#17
0
def _maybe_convert_wav_dataset(extracted_dir, data_set):
    """Convert a dataset's sph files to wav files, once.

    Nothing is done when ``<extracted_dir>/<data_set>/wav`` already exists.
    Otherwise that directory is created, each ``*.sph`` file directly inside
    ``<extracted_dir>/<data_set>/sph`` is converted into it and then deleted,
    and finally the sph directory itself is removed.
    """
    source_dir = path.join(extracted_dir, data_set, "sph")
    target_dir = path.join(extracted_dir, data_set, "wav")

    # An existing target directory means the conversion is skipped.
    if gfile.Exists(target_dir):
        return

    makedirs(target_dir)

    for sph_file in glob(path.join(source_dir, "*.sph")):
        transformer = Transformer()
        stem = path.splitext(path.basename(sph_file))[0]
        wav_file = path.join(target_dir, stem + ".wav")
        transformer.build(sph_file, wav_file)
        remove(sph_file)

    rmdir(source_dir)
示例#18
0
def _maybe_convert_wav_dataset(extracted_dir, data_set):
    """Turn the "sph" folder of a dataset into a "wav" folder, once.

    When ``<extracted_dir>/<data_set>/wav`` is already present, the function
    returns without touching anything. Otherwise the folder is created, every
    ``*.sph`` file directly inside the "sph" folder is converted to a wav file
    of the same base name and deleted, and the "sph" folder is removed last.
    """
    data_set_dir_parts = (extracted_dir, data_set)
    source_dir = path.join(*data_set_dir_parts, "sph")
    target_dir = path.join(*data_set_dir_parts, "wav")

    if gfile.Exists(target_dir):
        # Already converted (or at least started): leave everything as is.
        return

    makedirs(target_dir)

    sph_files = glob(path.join(source_dir, "*.sph"))
    for sph_file in sph_files:
        transformer = Transformer()
        base_name, _extension = path.splitext(path.basename(sph_file))
        transformer.build(sph_file, path.join(target_dir, base_name + ".wav"))
        remove(sph_file)

    rmdir(source_dir)
示例#19
0
def __process_data(data_folder: str, dst_folder: str, manifest_file: str):
    """
    Converts flac to wav and build manifests's json

    Every ``*.trans.txt`` file below ``data_folder`` is read; each line is
    split at its first space into an utterance id and its text. Existing wav
    files are reused. The duration is queried with ``soxi -D``, invoked with
    an argument list so that paths containing spaces or shell metacharacters
    are passed unchanged. One JSON object per utterance is written to the
    manifest, one per line.

    Args:
        data_folder: source with flac files
        dst_folder: where wav files will be stored
        manifest_file: where to store manifest

    Returns:
        None

    Raises:
        ValueError: if a transcript line contains no space.
    """

    if not os.path.exists(dst_folder):
        os.makedirs(dst_folder)

    files = []
    entries = []

    for root, dirnames, filenames in os.walk(data_folder):
        for filename in fnmatch.filter(filenames, '*.trans.txt'):
            files.append((os.path.join(root, filename), root))

    for transcripts_file, root in tqdm(files):
        with open(transcripts_file, encoding="utf-8") as fin:
            for line in fin:
                split_at = line.index(" ")
                utt_id, text = line[:split_at], line[split_at + 1:]
                transcript_text = text.lower().strip()

                # Convert FLAC file to WAV; an existing WAV file is reused
                # rather than treated as an error.
                flac_file = os.path.join(root, utt_id + ".flac")
                wav_file = os.path.join(dst_folder, utt_id + ".wav")
                if not os.path.exists(wav_file):
                    Transformer().build(flac_file, wav_file)
                # check duration
                duration = subprocess.check_output(["soxi", "-D", wav_file])

                entries.append({
                    'audio_filepath': os.path.abspath(wav_file),
                    'duration': float(duration),
                    'text': transcript_text,
                })

    with open(manifest_file, 'w') as fout:
        for m in entries:
            fout.write(json.dumps(m) + '\n')
示例#20
0
def sample_triplet(
    transformer: sox.Transformer, utterances: Utterances
) -> ((np.ndarray, str), (np.ndarray, str), (np.ndarray, str)):
    """Sample an (anchor, positive, negative) triplet of utterances.

    The anchor and the positive are two different files of one randomly
    chosen word; the negative is a file of a different randomly chosen word.
    Each element of the result is ``(audio, word)``, where the audio comes
    from ``transformer.build_array`` and the word is the stem of the file's
    parent directory.

    Args:
        transformer: Transformer used to load each audio file.
        utterances: Object providing ``words`` (a sequence of words) and
            ``word_files`` (a mapping from word to a sequence of files).

    Raises:
        ValueError: if there is exactly one word, or the anchor word has
            exactly one file. Sampling a distinct partner is impossible then,
            and the retry loops below would never terminate.
    """
    if len(utterances.words) == 1:
        raise ValueError(
            "sample_triplet needs at least two words to pick a negative")

    anchor = random.choice(utterances.words)

    negative = random.choice(utterances.words)
    while negative == anchor:
        negative = random.choice(utterances.words)

    anchor_candidates = utterances.word_files[anchor]
    if len(anchor_candidates) == 1:
        raise ValueError(
            f"word {anchor!r} has a single file; cannot pick a positive")

    anchor_file = random.choice(anchor_candidates)

    positive_file = random.choice(anchor_candidates)
    while positive_file == anchor_file:
        positive_file = random.choice(anchor_candidates)

    negative_file = random.choice(utterances.word_files[negative])

    anchor_word = anchor_file.parent.stem
    anchor_audio = transformer.build_array(input_filepath=str(anchor_file))

    positive_word = positive_file.parent.stem
    positive_audio = transformer.build_array(input_filepath=str(positive_file))

    negative_word = negative_file.parent.stem
    negative_audio = transformer.build_array(input_filepath=str(negative_file))

    return (anchor_audio, anchor_word), (positive_audio,
                                         positive_word), (negative_audio,
                                                          negative_word)
示例#21
0
def compressed_wav_to_full(source_dir, target_dir):
    """Re-encode every wav file of ``source_dir`` into ``target_dir``.

    The output is single-channel with 'signed-integer' encoding. A sample
    rate is requested only when ``hp.callhome_rate`` differs from 8000.
    ``target_dir`` is created when missing; ``source_dir`` must exist.
    """

    assert path.exists(source_dir) is True

    if not path.exists(target_dir):
        makedirs(target_dir)

    for compressed_file in glob(path.join(source_dir, "*.wav")):
        # Always force single channel; resample only if specified.
        output_format = {'encoding': 'signed-integer', 'channels': 1}
        if hp.callhome_rate != 8000:
            output_format['rate'] = hp.callhome_rate

        transformer = Transformer()
        transformer.set_output_format(**output_format)

        wav_file = path.join(target_dir, path.basename(compressed_file))
        transformer.build(compressed_file, wav_file)
示例#22
0
文件: dataset.py 项目: s3prl/s3prl
def loadFile_thread_exec(data):
    """Load each entry of ``data`` as a float tensor, cropped at random.

    Every file is loaded through sox with the ``norm`` and ``silence``
    effects, requested as 16 kHz / 16-bit / single-channel, and divided by
    2**15. Waveforms longer than ``max_timestep`` are cut to a window of that
    length starting at a random position.

    Returns:
        ``(wavs, lengths)``: a list of float tensors and a list of long
        tensors holding the (possibly cropped) length of each waveform.
    """
    wavs, lengths = [], []
    for position in range(len(data)):
        source = data[position]

        tfm = Transformer()
        tfm.norm()
        tfm.silence(silence_threshold=1, min_silence_duration=0.1)
        tfm.set_output_format(rate=16000, bits=16, channels=1)
        samples = tfm.build_array(input_filepath=str(source))

        wav = torch.tensor(samples / (2**15)).float()
        length = len(wav)
        if length > max_timestep:
            start = random.randint(0, int(length - max_timestep))
            wav = wav[start:start + max_timestep]
            length = max_timestep

        wavs.append(wav)
        lengths.append(torch.tensor(length).long())
    return wavs, lengths
    def process(x):
        """Write the transcript of one clip and convert the clip to wav.

        ``x`` is a ``(file_path, text)`` pair. The stripped, upper-cased text
        is written to ``<txt_dir>/<name>.txt`` and the clip is resampled to
        ``args.sample_rate`` and written to ``<wav_dir>/<name>.wav``.
        """
        file_path, text = x
        stem, _extension = os.path.splitext(os.path.basename(file_path))

        text = text.strip().upper()
        transcript_path = os.path.join(txt_dir, stem + '.txt')
        with open(transcript_path, 'w') as transcript_file:
            transcript_file.write(text)

        source_audio = os.path.join(audio_clips_path, file_path)
        output_wav_path = os.path.join(wav_dir, stem + '.wav')

        converter = Transformer()
        converter.rate(samplerate=args.sample_rate)
        converter.build(input_filepath=source_audio,
                        output_filepath=output_wav_path)
示例#24
0
    def process(x):
        """Convert one clip to wav and return (wav path, duration, text).

        ``x`` is a ``(file_path, text)`` pair. Empty source files are skipped
        with a warning and yield ``('', '', '')``. An existing wav file is
        reused; otherwise the clip is converted with ``args.sample_rate`` and
        ``args.n_channels``.
        """
        file_path, text = x
        stem, _extension = os.path.splitext(os.path.basename(file_path))
        text = text.lower().strip()

        audio_path = os.path.join(audio_clips_path, file_path)
        if os.path.getsize(audio_path) == 0:
            logging.warning(f'Skipping empty audio file {audio_path}')
            return '', '', ''

        output_wav_path = os.path.join(wav_dir, stem + '.wav')
        already_converted = os.path.exists(output_wav_path)
        if not already_converted:
            converter = Transformer()
            converter.rate(samplerate=args.sample_rate)
            converter.channels(n_channels=args.n_channels)
            converter.build(input_filepath=audio_path,
                            output_filepath=output_wav_path)

        return output_wav_path, sox.file_info.duration(output_wav_path), text
示例#25
0
文件: dataset.py 项目: s3prl/s3prl
def loadFile(data, max_timestep):
    """Load one audio file as a float tensor cut to ``max_timestep`` samples.

    The file is loaded through sox with the ``norm`` effect, requested as
    16 kHz / 16-bit / single-channel, and divided by 2**15. Only the first
    ``max_timestep`` samples are kept when the waveform is longer.

    Returns:
        ``(wav, length)``: the float tensor and its length as a long tensor.
    """
    tfm = Transformer()
    tfm.norm()
    # tfm.silence(silence_threshold=1, min_silence_duration=0.1)
    tfm.set_output_format(rate=16000, bits=16, channels=1)
    samples = tfm.build_array(input_filepath=str(data))

    wav = torch.tensor(samples / (2**15)).float()
    length = len(wav)
    if length > max_timestep:
        wav = wav[0:max_timestep]
        length = max_timestep

    return wav, torch.tensor(length).long()
def sph_to_wav(source_dir, target_dir):
    """Convert every .sph file of ``source_dir`` to a .wav in ``target_dir``.

    An explicit output format (single channel, 'signed-integer' encoding,
    rate ``hp.tedlium_rate``) is requested only when ``hp.tedlium_rate``
    differs from 16000. ``target_dir`` is created when missing;
    ``source_dir`` must exist.
    """

    assert path.exists(source_dir) is True

    if not path.exists(target_dir):
        makedirs(target_dir)

    for sph_file in glob(path.join(source_dir, "*.sph")):
        transformer = Transformer()
        needs_resampling = hp.tedlium_rate != 16000
        if needs_resampling:
            transformer.set_output_format(encoding='signed-integer',
                                          channels=1,
                                          rate=hp.tedlium_rate)
        base_name, _extension = path.splitext(path.basename(sph_file))
        wav_file = path.join(target_dir, base_name + ".wav")
        transformer.build(sph_file, wav_file)
示例#27
0
def pitch_shift(aud_seg: AudioSegment, semi: float, **kwargs):
    """Pitch shift audio sample by semi semitones, without changing
    the speed of the audio segment.

    The segment is exported to a temporary wav file, processed by sox into a
    second temporary wav file, and loaded back. Both temporary files are
    closed (and thereby deleted) before the function returns, instead of
    lingering until garbage collection.

    Arguments:
        aud_seg: audio segment to alter
        semi: Number of semitones to pitch audio
        **kwargs: accepted and ignored

    Returns:
        A new AudioSegment loaded from the processed wav file.
    """
    # Create a sox transformer
    tfm = Transformer()
    tfm.pitch(semi)
    # Sox requires an input file and an output file to perform the pitch shift
    with NamedTemporaryFile(suffix='.wav') as temp_in_file, \
            NamedTemporaryFile(suffix='.wav') as temp_out_file:
        aud_seg.export(temp_in_file, format='wav')
        # Make sure everything is on disk before sox opens the file by name.
        temp_in_file.flush()
        tfm.build(temp_in_file.name, temp_out_file.name)

        return AudioSegment.from_file(temp_out_file.name, format='wav')
示例#28
0
    def read(self, audio_metadata):
        """Read an audio file.

        Files whose path does not end in '.wav' are first converted to a
        temporary wav file with the configured sample rate, channel count and
        bit depth. That temporary file is always removed again, also when
        reading or one of the property checks fails.

        :param audio_metadata: metadata info of an audio
        :return: raw audio data as float32 array and duration in seconds.
        """
        temp_path = None
        try:
            # Convert it to a wav file.
            if not audio_metadata.path.endswith('.wav'):
                original_sample_rate = file_info.sample_rate(audio_metadata.path)
                assert self._sample_rate <= original_sample_rate
                transformer = Transformer()
                transformer.convert(samplerate=self._sample_rate,
                                    n_channels=self._channels,
                                    bitdepth=self._bits_per_sample)
                fd, temp_path = tempfile.mkstemp(suffix='.wav')
                # Only the path is needed; sox writes the file by name.
                os.close(fd)
                transformer.build(audio_metadata.path, temp_path)

            audio_path = temp_path if temp_path else audio_metadata.path

            # Read the audio file.
            with SoundFile(audio_path) as soundfile:
                # make sure the audio properties are as expected.
                assert soundfile.samplerate == self._sample_rate
                assert soundfile.channels == self._channels
                duration_sec = len(soundfile) / self._sample_rate
                pcm = soundfile.read(dtype='float32')

            # Add 0.5 second silence to the end of files containing keyword as in occasionally the user stopped
            # recording right after uttering the keyword. If the detector needs some time after seeing the keyword to
            # make a decision (e.g. endpointing) this is going to artificially increase the miss rates.
            if audio_metadata.is_keyword:
                # Match pcm's dtype so the result stays float32 as documented.
                pcm = np.append(pcm, np.zeros(self._sample_rate // 2, dtype=pcm.dtype))

            return pcm, duration_sec
        finally:
            # Runs after the SoundFile is closed, on success and on failure.
            if temp_path:
                os.remove(temp_path)
示例#29
0
def speedup(aud_seg: AudioSegment, speed: float, **kwargs):
    """Speed up (or slow down) audio segment

    The segment is exported to a temporary wav file, run through the sox
    ``tempo`` effect, and the result is loaded back as a new segment. Both
    temporary files are closed (and thereby deleted) before returning.

    Args:
        aud_seg: audio segment to alter
        speed: new playback speed. Should be thought of as a
            percentage. For example, if we want to speed up
            aud_seg by 20%, we pass in 1.2. To slow it down
            to 80%, pass in 0.8
        **kwargs: accepted but not used by this function.

    Returns:
        The ``AudioSegment`` loaded from the tempo-adjusted wav file.
    """
    tfm = Transformer()
    tfm.tempo(speed)
    # Unfortunately, using our current libraries, idk how to make this faster
    # Sox requires an input file and an output file to perform the tempo shift
    with NamedTemporaryFile(suffix='.wav') as temp_in_file, \
            NamedTemporaryFile(suffix='.wav') as temp_out_file:
        aud_seg.export(temp_in_file, format='wav')
        # sox opens the file by name, so make sure nothing is left buffered
        # in this process before handing the path over.
        temp_in_file.flush()
        tfm.build(temp_in_file.name, temp_out_file.name)

        return AudioSegment.from_file(temp_out_file.name, format='wav')
 def convert(self):
     """Convert every mp3 found under ``self.mp3_directory`` into a wav file.

     The target directory comes from ``self._pre_convert()``. An mp3 whose wav
     counterpart already exists there is left untouched.

     Return:
       wav_directory (os.path): The directory into which the wav's are written
     """
     wav_directory = self._pre_convert()
     for mp3_filename in self.mp3_directory.glob('**/*.mp3'):
         stem, _ = os.path.splitext(os.path.basename(mp3_filename))
         wav_filename = path.join(wav_directory, stem + ".wav")

         # Nothing to do when an earlier run already produced this wav.
         if path.exists(wav_filename):
             _logger.debug("Already converted mp3 file %s to wav file %s" %
                           (mp3_filename, wav_filename))
             continue

         _logger.debug("Converting mp3 file %s to wav file %s" %
                       (mp3_filename, wav_filename))
         converter = Transformer()
         converter.convert(samplerate=SAMPLE_RATE,
                           n_channels=N_CHANNELS,
                           bitdepth=BITDEPTH)
         converter.build(str(mp3_filename), str(wav_filename))
     return wav_directory
示例#31
0
def convert_audio_and_split_transcript(input_dir, source_name, target_name,
                                       output_dir, output_file):
    """Convert FLAC to WAV and split the transcript.

    For audio file, convert the format from FLAC to WAV using the
    sox.Transformer library.
    For transcripts, each line contains the sequence id and the corresponding
    transcript (separated by space):
    Input data format: seq-id transcript_of_seq-id
    For example:
     1-2-0 transcript_of_1-2-0.flac
     1-2-1 transcript_of_1-2-1.flac
     ...

    Each sequence id has a corresponding .flac file.
    Parse the transcript file and generate a new csv file (tab separated)
    which has three columns:
    "wav_filename": the absolute path to a wav file.
    "wav_filesize": the size of the corresponding wav file.
    "transcript": the transcript for this audio segment.

    Blank lines in a transcript file are skipped. WAV files that already
    exist in the target directory are not converted again.

    Args:
      input_dir: the directory which holds the input dataset.
      source_name: the name of the specified dataset. e.g. test-clean
      target_name: the directory name for the newly generated audio files.
                   e.g. test-clean-wav
      output_dir: the directory to place the newly generated csv files.
      output_file: the name of the newly generated csv file.
                   e.g. test-clean.csv

    Raises:
      ValueError: if a non-blank transcript line contains no space.
    """

    tf.logging.info("Preprocessing audio and transcript for %s" % source_name)
    source_dir = os.path.join(input_dir, source_name)
    target_dir = os.path.join(input_dir, target_name)

    if not tf.gfile.Exists(target_dir):
        tf.gfile.MakeDirs(target_dir)

    files = []
    tfm = Transformer()
    # Convert all FLAC file into WAV format. At the same time, generate the csv
    # file.
    for root, _, filenames in tf.gfile.Walk(source_dir):
        for filename in fnmatch.filter(filenames, "*.trans.txt"):
            trans_file = os.path.join(root, filename)
            with codecs.open(trans_file, "r", "utf-8") as fin:
                for line in fin:
                    # A blank line (e.g. trailing newline at end of file) has
                    # no sequence id and would break the unpacking below.
                    if not line.strip():
                        continue
                    seqid, transcript = line.split(" ", 1)
                    # encode() returns bytes, so decode again to get back a
                    # str containing only the ASCII characters.
                    transcript = unicodedata.normalize(
                        "NFKD", transcript).encode("ascii", "ignore").decode(
                            "ascii", "ignore").strip().lower()

                    # Convert FLAC to WAV.
                    flac_file = os.path.join(root, seqid + ".flac")
                    wav_file = os.path.join(target_dir, seqid + ".wav")
                    if not tf.gfile.Exists(wav_file):
                        tfm.build(flac_file, wav_file)
                    wav_filesize = os.path.getsize(wav_file)

                    files.append(
                        (os.path.abspath(wav_file), wav_filesize, transcript))

    # Write to CSV file which contains three columns:
    # "wav_filename", "wav_filesize", "transcript".
    csv_file_path = os.path.join(output_dir, output_file)
    df = pandas.DataFrame(
        data=files, columns=["wav_filename", "wav_filesize", "transcript"])
    df.to_csv(csv_file_path, index=False, sep="\t")
    tf.logging.info("Successfully generated csv file {}".format(csv_file_path))
示例#32
0
def convert_audio_and_split_transcript(directory, subset, output_dir):
    """Convert SPH to WAV and split the transcript.

    Every usable transcript line is turned into one trimmed WAV file under
    ``<directory>/<subset>/wav`` and one row of ``<output_dir>/<subset>.csv``
    (tab separated; columns "wav_filename", "wav_length_ms", "transcript",
    "speaker"). Segments no longer than 0.1 s, segments with an empty
    normalized transcript, and segments whose SPH file was not found are
    skipped. Per-channel WAV files are staged in a temporary directory under
    ``<directory>/tmp``.

    Args:
        directory: the directory which holds the input dataset.
        subset: the name of the specified dataset. supports train
                (switchboard+fisher), switchboard, fisher, hub500 and rt03s.
        output_dir: the directory to place the newly generated csv files.

    Raises:
        ValueError: if ``subset`` is not one of the supported names, or if an
            indexed SPH file no longer exists when it is needed.
        OSError: if the sph2pipe executable cannot be started.
    """
    # Imported locally so this function does not depend on the module's
    # import block already providing it.
    import subprocess

    logging.info("Processing audio and transcript for %s" % subset)
    gfile = tf.compat.v1.gfile
    sph2pip = os.path.join(os.path.dirname(__file__), "../utils/sph2pipe")

    swd_audio_trans_dir = [os.path.join(directory, "LDC97S62")]
    fisher_audio_dirs = [
        os.path.join(directory, "LDC2004S13"),
        os.path.join(directory, "LDC2005S13"),
    ]
    fisher_trans_dirs = [
        os.path.join(directory, "LDC2004T19"),
        os.path.join(directory, "LDC2005T19"),
    ]
    hub_audio_dir = [os.path.join(directory, "LDC2002S09")]
    hub_trans_dir = [os.path.join(directory, "LDC2002T43")]
    rts_audio_trans_dir = [os.path.join(directory, "LDC2007S10")]

    # subset name -> (audio directories, transcript directories)
    subset_dirs = {
        # Combination of switchboard corpus and fisher corpus.
        "train": (swd_audio_trans_dir + fisher_audio_dirs,
                  swd_audio_trans_dir + fisher_trans_dirs),
        "switchboard": (swd_audio_trans_dir, swd_audio_trans_dir),
        "fisher": (fisher_audio_dirs, fisher_trans_dirs),
        "hub500": (hub_audio_dir, hub_trans_dir),
        "rt03s": (rts_audio_trans_dir, rts_audio_trans_dir),
    }
    if subset not in subset_dirs:
        # Build a single message; passing two arguments to ValueError would
        # render the error as a tuple.
        raise ValueError("{} is not in switchboard_fisher".format(subset))
    audio_dir, trans_dir = subset_dirs[subset]

    subset_dir = os.path.join(directory, subset)
    output_wav_dir = os.path.join(directory, subset + "/wav")
    tmp_dir = os.path.join(directory, "tmp")
    for needed_dir in (subset_dir, output_wav_dir, tmp_dir):
        if not gfile.Exists(needed_dir):
            gfile.MakeDirs(needed_dir)

    # Build SPH dict: file name without extension -> full path.
    files = []
    sph_files_dict = {}
    for sub_audio_dir in audio_dir:
        for root, _, filenames in gfile.Walk(sub_audio_dir):
            for filename in fnmatch.filter(filenames, "*.[Ss][Pp][Hh]"):
                sph_key = os.path.splitext(filename)[0]
                sph_files_dict[sph_key] = os.path.join(root, filename)

    # Transcript files below any directory whose path contains one of these
    # fragments are ignored.
    skipped_path_parts = (
        "doc",
        "DOC",
        "mandarin",
        "arabic",
        "concatenated",
        "bnews",
    )

    with TemporaryDirectory(dir=tmp_dir) as output_tmp_wav_dir:
        for sub_trans_dir in trans_dir:
            # Pick the file pattern and line parser matching this corpus.
            if sub_trans_dir in swd_audio_trans_dir:
                fnmatch_pat = "*-trans.text"
                split_and_norm_func = split_line_and_norm_swd
            elif sub_trans_dir in fisher_trans_dirs:
                fnmatch_pat = "*.[Tt][Xx][Tt]"
                split_and_norm_func = split_line_and_norm_fisher
            elif sub_trans_dir in hub_trans_dir:
                fnmatch_pat = "hub5e00.english.000405.stm"
                split_and_norm_func = split_line_and_norm_hub_rts
            else:
                fnmatch_pat = "*.stm"
                split_and_norm_func = split_line_and_norm_hub_rts

            for root, _, filenames in gfile.Walk(sub_trans_dir):
                # The exclusion depends only on the directory, so decide once
                # per directory rather than once per file.
                if any(part in root for part in skipped_path_parts):
                    continue
                for filename in fnmatch.filter(filenames, fnmatch_pat):
                    trans_file = os.path.join(root, filename)
                    with codecs.open(trans_file, "r", "utf-8") as fin:
                        for line in fin:
                            line = line.strip()
                            (
                                sph_key,
                                speaker,
                                time_start,
                                time_end,
                                norm_trans,
                            ) = split_and_norm_func(line, filename)

                            # Too short, skip the wave file
                            if time_end - time_start <= 0.1:
                                continue
                            if norm_trans == "":
                                continue
                            # Speaker "A" is read from channel 1, any other
                            # speaker from channel 2.
                            channel = 1 if speaker == "A" else 2

                            # Convert SPH to split WAV.
                            if sph_key not in sph_files_dict:
                                print(sph_key + " not found, please check.")
                                continue
                            sph_file = sph_files_dict[sph_key]
                            wav_file = os.path.join(
                                output_tmp_wav_dir,
                                sph_key + "." + speaker + ".wav")
                            if not gfile.Exists(sph_file):
                                raise ValueError(
                                    "the sph file {} is not exists".format(
                                        sph_file))

                            # Segment boundaries are encoded in the file name
                            # as rounded hundredths of the original values.
                            sub_wav_filename = "{0}-{1}-{2:06d}-{3:06d}".format(
                                sph_key,
                                speaker,
                                round(time_start * 100),
                                round(time_end * 100),
                            )
                            sub_wav_file = os.path.join(
                                output_wav_dir, sub_wav_filename + ".wav")

                            if not gfile.Exists(sub_wav_file):
                                if not gfile.Exists(wav_file):
                                    # Argument list instead of a shell string:
                                    # paths containing spaces or shell
                                    # metacharacters are passed through intact.
                                    status = subprocess.call([
                                        sph2pip, "-f", "wav", "-c",
                                        str(channel), "-p", sph_file, wav_file
                                    ])
                                    if status != 0:
                                        logging.warning(
                                            "sph2pipe exited with status %s "
                                            "for %s", status, sph_file)
                                tfm = Transformer()
                                tfm.trim(time_start, time_end)
                                tfm.build(wav_file, sub_wav_file)

                            wav_length = get_wave_file_length(sub_wav_file)
                            speaker_name = sph_key + "-" + speaker
                            files.append(
                                (os.path.abspath(sub_wav_file), wav_length,
                                 norm_trans, speaker_name))

    # Write to CSV file which contains four columns:
    # "wav_filename", "wav_length_ms", "transcript", "speaker".
    out_csv_file = os.path.join(output_dir, subset + ".csv")
    df = pandas.DataFrame(
        data=files,
        columns=["wav_filename", "wav_length_ms", "transcript", "speaker"])
    df.to_csv(out_csv_file, index=False, sep="\t")
    logging.info("Successfully generated csv file {}".format(out_csv_file))
示例#33
0
def _maybe_convert_wav(mp3_filename, wav_filename):
    """Convert ``mp3_filename`` to ``wav_filename`` unless the wav exists.

    The conversion resamples to ``SAMPLE_RATE``; an existing ``wav_filename``
    is left untouched.
    """
    if path.exists(wav_filename):
        return
    converter = Transformer()
    converter.convert(samplerate=SAMPLE_RATE)
    converter.build(mp3_filename, wav_filename)
示例#34
0
def convert_audio_and_split_transcript(input_dir, source_name, target_name,
                                       output_dir, output_file):
  """Convert FLAC to WAV and split the transcript.

  For audio file, convert the format from FLAC to WAV using the sox.Transformer
  library.
  For transcripts, each line contains the sequence id and the corresponding
  transcript (separated by space):
  Input data format: seq-id transcript_of_seq-id
  For example:
   1-2-0 transcript_of_1-2-0.flac
   1-2-1 transcript_of_1-2-1.flac
   ...

  Each sequence id has a corresponding .flac file.
  Parse the transcript file and generate a new csv file (tab separated) which
  has three columns:
  "wav_filename": the absolute path to a wav file.
  "wav_filesize": the size of the corresponding wav file.
  "transcript": the transcript for this audio segment.

  Whitespace-only lines in a transcript file are ignored, and a WAV file that
  is already present in the target directory is reused instead of rebuilt.

  Args:
    input_dir: the directory which holds the input dataset.
    source_name: the name of the specified dataset. e.g. test-clean
    target_name: the directory name for the newly generated audio files.
                 e.g. test-clean-wav
    output_dir: the directory to place the newly generated csv files.
    output_file: the name of the newly generated csv file. e.g. test-clean.csv

  Raises:
    ValueError: if a non-blank transcript line has no space separating the
      sequence id from the transcript.
  """

  tf.logging.info("Preprocessing audio and transcript for %s" % source_name)
  source_dir = os.path.join(input_dir, source_name)
  target_dir = os.path.join(input_dir, target_name)

  if not tf.gfile.Exists(target_dir):
    tf.gfile.MakeDirs(target_dir)

  files = []
  tfm = Transformer()
  # Convert all FLAC file into WAV format. At the same time, generate the csv
  # file.
  for root, _, filenames in tf.gfile.Walk(source_dir):
    for filename in fnmatch.filter(filenames, "*.trans.txt"):
      trans_file = os.path.join(root, filename)
      with codecs.open(trans_file, "r", "utf-8") as fin:
        for line in fin:
          # Skip empty lines: they carry no "seq-id transcript" pair and would
          # make the two-way unpacking below fail.
          if not line.strip():
            continue
          seqid, transcript = line.split(" ", 1)
          # encode() yields bytes; decode it again so the transcript is a str
          # restricted to ASCII characters.
          transcript = unicodedata.normalize("NFKD", transcript).encode(
              "ascii", "ignore").decode("ascii", "ignore").strip().lower()

          # Convert FLAC to WAV.
          flac_file = os.path.join(root, seqid + ".flac")
          wav_file = os.path.join(target_dir, seqid + ".wav")
          if not tf.gfile.Exists(wav_file):
            tfm.build(flac_file, wav_file)
          wav_filesize = os.path.getsize(wav_file)

          files.append((os.path.abspath(wav_file), wav_filesize, transcript))

  # Write to CSV file which contains three columns:
  # "wav_filename", "wav_filesize", "transcript".
  csv_file_path = os.path.join(output_dir, output_file)
  df = pandas.DataFrame(
      data=files, columns=["wav_filename", "wav_filesize", "transcript"])
  df.to_csv(csv_file_path, index=False, sep="\t")
  tf.logging.info("Successfully generated csv file {}".format(csv_file_path))