Example No. 1
def generate_wavs(data_dir):
    print("hello")
    pbar = ProgressBar()
    for mp3_file in pbar(glob(path.join(data_dir, '*.mp3'))):
        sound = pydub.AudioSegment.from_mp3(mp3_file)
        filename = path.splitext(path.basename(mp3_file))[0]
        new_file = path.join(data_dir, "wavs", filename + ".wav")
        sound.export(new_file, format="wav")
    pbar = ProgressBar()
    data_dir = data_dir + '/wavs/'
    # Resample the audio files to a 16 kHz sample rate.
    for wav_file in pbar(glob(path.join(data_dir, '*.wav'))):
        new_file = path.splitext(wav_file)[0] + "k16.wav"
        transformer = Transformer()
        # sample_rate is assumed to be defined at module level (e.g. 16000).
        transformer.convert(samplerate=sample_rate)
        transformer.build(wav_file, new_file)
    pbar = ProgressBar()
    # remove old files
    for item in pbar(glob(path.join(data_dir, '*.wav'))):
        if item.endswith("k16.wav"):
            continue
        else:
            os.remove(item)
    pbar = ProgressBar()
    # rename files to remove k16
    for item in pbar(glob(path.join(data_dir, '*.wav'))):
        os.rename(item, item.replace('k16', ''))
    print("end")
Example No. 2
def extract_audio():
    all_videos = find_all_video_files(output_dir)
    for video in tqdm(all_videos):
        mkvfile = os.path.join(os.path.dirname(video), 'temp.mkv')
        command = 'mkvmerge -o ' + mkvfile + ' ' + video
        subprocess.call(command, shell=True)
        video_ts_file = os.path.join(os.path.dirname(video), 'video_ts.txt')
        audio_ts_file = os.path.join(os.path.dirname(video), 'audio_ts.txt')
        command = 'mkvextract ' + mkvfile + ' timestamps_v2 0:' + video_ts_file
        subprocess.call(command, shell=True)
        command = 'mkvextract ' + mkvfile + ' timestamps_v2 1:' + audio_ts_file
        subprocess.call(command, shell=True)
        with open(video_ts_file, 'r') as f:
            f.readline()  # skip header
            video_start = f.readline()
        with open(audio_ts_file, 'r') as f:
            f.readline()  # skip header
            audio_start = f.readline()
        # timestamps_v2 values are in milliseconds and may be fractional.
        offset_ms = float(audio_start) - float(video_start)
        # extract audio
        audio_tmp = os.path.join(os.path.dirname(video), 'temp.wav')
        command = 'ffmpeg -i ' + video + ' -ar 44100 -ac 1 -y ' + audio_tmp
        subprocess.call(command, shell=True)
        # use the offset to pad the audio with zeros, or trim the audio
        audio_name = os.path.splitext(video)[0] + '.wav'
        tfm = Transformer()
        if offset_ms >= 0:
            # Audio starts later than the video: pad the front with silence.
            tfm.pad(start_duration=offset_ms / 1000)
        else:
            # Audio starts earlier: trim the difference off the front.
            tfm.trim(start_time=-offset_ms / 1000)
        tfm.build(audio_tmp, audio_name)
        os.remove(mkvfile)
        os.remove(audio_tmp)
        os.remove(video_ts_file)
        os.remove(audio_ts_file)
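
The pad-or-trim logic at the end generalizes to any fixed offset; a minimal standalone sketch, with a hypothetical offset value and placeholder file names:

from sox import Transformer

offset_ms = 120  # hypothetical audio-minus-video start offset, in milliseconds
tfm = Transformer()
if offset_ms >= 0:
    tfm.pad(start_duration=offset_ms / 1000)  # audio starts late: prepend silence
else:
    tfm.trim(start_time=-offset_ms / 1000)    # audio starts early: cut the excess
tfm.build("temp.wav", "aligned.wav")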
Example No. 3
def convert_audio_and_split_transcript(input_dir, source_name, target_name,
                                       output_dir, output_file):
    print(f"Pre-processing audio and transcript for {source_name}")
    source_dir = os.path.join(input_dir, source_name)
    target_dir = os.path.join(input_dir, target_name)

    if not tf.io.gfile.exists(target_dir):
        tf.io.gfile.makedirs(target_dir)

    files = []
    tfm = Transformer()
    for root, _, filenames in tf.io.gfile.walk(source_dir):
        for filename in fnmatch.filter(filenames, "*.trans.txt"):
            trans_file = os.path.join(root, filename)
            with codecs.open(trans_file, "r", "utf-8") as fin:
                for line in fin:
                    seq_id, transcript = line.split(" ", 1)
                    transcript = unicodedata.normalize(
                        "NFKD", transcript).encode("ascii", "ignore").decode(
                            "ascii", "ignore").strip().lower()

                    flac_file = os.path.join(root, seq_id + ".flac")
                    wav_file = os.path.join(target_dir, seq_id + ".wav")
                    if not tf.io.gfile.exists(wav_file):
                        tfm.build(flac_file, wav_file)
                    wav_filesize = os.path.getsize(wav_file)

                    files.append(
                        (os.path.abspath(wav_file), wav_filesize, transcript))

    csv_file_path = os.path.join(output_dir, output_file)
    df = pandas.DataFrame(
        data=files, columns=["wav_filename", "wav_filesize", "transcript"])
    df.to_csv(csv_file_path, index=False, sep="\t")
    print(f"Successfully generated csv file {csv_file_path}")
Example No. 4
def convert_audio_and_split_transcript(input_dir,
                                       source_name,
                                       target_name,
                                       output_dir,
                                       output_file):
    """Convert FLAC to WAV and split the transcript.
    Args:
        input_dir: the directory which holds the input dataset.
        source_name: the name of the specified dataset. e.g. test-clean
        target_name: the directory name for the newly generated audio files.
                 e.g. test-clean-wav
        output_dir: the directory to place the newly generated csv files.
        output_file: the name of the newly generated csv file. e.g. test-clean.csv
    """

    logging.info("Processing audio and transcript for %s" % source_name)
    source_dir = os.path.join(input_dir, source_name)
    target_dir = os.path.join(input_dir, target_name)

    if not gfile.Exists(target_dir):
        gfile.MakeDirs(target_dir)

    files = []
    tfm = Transformer()
    # Convert all FLAC files to WAV format. At the same time, generate the
    # csv file.
    for root, _, filenames in gfile.Walk(source_dir):
        for filename in fnmatch.filter(filenames, "*.trans.txt"):
            trans_file = os.path.join(root, filename)
            with codecs.open(trans_file, "r", "utf-8") as fin:
                for line in fin:
                    seqid, transcript = line.split(" ", 1)
                    # We do an encode-decode round trip here because encode()
                    # returns a bytes object and we need a string.
                    transcript = (
                        unicodedata.normalize("NFKD", transcript)
                        .encode("ascii", "ignore")
                        .decode("ascii", "ignore")
                        .strip()
                        .lower()
                    )

                    # Convert FLAC to WAV.
                    flac_file = os.path.join(root, seqid + ".flac")
                    wav_file = os.path.join(target_dir, seqid + ".wav")
                    if not gfile.Exists(wav_file):
                        tfm.build(flac_file, wav_file)
                    # wav_filesize = os.path.getsize(wav_file)
                    wav_length = get_wave_file_length(wav_file)

                    files.append((os.path.abspath(wav_file), wav_length, transcript))
    # Write to CSV file which contains three columns:
    # "wav_filename", "wav_length_ms", "transcript".
    csv_file_path = os.path.join(output_dir, output_file)
    df = pandas.DataFrame(
        data=files, columns=["wav_filename", "wav_length_ms", "transcript"]
    )
    df.to_csv(csv_file_path, index=False, sep="\t")
    logging.info("Successfully generated csv file {}".format(csv_file_path))
Example No. 5
def cut(input_path, output_file, metadata):
    segments = metadata['segments']
    segments = [segment_seconds(segment) for segment in segments]

    with TempFile('.mp3') as temp_file:
        # Open a new temporary file to store audio in between processes
        if segments:
            # Cut audio into segments and create fade in/out
            # We need to use a new temporary file for each
            # audio segment
            temp_segments = [TempFile('.mp3') for segment in segments]
            try:
                for index, segment in enumerate(segments):
                    sox = Transformer()
                    sox.channels(1)
                    sox.norm(-24)
                    sox.trim(*segment)
                    sox.fade(1, 2, 't')
                    sox.build(input_path, temp_segments[index].path)

                if len(segments) > 1:
                    # Concatenate all the audio segments back together
                    # and output to our main temporary file
                    Combiner().build(
                        [temp_segment.path for temp_segment in temp_segments],
                        temp_file.path,
                        'concatenate',
                    )
                else:
                    # Only one segment so we don't need to combine anything
                    subprocess.run(
                        ['cp', temp_segments[0].path, temp_file.path])

            finally:
                # Cleanup temporary segment files even on error
                if temp_segments:
                    for temp_segment in temp_segments:
                        temp_segment.close()

        # Second process: filter, compress and EQ the
        # audio in temporary file and output to output_file
        sox = Transformer()
        sox.highpass(100)
        sox.lowpass(10000)
        sox.compand(0.005, 0.12, 6, [
            (-90, -90),
            (-70, -55),
            (-50, -35),
            (-32, -32),
            (-24, -24),
            (0, -8),
        ])
        sox.equalizer(3000, 1000, 3)
        sox.equalizer(280, 120, 3)
        sox.build(temp_file.path, output_file)
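
The concatenation step relies on pysox's Combiner; in isolation it looks like this minimal sketch (file names are placeholders):

from sox import Combiner

cbn = Combiner()
# Join the per-segment files end-to-end into a single output file.
cbn.build(["seg1.mp3", "seg2.mp3"], "joined.mp3", "concatenate")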
Example No. 6
def _convert_audio_and_split_sentences(extracted_dir, data_set, dest_dir):
    source_dir = os.path.join(extracted_dir, data_set)
    target_dir = os.path.join(extracted_dir, dest_dir)

    if not os.path.exists(target_dir):
        os.makedirs(target_dir)

    # Loop over transcription files and split each one
    #
    # The format for each file 1-2.trans.txt is:
    #  1-2-0 transcription of 1-2-0.flac
    #  1-2-1 transcription of 1-2-1.flac
    #  ...
    #
    # Each file is then split into several files:
    #  1-2-0.txt (contains transcription of 1-2-0.flac)
    #  1-2-1.txt (contains transcription of 1-2-1.flac)
    #  ...
    #
    # We also convert the corresponding FLACs to WAV in the same pass
    files = []
    for root, dirnames, filenames in os.walk(source_dir):
        for filename in fnmatch.filter(filenames, "*.trans.txt"):
            trans_filename = os.path.join(root, filename)
            with codecs.open(trans_filename, "r", "utf-8") as fin:
                for line in fin:
                    # Parse each segment line
                    first_space = line.find(" ")
                    seqid, transcript = line[:first_space], line[first_space + 1 :]

                    # We need to do the encode-decode dance here because encode
                    # returns a bytes() object on Python 3, and text_to_char_array
                    # expects a string.
                    transcript = (
                        unicodedata.normalize("NFKD", transcript)
                        .encode("ascii", "ignore")
                        .decode("ascii", "ignore")
                    )

                    transcript = transcript.lower().strip()

                    # Convert corresponding FLAC to a WAV
                    flac_file = os.path.join(root, seqid + ".flac")
                    wav_file = os.path.join(target_dir, seqid + ".wav")
                    if not os.path.exists(wav_file):
                        tfm = Transformer()
                        tfm.set_output_format(rate=SAMPLE_RATE)
                        tfm.build(flac_file, wav_file)
                    wav_filesize = os.path.getsize(wav_file)

                    files.append((os.path.abspath(wav_file), wav_filesize, transcript))

    return pandas.DataFrame(
        data=files, columns=["wav_filename", "wav_filesize", "transcript"]
    )
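
A hypothetical call, assuming a LibriSpeech-style directory and that SAMPLE_RATE is defined at module level:

df = _convert_audio_and_split_sentences("/data/LibriSpeech", "dev-clean", "dev-clean-wav")
df.to_csv("/data/dev-clean.csv", index=False)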
Example No. 7
    def process(x):
        file_path, text = x
        file_name = os.path.splitext(os.path.basename(file_path))[0]
        text = text.lower().strip()
        audio_path = os.path.join(audio_clips_path, file_path)
        output_wav_path = os.path.join(wav_dir, file_name + '.wav')

        tfm = Transformer()
        tfm.rate(samplerate=args.sample_rate)
        tfm.build(input_filepath=audio_path, output_filepath=output_wav_path)
        duration = sox.file_info.duration(output_wav_path)
        return output_wav_path, duration, text

    # Variant of process() that also writes the transcript to a .txt file.
    def process(x):
        file_path, text = x
        file_name = os.path.splitext(os.path.basename(file_path))[0]
        text = text.strip().upper()
        with open(os.path.join(txt_dir, file_name + '.txt'), 'w') as f:
            f.write(text)
        audio_path = os.path.join(audio_clips_path, file_path)
        output_wav_path = os.path.join(wav_dir, file_name + '.wav')

        tfm = Transformer()
        tfm.rate(samplerate=args.sample_rate)
        tfm.build(input_filepath=audio_path, output_filepath=output_wav_path)
Example No. 9
 def convert(self):
     """Converts the mp3's associated with this instance to wav's
     Return:
       wav_directory (os.path): The directory into which the associated wav's were downloaded
     """
     wav_directory = self._pre_convert()
     for mp3_filename in self.mp3_directory.glob('**/*.mp3'):
         wav_filename = path.join(wav_directory, os.path.splitext(os.path.basename(mp3_filename))[0] + ".wav")
         if not path.exists(wav_filename):
             _logger.debug("Converting mp3 file %s to wav file %s" % (mp3_filename, wav_filename))
             transformer = Transformer()
             transformer.convert(samplerate=SAMPLE_RATE, n_channels=N_CHANNELS, bitdepth=BITDEPTH)
             transformer.build(str(mp3_filename), str(wav_filename))
         else:
             _logger.debug("Already converted mp3 file %s to wav file %s" % (mp3_filename, wav_filename))
     return wav_directory
Example No. 10
def compressed_wav_to_full(source_dir, target_dir):
    """Convert compressed wav files to full wav files."""

    assert path.exists(source_dir)

    if not path.exists(target_dir):
        makedirs(target_dir)

    for compressed_file in glob(path.join(source_dir, "*.wav")):
        transformer = Transformer()
        if hp.callhome_rate == 8000:
            transformer.set_output_format(encoding='signed-integer', channels=1)  # Also set single channel.
        else:  # Do resampling if specified.
            transformer.set_output_format(encoding='signed-integer', channels=1, rate=hp.callhome_rate)
        wav_filename = path.basename(compressed_file)
        wav_file = path.join(target_dir, wav_filename)
        transformer.build(compressed_file, wav_file)

def sph_to_wav(source_dir, target_dir):
    """Convert .sph files to .wav files."""

    assert path.exists(source_dir)

    if not path.exists(target_dir):
        makedirs(target_dir)

    for sph_file in glob(path.join(source_dir, "*.sph")):
        transformer = Transformer()
        if hp.tedlium_rate != 16000:
            transformer.set_output_format(encoding='signed-integer',
                                          channels=1,
                                          rate=hp.tedlium_rate)
        wav_filename = path.splitext(path.basename(sph_file))[0] + ".wav"
        wav_file = path.join(target_dir, wav_filename)
        transformer.build(sph_file, wav_file)
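
Both helpers center on Transformer.set_output_format; reduced to its essentials, with placeholder values and file names:

from sox import Transformer

tfm = Transformer()
# Force signed 16 kHz mono output regardless of the input encoding.
tfm.set_output_format(encoding='signed-integer', channels=1, rate=16000)
tfm.build("input.sph", "output.wav")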
Example No. 12
def _maybe_convert_wav(data_dir, extracted_data, converted_data):
    source_dir = os.path.join(data_dir, extracted_data)
    target_dir = os.path.join(data_dir, converted_data)

    # Conditionally convert FLAC files to wav files
    if not gfile.Exists(target_dir):
        # Create target_dir
        os.makedirs(target_dir)

        # Loop over FLAC files in source_dir and convert each to wav
        for root, dirnames, filenames in os.walk(source_dir):
            for filename in fnmatch.filter(filenames, '*.flac'):
                flac_file = os.path.join(root, filename)
                wav_filename = os.path.splitext(os.path.basename(flac_file))[0] + ".wav"
                wav_file = os.path.join(target_dir, wav_filename)
                transformer = Transformer()
                transformer.build(flac_file, wav_file)
                os.remove(flac_file)
Example No. 13
def pitch_shift(aud_seg: AudioSegment, semi: float, **kwargs):
    """Pitch shift audio sample by semi semitones, without changing
    the speed of the audio segment.

    Arguments:
        aud_seg: audio segment to alter
        semi: Number of semitones to pitch audio
    """
    # Create a sox transformer
    tfm = Transformer()
    tfm.pitch(semi)
    # Unfortunately, with our current libraries there is no obvious way to
    # make this faster: sox requires an input file and an output file to
    # perform the pitch shift.
    temp_in_file = NamedTemporaryFile(suffix='.wav')
    aud_seg.export(temp_in_file, format='wav')
    temp_out_file = NamedTemporaryFile(suffix='.wav')
    tfm.build(temp_in_file.name, temp_out_file.name)

    return AudioSegment.from_file(temp_out_file.name, format='wav')
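
A usage sketch, assuming pydub is available and the input file name is a placeholder:

from pydub import AudioSegment

seg = AudioSegment.from_file("voice.wav")
shifted = pitch_shift(seg, 2.0)  # two semitones up, duration unchanged
shifted.export("voice_up2.wav", format="wav")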
Example No. 14
    def process(x):
        file_path, text = x
        file_name = os.path.splitext(os.path.basename(file_path))[0]
        text = text.lower().strip()
        audio_path = os.path.join(audio_clips_path, file_path)
        if os.path.getsize(audio_path) == 0:
            logging.warning(f'Skipping empty audio file {audio_path}')
            return '', '', ''

        output_wav_path = os.path.join(wav_dir, file_name + '.wav')

        if not os.path.exists(output_wav_path):
            tfm = Transformer()
            tfm.rate(samplerate=args.sample_rate)
            tfm.channels(n_channels=args.n_channels)
            tfm.build(input_filepath=audio_path, output_filepath=output_wav_path)

        duration = sox.file_info.duration(output_wav_path)
        return output_wav_path, duration, text
Example No. 15
    def read(self, audio_metadata):
        """Read an audio file.

        :param audio_metadata: metadata info of an audio
        :return: raw audio data as float32 array and duration in seconds.
        """
        fd = temp_path = None
        # Convert it to a wav file.
        if not audio_metadata.path.endswith('.wav'):
            original_sample_rate = file_info.sample_rate(audio_metadata.path)
            assert self._sample_rate <= original_sample_rate
            transformer = Transformer()
            transformer.convert(samplerate=self._sample_rate,
                                n_channels=self._channels,
                                bitdepth=self._bits_per_sample)
            fd, temp_path = tempfile.mkstemp(suffix='.wav')
            transformer.build(audio_metadata.path, temp_path)

        if temp_path:
            path = temp_path
        else:
            path = audio_metadata.path

        # Read the audio file.
        with SoundFile(path) as soundfile:
            # make sure the audio properties are as expected.
            assert soundfile.samplerate == self._sample_rate
            assert soundfile.channels == self._channels
            duration_sec = len(soundfile) / self._sample_rate
            pcm = soundfile.read(dtype='float32')

            # Add 0.5 seconds of silence to the end of files containing the
            # keyword, since the user occasionally stopped recording right
            # after uttering it. If the detector needs some time after seeing
            # the keyword to make a decision (e.g. endpointing), the abrupt
            # ending would artificially increase the miss rate.
            if audio_metadata.is_keyword:
                pcm = np.append(pcm, np.zeros(self._sample_rate // 2))

            if temp_path:
                os.close(fd)
                os.remove(temp_path)

            return pcm, duration_sec
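
The convert-to-a-temporary-WAV pattern used above, reduced to a standalone sketch (the sample rate, channel count, bit depth and file name are assumptions):

import os
import tempfile

from sox import Transformer

tfm = Transformer()
tfm.convert(samplerate=16000, n_channels=1, bitdepth=16)
fd, temp_path = tempfile.mkstemp(suffix='.wav')
tfm.build("input.mp3", temp_path)  # write the normalized copy
# ... read temp_path with any audio reader here ...
os.close(fd)
os.remove(temp_path)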
Example No. 16
def speedup(aud_seg: AudioSegment, speed: float, **kwargs):
    """Speed up (or slow down) audio segment
    
    Args:
        aud_seg: audio segment to alter
        speed: new playback speed. Should be thought of as a
            percentage. For example, if we want to speed up
            aud_seg by 20%, we pass in 1.2. To slow it down
            to 80%, pass in 0.8
    """
    tfm = Transformer()
    tfm.tempo(speed)
    # Unfortunately, with our current libraries there is no obvious way to
    # make this faster: sox requires an input file and an output file to
    # perform the tempo shift.
    temp_in_file = NamedTemporaryFile(suffix='.wav')
    aud_seg.export(temp_in_file, format='wav')
    temp_out_file = NamedTemporaryFile(suffix='.wav')
    tfm.build(temp_in_file.name, temp_out_file.name)

    return AudioSegment.from_file(temp_out_file.name, format='wav')
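
A usage sketch mirroring the docstring (placeholder file names; sox's tempo effect changes speed without changing pitch):

from pydub import AudioSegment

seg = AudioSegment.from_file("clip.wav")
faster = speedup(seg, 1.2)  # 20% faster, same pitch
faster.export("clip_fast.wav", format="wav")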
Example No. 17
def main():
    fileName = "audio"
    sx = Transformer()
    proxyPool = scraper()
    prefs = getProfile(proxyPool)
    urlAddr, inputs = getInputs()
    browser = automatePage(fireFoxPath=FIREFOX_PATH,
                           prefs=prefs,
                           address=urlAddr,
                           inputList=inputs)

    ##########################
    ### Convert Audio File ###
    ##########################
    print "Converting Audio File"
    sx.build(fileName + ".mp3", fileName + ".wav")

    answer = getAnswer(fileName)

    submitAnswer(browser, answer)
Example No. 18
def _maybe_convert_wav_dataset(extracted_dir, data_set):
    # Create source dir
    source_dir = path.join(extracted_dir, data_set, "sph")

    # Create target dir
    target_dir = path.join(extracted_dir, data_set, "wav")

    # Conditionally convert sph files to wav files
    if not gfile.Exists(target_dir):
        # Create target_dir
        makedirs(target_dir)

        # Loop over sph files in source_dir and convert each to wav
        for sph_file in glob(path.join(source_dir, "*.sph")):
            transformer = Transformer()
            wav_filename = path.splitext(path.basename(sph_file))[0] + ".wav"
            wav_file = path.join(target_dir, wav_filename)
            transformer.build(sph_file, wav_file)
            remove(sph_file)

        # Remove source_dir
        rmdir(source_dir)
Example No. 21
def _processSamples(sample_list):

    for sample in sample_list:

        sample_new_name = _renameSample(sample)
        _out = join(out_path, sample_new_name)
        processed_samples.append(_out)
        _in = sample

        # Sox processing using Transform instance
        tfm = Transformer()
        tfm.convert(samplerate=44100, n_channels=2, bitdepth=16)

        if NORMALIZE:
            tfm.norm(db_level=-3)
        if SILENCE:
            tfm.silence(location=-1,
                        silence_threshold=0.05,
                        min_silence_duration=0.1)
        if PADDING:
            tfm.pad(0, PADDING)

        tfm.build(_in, _out)
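
The same processing chain outside the loop, as a minimal sketch; the 0.5 s padding stands in for the module's PADDING constant and the file names are placeholders:

from sox import Transformer

tfm = Transformer()
tfm.convert(samplerate=44100, n_channels=2, bitdepth=16)
tfm.norm(db_level=-3)              # peak-normalize to -3 dB
tfm.silence(location=-1,           # strip trailing silence
            silence_threshold=0.05,
            min_silence_duration=0.1)
tfm.pad(0, 0.5)                    # append 0.5 s of silence (assumed value)
tfm.build("in.wav", "out.wav")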
Example No. 22
def convert_audio_and_split_transcript(input_dir, source_name, target_name,
                                       output_dir, output_file):
    """Convert FLAC to WAV and split the transcript.

  For audio files, convert the format from FLAC to WAV using the
  sox.Transformer library.
  For transcripts, each line contains the sequence id and the corresponding
  transcript (separated by space):
  Input data format: seq-id transcript_of_seq-id
  For example:
   1-2-0 transcript_of_1-2-0.flac
   1-2-1 transcript_of_1-2-1.flac
   ...

  Each sequence id has a corresponding .flac file.
  Parse the transcript file and generate a new csv file which has three columns:
  "wav_filename": the absolute path to a wav file.
  "wav_filesize": the size of the corresponding wav file.
  "transcript": the transcript for this audio segement.

  Args:
    input_dir: the directory which holds the input dataset.
    source_name: the name of the specified dataset. e.g. test-clean
    target_name: the directory name for the newly generated audio files.
                 e.g. test-clean-wav
    output_dir: the directory to place the newly generated csv files.
    output_file: the name of the newly generated csv file. e.g. test-clean.csv
  """

    tf.logging.info("Preprocessing audio and transcript for %s" % source_name)
    source_dir = os.path.join(input_dir, source_name)
    target_dir = os.path.join(input_dir, target_name)

    if not tf.gfile.Exists(target_dir):
        tf.gfile.MakeDirs(target_dir)

    files = []
    tfm = Transformer()
    # Convert all FLAC files to WAV format. At the same time, generate the
    # csv file.
    for root, _, filenames in tf.gfile.Walk(source_dir):
        for filename in fnmatch.filter(filenames, "*.trans.txt"):
            trans_file = os.path.join(root, filename)
            with codecs.open(trans_file, "r", "utf-8") as fin:
                for line in fin:
                    seqid, transcript = line.split(" ", 1)
                    # We do an encode-decode round trip here because encode()
                    # returns a bytes object and we need a string.
                    transcript = unicodedata.normalize(
                        "NFKD", transcript).encode("ascii", "ignore").decode(
                            "ascii", "ignore").strip().lower()

                    # Convert FLAC to WAV.
                    flac_file = os.path.join(root, seqid + ".flac")
                    wav_file = os.path.join(target_dir, seqid + ".wav")
                    if not tf.gfile.Exists(wav_file):
                        tfm.build(flac_file, wav_file)
                    wav_filesize = os.path.getsize(wav_file)

                    files.append(
                        (os.path.abspath(wav_file), wav_filesize, transcript))

    # Write to CSV file which contains three columns:
    # "wav_filename", "wav_filesize", "transcript".
    csv_file_path = os.path.join(output_dir, output_file)
    df = pandas.DataFrame(
        data=files, columns=["wav_filename", "wav_filesize", "transcript"])
    df.to_csv(csv_file_path, index=False, sep="\t")
    tf.logging.info("Successfully generated csv file {}".format(csv_file_path))
Example No. 23
def convert_audio_and_split_transcript(directory, subset, output_dir):
    """Convert SPH to WAV and split the transcript.
    Args:
        directory: the directory which holds the input dataset.
        subset: the name of the specified dataset. supports train 
                (switchboard+fisher), switchboard, fisher, hub500 and rt03s.
        output_dir: the directory to place the newly generated csv files.
    """
    logging.info("Processing audio and transcript for %s" % subset)
    gfile = tf.compat.v1.gfile
    sph2pip = os.path.join(os.path.dirname(__file__), "../utils/sph2pipe")

    swd_audio_trans_dir = [os.path.join(directory, "LDC97S62")]
    fisher_audio_dirs = [
        os.path.join(directory, "LDC2004S13"),
        os.path.join(directory, "LDC2005S13"),
    ]
    fisher_trans_dirs = [
        os.path.join(directory, "LDC2004T19"),
        os.path.join(directory, "LDC2005T19"),
    ]
    hub_audio_dir = [os.path.join(directory, "LDC2002S09")]
    hub_trans_dir = [os.path.join(directory, "LDC2002T43")]
    rts_audio_trans_dir = [os.path.join(directory, "LDC2007S10")]

    if subset == "train":
        # Combination of switchboard corpus and fisher corpus.
        audio_dir = swd_audio_trans_dir + fisher_audio_dirs
        trans_dir = swd_audio_trans_dir + fisher_trans_dirs
    elif subset == "switchboard":
        audio_dir = swd_audio_trans_dir
        trans_dir = swd_audio_trans_dir
    elif subset == "fisher":
        audio_dir = fisher_audio_dirs
        trans_dir = fisher_trans_dirs
    elif subset == "hub500":
        audio_dir = hub_audio_dir
        trans_dir = hub_trans_dir
    elif subset == "rt03s":
        audio_dir = rts_audio_trans_dir
        trans_dir = rts_audio_trans_dir
    else:
        raise ValueError(subset + " is not in switchboard_fisher")

    subset_dir = os.path.join(directory, subset)
    if not gfile.Exists(subset_dir):
        gfile.MakeDirs(subset_dir)
    output_wav_dir = os.path.join(directory, subset + "/wav")
    if not gfile.Exists(output_wav_dir):
        gfile.MakeDirs(output_wav_dir)
    tmp_dir = os.path.join(directory, "tmp")
    if not gfile.Exists(tmp_dir):
        gfile.MakeDirs(tmp_dir)

    # Build SPH dict.
    files = []
    sph_files_dict = {}
    for sub_audio_dir in audio_dir:
        for root, _, filenames in gfile.Walk(sub_audio_dir):
            for filename in fnmatch.filter(filenames, "*.[Ss][Pp][Hh]"):
                sph_key = os.path.splitext(filename)[0]
                sph_file = os.path.join(root, filename)
                sph_files_dict[sph_key] = sph_file

    with TemporaryDirectory(dir=tmp_dir) as output_tmp_wav_dir:
        for sub_trans_dir in trans_dir:
            if sub_trans_dir in swd_audio_trans_dir:
                fnmatch_pat = "*-trans.text"
                split_and_norm_func = split_line_and_norm_swd
            elif sub_trans_dir in fisher_trans_dirs:
                fnmatch_pat = "*.[Tt][Xx][Tt]"
                split_and_norm_func = split_line_and_norm_fisher
            elif sub_trans_dir in hub_trans_dir:
                fnmatch_pat = "hub5e00.english.000405.stm"
                split_and_norm_func = split_line_and_norm_hub_rts
            else:
                fnmatch_pat = "*.stm"
                split_and_norm_func = split_line_and_norm_hub_rts

            for root, _, filenames in gfile.Walk(sub_trans_dir):
                for filename in fnmatch.filter(filenames, fnmatch_pat):
                    trans_file = os.path.join(root, filename)
                    # Skip documentation and non-English transcript directories.
                    if any(ele in root for ele in (
                            "doc",
                            "DOC",
                            "mandarin",
                            "arabic",
                            "concatenated",
                            "bnews",
                    )):
                        continue
                    with codecs.open(trans_file, "r", "utf-8") as fin:
                        for line in fin:
                            line = line.strip()
                            (
                                sph_key,
                                speaker,
                                time_start,
                                time_end,
                                norm_trans,
                            ) = split_and_norm_func(line, filename)

                            # Too short, skip the wave file
                            if time_end - time_start <= 0.1:
                                continue
                            if norm_trans == "":
                                continue
                            if speaker == "A":
                                channel = 1
                            else:
                                channel = 2

                            # Convert SPH to split WAV.
                            if sph_key not in sph_files_dict:
                                print(sph_key + " not found, please check.")
                                continue
                            sph_file = sph_files_dict[sph_key]
                            wav_file = os.path.join(
                                output_tmp_wav_dir,
                                sph_key + "." + speaker + ".wav")
                            if not gfile.Exists(sph_file):
                                raise ValueError(
                                    "sph file {} does not exist".format(
                                        sph_file))

                            sub_wav_filename = "{0}-{1}-{2:06d}-{3:06d}".format(
                                sph_key,
                                speaker,
                                round(time_start * 100),
                                round(time_end * 100),
                            )
                            sub_wav_file = os.path.join(
                                output_wav_dir, sub_wav_filename + ".wav")

                            if not gfile.Exists(sub_wav_file):
                                if not gfile.Exists(wav_file):
                                    sph2pipe_cmd = (sph2pip +
                                                    " -f wav -c {} -p ".format(
                                                        str(channel)) +
                                                    sph_file + " " + wav_file)
                                    os.system(sph2pipe_cmd)
                                tfm = Transformer()
                                tfm.trim(time_start, time_end)
                                tfm.build(wav_file, sub_wav_file)

                            # wav_filesize = os.path.getsize(sub_wav_file)
                            wav_length = get_wave_file_length(sub_wav_file)
                            speaker_name = sph_key + "-" + speaker
                            files.append(
                                (os.path.abspath(sub_wav_file), wav_length,
                                 norm_trans, speaker_name))

    # Write to CSV file which contains four columns:
    # "wav_filename", "wav_length_ms", "transcript", "speaker".
    out_csv_file = os.path.join(output_dir, subset + ".csv")
    df = pandas.DataFrame(
        data=files,
        columns=["wav_filename", "wav_length_ms", "transcript", "speaker"])
    df.to_csv(out_csv_file, index=False, sep="\t")
    logging.info("Successfully generated csv file {}".format(out_csv_file))
Example No. 24
def _maybe_convert_wav(mp3_filename, wav_filename):
    if not path.exists(wav_filename):
        transformer = Transformer()
        transformer.convert(samplerate=SAMPLE_RATE)
        transformer.build(mp3_filename, wav_filename)
Example No. 25
def convert_sr_channel(audio):
	"""Convert an audio file to 16 kHz mono WAV."""
	# Output path derived from the input (an assumption; the original
	# referenced the undefined names original_audio_file and wav_file).
	wav_file = os.path.splitext(audio)[0] + '.wav'
	transformer = Transformer()
	transformer.convert(samplerate=16000, n_channels=1)
	transformer.build(audio, wav_file)
Example No. 26
def convert_audio_and_split_transcript(input_dir, source_name, target_name,
                                       output_dir, output_file):
  """Convert FLAC to WAV and split the transcript.

  For audio files, convert the format from FLAC to WAV using the
  sox.Transformer library.
  For transcripts, each line contains the sequence id and the corresponding
  transcript (separated by space):
  Input data format: seq-id transcript_of_seq-id
  For example:
   1-2-0 transcript_of_1-2-0.flac
   1-2-1 transcript_of_1-2-1.flac
   ...

  Each sequence id has a corresponding .flac file.
  Parse the transcript file and generate a new csv file which has three columns:
  "wav_filename": the absolute path to a wav file.
  "wav_filesize": the size of the corresponding wav file.
  "transcript": the transcript for this audio segement.

  Args:
    input_dir: the directory which holds the input dataset.
    source_name: the name of the specified dataset. e.g. test-clean
    target_name: the directory name for the newly generated audio files.
                 e.g. test-clean-wav
    output_dir: the directory to place the newly generated csv files.
    output_file: the name of the newly generated csv file. e.g. test-clean.csv
  """

  tf.logging.info("Preprocessing audio and transcript for %s" % source_name)
  source_dir = os.path.join(input_dir, source_name)
  target_dir = os.path.join(input_dir, target_name)

  if not tf.gfile.Exists(target_dir):
    tf.gfile.MakeDirs(target_dir)

  files = []
  tfm = Transformer()
  # Convert all FLAC files to WAV format. At the same time, generate the
  # csv file.
  for root, _, filenames in tf.gfile.Walk(source_dir):
    for filename in fnmatch.filter(filenames, "*.trans.txt"):
      trans_file = os.path.join(root, filename)
      with codecs.open(trans_file, "r", "utf-8") as fin:
        for line in fin:
          seqid, transcript = line.split(" ", 1)
          # We do an encode-decode round trip here because encode()
          # returns a bytes object and we need a string.
          transcript = unicodedata.normalize("NFKD", transcript).encode(
              "ascii", "ignore").decode("ascii", "ignore").strip().lower()

          # Convert FLAC to WAV.
          flac_file = os.path.join(root, seqid + ".flac")
          wav_file = os.path.join(target_dir, seqid + ".wav")
          if not tf.gfile.Exists(wav_file):
            tfm.build(flac_file, wav_file)
          wav_filesize = os.path.getsize(wav_file)

          files.append((os.path.abspath(wav_file), wav_filesize, transcript))

  # Write to CSV file which contains three columns:
  # "wav_filename", "wav_filesize", "transcript".
  csv_file_path = os.path.join(output_dir, output_file)
  df = pandas.DataFrame(
      data=files, columns=["wav_filename", "wav_filesize", "transcript"])
  df.to_csv(csv_file_path, index=False, sep="\t")
  tf.logging.info("Successfully generated csv file {}".format(csv_file_path))
Example No. 27
#!/usr/bin/python3

import os
import tqdm
from sox import Transformer

SAMPLE_RATE = 16000
remove_flac = False
path = '/home/dsmolen/agh/LibriSpeech/'

i = 0
tq = tqdm.tqdm(os.walk(path, topdown=False))
for root, dirs, files in tq:
    for name in files:
        if name.endswith('.flac'):
            tq.set_postfix(converted=i)
            i += 1
            name = name[:-5]
            flac_file = os.path.join(root, name + ".flac")
            wav_file = os.path.join(root, name + ".wav")
            if not os.path.exists(wav_file):
                tfm = Transformer()
                tfm.set_output_format(rate=SAMPLE_RATE)
                tfm.build(flac_file, wav_file)
            if remove_flac:
                os.remove(flac_file)
Example No. 28
def convert_audio_and_split_transcript(dataset_dir, subset, out_csv_file,
                                       output_dir):
    """Convert SPH to WAV and split the transcript.

  Args:
    dataset_dir  : the directory which holds the input dataset.
    subset       : the name of the specified dataset. e.g. dev.
    out_csv_file : the resulting output csv file.
    output_dir   : Athena working directory.
  """
    gfile = tf.compat.v1.gfile
    sph2pip = os.path.join(os.path.dirname(__file__),
                           "../../../../tools/sph2pipe/sph2pipe")
    text_featurizer = TextFeaturizer()

    logging.info("Processing audio and transcript for %s" % subset)
    audio_dir = os.path.join(dataset_dir, "LDC2005S15/")
    trans_dir = os.path.join(dataset_dir, "LDC2005T32/")

    output_wav_dir = os.path.join(output_dir, subset + "/wav")
    if not gfile.Exists(output_wav_dir):
        gfile.MakeDirs(output_wav_dir)

    files = []
    char_dict = {}

    sph_files_dict = {}
    for root, _, filenames in gfile.Walk(audio_dir):
        for filename in fnmatch.filter(filenames, "*.sph"):
            if subset in root:
                sph_key = os.path.splitext(filename)[0]
                sph_file = os.path.join(root, filename)
                sph_files_dict[sph_key] = sph_file

    # Convert all SPH file into WAV format.
    # Generate the JSON file and char dict file.
    with TemporaryDirectory(dir=output_dir) as output_tmp_wav_dir:
        for root, _, filenames in gfile.Walk(trans_dir):
            if not re.match('.*/' + subset + '.*', root):
                continue
            for filename in fnmatch.filter(filenames, "*.txt"):
                trans_file = os.path.join(root, filename)
                sph_key = ""
                speaker_A = ""
                speaker_B = ""
                with codecs.open(trans_file, "r", "gb18030") as fin:
                    for line in fin:
                        line = line.strip()
                        if len(line.split(" ")) <= 1:
                            continue
                        if len(line.split(" ")) == 2:
                            sph_key = line.split(" ")[1]
                            speaker_A = sph_key.split("_")[2]
                            speaker_B = sph_key.split("_")[3]
                            continue

                        time_start, time_end, speaker, transcript = line.split(
                            " ", 3)
                        time_start = float(time_start)
                        time_end = float(time_end)
                        # too short, skip the wave file
                        if time_end - time_start <= 0.1:
                            continue

                        speaker = speaker[0]  # remove ':' in 'A:'
                        if speaker == "A":
                            channel = 1
                            speaker_id = speaker_A
                        else:
                            channel = 2
                            speaker_id = speaker_B

                        # Convert SPH to split WAV.
                        sph_file = sph_files_dict[sph_key]
                        wav_file = os.path.join(
                            output_tmp_wav_dir,
                            sph_key + "." + speaker[0] + ".wav")
                        if not gfile.Exists(sph_file):
                            raise ValueError(
                                "sph file {} does not exist".format(
                                    sph_file))
                        if not gfile.Exists(wav_file):
                            sph2pipe_cmd = (
                                sph2pip +
                                " -f wav -c {} -p ".format(str(channel)) +
                                sph_file + " " + wav_file)
                            os.system(sph2pipe_cmd)

                        sub_wav_filename = "{0}-{1}-{2:06d}-{3:06d}".format(
                            sph_key, speaker, int(time_start * 100),
                            int(time_end * 100))
                        sub_wav_file = os.path.join(output_wav_dir,
                                                    sub_wav_filename + ".wav")
                        if not gfile.Exists(sub_wav_file):
                            tfm = Transformer()
                            tfm.trim(time_start, time_end)
                            tfm.build(wav_file, sub_wav_file)

                        wav_length = get_wave_file_length(sub_wav_file)

                        transcript = normalize_hkust_trans(transcript)
                        transcript = text_featurizer.delete_punct(transcript)

                        if len(transcript) > 0:
                            for char in transcript:
                                # Count character frequencies.
                                char_dict[char] = char_dict.get(char, 0) + 1
                            files.append((
                                os.path.abspath(sub_wav_file),
                                wav_length,
                                transcript,
                                speaker_id,
                            ))

    # Write to CSV file which contains four columns:
    # "wav_filename", "wav_length_ms", "transcript", "speaker".
    df = pandas.DataFrame(
        data=files,
        columns=["wav_filename", "wav_length_ms", "transcript", "speaker"])
    df.to_csv(out_csv_file, index=False, sep="\t")
    logging.info("Successfully generated csv file {}".format(out_csv_file))
Example No. 29
if not os.path.exists(target_dir):
    os.makedirs(target_dir)

files = []
SAMPLE_RATE = 16000
for subset in selection:
    for item in subset['set_items']:
        transcript = item['sentence']
        transcript = unicodedata.normalize('NFKD', transcript)  \
                                .encode('ascii', 'ignore')      \
                                .decode('ascii', 'ignore')

        transcript = transcript.lower().strip()

        # Convert corresponding MP3 to a WAV
        mp3_file =  source_dir + '/' + item['path']
        wav_file = target_dir + '/' + item['path'].replace('.mp3', '.wav')

        if not os.path.exists(wav_file):
            tfm = Transformer()
            tfm.set_output_format(rate=SAMPLE_RATE)
            tfm.build(mp3_file, wav_file)
        wav_filesize = os.path.getsize(wav_file)

        files.append((os.path.abspath(wav_file), wav_filesize, transcript))

data_info = pandas.DataFrame(data=files, columns=['wav_filename', 'wav_filesize', 'transcript'])
data_info.to_csv(data_dir + '/common-voice-pertubed_sets.csv', index=False)
Example No. 30
class Audio:
    def __init__(self, fname):
        '''
            opens the .wav audio file given by fname
        '''
        self.wav_fname = fname
        try:
            with wave.open(fname, mode="r") as wav:
                (self.nchannels, self.sampwidth, self.framerate,
                 self.nframes, self.comptype, self.compname) = wav.getparams()
                self.duration = self.nframes / self.framerate
                self.peak = 256**self.sampwidth / 2
                self.content = wav.readframes(self.nframes)
                # np.fromstring is deprecated; frombuffer reads the same bytes.
                self.samples = np.frombuffer(self.content,
                                             dtype=types[self.sampwidth])
                self.tsm = Transformer()
                return
        except FileNotFoundError as err:
            print(err)
            print("Try the Audio.reload_file function")
            raise

    def reload_file(self, fname=None):
        '''
            recalls the init function
        '''
        if fname:
            self.__init__(fname)
        else:
            self.__init__(self.wav_fname)
        return

    def prepare_file(self, fname=None, trim=True, nnoise=True, reload=True):
        '''
            trims silence and reduces noise in the file

            trimming automatically removes the silent regions from the
            beginning/end, so trim is usually used together with nnoise

            reload - determines whether to reload the audio file afterwards
            fname - name of the output (changed) file
        '''
        if nnoise:
            # Slice off the ".wav" suffix; str.rstrip would strip a character
            # set, not the suffix.
            base = self.wav_fname[:-len(".wav")]
            self.tsm.noiseprof(self.wav_fname, base + '_noiseprof')
            self.tsm.noisered(base + '_noiseprof')
        if trim:
            self.tsm.silence(1)
            self.tsm.silence(-1)
        if fname:
            outname = fname
        else:
            outname = self.wav_fname[:-len(".wav")] + "_nnoise_trim.wav"
        self.tsm.build(self.wav_fname, outname)
        if reload:
            self.reload_file(fname=outname)

    def find_patt(self,
                  patt,
                  k=100,
                  use_fastdtw=False,
                  with_dwt=True) -> (float, float):
        '''
            patt - pattern, Audio object
            k - downsampling coefficient, integer
            use_fastdtw - determines whether to use dtw for distance or just euclidean distance
            with_dwt - use single level Discrete Wavelet Transform for data

            return: time of found pattern in seconds and time of search
        '''

        evaltime = time()

        # prepare samples
        data = self.samples
        pattern = patt.samples
        data = data[0::k]
        pattern = pattern[0::k]
        data = (data - data.mean()) / data.std()
        pattern = (pattern - pattern.mean()) / pattern.std()
        if with_dwt:
            data, *_ = dwt(data, 'db1')
            pattern, *_ = dwt(pattern, 'db1')

        distances = []
        maxcount = len(data) - len(pattern) + 1
        for i in range(maxcount):
            if use_fastdtw:
                distances.append(
                    fastdtw(data[i:i + len(pattern)], pattern,
                            dist=euclidean)[0])
            else:
                distances.append(euclidean(data[i:i + len(pattern)], pattern))

        evaltime = time() - evaltime
        res = (distances.index(min(distances)) / len(data)) * self.duration

        return res, evaltime

    def draw_waveform_matplotlib(self, fname=None, save=False):
        '''
            better use plotly, does the same
        '''
        import matplotlib.pyplot as plt
        import matplotlib.ticker as ticker

        w, h = 800, 300
        # k = self.nframes//w//32
        DPI = 72

        plt.figure(1, figsize=(float(w) / DPI, float(h) / DPI), dpi=DPI)
        plt.subplots_adjust(wspace=0, hspace=0)

        for n in range(self.nchannels):
            channel = self.samples[n::self.nchannels]

            # channel = channel[0::k]
            # if self.nchannels == 1:
            #     channel = channel - self.peak

            axes = plt.subplot(2, 1, n + 1)
            axes.plot(channel, "g")
            axes.yaxis.set_major_formatter(
                ticker.FuncFormatter(
                    lambda x, pos=None: format_db(x, self, pos=pos)))
            plt.grid(True, color="w")
            axes.xaxis.set_major_formatter(ticker.NullFormatter())

        axes.xaxis.set_major_formatter(
            ticker.FuncFormatter(
                lambda x, pos=None: format_time(x, self, pos=pos)))
        if save:
            if fname is None:
                plt.savefig(self.wav_fname[:-len(".wav")] + "_waveform",
                            dpi=DPI)
            else:
                plt.savefig(fname, dpi=DPI)
        plt.show()

    def draw_waveform_plotly(self,
                             fname=None,
                             save=False,
                             select_interval=None):
        '''
            fname - name for .html file
            save - if True saves to png with the fname
            select_interval - (float, float) - time for colouring the data
        '''

        import plotly.offline as py
        import plotly.graph_objs as go

        time = [(x / float(self.nframes) * self.duration)
                for x in range(self.nframes)]
        # samples = [20 * math.log10(abs(x-peak) / float(peak)) if x != 0 else "-inf" for x in samples]
        graphs = []
        if select_interval:
            start = round((select_interval[0] / self.duration) * self.nframes)
            end = round((select_interval[1] / self.duration) * self.nframes)

            before_start = go.Scatter(x=time[:start],
                                      y=self.samples[:start],
                                      mode='lines',
                                      name='sample',
                                      line={"color": "#0000ff"})
            graphs.append(before_start)
            after_start = go.Scatter(x=time[start:end],
                                     y=self.samples[start:end],
                                     mode='lines',
                                     name='pattern',
                                     line={"color": "#ff0000"})
            graphs.append(after_start)
            after_end = go.Scatter(x=time[end:],
                                   y=self.samples[end:],
                                   mode='lines',
                                   name='sample',
                                   line={"color": "#0000ff"})
            graphs.append(after_end)
        else:
            trace = go.Scatter(x=time,
                               y=self.samples,
                               mode='lines',
                               name='sample')
            graphs.append(trace)

        layout = dict(
            title=f"{self.wav_fname[self.wav_fname.rfind('/'):]} waveform",
            #   yaxis = dict(zeroline = False),
            xaxis=dict(title="Time in seconds"))
        fig = dict(data=graphs, layout=layout)
        if fname is None:
            base = self.wav_fname[:-len(".wav")]
            if save:
                py.plot(fig,
                        filename=base + "_waveform.html",
                        image='png')
            else:
                py.plot(fig,
                        filename=base + "_waveform.html")
        else:
            if save:
                py.plot(fig, filename=fname, image='png')
            else:
                py.plot(fig, filename=fname)
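
A usage sketch for the class (file names are placeholders):

sample = Audio("recording.wav")
pattern = Audio("pattern.wav")
sample.prepare_file(trim=True, nnoise=True)  # reduce noise and trim silence
offset_sec, search_sec = sample.find_patt(pattern, k=100, use_fastdtw=True)
print(f"pattern found near {offset_sec:.2f}s (search took {search_sec:.2f}s)")
sample.draw_waveform_plotly(select_interval=(offset_sec, offset_sec + pattern.duration))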