def generate_wavs(data_dir):
    """Convert every .mp3 in *data_dir* into a resampled .wav in data_dir/wavs/.

    Pipeline: mp3 -> wav via pydub, resample to the module-level
    ``sample_rate`` via sox (writing "<name>k16.wav" files), delete the
    un-resampled wavs, then strip the temporary "k16" suffix.
    """
    print("hello")
    pbar = ProgressBar()
    for mp3_file in pbar(glob(path.join(data_dir, '*.mp3'))):
        sound = pydub.AudioSegment.from_mp3(mp3_file)
        # Derive the output name from the file's basename rather than the
        # original fixed [-10:-4] slice, which silently mangles any filename
        # that is not exactly six characters long.
        filename = path.splitext(path.basename(mp3_file))[0]
        new_file = path.splitext(data_dir)[0] + "/wavs/" + filename + ".wav"
        sound.export(new_file, format="wav")
    pbar = ProgressBar()
    data_dir = data_dir + '/wavs/'
    # change audio file to 16k sample rate
    for wav_file in pbar(glob(path.join(data_dir, '*.wav'))):
        new_file = path.splitext(wav_file)[0] + "k16.wav"
        transformer = Transformer()
        # NOTE(review): `sample_rate` is presumably a module-level setting —
        # it is not defined in this function.
        transformer.convert(samplerate=sample_rate)
        transformer.build(wav_file, new_file)
    pbar = ProgressBar()
    # remove old files
    for item in pbar(glob(path.join(data_dir, '*.wav'))):
        if not item.endswith("k16.wav"):
            os.remove(item)
    pbar = ProgressBar()
    # rename files to remove k16
    for item in pbar(glob(path.join(data_dir, '*.wav'))):
        os.rename(item, item.replace('k16', ''))
    print("end")
def extract_audio():
    """Extract mono 44.1 kHz WAV audio from every video under ``output_dir``,
    compensating for the container's audio/video start-time offset.

    For each video: remux to a temporary MKV, read the first video and audio
    timestamps, extract audio with ffmpeg, then pad (audio starts late) or
    trim (audio starts early) so the tracks line up.  All temporary files
    are removed afterwards.
    """
    all_videos = find_all_video_files(output_dir)
    for video in tqdm(all_videos):
        work_dir = os.path.dirname(video)
        mkvfile = os.path.join(work_dir, 'temp.mkv')
        # Pass argument lists instead of shell-concatenated strings so paths
        # containing spaces or shell metacharacters cannot break (or inject
        # into) the command line.
        subprocess.call(['mkvmerge', '-o', mkvfile, video])
        video_ts_file = os.path.join(work_dir, 'video_ts.txt')
        audio_ts_file = os.path.join(work_dir, 'audio_ts.txt')
        subprocess.call(['mkvextract', mkvfile, 'timestamps_v2',
                         '0:' + video_ts_file])
        subprocess.call(['mkvextract', mkvfile, 'timestamps_v2',
                         '1:' + audio_ts_file])
        with open(video_ts_file, 'r') as f:
            f.readline()  # skip header
            video_start = f.readline()
        with open(audio_ts_file, 'r') as f:
            f.readline()  # skip header
            audio_start = f.readline()
        # Positive offset: audio begins after video -> pad with silence;
        # negative: audio begins before video -> trim the head.
        offset_ms = int(audio_start) - int(video_start)
        # extract audio
        audio_tmp = os.path.join(work_dir, 'temp.wav')
        subprocess.call(['ffmpeg', '-i', video, '-ar', '44100', '-ac', '1',
                         '-y', audio_tmp])
        audio_name = os.path.splitext(video)[0] + '.wav'
        tfm = Transformer()
        if offset_ms >= 0:
            tfm.pad(start_duration=offset_ms / 1000)
        else:
            tfm.trim(start_time=-offset_ms / 1000)
        tfm.build(audio_tmp, audio_name)
        os.remove(mkvfile)
        os.remove(audio_tmp)
        os.remove(video_ts_file)
        os.remove(audio_ts_file)
def convert_audio_and_split_transcript(input_dir, source_name, target_name, output_dir, output_file):
    """Convert LibriSpeech-style FLAC files to WAV and emit a transcript CSV.

    Walks ``input_dir/source_name`` for "*.trans.txt" files, converts each
    referenced .flac into a .wav under ``input_dir/target_name``, and writes
    a tab-separated CSV (wav_filename, wav_filesize, transcript) to
    ``output_dir/output_file``.
    """
    print(f"Pre-processing audio and transcript for {source_name}")
    source_dir = os.path.join(input_dir, source_name)
    target_dir = os.path.join(input_dir, target_name)
    if not tf.io.gfile.exists(target_dir):
        tf.io.gfile.makedirs(target_dir)
    files = []
    # A single Transformer instance is reused for every FLAC -> WAV build.
    tfm = Transformer()
    for root, _, filenames in tf.io.gfile.walk(source_dir):
        for filename in fnmatch.filter(filenames, "*.trans.txt"):
            trans_file = os.path.join(root, filename)
            with codecs.open(trans_file, "r", "utf-8") as fin:
                for line in fin:
                    # Each line: "<seq_id> <free-form transcript>".
                    seq_id, transcript = line.split(" ", 1)
                    # The encode/decode round trip strips non-ASCII characters.
                    transcript = unicodedata.normalize(
                        "NFKD", transcript).encode("ascii", "ignore").decode(
                            "ascii", "ignore").strip().lower()
                    flac_file = os.path.join(root, seq_id + ".flac")
                    wav_file = os.path.join(target_dir, seq_id + ".wav")
                    if not tf.io.gfile.exists(wav_file):
                        tfm.build(flac_file, wav_file)
                    wav_filesize = os.path.getsize(wav_file)
                    files.append(
                        (os.path.abspath(wav_file), wav_filesize, transcript))
    csv_file_path = os.path.join(output_dir, output_file)
    df = pandas.DataFrame(
        data=files, columns=["wav_filename", "wav_filesize", "transcript"])
    df.to_csv(csv_file_path, index=False, sep="\t")
    print(f"Successfully generated csv file {csv_file_path}")
def convert_audio_and_split_transcript(input_dir, source_name, target_name, output_dir, output_file):
    """Convert FLAC to WAV and split the transcript.

    Args:
        input_dir: the directory which holds the input dataset.
        source_name: the name of the specified dataset. e.g. test-clean
        target_name: the directory name for the newly generated audio files.
            e.g. test-clean-wav
        output_dir: the directory to place the newly generated csv files.
        output_file: the name of the newly generated csv file. e.g. test-clean.csv
    """
    logging.info("Processing audio and transcript for %s" % source_name)
    source_dir = os.path.join(input_dir, source_name)
    target_dir = os.path.join(input_dir, target_name)
    if not gfile.Exists(target_dir):
        gfile.MakeDirs(target_dir)
    files = []
    # A single Transformer instance is reused for every FLAC -> WAV build.
    tfm = Transformer()
    # Convert all FLAC file into WAV format. At the same time, generate the csv
    for root, _, filenames in gfile.Walk(source_dir):
        for filename in fnmatch.filter(filenames, "*.trans.txt"):
            trans_file = os.path.join(root, filename)
            with codecs.open(trans_file, "r", "utf-8") as fin:
                for line in fin:
                    # Each line: "<seqid> <free-form transcript>".
                    seqid, transcript = line.split(" ", 1)
                    # We do a encode-decode transformation here because the output type
                    # of encode is a bytes object, we need convert it to string.
                    transcript = (
                        unicodedata.normalize("NFKD", transcript)
                        .encode("ascii", "ignore")
                        .decode("ascii", "ignore")
                        .strip()
                        .lower()
                    )
                    # Convert FLAC to WAV.
                    flac_file = os.path.join(root, seqid + ".flac")
                    wav_file = os.path.join(target_dir, seqid + ".wav")
                    if not gfile.Exists(wav_file):
                        tfm.build(flac_file, wav_file)
                    # Unlike the sibling variants, this one records the audio
                    # LENGTH (ms via get_wave_file_length), not the byte size.
                    # wav_filesize = os.path.getsize(wav_file)
                    wav_length = get_wave_file_length(wav_file)
                    files.append((os.path.abspath(wav_file), wav_length, transcript))
    # Write to CSV file which contains three columns:
    # "wav_filename", "wav_length_ms", "transcript".
    csv_file_path = os.path.join(output_dir, output_file)
    df = pandas.DataFrame(
        data=files, columns=["wav_filename", "wav_length_ms", "transcript"]
    )
    df.to_csv(csv_file_path, index=False, sep="\t")
    logging.info("Successfully generated csv file {}".format(csv_file_path))
def cut(input_path, output_file, metadata):
    """Cut ``input_path`` into the segments described by ``metadata``, then
    filter, compress and EQ the result into ``output_file``.

    ``metadata['segments']`` entries are converted to second offsets by
    ``segment_seconds``.  Each segment is mono-ized, normalized to -24 dB,
    trimmed and faded, then all segments are concatenated before the final
    mastering pass (high/low-pass, compander, two EQ bands).
    """
    import shutil

    segments = metadata['segments']
    segments = [segment_seconds(segment) for segment in segments]
    with TempFile('.mp3') as temp_file:
        # Open a new temporary file to store audio in between processes
        if segments:
            # Cut audio into segments and create fade in/out.  We need a new
            # temporary file for each audio segment.
            temp_segments = [TempFile('.mp3') for segment in segments]
            try:
                for index, segment in enumerate(segments):
                    sox = Transformer()
                    sox.channels(1)
                    sox.norm(-24)
                    sox.trim(*segment)
                    sox.fade(1, 2, 't')
                    sox.build(input_path, temp_segments[index].path)
                if len(segments) > 1:
                    # Concatenate all the audio segments back together and
                    # output to our main temporary file.
                    Combiner().build(
                        [temp_segment.path for temp_segment in temp_segments],
                        temp_file.path,
                        'concatenate',
                    )
                else:
                    # Only one segment: just copy it.  shutil is portable,
                    # unlike shelling out to `cp` (absent on Windows).
                    shutil.copyfile(temp_segments[0].path, temp_file.path)
            # The original `except Exception as e: raise (e)` only re-raised
            # with a rewritten traceback; letting the exception propagate
            # naturally is equivalent and keeps the original traceback.
            finally:
                # Cleanup temporary segment files even on error
                for temp_segment in temp_segments:
                    temp_segment.close()
        # Second process: filter, compress and EQ the audio in the temporary
        # file and output to output_file.
        sox = Transformer()
        sox.highpass(100)
        sox.lowpass(10000)
        sox.compand(0.005, 0.12, 6, [
            (-90, -90),
            (-70, -55),
            (-50, -35),
            (-32, -32),
            (-24, -24),
            (0, -8),
        ])
        sox.equalizer(3000, 1000, 3)
        sox.equalizer(280, 120, 3)
        sox.build(temp_file.path, output_file)
def _convert_audio_and_split_sentences(extracted_dir, data_set, dest_dir):
    """Split LibriSpeech-style transcription files and convert FLAC to WAV.

    Reads every "*.trans.txt" under ``extracted_dir/data_set`` (lines of the
    form "1-2-0 transcription of 1-2-0.flac"), converts each referenced FLAC
    into a SAMPLE_RATE wav under ``extracted_dir/dest_dir``, and returns a
    DataFrame with columns wav_filename / wav_filesize / transcript.
    """
    src = os.path.join(extracted_dir, data_set)
    dst = os.path.join(extracted_dir, dest_dir)
    if not os.path.exists(dst):
        os.makedirs(dst)

    rows = []
    for folder, _, names in os.walk(src):
        for trans_name in fnmatch.filter(names, "*.trans.txt"):
            trans_path = os.path.join(folder, trans_name)
            with codecs.open(trans_path, "r", "utf-8") as handle:
                for record in handle:
                    # Split on the first space: sequence id, then free text.
                    cut_at = record.find(" ")
                    seq_id = record[:cut_at]
                    text = record[cut_at + 1:]
                    # encode() yields bytes on Python 3; decode back to str,
                    # dropping any non-ASCII characters along the way.
                    text = unicodedata.normalize("NFKD", text)
                    text = text.encode("ascii", "ignore").decode("ascii", "ignore")
                    text = text.lower().strip()

                    # Convert the sibling FLAC into a resampled WAV.
                    source_flac = os.path.join(folder, seq_id + ".flac")
                    target_wav = os.path.join(dst, seq_id + ".wav")
                    if not os.path.exists(target_wav):
                        converter = Transformer()
                        converter.set_output_format(rate=SAMPLE_RATE)
                        converter.build(source_flac, target_wav)
                    size = os.path.getsize(target_wav)
                    rows.append((os.path.abspath(target_wav), size, text))

    return pandas.DataFrame(
        data=rows, columns=["wav_filename", "wav_filesize", "transcript"]
    )
def process(x):
    """Resample one clip to ``args.sample_rate`` and return its metadata.

    ``x`` is a (relative_path, transcript) pair.  Returns
    (wav_path, duration_seconds, lowercased_transcript).
    """
    clip_rel_path, raw_text = x
    stem = os.path.splitext(os.path.basename(clip_rel_path))[0]
    normalized_text = raw_text.lower().strip()
    src = os.path.join(audio_clips_path, clip_rel_path)
    dst = os.path.join(wav_dir, stem + '.wav')
    converter = Transformer()
    converter.rate(samplerate=args.sample_rate)
    converter.build(input_filepath=src, output_filepath=dst)
    clip_duration = sox.file_info.duration(dst)
    return dst, clip_duration, normalized_text
def process(x):
    """Write the uppercased transcript to ``txt_dir`` and resample the clip.

    ``x`` is a (relative_path, transcript) pair; the wav goes to ``wav_dir``
    at ``args.sample_rate``.  Returns nothing.
    """
    clip_rel_path, raw_text = x
    stem = os.path.splitext(os.path.basename(clip_rel_path))[0]
    normalized_text = raw_text.strip().upper()
    with open(os.path.join(txt_dir, stem + '.txt'), 'w') as out:
        out.write(normalized_text)
    src = os.path.join(audio_clips_path, clip_rel_path)
    dst = os.path.join(wav_dir, stem + '.wav')
    converter = Transformer()
    converter.rate(samplerate=args.sample_rate)
    converter.build(input_filepath=src, output_filepath=dst)
def convert(self):
    """Converts the mp3's associated with this instance to wav's

    Return:
        wav_directory (os.path): The directory into which the associated wav's were downloaded
    """
    wav_directory = self._pre_convert()
    for mp3_filename in self.mp3_directory.glob('**/*.mp3'):
        # The wav keeps the mp3's basename and is placed flat in wav_directory.
        wav_filename = path.join(wav_directory, os.path.splitext(os.path.basename(mp3_filename))[0] + ".wav")
        if not path.exists(wav_filename):
            _logger.debug("Converting mp3 file %s to wav file %s" % (mp3_filename, wav_filename))
            # Re-encode to the module-level SAMPLE_RATE / N_CHANNELS / BITDEPTH.
            transformer = Transformer()
            transformer.convert(samplerate=SAMPLE_RATE, n_channels=N_CHANNELS, bitdepth=BITDEPTH)
            transformer.build(str(mp3_filename), str(wav_filename))
        else:
            # Idempotent: an existing wav is never rebuilt.
            _logger.debug("Already converted mp3 file %s to wav file %s" % (mp3_filename, wav_filename))
    return wav_directory
def compressed_wav_to_full(source_dir, target_dir):
    """Re-encode every compressed .wav in source_dir as a signed-integer
    mono .wav in target_dir, resampling unless the configured rate is 8 kHz."""
    assert path.exists(source_dir) is True
    if not path.exists(target_dir):
        makedirs(target_dir)
    for src in glob(path.join(source_dir, "*.wav")):
        # Always force signed PCM and a single channel; only add resampling
        # when the configured rate differs from the native 8 kHz.
        fmt = {'encoding': 'signed-integer', 'channels': 1}
        if hp.callhome_rate != 8000:
            fmt['rate'] = hp.callhome_rate
        tfm = Transformer()
        tfm.set_output_format(**fmt)
        tfm.build(src, path.join(target_dir, path.basename(src)))
def sph_to_wav(source_dir, target_dir):
    """Convert every .sph file in source_dir into a .wav in target_dir,
    re-encoding to signed-integer mono unless the rate is already 16 kHz."""
    assert path.exists(source_dir) is True
    if not path.exists(target_dir):
        makedirs(target_dir)
    for src in glob(path.join(source_dir, "*.sph")):
        tfm = Transformer()
        # At the native 16 kHz no output-format overrides are applied.
        if hp.tedlium_rate != 16000:
            tfm.set_output_format(encoding='signed-integer', channels=1,
                                  rate=hp.tedlium_rate)
        stem = path.splitext(path.basename(src))[0]
        tfm.build(src, path.join(target_dir, stem + ".wav"))
def _maybe_convert_wav(data_dir, extracted_data, converted_data):
    """Convert every FLAC under ``data_dir/extracted_data`` into a WAV in
    ``data_dir/converted_data``, skipping the whole step if the target
    directory already exists.

    WARNING: each source FLAC file is DELETED after its conversion.
    """
    source_dir = os.path.join(data_dir, extracted_data)
    target_dir = os.path.join(data_dir, converted_data)
    # Conditionally convert FLAC files to wav files
    if not gfile.Exists(target_dir):
        # Create target_dir
        os.makedirs(target_dir)
        # Loop over FLAC files in source_dir and convert each to wav
        for root, dirnames, filenames in os.walk(source_dir):
            for filename in fnmatch.filter(filenames, '*.flac'):
                flac_file = os.path.join(root, filename)
                # Output is flattened: only the basename survives, so files
                # from different subdirectories with the same name collide.
                wav_filename = os.path.splitext(os.path.basename(flac_file))[0] + ".wav"
                wav_file = os.path.join(target_dir, wav_filename)
                transformer = Transformer()
                transformer.build(flac_file, wav_file)
                # The source is removed to save disk space.
                os.remove(flac_file)
def pitch_shift(aud_seg: AudioSegment, semi: float, **kwargs):
    """Pitch shift audio sample by semi semitones, without changing the
    speed of the audio segment.

    Arguments:
        aud_seg: audio segment to alter
        semi: Number of semitones to pitch audio

    Returns:
        A new AudioSegment with the pitch shifted.
    """
    # Create a sox transformer
    tfm = Transformer()
    tfm.pitch(semi)
    # Sox requires an input file and an output file to perform the pitch
    # shift, so round-trip through temporary wavs.  Context managers make
    # sure both temp files are closed (and hence deleted) instead of
    # lingering until garbage collection, as the original code did.
    with NamedTemporaryFile(suffix='.wav') as temp_in_file, \
            NamedTemporaryFile(suffix='.wav') as temp_out_file:
        aud_seg.export(temp_in_file, format='wav')
        tfm.build(temp_in_file.name, temp_out_file.name)
        # Read the result back before the files are deleted on close.
        return AudioSegment.from_file(temp_out_file.name, format='wav')
def process(x):
    """Convert one clip to wav (rate and channels taken from ``args``).

    ``x`` is a (relative_path, transcript) pair.  Empty source files are
    skipped with a warning and yield ('', '', '').  Otherwise returns
    (wav_path, duration_seconds, lowercased_transcript); an existing wav
    is never rebuilt.
    """
    clip_rel_path, raw_text = x
    stem = os.path.splitext(os.path.basename(clip_rel_path))[0]
    normalized_text = raw_text.lower().strip()
    audio_path = os.path.join(audio_clips_path, clip_rel_path)
    if os.path.getsize(audio_path) == 0:
        logging.warning(f'Skipping empty audio file {audio_path}')
        return '', '', ''
    dst = os.path.join(wav_dir, stem + '.wav')
    if not os.path.exists(dst):
        converter = Transformer()
        converter.rate(samplerate=args.sample_rate)
        converter.channels(n_channels=args.n_channels)
        converter.build(input_filepath=audio_path, output_filepath=dst)
    return dst, sox.file_info.duration(dst), normalized_text
def read(self, audio_metadata):
    """Read an audio file.

    :param audio_metadata: metadata info of an audio
    :return: raw audio data as float32 array and duration in seconds.
    """
    fd = temp_path = None
    try:
        # Convert non-wav inputs to a temporary wav in the target format.
        if not audio_metadata.path.endswith('.wav'):
            original_sample_rate = file_info.sample_rate(audio_metadata.path)
            # Only equal-rate or downsampling conversion is supported.
            assert self._sample_rate <= original_sample_rate
            transformer = Transformer()
            transformer.convert(samplerate=self._sample_rate,
                                n_channels=self._channels,
                                bitdepth=self._bits_per_sample)
            fd, temp_path = tempfile.mkstemp(suffix='.wav')
            transformer.build(audio_metadata.path, temp_path)
        path = temp_path if temp_path else audio_metadata.path
        # Read the audio file.
        with SoundFile(path) as soundfile:
            # make sure the audio properties are as expected.
            assert soundfile.samplerate == self._sample_rate
            assert soundfile.channels == self._channels
            duration_sec = len(soundfile) / self._sample_rate
            pcm = soundfile.read(dtype='float32')
        # Add 0.5 second silence to the end of files containing keyword as in
        # occasionally the user stopped recording right after uttering the
        # keyword. If the detector needs some time after seeing the keyword to
        # make a decision (e.g. endpointing) this is going to artificially
        # increase the miss rates.
        if audio_metadata.is_keyword:
            pcm = np.append(pcm, np.zeros(self._sample_rate // 2))
        return pcm, duration_sec
    finally:
        # Always release the temporary wav; the original only cleaned up on
        # the success path and leaked the fd + file on any exception.
        if temp_path:
            os.close(fd)
            os.remove(temp_path)
def speedup(aud_seg: AudioSegment, speed: float, **kwargs):
    """Speed up (or slow down) audio segment

    Args:
        aud_seg: audio segment to alter
        speed: new playback speed.  Should be thought of as a percentage.
            For example, if we want to speed up aud_seg by 20%, we pass in
            1.2.  To slow it down to 80%, pass in 0.8

    Returns:
        A new AudioSegment at the adjusted tempo (pitch unchanged).
    """
    tfm = Transformer()
    tfm.tempo(speed)
    # Sox requires an input file and an output file to perform the tempo
    # shift, so round-trip through temporary wavs.  Context managers make
    # sure both temp files are closed (and hence deleted) instead of
    # lingering until garbage collection, as the original code did.
    with NamedTemporaryFile(suffix='.wav') as temp_in_file, \
            NamedTemporaryFile(suffix='.wav') as temp_out_file:
        aud_seg.export(temp_in_file, format='wav')
        tfm.build(temp_in_file.name, temp_out_file.name)
        # Read the result back before the files are deleted on close.
        return AudioSegment.from_file(temp_out_file.name, format='wav')
def main():
    # Solve an audio captcha end-to-end: scrape a proxy, drive the target
    # page with Firefox, convert the downloaded mp3 to wav with sox, then
    # submit the recognized answer.
    # NOTE: this is Python 2 code (print statement below).
    fileName = "audio"
    sx = Transformer()
    proxyPool = scraper()
    prefs = getProfile(proxyPool)
    urlAddr, inputs = getInputs()
    browser = automatePage(fireFoxPath=FIREFOX_PATH, prefs=prefs, address=urlAddr, inputList=inputs)
    ##########################
    ### Convert Audio File ###
    ##########################
    print "Converting Audio File"
    # The page is assumed to have saved "audio.mp3" to the working directory.
    sx.build(fileName + ".mp3", fileName + ".wav")
    answer = getAnswer(fileName)
    submitAnswer(browser, answer)
def _maybe_convert_wav_dataset(extracted_dir, data_set):
    """Convert a dataset's "sph" directory into a "wav" directory, then
    remove the sph directory.

    No-op when the wav directory already exists.  Each .sph source file is
    deleted right after its conversion.
    """
    sph_dir = path.join(extracted_dir, data_set, "sph")
    wav_dir = path.join(extracted_dir, data_set, "wav")
    # Conversion already done on a previous run.
    if gfile.Exists(wav_dir):
        return
    makedirs(wav_dir)
    for sph_file in glob(path.join(sph_dir, "*.sph")):
        stem = path.splitext(path.basename(sph_file))[0]
        Transformer().build(sph_file, path.join(wav_dir, stem + ".wav"))
        remove(sph_file)
    # All sources converted and deleted; drop the now-empty directory.
    rmdir(sph_dir)
def convert(self):
    """Converts the mp3's associated with this instance to wav's

    Return:
        wav_directory (os.path): The directory into which the associated
        wav's were downloaded
    """
    wav_directory = self._pre_convert()
    for mp3_path in self.mp3_directory.glob('**/*.mp3'):
        stem = os.path.splitext(os.path.basename(mp3_path))[0]
        wav_path = path.join(wav_directory, stem + ".wav")
        if path.exists(wav_path):
            # Idempotent: never rebuild an existing wav.
            _logger.debug("Already converted mp3 file %s to wav file %s" % (mp3_path, wav_path))
            continue
        _logger.debug("Converting mp3 file %s to wav file %s" % (mp3_path, wav_path))
        # Re-encode to the module-level SAMPLE_RATE / N_CHANNELS / BITDEPTH.
        tfm = Transformer()
        tfm.convert(samplerate=SAMPLE_RATE, n_channels=N_CHANNELS, bitdepth=BITDEPTH)
        tfm.build(str(mp3_path), str(wav_path))
    return wav_directory
def _processSamples(sample_list):
    """Convert each sample to 44.1 kHz/stereo/16-bit in ``out_path``,
    optionally normalizing, trimming trailing silence and padding,
    and record every output path in ``processed_samples``."""
    for src in sample_list:
        dst = join(out_path, _renameSample(src))
        processed_samples.append(dst)
        # Sox processing using Transform instance
        tfm = Transformer()
        tfm.convert(samplerate=44100, n_channels=2, bitdepth=16)
        if NORMALIZE:
            tfm.norm(db_level=-3)
        if SILENCE:
            # location=-1 trims silence from the end only.
            tfm.silence(location=-1, silence_threshold=0.05,
                        min_silence_duration=0.1)
        if PADDING:
            tfm.pad(0, PADDING)
        tfm.build(src, dst)
def convert_audio_and_split_transcript(input_dir, source_name, target_name, output_dir, output_file):
    """Convert FLAC to WAV and split the transcript.

    For audio file, convert the format from FLAC to WAV using the
    sox.Transformer library.
    For transcripts, each line contains the sequence id and the corresponding
    transcript (separated by space):
    Input data format: seq-id transcript_of_seq-id
    For example:
     1-2-0 transcript_of_1-2-0.flac
     1-2-1 transcript_of_1-2-1.flac
     ...

    Each sequence id has a corresponding .flac file.
    Parse the transcript file and generate a new csv file which has three columns:
    "wav_filename": the absolute path to a wav file.
    "wav_filesize": the size of the corresponding wav file.
    "transcript": the transcript for this audio segement.

    Args:
      input_dir: the directory which holds the input dataset.
      source_name: the name of the specified dataset. e.g. test-clean
      target_name: the directory name for the newly generated audio files.
        e.g. test-clean-wav
      output_dir: the directory to place the newly generated csv files.
      output_file: the name of the newly generated csv file. e.g. test-clean.csv
    """
    tf.logging.info("Preprocessing audio and transcript for %s" % source_name)
    source_dir = os.path.join(input_dir, source_name)
    target_dir = os.path.join(input_dir, target_name)
    if not tf.gfile.Exists(target_dir):
        tf.gfile.MakeDirs(target_dir)
    files = []
    # A single Transformer instance is reused for every FLAC -> WAV build.
    tfm = Transformer()
    # Convert all FLAC file into WAV format. At the same time, generate the csv
    # file.
    for root, _, filenames in tf.gfile.Walk(source_dir):
        for filename in fnmatch.filter(filenames, "*.trans.txt"):
            trans_file = os.path.join(root, filename)
            with codecs.open(trans_file, "r", "utf-8") as fin:
                for line in fin:
                    seqid, transcript = line.split(" ", 1)
                    # We do a encode-decode transformation here because the output type
                    # of encode is a bytes object, we need convert it to string.
                    transcript = unicodedata.normalize(
                        "NFKD", transcript).encode("ascii", "ignore").decode(
                            "ascii", "ignore").strip().lower()
                    # Convert FLAC to WAV.
                    flac_file = os.path.join(root, seqid + ".flac")
                    wav_file = os.path.join(target_dir, seqid + ".wav")
                    if not tf.gfile.Exists(wav_file):
                        tfm.build(flac_file, wav_file)
                    wav_filesize = os.path.getsize(wav_file)
                    files.append(
                        (os.path.abspath(wav_file), wav_filesize, transcript))
    # Write to CSV file which contains three columns:
    # "wav_filename", "wav_filesize", "transcript".
    csv_file_path = os.path.join(output_dir, output_file)
    df = pandas.DataFrame(
        data=files, columns=["wav_filename", "wav_filesize", "transcript"])
    df.to_csv(csv_file_path, index=False, sep="\t")
    tf.logging.info("Successfully generated csv file {}".format(csv_file_path))
def convert_audio_and_split_transcript(directory, subset, output_dir):
    """Convert SPH to WAV and split the transcript.

    Args:
      directory: the directory which holds the input dataset.
      subset: the name of the specified dataset.
        supports train (switchboard+fisher), switchboard, fisher, hub500 and rt03s.
      output_dir: the directory to place the newly generated csv files.
    """
    logging.info("Processing audio and transcript for %s" % subset)
    gfile = tf.compat.v1.gfile
    sph2pip = os.path.join(os.path.dirname(__file__), "../utils/sph2pipe")

    # One LDC release directory per corpus component.
    swd_audio_trans_dir = [os.path.join(directory, "LDC97S62")]
    fisher_audio_dirs = [
        os.path.join(directory, "LDC2004S13"),
        os.path.join(directory, "LDC2005S13"),
    ]
    fisher_trans_dirs = [
        os.path.join(directory, "LDC2004T19"),
        os.path.join(directory, "LDC2005T19"),
    ]
    hub_audio_dir = [os.path.join(directory, "LDC2002S09")]
    hub_trans_dir = [os.path.join(directory, "LDC2002T43")]
    rts_audio_trans_dir = [os.path.join(directory, "LDC2007S10")]

    if subset == "train":
        # Combination of switchboard corpus and fisher corpus.
        audio_dir = swd_audio_trans_dir + fisher_audio_dirs
        trans_dir = swd_audio_trans_dir + fisher_trans_dirs
    elif subset == "switchboard":
        audio_dir = swd_audio_trans_dir
        trans_dir = swd_audio_trans_dir
    elif subset == "fisher":
        audio_dir = fisher_audio_dirs
        trans_dir = fisher_trans_dirs
    elif subset == "hub500":
        audio_dir = hub_audio_dir
        trans_dir = hub_trans_dir
    elif subset == "rt03s":
        audio_dir = rts_audio_trans_dir
        trans_dir = rts_audio_trans_dir
    else:
        raise ValueError(subset, " is not in switchboard_fisher")

    subset_dir = os.path.join(directory, subset)
    if not gfile.Exists(subset_dir):
        gfile.MakeDirs(subset_dir)
    output_wav_dir = os.path.join(directory, subset + "/wav")
    if not gfile.Exists(output_wav_dir):
        gfile.MakeDirs(output_wav_dir)
    tmp_dir = os.path.join(directory, "tmp")
    if not gfile.Exists(tmp_dir):
        gfile.MakeDirs(tmp_dir)

    # Build SPH dict.
    files = []
    sph_files_dict = {}
    for sub_audio_dir in audio_dir:
        for root, _, filenames in gfile.Walk(sub_audio_dir):
            # Match ".sph" case-insensitively.
            for filename in fnmatch.filter(filenames, "*.[Ss][Pp][Hh]"):
                sph_key = os.path.splitext(filename)[0]
                sph_file = os.path.join(root, filename)
                sph_files_dict[sph_key] = sph_file

    with TemporaryDirectory(dir=tmp_dir) as output_tmp_wav_dir:
        for sub_trans_dir in trans_dir:
            # Each corpus uses its own transcript naming scheme and line
            # normalizer.
            if sub_trans_dir in swd_audio_trans_dir:
                fnmatch_pat = "*-trans.text"
                split_and_norm_func = split_line_and_norm_swd
            elif sub_trans_dir in fisher_trans_dirs:
                fnmatch_pat = "*.[Tt][Xx][Tt]"
                split_and_norm_func = split_line_and_norm_fisher
            elif sub_trans_dir in hub_trans_dir:
                fnmatch_pat = "hub5e00.english.000405.stm"
                split_and_norm_func = split_line_and_norm_hub_rts
            else:
                fnmatch_pat = "*.stm"
                split_and_norm_func = split_line_and_norm_hub_rts

            for root, _, filenames in gfile.Walk(sub_trans_dir):
                for filename in fnmatch.filter(filenames, fnmatch_pat):
                    trans_file = os.path.join(root, filename)
                    # Skip documentation and non-English side material.
                    if 1 in [
                        ele in root
                        for ele in [
                            "doc",
                            "DOC",
                            "mandarin",
                            "arabic",
                            "concatenated",
                            "bnews",
                        ]
                    ]:
                        continue
                    with codecs.open(trans_file, "r", "utf-8") as fin:
                        for line in fin:
                            line = line.strip()
                            (
                                sph_key,
                                speaker,
                                time_start,
                                time_end,
                                norm_trans,
                            ) = split_and_norm_func(line, filename)
                            # Too short, skip the wave file
                            if time_end - time_start <= 0.1:
                                continue
                            if norm_trans == "":
                                continue
                            # Speaker A is on channel 1, everyone else on 2.
                            if speaker == "A":
                                channel = 1
                            else:
                                channel = 2
                            # Convert SPH to split WAV.
                            if sph_key not in sph_files_dict:
                                print(sph_key + " not found, please check.")
                                continue
                            sph_file = sph_files_dict[sph_key]
                            wav_file = os.path.join(
                                output_tmp_wav_dir, sph_key + "." + speaker + ".wav")
                            if not gfile.Exists(sph_file):
                                raise ValueError(
                                    "the sph file {} is not exists".format(
                                        sph_file))
                            # Segment name encodes key, speaker and the
                            # start/end times in centiseconds.
                            sub_wav_filename = "{0}-{1}-{2:06d}-{3:06d}".format(
                                sph_key,
                                speaker,
                                round(time_start * 100),
                                round(time_end * 100),
                            )
                            sub_wav_file = os.path.join(
                                output_wav_dir, sub_wav_filename + ".wav")
                            if not gfile.Exists(sub_wav_file):
                                # Extract the whole per-speaker channel once
                                # into the temp dir, then trim this segment.
                                if not gfile.Exists(wav_file):
                                    sph2pipe_cmd = (sph2pip
                                                    + " -f wav -c {} -p ".format(
                                                        str(channel))
                                                    + sph_file
                                                    + " "
                                                    + wav_file)
                                    os.system(sph2pipe_cmd)
                                tfm = Transformer()
                                tfm.trim(time_start, time_end)
                                tfm.build(wav_file, sub_wav_file)
                            # wav_filesize = os.path.getsize(sub_wav_file)
                            wav_length = get_wave_file_length(sub_wav_file)
                            speaker_name = sph_key + "-" + speaker
                            files.append(
                                (os.path.abspath(sub_wav_file), wav_length,
                                 norm_trans, speaker_name))

    # Write to CSV file which contains four columns:
    # "wav_filename", "wav_length_ms", "transcript", "speaker".
    out_csv_file = os.path.join(output_dir, subset + ".csv")
    df = pandas.DataFrame(
        data=files,
        columns=["wav_filename", "wav_length_ms", "transcript", "speaker"])
    df.to_csv(out_csv_file, index=False, sep="\t")
    logging.info("Successfully generated csv file {}".format(out_csv_file))
def _maybe_convert_wav(mp3_filename, wav_filename):
    """Create wav_filename from mp3_filename at SAMPLE_RATE, unless the
    wav already exists."""
    if path.exists(wav_filename):
        return
    tfm = Transformer()
    tfm.convert(samplerate=SAMPLE_RATE)
    tfm.build(mp3_filename, wav_filename)
def convert_sr_channel(audio):
    """Convert *audio* to a 16 kHz mono wav placed next to the source file.

    :param audio: path to the input audio file.
    :return: path of the generated wav file.
    """
    # BUG FIX: the original body ignored the `audio` parameter and referenced
    # the undefined globals `original_audio_file` / `wav_file`, raising
    # NameError on every call.  Derive the output path from the input instead.
    # NOTE(review): output naming is a best guess — confirm against callers.
    wav_file = os.path.splitext(audio)[0] + '_16k_mono.wav'
    transformer = Transformer()
    transformer.convert(samplerate=16000, n_channels=1)
    transformer.build(audio, wav_file)
    return wav_file
def convert_audio_and_split_transcript(input_dir, source_name, target_name, output_dir, output_file):
    """Convert FLAC to WAV and split the transcript.

    For audio file, convert the format from FLAC to WAV using the
    sox.Transformer library.
    For transcripts, each line contains the sequence id and the corresponding
    transcript (separated by space):
    Input data format: seq-id transcript_of_seq-id
    For example:
     1-2-0 transcript_of_1-2-0.flac
     1-2-1 transcript_of_1-2-1.flac
     ...

    Each sequence id has a corresponding .flac file.
    Parse the transcript file and generate a new csv file which has three columns:
    "wav_filename": the absolute path to a wav file.
    "wav_filesize": the size of the corresponding wav file.
    "transcript": the transcript for this audio segement.

    Args:
      input_dir: the directory which holds the input dataset.
      source_name: the name of the specified dataset. e.g. test-clean
      target_name: the directory name for the newly generated audio files.
        e.g. test-clean-wav
      output_dir: the directory to place the newly generated csv files.
      output_file: the name of the newly generated csv file. e.g. test-clean.csv
    """
    tf.logging.info("Preprocessing audio and transcript for %s" % source_name)
    source_dir = os.path.join(input_dir, source_name)
    target_dir = os.path.join(input_dir, target_name)
    if not tf.gfile.Exists(target_dir):
        tf.gfile.MakeDirs(target_dir)
    files = []
    # A single Transformer instance is reused for every FLAC -> WAV build.
    tfm = Transformer()
    # Convert all FLAC file into WAV format. At the same time, generate the csv
    # file.
    for root, _, filenames in tf.gfile.Walk(source_dir):
        for filename in fnmatch.filter(filenames, "*.trans.txt"):
            trans_file = os.path.join(root, filename)
            with codecs.open(trans_file, "r", "utf-8") as fin:
                for line in fin:
                    seqid, transcript = line.split(" ", 1)
                    # We do a encode-decode transformation here because the output type
                    # of encode is a bytes object, we need convert it to string.
                    transcript = unicodedata.normalize("NFKD", transcript).encode(
                        "ascii", "ignore").decode("ascii", "ignore").strip().lower()
                    # Convert FLAC to WAV.
                    flac_file = os.path.join(root, seqid + ".flac")
                    wav_file = os.path.join(target_dir, seqid + ".wav")
                    if not tf.gfile.Exists(wav_file):
                        tfm.build(flac_file, wav_file)
                    wav_filesize = os.path.getsize(wav_file)
                    files.append((os.path.abspath(wav_file), wav_filesize, transcript))
    # Write to CSV file which contains three columns:
    # "wav_filename", "wav_filesize", "transcript".
    csv_file_path = os.path.join(output_dir, output_file)
    df = pandas.DataFrame(
        data=files, columns=["wav_filename", "wav_filesize", "transcript"])
    df.to_csv(csv_file_path, index=False, sep="\t")
    tf.logging.info("Successfully generated csv file {}".format(csv_file_path))
# !/usr/bin/python3
"""Recursively convert LibriSpeech .flac files to 16 kHz .wav files."""
import os

import tqdm
from sox import Transformer

SAMPLE_RATE = 16000
# Set True to delete each .flac after it has been processed.
REMOVE_FLAC = False
LIBRISPEECH_DIR = '/home/dsmolen/agh/LibriSpeech/'


def main():
    """Walk LIBRISPEECH_DIR bottom-up, converting every .flac to .wav."""
    converted = 0
    progress = tqdm.tqdm(os.walk(LIBRISPEECH_DIR, topdown=False))
    for root, _dirs, files in progress:
        for name in files:
            if not name.endswith('.flac'):
                continue
            progress.set_postfix(converted=converted)
            converted += 1
            stem = name[:-5]  # drop the ".flac" extension
            flac_file = os.path.join(root, stem + ".flac")
            wav_file = os.path.join(root, stem + ".wav")
            # Skip files converted on a previous run.
            if not os.path.exists(wav_file):
                tfm = Transformer()
                tfm.set_output_format(rate=SAMPLE_RATE)
                tfm.build(flac_file, wav_file)
            if REMOVE_FLAC:
                os.remove(flac_file)


# Guarding the entry point keeps `import` of this module side-effect free;
# the original ran the whole conversion at import time.
if __name__ == '__main__':
    main()
def convert_audio_and_split_transcript(dataset_dir, subset, out_csv_file, output_dir):
    """Convert SPH to WAV and split the transcript.

    Args:
        dataset_dir : the directory which holds the input dataset.
        subset : the name of the specified dataset. e.g. dev.
        out_csv_file : the resulting output csv file.
        output_dir : Athena working directory.
    """
    gfile = tf.compat.v1.gfile
    sph2pip = os.path.join(os.path.dirname(__file__), "../../../../tools/sph2pipe/sph2pipe")
    text_featurizer = TextFeaturizer()
    logging.info("Processing audio and transcript for %s" % subset)
    # HKUST corpus layout: audio in LDC2005S15, transcripts in LDC2005T32.
    audio_dir = os.path.join(dataset_dir, "LDC2005S15/")
    trans_dir = os.path.join(dataset_dir, "LDC2005T32/")

    output_wav_dir = os.path.join(output_dir, subset + "/wav")
    if not gfile.Exists(output_wav_dir):
        gfile.MakeDirs(output_wav_dir)

    files = []
    char_dict = {}
    sph_files_dict = {}
    # Index every .sph audio file of this subset by its basename.
    for root, _, filenames in gfile.Walk(audio_dir):
        for filename in fnmatch.filter(filenames, "*.sph"):
            if subset in root:
                sph_key = os.path.splitext(filename)[0]
                sph_file = os.path.join(root, filename)
                sph_files_dict[sph_key] = sph_file

    # Convert all SPH file into WAV format.
    # Generate the JSON file and char dict file.
    with TemporaryDirectory(dir=output_dir) as output_tmp_wav_dir:
        for root, _, filenames in gfile.Walk(trans_dir):
            if not re.match('.*/' + subset + '.*', root):
                continue
            for filename in fnmatch.filter(filenames, "*.txt"):
                trans_file = os.path.join(root, filename)
                sph_key = ""
                speaker_A = ""
                speaker_B = ""
                # Transcript files are GB18030-encoded.
                with codecs.open(trans_file, "r", "gb18030") as fin:
                    for line in fin:
                        line = line.strip()
                        if len(line.split(" ")) <= 1:
                            continue
                        if len(line.split(" ")) == 2:
                            # Header line: "<tag> <sph_key>"; the key embeds
                            # both speaker ids at positions 2 and 3.
                            sph_key = line.split(" ")[1]
                            speaker_A = sph_key.split("_")[2]
                            speaker_B = sph_key.split("_")[3]
                            continue
                        # Segment line: "<start> <end> <speaker:> <text>".
                        time_start, time_end, speaker, transcript = line.split(
                            " ", 3)
                        time_start = float(time_start)
                        time_end = float(time_end)
                        # too short, skip the wave file
                        if time_end - time_start <= 0.1:
                            continue
                        speaker = speaker[0]  # remove ':' in 'A:'
                        # Speaker A is on channel 1, speaker B on channel 2.
                        if speaker == "A":
                            channel = 1
                            speaker_id = speaker_A
                        else:
                            channel = 2
                            speaker_id = speaker_B
                        # Convert SPH to split WAV.
                        sph_file = sph_files_dict[sph_key]
                        wav_file = os.path.join(
                            output_tmp_wav_dir, sph_key + "." + speaker[0] + ".wav")
                        if not gfile.Exists(sph_file):
                            raise ValueError(
                                "the sph file {} is not exists".format(
                                    sph_file))
                        # Extract the whole per-speaker channel once into the
                        # temp dir; individual segments are trimmed from it.
                        if not gfile.Exists(wav_file):
                            sph2pipe_cmd = (
                                sph2pip
                                + " -f wav -c {} -p ".format(str(channel))
                                + sph_file
                                + " "
                                + wav_file)
                            os.system(sph2pipe_cmd)
                        # Segment name encodes key, speaker and start/end in
                        # centiseconds.
                        sub_wav_filename = "{0}-{1}-{2:06d}-{3:06d}".format(
                            sph_key, speaker, int(time_start * 100),
                            int(time_end * 100))
                        sub_wav_file = os.path.join(output_wav_dir,
                                                    sub_wav_filename + ".wav")
                        if not gfile.Exists(sub_wav_file):
                            tfm = Transformer()
                            tfm.trim(time_start, time_end)
                            tfm.build(wav_file, sub_wav_file)
                        wav_length = get_wave_file_length(sub_wav_file)
                        transcript = normalize_hkust_trans(transcript)
                        transcript = text_featurizer.delete_punct(transcript)
                        if len(transcript) > 0:
                            # Accumulate per-character counts for the char dict.
                            for char in transcript:
                                if char in char_dict:
                                    char_dict[char] += 1
                                else:
                                    char_dict[char] = 0
                            files.append((
                                os.path.abspath(sub_wav_file),
                                wav_length,
                                transcript,
                                speaker_id,
                            ))

    # Write to CSV file which contains three columns:
    # "wav_filename", "wav_length_ms", "labels".
    df = pandas.DataFrame(
        data=files,
        columns=["wav_filename", "wav_length_ms", "transcript", "speaker"])
    df.to_csv(out_csv_file, index=False, sep="\t")
    logging.info("Successfully generated csv file {}".format(out_csv_file))
if not os.path.exists(target_dir):
    os.makedirs(target_dir)

files = []
SAMPLE_RATE = 16000
# NOTE(review): `selection`, `source_dir`, `target_dir` and `data_dir` are
# defined earlier in the file; this section only consumes them.
# Loop variable renamed from `set`, which shadowed the builtin.
for subset in selection:
    for item in subset['set_items']:
        filename = source_dir + '/' + item['path']
        transcript = item['sentence']
        # Strip non-ASCII characters via an encode/decode round trip.
        transcript = unicodedata.normalize('NFKD', transcript) \
            .encode('ascii', 'ignore') \
            .decode('ascii', 'ignore')
        transcript = transcript.lower().strip()

        # Convert corresponding MP3 to a WAV
        mp3_file = source_dir + '/' + item['path']
        wav_file = target_dir + '/' + item['path'].replace('.mp3', '.wav')
        if not os.path.exists(wav_file):
            tfm = Transformer()
            tfm.set_output_format(rate=SAMPLE_RATE)
            tfm.build(mp3_file, wav_file)
        wav_filesize = os.path.getsize(wav_file)
        files.append((os.path.abspath(wav_file), wav_filesize, transcript))

data_info = pandas.DataFrame(data=files,
                             columns=['wav_filename', 'wav_filesize', 'transcript'])
data_info.to_csv(data_dir + '/common-voice-pertubed_sets.csv', index=False)
class Audio:
    '''Wrapper around a single .wav file.

    Loads the raw frames into a numpy array on construction and offers:
    silence-trim / noise-reduction via sox (``prepare_file``), naive
    pattern search over the samples (``find_patt``) and waveform plotting
    (matplotlib and plotly variants).
    '''

    def __init__(self, fname):
        ''' opens .wav audio file with fname '''
        self.wav_fname = fname
        try:
            with wave.open(fname, mode="r") as wav:
                (self.nchannels, self.sampwidth, self.framerate,
                 self.nframes, self.comptype,
                 self.compname) = wav.getparams()
                self.duration = self.nframes / self.framerate
                # Max absolute amplitude for the given sample width.
                self.peak = 256 ** self.sampwidth / 2
                self.content = wav.readframes(self.nframes)
                # np.frombuffer replaces the removed np.fromstring.
                # `types` is a module-level mapping from sample width to a
                # numpy dtype — defined elsewhere in this file (TODO confirm).
                self.samples = np.frombuffer(self.content,
                                             dtype=types[self.sampwidth])
                self.tsm = Transformer()
                return
        except FileNotFoundError as err:
            print(err)
            print("Try Audio.reload_file function")
            # Re-raise the original error instead of a bare new instance,
            # preserving the message and traceback.
            raise

    def _stem(self):
        '''Return ``wav_fname`` without a trailing ".wav" extension.

        The previous code used ``str.rstrip(".wav")``, which strips any
        trailing '.', 'w', 'a' or 'v' *characters* — mangling names such
        as "data.wav" (-> "dat"). This strips the suffix exactly once.
        '''
        if self.wav_fname.endswith(".wav"):
            return self.wav_fname[:-len(".wav")]
        return self.wav_fname

    def reload_file(self, fname=None):
        ''' recalls the init function '''
        self.__init__(fname if fname else self.wav_fname)

    def prepare_file(self, fname=None, trim=True, nnoise=True, reload=True):
        ''' trims and reduce noise on file

        should automatically trim the silent regions from the
        beginning/end, so you should use trim with nnoise
        reload - determines whether to reload the audio file
        fname - name of output, changed file
        '''
        stem = self._stem()
        if nnoise:
            # Build a noise profile from the file itself, then apply it.
            self.tsm.noiseprof(self.wav_fname, stem + '_noiseprof')
            self.tsm.noisered(stem + '_noiseprof')
        if trim:
            # silence(1)/silence(-1) trim leading and trailing silence.
            self.tsm.silence(1)
            self.tsm.silence(-1)
        outname = fname if fname else stem + "_nnoise_trim.wav"
        self.tsm.build(self.wav_fname, outname)
        if reload:
            self.reload_file(fname=outname)

    def find_patt(self, patt, k=100, use_fastdtw=False,
                  with_dwt=True) -> "tuple[float, float]":
        ''' patt - pattern, Audio object
        k - downsampling coefficient, integer
        use_fastdtw - determines whether to use dtw for distance or just
        euclidean distance
        with_dwt - use single level Discrete Wavelet Transform for data
        return: time of found pattern in seconds and time of search
        '''
        evaltime = time()
        # Downsample by k, then z-normalize both signals.
        data = self.samples[::k]
        pattern = patt.samples[::k]
        data = (data - data.mean()) / data.std()
        pattern = (pattern - pattern.mean()) / pattern.std()
        if with_dwt:
            # Keep only the approximation coefficients (halves the length).
            data, *_ = dwt(data, 'db1')
            pattern, *_ = dwt(pattern, 'db1')
        # Sliding window: distance of the pattern at every offset.
        distances = []
        maxcount = len(data) - len(pattern) + 1
        for i in range(maxcount):
            window = data[i:i + len(pattern)]
            if use_fastdtw:
                distances.append(fastdtw(window, pattern,
                                         dist=euclidean)[0])
            else:
                distances.append(euclidean(window, pattern))
        evaltime = time() - evaltime
        # Best offset, converted back to seconds of the original file.
        res = (distances.index(min(distances)) / len(data)) * self.duration
        return res, evaltime

    def draw_waveform_matplotlib(self, fname=None, save=False):
        ''' better use plotly, does the same '''
        import matplotlib.pyplot as plt
        import matplotlib.ticker as ticker
        w, h = 800, 300
        DPI = 72
        plt.figure(1, figsize=(float(w) / DPI, float(h) / DPI), dpi=DPI)
        plt.subplots_adjust(wspace=0, hspace=0)
        for n in range(self.nchannels):
            # De-interleave: every nchannels-th sample belongs to channel n.
            channel = self.samples[n::self.nchannels]
            axes = plt.subplot(2, 1, n + 1)
            axes.plot(channel, "g")
            axes.yaxis.set_major_formatter(
                ticker.FuncFormatter(
                    lambda x, pos=None: format_db(x, self, pos=pos)))
            plt.grid(True, color="w")
            axes.xaxis.set_major_formatter(ticker.NullFormatter())
            axes.xaxis.set_major_formatter(
                ticker.FuncFormatter(
                    lambda x, pos=None: format_time(x, self, pos=pos)))
        if save:
            if fname is None:
                plt.savefig(self._stem() + "_waveform", dpi=DPI)
            else:
                plt.savefig(fname, dpi=DPI)
        plt.show()

    def draw_waveform_plotly(self, fname=None, save=False,
                             select_interval=None):
        ''' fname - name for .html file
        save - if True saves to png with the fname
        select_interval - (float, float) - time for colouring the data
        '''
        import plotly.offline as py
        import plotly.graph_objs as go
        # NOTE(review): renamed from `time` — it shadowed the imported
        # time() function used by find_patt.
        time_axis = [(x / float(self.nframes) * self.duration)
                     for x in range(self.nframes)]
        graphs = []
        if select_interval:
            # Convert the selected time interval to frame indices and draw
            # the selected region in red between two blue segments.
            start = round((select_interval[0] / self.duration) * self.nframes)
            end = round((select_interval[1] / self.duration) * self.nframes)
            graphs.append(go.Scatter(x=time_axis[:start],
                                     y=self.samples[:start],
                                     mode='lines', name='sample',
                                     line={"color": "#0000ff"}))
            graphs.append(go.Scatter(x=time_axis[start:end],
                                     y=self.samples[start:end],
                                     mode='lines', name='pattern',
                                     line={"color": "#ff0000"}))
            graphs.append(go.Scatter(x=time_axis[end:],
                                     y=self.samples[end:],
                                     mode='lines', name='sample',
                                     line={"color": "#0000ff"}))
        else:
            graphs.append(go.Scatter(x=time_axis, y=self.samples,
                                     mode='lines', name='sample'))
        layout = dict(
            title=f"{self.wav_fname[self.wav_fname.rfind('/'):]} waveform",
            xaxis=dict(title="Time in seconds"))
        fig = dict(data=graphs, layout=layout)
        if fname is None:
            fname = self._stem() + "_waveform.html"
        if save:
            py.plot(fig, filename=fname, image='png')
        else:
            py.plot(fig, filename=fname)