def main():
    """Download (or reuse) the Common Voice archive, unpack it, convert the
    listed CSV splits to wav, and write one manifest per split.

    Relies on module-level ``args`` (target_dir, tar_path, files_to_process,
    min_duration, max_duration) and ``COMMON_VOICE_URL``.
    """
    target_dir = args.target_dir
    os.makedirs(target_dir, exist_ok=True)

    target_unpacked_dir = os.path.join(target_dir, "CV_unpacked")
    os.makedirs(target_unpacked_dir, exist_ok=True)

    if args.tar_path and os.path.exists(args.tar_path):
        print('Find existing file {}'.format(args.tar_path))
        target_file = args.tar_path
    else:
        print("Could not find downloaded Common Voice archive, Downloading corpus...")
        filename = wget.download(COMMON_VOICE_URL, target_dir)
        target_file = os.path.join(target_dir, os.path.basename(filename))

    print("Unpacking corpus to {} ...".format(target_unpacked_dir))
    # FIX: context manager guarantees the archive handle is closed even if
    # extraction raises (the original leaked it on error).
    with tarfile.open(target_file) as tar:
        tar.extractall(target_unpacked_dir)

    # Parse the CSV list once instead of re-splitting it for each phase.
    csv_files = args.files_to_process.split(',')
    for csv_file in csv_files:
        convert_to_wav(os.path.join(target_unpacked_dir, 'cv_corpus_v1/', csv_file),
                       os.path.join(target_dir, os.path.splitext(csv_file)[0]))

    print('Creating manifests...')
    for csv_file in csv_files:
        create_manifest(os.path.join(target_dir, os.path.splitext(csv_file)[0]),
                        os.path.splitext(csv_file)[0] + '_manifest.csv',
                        args.min_duration,
                        args.max_duration)
def main():
    """Fetch the Common Voice corpus (reusing a local tarball when one is
    supplied), extract it, convert each requested CSV split to wav, and
    emit the corresponding manifests."""
    out_dir = args.target_dir
    os.makedirs(out_dir, exist_ok=True)

    unpack_dir = os.path.join(out_dir, "CV_unpacked")
    os.makedirs(unpack_dir, exist_ok=True)

    # Prefer an already-downloaded archive when the caller points at one.
    archive = args.tar_path if args.tar_path and os.path.exists(args.tar_path) else None
    if archive is not None:
        print('Find existing file {}'.format(archive))
    else:
        print("Could not find downloaded Common Voice archive, Downloading corpus...")
        downloaded = wget.download(COMMON_VOICE_URL, out_dir)
        archive = os.path.join(out_dir, os.path.basename(downloaded))

    print("Unpacking corpus to {} ...".format(unpack_dir))
    tar = tarfile.open(archive)
    tar.extractall(unpack_dir)
    tar.close()

    for name in args.files_to_process.split(','):
        base = os.path.splitext(name)[0]
        convert_to_wav(os.path.join(unpack_dir, 'cv_corpus_v1/', name),
                       os.path.join(out_dir, base))

    print('Creating manifests...')
    for name in args.files_to_process.split(','):
        base = os.path.splitext(name)[0]
        create_manifest(os.path.join(out_dir, base),
                        base + '_manifest.csv',
                        args.min_duration,
                        args.max_duration)
def main():
    """Lay out the target train_C/dev/test directory tree mirroring the
    source corpus and write the train_C manifest.

    NOTE(review): Python 2 source (statement-form ``print``).  The dev and
    test prepare_dir calls are commented out, so only the train_C split is
    actually prepared here.
    """
    target_dl_dir = args.target_dir
    source_dl_dir = args.source_dir #speech_only/speech_and_noise
    data_type = args.data_type
    if not os.path.exists(target_dl_dir):
        os.makedirs(target_dl_dir)
    # prepare target dir
    target_train_dir = os.path.join(target_dl_dir, "train_C")
    if not os.path.exists(target_train_dir):
        os.makedirs(target_train_dir)
    target_val_dir = os.path.join(target_dl_dir, "dev")
    if not os.path.exists(target_val_dir):
        os.makedirs(target_val_dir)
    target_test_dir = os.path.join(target_dl_dir, "test")
    if not os.path.exists(target_test_dir):
        os.makedirs(target_test_dir)
    # source dir
    source_train = os.path.join(source_dl_dir, "train_C")
    source_val = os.path.join(source_dl_dir, "dev")
    source_test = os.path.join(source_dl_dir, "test")
    print " prepare data for train "
    prepare_dir(target_train_dir, source_train, data_type)
    print " prepare data for dev "
    # prepare_dir(target_val_dir, source_val, data_type)
    print " prepare data for test "
    # prepare_dir(target_test_dir, source_test, data_type)
    print('Creating manifests...')
    create_manifest(target_dl_dir, os.path.join(target_train_dir, "converted"), 'train_C', data_type)
def main():
    """Download the selected LibriSpeech archives, unpack them, convert the
    flac files to wav + transcript pairs, and write one manifest per split
    (duration-pruned for train).

    Uses module-level ``args`` (target_dir, files_to_use, min_duration,
    max_duration) and ``LIBRI_SPEECH_URLS``.
    """
    target_dl_dir = args.target_dir
    if not os.path.exists(target_dl_dir):
        os.makedirs(target_dl_dir)
    files_to_dl = args.files_to_use.strip().split(',')
    for split_type, lst_libri_urls in LIBRI_SPEECH_URLS.items():
        split_dir = os.path.join(target_dl_dir, split_type)
        if not os.path.exists(split_dir):
            os.makedirs(split_dir)
        split_wav_dir = os.path.join(split_dir, "wav")
        if not os.path.exists(split_wav_dir):
            os.makedirs(split_wav_dir)
        split_txt_dir = os.path.join(split_dir, "txt")
        if not os.path.exists(split_txt_dir):
            os.makedirs(split_txt_dir)
        extracted_dir = os.path.join(split_dir, "LibriSpeech")
        if os.path.exists(extracted_dir):
            shutil.rmtree(extracted_dir)
        for url in lst_libri_urls:
            # check if we want to dl this file
            filename = url.split("/")[-1]
            target_filename = os.path.join(split_dir, filename)
            matches = [f for f in files_to_dl if url.find(f) != -1]
            if not matches:
                continue
            if not os.path.exists(target_filename):
                print("Downloading file {} from {} to {}".format(
                    filename, url, target_filename))
                # BUG FIX: the original raised Exception("Error") right here,
                # making this download unreachable.
                wget.download(url, split_dir)
            else:
                print("Skipping existing file from url: {}".format(url))
            print("Unpacking {}...".format(filename))
            tar = tarfile.open(target_filename)
            tar.extractall(split_dir)
            tar.close()
            os.remove(target_filename)
            print("Converting flac files to wav and extracting transcripts...")
            assert os.path.exists(
                extracted_dir
            ), "Archive {} was not properly uncompressed.".format(filename)
            for root, subdirs, files in tqdm(os.walk(extracted_dir)):
                for f in files:
                    if f.find(".flac") != -1:
                        _process_file(wav_dir=split_wav_dir,
                                      txt_dir=split_txt_dir,
                                      base_filename=f,
                                      root_dir=root)
            print("Finished {}".format(url))
            shutil.rmtree(extracted_dir)
        if split_type == 'train':
            # Prune to min/max duration
            create_manifest(split_dir, 'libri_' + split_type + '_manifest.csv',
                            args.min_duration, args.max_duration)
        else:
            create_manifest(split_dir, 'libri_' + split_type + '_manifest.csv')
def main():
    """Prepare the NIKL corpus via the local shell scripts, then write the
    train/val manifests.

    Uses module-level ``args`` (target_dir, min_duration, max_duration).
    """
    if not os.path.isdir(args.target_dir):
        os.makedirs(args.target_dir)
    train_path = args.target_dir + '/train/'
    test_path = args.target_dir + '/test/'
    # BUG FIX: subprocess.call with an argument list does NOT go through a
    # shell, so the literal string "$HOME" was handed to the scripts
    # unexpanded; expand it here before passing it on.
    corpus_dir = os.path.expandvars("$HOME/copora/NIKL")
    subprocess.call(["local/clean_corpus.sh", corpus_dir, args.target_dir])
    subprocess.call(["local/data_prep.sh", corpus_dir, args.target_dir])
    print('\n', 'Creating manifests...')
    create_manifest(train_path, 'nikl_train_manifest.csv',
                    args.min_duration, args.max_duration)
    create_manifest(test_path, 'nikl_val_manifest.csv')
def main():
    """Prepare an IARPA Babel corpus (pre-downloaded from the LDC): unpack
    it if needed, pair audio files with their transcriptions, convert to
    wav, and write a manifest.

    Uses module-level ``args`` (target_dir, data_dir, min_duration,
    max_duration).
    """
    target_dir = args.target_dir
    os.makedirs(target_dir, exist_ok=True)
    target_unpacked_dir = os.path.join(target_dir, "CV_unpacked")
    os.makedirs(target_unpacked_dir, exist_ok=True)
    if args.data_dir and os.path.exists(args.data_dir):
        print('Find existing file {}'.format(args.data_dir))
    else:
        raise RuntimeError(
            "Could not find downloaded IARPA babel corpus, please download the relevant corpus from LDC"
        )
    if os.path.isdir(args.data_dir):
        print("Identified unpacked IARPA dataset")
        unpacked_location = args.data_dir
    else:
        print("Unpacking corpus to {} ...".format(target_unpacked_dir))
        # BUG FIX: the original opened the undefined name ``target_file``
        # here (NameError); the archive to unpack is args.data_dir itself.
        with tarfile.open(args.data_dir) as tar:
            tar.extractall(target_unpacked_dir)
        unpacked_location = target_unpacked_dir
    # Flatten the data path into a single directory-name-safe token.
    path_flattened = re.sub(r"[\/]", "_", os.path.splitext(args.data_dir)[0])
    os.makedirs(os.path.join(target_dir, path_flattened), exist_ok=True)
    roots = {}
    # collect all the filepaths
    for root, dirs, files in os.walk(unpacked_location):
        roots[root] = files
    audio_trans_pairs = []  # list of (transcript_path, audio_path) tuples
    for root in roots:
        # find all the audio directories
        if re.search(r"/audio", root):
            transcription_root = re.sub(r"/audio", "/transcription", root)
            print(transcription_root)
            for fp in roots[root]:
                txt_fp = re.sub(r"\.wav", ".txt", fp)
                # Keep only audio files that have a matching transcript.
                if os.path.exists(os.path.join(transcription_root, txt_fp)):
                    pair_tuple = (os.path.join(transcription_root, txt_fp),
                                  os.path.join(root, fp))
                    audio_trans_pairs.append(pair_tuple)
    for txt_path, audio_path in audio_trans_pairs:
        convert_to_wav(txt_path, audio_path,
                       os.path.join(target_dir, path_flattened))
    # make a separate manifest for each
    print('Creating manifests...')
    create_manifest(os.path.join(target_dir, path_flattened),
                    path_flattened + '_manifest.csv',
                    args.min_duration,
                    args.max_duration)
def main():
    """Download, unpack, and convert the requested LibriSpeech splits, then
    build one manifest per split."""
    root_dir = args.target_dir
    if not os.path.exists(root_dir):
        os.makedirs(root_dir)
    wanted = args.files_to_use.strip().split(',')
    for split_type, lst_libri_urls in LIBRI_SPEECH_URLS.items():
        split_dir = os.path.join(root_dir, split_type)
        wav_out = os.path.join(split_dir, "wav")
        txt_out = os.path.join(split_dir, "txt")
        # Create the split directory tree on demand.
        for d in (split_dir, wav_out, txt_out):
            if not os.path.exists(d):
                os.makedirs(d)
        extracted_dir = os.path.join(split_dir, "LibriSpeech")
        if os.path.exists(extracted_dir):
            shutil.rmtree(extracted_dir)
        for url in lst_libri_urls:
            # Honour the files-to-use filter.
            if not any(url.find(token) != -1 for token in wanted):
                print("Skipping url: {}".format(url))
                continue
            archive_name = url.split("/")[-1]
            archive_path = os.path.join(split_dir, archive_name)
            if not os.path.exists(archive_path):
                wget.download(url, split_dir)
            print("Unpacking {}...".format(archive_name))
            tar = tarfile.open(archive_path)
            tar.extractall(split_dir)
            tar.close()
            os.remove(archive_path)
            print("Converting flac files to wav and extracting transcripts...")
            assert os.path.exists(
                extracted_dir
            ), "Archive {} was not properly uncompressed.".format(archive_name)
            for root, subdirs, files in os.walk(extracted_dir):
                for f in files:
                    if f.find(".flac") != -1:
                        _process_file(wav_dir=wav_out, txt_dir=txt_out,
                                      base_filename=f, root_dir=root)
            print("Finished {}".format(url))
            shutil.rmtree(extracted_dir)
        create_manifest(split_dir, 'libri_' + split_type)
def main():
    """Fetch the chosen LibriSpeech archives, convert their flac files to
    wav/transcript pairs, and emit a manifest per split (duration-pruned
    for the train split)."""
    download_root = args.target_dir
    if not os.path.exists(download_root):
        os.makedirs(download_root)
    requested = args.files_to_use.strip().split(',')
    for split_type, lst_libri_urls in LIBRI_SPEECH_URLS.items():
        split_dir = os.path.join(download_root, split_type)
        if not os.path.exists(split_dir):
            os.makedirs(split_dir)
        split_wav_dir = os.path.join(split_dir, "wav")
        if not os.path.exists(split_wav_dir):
            os.makedirs(split_wav_dir)
        split_txt_dir = os.path.join(split_dir, "txt")
        if not os.path.exists(split_txt_dir):
            os.makedirs(split_txt_dir)
        extracted_dir = os.path.join(split_dir, "LibriSpeech")
        if os.path.exists(extracted_dir):
            shutil.rmtree(extracted_dir)
        for url in lst_libri_urls:
            # Skip archives that were not requested on the command line.
            is_wanted = any(url.find(fragment) != -1 for fragment in requested)
            if not is_wanted:
                print("Skipping url: {}".format(url))
                continue
            filename = url.split("/")[-1]
            target_filename = os.path.join(split_dir, filename)
            if not os.path.exists(target_filename):
                wget.download(url, split_dir)
            print("Unpacking {}...".format(filename))
            tar = tarfile.open(target_filename)
            tar.extractall(split_dir)
            tar.close()
            os.remove(target_filename)
            print("Converting flac files to wav and extracting transcripts...")
            assert os.path.exists(extracted_dir), "Archive {} was not properly uncompressed.".format(filename)
            for root, subdirs, files in tqdm(os.walk(extracted_dir)):
                for f in files:
                    if f.find(".flac") != -1:
                        _process_file(wav_dir=split_wav_dir,
                                      txt_dir=split_txt_dir,
                                      base_filename=f,
                                      root_dir=root)
            print("Finished {}".format(url))
            shutil.rmtree(extracted_dir)
        manifest_name = 'libri_' + split_type + '_manifest.csv'
        if split_type == 'train':
            # Prune train utterances to the min/max duration window.
            create_manifest(split_dir, manifest_name,
                            args.min_duration, args.max_duration)
        else:
            create_manifest(split_dir, manifest_name)
def test_run_media_linker_during_adapter(self):
    """The example media linker runs for both file and string reads, and
    stays off when media_linker_name=None is passed explicitly."""
    active = otio.plugins.ActiveManifest()
    temp_manifest = utils.create_manifest()
    # this wires up the media linkers into the active manifest
    active.media_linkers.extend(temp_manifest.media_linkers)

    def linker_flag(timeline):
        # Metadata stamped onto the first clip by the test linker.
        return timeline.tracks[0][0].media_reference.metadata.get(
            'from_test_linker'
        )

    self.assertTrue(
        linker_flag(self.adp.read_from_file("foo", media_linker_name="example"))
    )
    self.assertTrue(
        linker_flag(
            self.adp.read_from_string("foo", media_linker_name="example")
        )
    )
    # explicitly turn the media_linker off
    self.assertIsNone(
        linker_flag(self.adp.read_from_file("foo", media_linker_name=None))
    )
    # Delete the temporary manifest
    utils.remove_manifest(temp_manifest)
def main():
    """Download the AN4 corpus, lay out train/test wav+txt pairs, and write
    the train/val manifests.

    Uses module-level ``args`` (target_dir, min_duration, max_duration).
    """
    root_path = 'an4/'
    name = 'an4'
    raw_tar = 'an4_raw.bigendian.tar.gz'
    # BUG FIX: make the script re-runnable — skip an already-downloaded
    # archive and tolerate an existing target directory (the original
    # re-downloaded every run and crashed in os.makedirs on a second run).
    if not os.path.exists(raw_tar):
        wget.download('http://www.speech.cs.cmu.edu/databases/an4/an4_raw.bigendian.tar.gz')
    # Context manager closes the archive handle even if extraction raises.
    with tarfile.open(raw_tar) as tar:
        tar.extractall()
    os.makedirs(args.target_dir, exist_ok=True)
    _format_data(root_path, 'train', name, 'an4_clstk')
    _format_data(root_path, 'test', name, 'an4test_clstk')
    shutil.rmtree(root_path)
    os.remove(raw_tar)
    train_path = args.target_dir + '/train/'
    test_path = args.target_dir + '/test/'
    print('\n', 'Creating manifests...')
    # Train manifest is pruned to the requested duration window.
    create_manifest(train_path, 'an4_train_manifest.csv',
                    args.min_duration, args.max_duration)
    create_manifest(test_path, 'an4_val_manifest.csv')
def main():
    """Download AN4, format the train and test splits, and create the
    corresponding manifests."""
    corpus_root = 'an4/'
    corpus_name = 'an4'
    wget.download(
        'http://www.speech.cs.cmu.edu/databases/an4/an4_raw.bigendian.tar.gz')
    # Unpack the big-endian raw archive into the working directory.
    tar = tarfile.open('an4_raw.bigendian.tar.gz')
    tar.extractall()
    os.makedirs(args.target_dir)
    for split, speaker_dir in (('train', 'an4_clstk'), ('test', 'an4test_clstk')):
        _format_data(corpus_root, split, corpus_name, speaker_dir)
    shutil.rmtree(corpus_root)
    os.remove('an4_raw.bigendian.tar.gz')
    train_path = args.target_dir + '/train/'
    test_path = args.target_dir + '/test/'
    print('\n', 'Creating manifests...')
    create_manifest(train_path, 'an4_train')
    create_manifest(test_path, 'an4_val')
def main():
    """Obtain TEDLIUM release 2 (downloading it unless a tarball is
    supplied), unpack it once, prepare each split, and write the
    train/val/test manifests."""
    dl_dir = args.target_dir
    if not os.path.exists(dl_dir):
        os.makedirs(dl_dir)
    unpacked = os.path.join(dl_dir, "TEDLIUM_release2")
    if args.tar_path and os.path.exists(args.tar_path):
        archive = args.tar_path
    else:
        print("Could not find downloaded TEDLIUM archive, Downloading corpus...")
        wget.download(TED_LIUM_V2_DL_URL, dl_dir)
        archive = os.path.join(dl_dir, "TEDLIUM_release2.tar.gz")
    # Only unpack when the release directory is not already present.
    if os.path.exists(unpacked):
        print("Found TEDLIUM directory, skipping unpacking of tar files")
    else:
        print("Unpacking corpus...")
        tar = tarfile.open(archive)
        tar.extractall(dl_dir)
        tar.close()
    split_dirs = {name: os.path.join(unpacked, name)
                  for name in ("train", "dev", "test")}
    for split_dir in split_dirs.values():
        prepare_dir(split_dir)
    print('Creating manifests...')
    create_manifest(split_dirs["train"], 'ted_train_manifest.csv',
                    args.min_duration, args.max_duration)
    create_manifest(split_dirs["dev"], 'ted_val_manifest.csv')
    create_manifest(split_dirs["test"], 'ted_test_manifest.csv')
def main():
    """Build the target train/dev/test directory layout mirroring the source
    corpus and write one manifest per split.

    NOTE(review): Python 2 source (statement-form ``print``).  The per-split
    prepare_dir calls are commented out, so this currently only creates the
    directories and the manifests.
    """
    target_dl_dir = args.target_dir
    source_dl_dir = args.source_dir #speech_only/speech_and_noise
    if not os.path.exists(target_dl_dir):
        os.makedirs(target_dl_dir)
    # prepare target dir
    target_train_dir = os.path.join(target_dl_dir, "train")
    if not os.path.exists(target_train_dir):
        os.makedirs(target_train_dir)
    target_val_dir = os.path.join(target_dl_dir, "dev")
    if not os.path.exists(target_val_dir):
        os.makedirs(target_val_dir)
    target_test_dir = os.path.join(target_dl_dir, "test")
    if not os.path.exists(target_test_dir):
        os.makedirs(target_test_dir)
    # source dir
    source_train = os.path.join(source_dl_dir, "train")
    source_val = os.path.join(source_dl_dir, "dev")
    source_test = os.path.join(source_dl_dir, "test")
    print " prepare data for train "
    #prepare_dir(target_train_dir, source_train,"train")
    print " prepare data for dev "
    #prepare_dir(target_val_dir, source_val,"dev")
    print " prepare data for test "
    #prepare_dir(target_test_dir, source_test,"test")
    print('Creating manifests...')
    print(" target_train_dir ", target_train_dir)
    create_manifest(target_dl_dir, target_train_dir, 'train')
    create_manifest(target_dl_dir, target_val_dir, 'val')
    create_manifest(target_dl_dir, target_test_dir, 'test')
def main():
    """Download or reuse the TEDLIUM v2 tarball, unpack it if necessary,
    prepare each split, and emit the train/val/test manifests."""
    out_root = args.target_dir
    if not os.path.exists(out_root):
        os.makedirs(out_root)
    release_dir = os.path.join(out_root, "TEDLIUM_release2")
    have_tarball = args.tar_path and os.path.exists(args.tar_path)
    if have_tarball:
        tarball = args.tar_path
    else:
        print(
            "Could not find downloaded TEDLIUM archive, Downloading corpus...")
        wget.download(TED_LIUM_V2_DL_URL, out_root)
        tarball = os.path.join(out_root, "TEDLIUM_release2.tar.gz")
    if not os.path.exists(release_dir):
        print("Unpacking corpus...")
        tar = tarfile.open(tarball)
        tar.extractall(out_root)
        tar.close()
    else:
        print("Found TEDLIUM directory, skipping unpacking of tar files")
    train_dir = os.path.join(release_dir, "train")
    val_dir = os.path.join(release_dir, "dev")
    test_dir = os.path.join(release_dir, "test")
    # Convert each split's audio/transcripts in place.
    for split_dir in (train_dir, val_dir, test_dir):
        prepare_dir(split_dir)
    print('Creating manifests...')
    create_manifest(train_dir, 'ted_train', out_root)
    create_manifest(val_dir, 'ted_val', out_root)
    create_manifest(test_dir, 'ted_test', out_root)
def main():
    """Download AN4, format the train/test data, and write both manifests."""
    an4_root = "an4/"
    dataset = "an4"
    wget.download(
        "http://www.speech.cs.cmu.edu/databases/an4/an4_raw.bigendian.tar.gz")
    archive = tarfile.open("an4_raw.bigendian.tar.gz")
    archive.extractall()
    os.makedirs(args.target_dir)
    _format_data(an4_root, "train", dataset, "an4_clstk")
    _format_data(an4_root, "test", dataset, "an4test_clstk")
    # Clean up the unpacked corpus and the downloaded archive.
    shutil.rmtree(an4_root)
    os.remove("an4_raw.bigendian.tar.gz")
    train_path = args.target_dir + "/train/"
    test_path = args.target_dir + "/test/"
    print("\n", "Creating manifests...")
    # The train manifest is pruned to the requested duration window.
    create_manifest(
        train_path,
        "an4_train_manifest.csv",
        args.min_duration,
        args.max_duration,
    )
    create_manifest(test_path, "an4_val_manifest.csv")
def test_run_media_linker_during_adapter(self):
    """Media linking happens for both file and string reads by default and
    is suppressed when media_linker_name=None."""
    active_manifest = otio.plugins.ActiveManifest()
    tmp_manifest = utils.create_manifest()
    # this wires up the media linkers into the active manifest
    active_manifest.media_linkers.extend(tmp_manifest.media_linkers)

    linked = self.adp.read_from_file("foo", media_linker_name="example")
    self.assertTrue(
        linked.tracks[0][0].media_reference.metadata.get('from_test_linker'))

    linked = self.adp.read_from_string("foo", media_linker_name="example")
    self.assertTrue(
        linked.tracks[0][0].media_reference.metadata.get('from_test_linker'))

    # explicitly turn the media_linker off
    unlinked = self.adp.read_from_file("foo", media_linker_name=None)
    self.assertIsNone(
        unlinked.tracks[0][0].media_reference.metadata.get('from_test_linker'))

    # Delete the temporary manifest
    utils.remove_manifest(tmp_manifest)
def main():
    """Download AN4 (unless already cached), format the train/val/test
    data, and write the three manifests."""
    an4_dir = 'an4/'
    archive_name = 'an4_raw.bigendian.tar.gz'
    # Reuse a previously downloaded archive when one is present.
    if not os.path.exists(archive_name):
        wget.download(
            'http://www.speech.cs.cmu.edu/databases/an4/an4_raw.bigendian.tar.gz'
        )
    tar = tarfile.open('an4_raw.bigendian.tar.gz')
    tar.extractall()
    os.makedirs(args.target_dir)
    _format_training_data(root_path=an4_dir)
    _format_test_data(root_path=an4_dir)
    shutil.rmtree(an4_dir)
    os.remove('an4_raw.bigendian.tar.gz')
    print('Creating manifests...')
    # train and val are duration-pruned; test keeps everything.
    manifest_specs = (
        (args.target_dir + '/train/', 'an4_train_manifest.csv', True),
        (args.target_dir + '/val/', 'an4_val_manifest.csv', True),
        (args.target_dir + '/test/', 'an4_test_manifest.csv', False),
    )
    for split_path, manifest_name, prune in manifest_specs:
        if prune:
            create_manifest(split_path, manifest_name,
                            args.min_duration, args.max_duration)
        else:
            create_manifest(split_path, manifest_name)
def main(): target_dl_dir = args.target_dir source_dl_dir = args.source_dir if not os.path.exists(target_dl_dir): os.makedirs(target_dl_dir) # prepare target dir target_train_dir = os.path.join(target_dl_dir, "train") if not os.path.exists(target_train_dir): os.makedirs(target_train_dir) target_val_dir = os.path.join(target_dl_dir, "dev") if not os.path.exists(target_val_dir): os.makedirs(target_val_dir) target_test_dir = os.path.join(target_dl_dir, "test") if not os.path.exists(target_test_dir): os.makedirs(target_test_dir) # source dir source_train = os.path.join(source_dl_dir, "train") source_val = os.path.join(source_dl_dir, "dev") source_test = os.path.join(source_dl_dir, "test") print " prepare data for train " prepare_dir(target_train_dir, source_train, data_type) print " prepare data for dev " prepare_dir(target_val_dir, source_val, data_type) print " prepare data for test " prepare_dir(target_test_dir, source_test, data_type) print('Creating manifests...') print " target_train_dir ", target_train_dir print " target_dl_dir ", target_dl_dir create_manifest(target_dl_dir, os.path.join(target_train_dir, "converted"), 'train') create_manifest(target_dl_dir, os.path.join(target_val_dir, "converted"), 'val') create_manifest(target_dl_dir, os.path.join(target_test_dir, "converted"), 'test')
def local_product_func(self,parameters):
    """Ingest a locally supplied product: unpack it, optionally run the
    S2 pre-processing script, build a manifest, run the product-add
    scripts, and report status/errors through the workflow manager.

    NOTE(review): Python 2 source (backtick repr syntax).  The original was
    collapsed onto one line, so the nesting of the final
    ``n_errors += self.run_scripts(...)`` relative to the
    ``if len(scripts) > 0`` block is reconstructed — confirm upstream.
    """
    if IE_DEBUG > 0:
        self._logger.info( "wfm: executing INGEST LOCAL PRODUCT, id=" +\
            `parameters["scenario_id"]`)
    percent = 1
    ncn_id = None
    n_errors = 0
    try:
        sc_id = parameters["scenario_id"]
        self._wfm.set_scenario_status( self._id, sc_id, 0, "LOCAL ING.: UNPACK", percent)
        # Record the worker pid so the scenario can be stopped externally.
        self._wfm.set_ingestion_pid(sc_id, os.getpid())
        ncn_id = parameters["ncn_id"].encode('ascii','ignore')
        data = parameters["data"]
        orig_data = None
        data = ie_unpack_maybe(parameters["dir_path"], data)
        if not data:
            raise IngestionError( "Error unpacking or accessing " +
                os.path.join(parameters["dir_path"]), data)
        if 'NO' != parameters["s2_preprocess"]:
            # Optional Sentinel-2 pre-processing stage.
            s2script_args = self.mk_s2pre_scriptandargs(
                parameters["s2_preprocess"],
                parameters["dir_path"],
                parameters["metadata"])
            if s2script_args:
                self._wfm.set_scenario_status( self._id, sc_id, 0, "LOCAL ING.: S2-PRE", percent)
                s2pre_errors = self.run_scripts(sc_id, ncn_id, s2script_args)
                if s2pre_errors > 0:
                    n_errors += s2pre_errors
                else:
                    # Pre-processing replaced the data file; keep the original.
                    orig_data = data
                    data = extract_outfile(s2script_args[0][3])
        mf_name = create_manifest( self._logger, ncn_id, parameters["dir_path"],
            metadata=parameters["metadata"], data=data, orig_data=orig_data )
        self._wfm.set_scenario_status( self._id, sc_id, 0, "RUNNING SCRIPTS", percent)
        scripts_args = []
        scripts = parameters["scripts"]
        if len(scripts) > 0:
            # Build the argument vector for the product-add script.
            resp_fname = mkFname("addProdResp_")
            dl_dir = parameters["dir_path"]
            resp_full_fname = os.path.join(dl_dir,resp_fname)
            ap_script = [scripts[0]]
            ap_script.append("-add")
            ap_script.append("-dldir="+dl_dir)
            ap_script.append("-response="+resp_fname)
            metadata=parameters["metadata"]
            if metadata is not None:
                ap_script.append("-meta="+get_base_fname(metadata))
            ap_script.append("-data="+get_base_fname(data))
            scripts_args.append(ap_script)
        n_errors += self.run_scripts(sc_id, ncn_id, scripts_args)
        if n_errors > 0:
            raise IngestionError("Number of errors " +`n_errors`)
        self._wfm.set_scenario_status(self._id, sc_id, 1, "IDLE", 0)
        self._logger.info("Local ingestion completed, dir: " + parameters["dir_path"])
    except StopRequest as e:
        self._logger.info(`ncn_id`+ ": Stop request from user: Local Ingestion Stopped")
        self._wfm.set_scenario_status(self._id, sc_id, 1, "IDLE", 0)
    except Exception as e:
        self._logger.error(`ncn_id`+" Error while ingesting local product: " + `e`)
        self._wfm.set_scenario_status(self._id, sc_id, 1, "INGEST ERROR", 0)
        if IE_DEBUG > 0:
            traceback.print_exc(12,sys.stdout)
    finally:
        # Always clear the recorded ingestion pid.
        self._wfm.set_ingestion_pid(sc_id, 0)
# NOTE(review): everything above the __main__ guard is the tail of a
# conversion helper whose ``def`` line is outside this excerpt; the
# leading indentation below is a best-effort reconstruction.
                continue
            utterance = transcriptions[transcription_key]
            # print(utterance)
            target_wav_file = os.path.join(
                wav_dir, "{}_{}.wav".format(recording_name, recording_id))
            target_txt_file = os.path.join(
                txt_dir, "{}_{}.txt".format(recording_name, recording_id))
            # Write the lower-cased transcript as UTF-8 bytes.
            with io.FileIO(target_txt_file, "w") as file:
                file.write(utterance.lower().encode('utf-8'))
            original_wav_file = os.path.join(recordings_dir, wav_file)
            # Resample to the requested rate, 16-bit mono, via sox.
            subprocess.call([
                "sox {} -r {} -b 16 -c 1 {}".format(
                    original_wav_file, str(args.sample_rate), target_wav_file)
            ], shell=True)
    shutil.rmtree(dirpath)


# NOTE(review): Python 2 script section (statement-form ``print``).
if __name__ == '__main__':
    target_dir = args.target_dir
    sample_rate = args.sample_rate
    input_dir = args.input_dir
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)
    # Walk the input tree and prepare every .tgz sample found.
    for root, root_dir_names, filenames in os.walk(input_dir):
        print root, root_dir_names, len(filenames)
        for filename in fnmatch.filter(filenames, '*.tgz'):
            prepare_sample(os.path.join(root, filename), target_dir)
    print('Creating manifests...')
    create_manifest(target_dir, 'voxforge_train')
def setUp(self):
    """Create a fresh temporary plugin manifest before each test."""
    self.man = utils.create_manifest()
def setUp(self):
    """Create a temporary manifest plus a media-linker object hydrated
    from the baseline JSON, recording where that baseline lives on disk."""
    self.man = utils.create_manifest()
    # Load the linker baseline as JSON text, then deserialize it.
    self.jsn = baseline_reader.json_baseline_as_string(LINKER_PATH)
    self.mln = otio.adapters.otio_json.read_from_string(self.jsn)
    baseline_path = os.path.join(
        baseline_reader.MODPATH,
        "baselines",
        LINKER_PATH,
    )
    self.mln._json_path = baseline_path
# NOTE(review): the lines above the __main__ guard are the tail of
# prepare_sample; its ``def`` line is outside this excerpt, so the leading
# indentation is a best-effort reconstruction.
        transcription_key = recording_name + "/mfc/" + recording_id
        if transcription_key not in transcriptions:
            continue
        utterance = transcriptions[transcription_key]
        target_wav_file = os.path.join(wav_dir, "{}_{}.wav".format(recording_name, recording_id))
        target_txt_file = os.path.join(txt_dir, "{}_{}.txt".format(recording_name, recording_id))
        # Transcript is written as raw UTF-8 bytes.
        with io.FileIO(target_txt_file, "w") as file:
            file.write(utterance.encode('utf-8'))
        original_wav_file = os.path.join(recordings_dir, wav_file)
        # Resample via sox to 16-bit mono at the requested rate.
        subprocess.call(["sox {} -r {} -b 16 -c 1 {}".format(original_wav_file, str(args.sample_rate), target_wav_file)], shell=True)
    shutil.rmtree(dirpath)


if __name__ == '__main__':
    target_dir = args.target_dir
    sample_rate = args.sample_rate
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)
    # Scrape the VoxForge index page for .tgz sample archives.
    request = urllib.request.Request(VOXFORGE_URL_16kHz)
    response = urllib.request.urlopen(request)
    content = response.read()
    all_files = re.findall("href\=\"(.*\.tgz)\"", content.decode("utf-8"))
    for f in tqdm(all_files, total=len(all_files)):
        prepare_sample(f.replace(".tgz", ""), VOXFORGE_URL_16kHz + '/' + f, target_dir)
    print('Creating manifests...')
    create_manifest(target_dir, 'voxforge_train_manifest.csv', args.min_duration, args.max_duration)
# NOTE(review): this excerpt begins mid-way through a subprocess.call in
# prepare_sample; the enclosing ``def`` is outside this view and the
# leading indentation is a best-effort reconstruction.
                    str(args.sample_rate),
                    target_wav_file,
                )
            ],
            shell=True,
        )
    shutil.rmtree(dirpath)


if __name__ == "__main__":
    target_dir = args.target_dir
    sample_rate = args.sample_rate
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)
    request = urllib.request.Request(VOXFORGE_URL_16kHz)
    response = urllib.request.urlopen(request)
    content = response.read()
    # Collect every .tgz link from the VoxForge index page.
    all_files = re.findall('href\="(.*\.tgz)"', content.decode("utf-8"))
    for f in tqdm(all_files, total=len(all_files)):
        prepare_sample(f.replace(".tgz", ""), VOXFORGE_URL_16kHz + f, target_dir)
    print("Creating manifests...")
    create_manifest(
        target_dir,
        "voxforge_train_manifest.csv",
        args.min_duration,
        args.max_duration,
    )
# NOTE(review): the lines above the __main__ guard are the tail of
# prepare_sample; its ``def`` line is outside this excerpt, so the leading
# indentation is a best-effort reconstruction.
        transcription_key = recording_name + "/mfc/" + recording_id
        if transcription_key not in transcriptions:
            continue
        utterance = transcriptions[transcription_key]
        target_wav_file = os.path.join(wav_dir, "{}_{}.wav".format(recording_name, recording_id))
        target_txt_file = os.path.join(txt_dir, "{}_{}.txt".format(recording_name, recording_id))
        # Transcript is written as raw UTF-8 bytes.
        with io.FileIO(target_txt_file, "w") as file:
            file.write(utterance.encode('utf-8'))
        original_wav_file = os.path.join(recordings_dir, wav_file)
        # Resample via sox to 16-bit mono at the requested rate.
        subprocess.call(["sox {} -r {} -b 16 -c 1 {}".format(original_wav_file, str(args.sample_rate), target_wav_file)], shell=True)
    shutil.rmtree(dirpath)


if __name__ == '__main__':
    target_dir = args.target_dir
    sample_rate = args.sample_rate
    if not os.path.isdir(target_dir):
        os.makedirs(target_dir)
    # Scrape the VoxForge index page for .tgz sample archives.
    request = urllib.request.Request(VOXFORGE_URL_16kHz)
    response = urllib.request.urlopen(request)
    content = response.read()
    all_files = re.findall("href\=\"(.*\.tgz)\"", content.decode("utf-8"))
    for f in tqdm(all_files, total=len(all_files)):
        prepare_sample(f.replace(".tgz", ""), VOXFORGE_URL_16kHz + f, target_dir)
    print('Creating manifests...')
    create_manifest(target_dir, 'voxforge_train_manifest.csv', args.min_duration, args.max_duration)
def setUp(self):
    """Build a temporary plugin manifest used by the tests in this case."""
    self.man = utils.create_manifest()