def main():
    """Fetch and unpack Common Voice if needed, convert clips, build manifests.

    Everything is driven by the ``args`` namespace from the enclosing scope:
    ``target_dir``, ``language_dir``, ``files_to_process``, ``manifest_dir``,
    ``min_duration``, ``max_duration`` and ``num_workers``.

    The download and extraction are skipped entirely when the
    ``CV_unpacked`` folder already exists under ``target_dir``.
    """
    target_dir = args.target_dir
    language_dir = args.language_dir
    os.makedirs(target_dir, exist_ok=True)

    target_unpacked_dir = os.path.join(target_dir, "CV_unpacked")

    if os.path.exists(target_unpacked_dir):
        print('Find existing folder {}'.format(target_unpacked_dir))
    else:
        print("Could not find Common Voice, Downloading corpus...")
        filename = wget.download(COMMON_VOICE_URL, target_dir)
        target_file = os.path.join(target_dir, os.path.basename(filename))

        os.makedirs(target_unpacked_dir, exist_ok=True)
        print("Unpacking corpus to {} ...".format(target_unpacked_dir))
        # Context manager so the archive handle is released even when
        # extraction fails part-way through.
        with tarfile.open(target_file) as tar:
            tar.extractall(target_unpacked_dir)

    folder_path = os.path.join(target_unpacked_dir,
                               VERSION + '/{}/'.format(language_dir))

    # Each CSV's stem doubles as its output sub-directory and manifest tag.
    stems = [os.path.splitext(csv_file)[0] for csv_file in args.files_to_process]

    for csv_file, stem in zip(args.files_to_process, stems):
        convert_to_wav(csv_file=os.path.join(folder_path, csv_file),
                       target_dir=os.path.join(target_dir, stem),
                       num_workers=args.num_workers)

    print('Creating manifests...')
    for stem in stems:
        create_manifest(data_path=os.path.join(target_dir, stem),
                        output_name='commonvoice_' + stem + '_manifest.json',
                        manifest_path=args.manifest_dir,
                        min_duration=args.min_duration,
                        max_duration=args.max_duration,
                        num_workers=args.num_workers)
def main():
    """Download AN4, convert it into train/val/test folders, build manifests.

    Uses ``args.target_dir``, ``args.manifest_dir``, ``args.min_duration``
    and ``args.max_duration`` from the enclosing scope. The archive is
    downloaded into, and extracted in, the current working directory; both
    the extracted ``an4/`` tree and the archive are removed afterwards.
    """
    root_path = 'an4/'
    raw_tar_path = 'an4_raw.bigendian.tar.gz'
    if not os.path.exists(raw_tar_path):
        wget.download('http://www.speech.cs.cmu.edu/databases/an4/an4_raw.bigendian.tar.gz')

    # The handle was previously never closed; the context manager releases
    # it before the archive file is deleted below.
    with tarfile.open(raw_tar_path) as tar:
        tar.extractall()

    os.makedirs(args.target_dir, exist_ok=True)
    _format_training_data(root_path=root_path)
    _format_test_data(root_path=root_path)

    # Clean up the raw extraction and the downloaded archive.
    shutil.rmtree(root_path)
    os.remove(raw_tar_path)

    train_path = args.target_dir + '/train/'
    val_path = args.target_dir + '/val/'
    test_path = args.target_dir + '/test/'

    print('Creating manifests...')
    create_manifest(data_path=train_path,
                    output_name='an4_train_manifest.csv',
                    manifest_path=args.manifest_dir,
                    min_duration=args.min_duration,
                    max_duration=args.max_duration)
    create_manifest(data_path=val_path,
                    output_name='an4_val_manifest.csv',
                    manifest_path=args.manifest_dir,
                    min_duration=args.min_duration,
                    max_duration=args.max_duration)
    # The test manifest is built without duration bounds.
    create_manifest(data_path=test_path,
                    output_name='an4_test_manifest.csv',
                    manifest_path=args.manifest_dir)
def main():
    """Unpack the zh-CN Common Voice archive, convert clips, build manifests.

    Uses ``args.target_dir``, ``args.tar_path``, ``args.files_to_process``
    (a comma-separated string), ``args.manifest_dir``, ``args.min_duration``
    and ``args.max_duration`` from the enclosing scope. When ``args.tar_path``
    is unset or does not exist, the corpus is downloaded into ``target_dir``.
    """
    target_dir = args.target_dir
    os.makedirs(target_dir, exist_ok=True)

    target_unpacked_dir = os.path.join(target_dir, "CV_unpacked")
    os.makedirs(target_unpacked_dir, exist_ok=True)

    if args.tar_path and os.path.exists(args.tar_path):
        print('Find existing file {}'.format(args.tar_path))
        target_file = args.tar_path
    else:
        print(
            "Could not find downloaded Common Voice archive, Downloading corpus..."
        )
        filename = wget.download(COMMON_VOICE_URL, target_dir)
        target_file = os.path.join(target_dir, os.path.basename(filename))

    print("Unpacking corpus to {} ...".format(target_unpacked_dir))
    # Context manager so the archive handle is closed even if extraction fails.
    with tarfile.open(target_file) as tar:
        tar.extractall(target_unpacked_dir)

    # Split the comma-separated list once and reuse it for both passes.
    csv_files = args.files_to_process.split(',')

    for csv_file in csv_files:
        convert_to_wav(
            os.path.join(target_unpacked_dir, 'cv-corpus-5-2020-06-22/zh-CN/',
                         csv_file),
            os.path.join(target_dir, os.path.splitext(csv_file)[0]))

    print('Creating manifests...')
    for csv_file in csv_files:
        stem = os.path.splitext(csv_file)[0]
        create_manifest(data_path=os.path.join(target_dir, stem),
                        output_name=stem + '_manifest.csv',
                        manifest_path=args.manifest_dir,
                        min_duration=args.min_duration,
                        max_duration=args.max_duration)
def main():
    """Build a DataFrame or a manifest for ``args.source_dir``, optionally split.

    In DataFrame mode (``args.dataframe`` truthy) only ``create_dataframe``
    is called. Otherwise a manifest is created and, when ``args.split`` is
    given as three values (train, val, test), the manifest CSV is divided
    into ``train_*.csv``, ``val_*.csv`` and, if the test fraction is
    positive, ``test_*.csv`` inside ``args.manifest_dir``.

    Raises:
        NotADirectoryError: if ``args.source_dir`` does not exist.
    """
    data_root = args.source_dir
    if not os.path.exists(data_root):
        raise NotADirectoryError(f"Directory does not exist: {data_root}")

    if args.dataframe:
        print("Creating DataFrame")
        create_dataframe(
            data_path=data_root,
            output_name=f"{args.manifest_name}.csv",
            manifest_path=args.manifest_dir,
        )
        return

    print("Creating manifest")
    manifest = create_manifest(
        data_path=data_root,
        output_name=f"{args.manifest_name}.csv",
        manifest_path=args.manifest_dir,
        min_duration=args.min_duration,
        max_duration=args.max_duration,
    )

    # NOTE(review): the split is handled in manifest mode only, since that
    # is the only branch that binds `manifest` -- confirm this matches the
    # intended nesting.
    if args.split is None:
        return

    # Unpacking enforces exactly three values; the train fraction itself is
    # not used below.
    _train_frac, val_frac, test_frac = (float(part) for part in args.split)

    records = pd.read_csv(manifest)
    out_dir = Path(args.manifest_dir)
    print(f"Total size of manifest is {len(records)} records.")

    remainder = records
    test_part = None
    if test_frac > 0:
        remainder, test_part = train_test_split(records, test_size=test_frac)

    # The validation fraction is applied to whatever is left after the test
    # split, not to the full manifest.
    train_part, val_part = train_test_split(remainder, test_size=val_frac)

    if test_part is not None:
        test_part.to_csv(out_dir / f"test_{args.manifest_name}.csv", index=False)
    train_part.to_csv(out_dir / f"train_{args.manifest_name}.csv", index=False)
    val_part.to_csv(out_dir / f"val_{args.manifest_name}.csv", index=False)

    print(f"Train size is {len(train_part)}")
    print(f"Val size is {len(val_part)}")
    print(f"Test size is {0 if test_part is None else len(test_part)}")
def main():
    """Fetch and unpack TEDLIUM release 2 if needed, prepare splits, build manifests.

    Uses ``args.target_dir``, ``args.tar_path``, ``args.manifest_dir``,
    ``args.min_duration``, ``args.max_duration`` and ``args.num_workers``
    from the enclosing scope.

    The archive is only located (or downloaded) when the
    ``TEDLIUM_release2`` directory is missing, so an already-unpacked corpus
    no longer triggers a fresh download.
    """
    target_dl_dir = args.target_dir
    # exist_ok avoids the check-then-create race of the manual exists() test.
    os.makedirs(target_dl_dir, exist_ok=True)

    target_unpacked_dir = os.path.join(target_dl_dir, "TEDLIUM_release2")

    if os.path.exists(target_unpacked_dir):
        print("Found TEDLIUM directory, skipping unpacking of tar files")
    else:
        if args.tar_path and os.path.exists(args.tar_path):
            target_file = args.tar_path
        else:
            print(
                "Could not find downloaded TEDLIUM archive, Downloading corpus...")
            wget.download(TED_LIUM_V2_DL_URL, target_dl_dir)
            target_file = os.path.join(target_dl_dir, "TEDLIUM_release2.tar.gz")

        print("Unpacking corpus...")
        # Context manager so the archive handle is closed even on failure.
        with tarfile.open(target_file) as tar:
            tar.extractall(target_dl_dir)

    train_ted_dir = os.path.join(target_unpacked_dir, "train")
    val_ted_dir = os.path.join(target_unpacked_dir, "dev")
    test_ted_dir = os.path.join(target_unpacked_dir, "test")

    prepare_dir(train_ted_dir)
    prepare_dir(val_ted_dir)
    prepare_dir(test_ted_dir)

    print('Creating manifests...')
    # Only the training manifest is given duration bounds.
    create_manifest(data_path=train_ted_dir,
                    output_name='ted_train_manifest.json',
                    manifest_path=args.manifest_dir,
                    min_duration=args.min_duration,
                    max_duration=args.max_duration,
                    num_workers=args.num_workers)
    create_manifest(data_path=val_ted_dir,
                    output_name='ted_val_manifest.json',
                    manifest_path=args.manifest_dir,
                    num_workers=args.num_workers)
    create_manifest(data_path=test_ted_dir,
                    output_name='ted_test_manifest.json',
                    manifest_path=args.manifest_dir,
                    num_workers=args.num_workers)
def download_an4(target_dir: str,
                 manifest_dir: str,
                 min_duration: float,
                 max_duration: float,
                 val_fraction: float,
                 sample_rate: int,
                 num_workers: int):
    """Download AN4, convert it into ``target_dir`` and write three manifests.

    The archive is downloaded into, and extracted in, the current working
    directory; the extracted ``an4/`` tree and the archive are both removed
    once the data has been formatted.

    Args:
        target_dir: Directory that receives the ``train``, ``val`` and
            ``test`` sub-folders.
        manifest_dir: Passed to ``create_manifest`` as ``manifest_path``.
        min_duration: Lower duration bound for the train and val manifests.
        max_duration: Upper duration bound for the train and val manifests.
        val_fraction: Forwarded to ``_format_training_data``.
        sample_rate: Forwarded to both formatting helpers.
        num_workers: Forwarded to every ``create_manifest`` call.
    """
    root_path = 'an4/'
    raw_tar_path = 'an4_raw.bigendian.tar.gz'
    if not os.path.exists(raw_tar_path):
        wget.download(
            'http://www.speech.cs.cmu.edu/databases/an4/an4_raw.bigendian.tar.gz'
        )

    # The handle was previously never closed; release it before the archive
    # file is deleted below.
    with tarfile.open(raw_tar_path) as tar:
        tar.extractall()

    os.makedirs(target_dir, exist_ok=True)
    _format_training_data(root_path=root_path,
                          val_fraction=val_fraction,
                          sample_rate=sample_rate,
                          target_dir=target_dir)
    _format_test_data(root_path=root_path,
                      sample_rate=sample_rate,
                      target_dir=target_dir)

    # Clean up the raw extraction and the downloaded archive.
    shutil.rmtree(root_path)
    os.remove(raw_tar_path)

    train_path = target_dir + '/train/'
    val_path = target_dir + '/val/'
    test_path = target_dir + '/test/'

    print('Creating manifests...')
    create_manifest(data_path=train_path,
                    output_name='an4_train_manifest.json',
                    manifest_path=manifest_dir,
                    min_duration=min_duration,
                    max_duration=max_duration,
                    num_workers=num_workers)
    create_manifest(data_path=val_path,
                    output_name='an4_val_manifest.json',
                    manifest_path=manifest_dir,
                    min_duration=min_duration,
                    max_duration=max_duration,
                    num_workers=num_workers)
    # The test manifest is built without duration bounds.
    create_manifest(data_path=test_path,
                    output_name='an4_test_manifest.json',
                    manifest_path=manifest_dir,
                    num_workers=num_workers)
def main():
    """Download the selected LibriSpeech archives, convert them, build manifests.

    Uses ``args.target_dir``, ``args.files_to_use`` (comma-separated
    substrings matched against each URL), ``args.manifest_dir``,
    ``args.min_duration``, ``args.max_duration`` and ``args.num_workers``
    from the enclosing scope.

    For every split in ``LIBRI_SPEECH_URLS`` a ``wav`` and a ``txt`` folder
    are created, each selected archive is downloaded (unless already
    present), extracted, converted via ``_process_file`` and then deleted,
    and finally one manifest per split is written.
    """
    target_dl_dir = args.target_dir
    # exist_ok replaces the check-then-create pattern and its race.
    os.makedirs(target_dl_dir, exist_ok=True)

    files_to_dl = args.files_to_use.strip().split(',')

    for split_type, lst_libri_urls in LIBRI_SPEECH_URLS.items():
        split_dir = os.path.join(target_dl_dir, split_type)
        split_wav_dir = os.path.join(split_dir, "wav")
        split_txt_dir = os.path.join(split_dir, "txt")
        for directory in (split_dir, split_wav_dir, split_txt_dir):
            os.makedirs(directory, exist_ok=True)

        # Start from a clean extraction folder in case a previous run left one.
        extracted_dir = os.path.join(split_dir, "LibriSpeech")
        if os.path.exists(extracted_dir):
            shutil.rmtree(extracted_dir)

        for url in lst_libri_urls:
            # Only handle URLs that contain one of the requested names.
            if not any(wanted in url for wanted in files_to_dl):
                print("Skipping url: {}".format(url))
                continue

            filename = url.split("/")[-1]
            target_filename = os.path.join(split_dir, filename)
            if not os.path.exists(target_filename):
                wget.download(url, split_dir)

            print("Unpacking {}...".format(filename))
            # Context manager so the handle is closed before the archive is
            # removed, even if extraction fails.
            with tarfile.open(target_filename) as tar:
                tar.extractall(split_dir)
            os.remove(target_filename)

            print("Converting flac files to wav and extracting transcripts...")
            assert os.path.exists(
                extracted_dir
            ), "Archive {} was not properly uncompressed.".format(filename)
            for root, _subdirs, files in tqdm(os.walk(extracted_dir)):
                for f in files:
                    if ".flac" in f:
                        _process_file(wav_dir=split_wav_dir,
                                      txt_dir=split_txt_dir,
                                      base_filename=f,
                                      root_dir=root)

            print("Finished {}".format(url))
            shutil.rmtree(extracted_dir)

        manifest_kwargs = dict(data_path=split_dir,
                               output_name='libri_' + split_type + '_manifest.json',
                               manifest_path=args.manifest_dir,
                               num_workers=args.num_workers)
        if split_type == 'train':
            # Prune to min/max duration (training split only).
            manifest_kwargs.update(min_duration=args.min_duration,
                                   max_duration=args.max_duration)
        create_manifest(**manifest_kwargs)
with io.FileIO(target_txt_file, "w") as file: file.write(utterance.encode('utf-8')) original_wav_file = os.path.join(recordings_dir, wav_file) subprocess.call(["sox {} -r {} -b 16 -c 1 {}".format(original_wav_file, str(args.sample_rate), target_wav_file)], shell=True) shutil.rmtree(dirpath) if __name__ == '__main__': target_dir = args.target_dir sample_rate = args.sample_rate if not os.path.isdir(target_dir): os.makedirs(target_dir) request = urllib.request.Request(VOXFORGE_URL_16kHz) response = urllib.request.urlopen(request) content = response.read() all_files = re.findall("href\=\"(.*\.tgz)\"", content.decode("utf-8")) for f in tqdm(all_files, total=len(all_files)): prepare_sample(f.replace(".tgz", ""), VOXFORGE_URL_16kHz + f, target_dir) print('Creating manifests...') create_manifest( data_path=target_dir, output_name='voxforge_train_manifest.json', manifest_path=args.manifest_dir, min_duration=args.min_duration, max_duration=args.max_duration, num_workers=args.num_workers )
target_wav_file = os.path.join(wav_dir, "{}_{}.wav".format(recording_name, recording_id)) target_txt_file = os.path.join(txt_dir, "{}_{}.txt".format(recording_name, recording_id)) with io.FileIO(target_txt_file, "w") as file: file.write(utterance.encode('utf-8')) original_wav_file = os.path.join(recordings_dir, wav_file) subprocess.call(["sox {} -r {} -b 16 -c 1 {}".format(original_wav_file, str(args.sample_rate), target_wav_file)], shell=True) shutil.rmtree(dirpath) if __name__ == '__main__': target_dir = args.target_dir sample_rate = args.sample_rate if not os.path.isdir(target_dir): os.makedirs(target_dir) request = urllib.request.Request(VOXFORGE_URL_16kHz) response = urllib.request.urlopen(request) content = response.read() all_files = re.findall("href\=\"(.*\.tgz)\"", content.decode("utf-8")) for f in tqdm(all_files, total=len(all_files)): prepare_sample(f.replace(".tgz", ""), VOXFORGE_URL_16kHz + f, target_dir) print('Creating manifests...') create_manifest(data_path=target_dir, output_name='voxforge_train_manifest.csv', manifest_path=args.manifest_dir, min_duration=args.min_duration, max_duration=args.max_duration)