def tatoeba_loader(keep_archive):
    """Download, extract and convert the Tatoeba archive.

    Then build the `<dataset_name>_train.csv` file. Currently only a training
    CSV is generated for this corpus.

    Args:
        keep_archive (bool): Keep or delete the downloaded archive afterwards.

    Returns:
        str: Path of the created CSV file.
    """
    # Download and extract the dataset if necessary.
    download.maybe_download(__URL, md5=__MD5, cache_archive=keep_archive)
    if not os.path.isdir(__SOURCE_PATH):
        raise ValueError('"{}" is not a directory.'.format(__SOURCE_PATH))

    # Download the user ratings CSV file.
    csv_path = os.path.join(__SOURCE_PATH, 'users_sentences.csv')
    download.download_with_progress(
        'http://downloads.tatoeba.org/exports/users_sentences.csv', csv_path)
    assert os.path.exists(csv_path)

    target = 'train'
    # Generate the WAV files and the entries for the `<target>.csv` file.
    output = __tatoeba_loader(target)
    # Generate the `<target>.csv` file.
    csv_path = generate_csv(__NAME, target, output)

    # Cleanup extracted folder.
    download.cleanup_cache(__FOLDER_NAME)

    return csv_path
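
# A minimal sketch of how the downloaded ratings export could be consumed by
# `__tatoeba_loader`. The helper name, the tab-separated layout, and the
# column order (username, sentence_id, rating) are assumptions about the
# Tatoeba export, not guarantees made by this module.
def _load_positive_sentence_ids(ratings_csv_path):
    """Return the set of sentence IDs that received a non-negative rating."""
    import csv  # Local import to keep the sketch self-contained.

    sentence_ids = set()
    with open(ratings_csv_path, 'r', encoding='utf-8') as csv_file:
        reader = csv.reader(csv_file, delimiter='\t')
        for row in reader:
            if len(row) < 3:
                continue  # Skip malformed lines.
            sentence_id, rating = row[1], row[2]
            try:
                if int(rating) >= 0:
                    sentence_ids.add(sentence_id)
            except ValueError:
                continue  # Skip rows without a numeric rating.
    return sentence_ids
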
def tedlium_loader(keep_archive):
    """Download, extract and convert the TEDLIUM archive.

    Then build all possible CSV files (e.g. `<dataset_name>_train.csv`,
    `<dataset_name>_test.csv`).

    Requires lots of disk space, since the original format (SPH) is first
    converted to WAV and then split up into parts.

    Args:
        keep_archive (bool): Keep or delete the downloaded archive afterwards.

    Returns:
        List[str]: List containing the created CSV file paths.
    """
    # Download and extract the dataset if necessary.
    download.maybe_download(__URL, md5=__MD5, cache_archive=keep_archive)
    if not os.path.isdir(__SOURCE_PATH):
        raise ValueError('"{}" is not a directory.'.format(__SOURCE_PATH))

    # Folders for each target.
    targets = [
        {'name': 'train', 'folder': 'train'},
        {'name': 'test', 'folder': 'test'},
        {'name': 'dev', 'folder': 'dev'}
    ]

    csv_paths = []
    for target in targets:
        # Create target folder if necessary.
        target_directory = os.path.join(__TARGET_PATH, target['folder'], 'sph')
        if not os.path.exists(target_directory):
            os.makedirs(target_directory)

        # Generate the WAV files and the entries for the `<target>.csv` file.
        source_directory = os.path.join(__SOURCE_PATH, target['folder'])
        output = __tedlium_loader(source_directory)
        # Generate the `<target>.csv` file.
        csv_paths.append(generate_csv(__NAME, target['name'], output))

    # Cleanup extracted folder.
    download.cleanup_cache(__FOLDER_NAME)

    return csv_paths
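
# The docstring above mentions that the original SPH recordings are converted
# to WAV. A minimal sketch of such a conversion, assuming the `sox` command
# line tool is installed and supports the NIST SPHERE format; the helper name
# is hypothetical and the actual conversion and splitting happens inside
# `__tedlium_loader`.
def _sph_to_wav(sph_path, wav_path):
    """Convert a single SPH file to a 16 kHz mono WAV file using sox."""
    import subprocess  # Local import to keep the sketch self-contained.

    subprocess.run(
        ['sox', sph_path, '-r', '16000', '-c', '1', wav_path],
        check=True)
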
def common_voice_loader(keep_archive):
    """Download, extract and convert the Common Voice archive.

    Then build all possible CSV files (e.g. `<dataset_name>_train.csv`,
    `<dataset_name>_test.csv`).

    Uses only the valid datasets; additional constraints are:
    * Downvotes may be at most 1/4 of the upvotes.
    * Valid accents are: 'us', 'england', 'canada', 'australia'.
    * Samples with only 1 upvote are accepted at the moment.

    Args:
        keep_archive (bool): Keep or delete the downloaded archive afterwards.

    Returns:
        List[str]: List containing the created CSV file paths.
    """
    # Download and extract the dataset if necessary.
    download.maybe_download(__URL, md5=__MD5, cache_archive=keep_archive)
    if not os.path.isdir(__SOURCE_PATH):
        raise ValueError('"{}" is not a directory.'.format(__SOURCE_PATH))

    # Folders for each target.
    targets = [
        {'name': 'train', 'folders': ['cv-valid-train']},
        {'name': 'test', 'folders': ['cv-valid-test']},
        {'name': 'dev', 'folders': ['cv-valid-dev']}
    ]

    csv_paths = []
    for target in targets:
        # Generate the paths and labels for the `<target>.csv` file.
        output = __common_voice_loader(target['folders'])
        # Generate the `<target>.csv` file.
        csv_paths.append(generate_csv(__NAME, target['name'], output))

    # Cleanup extracted folder.
    download.cleanup_cache(__FOLDER_NAME)

    return csv_paths
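
# A minimal sketch of the sample filter described in the docstring above,
# assuming per-sample `up_votes`, `down_votes` and `accent` fields as found in
# the Common Voice metadata CSVs; the helper and constant names are
# hypothetical, and the real filtering is done inside `__common_voice_loader`.
__VALID_ACCENTS = ('us', 'england', 'canada', 'australia')

def _is_valid_common_voice_sample(up_votes, down_votes, accent):
    """Apply the upvote/downvote ratio and accent constraints to one sample."""
    if up_votes < 1:
        return False
    # Downvotes may be at most 1/4 of the upvotes.
    if down_votes > up_votes / 4.0:
        return False
    return accent in __VALID_ACCENTS
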
def libri_speech_loader(keep_archive):
    """Download, extract and convert the LibriSpeech archives.

    Then build all possible CSV files (e.g. `<dataset_name>_train.csv`,
    `<dataset_name>_test.csv`).

    Args:
        keep_archive (bool): Keep or delete the downloaded archives afterwards.

    Returns:
        List[str]: List containing the created CSV file paths.
    """
    # Download and extract the dataset if necessary.
    download.maybe_download_batch(__URLS, md5s=__MD5S,
                                  cache_archives=keep_archive)
    if not os.path.isdir(__SOURCE_PATH):
        raise ValueError('"{}" is not a directory.'.format(__SOURCE_PATH))

    # Folders for each target.
    targets = [
        {'name': 'train', 'folders': ['train-clean-100', 'train-clean-360']},
        {'name': 'test', 'folders': ['test-clean']},
        {'name': 'dev', 'folders': ['dev-clean']}
    ]

    csv_paths = []
    for target in targets:
        # Generate the WAV files and the entries for the `<target>.csv` file.
        output = __libri_speech_loader(target['folders'])
        # Generate the `<target>.csv` file.
        csv_paths.append(generate_csv(__NAME, target['name'], output))

    # Cleanup extracted folder.
    download.cleanup_cache(__FOLDER_NAME)

    return csv_paths
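
# Example usage, assuming the module is executed directly. The loaders
# download the archives on first use, so plenty of disk space is required.
if __name__ == '__main__':
    created_csv_files = libri_speech_loader(keep_archive=False)
    for path in created_csv_files:
        print('Created CSV file: {}'.format(path))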