def _write_part_to_wav(wav_data, path, start, end, sr=16000):
    """Write the slice of `wav_data` between `start` and `end` seconds to a WAV file.

    Any existing file at `path` is deleted first.

    Args:
        wav_data (np.ndarray): Raw audio samples.
        path (str): Target path for the WAV file.
        start (float): Start offset in seconds; must satisfy `0 <= start < duration`.
        end (float): End offset in seconds; must satisfy `start < end <= duration`.
        sr (int): Sample rate in Hz. Defaults to 16000.

    Raises:
        ValueError: If `start` or `end` lies outside the audio's duration.

    Returns:
        Nothing.
    """
    duration = len(wav_data) / sr
    # Validate with explicit raises instead of `assert`: asserts are stripped
    # when Python runs with `-O`, silently disabling these checks.
    if not 0. <= start < duration:
        raise ValueError('start={} out of range [0, {})'.format(start, duration))
    if not start < end <= duration:
        raise ValueError('end={} out of range ({}, {}]'.format(end, start, duration))

    delete_file_if_exists(path)
    wavfile.write(path, sr,
                  wav_data[_seconds_to_sample(start, True):_seconds_to_sample(end, False)])
def maybe_download(url, md5=None, cache_archive=True):
    """Download an archive file if it's not cached. The archive gets extracted afterwards.

    It is advised to call `cleanup_cache()` after pre-processing to remove the cached
    extracted folder. Currently only TAR and ZIP files are supported.

    Args:
        url (str): URL for dataset download.
        md5 (str): Checksum for optional integrity check or `None`.
        cache_archive (bool): `True` if the downloaded archive should be kept,
            `False` if it should be deleted.

    Raises:
        ValueError: If the md5 checksum does not match, or if the archive uses an
            unsupported compression method.

    Returns:
        Nothing.
    """
    file_name = os.path.basename(urlparse(url).path)
    storage_path = os.path.join(CACHE_DIR, file_name)

    # Download archive if necessary.
    if not os.path.isfile(storage_path):
        download_with_progress(url, storage_path)
    else:
        print('Using cached archive: {}'.format(storage_path))

    # Optional md5 integrity check. Raise instead of assert, since asserts are
    # silently stripped under `python -O` and this guards data integrity.
    if md5:
        md5sum = storage.md5(storage_path)
        if md5 != md5sum:
            raise ValueError(
                'Checksum does not match: expected {}, got {}'.format(md5, md5sum))

    # Extract archive to cache directory.
    print('Starting extraction of: {}'.format(storage_path))
    if tarfile.is_tarfile(storage_path):
        storage.tar_extract_all(storage_path, CACHE_DIR)
    elif zipfile.is_zipfile(storage_path):
        with zipfile.ZipFile(storage_path, 'r') as zip_:
            zip_.extractall(CACHE_DIR)
    else:
        raise ValueError('Compression method not supported: {}'.format(storage_path))
    print('Completed extraction of: {}'.format(storage_path))

    # Delete cached archive if requested.
    if not cache_archive:
        storage.delete_file_if_exists(storage_path)
        print('Cache file "{}" deleted.'.format(storage_path))
def __tatoeba_loader_helper(sample):
    """Convert one Tatoeba MP3 sample to a 16kHz mono WAV and build its TXT line.

    Intended as a worker for a thread/process pool.

    Args:
        sample (dict): Entry with keys 'path' (audio path without extension)
            and 'text' (transcription label).

    Returns:
        str: Line of the form '<wav_path_relative_to_corpus> <label>\\n',
            or `None` if the sample is missing, too small, or its duration is
            outside [MIN_EXAMPLE_LENGTH, MAX_EXAMPLE_LENGTH].

    Raises:
        RuntimeError: If the sox conversion produced no WAV file.
        ValueError: If the WAV file is still unreadable after 5 attempts.
    """
    path = sample['path']
    text = sample['text']
    mp3_path = '{}.mp3'.format(path)
    wav_path = '{}.wav'.format(path)
    # Mirror the source tree layout underneath the target directory.
    wav_path = os.path.join(__TARGET_PATH, os.path.relpath(wav_path, __SOURCE_PATH))

    # Check if audio file MP3 exists.
    if not os.path.isfile(mp3_path):
        # print('WARN: Audio file missing: {}'.format(mp3_path))
        return None

    # Check if file isn't empty.
    # NOTE(review): 4048 appears to be a "too small to contain real audio"
    # threshold (possibly a typo for 4096) — confirm intended value.
    try:
        if os.path.getsize(mp3_path) <= 4048:
            return None
    except OSError:
        return None

    delete_file_if_exists(wav_path)
    # Convert MP3 file into WAV file, reduce volume to 0.95, downsample to 16kHz mono sound.
    ret = subprocess.call(['sox', '-v', '0.95', mp3_path, '-r', '16k', wav_path, 'remix', '1'])
    if not os.path.isfile(wav_path):
        raise RuntimeError('Failed to create WAV file with error code={}: {}'.format(ret, wav_path))

    # Validate that the example length is within boundaries.
    # Retry up to 5 times: the freshly written WAV may not be fully flushed yet,
    # in which case `wavfile.read` raises ValueError. Re-raise on the last try.
    for i in range(5):
        try:
            (sr, y) = wavfile.read(wav_path)
            length_sec = len(y) / sr
            if not MIN_EXAMPLE_LENGTH <= length_sec <= MAX_EXAMPLE_LENGTH:
                return None
            break
        except ValueError:
            print('WARN: Could not load ({}/5) wavfile: {}'.format(i, wav_path))
            if i == 4:
                raise
            time.sleep(1)

    # TODO: Copy used files to corpus dir
    wav_path = os.path.relpath(wav_path, CORPUS_DIR)
    return '{} {}\n'.format(wav_path, text.strip())
def generate_txt(dataset_name, target, output):
    """Generate *.txt files containing the audio path and the corresponding sentence.

    Generated files are being stored at `TXT_TARGET_PATH`.

    Args:
        dataset_name (str): Name of the dataset, e.g. 'libri_speech'.
        target (str): Target name, e.g. 'train', 'test', 'dev'
        output (str): String containing the content for the
            `<dataset_name>_<target>.txt` file.

    Returns:
        str: Path to the created TXT file.
    """
    txt_name = '{}_{}.txt'.format(dataset_name, target)
    target_txt_path = os.path.join(TXT_DIR, txt_name)
    print('Starting to generate: {}'.format(os.path.basename(target_txt_path)))

    # Remove illegal characters from labels.
    cleaned = _remove_illegal_characters(output)

    # Keep only entries whose label (everything after the first space)
    # is at least 2 characters long once stripped.
    kept = [entry for entry in cleaned
            if len(entry.split(' ', 1)[-1].strip()) >= 2]

    print('> Writing {} lines of {} files to {}'.format(
        len(kept), target, target_txt_path))

    # Replace any previously generated file, then write the new content.
    storage.delete_file_if_exists(target_txt_path)
    with open(target_txt_path, 'w') as f:
        f.writelines(kept)

    return target_txt_path
def __common_voice_loader_helper(line):
    """Process one Common Voice CSV row: convert MP3 to WAV and build its TXT line.

    Helper method for thread pool.

    Args:
        line (list): CSV row; index 0 is the MP3 path, 1 the label text,
            2/3 up-/down-vote counts, 6 the speaker accent.

    Returns:
        str: '<wav_path_relative_to_corpus> <label>\\n' line, or `None` if the
            row fails any of the quality filters or length boundaries.
    """
    # Cleanup label text.
    # NOTE(review): `replace(' ', ' ')` is a no-op as written — likely intended
    # to collapse double spaces; confirm against the original source.
    text = line[1].strip().replace(' ', ' ')

    # Guard: enforce min label length.
    if len(text) <= 1:
        return None

    # Guard: require at least one upvote and a downvote ratio of at most 1/4.
    # (Short-circuit keeps the division safe when there are zero upvotes.)
    if not (int(line[2]) >= 1 and int(line[3]) / int(line[2]) <= 1 / 4):
        return None

    # Guard: speaker accent must be in the accepted set.
    if line[6] not in __VALID_ACCENTS:
        return None

    mp3_path = os.path.join(__SOURCE_PATH, line[0])
    assert os.path.isfile(mp3_path)

    wav_path = os.path.relpath('{}.wav'.format(mp3_path[:-4]), __SOURCE_PATH)
    wav_path = os.path.join(__TARGET_PATH, wav_path)
    delete_file_if_exists(wav_path)

    # Convert MP3 to WAV, reduce volume to 0.95, downsample to 16kHz and mono sound.
    subprocess.call([
        'sox', '-v', '0.95', mp3_path, '-r', '16k', wav_path, 'remix', '1'
    ])
    assert os.path.isfile(wav_path)

    # Validate that the example length is within boundaries.
    (sr, y) = wavfile.read(wav_path)
    length_sec = len(y) / sr
    if not MIN_EXAMPLE_LENGTH <= length_sec <= MAX_EXAMPLE_LENGTH:
        return None

    # Add dataset relative to dataset path, label to TXT file buffer.
    wav_path = os.path.relpath(wav_path, CORPUS_DIR)
    return '{} {}\n'.format(wav_path, text)
def sort_txt_by_seq_len(txt_path, num_buckets=64, max_length=1700):
    """Sort a train.txt like file by its audio files sequence length.

    Additionally, outputs longer than `max_length` are being discarded from the
    given TXT file. Also it prints out optimal bucket sizes after computation.

    Args:
        txt_path (str): Path to the `train.txt`.
        num_buckets (int): Number of buckets to split the input into.
        max_length (int): Positive integer. Max length for a feature vector to
            keep. Set to `0` to keep everything.

    Returns:
        Tuple[List[int], float]: A tuple containing the boundary array and the
            total corpus length in seconds.
    """
    # Read train.txt file.
    with open(txt_path, 'r') as f:
        lines = f.readlines()

    # Compute feature lengths in a process pool. The results are consumed here
    # in the main thread only, so no lock is needed around the append (the
    # original Lock guarded a loop that could never race).
    buffer = []
    with Pool(processes=cpu_count()) as pool:
        for result in tqdm(pool.imap_unordered(_feature_length, lines, chunksize=4),
                           desc='Reading audio samples', total=len(lines),
                           file=sys.stdout, unit='samples', dynamic_ncols=True):
            buffer.append(result)

    # Sort by sequence length.
    buffer = sorted(buffer, key=lambda x: x[0])

    # Remove samples longer than `max_length` points.
    if max_length > 0:
        original_length = len(buffer)
        buffer = [s for s in buffer if s[0] < max_length]
        print('Removed {:,d} samples from training.'.format(
            original_length - len(buffer)))

    # Calculate optimal bucket sizes.
    lengths = [l[0] for l in buffer]
    # Guard against step == 0 (fewer samples than buckets), which would make
    # `range(step, n, step)` raise ValueError.
    step = max(1, len(lengths) // num_buckets)
    buckets = set()
    for i in range(step, len(lengths), step):
        buckets.add(lengths[i])
    buckets = sorted(buckets)
    print('Suggested buckets: ', buckets)

    # Plot histogram of feature vector length distribution.
    _plot_sequence_lengths(lengths)

    # Determine total corpus length in seconds.
    # NOTE(review): `/ 0.1` multiplies the summed lengths by 10; if one feature
    # frame corresponds to 0.1s this should presumably be `* 0.1` — confirm
    # against `_feature_length`'s units before changing.
    total_length = sum(map(lambda x: x[0], buffer)) / 0.1

    # Remove sequence length.
    buffer = ['{} {}'.format(p, l) for _, p, l in buffer]

    # Write back to file.
    storage.delete_file_if_exists(txt_path)
    with open(txt_path, 'w') as f:
        f.writelines(buffer)

    with open(txt_path, 'r') as f:
        print('Successfully sorted {} lines of {}'.format(
            len(f.readlines()), txt_path))

    # Drop the last boundary; buckets describe the inner split points.
    return buckets[:-1], total_length