def upload_dataset():
    dataset = request.files["dataset"]
    dataset_name = request.values["name"]
    dataset_directory = os.path.join(paths["datasets"], dataset_name)
    audio_folder = os.path.join(dataset_directory, AUDIO_FOLDER)
    assert not os.path.isdir(dataset_directory), "Output folder already exists"

    with zipfile.ZipFile(dataset, mode="r") as z:
        files_list = z.namelist()
        if "metadata.csv" not in files_list:
            return render_template(
                "import-export.html",
                message="Dataset missing metadata.csv. Make sure this file is in the root of the zip file",
            )

        folders = [x.split("/")[0] for x in files_list if "/" in x]
        if "wavs" not in folders:
            return render_template(
                "import-export.html",
                message="Dataset missing wavs folder. Make sure this folder is in the root of the zip file",
            )

        wavs = [x for x in files_list if x.startswith("wavs/") and x.endswith(".wav")]
        if not wavs:
            return render_template("import-export.html", message="No wavs found in wavs folder")

        os.makedirs(dataset_directory, exist_ok=False)
        os.makedirs(audio_folder, exist_ok=False)

        # Save metadata
        with open(os.path.join(dataset_directory, "metadata.csv"), "wb") as f:
            data = z.read("metadata.csv")
            f.write(data)

        # Save wavs
        for wav in wavs:
            data = z.read(wav)
            path = os.path.join(dataset_directory, "wavs", wav.split("/")[1])
            with open(path, "wb") as f:
                f.write(data)

    # Create info file
    save_dataset_info(
        os.path.join(dataset_directory, "metadata.csv"),
        os.path.join(dataset_directory, "wavs"),
        os.path.join(dataset_directory, "info.json"),
    )
    return render_template("import-export.html", message=f"Successfully uploaded {dataset_name} dataset")
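# Usage sketch (an assumption, not part of the original module): upload_dataset reads
# "dataset" and "name" from a Flask request, so it is presumably registered as a POST
# endpoint. The URL rule below is hypothetical; `app` would be the module's Flask instance.
def _register_upload_route(app):
    # Wrapping the registration in a function keeps this sketch import-safe.
    app.add_url_rule("/upload-dataset", view_func=upload_dataset, methods=["POST"])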
def extend_existing_dataset(
    text_path,
    audio_path,
    forced_alignment_path,
    output_path,
    label_path,
    suffix,
    info_path,
    logging=logging,
    min_confidence=0.85,
):
    """
    Extends an existing dataset.
    Converts audio to required format, generates clips & produces required files.

    Parameters
    ----------
    text_path : str
        Path to source text
    audio_path : str
        Path to source audio
    forced_alignment_path : str
        Path to save alignment JSON to
    output_path : str
        Path to save audio clips to
    label_path : str
        Path to save label file to
    suffix : str
        String suffix to append to filenames
    info_path : str
        Path to save info JSON to
    logging : logging (optional)
        Logging object to write logs to
    min_confidence : float (optional)
        Minimum confidence score to generate a clip for

    Raises
    ------
    AssertionError
        If given paths are invalid or clips could not be produced
    """
    assert os.path.isdir(output_path), "Missing existing dataset clips folder"
    assert os.path.isfile(label_path), "Missing existing dataset metadata file"
    logging.info(f"Converting {audio_path}...")
    converted_audio = convert_audio(audio_path)
    extend_dataset(
        converted_audio,
        text_path,
        forced_alignment_path,
        output_path,
        label_path,
        suffix,
        logging=logging,
        min_confidence=min_confidence,
    )
    logging.info("Getting dataset info...")
    save_dataset_info(label_path, output_path, info_path)
def create_dataset(text_path, audio_path, forced_alignment_path, output_path, label_path, info_path, logging):
    logging.info(f"Converting {audio_path}...")
    converted_audio = convert_audio(audio_path)
    clip_generator(converted_audio, text_path, forced_alignment_path, output_path, label_path, logging=logging)
    logging.info("Getting dataset info...")
    save_dataset_info(label_path, output_path, info_path)
def create_dataset(
    text_path,
    audio_path,
    forced_alignment_path,
    output_path,
    label_path,
    info_path,
    logging=logging,
    min_confidence=0.85,
):
    """
    Generates a dataset.
    Converts audio to required format, generates clips & produces required files.

    Parameters
    ----------
    text_path : str
        Path to source text
    audio_path : str
        Path to source audio
    forced_alignment_path : str
        Path to save alignment JSON to
    output_path : str
        Path to save audio clips to
    label_path : str
        Path to save label file to
    info_path : str
        Path to save info JSON to
    logging : logging (optional)
        Logging object to write logs to
    min_confidence : float (optional)
        Minimum confidence score to generate a clip for

    Raises
    ------
    AssertionError
        If given paths are invalid or clips could not be produced
    """
    logging.info(f"Converting {audio_path}...")
    converted_audio = convert_audio(audio_path)
    clip_generator(
        converted_audio,
        text_path,
        forced_alignment_path,
        output_path,
        label_path,
        logging=logging,
        min_confidence=min_confidence,
    )
    logging.info("Getting dataset info...")
    save_dataset_info(label_path, output_path, info_path)
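# Illustrative call of the create_dataset variant above (all paths are hypothetical).
# It writes the alignment JSON, audio clips, label file and info JSON to the given paths.
def _example_create_dataset():
    create_dataset(
        text_path="book.txt",
        audio_path="audiobook.mp3",
        forced_alignment_path="align.json",
        output_path="wavs",
        label_path="metadata.csv",
        info_path="info.json",
        min_confidence=0.85,  # skip clips the aligner scores below this
    )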
def extend_existing_dataset(text_path, audio_path, forced_alignment_path, output_path, label_path, suffix, info_path, logging):
    assert os.path.isdir(output_path), "Missing existing dataset clips folder"
    assert os.path.isfile(label_path), "Missing existing dataset metadata file"
    logging.info(f"Converting {audio_path}...")
    converted_audio = convert_audio(audio_path)
    extend_dataset(converted_audio, text_path, forced_alignment_path, output_path, label_path, suffix, logging=logging)
    logging.info("Getting dataset info...")
    save_dataset_info(label_path, output_path, info_path)
def import_dataset(dataset, dataset_directory, audio_folder, logging):
    """
    Imports a dataset zip into the app.
    Checks required files are present, saves the files, converts the audio to the required format
    and generates the info file. Deletes given zip regardless of success.

    Parameters
    ----------
    dataset : str
        Path to dataset zip
    dataset_directory : str
        Destination path for the dataset
    audio_folder : str
        Destination path for the dataset audio
    logging : logging
        Logging object to write logs to

    Raises
    ------
    AssertionError
        If files are missing or invalid
    """
    try:
        with zipfile.ZipFile(dataset, mode="r") as z:
            files_list = z.namelist()
            assert ("metadata.csv" in files_list) or (
                "trainlist.txt" in files_list and "vallist.txt" in files_list
            ), "Dataset doesn't include metadata.csv or trainlist.txt/vallist.txt. Make sure this is in the root of the zip file"
            folders = [x.split("/")[0] for x in files_list if "/" in x]
            assert (
                "wavs" in folders
            ), "Dataset missing wavs folder. Make sure this folder is in the root of the zip file"
            wavs = [x for x in files_list if x.startswith("wavs/") and x.endswith(".wav")]
            assert wavs, "No wavs found in wavs folder"

            logging.info("Creating directory")
            os.makedirs(dataset_directory, exist_ok=False)
            os.makedirs(audio_folder, exist_ok=False)

            if "metadata.csv" in files_list:
                metadata = z.read("metadata.csv").decode(CHARACTER_ENCODING, "ignore").replace("\r\n", "\n")
                num_metadata_rows = len([row for row in metadata.split("\n") if row])
                assert (
                    len(wavs) == num_metadata_rows
                ), f"Number of wavs and labels do not match. metadata: {num_metadata_rows}, wavs: {len(wavs)}"
                # Save metadata
                logging.info("Saving files")
                with open(os.path.join(dataset_directory, "metadata.csv"), "w", encoding=CHARACTER_ENCODING) as f:
                    f.write(metadata)
            else:
                trainlist = z.read("trainlist.txt").decode(CHARACTER_ENCODING, "ignore").replace("\r\n", "\n")
                vallist = z.read("vallist.txt").decode(CHARACTER_ENCODING, "ignore").replace("\r\n", "\n")
                num_rows = len([row for row in trainlist.split("\n") if row]) + len(
                    [row for row in vallist.split("\n") if row]
                )
                assert (
                    len(wavs) == num_rows
                ), f"Number of wavs and labels do not match. trainlist+vallist: {num_rows}, wavs: {len(wavs)}"
                # Save trainlist & vallist
                logging.info("Saving files")
                with open(os.path.join(dataset_directory, "trainlist.txt"), "w", encoding=CHARACTER_ENCODING) as f:
                    f.write(trainlist)
                with open(os.path.join(dataset_directory, "vallist.txt"), "w", encoding=CHARACTER_ENCODING) as f:
                    f.write(vallist)

            # Save wavs
            total_wavs = len(wavs)
            clip_lengths = []
            filenames = {}
            for i in range(total_wavs):
                wav = wavs[i]
                data = z.read(wav)
                path = os.path.join(dataset_directory, "wavs", wav.split("/")[1])
                with open(path, "wb") as f:
                    f.write(data)
                new_path = convert_audio(path)
                duration = librosa.get_duration(filename=new_path)
                clip_lengths.append(duration)
                filenames[path] = new_path
                logging.info(f"Progress - {i+1}/{total_wavs}")

            logging.info(f"Longest clip: {max(clip_lengths)}s, Shortest clip: {min(clip_lengths)}s")

        # Get around "file in use" by using delay
        logging.info("Deleting temp files")
        for old_path, new_path in filenames.items():
            os.remove(old_path)
            os.rename(new_path, old_path)

        # Create info file
        logging.info("Creating info file")
        words = (
            get_text(os.path.join(dataset_directory, "metadata.csv"))
            if "metadata.csv" in files_list
            else get_text(os.path.join(dataset_directory, "trainlist.txt"))
            + get_text(os.path.join(dataset_directory, "vallist.txt"))
        )
        save_dataset_info(
            words,
            os.path.join(dataset_directory, "wavs"),
            os.path.join(dataset_directory, "info.json"),
            clip_lengths=clip_lengths,
        )
    except Exception as e:
        os.remove(dataset)
        raise e

    os.remove(dataset)
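# Illustrative call of import_dataset (paths are hypothetical). The zip must contain
# a wavs/ folder plus either metadata.csv or trainlist.txt/vallist.txt at its root;
# the zip is deleted whether or not the import succeeds.
def _example_import_dataset():
    import_dataset(
        dataset="upload.zip",
        dataset_directory=os.path.join("datasets", "my_voice"),
        audio_folder=os.path.join("datasets", "my_voice", "wavs"),
        logging=logging,
    )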
def extend_existing_dataset(
    text_path,
    audio_path,
    transcription_model,
    output_folder,
    suffix,
    logging=logging,
    min_length=MIN_LENGTH,
    max_length=MAX_LENGTH,
    min_confidence=0.85,
    combine_clips=True,
    symbols=DEFAULT_ALPHABET,
):
    """
    Extends an existing dataset.
    Converts audio to required format, generates clips & produces required files.

    Parameters
    ----------
    text_path : str
        Path to source text
    audio_path : str
        Path to source audio
    transcription_model : TranscriptionModel
        Transcription model
    output_folder : str
        Path to save dataset to
    suffix : str
        String suffix to append to filenames
    logging : logging (optional)
        Logging object to write logs to
    min_length : float (optional)
        Minimum duration of a clip in seconds
    max_length : float (optional)
        Maximum duration of a clip in seconds
    min_confidence : float (optional)
        Minimum confidence score to generate a clip for
    combine_clips : bool (optional)
        Whether to combine clips to make them longer
    symbols : list[str] (optional)
        List of valid symbols (defaults to DEFAULT_ALPHABET)

    Raises
    ------
    AssertionError
        If given paths are invalid or clips could not be produced
    """
    assert os.path.isdir(output_folder), "Missing existing dataset clips folder"
    logging.info(f"Converting {audio_path}...")
    converted_audio = convert_audio(audio_path)
    forced_alignment_path = os.path.join(output_folder, ALIGNMENT_FILE)
    output_path = os.path.join(output_folder, AUDIO_FOLDER)
    unlabelled_path = os.path.join(output_folder, UNLABELLED_FOLDER)
    label_path = os.path.join(output_folder, METADATA_FILE)
    info_path = os.path.join(output_folder, INFO_FILE)
    temp_label_path = label_path.replace(Path(label_path).name, "temp.csv")
    temp_unlabelled_folder = unlabelled_path.replace(Path(unlabelled_path).name, "temp_unlabelled")
    temp_wavs_folder = output_path.replace(Path(output_path).name, "temp_wavs")

    clip_generator(
        converted_audio,
        text_path,
        transcription_model,
        forced_alignment_path,
        temp_wavs_folder,
        temp_unlabelled_folder,
        temp_label_path,
        logging=logging,
        symbols=symbols,
        min_length=min_length,
        max_length=max_length,
        min_confidence=min_confidence,
        combine_clips=combine_clips,
    )

    with open(temp_label_path) as f:
        new_labels = f.readlines()
    with open(label_path, "a+") as f:
        for line in new_labels:
            filename, text = line.split("|")
            new_filename = add_suffix(filename, suffix)
            f.write(f"{new_filename}|{text}")

    for filename in os.listdir(temp_wavs_folder):
        new_filename = add_suffix(filename, suffix)
        shutil.copyfile(os.path.join(temp_wavs_folder, filename), os.path.join(output_path, new_filename))

    for filename in os.listdir(temp_unlabelled_folder):
        new_filename = add_suffix(filename, suffix)
        shutil.copyfile(os.path.join(temp_unlabelled_folder, filename), os.path.join(unlabelled_path, new_filename))

    os.remove(temp_label_path)
    shutil.rmtree(temp_wavs_folder)
    shutil.rmtree(temp_unlabelled_folder)
    logging.info("Combined dataset")

    logging.info("Getting dataset info...")
    # Do not pass clip lengths from extend_dataset as we need to get size of entire dataset (not just new clips)
    save_dataset_info(label_path, output_path, info_path)
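# Illustrative call of extend_existing_dataset (paths and the model object are
# hypothetical; transcription_model is whatever TranscriptionModel instance the app
# wires in). The suffix keeps new clip filenames from colliding with existing ones.
def _example_extend_existing_dataset(transcription_model):
    extend_existing_dataset(
        text_path="chapter2.txt",
        audio_path="chapter2.mp3",
        transcription_model=transcription_model,
        output_folder=os.path.join("datasets", "my_voice"),
        suffix="_chapter2",
    )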
def create_dataset(
    text_path,
    audio_path,
    transcription_model,
    output_folder,
    logging=logging,
    min_length=MIN_LENGTH,
    max_length=MAX_LENGTH,
    min_confidence=0.85,
    combine_clips=True,
    symbols=DEFAULT_ALPHABET,
):
    """
    Generates a dataset.
    Converts audio to required format, generates clips & produces required files.

    Parameters
    ----------
    text_path : str
        Path to source text
    audio_path : str
        Path to source audio
    transcription_model : TranscriptionModel
        Transcription model
    output_folder : str
        Path to save dataset to
    logging : logging (optional)
        Logging object to write logs to
    min_length : float (optional)
        Minimum duration of a clip in seconds
    max_length : float (optional)
        Maximum duration of a clip in seconds
    min_confidence : float (optional)
        Minimum confidence score to generate a clip for
    combine_clips : bool (optional)
        Whether to combine clips to make them longer
    symbols : list[str] (optional)
        List of valid symbols (defaults to DEFAULT_ALPHABET)

    Raises
    ------
    AssertionError
        If given paths are invalid or clips could not be produced
    """
    logging.info(f"Converting {audio_path}...")
    converted_audio = convert_audio(audio_path)
    forced_alignment_path = os.path.join(output_folder, ALIGNMENT_FILE)
    output_path = os.path.join(output_folder, AUDIO_FOLDER)
    unlabelled_path = os.path.join(output_folder, UNLABELLED_FOLDER)
    label_path = os.path.join(output_folder, METADATA_FILE)
    info_path = os.path.join(output_folder, INFO_FILE)

    try:
        clip_lengths = clip_generator(
            converted_audio,
            text_path,
            transcription_model,
            forced_alignment_path,
            output_path,
            unlabelled_path,
            label_path,
            logging=logging,
            symbols=symbols,
            min_length=min_length,
            max_length=max_length,
            min_confidence=min_confidence,
            combine_clips=combine_clips,
        )
    except Exception as e:
        shutil.rmtree(output_folder)
        raise e

    logging.info("Getting dataset info...")
    save_dataset_info(label_path, output_path, info_path, clip_lengths)
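# Illustrative call of the transcription-model variant of create_dataset (paths and
# the model object are hypothetical). On failure the partially built output_folder is
# removed before the error propagates, so a retry starts from a clean slate.
def _example_create_dataset_from_model(transcription_model):
    create_dataset(
        text_path="book.txt",
        audio_path="audiobook.mp3",
        transcription_model=transcription_model,
        output_folder=os.path.join("datasets", "my_voice"),
        min_confidence=0.85,  # skip clips the aligner scores below this
    )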
def import_dataset(dataset, dataset_directory, audio_folder, logging):
    try:
        with zipfile.ZipFile(dataset, mode="r") as z:
            files_list = z.namelist()
            assert (
                "metadata.csv" in files_list
            ), "Dataset missing metadata.csv. Make sure this file is in the root of the zip file"
            folders = [x.split("/")[0] for x in files_list if "/" in x]
            assert (
                "wavs" in folders
            ), "Dataset missing wavs folder. Make sure this folder is in the root of the zip file"
            wavs = [x for x in files_list if x.startswith("wavs/") and x.endswith(".wav")]
            assert wavs, "No wavs found in wavs folder"

            metadata = z.read("metadata.csv").decode("utf-8", "ignore").replace("\r\n", "\n")
            num_metadata_rows = len([row for row in metadata.split("\n") if row])
            assert (
                len(wavs) == num_metadata_rows
            ), f"Number of wavs and labels do not match. metadata: {num_metadata_rows}, wavs: {len(wavs)}"

            logging.info("Creating directory")
            os.makedirs(dataset_directory, exist_ok=False)
            os.makedirs(audio_folder, exist_ok=False)

            # Save metadata
            logging.info("Saving files")
            with open(os.path.join(dataset_directory, "metadata.csv"), "w", encoding="utf-8") as f:
                f.write(metadata)

            # Save wavs
            total_wavs = len(wavs)
            clip_lengths = []
            filenames = {}
            for i in range(total_wavs):
                wav = wavs[i]
                data = z.read(wav)
                path = os.path.join(dataset_directory, "wavs", wav.split("/")[1])
                with open(path, "wb") as f:
                    f.write(data)
                new_path = convert_audio(path)
                clip_lengths.append(librosa.get_duration(filename=new_path))
                filenames[path] = new_path
                logging.info(f"Progress - {i+1}/{total_wavs}")

        # Get around "file in use" by using delay
        logging.info("Deleting temp files")
        for old_path, new_path in filenames.items():
            os.remove(old_path)
            os.rename(new_path, old_path)

        # Create info file
        logging.info("Creating info file")
        save_dataset_info(
            os.path.join(dataset_directory, "metadata.csv"),
            os.path.join(dataset_directory, "wavs"),
            os.path.join(dataset_directory, "info.json"),
            clip_lengths=clip_lengths,
        )
    except Exception as e:
        os.remove(dataset)
        raise e

    os.remove(dataset)