def parse(dir_path: str, logger: Logger = getLogger()) -> PreDataList:
    """Parse a speaker-per-folder corpus whose speakers are listed in README.md.

    The speaker table is taken from README.md lines 35-58 (1-based). Each row
    has its first and last character dropped and is split on "|" into
    name, gender, accent and two ignored fields. Every sub-folder of
    ``dir_path`` whose basename appears in that table is parsed; the others
    are skipped with an info log entry.

    Args:
        dir_path: Root directory containing README.md and the speaker folders.
        logger: Logger used for progress and error messages.

    Returns:
        A ``PreDataList`` sorted with ``sort_arctic``, one entry per wav file.

    Raises:
        Exception: If ``dir_path`` does not exist.
        AssertionError: If a speaker's wav, textgrid and transcript folders
            do not contain the same number of files.
    """
    if not os.path.exists(dir_path):
        message = f"Directory not found: {dir_path}!"
        # logger.exception() is only meaningful inside an except block; here
        # no exception is active, so a plain error record is the right call.
        logger.error(message)
        raise Exception(message)

    readme = read_lines(os.path.join(dir_path, "README.md"))
    # Hard-coded position of the speaker table inside README.md.
    # NOTE(review): breaks if the README layout changes -- confirm the range.
    speakers_dict = {}
    for speaker_details in readme[34:58]:
        name, gender, accent, _, _ = speaker_details[1:-1].split("|")
        speakers_dict[name] = gender, accent

    lang = Language.ENG
    entries = PreDataList()
    logger.info("Parsing files...")
    for speaker_folder in tqdm(get_subfolders(dir_path)):
        speaker_name = get_basename(speaker_folder)
        if speaker_name not in speakers_dict:
            logger.info(f"Skipping {speaker_name}")
            continue

        # The "annotation" folder is deliberately not read: per the original
        # note there are only 150 of them and they do not contain good IPA.
        wavs = get_filepaths(os.path.join(speaker_folder, "wav"))
        textgrids = get_filepaths(os.path.join(speaker_folder, "textgrid"))
        transcripts = get_filepaths(os.path.join(speaker_folder, "transcript"))
        # Textgrids are not used below; they only take part in this check.
        assert len(wavs) == len(textgrids) == len(transcripts)

        speaker_gender, speaker_accent = speakers_dict[speaker_name]
        accent_name = f"{speaker_accent}-{speaker_name}"
        gender = Gender.MALE if speaker_gender == "M" else Gender.FEMALE

        # Pairing relies on both listings being returned in matching order.
        for wav, transcript in zip(wavs, transcripts):
            # A full stop is appended to every transcript before conversion.
            text_en = f"{read_text(transcript)}."
            symbols = text_to_symbols(text_en, lang)
            entries.append(PreData(
                name=get_basename(wav),
                speaker_name=speaker_name,
                text=text_en,
                wav_path=wav,
                symbols=symbols,
                # One accent label per symbol.
                accents=[accent_name] * len(symbols),
                gender=gender,
                lang=lang,
            ))

    entries.sort(key=sort_arctic, reverse=False)
    logger.info(
        f"Parsed {len(entries)} entries from {len(speakers_dict)} speakers.")
    return entries
def parse(dir_path: str, logger: Logger = getLogger()) -> PreDataList:
    """Parse a dataset/speaker/chapter corpus whose speakers are in SPEAKERS.txt.

    SPEAKERS.txt is read from its 13th line onward (the first 12 lines are
    skipped). Each remaining line is split on " | " into id, gender, two
    ignored fields and the speaker name. The directory tree is walked as
    ``dir_path/<dataset>/<speaker id>/<chapter>/``; inside each chapter every
    ``*.wav`` file is paired with a ``*.normalized.txt`` file.

    Args:
        dir_path: Root directory containing SPEAKERS.txt and dataset folders.
        logger: Logger used for progress and error messages.

    Returns:
        A ``PreDataList`` sorted with ``sort_libri``, one entry per wav file.

    Raises:
        Exception: If ``dir_path`` does not exist.
        KeyError: If a speaker folder name is not listed in SPEAKERS.txt.
        AssertionError: If a chapter's wav and text files do not line up.
    """
    if not os.path.exists(dir_path):
        message = f"Directory not found: {dir_path}"
        # Report through the supplied logger (not print) so the failure shows
        # up wherever the rest of this function's output goes.
        logger.error(message)
        raise Exception(message)

    speakers = read_lines(os.path.join(dir_path, "SPEAKERS.txt"))
    speakers_dict = {}
    # NOTE(review): the first 12 lines are presumably a header -- confirm.
    for speaker_details in speakers[12:]:
        # maxsplit keeps a name that itself contains " | " in one piece
        # instead of failing the five-way unpacking.
        s_id, gender, _, _, name = speaker_details.split(" | ", 4)
        speakers_dict[s_id.strip()] = name.strip(), gender.strip()

    lang = Language.ENG
    entries = PreDataList()
    logger.info("Parsing files...")
    for dataset_folder in tqdm(get_subfolders(dir_path)):
        logger.info(f"Parsing {get_basename(dataset_folder)}...")
        for speaker_folder in tqdm(get_subfolders(dataset_folder)):
            speaker_id = get_basename(speaker_folder)
            speaker_name, speaker_gender = speakers_dict[speaker_id]
            # The speaker name doubles as the accent label here.
            accent_name = speaker_name
            gender = Gender.MALE if speaker_gender == "M" else Gender.FEMALE

            for chapter_folder in get_subfolders(speaker_folder):
                files = get_filepaths(chapter_folder)
                wavs = [x for x in files if x.endswith(".wav")]
                texts = [x for x in files if x.endswith(".normalized.txt")]
                assert len(wavs) == len(texts)

                for wav_file, text_file in zip(wavs, texts):
                    # Guard against a wav being paired with another
                    # utterance's transcript.
                    assert get_basename(wav_file) == get_basename(
                        text_file)[:-len(".normalized")]
                    text_en = read_text(text_file)
                    symbols = text_to_symbols(text_en, lang)
                    entries.append(PreData(
                        name=get_basename(wav_file),
                        speaker_name=speaker_name,
                        text=text_en,
                        wav_path=wav_file,
                        symbols=symbols,
                        # One accent label per symbol.
                        accents=[accent_name] * len(symbols),
                        gender=gender,
                        lang=lang,
                    ))

    entries.sort(key=sort_libri, reverse=False)
    logger.info(
        f"Parsed {len(entries)} entries from {len(speakers_dict)} speakers.")
    return entries
def load_from_file(cls, filepath: str):
    """Build an instance from a JSON file, converting list-valued files in place.

    If the first value in the file is a list, the file is treated as the
    layout the code calls "v2": the second element of each value is paired
    with the integer form of its key, the pairs are ordered by that integer,
    the original file is copied next to itself as ``<basename>.v2.json``,
    and the converted instance is saved back to ``filepath``. Otherwise the
    parsed content is handed to ``cls.from_raw`` unchanged.

    Raises:
        AssertionError: If the file holds no entries.
    """
    raw = OrderedDict(parse_json(filepath).items())
    assert len(raw) > 0

    first_value = next(iter(raw.values()))
    if not isinstance(first_value, list):
        return cls.from_raw(raw)

    pairs = sorted(
        ((entry[1], int(key)) for key, entry in raw.items()),
        key=lambda pair: pair[1],
    )
    converted = OrderedDict(pairs)

    # Keep a copy of the untouched file before it is overwritten below.
    backup_name = f"{get_basename(filepath)}.v2.json"
    backup_path = os.path.join(os.path.dirname(filepath), backup_name)
    copyfile(filepath, backup_path)

    instance = cls.from_raw(converted)
    instance.save(filepath)
    return instance
def test_get_basename_of_dir_w_slash(self):
    """A directory path ending in a slash yields an empty basename."""
    self.assertEqual("", get_basename("/a/b/c/test/"))
def test_get_basename_of_filename(self):
    """Only the last extension is removed from a bare filename."""
    self.assertEqual("test.wav", get_basename("test.wav.xyz"))
def test_get_basename_full_path(self):
    """The directory part and the last extension are both removed."""
    self.assertEqual("test.wav", get_basename("/a/b/c/test.wav.xyz"))
def get_infer_dir(train_dir: str, wav_path: str, iteration: int):
    """Return a timestamped inference sub-directory for one wav and iteration.

    The sub-directory name is ``<YYYY-MM-DD,HH-MM-SS>,wav=<basename>,it=<n>``
    using the current local time. It is requested from ``get_subdir`` below
    the inference root of ``train_dir`` with ``create=True``.
    """
    wav_name = get_basename(wav_path)
    timestamp = f"{datetime.datetime.now():%Y-%m-%d,%H-%M-%S}"
    inference_root = get_inference_root_dir(train_dir)
    return get_subdir(
        inference_root,
        f"{timestamp},wav={wav_name},it={iteration}",
        create=True,
    )