def frontend(self, text):
    """Clean text and then convert it to an id sequence.

    :param text: input text
    :return: 1-D torch.LongTensor of token ids on self.device
    """
    text = custom_english_cleaners(text)
    if self.trans_type == "phn":
        text = filter(lambda s: s != " ", self.g2p(text))
        text = " ".join(text)
        # print(f"Cleaned text: {text}")
        charseq = text.split(" ")
    else:
        # print(f"Cleaned text: {text}")
        charseq = list(text)
    idseq = []
    for c in charseq:
        if c.isspace():
            idseq += [self.char_to_id["<space>"]]
        elif c not in self.char_to_id.keys():
            print(f"{c} is unknown!")
            idseq += [self.char_to_id["<unk>"]]
        elif c == ",":
            # Append the comma id twice in an attempt to create a longer
            # pause after commas; the exact effect on synthesis is unclear.
            idseq += [self.char_to_id[c]]
            idseq += [self.char_to_id[c]]
        else:
            idseq += [self.char_to_id[c]]
    idseq += [self.idim - 1]  # <eos>
    return torch.LongTensor(idseq).view(-1).to(self.device)
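# Usage sketch (names are illustrative, not from the original code): assuming
# `tts` is an instance of the class defining frontend() above, with
# char_to_id, g2p, trans_type, idim, and device already set up:
#
#   idseq = tts.frontend("Hello, world.")
#   # -> 1-D LongTensor ending in the <eos> id (tts.idim - 1); the comma id
#   #    appears twice because of the longer-pause heuristic above.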
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--lang_tag",
        type=str,
        default=None,
        nargs="?",
        help="language tag (can be used for multilingual case)",
    )
    parser.add_argument("--spk_tag", type=str, help="speaker tag")
    parser.add_argument("jsons", nargs="+", type=str, help="*_mls.json filenames")
    parser.add_argument("out", type=str, help="output filename")
    parser.add_argument(
        "trans_type",
        type=str,
        default="phn",
        choices=["char", "phn"],
        help="Input transcription type",
    )
    args = parser.parse_args()

    dirname = os.path.dirname(args.out)
    if len(dirname) != 0 and not os.path.exists(dirname):
        os.makedirs(dirname)

    with codecs.open(args.out, "w", encoding="utf-8") as out:
        for filename in sorted(args.jsons):
            with codecs.open(filename, "r", encoding="utf-8") as f:
                js = json.load(f)
            for key in sorted(js.keys()):
                uid = args.spk_tag + "_" + key.replace(".wav", "")
                content = js[key]["clean"]
                text = custom_english_cleaners(content.rstrip())
                if args.trans_type == "phn":
                    clean_content = text.lower()
                    text = g2p(clean_content)
                if args.lang_tag is None:
                    line = "%s %s\n" % (uid, text)
                else:
                    line = "%s <%s> %s\n" % (uid, args.lang_tag, text)
                out.write(line)
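# Example invocation (the script name and paths here are hypothetical):
#
#   python mls_json2text.py --spk_tag spk1 --lang_tag en data/*_mls.json \
#       data/text phn
#
# Each *_mls.json is assumed to map wav filenames to entries containing a
# "clean" transcript; the output has one "<uid> [<lang_tag>] <text>" line
# per utterance.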
def frontend(text, g2p, char_to_id, idim, trans_type, device):
    """Clean text and then convert to id sequence."""
    text = custom_english_cleaners(text)
    if trans_type == "phn":
        text = filter(lambda s: s != " ", g2p(text))
        text = " ".join(text)
        print(f"Cleaned text: {text}")
        charseq = text.split(" ")
    else:
        print(f"Cleaned text: {text}")
        charseq = list(text)
    idseq = []
    for c in charseq:
        if c.isspace():
            idseq += [char_to_id["<space>"]]
        elif c not in char_to_id:
            idseq += [char_to_id["<unk>"]]
        else:
            idseq += [char_to_id[c]]
    idseq += [idim - 1]  # <eos>
    return torch.LongTensor(idseq).view(-1).to(device)
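# Minimal sketch of calling frontend() above in "char" mode with a toy
# vocabulary. The vocabulary and idim below are illustrative only; a real
# model's char_to_id mapping would come from its training dictionary. This
# assumes torch and custom_english_cleaners are importable in this module.
if __name__ == "__main__":
    import string

    toy_char_to_id = {c: i for i, c in enumerate(string.ascii_letters)}
    # Cover both cases, since the cleaner may change letter casing.
    toy_char_to_id["<space>"] = len(toy_char_to_id)
    toy_char_to_id["<unk>"] = len(toy_char_to_id)
    ids = frontend(
        "hello world",
        g2p=None,  # unused in "char" mode
        char_to_id=toy_char_to_id,
        idim=len(toy_char_to_id) + 1,  # last id is reserved for <eos>
        trans_type="char",
        device="cpu",
    )
    print(ids)  # 1-D LongTensor of char ids terminated by the <eos> id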
def clean_blizzard17(metadata, trans_type, char2index, phn2index):
    g2p = G2p()
    cur_dir = os.path.dirname(__file__)
    filelists_path = os.path.join(cur_dir, "../filelists")
    f_read = open(metadata, "r", encoding="utf-8")
    f_write = open(os.path.join(filelists_path, "data.csv"), "w", encoding="utf-8")
    for line in tqdm(f_read, desc="cleaning and normalizing: "):
        line = line.strip("(")
        line = line.strip(")\n")
        utterance_id, content, _ = line.split('"')
        utterance_id = utterance_id.strip()
        content = content.strip()
        clean_char = custom_english_cleaners(content)
        if trans_type == "char":
            normalized_char = []
            token_id = []
            for char in clean_char:
                if char in char2index:
                    normalized_char.append(char)
                    token_id.append(char2index[char])
                elif char == " ":
                    normalized_char.append("<space>")
                    token_id.append(char2index["<space>"])
                else:
                    normalized_char.append("<unk>")
                    token_id.append(char2index["<unk>"])
            normalized_char.append("<eos>")
            token_id.append(char2index["<eos>"])
            normalized_char = " ".join(normalized_char)
            # str() guards against integer ids in the index map
            token_id = " ".join(map(str, token_id))
            f_write.write(
                utterance_id + "|" + content + "|" + normalized_char + "|" + token_id + "\n"
            )
        elif trans_type == "phn":
            clean_char = clean_char.lower()
            clean_phn = g2p(clean_char)
            normalized_phn = []
            token_id = []
            for phn in clean_phn:
                if phn in phn2index:
                    normalized_phn.append(phn)
                    token_id.append(phn2index[phn])
                elif phn == " ":
                    normalized_phn.append("<space>")
                    token_id.append(phn2index["<space>"])
                else:
                    normalized_phn.append("<unk>")
                    token_id.append(phn2index["<unk>"])
            normalized_phn.append("<eos>")
            token_id.append(phn2index["<eos>"])
            normalized_phn = " ".join(normalized_phn)
            token_id = " ".join(map(str, token_id))
            f_write.write(
                utterance_id + "|" + content + "|" + normalized_phn + "|" + token_id + "\n"
            )
        else:
            raise ValueError(f"Unknown trans_type: {trans_type}")
    f_read.close()
    f_write.close()
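# Sketch of the expected metadata format: one parenthesized prompt per line,
# as in the Blizzard Challenge 2017 prompt file (values below are made up):
#
#   ( CB-EM-01-001 "It was a dark and stormy night." )
#
# With trans_type="char" this yields a pipe-separated data.csv row of the form
# id|raw content|normalized tokens|token ids (ids here are illustrative):
#
#   CB-EM-01-001|It was a dark and stormy night.|I T <space> W A S ... <eos>|8 19 52 ...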
# NOTE: we need to download the dict on the initial run
nltk.download("punkt")


def g2p(text):
    """Convert grapheme to phoneme."""
    tokens = filter(lambda s: s != " ", f_g2p(text))
    return " ".join(tokens)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("text", type=str, help="path of the text file to be cleaned")
    parser.add_argument(
        "trans_type",
        type=str,
        default="phn",
        choices=["char", "phn"],
        help="Input transcription type",
    )
    args = parser.parse_args()
    with codecs.open(args.text, "r", "utf-8") as fid:
        for line in fid.readlines():
            utt_id, content = line.split(" ", 1)
            clean_content = custom_english_cleaners(content.rstrip())
            if args.trans_type == "phn":
                text = clean_content.lower()
                clean_content = g2p(text)
            print("%s %s" % (utt_id, clean_content))
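# Example run (file name hypothetical): each input line is "<utt_id> <content>",
# and phoneme output comes from the g2p() wrapper above; exact phones depend
# on the underlying g2p model, so the output below is only indicative:
#
#   $ cat text.raw
#   utt1 Hello world.
#   $ python clean_text.py text.raw phn
#   utt1 HH AH0 L OW1 W ER1 L D .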
help="Input transcription type", ) parser.add_argument("--lowercase", type=bool, default=False, help="Lower case the result or not") args = parser.parse_args() # clean every line in transcription file first with codecs.open(args.transcription_path, "r", "utf-8") as fid: for line in fid.read().splitlines(): segments = line.split(" ") # clean contents content = " ".join(segments[:-1]) clean_content = custom_english_cleaners(content) # get id by taking off the parentheses id = segments[-1][1:-1] if args.trans_type == "phn": text = clean_content.lower() clean_content = g2p(text) if args.lowercase: clean_content = clean_content.lower() if args.lang_tag == "": print("{} {}".format(id, clean_content)) else: print("{} {}".format(