def main(args):
    """Add phone-level transcriptions (pyopenjtalk g2p) to a TSV of utterances.

    Reads ``args.tsv_path``, drops rows missing utt_id/token_id/text, builds a
    phone vocabulary on first run (or loads an existing one), converts each
    row's text to phones, and writes the augmented table.

    Output goes to ``args.out``, or ``<tsv_path>_p2w.tsv`` when ``args.out``
    is None.  ``args.cols`` (comma-separated) optionally restricts the output
    columns; it must keep the fields required downstream.
    """
    df = pd.read_table(args.tsv_path)
    df = df.dropna(subset=["utt_id", "token_id", "text"])

    # Build the vocab file on first use; reuse it on subsequent runs.
    if not os.path.exists(args.vocab):
        vocab = build_vocab(df, args.vocab)
    else:
        vocab = Vocab(args.vocab)
        print(f"load vocab: {args.vocab}")

    phone_texts = []
    phone_token_ids = []
    phone_lens = []
    for row in tqdm(df.itertuples()):
        # pyopenjtalk's g2p expects unsegmented text, so strip spaces first.
        text = row.text.replace(" ", "")
        phones = pyopenjtalk.g2p(text, join=False)
        phone_texts.append(" ".join(phones))
        phone_token_ids.append(ints2str(vocab.tokens2ids(phones)))
        phone_lens.append(len(phones))
    df["phone_text"] = phone_texts
    df["phone_token_id"] = phone_token_ids
    df["plen"] = phone_lens

    if args.cols is not None:
        # split() already yields a list; no comprehension needed.
        columns = args.cols.split(",")
        # The selection must retain the columns consumers rely on.
        assert (("utt_id" in columns) and ("phone_text" in columns)
                and ("phone_token_id" in columns))
        df = df[columns]

    if args.out is None:
        df.to_csv(args.tsv_path.replace(".tsv", "_p2w.tsv"),
                  sep="\t", index=False)
    else:
        df.to_csv(args.out, sep="\t", index=False)
def main(args):
    """Tokenize every text row with a SentencePiece model and store token ids.

    Loads the table at ``args.data``, drops rows with any missing value,
    encodes each ``text`` into pieces, maps pieces to ids through the vocab,
    and writes the table back out (overwriting ``args.data`` unless
    ``args.out`` is given).
    """
    df = pd.read_table(args.data).dropna()

    sp = spm.SentencePieceProcessor()
    sp.Load(args.model)
    vocab = Vocab(args.vocab)

    token_ids = []
    for row in tqdm(df.itertuples()):
        pieces = sp.EncodeAsPieces(row.text)
        token_ids.append(ints2str(vocab.tokens2ids(pieces)))
    df["token_id"] = token_ids

    # Overwrite the input file unless an explicit output path was supplied.
    dest = args.data if args.out is None else args.out
    df.to_csv(dest, sep="\t", index=False)
def _words_to_phones(words, word2phone, unk):
    """Return the flat phone sequence for `words`; OOV words map to [unk]."""
    phones = []
    for w in words:
        if w in word2phone:
            phones += word2phone[w].split()
        else:
            phones += [unk]
    return phones


def main(args):
    """Convert word-level text to phone sequences via a pronunciation lexicon.

    The lexicon file has one ``WORD phone phone ...`` entry per line.  If
    ``args.input`` ends in ``.tsv`` the whole table is converted and written
    to ``args.out`` (default ``<input>_p2w.tsv``); otherwise ``args.input``
    is treated as a single space-separated sentence and the result is
    printed.  Unknown words are mapped to ``args.unk``.
    """
    # Parse lexicon: first field is the word, the rest is its phone sequence.
    word2phone = {}
    with open(args.lexicon, "r", encoding="utf-8") as f:
        for line in f:
            # Collapse runs of whitespace so split(" ") yields clean fields.
            line = re.sub(r"[\s]+", " ", line.strip())
            fields = line.split(" ")
            word = fields[0]
            word = word.split("+")[0]  # for CSJ
            word = word.lower()  # for Librispeech
            word2phone[word] = " ".join(fields[1:])

    vocab = Vocab(args.vocab)

    if args.input.endswith(".tsv"):
        tsv_path = args.input
        df = pd.read_table(tsv_path)
        df = df.dropna(subset=["utt_id", "token_id", "text"])

        phone_texts = []
        phone_token_ids = []
        phone_lens = []
        for row in tqdm(df.itertuples()):
            phones = _words_to_phones(row.text.split(" "),
                                      word2phone, args.unk)
            phone_texts.append(" ".join(phones))
            phone_token_ids.append(ints2str(vocab.tokens2ids(phones)))
            phone_lens.append(len(phones))
        df["phone_text"] = phone_texts
        df["phone_token_id"] = phone_token_ids
        df["plen"] = phone_lens

        if args.cols is not None:
            # split() already yields a list; no comprehension needed.
            columns = args.cols.split(",")
            # The selection must retain the columns consumers rely on.
            assert (("utt_id" in columns) and ("phone_text" in columns)
                    and ("phone_token_id" in columns) and ("plen" in columns))
            df = df[columns]

        if args.out is None:
            df.to_csv(tsv_path.replace(".tsv", "_p2w.tsv"),
                      sep="\t", index=False)
        else:
            df.to_csv(args.out, sep="\t", index=False)
    else:
        # Single-sentence mode: convert and print, nothing is written.
        phones = _words_to_phones(args.input.split(" "),
                                  word2phone, args.unk)
        phone_text = " ".join(phones)
        phone_token_id = ints2str(vocab.tokens2ids(phones))
        print(f"text: {phone_text}")
        print(f"token_id: {phone_token_id}")