예제 #1
0
def main(args):
    """Add phone-level columns to a TSV using pyopenjtalk G2P.

    Reads ``args.tsv_path``, drops rows where ``utt_id``, ``token_id`` or
    ``text`` is missing, converts each ``text`` (with spaces removed) to a
    phone sequence via ``pyopenjtalk.g2p`` and adds three columns:

    * ``phone_text``: the phones joined by single spaces,
    * ``phone_token_id``: ``ints2str`` of the vocab ids of the phones,
    * ``plen``: the number of phones.

    Args:
        args: Object with the attributes ``tsv_path`` (input TSV),
            ``vocab`` (vocab path; if it does not exist the vocab is obtained
            from ``build_vocab(df, args.vocab)``), ``cols`` (optional
            comma-separated list of output columns) and ``out`` (optional
            output path).

    Raises:
        AssertionError: If ``args.cols`` is given but omits ``utt_id``,
            ``phone_text`` or ``phone_token_id``.
    """
    df = pd.read_table(args.tsv_path)
    df = df.dropna(subset=["utt_id", "token_id", "text"])

    if not os.path.exists(args.vocab):
        vocab = build_vocab(df, args.vocab)
    else:
        vocab = Vocab(args.vocab)
        print(f"load vocab: {args.vocab}")

    phone_texts = []
    phone_token_ids = []
    phone_lens = []

    # total= lets tqdm show a real progress bar; itertuples() has no len().
    for row in tqdm(df.itertuples(), total=len(df)):
        text = row.text.replace(" ", "")  # remove spaces
        phones = pyopenjtalk.g2p(text, join=False)

        phone_texts.append(" ".join(phones))
        phone_token_ids.append(ints2str(vocab.tokens2ids(phones)))
        phone_lens.append(len(phones))

    df["phone_text"] = phone_texts
    df["phone_token_id"] = phone_token_ids
    df["plen"] = phone_lens

    if args.cols is not None:
        columns = args.cols.split(",")
        required = ("utt_id", "phone_text", "phone_token_id")
        missing = [name for name in required if name not in columns]
        assert not missing, f"args.cols is missing required columns: {missing}"
        df = df[columns]

    if args.out is not None:
        out_path = args.out
    else:
        # Only rewrite the file name: a plain str.replace on the whole path
        # would also rewrite ".tsv" inside directory names, and would leave
        # the path unchanged (overwriting the input) when ".tsv" is absent.
        directory, filename = os.path.split(args.tsv_path)
        stem, marker, rest = filename.rpartition(".tsv")
        if marker:
            filename = f"{stem}_p2w.tsv{rest}"
        else:
            root, ext = os.path.splitext(filename)
            filename = f"{root}_p2w{ext}"
        out_path = os.path.join(directory, filename)

    df.to_csv(out_path, sep="\t", index=False)
예제 #2
0
def main(args):
    """Tokenize the ``text`` column with SentencePiece and add ``token_id``.

    Reads the table at ``args.data``, drops every row containing a missing
    value, encodes each row's ``text`` into pieces with the SentencePiece
    model at ``args.model``, maps the pieces to ids through the vocab at
    ``args.vocab`` and stores ``ints2str`` of those ids in a ``token_id``
    column.

    Args:
        args: Object with the attributes ``data``, ``model``, ``vocab`` and
            ``out``. When ``out`` is ``None`` the result is written back to
            ``data``.
    """
    table = pd.read_table(args.data).dropna()

    processor = spm.SentencePieceProcessor()
    processor.Load(args.model)
    vocab = Vocab(args.vocab)

    table["token_id"] = [
        ints2str(vocab.tokens2ids(processor.EncodeAsPieces(record.text)))
        for record in tqdm(table.itertuples())
    ]

    # With no explicit output path the input file is overwritten in place.
    destination = args.data if args.out is None else args.out
    table.to_csv(destination, sep="\t", index=False)
예제 #3
0
def _load_lexicon(path):
    """Read a lexicon file into a ``{word: "phone phone ..."}`` dict."""
    word2phone = {}
    with open(path, "r", encoding="utf-8") as f:
        for line in f:
            # Remove successive spaces
            line = re.sub(r"[\s]+", " ", line.strip())
            word, _, phone_seq = line.partition(" ")
            word = word.split("+")[0]  # for CSJ
            word = word.lower()  # for Librispeech
            # A later entry for the same word replaces the earlier one.
            word2phone[word] = phone_seq
    return word2phone


def _text2phones(text, word2phone, unk):
    """Convert single-space-separated words into a flat list of phones."""
    phones = []
    for word in text.split(" "):
        if word in word2phone:
            phones += word2phone[word].split()
        else:
            # Words absent from the lexicon contribute one unknown symbol.
            phones.append(unk)
    return phones


def main(args):
    """Convert words to phones with a lexicon, for a TSV file or one string.

    The lexicon at ``args.lexicon`` is read line by line: the first
    whitespace-separated field is the word (cut at the first ``+`` and
    lower-cased) and the remaining fields are its phone sequence.

    * If ``args.input`` ends with ``.tsv`` it is read as a table, rows with a
      missing ``utt_id``, ``token_id`` or ``text`` are dropped, and the
      columns ``phone_text``, ``phone_token_id`` and ``plen`` are added
      before the table is written out.
    * Otherwise ``args.input`` itself is treated as the text and the phone
      text and phone token ids are printed.

    Args:
        args: Object with the attributes ``lexicon``, ``vocab``, ``input``,
            ``unk`` (symbol emitted for words not in the lexicon), ``cols``
            (optional comma-separated output columns) and ``out`` (optional
            output path; defaults to the input path with ``.tsv`` replaced
            by ``_p2w.tsv``).

    Raises:
        AssertionError: If ``args.cols`` is given but omits ``utt_id``,
            ``phone_text``, ``phone_token_id`` or ``plen``.
    """
    word2phone = _load_lexicon(args.lexicon)
    vocab = Vocab(args.vocab)

    if not args.input.endswith(".tsv"):
        phones = _text2phones(args.input, word2phone, args.unk)
        phone_text = " ".join(phones)
        phone_token_id = ints2str(vocab.tokens2ids(phones))

        print(f"text: {phone_text}")
        print(f"token_id: {phone_token_id}")
        return

    tsv_path = args.input
    df = pd.read_table(tsv_path)
    df = df.dropna(subset=["utt_id", "token_id", "text"])

    phone_texts = []
    phone_token_ids = []
    phone_lens = []

    # total= lets tqdm show a real progress bar; itertuples() has no len().
    for row in tqdm(df.itertuples(), total=len(df)):
        phones = _text2phones(row.text, word2phone, args.unk)

        phone_texts.append(" ".join(phones))
        phone_token_ids.append(ints2str(vocab.tokens2ids(phones)))
        phone_lens.append(len(phones))

    df["phone_text"] = phone_texts
    df["phone_token_id"] = phone_token_ids
    df["plen"] = phone_lens

    if args.cols is not None:
        columns = args.cols.split(",")
        required = ("utt_id", "phone_text", "phone_token_id", "plen")
        missing = [name for name in required if name not in columns]
        assert not missing, f"args.cols is missing required columns: {missing}"
        df = df[columns]

    if args.out is None:
        # tsv_path is known to end with ".tsv"; swap only that suffix so a
        # ".tsv" inside a directory name is left untouched.
        out_path = tsv_path[:-len(".tsv")] + "_p2w.tsv"
    else:
        out_path = args.out

    df.to_csv(out_path, sep="\t", index=False)