Example #1
def __generate_simple_poly_dict():
    data_path = '/data1/liujshi/yunlv_research/total_zhuiyi_corup/' \
                'total_metadata_new'
    total_lines = read_lines(data_path)
    print(total_lines[0:5])

    total_dict, poly_dict = defaultdict(set), defaultdict(list)
    for line in total_lines:
        fields = line.split("|")
        phone, chars = fields[1], fields[2]
        chars = clean_sentence(chars.replace(" ", ""))
        phone = __change_tone_format(phone)
        try:
            # Collect the distinct pronunciations observed for each character.
            for c, p in phone2pairs(chars, phone):
                total_dict[c].add(p)
        except TypeError:
            pass
        except IndexError:
            print("Index Error:", phone, chars)

    for line in read_lines("../../other_files/poly_dict"):
        parts = line.split(":")
        poly_dict[parts[0]] = parts[1].split(",")

    map_phone = dict()
    for line in read_lines("../../other_files/phone_map_merge.txt"):
        parts = line.split(":")
        map_phone[parts[0]] = parts[1]

    new_lines = []
    new_lines = []
    for char in poly_dict:
        if char not in total_dict:
            continue  # drop polyphonic chars that never appear in the corpus
        value_saved = []
        for value in total_dict[char]:
            # Map dictionary pinyin to standard pinyin before comparing.
            parts = value.split()
            map_value = map_phone[parts[0]] + map_phone[parts[1] + parts[2][-1]]
            if map_value in poly_dict[char]:
                value_saved.append(value)
        if len(value_saved) > 1:
            new_line = "{}:{}".format(char, ",".join(value_saved))
            new_lines.append(new_line)
            print("save:", new_line)
        # chars only ever observed with one pronunciation are dropped too

    write_lines("../../other_files/simple_poly_dict", new_lines)
    return None
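All of these examples lean on small I/O helpers that are not shown in the source. A minimal sketch consistent with how they are called throughout (the names match the code; the bodies are assumptions):

def read_lines(path):
    # Assumed behavior: return the file content as a list of stripped lines.
    with open(path, encoding="utf-8") as f:
        return [line.rstrip("\n") for line in f]


def write_lines(path, lines):
    # Assumed behavior: write one item per line.
    with open(path, "w", encoding="utf-8") as f:
        f.write("\n".join(lines) + "\n")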
Example #2
def __get_total_dict():
    with open("../../other_files/in_baiduhanyu.txt") as fr:
        lines = fr.readlines()
    new_lines = []
    for line in lines:
        line = line.replace("[", "").replace("]", "")
        chars, phones = line.split(":")[0], line.split(":")[1].split(",")
        print(chars)

        # Keep single characters that have more than one pronunciation.
        if len(chars) == 1 and len(phones) > 1:
            new_lines.append(line.strip())
    write_lines("../../other_files/poly_dict", new_lines)
Example #3
    def build_vocab(self, data, token_limits, files):
        """Build word and char vocabularies with limited sizes and write them to files.

        :param data: list of lines
        :param token_limits: tuple of (word_limit_size, char_limit_size)
        :param files: tuple of (word_file_path, char_file_path)
        :return: None
        """
        self._set_vocab(data, token_limits[0], token_limits[1])
        utils.write_lines(files[0], self.words)
        utils.verbose(
            'words have been dumped to {}'.format(os.path.abspath(files[0])))
        utils.write_lines(files[1], self.chars)
        utils.verbose(
            'chars have been dumped to {}'.format(os.path.abspath(files[1])))
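A hypothetical call, assuming a vocabulary-builder class exposing this method; the class name, corpus path, and limit sizes below are invented for illustration:

vocab = VocabBuilder()  # hypothetical class exposing build_vocab
data = utils.read_lines("corpus.txt")  # assumes a read_lines helper in utils
vocab.build_vocab(data, (30000, 5000), ("words.txt", "chars.txt"))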
Example #4
def main():
    model_dir = "/data1/liufeng/synthesis/frontend/models/psd_v1"
    eval_dir = os.path.join(model_dir, "eval")
    os.makedirs(eval_dir, exist_ok=True)

    data_path = "/data1/liufeng/synthesis/feature/feature_taco/feat_0307/" \
                "dev_psd.txt"
    metadata = read_lines(data_path)
    print(metadata[0:2])

    text_path = os.path.join(eval_dir, "corpus.txt")
    corpus = [
        rm_prosody(line.split("|")[6].replace(" ", "")).upper()
        for line in metadata
    ]
    write_lines(text_path, corpus)

    sub_count = 0
    truth_path = os.path.join(eval_dir, "truth.txt")
    with open(truth_path, "w", encoding="utf-8") as fw:
        for sent_id, meta in enumerate(metadata):
            phone = meta.split("|")[5]
            sentence = clean_sentence(
                meta.split("|")[6].replace(" ", "").upper())
            fw.write("\nid:{}\n{}\n".format(sent_id, sentence))
            print("\nid:{}\n{}".format(sent_id, sentence))

            # Split the sentence and its phone string into aligned pieces.
            sub_sentences = split_sentence(sentence)
            sub_phones = split_sentence(phone, split_type="phone")

            for split_id, (sent, sub_phone) in enumerate(
                    zip(sub_sentences, sub_phones)):
                fw.write("split-id:{} | {}\n{}\n".format(
                    split_id, sent, sub_phone))
                print("split-id:{} | {} | {}".format(split_id, sent,
                                                     sub_phone))
                sub_count += 1
            fw.write("split-end\n")

    print("\nsub count:{}".format(sub_count))
    print("write other_files to {}".format(truth_path))
Example #5
def main():
    model_dir = "/data1/liufeng/synthesis/frontend/models/v3"
    eval_dir = os.path.join(model_dir, "eval")
    os.makedirs(eval_dir, exist_ok=True)

    data_path = os.path.join(model_dir, "metadata_dev.txt")
    metadata = read_lines(data_path)
    print(metadata[0])

    text_path = os.path.join(eval_dir, "corpus.txt")
    corpus = [
        rm_prosody(line.split("|")[2].replace(" ", "")).upper()
        for line in metadata
    ]
    print(corpus[0])
    write_lines(text_path, corpus)

    sub_count = 0
    truth_path = os.path.join(eval_dir, "truth.txt")
    with open(truth_path, "w", encoding="utf-8") as fw:
        for sent_id, meta in enumerate(metadata):
            meta = rm_prosody(meta)
            fields = meta.split("|")
            sentence, phone = fields[2].replace(" ", ""), fields[1]
            sentence = clean_sentence(sentence).upper()
            print(sentence)

            fw.write("\nid:{}\n{}\n".format(sent_id, sentence))
            print("\nid:{}\n{}".format(sent_id, sentence))

            sub_sentences = split_sentence(sentence)
            sub_phones = split_sentence(phone, split_type="phone")
            for split_id, (sent, sub_phone) in enumerate(
                    zip(sub_sentences, sub_phones)):
                fw.write("split-id:{} | {}\n{}\n".format(
                    split_id, sent, sub_phone))
                print("split-id:{} | {} | {}".format(split_id, sent,
                                                     sub_phone))
                sub_count += 1
            fw.write("split-end\n")

    print("\nsub count:{}".format(sub_count))
    print("write files to {}".format(truth_path))
Example #6
def main():
  data_path = '/data1/liufeng/synthesis/TACOTRON-2-refined/data/data_0306/' \
              'metadata_tot.csv'
  total_lines = read_lines(data_path)
  print(total_lines[0])

  poly_dict = load_poly_dict()

  new_lines = []
  for line in total_lines:
    if has_poly_char(line.split("|")[2], poly_dict):
      new_lines.append(line)
  print("there are {} lines with poly char".format(len(new_lines)))

  random.shuffle(new_lines)
  dev_lines = new_lines[0:5000]  # hold out 5,000 lines for the dev set
  train_lines = new_lines[5000:]

  # Count polyphonic chars against total chars in the text field.
  poly_chars, tot_chars = 0, 0
  for line in new_lines:
    for char in line.split("|")[2]:
      tot_chars += 1
      if char in poly_dict:
        poly_chars += 1
  print("there are {} poly chars in {} total chars ({:.3f})".format(
    poly_chars, tot_chars, poly_chars / tot_chars))

  train_x, train_y, data_lines = __extract_feature(train_lines, poly_dict)
  write_lines("metadata_train.txt", data_lines)
  dev_x, dev_y, data_lines = __extract_feature(dev_lines, poly_dict)
  write_lines("metadata_dev.txt", data_lines)

  with open('/data1/liufeng/synthesis/frontend/models/feature.pkl', 'wb') as fw:
    pickle.dump((train_x, train_y, dev_x, dev_y), fw)
    print("save {}/{} train/dev items".format(len(train_x), len(dev_x)))
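The `load_poly_dict` and `has_poly_char` helpers are not shown. A minimal sketch, assuming the "char:pinyin1,pinyin2" file format written by Example #1 (the default path is hypothetical):

def load_poly_dict(path="../../other_files/simple_poly_dict"):
    # Assumed: one "char:pinyin1,pinyin2" entry per line.
    poly_dict = {}
    for line in read_lines(path):
        key, value = line.split(":")
        poly_dict[key] = value.split(",")
    return poly_dict


def has_poly_char(text, poly_dict):
    # True if any character of the text is a known polyphonic char.
    return any(char in poly_dict for char in text)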
Example #7
def main():
    model_dir = "/data1/liufeng/synthesis/frontend/models/v2"
    eval_dir = os.path.join(model_dir, "eval")
    os.makedirs(eval_dir, exist_ok=True)

    data_path = "/data1/liufeng/synthesis/feature/feature_prosody/" \
                "bzn/dev.txt"
    metadata = read_lines(data_path)
    print(metadata[0:2])

    text_path = os.path.join(eval_dir, "corpus.txt")
    corpus = [
        rm_prosody(line.split("|")[6].replace(" ", "")) for line in metadata
    ]
    write_lines(text_path, corpus)

    sub_count = 0
    truth_path = "output.txt"
    with open(truth_path, "w", encoding="utf-8") as fw:
        for sent_id, meta in enumerate(metadata):
            phone = meta.split("|")[5]
            sentence = clean_sentence(meta.split("|")[6].replace(" ", ""))
            print(phone, sentence)

            fw.write("\nid:{}\n{}\n".format(sent_id, sentence))
            print("\nid:{}\n{}".format(sent_id, sentence))

            sub_sentences = split_sentence(sentence)
            sub_phones = split_sentence(phone, split_type="phone")
            for split_id, (sent, sub_phone) in enumerate(
                    zip(sub_sentences, sub_phones)):
                # Separate the text from its prosody labels, then re-attach
                # the labels to the phone string as "#<label>" marks.
                x, y = split_psd(sent)
                sent = "".join(x)
                print(sent, sub_phone)
                pairs = phone2pairs(sent, sub_phone)
                new_pairs = [(c, ph, psd) for (c, ph), psd in zip(pairs, y)]
                new_phone = " ".join(
                    [ph + " #" + psd for _, ph, psd in new_pairs])
                new_phone = new_phone.replace(" #0", "")
                fw.write("split-id:{} | {}\n{}\n".format(
                    split_id, sent, new_phone))
                print("split-id:{} | {} | {}".format(split_id, sent,
                                                     new_phone))
                sub_count += 1
            fw.write("split-end\n")

    print("\nsub count:{}".format(sub_count))
    print("write output to {}".format(truth_path))
Example #8
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('yaml_path', help='config path for frontend')
    parser.add_argument('input_path', help='input path(txt)')
    parser.add_argument('output_path', help='output path(txt)')
    args = parser.parse_args()

    # todo: add sil label
    hparams = __load_hparams(args.yaml_path)

    text_path = args.input_path
    frontend_path = args.output_path
    flag_psd = hparams.flag_psd

    if hparams.norm_text:
        raw_file_lines = read_lines(text_path)
        sentences = []
        print("text normalize:")
        for line in raw_file_lines:
            new_line = Text().normalize(line)
            sentences.append(new_line.replace(" ", ""))
            if new_line != line:
                print("{}->{}".format(line, new_line))
    else:
        sentences = read_lines(text_path)
    write_lines("norm.txt", sentences)
    # exit()

    trans = TranscriptToPinyin(
        dic_path=hparams.dict_path,
        eng_dic_path=hparams.eng_dict_path,
    )

    if hparams.nnet_psd and hparams.flag_psd:
        psd_predict, bert_psd_result = __compute_psd_result(
            hparams, sentences, hparams.load_memory_psd)
    else:
        psd_predict, bert_psd_result = None, None

    if hparams.nnet_phone:
        phone_predictor, bert_phone_result = __compute_nnet_phone_result(
            hparams, sentences, hparams.load_memory_phone)
    else:
        phone_predictor, bert_phone_result = None, None

    sub_count, count = 0, 0
    with open(frontend_path, "w", encoding="utf-8") as frontend_file:
        for sent_id, sentence in enumerate(sentences):
            frontend_file.write("\nid:{}\n{}\n".format(sent_id, sentence))
            print("\nid:{}\n{}".format(sent_id, sentence))

            sub_sentences = split_sentence(sentence)
            for split_id, sub_sentence in enumerate(sub_sentences):
                sub_count += 1
                phone_pairs = trans.get_phone_pairs(
                    sub_sentence, change_eng_symbol=hparams.eng_symbol)
                if hparams.nnet_phone:
                    # Override dictionary phones with the nnet prediction when
                    # it aligns with the sub-sentence character by character.
                    bert_phone = bert_phone_result[count]
                    if len(sub_sentence) == len(bert_phone):
                        phone = phone_predictor.modify_result(
                            bert_phone, phone_pairs)
                        phone_pairs = [(c, phone[i], p) for i, (c, _, p)
                                       in enumerate(phone_pairs)]
                    else:
                        print("Error: bert result length mismatch")

                if flag_psd and not hparams.nnet_psd:
                    # Prosody labels come from the dictionary pairs;
                    # "#0" and "#5" marks are stripped from the output.
                    phone = " ".join(
                        [ph + " #" + psd for _, ph, psd in phone_pairs])
                    phone = phone.replace("#0", "").replace("#5", "")
                    sub_sentence = "".join(
                        [c + "#" + psd for c, _, psd in phone_pairs])
                    sub_sentence = sub_sentence.replace("#0", "")
                    sub_sentence = sub_sentence.replace("#5", "")
                elif flag_psd and hparams.nnet_psd:
                    # Prosody labels come from the nnet prediction, then are
                    # adjusted by rules.
                    new_pairs = []
                    for new_psd, (char, ph, _) in zip(bert_psd_result[count],
                                                      phone_pairs):
                        new_pairs.append((char, ph, new_psd))
                    new_pairs = psd_predict.change_by_rules(new_pairs)
                    phone = " ".join(
                        [ph + " #" + psd for _, ph, psd in new_pairs])
                    phone = phone.replace("#0", "").replace("#5", "")
                    sub_sentence = "".join(
                        [c + "#" + psd for c, _, psd in new_pairs])
                    sub_sentence = sub_sentence.replace("#0", "")
                    sub_sentence = sub_sentence.replace("#5", "")
                else:
                    phone = " ".join([ph for _, ph, _ in phone_pairs])
                    sub_sentence = "".join([c for c, _, _ in phone_pairs])

                count += 1
                frontend_file.write("split-id:{} | {}\n{}\n".format(
                    split_id, sub_sentence, phone))
                print("split-id:{} | {} | {}".format(split_id, sub_sentence,
                                                     phone))
            frontend_file.write("split-end\n")

    # todo: improve pause prediction.
    # todo: refactor: drop kashgari, use keras-bert.
    print("\nsub count:{}".format(sub_count))
    print("write output data to {}".format(frontend_path))