def __extract_feature(new_lines, poly_dict):
    """Turn metadata lines into (char-list, phone-list) training pairs.

    Each input line is "...|phone|chars|..." (pipe-separated).  For every
    character that is a known polyphone (a key of ``poly_dict``) the aligned
    pinyin is kept; all other characters get the placeholder ``"-"``.

    :param new_lines: iterable of raw metadata lines.
    :param poly_dict: mapping of polyphonic char -> allowed pronunciations.
    :return: (data_x, data_y, data_lines) where data_x is the list of
        char sequences, data_y the matching phone-label sequences and
        data_lines the raw lines that aligned successfully.
    """
    data_lines = []
    meta_data = []
    for line in new_lines:
        line = rm_prosody(line)
        # Strip bracketed annotations: (...)  {...}  [...]
        line = re.sub(u"\\(.*?\\)|\\{.*?}|\\[.*?]", "", line)
        fields = line.split("|")  # split once instead of twice
        phone, chars = fields[1], fields[2]
        chars = clean_sentence(chars.replace(" ", ""))
        # phone = split_tone(phone)
        try:
            phone_pairs = phone2pairs(chars, phone)
            new_phone_pairs = []
            for c, p in phone_pairs:
                # Only polyphonic characters keep their real pinyin label.
                if c in poly_dict:
                    new_phone_pairs.append((c, p))
                else:
                    new_phone_pairs.append((c, "-"))
            data_lines.append(line)
            meta_data.append((list(chars),
                              [p.replace(" ", "").replace("*", "")
                               for _, p in new_phone_pairs]))
        except TypeError:
            # presumably phone2pairs fails on misaligned char/phone input;
            # such lines are silently skipped (original behavior kept).
            pass
        except IndexError:
            print("Index Error:", phone, chars)
    print("there are {} trainable lines".format(len(data_lines)))
    # Preview a few samples; slicing avoids an IndexError when fewer
    # than 5 trainable lines exist (the old range(5) loop crashed).
    for sample in meta_data[:5]:
        print(sample)
    data_x = [x for x, _ in meta_data]
    data_y = [y for _, y in meta_data]
    return data_x, data_y, data_lines
def __generate_simple_poly_dict():
    """Build a reduced polyphone dictionary from the observed corpus.

    Keeps only the polyphones that actually occur in the corpus with more
    than one of their dictionary pronunciations, and writes them as
    ``char:py1,py2`` lines to ``../../other_files/simple_poly_dict``.

    :return: None (output is written to disk).
    """
    data_path = '/data1/liujshi/yunlv_research/total_zhuiyi_corup/' \
                'total_metadata_new'
    total_lines = read_lines(data_path)[0:]
    print(total_lines[0:5])

    # char -> set of pronunciations observed in the corpus.  A set
    # deduplicates on insert; the original list(set(...)) rebuild on every
    # append was accidentally quadratic.
    total_dict = defaultdict(set)
    for line in total_lines:
        fields = line.split("|")
        phone, chars = fields[1], fields[2]
        chars = clean_sentence(chars.replace(" ", ""))
        phone = __change_tone_format(phone)
        try:
            for c, p in phone2pairs(chars, phone):
                total_dict[c].add(p)
        except TypeError:
            # presumably phone2pairs fails on misaligned input; skip line.
            pass
        except IndexError:
            print("Index Error:", phone, chars)

    # char -> list of standard pinyin from the curated polyphone dict.
    poly_dict = {}
    for line in read_lines("../../other_files/poly_dict"):
        key = line.split(":")[0]
        value = line.split(":")[1].split(",")
        poly_dict[key] = value

    # lexicon phone -> standard pinyin component.
    map_phone = dict()
    for line in read_lines("../../other_files/phone_map_merge.txt"):
        key = line.split(":")[0]
        value = line.split(":")[1]
        map_phone[key] = value

    new_lines = []
    for char, std_pinyins in poly_dict.items():
        if char not in total_dict:
            continue  # polyphone never observed in the corpus: drop it
        value_saved = []
        for value in total_dict[char]:
            # Convert lexicon pinyin to standard pinyin before comparing:
            # initial + (final + tone digit taken from the last token).
            syl = value.split()
            map_value = map_phone[syl[0]] + map_phone[syl[1] + syl[2][-1]]
            if map_value in std_pinyins:
                value_saved.append(value)
        if len(value_saved) > 1:
            new_line = "{}:{}".format(char, ",".join(value_saved))
            new_lines.append(new_line)
            print("save:", new_line)
        # Chars observed with only one of their pronunciations are dropped.
    write_lines("../../other_files/simple_poly_dict", new_lines)
    return None
def main():
    """Export the psd_v1 dev set as corpus.txt / truth.txt under eval/."""
    model_dir = "/data1/liufeng/synthesis/frontend/models/psd_v1"
    eval_dir = os.path.join(model_dir, "eval")
    os.makedirs(eval_dir, exist_ok=True)

    data_path = "/data1/liufeng/synthesis/feature/feature_taco/feat_0307/" \
                "dev_psd.txt"
    metadata = read_lines(data_path)
    print(metadata[0:2])

    # Plain-text corpus: field 7, spaces and prosody marks removed.
    corpus = []
    for item in metadata:
        corpus.append(rm_prosody(item.split("|")[6].replace(" ", "")).upper())
    write_lines(os.path.join(eval_dir, "corpus.txt"), corpus)

    truth_path = os.path.join(eval_dir, "truth.txt")
    sub_count = 0
    with open(truth_path, "w", encoding="utf-8") as fout:
        for idx, meta in enumerate(metadata):
            fields = meta.split("|")
            full_phone = fields[5]
            sentence = clean_sentence(fields[6].replace(" ", "").upper())
            fout.write("\nid:{}\n{}\n".format(idx, sentence))
            print("\nid:{}\n{}".format(idx, sentence))
            # Walk the aligned sub-sentence / sub-phone pairs.
            pieces = zip(split_sentence(sentence),
                         split_sentence(full_phone, split_type="phone"))
            for split_id, (sent, phone) in enumerate(pieces):
                fout.write("split-id:{} | {}\n{}\n".format(
                    split_id, sent, phone))
                print("split-id:{} | {} | {}".format(split_id, sent, phone))
                sub_count += 1
            fout.write("split-end\n")
    print("\nsub count:{}".format(sub_count))
    print("write other_files to {}".format(truth_path))
def main():
    """Export the v3 dev set as corpus.txt / truth.txt under eval/."""
    model_dir = "/data1/liufeng/synthesis/frontend/models/v3"
    eval_dir = os.path.join(model_dir, "eval")
    os.makedirs(eval_dir, exist_ok=True)

    metadata = read_lines(os.path.join(model_dir, "metadata_dev.txt"))
    print(metadata[0])

    # Plain-text corpus: field 3, spaces and prosody marks removed.
    corpus = [rm_prosody(item.split("|")[2].replace(" ", "")).upper()
              for item in metadata]
    print(corpus[0])
    write_lines(os.path.join(eval_dir, "corpus.txt"), corpus)

    truth_path = os.path.join(eval_dir, "truth.txt")
    sub_count = 0
    with open(truth_path, "w", encoding="utf-8") as fout:
        for idx, raw in enumerate(metadata):
            raw = rm_prosody(raw)
            fields = raw.split("|")
            phone_str = fields[1]
            sentence = clean_sentence(fields[2].replace(" ", "")).upper()
            print(sentence)
            fout.write("\nid:{}\n{}\n".format(idx, sentence))
            print("\nid:{}\n{}".format(idx, sentence))
            # Walk the aligned sub-sentence / sub-phone pairs.
            paired = zip(split_sentence(sentence),
                         split_sentence(phone_str, split_type="phone"))
            for split_id, (sent, phone) in enumerate(paired):
                fout.write("split-id:{} | {}\n{}\n".format(
                    split_id, sent, phone))
                print("split-id:{} | {} | {}".format(split_id, sent, phone))
                sub_count += 1
            fout.write("split-end\n")
    print("\nsub count:{}".format(sub_count))
    print("write files to {}".format(truth_path))
def __compute_psd_result(hparams, sentences, load_memory=False):
    """Run BERT prosody inference over *sentences*, with on-disk caching.

    :param hparams: config object; reads ``psd_model_dir`` and
        ``bert_model_path``.
    :param sentences: iterable of raw sentences to predict prosody for.
    :param load_memory: if True and a cached pickle exists, reuse it and
        skip model loading/inference entirely.
    :return: (psd_predict, bert_psd_result) — NOTE: when the cache is hit
        the returned predictor is uninitialized (initial_model not called),
        matching the original behavior.
    """
    from src.model.prosody import BertProsody
    psd_predict = BertProsody()
    bert_result_path = os.path.join(hparams.psd_model_dir, "bert_result.pkl")
    if os.path.exists(bert_result_path) and load_memory:
        with open(bert_result_path, "rb") as fr:
            bert_psd_result = pickle.load(fr)
    else:
        # Split every cleaned sentence into sub-sentences for BERT input.
        # (The enumerate index in the original loop was never used.)
        bert_input = []
        for sentence in sentences:
            bert_input.extend(split_sentence(clean_sentence(sentence)))
        model_path, _init_epoch = get_model_path(
            os.path.join(hparams.psd_model_dir, "hdf5"))
        psd_predict.initial_model(bert_model_path=hparams.bert_model_path,
                                  psd_model_path=model_path)
        bert_psd_result = psd_predict.predict(bert_input)
        # Cache the result so later runs with load_memory=True can skip
        # inference.
        with open(bert_result_path, "wb") as fw:
            pickle.dump(bert_psd_result, fw)
        print("completed bert inference")
    return psd_predict, bert_psd_result
def __compute_nnet_phone_result(hparams, sentences, load_memory=False):
    """Run BERT polyphone inference over *sentences*, with on-disk caching.

    :param hparams: config object; reads ``poly_model_dir`` and
        ``bert_model_path``.
    :param sentences: iterable of raw sentences to predict phones for.
    :param load_memory: if True and a cached pickle exists, reuse it and
        skip model loading/inference entirely.
    :return: (phone_predictor, bert_phone_result) — NOTE: when the cache is
        hit the returned predictor is uninitialized, matching the original
        behavior.
    """
    from src.model.phone import BertPolyPhone
    phone_predictor = BertPolyPhone()
    bert_result_path = os.path.join(hparams.poly_model_dir, "bert_result.pkl")
    if os.path.exists(bert_result_path) and load_memory:
        with open(bert_result_path, "rb") as fr:
            bert_phone_result = pickle.load(fr)
    else:
        # Split every cleaned sentence into sub-sentences for BERT input.
        # (The enumerate index in the original loop was never used.)
        bert_input = []
        for sentence in sentences:
            bert_input.extend(split_sentence(clean_sentence(sentence)))
        print("total sub sentences:{}".format(len(bert_input)))
        model_path, _init_epoch = get_model_path(
            os.path.join(hparams.poly_model_dir, "hdf5"))
        # "inialize_model" is the (misspelled) public API of BertPolyPhone;
        # keep the call as-is.
        phone_predictor.inialize_model(bert_model_path=hparams.bert_model_path,
                                       poly_model_path=model_path)
        bert_phone_result = phone_predictor.predict(bert_input)
        # Cache the result so later runs with load_memory=True can skip
        # inference.
        with open(bert_result_path, "wb") as fw:
            pickle.dump(bert_phone_result, fw)
        print("completed bert inference")
    return phone_predictor, bert_phone_result
def main():
    """Export the v2 prosody dev set with prosody-tagged phone labels.

    Writes eval/corpus.txt (plain sentences) and output.txt (per-sentence
    truth blocks where each phone carries its prosody tag as " #<tag>",
    with " #0" — no break — stripped).
    """
    model_dir = "/data1/liufeng/synthesis/frontend/models/v2"
    eval_dir = os.path.join(model_dir, "eval")
    os.makedirs(eval_dir, exist_ok=True)

    data_path = "/data1/liufeng/synthesis/feature/feature_prosody/" \
                "bzn/dev.txt"
    metadata = read_lines(data_path)
    print(metadata[0:2])

    text_path = os.path.join(eval_dir, "corpus.txt")
    corpus = [rm_prosody(line.split("|")[6].replace(" ", ""))
              for line in metadata]
    write_lines(text_path, corpus)

    sub_count = 0
    truth_path = "output.txt"  # NOTE: written to the CWD, not eval_dir
    with open(truth_path, "w", encoding="utf-8") as fr:
        for sent_id, meta in enumerate(metadata):
            phone = meta.split("|")[5]
            sentence = clean_sentence(meta.split("|")[6].replace(" ", ""))
            print(phone, sentence)
            fr.write("\nid:{}\n{}\n".format(sent_id, sentence))
            print("\nid:{}\n{}".format(sent_id, sentence))
            sub_sentences = split_sentence(sentence)
            sub_phones = split_sentence(phone, split_type="phone")
            # sub_phone renamed from "phone" to stop shadowing the
            # full-sentence phone string above.
            for split_id, (sent, sub_phone) in enumerate(
                    zip(sub_sentences, sub_phones)):
                # Separate prosody marks from the sub-sentence text.
                chars, psd_tags = split_psd(sent)
                sent = "".join(chars)
                print(sent, sub_phone)
                pairs = phone2pairs(sent, sub_phone)
                # Tag each phone with its prosody label, then drop the
                # "no break" tag " #0".
                new_phone = " ".join(
                    p + " #" + tag for (_c, p), tag in zip(pairs, psd_tags)
                ).replace(" #0", "")
                fr.write("split-id:{} | {}\n{}\n".format(
                    split_id, sent, new_phone))
                print("split-id:{} | {} | {}".format(split_id, sent,
                                                     new_phone))
                sub_count += 1
            fr.write("split-end\n")
    print("\nsub count:{}".format(sub_count))
    print("write other_files to {}".format(truth_path))