import itertools

import kashgari
from kashgari.embeddings import (BareEmbedding, NumericFeaturesEmbedding,
                                 StackedEmbedding)
from kashgari.tasks.labeling import BiLSTM_CRF_Model


class Kashgari:

    def __init__(self):
        self.model = None
        self.chunk_size = 100
        self.set_features_numeric = dict()
        self.set_features_text = dict()

    def prepare_data_fit(self, tokens, tags, chunk_size, overlap=10):
        # NOTE: overlap is currently unused.
        text_list = []
        first_of_p_list = []
        tag_list = []
        buffer_text = []
        buffer_first_of_p = []
        buffer_tag = []
        text_features = {"token"}            # currently unused
        numeric_features = {"first_of_p"}    # currently unused
        self.set_features_numeric = dict()
        for doc, doc_tags in zip(tokens, tags):
            for token, tag in zip(doc, doc_tags):
                # `agregado` is the external per-token feature extractor.
                features = agregado(token, simple_features=True)
                buffer_text.append(features['token'])
                buffer_first_of_p.append('2' if features['first_of_p'] else '1')
                buffer_tag.append(tag)
                if len(buffer_text) >= chunk_size:
                    text_list.append(buffer_text)
                    first_of_p_list.append(buffer_first_of_p)
                    tag_list.append(buffer_tag)
                    # Reset the buffers
                    buffer_text = []
                    buffer_first_of_p = []
                    buffer_tag = []
            print("Processed doc")
        if len(buffer_text) > 0:
            # Flush the last, possibly shorter, chunk
            text_list.append(buffer_text)
            first_of_p_list.append(buffer_first_of_p)
            tag_list.append(buffer_tag)
        results = (text_list, first_of_p_list)
        return results, tag_list

    def prepare_data_predict(self, tokens, chunk_size):
        text_list = []
        first_of_p_list = []
        buffer_text = []
        buffer_first_of_p = []
        for token in tokens:
            features = agregado(token, simple_features=True)
            buffer_text.append(features['token'])
            buffer_first_of_p.append('2' if features['first_of_p'] else '1')
            if len(buffer_text) >= chunk_size:
                text_list.append(buffer_text)
                first_of_p_list.append(buffer_first_of_p)
                # Reset the buffers
                buffer_text = []
                buffer_first_of_p = []
        if len(buffer_text) > 0:
            text_list.append(buffer_text)
            first_of_p_list.append(buffer_first_of_p)
        return (text_list, first_of_p_list)

    def train(self, tokens, tags):
        x, y = self.prepare_data_fit(tokens, tags, chunk_size=self.chunk_size)
        text_embedding = BareEmbedding(task=kashgari.LABELING,
                                       sequence_length=self.chunk_size)
        first_of_p_embedding = NumericFeaturesEmbedding(
            feature_count=2,
            feature_name='first_of_p',
            sequence_length=self.chunk_size)
        stack_embedding = StackedEmbedding(
            [text_embedding, first_of_p_embedding])
        stack_embedding.analyze_corpus(x, y)
        self.model = BiLSTM_CRF_Model(embedding=stack_embedding)
        self.model.fit(x, y, batch_size=1, epochs=20)

    def predict(self, tokens):
        results = []
        for doc in tokens:
            x = self.prepare_data_predict(doc, chunk_size=self.chunk_size)
            predicted = self.model.predict(x)
            x_list = list(itertools.chain.from_iterable(x[0]))
            predicted_unified = list(itertools.chain.from_iterable(predicted))
            # Drop the padding positions of the final chunk
            predicted_truncated = predicted_unified[:len(doc)]
            print(f"len doc: {len(doc)} | len x_list: {len(x_list)} | "
                  f"len predicted_unified: {len(predicted_unified)} | "
                  f"len predicted_truncated: {len(predicted_truncated)}")
            results.append(predicted_truncated)
        return results
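The chunk/flatten/truncate round-trip in `predict` above is easy to get wrong, so here is a minimal self-contained sketch of it. The `chunk` helper and the constant-"O" tagger are hypothetical stand-ins for `prepare_data_predict` and `self.model.predict`, used only to show that truncation restores one tag per original token:

import itertools

def chunk(tokens, chunk_size):
    # Hypothetical helper mirroring prepare_data_predict's chunking
    return [tokens[i:i + chunk_size] for i in range(0, len(tokens), chunk_size)]

doc = [f"tok{i}" for i in range(7)]
chunks = chunk(doc, chunk_size=3)             # [[tok0..2], [tok3..5], [tok6]]
predicted = [["O"] * len(c) for c in chunks]  # stand-in for self.model.predict
flat = list(itertools.chain.from_iterable(predicted))
assert flat[:len(doc)] == ["O"] * len(doc)    # one tag per original token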
text = [[0.9, 0.1, 0.1], [0.9, 0.1, 0.1], [0.1, 0.8, 0.1],
        [0.1, 0.8, 0.1], [0.1, 0.8, 0.1]]
label = ['B-Category', 'I-Category',
         'B-ProjectName', 'I-ProjectName', 'I-ProjectName']
text_list = [text] * 100
label_list = [label] * 100

SEQUENCE_LEN = 80

# You could use WordEmbedding or BERTEmbedding for text input instead;
# DirectEmbedding feeds the raw per-token vectors straight to the model.
bare_embedding = DirectEmbedding(task=kashgari.RAW_LABELING,
                                 sequence_length=SEQUENCE_LEN,
                                 embedding_size=3)
# bare_embedding = BareEmbedding(task=kashgari.LABELING, sequence_length=SEQUENCE_LEN)

x = text_list
y = label_list
bare_embedding.analyze_corpus(x, y)

# We can build any labeling model on top of this embedding
from kashgari.tasks.labeling import BiLSTM_CRF_Model

model = BiLSTM_CRF_Model(embedding=bare_embedding)
model.fit(x, y, batch_size=1, epochs=3)
print(model.predict(x))
# print(model.predict_entities(x))
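For intuition on what a direct (identity) embedding consumes, here is a sketch, using numpy only, of padding raw per-token feature vectors out to the fixed `SEQUENCE_LEN`. The zero-padding scheme is an assumption for illustration, not necessarily what `DirectEmbedding` does internally:

import numpy as np

SEQUENCE_LEN, EMBED_SIZE = 80, 3
seq = np.array([[0.9, 0.1, 0.1], [0.1, 0.8, 0.1]])   # two raw token vectors
padded = np.zeros((SEQUENCE_LEN, EMBED_SIZE), dtype=np.float32)
padded[:len(seq)] = seq   # tokens first, zeros for the remaining positions
print(padded.shape)       # (80, 3)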
class BertProsody:
    """Prosody prediction. Currently only supports a sequence length of 50:
    49 input characters plus the end-of-sequence token.
    """

    def __init__(self):
        self.model, self.model_dir, self.model_path = None, None, None
        self.sess = None

    def initial_model(self, bert_model_path, psd_model_path):
        print('=============init bert model=========================')
        print("bert model path:", bert_model_path)
        print("crf model path:", psd_model_path)
        self.sess = tf.Session()
        set_session(self.sess)
        self.model_dir = os.path.dirname(os.path.dirname(psd_model_path))
        self.model_path = psd_model_path
        data_path = os.path.join(self.model_dir, "feature_psd.pkl")
        train_data, train_label, test_data, test_label = \
            pickle.load(open(data_path, 'rb'))
        bert_embed = BERTEmbedding(bert_model_path,
                                   task=kashgari.LABELING,
                                   sequence_length=50)
        self.model = BiLSTM_CRF_Model(bert_embed)
        self.model.build_model(x_train=train_data, y_train=train_label,
                               x_validate=test_data, y_validate=test_label)
        self.model.compile_model()
        self.model.tf_model.load_weights(psd_model_path)
        print('=============bert model loaded=========================')

    def _write_dict(self):
        label_path = os.path.join(self.model_dir, "idx2label.txt")
        with open(label_path, "w", encoding="utf-8") as fw:
            for key, value in self.model.embedding.label2idx.items():
                fw.write("{} {}\n".format(value, key))
        token_path = os.path.join(self.model_dir, "token2idx.txt")
        with open(token_path, "w", encoding="utf-8") as fw:
            for key, value in self.model.embedding.token2idx.items():
                if len(key) > 0:
                    fw.write("{} {}\n".format(key, value))

    def predict(self, sentence_list):
        """Predict prosody from sentences; the input is split on punctuation."""
        bert_input = []
        for sent in sentence_list:
            assert len(sent) < 50
            bert_input.append([c for c in sent])
        print("bert-input:", bert_input)
        prosody = self.model.predict(bert_input)
        return prosody

    def compute_embed(self, sentence_list):
        import numpy as np
        bert_input = [[c for c in sent] for sent in sentence_list]
        print("bert-input:", bert_input)
        tensor = self.model.embedding.process_x_dataset(bert_input)
        res = self.model.tf_model.predict(tensor)
        print("debug:", np.shape(res), res[0])
        return tensor

    def save_pb(self):
        self._write_dict()
        pb_dir = os.path.join(self.model_dir, "pb")
        os.makedirs(pb_dir, exist_ok=True)
        # [print(n.name) for n in tf.get_default_graph().as_graph_def().node]
        h5_to_pb(self.model.tf_model, pb_dir, self.sess,
                 "model_psd.pb", ["output_psd"])

    @staticmethod
    def change_by_rules(old_pairs):
        """Forced rules:
        1. The prosody before a comma is #3; before a full stop it is #4.
        2. Everywhere else, #3 -> #2.
        """
        new_pairs = []
        for i, (char, ph, psd) in enumerate(old_pairs[0:-1]):
            next_char, _, _ = old_pairs[i + 1]
            if next_char == ",":
                new_pairs.append((char, ph, "3"))
            elif next_char in ["。", "?", "!"]:
                new_pairs.append((char, ph, "4"))
            else:
                if psd == "3":
                    new_pairs.append((char, ph, "2"))
                else:
                    new_pairs.append((char, ph, psd))
        new_pairs.append(old_pairs[-1])
        return new_pairs
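A quick hypothetical call of `change_by_rules` with made-up (char, phone, prosody) triples shows both rules in action:

pairs = [("今", "jin1", "1"), ("天", "tian1", "3"), (",", "sil", "0"),
         ("好", "hao3", "3"), ("。", "sil", "0")]
print(BertProsody.change_by_rules(pairs))
# "天" precedes the comma, so its prosody is forced to "3";
# "好" precedes the full stop, so it becomes "4";
# any other "3" would be demoted to "2".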
class BertPolyPhone:
    """Main class for pinyin (polyphone) prediction."""

    def __init__(self):
        super().__init__()
        self.poly_dict = dict()
        poly_dict_path = "/data1/liufeng/synthesis/frontend/data/simple_poly_dict"
        for line in read_lines(poly_dict_path):
            line = line.replace(" ", "").replace("*", "")
            key = line.split(":")[0]
            value = line.split(":")[1].split(",")
            self.poly_dict[key] = value
        self.model, self.model_dir = None, None
        self.sess = None

    def initialize_model(self, bert_model_path, poly_model_path):
        print('=============init phone model=========================')
        print("bert model path:", bert_model_path)
        print("crf model path:", poly_model_path)
        # The training-data path is needed to build the dictionaries.
        self.sess = tf.Session()
        set_session(self.sess)
        self.model_dir = os.path.dirname(os.path.dirname(poly_model_path))
        data_path = os.path.join(self.model_dir, "feature.pkl")
        train_data, train_label, test_data, test_label = \
            pickle.load(open(data_path, 'rb'))
        bert_embed = BERTEmbedding(bert_model_path,
                                   task=kashgari.LABELING,
                                   sequence_length=50)
        self.model = BiLSTM_CRF_Model(bert_embed)
        self.model.build_model(x_train=train_data, y_train=train_label,
                               x_validate=test_data, y_validate=test_label)
        self.model.compile_model()
        self.model.tf_model.load_weights(poly_model_path)
        print('=============successfully loaded=========================')

    def _lookup_dict(self, bert_result, pred_ph_pairs):
        """Correct the pinyin by dictionary lookup."""
        # todo: if the word is in the dictionary, do not use the BERT result.
        bert_phone_result = []
        for index_c, (char, ph, _) in enumerate(pred_ph_pairs):
            if char in self.poly_dict.keys():
                # If the BERT prediction is not in the polyphone dictionary,
                # the prediction has gone off the rails; keep the original.
                if bert_result[index_c] not in self.poly_dict[char]:
                    bert_phone_result.append((char, ph))
                else:
                    bert_result[index_c] = split_phone_format(bert_result[index_c])
                    bert_phone_result.append((char, bert_result[index_c]))
                    if ph != bert_result[index_c]:
                        print("using bert result {}:{} instead of {}".format(
                            char, bert_result[index_c], ph))
            else:
                bert_phone_result.append((char, ph))
        return bert_phone_result

    def predict(self, sentence_list):
        """Predict pronunciations from sentences; the input is split on punctuation."""
        bert_input = []
        for sent in sentence_list:
            assert len(sent) < 50
            bert_input.append([c for c in sent])
        print("bert-input:", bert_input)
        return self.model.predict(bert_input)

    def save_pb(self):
        self._write_dict()
        pb_dir = os.path.join(self.model_dir, "pb")
        os.makedirs(pb_dir, exist_ok=True)
        h5_to_pb(self.model.tf_model, pb_dir, self.sess,
                 "model_phone.pb", ["output_phone"])

    def _write_dict(self):
        label_path = os.path.join(self.model_dir, "pb/phone_idx2label.txt")
        with open(label_path, "w", encoding="utf-8") as fw:
            for key, value in self.model.embedding.label2idx.items():
                fw.write("{} {}\n".format(value, key))
        print("write {}".format(label_path))
        token_path = os.path.join(self.model_dir, "pb/phone_token2idx.txt")
        with open(token_path, "w", encoding="utf-8") as fw:
            for key, value in self.model.embedding.token2idx.items():
                if len(key) > 0:
                    fw.write("{} {}\n".format(key, value))
        print("write {}".format(token_path))

    def compute_embed(self, sentence_list):
        import numpy as np
        bert_input = [[c for c in sent] for sent in sentence_list]
        print("bert-input:", bert_input)
        tensor = self.model.embedding.process_x_dataset(bert_input)
        print("debug:", np.shape(tensor), tensor)
        res = self.model.tf_model.predict(tensor)
        print("debug:", np.shape(res), res[0][0:len(sentence_list[0]) + 1])
        return tensor

    @staticmethod
    def _merge_eng_char(bert_phone_result, dict_phone_pairs):
        from src.utils import check_all_chinese
        index = 0
        new_bert_phone = []
        for word, _, _ in dict_phone_pairs:
            if (not check_all_chinese(word)) and len(word) > 1:
                # A multi-character non-Chinese token (e.g. an English word)
                # occupies several BERT positions but contributes one result.
                new_bert_phone.append(bert_phone_result[index])
                index += len(word)
            else:
                new_bert_phone.append(bert_phone_result[index])
                index += 1
        return new_bert_phone

    def modify_result(self, bert_result, dict_phone_pairs):
        bert_result = self._merge_eng_char(bert_result, dict_phone_pairs)
        bert_phone_pairs = self._lookup_dict(bert_result, dict_phone_pairs)
        phone_pairs = bert_phone_pairs
        # phone_pairs = change_yi(phone_pairs)
        # phone_pairs = change_bu(phone_pairs)
        phone_pairs = sandhi(phone_pairs)
        bert_result = [ph for _, ph in phone_pairs]
        chars = "".join([c for c, _ in phone_pairs])
        bert_result = change_qingyin(bert_result, chars)
        return bert_result
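For reference, a minimal sketch of the `simple_poly_dict` entry format that `__init__` above assumes, with made-up entries (one `char:pinyin1,pinyin2,...` pair per line):

lines = ["和:he2,he4,huo2", "行:xing2,hang2"]   # illustrative entries
poly_dict = {}
for line in lines:
    line = line.replace(" ", "").replace("*", "")
    key, value = line.split(":")[0], line.split(":")[1].split(",")
    poly_dict[key] = value
print(poly_dict["和"])   # ['he2', 'he4', 'huo2']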
# or the GloVe-300 from http://nilc.icmc.usp.br/embeddings if this does not work out
# 2 - Figure out how to run predict. We have to preprocess the sentence the same way they do.
#     They use a PunktSentenceTokenizer with an abbrev_list. Those scripts are in the leNer-dataset folder.
# 3 - Figure out how to integrate this code with the current webstruct.
# 4 - It would be a good idea to have a Broka-like interface, so there would be a file list
#     that could be opened for re-training with Ramon's plugin.
#     One idea would even be to convert their current dataset to today's Broka HTML format
#     (it could be something simple, like putting each paragraph inside a p tag).
# 5 - Implement persistence (kashgari has save/load methods).
# 6 - Increase the number of epochs for training.

# You can use WordEmbedding or BERTEmbedding for your text embedding
text_embedding = BareEmbedding(task=kashgari.LABELING)
text_embedding.analyze_corpus(tokens, labels)

# We can build any labeling model with this embedding
from kashgari.tasks.labeling import BiLSTM_CRF_Model

model = BiLSTM_CRF_Model(embedding=text_embedding)
model.fit(tokens, labels, batch_size=8, epochs=10)
print(model.predict(tokens))
# print(model.predict_entities(tokens))
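A sketch for TODO item 5 above, assuming the kashgari 1.x persistence API (`model.save` writes a directory, `kashgari.utils.load_model` restores it); the path is illustrative:

from kashgari.utils import load_model

model.save("ner_model")        # writes the model directory
loaded = load_model("ner_model")
print(loaded.predict(tokens))  # should match the predictions above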