def build_embedding(self):
    sample_w2v_path = get_file('sample_w2v.txt',
                               "http://s3.bmio.net/kashgari/sample_w2v.txt",
                               cache_dir=DATA_PATH)
    embedding = WordEmbedding(sample_w2v_path)
    return embedding
def test_w2v_model(self):
    x, y = NERCorpus.load_corpus()
    w2v_embedding = WordEmbedding(sample_w2v_path, task=kashgari.LABELING)
    model = self.model_class(embedding=w2v_embedding)
    try:
        model.fit(x, y, x, y, epochs=1)
        model.evaluate(x, y)
        assert True
    except Exception as e:
        print(model.label2idx)
        raise e
def test_with_word_embedding(self):
    w2v_embedding = WordEmbedding(TestMacros.w2v_path)
    model = self.TASK_MODEL_CLASS(embedding=w2v_embedding,
                                  sequence_length=120)
    train_x, train_y = TestMacros.load_labeling_corpus()
    valid_x, valid_y = train_x, train_y

    model.fit(train_x,
              train_y,
              x_validate=valid_x,
              y_validate=valid_y,
              epochs=self.EPOCH_COUNT)
def test_variable_length_model(self):
    x, y = NERCorpus.load_corpus('custom_2')
    hyper_params = self.model_class.get_default_hyper_parameters()

    # Nudge every integer hyper-parameter so the variable-length model is
    # built with non-default settings.
    for layer, config in hyper_params.items():
        for key, value in config.items():
            if isinstance(value, int):
                hyper_params[layer][key] = value + 15

    w2v_embedding_variable_len = WordEmbedding(sample_w2v_path,
                                               task=kashgari.LABELING,
                                               sequence_length='variable')
    model = self.model_class(embedding=w2v_embedding_variable_len,
                             hyper_parameters=hyper_params)
    try:
        model.fit(x, y, epochs=1)
        model.evaluate(x, y)
        assert True
    except Exception as e:
        print(model.label2idx)
        raise e
@classmethod
def setUpClass(cls):
    cls.EPOCH_COUNT = 1
    cls.TASK_MODEL_CLASS = BiLSTM_Model
    cls.w2v_embedding = WordEmbedding(TestMacros.w2v_path)
import kashgari
from kashgari.embeddings import WordEmbedding

# The `task` argument tells the embedding which downstream task it feeds;
# if the embedding is only used for feature extraction, just set
# `task=kashgari.CLASSIFICATION`.
embedding = WordEmbedding('sgns.sogou.word', sequence_length=600)

# call for bulk embed: embed() takes a list of tokenized sentences
embed_tensor = embedding.embed([['语', '言', '模', '型']])
print(embed_tensor)
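# A small follow-up sketch. Assumptions: embed() returns a numpy array shaped
# (batch, sequence_length, embedding_size), and the 1.x embeddings also expose
# embed_one() for a single tokenized sentence -- verify both against the
# installed version before relying on them.
sentences = [
    list('语言学是一门科学'),
    list('自然语言处理'),
]
batch_tensor = embedding.embed(sentences)              # bulk embed
print(batch_tensor.shape)

single_tensor = embedding.embed_one(list('语言模型'))  # single-sentence embed
print(single_tensor.shape)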
import logging

import kashgari
from kashgari.corpus import SMP2018ECDTCorpus
from kashgari.embeddings import BareEmbedding
from kashgari.embeddings import WordEmbedding
from kashgari.macros import DATA_PATH
from tensorflow.python.keras.utils import get_file

valid_x, valid_y = SMP2018ECDTCorpus.load_data('valid')

bert_path = get_file('bert_sample_model',
                     "http://s3.bmio.net/kashgari/bert_sample_model.tar.bz2",
                     cache_dir=DATA_PATH,
                     untar=True)

sample_w2v_path = get_file('sample_w2v.txt',
                           "http://s3.bmio.net/kashgari/sample_w2v.txt",
                           cache_dir=DATA_PATH)

w2v_embedding = WordEmbedding(sample_w2v_path, task=kashgari.CLASSIFICATION)
w2v_embedding_variable_len = WordEmbedding(sample_w2v_path,
                                           task=kashgari.CLASSIFICATION,
                                           sequence_length='variable')

logging.basicConfig(level=logging.DEBUG)

sample_train_x = [
    list('语言学(英语:linguistics)是一门关于人类语言的科学研究'),
    list('语言学(英语:linguistics)是一门关于人类语言的科学研究'),
    list('语言学(英语:linguistics)是一门关于人类语言的科学研究'),
    list('语言学包含了几种分支领域。'),
    list('在语言结构(语法)研究与意义(语义与语用)研究之间存在一个重要的主题划分'),
]
sample_train_y = [['b', 'c'], ['a'], ['a', 'c'], ['a', 'b'], ['c']]
        tensor = embed_model.output
        for layer in layer_stack:
            tensor = layer(tensor)

        self.tf_model: keras.Model = keras.Model(embed_model.inputs, tensor)


if __name__ == "__main__":
    import logging
    logging.basicConfig(level='DEBUG')

    from kashgari.embeddings import WordEmbedding

    w2v_path = '/Users/brikerman/Desktop/nlp/language_models/w2v/sgns.weibo.bigram-char'
    w2v = WordEmbedding(w2v_path, w2v_kwargs={'limit': 10000})

    from kashgari.corpus import SMP2018ECDTCorpus

    x, y = SMP2018ECDTCorpus.load_data()
    model = BiLSTM_Model(embedding=w2v)
    model.fit(x, y)

    # Or integrate CorpusGenerator to implement your own data iterator
    # train_gen = CorpusGenerator()
    # model.fit_generator(train_gen=train_gen,
    #                     valid_gen=valid_gen,
    #                     batch_size=batch_size,
    #                     epochs=epochs)
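    # A hedged expansion of the commented hint above. Assumptions: the
    # installed version provides kashgari.generators.CorpusGenerator wrapping
    # the (x, y) lists directly, and fit_generator accepts the generators
    # positionally -- check the signatures before relying on this sketch.
    from kashgari.generators import CorpusGenerator

    train_gen = CorpusGenerator(x, y)
    valid_gen = CorpusGenerator(x, y)
    model.fit_generator(train_gen, valid_gen, batch_size=64, epochs=1)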
import unittest

import numpy as np

import kashgari
from kashgari.corpus import ChineseDailyNerCorpus
from kashgari.embeddings import WordEmbedding
from kashgari.tasks.labeling import CNN_LSTM_Model
from kashgari.macros import DATA_PATH
from tensorflow.python.keras.utils import get_file

valid_x, valid_y = ChineseDailyNerCorpus.load_data('valid')

sample_w2v_path = get_file('sample_w2v.txt',
                           "http://s3.bmio.net/kashgari/sample_w2v.txt",
                           cache_dir=DATA_PATH)
w2v_embedding = WordEmbedding(sample_w2v_path, task=kashgari.LABELING)
w2v_embedding_variable_len = WordEmbedding(sample_w2v_path,
                                           task=kashgari.LABELING,
                                           sequence_length='variable')


class TestCNN_LSTM_Model(unittest.TestCase):
    @classmethod
    def setUpClass(cls):
        cls.model_class = CNN_LSTM_Model

    def test_basic_use_build(self):
        model = self.model_class()
        model.fit(valid_x, valid_y, valid_x, valid_y, epochs=1)
        model.predict_entities(valid_x[:5])
        model.evaluate(valid_x[:100], valid_y[:100])
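    # A hedged companion sketch (the method name is illustrative, not from the
    # original suite): it exercises the pre-trained `w2v_embedding` loaded at
    # module level, mirroring the default-embedding test above.
    def test_w2v_use_build(self):
        model = self.model_class(embedding=w2v_embedding)
        model.fit(valid_x, valid_y, valid_x, valid_y, epochs=1)
        model.evaluate(valid_x[:100], valid_y[:100])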
# print(f"train data count: {len(train_x)}") # print(f"validate data count: {len(valid_x)}") # print(f"test data count: {len(test_x)}") from kashgari.embeddings import WordEmbedding from kashgari.embeddings import BareEmbedding from kashgari.embeddings import BERTEmbedding from kashgari.tasks.labeling import BiLSTM_CRF_Model_Attention from kashgari.tasks.labeling import CNN_BiLSTM_CRF_Model_WordSegmentation from kashgari.tasks.labeling import BiLSTM_CRF_Model from kashgari.tasks.labeling import BiLSTM_LSTMDecoder_Model from kashgari.tasks.labeling import BiLSTM_CRF_Model_Position from kashgari import callbacks_word # bare_embed = BareEmbedding(task=kashgari.LABELING,sequence_length=500) char_embed = WordEmbedding(w2v_path="/home/y182235017/law/sgns.target.word-character.char1-2.dynwin5.thr10.neg5.dim300.iter5",task=kashgari.LABELING,sequence_length=500) # bert_embed = BERTEmbedding("/home/y182235017/law/chinese_L-12_H-768_A-12",task=kashgari.LABELING,sequence_length=500) model = CNN_BiLSTM_CRF_Model_WordSegmentation(char_embed) mycallback = callbacks_word.EvalCallBack(model,test_x,test_y,batch_size=128,path="/home/y182235017/law/model/Word_CNN_BiLSTM_CRF_Model_seg/") mycallback={"callbacks":[mycallback]} model.fit_without_generator_word( train_x, train_y, train_z, x_validate=test_x, y_validate=test_y, z_validate=test_z, epochs=20, batch_size=128, **mycallback)