def test_classification_eval_callback(self):
    """Fit a BLSTM classifier for one epoch with an EvalCallBack attached."""
    x_train, y_train = SMP2018ECDTCorpus.load_data()
    x_test, y_test = SMP2018ECDTCorpus.load_data('test')

    # Keep the run fast: only a small slice of the training corpus.
    x_train = x_train[:1000]
    y_train = y_train[:1000]

    model = BLSTMModel()
    evaluator = callbacks.EvalCallBack(model, x_test, y_test, step=1)
    model.fit(x_train, y_train, callbacks=[evaluator], epochs=1)
def test_load_data(self):
    """Every subset of SMP2018ECDTCorpus loads non-empty, aligned x/y lists."""
    x_train, y_train = SMP2018ECDTCorpus.load_data()
    assert len(x_train) == len(y_train)
    assert len(x_train) > 0
    # Texts and labels must be different sequences.
    assert x_train[:5] != y_train[:5]

    for subset in ('test', 'valid'):
        xs, ys = SMP2018ECDTCorpus.load_data(subset)
        assert len(xs) == len(ys)
        assert len(xs) > 0
def get_labels_info(self):
    """Build the label vocabulary for the training split.

    Returns:
        tuple: ``(label_map, num_labels, intent_class_weights)`` where
            ``label_map`` maps each label (sorted ascending) to its index,
            ``num_labels`` is the count of distinct labels, and
            ``intent_class_weights`` are balanced class weights computed
            over every label occurrence (used to weight the intent loss).

    Side effects:
        Writes ``output/label_map.txt`` with one ``index:label`` line per label.
    """
    # One entry per sample; duplicates are needed for class-weight computation.
    all_labels = []
    lines = self.SMP2018ECDTCorpus2lines(
        SMP2018ECDTCorpus.load_data(subset_name='train', shuffle=True))
    for line in lines:
        # line is (label, text); only the label is needed here.
        all_labels.append(line[0])

    # Distinct labels in a deterministic (sorted) order.
    labels = sorted(set(all_labels))
    num_labels = len(labels)
    intent_class_weights = class_weight.compute_class_weight(
        'balanced', labels, all_labels)

    label_map = {}
    label_map_file = os.path.join('output', "label_map.txt")
    with tf.gfile.GFile(label_map_file, "w") as writer:
        for i, label in enumerate(labels):
            label_map[label] = i
            writer.write("{}:{}\n".format(i, label))
    return label_map, num_labels, intent_class_weights
def test_embed(self):
    """Embedding output shapes for both classification and labeling tasks."""
    sentences = [
        ['我', '想', '看'],
        ['我', '想', '看', '权力的游戏'],
        ['Hello', 'world'],
    ]

    embedding = self.embedding_class(task=kashgari.CLASSIFICATION,
                                     **self.config)
    valid_x, valid_y = SMP2018ECDTCorpus.load_data('valid')
    embedding.analyze_corpus(valid_x, valid_y)
    assert embedding.embed_one(sentences[0]).shape == (15, 50257)
    assert embedding.embed(sentences).shape == (3, 15, 50257)

    embedding = self.embedding_class(task=kashgari.LABELING,
                                     sequence_length=10,
                                     **self.config)
    valid_x, valid_y = ChineseDailyNerCorpus.load_data('valid')
    embedding.analyze_corpus(valid_x, valid_y)
    assert embedding.embed_one(sentences[0]).shape == (10, 50257)
    assert embedding.embed(sentences).shape == (3, 10, 50257)
def test_basic_use(self):
    """Train, save, reload and re-evaluate a classification model.

    Verifies that predictions and the output activation survive a
    save/load round trip and that evaluate() returns a full report.
    """
    model = self.TASK_MODEL_CLASS(sequence_length=20)
    train_x, train_y = SMP2018ECDTCorpus.load_data()
    valid_x, valid_y = train_x, train_y
    model.fit(train_x, train_y,
              x_validate=valid_x, y_validate=valid_y,
              epochs=self.EPOCH_COUNT)

    model_path = os.path.join(tempfile.gettempdir(), str(time.time()))
    original_y = model.predict(train_x[:20])
    model.save(model_path)

    # Single-label classification must end with a softmax activation.
    # (The original comment said "sigmoid", contradicting this assert.)
    assert model.tf_model.layers[-1].activation.__name__ == 'softmax'
    del model

    new_model = self.TASK_MODEL_CLASS.load_model(model_path)
    new_model.tf_model.summary()
    new_y = new_model.predict(train_x[:20])
    # The reloaded model must reproduce the original predictions exactly.
    assert new_y == original_y

    report = new_model.evaluate(valid_x, valid_y)
    for key in ['precision', 'recall', 'f1-score', 'support', 'detail']:
        assert key in report

    # The softmax activation must survive the save/load round trip.
    assert new_model.tf_model.layers[-1].activation.__name__ == 'softmax'
def test_init_with_processor(self):
    """Embedding built from a pre-fitted processor honours sequence_length."""
    valid_x, valid_y = SMP2018ECDTCorpus.load_data('valid')

    processor = ClassificationProcessor()
    processor.analyze_corpus(valid_x, valid_y)

    embedding = self.embedding_class(sequence_length=20,
                                     processor=processor,
                                     **self.config)
    embedding.analyze_corpus(valid_x, valid_y)

    vector = embedding.embed_one(['我', '想', '看'])
    assert vector.shape == (20, self.embedding_size)
def run_with_model_class(self, model_class: Type[ABCClassificationModel], epochs: int):
    """Train `model_class` on SMP2018 with BERT features; return the test report."""
    bert_path = get_bert_path()

    train_x, train_y = SMP2018ECDTCorpus.load_data('train')
    valid_x, valid_y = SMP2018ECDTCorpus.load_data('valid')
    test_x, test_y = SMP2018ECDTCorpus.load_data('test')

    bert_embed = BertEmbedding(bert_path)
    model = model_class(bert_embed)

    # Route per-step metrics to a TensorBoard writer named after the model.
    log_path = os.path.join(log_root, model_class.__name__)
    writer = tf.summary.create_file_writer(log_path + "/metrics")
    writer.set_as_default()

    fit_callbacks = [EvalCallBack(model, test_x, test_y, step=1)]
    model.fit(train_x, train_y, valid_x, valid_y,
              epochs=epochs, callbacks=fit_callbacks)
    report = model.evaluate(test_x, test_y)

    # Release the model and embedding before the caller starts the next run.
    del model
    del bert_embed
    return report
def test_with_model(self):
    """Embedding plugs into a BiGRU model: build, fit one epoch, save."""
    x, y = SMP2018ECDTCorpus.load_data('test')
    embedding = self.build_embedding()
    model = BiGRU_Model(embedding=embedding)
    model.build_model(x, y)

    # Capture the embed-model summary and emit it through the logger.
    summary_lines = []
    embedding.embed_model.summary(print_fn=summary_lines.append)
    logger.debug('\n'.join(summary_lines))

    model.fit(x, y, epochs=1)
    save_dir = os.path.join(tempfile.gettempdir(), str(time.time()))
    model.save(save_dir)
def test_init_with_processor(self):
    """Embedding accepts an externally prepared processor."""
    valid_x, valid_y = SMP2018ECDTCorpus.load_data('valid')

    processor = ClassificationProcessor()
    processor.analyze_corpus(valid_x, valid_y)

    # BareEmbedding needs an explicit embedding size for this test.
    if self.embedding_class is BareEmbedding:
        self.config['embedding_size'] = 55

    embedding = self.embedding_class(sequence_length=20,
                                     processor=processor,
                                     **self.config)

    # BERTEmbedding produces a different effective sequence length here
    # (16 instead of the requested 20) — presumably due to its internal
    # handling of sequence positions; verify against the embedding class.
    seq_len = 16 if self.embedding_class is BERTEmbedding else 20
    shape = embedding.embed_one(['我', '想', '看']).shape
    assert shape == (seq_len, embedding.embedding_size)
def test_variable_length_embed(self):
    """With sequence_length='variable' the output length follows the input."""
    if self.embedding_class is BareEmbedding:
        self.config['embedding_size'] = 128

    embedding = self.embedding_class(task=kashgari.CLASSIFICATION,
                                     sequence_length='variable',
                                     **self.config)
    valid_x, valid_y = SMP2018ECDTCorpus.load_data('valid')
    embedding.analyze_corpus(valid_x, valid_y)

    dim = embedding.embedding_size
    assert embedding.embed_one(['我', '想', '看']).shape == (3, dim)
    assert embedding.embed_one(['Hello', 'World']).shape == (2, dim)

    batch = [
        ['我', '想', '看'],
        ['我', '想', '看', '权力的游戏'],
        ['Hello', 'world'],
    ]
    # A batch is padded to its longest sample (4 tokens here).
    assert embedding.embed(batch).shape == (3, 4, dim)
def test_with_word_embedding(self):
    """A model built on a word2vec embedding survives a save/load cycle."""
    model = self.TASK_MODEL_CLASS(embedding=self.w2v_embedding)
    train_x, train_y = SMP2018ECDTCorpus.load_data()
    valid_x, valid_y = train_x, train_y
    model.fit(train_x, train_y,
              x_validate=valid_x, y_validate=valid_y,
              epochs=self.EPOCH_COUNT)

    save_path = os.path.join(tempfile.gettempdir(), str(time.time()))
    _ = model.predict(valid_x[:20])
    model.save(save_path)
    del model

    restored = self.TASK_MODEL_CLASS.load_model(save_path)
    restored.tf_model.summary()
    _ = restored.predict(valid_x[:20])
def test_base_cases(self):
    """Embedding output shape is (batch, max_len, dim) and survives to_dict/load."""
    embedding = self.build_embedding()
    x, y = SMP2018ECDTCorpus.load_data()

    processor = SequenceProcessor()
    processor.build_vocab(x, y)
    embedding.setup_text_processor(processor)

    samples = random.sample(x, sample_count)
    result = embedding.embed(samples)

    # Longest sample plus 2 extra positions, unless the embedding caps length.
    expected_len = max(len(sample) for sample in samples) + 2
    if embedding.max_position is not None:
        expected_len = embedding.max_position
    expected_shape = (len(samples), expected_len, embedding.embedding_size)
    assert result.shape == expected_shape

    # Serialize, rebuild, and confirm identical output geometry.
    rebuilt = load_data_object(embedding.to_dict())
    rebuilt.setup_text_processor(processor)
    assert rebuilt.embed(samples).shape == expected_shape
return x0, x1 if __name__ == "__main__": logging.basicConfig(level=logging.DEBUG) # bert_model_path = os.path.join(utils.get_project_path(), 'tests/test-data/bert') b = BERTEmbedding( task=kashgari.CLASSIFICATION, model_folder= '/Users/brikerman/.kashgari/embedding/bert/chinese_L-12_H-768_A-12', sequence_length=12) from kashgari.corpus import SMP2018ECDTCorpus test_x, test_y = SMP2018ECDTCorpus.load_data('valid') b.analyze_corpus(test_x, test_y) data1 = 'all work and no play makes'.split(' ') data2 = '你 好 啊'.split(' ') r = b.embed([data1], True) tokens = b.process_x_dataset([['语', '言', '模', '型']])[0] target_index = [101, 6427, 6241, 3563, 1798, 102] target_index = target_index + [0] * (12 - len(target_index)) assert list(tokens[0]) == list(target_index) print(tokens) print(r) print(r.shape)
# file: test_processor.py # time: 2019-05-23 17:02 import os import time import logging import tempfile import unittest import numpy as np import random from kashgari import utils from kashgari.processors import ClassificationProcessor, LabelingProcessor, ScoringProcessor from kashgari.corpus import SMP2018ECDTCorpus, ChineseDailyNerCorpus from kashgari.tasks.classification import BiGRU_Model ner_train_x, ner_train_y = ChineseDailyNerCorpus.load_data('valid') class_train_x, class_train_y = SMP2018ECDTCorpus.load_data('valid') sample_train_x = [ list('语言学(英语:linguistics)是一门关于人类语言的科学研究'), list('语言学(英语:linguistics)是一门关于人类语言的科学研究'), list('语言学(英语:linguistics)是一门关于人类语言的科学研究'), list('语言学包含了几种分支领域。'), list('在语言结构(语法)研究与意义(语义与语用)研究之间存在一个重要的主题划分'), ] sample_train_y = [['b', 'c'], ['a'], ['a', 'c'], ['a', 'b'], ['c']] sample_eval_x = [ list('语言学是一门关于人类语言的科学研究。'), list('语言学包含了几种分支领域。'), list('在语言结构研究与意义研究之间存在一个重要的主题划分。'),
for layer in layers_rnn: tensor_rnn = layer(tensor_rnn) tensor_sensors = [layer(tensor_rnn) for layer in layers_sensor] tensor_output = layer_allviews(tensor_sensors) for layer in layers_full_connect: tensor_output = layer(tensor_output) self.tf_model = tf.keras.Model(embed_model.inputs, tensor_output) if __name__ == "__main__": print(BiLSTM_Model.get_default_hyper_parameters()) logging.basicConfig(level=logging.DEBUG) from kashgari.corpus import SMP2018ECDTCorpus x, y = SMP2018ECDTCorpus.load_data() import kashgari from kashgari.processors.classification_processor import ClassificationProcessor from kashgari.embeddings import BareEmbedding processor = ClassificationProcessor(multi_label=False) embed = BareEmbedding(task=kashgari.CLASSIFICATION, sequence_length=30, processor=processor) m = BiLSTM_Model(embed) # m.build_model(x, y) m.fit(x, y, epochs=2) print(m.predict(x[:10])) # m.evaluate(x, y) print(m.predict_top_k_class(x[:10]))
def test_jieba_load(self):
    """The corpus also loads when tokenized with the jieba cutter."""
    xs, ys = SMP2018ECDTCorpus.load_data(cutter='jieba')
    assert len(xs) == len(ys)
    assert len(xs) > 0
def get_test_examples(self, _):
    """See base class."""
    data = SMP2018ECDTCorpus.load_data(subset_name='test', shuffle=True)
    lines = self.SMP2018ECDTCorpus2lines(data)
    return self._create_examples(lines, 'test')
'return_sequences': False }, 'layer_dense': { 'activation': 'linear' } } def build_model_arc(self): output_dim = self.processor.output_dim config = self.hyper_parameters embed_model = self.embedding.embed_model layer_bi_lstm = L.Bidirectional(L.LSTM(**config['layer_bi_lstm'])) layer_dense = L.Dense(output_dim, **config['layer_dense']) tensor = layer_bi_lstm(embed_model.output) output_tensor = layer_dense(tensor) self.tf_model = keras.Model(embed_model.inputs, output_tensor) if __name__ == "__main__": from kashgari.corpus import SMP2018ECDTCorpus import numpy as np x, y = SMP2018ECDTCorpus.load_data('valid') y = np.random.random((len(x), 4)) model = BiLSTM_Model() model.fit(x, y) print(model.predict(x[:10]))
# Time : 2020/9/3 7:23 下午 # File : k_fold_evaluation.py # Project : Kashgari from sklearn.model_selection import StratifiedKFold import numpy as np from kashgari.corpus import SMP2018ECDTCorpus from kashgari.tasks.classification import BiLSTM_Model # fix random seed for reproducibility seed = 7 np.random.seed(seed) # Combine all data for k-folding train_x, train_y = SMP2018ECDTCorpus.load_data('train') valid_x, valid_y = SMP2018ECDTCorpus.load_data('valid') test_x, test_y = SMP2018ECDTCorpus.load_data('test') X = train_x + valid_x + test_x Y = train_y + valid_y + test_y # define 10-fold cross validation test harness k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed) scores = [] for train_indexs, test_indexs in k_fold.split(X, Y): train_x, train_y = [], [] test_x, test_y = [], [] for i in train_indexs: