示例#1
0
    def test_classification_eval_callback(self):
        """Fit a BLSTMModel for one epoch with an EvalCallBack attached."""
        x_train, y_train = SMP2018ECDTCorpus.load_data()
        x_test, y_test = SMP2018ECDTCorpus.load_data('test')

        # Cap the training set so the smoke test stays fast.
        x_train, y_train = x_train[:1000], y_train[:1000]

        model = BLSTMModel()
        evaluator = callbacks.EvalCallBack(model, x_test, y_test, step=1)
        model.fit(x_train, y_train, callbacks=[evaluator], epochs=1)
示例#2
0
    def test_load_data(self):
        """Each SMP2018ECDTCorpus subset loads as paired, non-empty x/y lists."""
        # Train split (default subset).
        train_x, train_y = SMP2018ECDTCorpus.load_data()
        assert len(train_x) == len(train_y)
        assert len(train_x) > 0
        # Inputs and labels must not be the same sequence.
        assert train_x[:5] != train_y[:5]

        # Test and valid splits only need to be non-empty and aligned.
        for subset in ('test', 'valid'):
            xs, ys = SMP2018ECDTCorpus.load_data(subset)
            assert len(xs) == len(ys)
            assert len(xs) > 0
    def get_labels_info(self):
        """Collect label metadata from the SMP2018 ECDT training split.

        Returns:
            tuple: ``(label_map, num_labels, intent_class_weights)`` where
                ``label_map`` maps each label string to its integer index,
                ``num_labels`` is the count of distinct labels, and
                ``intent_class_weights`` holds 'balanced' class weights
                computed over every label occurrence.
        """
        label_map = {}
        all_labels = []  # every occurrence, used for class-weight computation
        lines = self.SMP2018ECDTCorpus2lines(
            SMP2018ECDTCorpus.load_data(subset_name='train', shuffle=True))

        # Each line is (label, text); only the label is needed here.
        for line in lines:
            all_labels.append(line[0])

        # Distinct labels in deterministic (sorted) order.
        labels = sorted(set(all_labels))
        num_labels = len(labels)

        # Keyword arguments keep this call working across sklearn versions
        # (positional use of compute_class_weight was deprecated, then removed).
        intent_class_weights = class_weight.compute_class_weight(
            class_weight='balanced',
            classes=labels,
            y=all_labels)

        # Persist the label -> index mapping for later inference runs.
        label_map_file = os.path.join('output', "label_map.txt")
        with tf.gfile.GFile(label_map_file, "w") as writer:
            for i, label in enumerate(labels):
                label_map[label] = i
                writer.write("{}:{}\n".format(i, label))

        return label_map, num_labels, intent_class_weights
示例#4
0
    def test_embed(self):
        """Embedding output shapes for both classification and labeling tasks."""
        single = ['我', '想', '看']
        batch = [
            ['我', '想', '看'],
            ['我', '想', '看', '权力的游戏'],
            ['Hello', 'world']
        ]

        # Classification task: default sequence length of 15.
        embedding = self.embedding_class(task=kashgari.CLASSIFICATION,
                                         **self.config)
        valid_x, valid_y = SMP2018ECDTCorpus.load_data('valid')
        embedding.analyze_corpus(valid_x, valid_y)

        assert embedding.embed_one(single).shape == (15, 50257)
        assert embedding.embed(batch).shape == (3, 15, 50257)

        # Labeling task: explicit sequence length of 10.
        embedding = self.embedding_class(task=kashgari.LABELING,
                                         sequence_length=10,
                                         **self.config)
        valid_x, valid_y = ChineseDailyNerCorpus.load_data('valid')
        embedding.analyze_corpus(valid_x, valid_y)

        assert embedding.embed_one(single).shape == (10, 50257)
        assert embedding.embed(batch).shape == (3, 10, 50257)
示例#5
0
    def test_basic_use(self):
        """Smoke test: fit, predict, save, reload, and re-evaluate a task model."""
        model = self.TASK_MODEL_CLASS(sequence_length=20)
        train_x, train_y = SMP2018ECDTCorpus.load_data()
        # Validation set reuses the training data; this is only a smoke test.
        valid_x, valid_y = train_x, train_y

        model.fit(train_x,
                  train_y,
                  x_validate=valid_x,
                  y_validate=valid_y,
                  epochs=self.EPOCH_COUNT)

        # Unique temp path per run so parallel test runs don't collide.
        model_path = os.path.join(tempfile.gettempdir(), str(time.time()))
        original_y = model.predict(train_x[:20])
        model.save(model_path)

        # Make sure softmax is used as the output activation
        # (the old comment claimed sigmoid, contradicting the assertion below).
        assert model.tf_model.layers[-1].activation.__name__ == 'softmax'

        del model
        new_model = self.TASK_MODEL_CLASS.load_model(model_path)
        new_model.tf_model.summary()
        # The reloaded model must reproduce the original predictions exactly.
        new_y = new_model.predict(train_x[:20])
        assert new_y == original_y

        report = new_model.evaluate(valid_x, valid_y)
        for key in ['precision', 'recall', 'f1-score', 'support', 'detail']:
            assert key in report

        # The reloaded model must keep the softmax output activation.
        assert new_model.tf_model.layers[-1].activation.__name__ == 'softmax'
    def test_init_with_processor(self):
        """An embedding built on a pre-analyzed processor honors sequence_length."""
        x, y = SMP2018ECDTCorpus.load_data('valid')

        proc = ClassificationProcessor()
        proc.analyze_corpus(x, y)

        embedding = self.embedding_class(sequence_length=20,
                                         processor=proc,
                                         **self.config)
        embedding.analyze_corpus(x, y)

        shape = embedding.embed_one(['我', '想', '看']).shape
        assert shape == (20, self.embedding_size)
示例#7
0
    def run_with_model_class(self, model_class: Type[ABCClassificationModel], epochs: int):
        """Train model_class on SMP2018 with BERT embeddings; return the test report."""
        bert_path = get_bert_path()

        train_x, train_y = SMP2018ECDTCorpus.load_data('train')
        valid_x, valid_y = SMP2018ECDTCorpus.load_data('valid')
        test_x, test_y = SMP2018ECDTCorpus.load_data('test')

        bert_embed = BertEmbedding(bert_path)
        model = model_class(bert_embed)

        # Stream evaluation metrics to TensorBoard under a per-model directory.
        log_path = os.path.join(log_root, model_class.__name__)
        metrics_writer = tf.summary.create_file_writer(log_path + "/metrics")
        metrics_writer.set_as_default()

        # Renamed from `callbacks` to avoid shadowing the common module name.
        fit_callbacks = [EvalCallBack(model, test_x, test_y, step=1)]
        model.fit(train_x, train_y, valid_x, valid_y, epochs=epochs, callbacks=fit_callbacks)

        report = model.evaluate(test_x, test_y)
        # Drop heavyweight objects before returning to free memory eagerly.
        del model
        del bert_embed
        return report
示例#8
0
    def test_with_model(self):
        """Build, summarize, fit, and save a BiGRU model on the test split."""
        x, y = SMP2018ECDTCorpus.load_data('test')
        embedding = self.build_embedding()

        model = BiGRU_Model(embedding=embedding)
        model.build_model(x, y)

        # Capture the embed-model summary lines instead of printing them.
        summary_lines = []
        embedding.embed_model.summary(print_fn=summary_lines.append)
        logger.debug('\n'.join(summary_lines))

        model.fit(x, y, epochs=1)

        save_path = os.path.join(tempfile.gettempdir(), str(time.time()))
        model.save(save_path)
示例#9
0
    def test_init_with_processor(self):
        """Embedding with an external processor yields the expected embed shape."""
        x, y = SMP2018ECDTCorpus.load_data('valid')

        proc = ClassificationProcessor()
        proc.analyze_corpus(x, y)
        if self.embedding_class is BareEmbedding:
            self.config['embedding_size'] = 55

        embedding = self.embedding_class(sequence_length=20,
                                         processor=proc,
                                         **self.config)

        # BERT's effective length differs from the requested one — presumably
        # due to reserved special-token positions; confirm against BERTEmbedding.
        expected_len = 16 if self.embedding_class is BERTEmbedding else 20

        result = embedding.embed_one(['我', '想', '看'])
        assert result.shape == (expected_len, embedding.embedding_size)
示例#10
0
    def test_variable_length_embed(self):
        """With sequence_length='variable', the output length tracks the input."""
        if self.embedding_class is BareEmbedding:
            self.config['embedding_size'] = 128

        embedding = self.embedding_class(task=kashgari.CLASSIFICATION,
                                         sequence_length='variable',
                                         **self.config)

        x, y = SMP2018ECDTCorpus.load_data('valid')
        embedding.analyze_corpus(x, y)

        # Single samples: output length equals token count.
        assert embedding.embed_one(['我', '想', '看'
                                    ]).shape == (3, embedding.embedding_size)
        assert embedding.embed_one(['Hello', 'World'
                                    ]).shape == (2, embedding.embedding_size)

        # Batches: padded to the longest sample (4 tokens here).
        batch = [['我', '想', '看'], ['我', '想', '看', '权力的游戏'],
                 ['Hello', 'world']]
        assert embedding.embed(batch).shape == (3, 4, embedding.embedding_size)
示例#11
0
    def test_with_word_embedding(self):
        """A word2vec-backed model survives fit / predict / save / reload."""
        model = self.TASK_MODEL_CLASS(embedding=self.w2v_embedding)
        x, y = SMP2018ECDTCorpus.load_data()

        # Validation reuses the training data; this is only a smoke test.
        model.fit(x,
                  y,
                  x_validate=x,
                  y_validate=y,
                  epochs=self.EPOCH_COUNT)

        save_path = os.path.join(tempfile.gettempdir(), str(time.time()))
        _ = model.predict(x[:20])
        model.save(save_path)

        del model

        reloaded = self.TASK_MODEL_CLASS.load_model(save_path)
        reloaded.tf_model.summary()
        _ = reloaded.predict(x[:20])
示例#12
0
    def test_base_cases(self):
        """Embed a random sample batch; verify shape before and after a to_dict round trip."""
        embedding = self.build_embedding()
        x, y = SMP2018ECDTCorpus.load_data()
        processor = SequenceProcessor()
        processor.build_vocab(x, y)
        embedding.setup_text_processor(processor)

        samples = random.sample(x, sample_count)
        result = embedding.embed(samples)

        # +2 presumably accounts for special start/end tokens —
        # confirm against SequenceProcessor.
        expected_len = max(len(s) for s in samples) + 2
        if embedding.max_position is not None:
            expected_len = embedding.max_position

        expected_shape = (len(samples), expected_len, embedding.embedding_size)
        assert result.shape == expected_shape

        # Serialize, restore, and confirm the restored embedding matches.
        restored = load_data_object(embedding.to_dict())
        restored.setup_text_processor(processor)
        assert restored.embed(samples).shape == expected_shape
示例#13
0
        return x0, x1


if __name__ == "__main__":
    logging.basicConfig(level=logging.DEBUG)

    embedding = BERTEmbedding(
        task=kashgari.CLASSIFICATION,
        model_folder=
        '/Users/brikerman/.kashgari/embedding/bert/chinese_L-12_H-768_A-12',
        sequence_length=12)

    from kashgari.corpus import SMP2018ECDTCorpus

    test_x, test_y = SMP2018ECDTCorpus.load_data('valid')
    embedding.analyze_corpus(test_x, test_y)

    english_tokens = 'all work and no play makes'.split(' ')
    chinese_tokens = '你 好 啊'.split(' ')
    embedded = embedding.embed([english_tokens], True)

    # 101/102 are presumably the [CLS]/[SEP] ids in the BERT vocab; the
    # sequence is then zero-padded out to length 12.
    token_ids = embedding.process_x_dataset([['语', '言', '模', '型']])[0]
    expected_ids = [101, 6427, 6241, 3563, 1798, 102]
    expected_ids = expected_ids + [0] * (12 - len(expected_ids))
    assert list(token_ids[0]) == list(expected_ids)

    print(token_ids)
    print(embedded)
    print(embedded.shape)
示例#14
0
# file: test_processor.py
# time: 2019-05-23 17:02
import os
import time
import logging
import tempfile
import unittest
import numpy as np
import random
from kashgari import utils
from kashgari.processors import ClassificationProcessor, LabelingProcessor, ScoringProcessor
from kashgari.corpus import SMP2018ECDTCorpus, ChineseDailyNerCorpus
from kashgari.tasks.classification import BiGRU_Model

# Module-level fixtures shared by the processor tests.
# NOTE(review): the 'valid' splits are loaded here rather than 'train',
# presumably to keep test start-up fast — confirm with the suite's intent.
ner_train_x, ner_train_y = ChineseDailyNerCorpus.load_data('valid')
class_train_x, class_train_y = SMP2018ECDTCorpus.load_data('valid')

# Hand-written, character-tokenized sentences (with deliberate duplicates)
# for multi-label processor tests.
sample_train_x = [
    list('语言学(英语:linguistics)是一门关于人类语言的科学研究'),
    list('语言学(英语:linguistics)是一门关于人类语言的科学研究'),
    list('语言学(英语:linguistics)是一门关于人类语言的科学研究'),
    list('语言学包含了几种分支领域。'),
    list('在语言结构(语法)研究与意义(语义与语用)研究之间存在一个重要的主题划分'),
]

# Multi-label targets aligned one-to-one with sample_train_x (labels a/b/c).
sample_train_y = [['b', 'c'], ['a'], ['a', 'c'], ['a', 'b'], ['c']]

sample_eval_x = [
    list('语言学是一门关于人类语言的科学研究。'),
    list('语言学包含了几种分支领域。'),
    list('在语言结构研究与意义研究之间存在一个重要的主题划分。'),
示例#15
0
        for layer in layers_rnn:
            tensor_rnn = layer(tensor_rnn)
        tensor_sensors = [layer(tensor_rnn) for layer in layers_sensor]
        tensor_output = layer_allviews(tensor_sensors)
        for layer in layers_full_connect:
            tensor_output = layer(tensor_output)

        self.tf_model = tf.keras.Model(embed_model.inputs, tensor_output)


if __name__ == "__main__":
    print(BiLSTM_Model.get_default_hyper_parameters())
    logging.basicConfig(level=logging.DEBUG)
    from kashgari.corpus import SMP2018ECDTCorpus

    x, y = SMP2018ECDTCorpus.load_data()

    import kashgari
    from kashgari.processors.classification_processor import ClassificationProcessor
    from kashgari.embeddings import BareEmbedding

    # Single-label classification on a bare (randomly initialized) embedding.
    clf_processor = ClassificationProcessor(multi_label=False)
    bare_embedding = BareEmbedding(task=kashgari.CLASSIFICATION,
                                   sequence_length=30,
                                   processor=clf_processor)

    classifier = BiLSTM_Model(bare_embedding)
    classifier.fit(x, y, epochs=2)
    print(classifier.predict(x[:10]))
    print(classifier.predict_top_k_class(x[:10]))
示例#16
0
 def test_jieba_load(self):
     """The SMP2018 corpus loads with the jieba cutter and stays aligned."""
     x, y = SMP2018ECDTCorpus.load_data(cutter='jieba')
     assert len(x) == len(y)
     assert len(x) > 0
 def get_test_examples(self, _):
     """See base class. Build examples from the shuffled SMP2018 test split."""
     raw = SMP2018ECDTCorpus.load_data(subset_name='test', shuffle=True)
     lines = self.SMP2018ECDTCorpus2lines(raw)
     return self._create_examples(lines, 'test')
示例#18
0
                'return_sequences': False
            },
            'layer_dense': {
                'activation': 'linear'
            }
        }

    def build_model_arc(self):
        """Assemble the classifier graph: embedding -> BiLSTM -> dense output."""
        output_dim = self.processor.output_dim
        params = self.hyper_parameters
        embed_model = self.embedding.embed_model

        bi_lstm = L.Bidirectional(L.LSTM(**params['layer_bi_lstm']))
        dense = L.Dense(output_dim, **params['layer_dense'])

        self.tf_model = keras.Model(
            embed_model.inputs,
            dense(bi_lstm(embed_model.output)))


if __name__ == "__main__":
    from kashgari.corpus import SMP2018ECDTCorpus
    import numpy as np

    inputs, _ = SMP2018ECDTCorpus.load_data('valid')
    # Replace the labels with random 4-dimensional targets (scoring demo).
    targets = np.random.random((len(inputs), 4))

    scorer = BiLSTM_Model()
    scorer.fit(inputs, targets)
    print(scorer.predict(inputs[:10]))
示例#19
0
# Time    : 2020/9/3 7:23 下午
# File    : k_fold_evaluation.py
# Project : Kashgari

from sklearn.model_selection import StratifiedKFold
import numpy as np
from kashgari.corpus import SMP2018ECDTCorpus
from kashgari.tasks.classification import BiLSTM_Model

# fix random seed for reproducibility
# NOTE(review): this seeds NumPy only; TF/Keras weight init is not seeded here.
seed = 7
np.random.seed(seed)

# Combine all data for k-folding
# Merge train/valid/test so StratifiedKFold can re-split the full corpus.
train_x, train_y = SMP2018ECDTCorpus.load_data('train')
valid_x, valid_y = SMP2018ECDTCorpus.load_data('valid')
test_x, test_y = SMP2018ECDTCorpus.load_data('test')

X = train_x + valid_x + test_x
Y = train_y + valid_y + test_y

# define 10-fold cross validation test harness
# shuffle=True with a fixed random_state keeps the folds reproducible.
k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
scores = []  # one evaluation result per fold

for train_indexs, test_indexs in k_fold.split(X, Y):
    train_x, train_y = [], []
    test_x, test_y = [], []

    for i in train_indexs: