def train(self):
    """Train a CNN text classifier on car/train.csv, write test-set
    predictions to data/test_predict_bert_car.csv, and save the model.
    """
    x_items, train_y, valid_x, valid_y = self.read_message('car/train.csv')

    model = CNNModel()
    # Fit on the training split, validating against the held-out split.
    model.fit(x_items,
              train_y,
              valid_x,
              valid_y,
              batch_size=64,
              epochs=12,
              callbacks=[tf_board_callback])

    # Build the test inputs: column 0 is the sample id; tokenize the
    # concatenation of columns 1 and 2 with jieba.
    rows = pd.read_csv("car/test.csv", encoding='utf-8').values.tolist()
    test_data = []
    id_list = []
    for row in rows:
        test_data.append(jieba.lcut(str(row[1]) + str(row[2])))
        id_list.append(row[0])
    predict_answers = model.predict(x_data=test_data)

    # Write "<id>,<label>" lines. `with` guarantees the handle is closed
    # (the original left the output file open). str() before strip() also
    # avoids an AttributeError when the id column is numeric.
    with open("data/test_predict_bert_car.csv", 'w', encoding='utf-8') as out:
        for sample_id, answer in zip(id_list, predict_answers):
            out.write(str(sample_id).strip() + "," + str(answer) + "\n")
    model.save("../model/news-classification-bert-model")
# ---- Exemplo n.º 2 ----
 def train(self):
     """Train a BERT-embedding CNN classifier, persist it, then report
     accuracy on the training data (no held-out split here)."""
     # Load the corpus: texts and their labels.
     samples, labels = read_message()
     # BERT character embeddings with a fixed sequence length of 256.
     embedding = BERTEmbedding(self.bert_place, sequence_length=256)
     classifier = CNNModel(embedding)
     classifier.fit(samples,
                    labels,
                    epochs=200,
                    batch_size=32,
                    fit_kwargs={'callbacks': [tf_board_callback]})
     classifier.save("output/classification-model")
     # Evaluate on the same data used for training.
     classifier.evaluate(samples, labels)
def train():
    """Fit a CNN classifier, save it, then print a prediction per sample."""
    samples, labels = read_message()
    # NOTE(review): `bert` is not defined in this function — presumably a
    # module-level embedding; confirm it exists before calling.
    classifier = CNNModel(bert)
    classifier.fit(samples,
                   labels,
                   epochs=20,
                   class_weight=True,
                   fit_kwargs={'callbacks': [tf_board_callback]})
    classifier.save("../classification-model")
    # Predict each training sample individually and echo the result.
    for sample in samples:
        print("\n" + classifier.predict(sample))
    def train(self):
        """Train a BERT+CNN classifier with a validation split and return
        the evaluation result on the held-out set.
        """
        # NOTE(review): test.txt feeds the dev variables and dev.txt feeds
        # the test variables — confirm this swap is intentional.
        x_train, train_y = self.read_message('../data/西药执业药师/train.txt')
        x_dev, dev_y = self.read_message('../data/西药执业药师/test.txt')
        x_test, test_y = self.read_message('../data/西药执业药师/dev.txt')
        # Character-level BERT embedding; sequences capped at length 100.
        embedding = BERTEmbedding('bert-base-chinese', sequence_length=100)

        classifier = CNNModel(embedding)
        classifier.fit(x_train,
                       train_y,
                       x_dev,
                       dev_y,
                       epochs=20,
                       batch_size=128,
                       fit_kwargs={'callbacks': [tf_board_callback]})
        classifier.save("../classification-model")
        return classifier.evaluate(x_test, test_y)
# ---- Exemplo n.º 5 ----
import tqdm
import jieba
from kashgari.tasks.classification import CNNModel


def read_data_file(path):
    """Read a tab-separated label/text corpus into parallel lists.

    Each line is expected to be ``label<TAB>text`` (the text itself may
    contain further tabs). Malformed lines are skipped and printed.

    Args:
        path: path to a UTF-8 corpus file.

    Returns:
        (x_list, y_list): jieba-tokenized texts and their labels.
    """
    # `with` closes the handle even on error (the original leaked it).
    with open(path, 'r', encoding='utf-8') as corpus_file:
        lines = corpus_file.read().splitlines()
    x_list = []
    y_list = []
    for line in tqdm.tqdm(lines):
        rows = line.split('\t')
        if len(rows) >= 2:
            y_list.append(rows[0])
            # Re-join in case the text itself contained literal tabs.
            x_list.append(list(jieba.cut('\t'.join(rows[1:]))))
        else:
            print(rows)
    return x_list, y_list


# Load the three cnews splits; each file holds "label<TAB>text" lines.
test_x, test_y = read_data_file('cnews/cnews.test.txt')
train_x, train_y = read_data_file('cnews/cnews.train.txt')
val_x, val_y = read_data_file('cnews/cnews.val.txt')

# Train a CNN classifier with the validation split, score it on the test
# split, then persist the trained model.
model = CNNModel()
model.fit(train_x, train_y, val_x, val_y, batch_size=128)
result = model.evaluate(test_x, test_y)
model.save('model/kashgari/cnn')
# ---- Exemplo n.º 6 ----
        word2idx[k.BOS] = word2idx['pad']
        word2idx[k.EOS] = word2idx['pad']
        self.token2idx = word2idx

    def build_token2idx_dict(self,
                             x_data: List[TextSeqType],
                             min_count: int = 5):
        """No-op: word2vec embeddings ship a fixed vocabulary, so there is
        nothing to build from the corpus; kept for interface parity."""
        logging.debug("word2vec embedding no need to build token2idx with corpus")


if __name__ == '__main__':
    # Tiny demo corpus: three duplicated sentences labelled 'a', plus one
    # sentence each for 'b' and 'c'. Inputs are character lists.
    corpus = [
        '语言学(英语:linguistics)是一门关于人类语言的科学研究',
        '语言学(英语:linguistics)是一门关于人类语言的科学研究',
        '语言学(英语:linguistics)是一门关于人类语言的科学研究',
        '语言学包含了几种分支领域。',
        '在语言结构(语法)研究与意义(语义与语用)研究之间存在一个重要的主题划分',
    ]
    train_x = [list(sentence) for sentence in corpus]
    train_y = ['a', 'a', 'a', 'b', 'c']

    from kashgari.utils.logger import init_logger
    from kashgari.tasks.classification import CNNModel
    init_logger()
    # Local GPT-2 checkpoint, sequence length 10.
    embedding = GPT2Embedding(
        '/Users/brikerman/Desktop/python/gpt-2/models/117M', 10)
    r = embedding.embed(['hello', 'world'])
    model = CNNModel(embedding)
    model.fit(train_x, train_y, epochs=20)
    print(r.shape)
# ---- Exemplo n.º 7 ----
# TensorBoard logging; update_freq=10 writes metrics every 10 batches.
tf_board_callback = keras.callbacks.TensorBoard(log_dir='tf_dir', update_freq=10)

from kashgari.tasks.classification import CNNLSTMModel, CNNModel

# Checkpoint the best weights (by validation accuracy) to
# model_dir/CNNModel_bert.h5.
save = ModelCheckpoint(
    os.path.join('model_dir', 'CNNModel_bert.h5'),
    monitor='val_acc',
    verbose=1,
    save_best_only=True,
    mode='auto'
)
# Stop once validation accuracy has not improved for 8 consecutive epochs.
early_stopping = EarlyStopping(
    monitor='val_acc',
    min_delta=0,
    patience=8,
    verbose=1,
    mode='auto'
)
# NOTE(review): `embed` and the train/valid/test features and labels are
# defined elsewhere in this module.
model = CNNModel(embed)

# ------------ train and evaluate model ------------
model.fit(
    train_features, train_labels,
    valid_features, valid_labels,
    epochs=60,
    batch_size=256,
    callbacks=[tf_board_callback, save, early_stopping]
)
model.evaluate(test_features, test_labels)