Example #1
import json
import numpy as np
import codecs
from bert4keras.backend import keras, set_gelu
from bert4keras.tokenizer import SpTokenizer
from bert4keras.bert import build_bert_model
from bert4keras.optimizers import Adam, extend_with_piecewise_linear_lr
from bert4keras.snippets import sequence_padding, get_all_attributes

locals().update(get_all_attributes(keras.layers))  # expose all keras.layers classes (Dense, Lambda, ...) as local names
set_gelu('tanh')

maxlen = 256
config_path = 'models/albert_base/albert_config.json'
checkpoint_path = 'models/albert_base/variables/variables'
spm_path = 'models/albert_base/assets/30k-clean.model'


def load_data(filename):
    D = []
    with codecs.open(filename, encoding='utf-8') as f:
        for l in f:
            text, label = l.strip().split('\t')
            D.append((text, int(label)))
    return D


train_data = load_data('datasets/IMDB_trainshuffle.data')
valid_data = load_data('datasets/IMDB_valshuffle.data')
test_data = load_data('datasets/IMDB_testshuffle.data')
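
Example #1 stops after loading the IMDB splits. Under the old bert4keras 0.x API imported above, the next step would usually build the SentencePiece tokenizer and the ALBERT encoder from the paths defined earlier; the following continuation is only a sketch and not part of the original snippet (the classification head and the 'albert' model flag are assumptions):

# Hypothetical continuation: tokenizer + ALBERT encoder + classification head.
tokenizer = SpTokenizer(spm_path)  # SentencePiece tokenizer for ALBERT

albert = build_bert_model(
    config_path=config_path,
    checkpoint_path=checkpoint_path,
    model='albert',              # load ALBERT-style weights
    return_keras_model=False,
)

output = Lambda(lambda x: x[:, 0])(albert.model.output)  # take the [CLS] vector
output = Dense(2, activation='softmax')(output)          # binary sentiment head
model = keras.models.Model(albert.model.input, output)
model.summary()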
Example #2
# !/usr/bin/python
# -*- coding: utf-8 -*-
# @time    : 2020/4/29 23:43
# @author  : Mo
# @function: text-classification(tc)

from macadam.tc.t00_predict import ModelPredict
from macadam.tc.t00_trainer import trainer
from macadam.tc.t00_map import graph_map
from bert4keras.backend import set_gelu
set_gelu("tanh")  # "erf" or "tanh"
Example #3
# val_acc: 0.887071, test_acc: 0.870320

import json
import numpy as np
from random import choice
import re, os, codecs
from bert4keras.backend import set_gelu, K
from bert4keras.utils import Tokenizer, load_vocab
from bert4keras.bert import build_bert_model
from bert4keras.train import PiecewiseLinearLearningRate
from keras.layers import *
from keras.models import Model
from keras.optimizers import Adam
from keras.callbacks import Callback

set_gelu('tanh')  # switch the GELU variant


maxlen = 128
config_path = '/root/kg/bert/chinese_wwm_L-12_H-768_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_wwm_L-12_H-768_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_wwm_L-12_H-768_A-12/vocab.txt'


def load_data(filename):
    D = []
    with codecs.open(filename, encoding='utf-8') as f:
        for l in f:
            text1, text2, label = l.strip().split('\t')
            D.append((text1, text2, int(label)))
    return D
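
Example #3 ends after defining load_data for tab-separated sentence pairs. With the old bert4keras.utils API imported above, the vocabulary and tokenizer would typically be set up as below; this is a sketch, not part of the original snippet:

# Hypothetical continuation: build the WordPiece tokenizer from vocab.txt.
token_dict = load_vocab(dict_path)  # token -> id mapping
tokenizer = Tokenizer(token_dict)   # old-style bert4keras tokenizer

# Encoding a sentence pair returns token ids and segment ids, e.g.:
# token_ids, segment_ids = tokenizer.encode(first=text1, second=text2)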
Example #4
def train(args):
    if "bert" in args.model_type:
        set_gelu("tanh")  # 切换gelu版本

        # Step1: Load Data
        data_generator = None
        if "siamese" in args.model_type:
            data_generator = SiameseDataGenerator
        elif "albert" in args.model_type:
            data_generator = BertDataGenerator

        train_ds = data_generator(data_path=args.train_data_path,
                                  batch_size=args.batch_size,
                                  dict_path=args.bert_dict_path,
                                  maxlen=args.query_len)
        dev_ds = data_generator(data_path=args.dev_data_path,
                                batch_size=args.batch_size,
                                maxlen=args.query_len,
                                dict_path=args.bert_dict_path)
        test_ds = data_generator(data_path=args.test_data_path,
                                 batch_size=args.batch_size,
                                 maxlen=args.query_len,
                                 dict_path=args.bert_dict_path)

        # Step2: Load Model
        model = None
        if "siamese" in args.model_type:
            model = SiameseBertModel(config_path=args.bert_config_path,
                                     checkpoint_path=args.bert_checkpoint_path,
                                     dense_units=args.dense_units)
        elif "albert" in args.model_type:
            model = BertModel(config_path=args.bert_config_path,
                              checkpoint_path=args.bert_checkpoint_path)

        model_name = model.__class__.__name__
        model = model.get_model()

        from bert4keras.optimizers import Adam
        model.compile(
            loss='sparse_categorical_crossentropy',
            optimizer=Adam(2e-5),  # use a sufficiently small learning rate
            # optimizer=PiecewiseLinearLearningRate(Adam(5e-5), {10000: 1, 30000: 0.1}),
            metrics=['accuracy'],
        )

        evaluator = Evaluator(dev_ds=dev_ds,
                              model_name=model_name,
                              is_bert_model=True,
                              test_ds=test_ds)
        logger.info("***** Running training *****")
        logger.info("  Model Class Name = %s", model_name)
        logger.info("  Num Epochs = %d", args.epoch)
        model.fit_generator(train_ds.forfit(),
                            steps_per_epoch=len(train_ds),
                            epochs=args.epoch,
                            callbacks=[evaluator],
                            verbose=2)

        model.load_weights('./checkpoints/best_{}.weight'.format(model_name))
        logger.info("***** Test Reslt *****")
        logger.info("  Model = %s", model_name)
        logger.info("  Batch Size = %d", args.batch_size)
        logger.info("  Final Test Acc:%05f",
                    cal_acc(data=test_ds, model=model, is_bert_model=True))

    elif "NN" in args.model_type:
        # Step 1: Load Data
        train_data = pd.read_csv(args.train_data_path)
        dev_data = pd.read_csv(args.dev_data_path)
        test_data = pd.read_csv(args.test_data_path)

        category_count = len(train_data["category"].value_counts())
        category_encoder = category_OneHotEncoder(data_df=train_data)

        loader = LoadData(w2v_path=args.w2v_path, query_len=args.query_len)
        word2idx = loader.word2idx
        emd_matrix = loader.emb_matrix
        """
        注意:
        shuffle的顺序很重要:一般建议是先执行shuffle方法,接着采用batch方法。
        这样是为了保证在整体数据打乱之后再取出batch_size大小的数据。
        如果先采取batch方法再采用shuffle方法,那么此时就只是对batch进行shuffle,
        而batch里面的数据顺序依旧是有序的,那么随机程度会减弱。
        """
        train_ds = loader.dataset(encoder=category_encoder, data_df=train_data)
        train_ds = train_ds.shuffle(buffer_size=len(train_data)).batch(
            batch_size=args.batch_size).repeat()

        dev_ds = loader.dataset(encoder=category_encoder, data_df=dev_data)
        dev_ds = dev_ds.batch(batch_size=args.batch_size)
        test_ds = loader.dataset(encoder=category_encoder, data_df=test_data)
        test_ds = test_ds.batch(batch_size=args.batch_size)

        # Step2: Load Model
        model = None
        if "siamese_CNN" in args.model_type:
            model = SiameseCnnModel(emb_matrix=emd_matrix,
                                    word2idx=word2idx,
                                    filters_nums=args.filters_nums,
                                    kernel_sizes=args.kernel_sizes,
                                    dense_units=args.dense_units,
                                    label_count=args.label_count,
                                    category_count=category_count,
                                    query_len=args.query_len,
                                    shared=args.feature_shared,
                                    add_feature=args.add_features)
        elif "siamese_RNN" in args.model_type:
            model = SiameseRnnModel(emb_matrix=emd_matrix,
                                    word2idx=word2idx,
                                    hidden_units=args.hidden_units,
                                    dense_units=args.dense_units,
                                    label_count=args.label_count,
                                    category_count=category_count,
                                    query_len=args.query_len,
                                    mask_zero=args.mask_zero,
                                    bidirection=args.bi_direction,
                                    shared=args.feature_shared,
                                    add_feature=args.add_features)
        model_name = model.__class__.__name__
        model = model.get_model()

        logger.info("***** Running training *****")
        logger.info("  Model Class Name = %s", model_name)
        logger.info("  Num examples = %d", len(train_data))
        logger.info("  Num Epochs = %d", args.epoch)

        model.compile(optimizer='adam',
                      loss="binary_crossentropy",
                      metrics=["acc"])
        early_stopping = EarlyStopping(monitor="val_acc",
                                       patience=3,
                                       mode="max")
        evaluator = Evaluator(dev_ds=dev_ds,
                              model_name=model_name,
                              is_bert_model=False,
                              dev_label=dev_data['label'])

        # Step3: Train Model
        history = model.fit(train_ds,
                            callbacks=[early_stopping, evaluator],
                            epochs=args.epoch,
                            steps_per_epoch=len(train_data) // args.batch_size,
                            validation_data=dev_ds,
                            validation_steps=len(dev_data) // args.batch_size)

        # Step4 : Save model and trainLogs
        logger.info("***** Training Logs *****")

        for epoch in history.epoch:
            logger.info("Epoch %d", epoch)
            logger.info("train_loss:%f train_acc:%f val_loss:%f val_acc:%f",
                        history.history.get("loss")[epoch],
                        history.history.get("acc")[epoch],
                        history.history.get("val_loss")[epoch],
                        history.history.get("val_acc")[epoch])
        #
        # time_stamp = datetime.datetime.now().strftime('%m-%d_%H-%M-%S')
        # path = './checkpoints/{}_{}.h5'.format(model_name, time_stamp)
        # model.save(path)

        model = load_model('./checkpoints/best_{}.h5'.format(model_name))
        y_pred = model.predict(test_ds)
        y_true = test_data["label"].values.reshape((-1, 1))

        y_pred[y_pred >= 0.5] = 1
        y_pred[y_pred < 0.5] = 0

        acc = accuracy_score(y_true, y_pred)
        precision = precision_score(y_true, y_pred)
        recall = recall_score(y_true, y_pred)
        f1 = f1_score(y_true, y_pred)

        logger.info("***** Pramaters *****")
        logger.info("  ModelName = %s", args.model_type)
        logger.info("  Add Features = %s", args.add_features)
        logger.info("  Embedding dims = %d", len(emd_matrix[0]))
        logger.info("  BatchSize = %d", args.batch_size)

        if "CNN" in args.model_type:
            logger.info("  kernel_sizes = %s", args.kernel_sizes)
            logger.info("  filters_nums = %s", args.filters_nums)
        elif "RNN" in args.model_type:
            logger.info("  hidden_units = %s", args.hidden_units)
            logger.info("  bi_direction = %s", args.bi_direction)

        logger.info("  dense_units = %s", args.dense_units)
        logger.info("  feature_shared = %s", args.feature_shared)
        logger.info("***** Testing Results *****")
        logger.info("  Acc = %f", acc)
        logger.info("  Precision = %f", precision)
        logger.info("  Recall = %f", recall)
        logger.info("  F1-score = %f", f1)
Example #5
# -*- coding: utf-8 -*-

from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from bert4keras.backend import set_gelu
from bert4keras.bert import build_bert_model
from keras.layers import *
from keras.models import Model
from keras_contrib.layers import CRF
set_gelu("tanh")


class NER_Model(object):
    def __init__(self, model_configs):
        self.bert_config = model_configs.get("bert_config")
        self.bert_checkpoint = model_configs.get("bert_checkpoint")
        self.albert = model_configs.get("albert")
        self.model_type = model_configs.get("model_type")
        self.cell_type = model_configs.get("cell_type")
        self.rnn_units = model_configs.get("rnn_units")
        self.rnn_layers = model_configs.get("rnn_layers")
        self.cnn_filters = model_configs.get("cnn_filters")
        self.cnn_kernel_size = model_configs.get("cnn_kernel_size")
        self.cnn_blocks = model_configs.get("cnn_blocks")
        self.crf_only = model_configs.get("crf_only")
        self.dropout_rate = model_configs.get("dropout_rate")
        self.max_len = model_configs.get("max_len")
        self.numb_tags = model_configs.get("numb_tags")
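
Example #5 is cut off after __init__, so no graph-building code is shown. A plausible builder, sketched here as a standalone function under the assumption that the configured encoder feeds an optional BiLSTM and the keras_contrib CRF layer, might look roughly like this (none of it is in the original snippet):

def build_ner_model(cfg):
    # Hypothetical sketch: BERT/ALBERT encoder -> optional BiLSTM -> CRF tagger.
    encoder = build_bert_model(
        config_path=cfg.bert_config,
        checkpoint_path=cfg.bert_checkpoint,
        model='albert' if cfg.albert else 'bert',
    )
    x = encoder.output
    if not cfg.crf_only:
        x = Bidirectional(LSTM(cfg.rnn_units, return_sequences=True))(x)
        x = Dropout(cfg.dropout_rate)(x)
    crf = CRF(cfg.numb_tags, sparse_target=True)  # sparse integer tag labels
    output = crf(x)
    model = Model(encoder.input, output)
    model.compile(optimizer='adam', loss=crf.loss_function, metrics=[crf.accuracy])
    return model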
Example #6
#! -*- coding:utf-8 -*-
# Evaluation script
# Dataset: IFLYTEK long-text classification (https://github.com/CLUEbenchmark/CLUE)

import json
from io import open
import numpy as np
from bert4keras.backend import keras, set_gelu
from bert4keras.tokenizer import Tokenizer
from bert4keras.bert import build_bert_model
from bert4keras.optimizers import Adam
from bert4keras.snippets import sequence_padding, DataGenerator
from keras.layers import *

set_gelu('tanh')  # switch to the tanh GELU variant


num_classes = 119
maxlen = 128
batch_size = 32

# RoBERTa small
config_path = '/root/kg/bert/chinese_roberta_L-6_H-384_A-12/bert_config.json'
checkpoint_path = '/root/kg/bert/chinese_roberta_L-6_H-384_A-12/bert_model.ckpt'
dict_path = '/root/kg/bert/chinese_roberta_L-6_H-384_A-12/vocab.txt'
model_type = 'bert'

"""
# albert small
config_path = '/root/kg/bert/albert_small_zh_google/albert_config.json'
checkpoint_path = '/root/kg/bert/albert_small_zh_google/albert_model.ckpt'
Example #7
console.setFormatter(formatter)
logging.getLogger('').addHandler(console)
#################################################################################################

# command parameters
parser = argparse.ArgumentParser(description='classifier albert')
parser.add_argument('--model', type=str, default='large',  help='pre-trained model: large or xxlarge')
parser.add_argument('--do_train', type=int, default=0,  help='do train')
parser.add_argument('--do_predict', type=int, default=0, help='do predict')
parser.add_argument('--bert_path', type=str, default='../ALbert/albert_xxlarge/', help='bert_path')
parser.add_argument('--file_pre', type=str, default='a', help='data file name')
parser.add_argument('--maxlen', type=int, default=128, help='maxlen')
parser.add_argument('--batch_size', type=int, default=32, help='batch_size')
args = parser.parse_args()

set_gelu('tanh')  # use the tanh GELU variant

do_train = args.do_train
do_predict = args.do_predict
logging.info('do_train:%d, do_predict:%d' % (do_train, do_predict))

file_pre = args.file_pre
# the number of class
dic_nums = {'a':2, 'b':2, 'c':3}
num_classes = dic_nums[file_pre]
maxlen = args.maxlen
batch_size = args.batch_size
logging.info('Running Parm: File: Training_%s, num_classes:%d, maxlen: %d, batch_size: %d' % 
            (file_pre, num_classes, maxlen, batch_size))

# pre-train model
Example #8
import numpy as np
import pandas as pd
import tensorflow as tf
from bert4keras.backend import keras, set_gelu, K
from bert4keras.tokenizers import Tokenizer
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam
from bert4keras.snippets import sequence_padding, DataGenerator
from bert4keras.snippets import open
from keras.layers import Dropout, Dense

tf.config.list_physical_devices('GPU')

set_gelu('tanh')  # switch the GELU activation variant
maxlen = 128
batch_size = 32
config_path = './chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = './chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = './chinese_L-12_H-768_A-12/vocab.txt'
print(batch_size)

# Load the datasets
train_df = pd.read_csv('./data/train.csv')
valid_df = pd.read_csv('./data/dev.csv')
test_df = pd.read_csv('./data/test.csv')

train_df.dropna(axis=0, inplace=True)

train_data = train_df[['query1', 'query2', 'label']].values
valid_data = valid_df[['query1', 'query2', 'label']].values
test_data = test_df[['query1', 'query2', 'label']].values
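
Example #8 ends once the CSVs are turned into (query1, query2, label) arrays. With this newer bert4keras API, the usual next step is a DataGenerator subclass that encodes the sentence pairs; the following continuation is a sketch and not part of the original snippet (on older bert4keras versions the encode keyword is max_length instead of maxlen):

# Hypothetical continuation: tokenizer plus a generator for (query1, query2, label).
tokenizer = Tokenizer(dict_path, do_lower_case=True)

class data_generator(DataGenerator):
    """Batches sentence-pair triples into padded id arrays."""
    def __iter__(self, random=False):
        batch_token_ids, batch_segment_ids, batch_labels = [], [], []
        for is_end, (text1, text2, label) in self.sample(random):
            token_ids, segment_ids = tokenizer.encode(text1, text2, maxlen=maxlen)
            batch_token_ids.append(token_ids)
            batch_segment_ids.append(segment_ids)
            batch_labels.append([int(label)])
            if len(batch_token_ids) == self.batch_size or is_end:
                yield [sequence_padding(batch_token_ids),
                       sequence_padding(batch_segment_ids)], sequence_padding(batch_labels)
                batch_token_ids, batch_segment_ids, batch_labels = [], [], []

train_generator = data_generator(train_data, batch_size)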