def call(self, inputs):
    """Map the inputs to position embeddings via their computed position ids."""
    return K.gather(self.embeddings, self.compute_position_ids(inputs))
# Load the pickled training set, shuffle it, and split train/validation parts.
train_path=PATH+'generate/pkl/train_all.pkl'
train_file=pickle.load(open(train_path,'rb'))
# train_len=int(0.1*len(train_file))
# train_file=train_file[:train_len]
np.random.shuffle(train_file)
val_len=int(0.8*len(train_file))
# NOTE(review): evaluation data is the last 50000 shuffled samples, which
# overlaps the training slice whenever len(train_file) < 250000 — confirm.
valid_data=get_valid(train_file[-50000:])
# print(valid_data[:10])
train_generator = data_generator(train_file[:val_len], batch_size=batch_size)
valid_generator=data_generator(train_file[val_len:],batch_size=batch_size)
model,Seq_crf,Tag_crf = build_model(embeddings=200,vocab_size=vocab_size,rnn_units=300)
# CRF decoders for the sequence and tag heads, seeded with learned transitions.
Seq_ner = NamedEntityRecognizer(trans=K.eval(Seq_crf.trans))
Tag_ner = NamedEntityRecognizer(trans=K.eval(Tag_crf.trans))
evaluator = Evaluator(valid_data,model,Seq_ner,Tag_ner)
early_stopping = EarlyStopping(monitor='val_tag_crf_Sparse_accuracy', patience=10)  # early stopping to guard against overfitting
plateau = ReduceLROnPlateau(monitor='val_tag_crf_Sparse_accuracy', verbose=1, mode='max', factor=0.5, patience=3)  # halve the learning rate when the metric stops improving
# checkpoint = ModelCheckpoint('./model/best_0105.hdf5', monitor='val_tag_crf_Sparse_accuracy', verbose=2, save_best_only=True, mode='max',
#                              save_weights_only=True)  # keep only the best model
# NOTE(review): chunk is truncated here — the model.fit(...) call is incomplete.
model.fit(
    train_generator.forfit(),
    steps_per_epoch=len(train_generator),
    epochs=epochs,
    validation_data=valid_generator.forfit(),
# 加载预训练模型 bert = build_transformer_model( config_path=config_path, checkpoint_path=checkpoint_path, return_keras_model=False, ) # 预测subject output = Dense(units=2, activation='sigmoid', kernel_initializer=bert.initializer)(bert.model.output) subject_preds = Lambda(lambda x: x**2)(output) mask = bert.model.get_layer('Embedding-Token').output_mask mask = K.cast(mask, K.floatx()) # subject_loss = K.binary_crossentropy(subject_labels, subject_preds) # subject_loss = K.mean(subject_loss, 2) # subject_loss = K.sum(subject_loss * mask) / K.sum(mask) subject_model = Model(bert.model.inputs, subject_preds) subject_model.compile( # loss = subject_loss, loss="binary_crossentropy", optimizer=Adam(learning_rate), # metrics=['accuracy'] ) # subject_model.load_weights('best_model.weights')
def compute_loss(self, inputs, mask=None):
    """Masked binary cross-entropy over subject and object predictions."""
    subject_labels, object_labels = inputs[:2]
    subject_preds, object_preds, _ = inputs[2:]
    # Token padding mask comes from the 5th input; default to no masking.
    if mask[4] is None:
        pad_mask = 1.0
    else:
        pad_mask = K.cast(mask[4], K.floatx())
    # Subject part: mean over the (start, end) axis, then mask-averaged.
    s_loss = K.mean(K.binary_crossentropy(subject_labels, subject_preds), 2)
    s_loss = K.sum(s_loss * pad_mask) / K.sum(pad_mask)
    # Object part: mean over the last axis, summed over predicates,
    # then mask-averaged.
    o_loss = K.sum(
        K.mean(K.binary_crossentropy(object_labels, object_preds), 3), 2)
    o_loss = K.sum(o_loss * pad_mask) / K.sum(pad_mask)
    # Total loss
    return s_loss + o_loss
model = build_transformer_model(
    config_path,
    checkpoint_path,
    application='unilm',
    keep_tokens=keep_tokens,  # keep only the tokens in keep_tokens to shrink the vocabulary
)
model.summary()

# Cross-entropy loss, masking out predictions over the input segment.
y_true = model.input[0][:, 1:]  # target tokens
y_mask = model.input[1][:, 1:]
y_pred = model.output[:, :-1]  # predicted tokens, offset one step from the targets
cross_entropy = K.sparse_categorical_crossentropy(y_true, y_pred)
cross_entropy = K.sum(cross_entropy * y_mask) / K.sum(y_mask)
model.add_loss(cross_entropy)
model.compile(optimizer=Adam(1e-5))


class AutoTitle(AutoRegressiveDecoder):
    """seq2seq decoder."""
    @AutoRegressiveDecoder.set_rtype('probas')
    def predict(self, inputs, output_ids, step):
        # Append the tokens decoded so far (segment id 1) to the input.
        # NOTE(review): chunk is truncated here — no return statement visible.
        token_ids, segment_ids = inputs
        token_ids = np.concatenate([token_ids, output_ids], 1)
        segment_ids = np.concatenate(
            [segment_ids, np.ones_like(output_ids)], 1)
def new_update(x, new_x):
    # Gate the update on `cond` (captured from the enclosing scope): when
    # cond is false, keep the old value x; then delegate to the original
    # (wrapped) update function.
    new_x = K.switch(cond, new_x, x)
    return old_update(x, new_x)
def reset_old_weights(self):
    """Restore the model variables to the stored old weights."""
    weight_pairs = zip(self.model_weights, self.old_weights)
    K.batch_set_value(weight_pairs)
def dense_loss(self, y_true, y_pred):
    """Loss for one-hot targets: collapse to sparse labels and delegate."""
    sparse_labels = K.argmax(y_true, 2)
    return self.sparse_loss(sparse_labels, y_pred)
def basic_accuracy(self, y_true, y_pred, go_backwards=False): """训练过程中显示逐帧准确率的函数,排除了mask的影响 此处y_true需要是整数形式(非one hot) """ # 导出mask并转换数据类型 mask = K.all(K.greater(y_pred, -1e6), axis=2) mask = K.cast(mask, K.floatx()) # y_true需要重新明确一下shape和dtype y_true = K.reshape(y_true, K.shape(y_pred)[:-1]) y_true = K.cast(y_true, 'int32') # 反转相关 if self.hidden_dim is None: if go_backwards: # 是否反转序列 y_true, y_pred = self.reverse_sequence([y_true, y_pred], mask) trans = K.transpose(self.trans) else: trans = self.trans histoty = K.gather(trans, y_true) else: if go_backwards: # 是否反转序列 y_true, y_pred = self.reverse_sequence([y_true, y_pred], mask) r_trans, l_trans = self.l_trans, self.r_trans else: l_trans, r_trans = self.l_trans, self.r_trans histoty = K.gather(l_trans, y_true) histoty = tf.einsum('bnd,kd->bnk', histoty, r_trans) # 计算逐标签accuracy histoty = K.concatenate([y_pred[:, :1], histoty[:, :-1]], 1) y_pred = (y_pred + histoty) / 2 y_pred = K.cast(K.argmax(y_pred, 2), 'int32') isequal = K.cast(K.equal(y_true, y_pred), K.floatx()) return K.sum(isequal * mask) / K.sum(mask)
def reverse_sequence(self, inputs, mask=None):
    """Reverse each tensor along the time axis.

    With a mask, only the valid (unpadded) prefix of each sequence is
    reversed; without one, the whole axis is flipped.
    """
    if mask is not None:
        lengths = K.cast(K.sum(mask, 1), 'int32')
        return [
            tf.reverse_sequence(t, lengths, seq_axis=1) for t in inputs
        ]
    return [t[:, ::-1] for t in inputs]
def basic_loss(self, y_true, y_pred, go_backwards=False): """y_true需要是整数形式(非one hot) """ # 导出mask并转换数据类型 mask = K.all(K.greater(y_pred, -1e6), axis=2) mask = K.cast(mask, K.floatx()) # y_true需要重新明确一下shape和dtype y_true = K.reshape(y_true, K.shape(y_pred)[:-1]) y_true = K.cast(y_true, 'int32') # 反转相关 if self.hidden_dim is None: if go_backwards: # 是否反转序列 y_true, y_pred = self.reverse_sequence([y_true, y_pred], mask) trans = K.transpose(self.trans) else: trans = self.trans histoty = K.gather(trans, y_true) else: if go_backwards: # 是否反转序列 y_true, y_pred = self.reverse_sequence([y_true, y_pred], mask) r_trans, l_trans = self.l_trans, self.r_trans else: l_trans, r_trans = self.l_trans, self.r_trans histoty = K.gather(l_trans, y_true) histoty = tf.einsum('bnd,kd->bnk', histoty, r_trans) # 计算loss histoty = K.concatenate([y_pred[:, :1], histoty[:, :-1]], 1) y_pred = (y_pred + histoty) / 2 loss = K.sparse_categorical_crossentropy(y_true, y_pred, from_logits=True) return K.sum(loss * mask) / K.sum(mask)
def call(self, inputs, mask=None):
    """Apply sequence masking along axis 1 (padding positions zeroed)."""
    if mask is None:
        return sequence_masking(inputs, None, 1, 1)
    float_mask = K.cast(mask, K.floatx())
    return sequence_masking(inputs, float_mask, 1, 1)
def sparse_accuracy(self, y_true, y_pred):
    """Per-frame accuracy during training, ignoring masked positions.

    Here y_true must be integer labels (not one hot).
    """
    # Positions where every logit is <= -1e6 are treated as padding.
    valid = K.cast(K.all(K.greater(y_pred, -1e6), axis=2), K.floatx())
    # Pin down y_true's shape and dtype.
    labels = K.cast(K.reshape(y_true, K.shape(y_pred)[:-1]), 'int32')
    # Greedy per-label argmax as a rough training-time metric.
    predictions = K.cast(K.argmax(y_pred, 2), 'int32')
    hits = K.cast(K.equal(labels, predictions), K.floatx())
    return K.sum(hits * valid) / K.sum(valid)
def compute_position_ids(self, inputs):
    """T5 relative-position bucketing (translated directly from the official T5 code)."""
    q, v = inputs
    # Pairwise position offsets between value and query positions.
    q_idxs = K.arange(0, K.shape(q)[1], dtype='int32')
    q_idxs = K.expand_dims(q_idxs, 1)
    v_idxs = K.arange(0, K.shape(v)[1], dtype='int32')
    v_idxs = K.expand_dims(v_idxs, 0)
    pos_ids = v_idxs - q_idxs
    # Post-processing: map offsets into a fixed number of buckets.
    num_buckets, max_distance = self.input_dim, self.max_distance
    ret = 0
    n = -pos_ids
    if self.bidirectional:
        # Half the buckets encode the sign of the offset.
        num_buckets //= 2
        ret += K.cast(K.less(n, 0), 'int32') * num_buckets
        n = K.abs(n)
    else:
        n = K.maximum(n, 0)
    # now n is in the range [0, inf)
    max_exact = num_buckets // 2
    # Small offsets get their own exact bucket; larger offsets are
    # assigned logarithmically-spaced buckets up to max_distance.
    is_small = K.less(n, max_exact)
    val_if_large = max_exact + K.cast(
        K.log(K.cast(n, K.floatx()) / max_exact) /
        np.log(max_distance / max_exact) * (num_buckets - max_exact),
        'int32',
    )
    val_if_large = K.minimum(val_if_large, num_buckets - 1)
    ret += K.switch(is_small, n, val_if_large)
    return ret
def _resource_apply_sparse(self, grad, var, indices):
    """Densify the sparse gradient and reuse the dense update path."""
    sparse_grad = tf.IndexedSlices(grad, indices, K.shape(var))
    dense_grad = tf.convert_to_tensor(sparse_grad)
    return self._resource_apply_dense(dense_grad, var)
def dense_accuracy(self, y_true, y_pred):
    """Accuracy for one-hot targets: reduce to sparse labels and delegate."""
    sparse_labels = K.argmax(y_true, 2)
    return self.sparse_accuracy(sparse_labels, y_pred)
def _decayed_lr(self, var_dtype):
    """Base decayed learning rate scaled by the piecewise-linear schedule."""
    base_lr = super(NewOptimizer, self)._decayed_lr(var_dtype)
    multiplier = piecewise_linear(self.iterations, self.lr_schedule)
    return base_lr * K.cast(multiplier, var_dtype)
# NOTE(review): chunk starts mid-function — the enclosing loop over per-token
# `label` predictions (and the function signature) is not visible here.
        if label > 0:
            if label % 2 == 1:
                # Odd label ids mark a B-tag: open a new entity.
                starting = True
                entities.append([[i], id2label[str((label - 1) // 2)]])
            elif starting:
                # Even label ids continue the currently open entity.
                entities[-1][0].append(i)
            else:
                starting = False
        else:
            starting = False
    # Map token index spans back to character spans of the original text.
    return [(text[mapping[w[0]][0]:mapping[w[-1]][-1] + 1], l)
            for w, l in entities]


NER = NamedEntityRecognizer(trans=K.eval(CRF.trans), starts=[0], ends=[0])


def evaluate(data):
    """Evaluation: micro F1, precision and recall over entity tuples."""
    X, Y, Z = 1e-10, 1e-10, 1e-10
    for d in tqdm(data):
        text = ''.join([i[0] for i in d])
        R = set(NER.recognize(text))
        T = set([tuple(i) for i in d if i[1] != 'O'])
        X += len(R & T)
        Y += len(R)
        Z += len(T)
    f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
    return f1, precision, recall
def new_update(x, new_x):
    # Lazy update: for parameters in `params` that qualify for lazy
    # optimization (e.g. embeddings), only rows with a nonzero gradient
    # are updated; all other rows keep their old values.
    if is_one_of(x, params) and self._do_lazy_optimization(x):
        g = self.grads[x]
        r = K.any(K.not_equal(g, 0.0), axis=-1, keepdims=True)
        new_x = x + (new_x - x) * K.cast(r, K.floatx())
    return old_update(x, new_x)
# Object head: per-token (start, end) sigmoids for every predicate.
output = Dense(units=len(predicate2id) * 2,
               activation='sigmoid',
               kernel_initializer=bert.initializer)(output)
# Raising to the 4th power sharpens the sigmoid outputs towards 0/1.
output = Lambda(lambda x: x**4)(output)
object_preds = Reshape((-1, len(predicate2id), 2))(output)

object_model = Model(bert.model.inputs + [subject_ids], object_preds)

# Training model: labels enter as extra inputs so the loss can be attached
# with add_loss below.
train_model = Model(
    bert.model.inputs + [subject_labels, subject_ids, object_labels],
    [subject_preds, object_preds])
train_model.summary()

# Padding mask from the token-embedding layer.
mask = bert.model.get_layer('Embedding-Token').output_mask
mask = K.cast(mask, K.floatx())

# Subject loss: mean over (start, end), then mask-averaged over tokens.
subject_loss = K.binary_crossentropy(subject_labels, subject_preds)
subject_loss = K.mean(subject_loss, 2)
subject_loss = K.sum(subject_loss * mask) / K.sum(mask)
# Object loss: mean over the last axis, summed over predicates, mask-averaged.
object_loss = K.binary_crossentropy(object_labels, object_preds)
object_loss = K.sum(K.mean(object_loss, 3), 2)
object_loss = K.sum(object_loss * mask) / K.sum(mask)

train_model.add_loss(subject_loss + object_loss)
optimizer = Adam(learning_rate)
train_model.compile(optimizer=optimizer)
from bert4keras.models import build_transformer_model
from bert4keras.optimizers import Adam
from bert4keras.snippets import sequence_padding, DataGenerator
# from bert4keras.snippets import open, groupby
from keras.layers import Input, Dense, Lambda, Reshape
from keras.models import Model
from tqdm import tqdm
import os
import tensorflow as tf

# GPU setup.
# NOTE(review): "0,1" selects GPUs 0 and 1, not GPUs 1 and 2 as the original
# comment claimed. Also `K` is used below but no `import ... as K` is visible
# in this chunk — confirm it is imported elsewhere in the file.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
config = tf.ConfigProto()  # TF1-style session configuration
config.gpu_options.per_process_gpu_memory_fraction = 0.98
session = tf.Session(config=config)
K.set_session(session)

# Basic hyper-parameters.
maxlen = 320
epochs = 20
batch_size = 16
learning_rate = 2e-5

# BERT configuration paths.
path = "../bert/"
# path = "data/"
config_path = path + 'chinese_L-12_H-768_A-12/bert_config.json'
checkpoint_path = path + 'chinese_L-12_H-768_A-12/bert_model.ckpt'
dict_path = path + 'chinese_L-12_H-768_A-12/vocab.txt'
def beta2(self):
    """AdaFactor beta2: the fixed value when configured, else 1 - t^(-0.8)."""
    if self._beta2 is not None:
        return self._beta2
    step = K.cast(self.iterations + 1, K.floatx())
    return 1.0 - K.pow(step, -0.8)
# NOTE(review): chunk starts mid-function — the evaluation loop that
# accumulates X, Y, Z is only partially visible here.
        Z += len(T)
    f1, precision, recall = 2 * X / (Y + Z), X / Y, X / Z
    return f1, precision, recall


# Build your own validation set
# dev_data = json.load(open("/home/wq/ner/valid_data.json",encoding="utf-8"))["valid_data"]
# print(dev_data[0])

if __name__ == '__main__':
    import sys
    arg = sys.argv
    if arg[1] == "train":
        normal_train = True
        cross_train = False
        model, CRF = build_model()
        # CRF decoder seeded with the learned transition matrix.
        NER = NamedEntityRecognizer(trans=K.eval(CRF.trans), starts=[0], ends=[0])
        train_data = load_data('/home/wq/ner/train/ner.train')
        valid_data = load_data('/home/wq/ner/train/ner.valid')
        dev_data = json.load(
            open("/home/wq/ner/valid_data.json",
                 encoding="utf-8"))["valid_data"]
        evaluator = Evaluator()
        train_generator = data_generator(train_data, batch_size)
        model.fit_generator(train_generator.forfit(),
                            steps_per_epoch=len(train_generator),
                            epochs=epochs,
                            callbacks=[evaluator])
def __init__(self, *args, **kwargs):
    """Initialize AdaFactorV1 and create its step-counter variable."""
    super(AdaFactorV1, self).__init__(*args, **kwargs)
    with K.name_scope(self.__class__.__name__):
        # int64 step counter consumed by the lr/beta schedules.
        self.iterations = K.variable(0, dtype='int64', name='iterations')
batch_token_ids, batch_segment_ids = [], []

model = build_bert_model(
    config_path,
    checkpoint_path,
    application='seq2seq',
    keep_tokens=keep_tokens,  # keep only the tokens in keep_tokens to shrink the vocabulary
)
model.summary()

# Cross-entropy loss, masking out predictions over the input segment.
y_in = model.input[0][:, 1:]  # target tokens
y_mask = model.input[1][:, 1:]
y = model.output[:, :-1]  # predicted tokens, offset one step from the targets
cross_entropy = K.sparse_categorical_crossentropy(y_in, y)
cross_entropy = K.sum(cross_entropy * y_mask) / K.sum(y_mask)
model.add_loss(cross_entropy)
model.compile(optimizer=Adam(1e-5))


class ReadingComprehension(AutoRegressiveDecoder):
    """Generate answers with beam-search decoding.

    `passages` is a list of passages; the best answer is decided
    automatically across all passages, returning an empty string when no
    answer exists. When mode is 'extractive', decoding is extractive:
    the answer must be a span of one of the passages.
    """
    def __init__(self, start_id, end_id, maxlen, mode='extractive'):
        super(ReadingComprehension, self).__init__(start_id, end_id, maxlen)
        self.mode = mode
def get_updates(self, loss, params):
    """Build the AdaFactor update ops (TF1 / Keras-optimizer code path)."""
    grads = self.get_gradients(loss, params)
    self.updates = [K.update_add(self.iterations, 1)]
    self.weights = [self.iterations]
    lr = self.learning_rate
    for i, (p, g) in enumerate(zip(params, grads)):
        g2 = K.square(g) + self.epsilon1
        shape, d_type = K.int_shape(p), K.dtype(p)
        factored_shape = self.factored_shape(shape)
        if factored_shape is None:
            # Slot variable for the full second moment.
            v = K.zeros(shape, dtype=d_type, name='v_' + str(i))
            self.weights.append(v)
            # Second-moment update.
            v_t = self.beta2 * v + (1.0 - self.beta2) * g2
            self.updates.append(K.update(v, v_t))
        else:
            # Factored slots: one row vector and one column vector.
            shape1, axis1, shape2, axis2 = factored_shape
            vr = K.zeros(shape1, dtype=d_type, name='vr_' + str(i))
            vc = K.zeros(shape2, dtype=d_type, name='vc_' + str(i))
            self.weights.extend([vr, vc])
            # Factored second-moment updates.
            vr_t = self.beta2 * vr + K.mean(g2, axis=axis1, keepdims=True)
            vc_t = self.beta2 * vc + K.mean(g2, axis=axis2, keepdims=True)
            self.updates.extend([K.update(vr, vr_t), K.update(vc, vc_t)])
            # Reassemble the full second-moment matrix.
            v_t = vr_t * vc_t / K.mean(vr_t, axis=axis2, keepdims=True)
        # Raw update direction.
        u = g / K.sqrt(v_t)
        # Update clipping.
        if self.clipping_threshold is not None:
            # NOTE(review): this is mean(sum(u^2)) with no sqrt — the
            # AdaFactor paper clips by RMS(u) = sqrt(mean(u^2)); confirm
            # this deviation is intentional.
            u_rms = K.mean(K.sum(K.square(u)))
            d = self.clipping_threshold
            u = u / K.maximum(1.0, u_rms / d)
        # First-moment (momentum) smoothing.
        if self.beta1 > 0.0:
            m = K.zeros(shape, dtype=d_type, name='m_' + str(i))
            self.weights.append(m)
            m_t = self.beta1 * m + (1.0 - self.beta1) * u
            self.updates.append(K.update(m, m_t))
            u = m_t
        # Scale the update by the parameter scale.
        if self.multiply_by_parameter_scale:
            # NOTE(review): mean(sum(p^2)) also lacks a sqrt vs the
            # paper's RMS(p); confirm.
            u = u * K.maximum(K.mean(K.sum(K.square(p))), self.epsilon2)
        # Apply the parameter update.
        self.updates.append(K.update(p, p - lr * u))
    return self.updates
# NOTE(review): chunk starts mid-function — the signature and docstring
# opening of this sparse cross-entropy helper are not visible here.
    """
    y_true = K.reshape(y_true, K.shape(y_pred)[:-1])
    y_true = K.cast(y_true, 'int32')
    y_true = K.one_hot(y_true, K.shape(y_pred)[-1])
    return K.categorical_crossentropy(y_true, y_pred)


o_in = Input(shape=(None, ))
train_model = Model(model.inputs + [o_in], model.outputs + [o_in])

# Cross-entropy loss, masking out predictions over the input segment.
y_true = train_model.input[2][:, 1:]  # target tokens
y_mask = train_model.input[1][:, 1:]
y_pred = train_model.output[0][:, :-1]  # predicted tokens, offset one step from the targets
cross_entropy = sparse_categorical_crossentropy(y_true, y_pred)
cross_entropy = K.sum(cross_entropy * y_mask) / K.sum(y_mask)

# Gradient penalty on the token-embedding matrix (weight 0.5).
embeddings = search_layer(train_model.output[0], 'Embedding-Token').embeddings
gp = K.sum(K.gradients(cross_entropy, [embeddings])[0].values**2)
train_model.add_loss(cross_entropy + 0.5 * gp)
train_model.compile(optimizer=Adam(1e-5))
# train_model.add_loss(cross_entropy)
# train_model.compile(optimizer=Adam(1e-5))


# NOTE(review): chunk is truncated here — the decorated predict method body
# is not visible.
class AutoTitle(AutoRegressiveDecoder):
    """seq2seq decoder."""
    @AutoRegressiveDecoder.set_rtype('probas')
def _resource_apply(self, grad, var, indices=None):
    """Shared AdaFactor update for dense (and densified sparse) gradients (TF2 path)."""
    lr = self.learning_rate
    g2 = K.square(grad) + self.epsilon1
    shape = K.int_shape(var)
    factored_shape = self.factored_shape(shape)
    if factored_shape is None:
        v = self.get_slot(var, 'v')
        # Second-moment update.
        v_t = self.beta2 * v + (1.0 - self.beta2) * g2
        v_t = K.update(v, v_t)
    else:
        # Factored slots: one row vector and one column vector.
        shape1, axis1, shape2, axis2 = factored_shape
        vr = self.get_slot(var, 'vr')
        vc = self.get_slot(var, 'vc')
        # Factored second-moment updates.
        vr_t = self.beta2 * vr + K.mean(g2, axis=axis1, keepdims=True)
        vc_t = self.beta2 * vc + K.mean(g2, axis=axis2, keepdims=True)
        vr_t, vc_t = K.update(vr, vr_t), K.update(vc, vc_t)
        # Reassemble the full second-moment matrix.
        v_t = vr_t * vc_t / K.mean(vr_t, axis=axis2, keepdims=True)
    # Raw update direction.
    u = grad / K.sqrt(v_t)
    # Update clipping.
    if self.clipping_threshold is not None:
        # NOTE(review): mean(sum(u^2)) has no sqrt — the AdaFactor paper
        # clips by RMS(u) = sqrt(mean(u^2)); confirm this is intentional.
        u_rms = K.mean(K.sum(K.square(u)))
        d = self.clipping_threshold
        u = u / K.maximum(1.0, u_rms / d)
    # First-moment (momentum) smoothing.
    if self.beta1 > 0.0:
        m = self.get_slot(var, 'm')
        m_t = self.beta1 * m + (1.0 - self.beta1) * u
        u = K.update(m, m_t)
    # Scale the update by the parameter scale.
    if self.multiply_by_parameter_scale:
        # NOTE(review): same missing-sqrt concern as above; confirm.
        u = u * K.maximum(K.mean(K.sum(K.square(var))), self.epsilon2)
    # Apply the parameter update.
    return K.update(var, var - lr * u)
# 加载预训练模型 bert = build_bert_model( max_seq_len=MAX_SEQ_LEN, config_path=config_path, checkpoint_path=None, with_pool=True, return_keras_model=False, ) output = Dropout(rate=0.1)(bert.model.output) output = Dense(units=1, kernel_initializer=bert.initializer)(output) model = Model(bert.model.input, output) output = Lambda(lambda x: x[:, 0], name='Squeeze')(output) toutput = Lambda(lambda x: K.reshape(x, [-1, NUM_TRAIN_CANDS]), name='Reshape')(output) tprobs = Softmax(name='Softmax')(toutput) train_model = Model(bert.model.input, tprobs) poutput = Lambda(lambda x: K.reshape(x, [-1, NUM_CANDS]), name='Reshape')(output) pprobs = Softmax(name='Softmax')(poutput) predict_model = Model(bert.model.input, pprobs) valid_data = {} test_data = {} entities2name = {} def _json_object_hook(d):
def call(self, inputs, mask=None, a_mask=None, p_bias=None):
    """Multi-head attention.

    q_mask: mask for the query sequence; mainly zeroes the padding part
        of the output.
    v_mask: mask for the value sequence; mainly keeps attention from
        reading padding positions.
    a_mask: mask applied to the attention matrix; different attention
        masks implement different applications.
    p_bias: positional bias inside attention; used to select the kind of
        relative position encoding.
    """
    q, k, v = inputs[:3]
    q_mask, v_mask, n = None, None, 3
    if mask is not None:
        if mask[0] is not None:
            q_mask = K.cast(mask[0], K.floatx())
        if mask[2] is not None:
            v_mask = K.cast(mask[2], K.floatx())
    # When an attention mask is requested, it is passed as the next input.
    if a_mask:
        a_mask = inputs[n]
        n += 1
    # Linear projections.
    qw = self.q_dense(q)
    kw = self.k_dense(k)
    vw = self.v_dense(v)
    # Reshape to (batch, seq_len, heads, size_per_head).
    qw = K.reshape(qw, (-1, K.shape(q)[1], self.heads, self.key_size))
    kw = K.reshape(kw, (-1, K.shape(k)[1], self.heads, self.key_size))
    vw = K.reshape(vw, (-1, K.shape(v)[1], self.heads, self.head_size))
    # Attention scores.
    a = tf.einsum('bjhd,bkhd->bhjk', qw, kw)
    # Positional-encoding handling.
    if p_bias == 'typical_relative':
        pos_embeddings = inputs[n]
        a = a + tf.einsum('bjhd,jkd->bhjk', qw, pos_embeddings)
    elif p_bias == 't5_relative':
        pos_embeddings = K.permute_dimensions(inputs[n], (2, 0, 1))
        a = a + K.expand_dims(pos_embeddings, 0)
    # Attention (continued): scale, mask, softmax.
    if self.scaled_dot_product:
        a = a / self.key_size**0.5
    a = sequence_masking(a, v_mask, 1, -1)
    if a_mask is not None:
        a = a - (1 - a_mask) * 1e12
    a = K.softmax(a)
    # Weighted sum of values to form the output.
    o = tf.einsum('bhjk,bkhd->bjhd', a, vw)
    if p_bias == 'typical_relative':
        o = o + tf.einsum('bhjk,jkd->bjhd', a, pos_embeddings)
    o = K.reshape(o, (-1, K.shape(o)[1], self.out_dim))
    o = self.o_dense(o)
    # Zero out padded query positions and return.
    o = sequence_masking(o, q_mask, 0)
    return o