def test_bert_freeze(self):
    model_dir = tempfile.TemporaryDirectory().name
    os.makedirs(model_dir)
    save_path = MiniBertFactory.create_mini_bert_weights(model_dir)
    tokenizer = bert.bert_tokenization.FullTokenizer(vocab_file=os.path.join(model_dir, "vocab.txt"),
                                                     do_lower_case=True)

    # prepare input
    max_seq_len = 24
    input_str_batch = ["hello, bert!", "how are you doing!"]
    input_ids, token_type_ids = self.prepare_input_batch(input_str_batch, tokenizer, max_seq_len)

    bert_ckpt_file = os.path.join(model_dir, "bert_model.ckpt")

    bert_params = bert.params_from_pretrained_ckpt(model_dir)
    bert_params.adapter_size = 4
    l_bert = bert.BertModelLayer.from_params(bert_params)

    model = keras.models.Sequential([
        l_bert,
    ])
    model.build(input_shape=(None, max_seq_len))
    model.summary()
    l_bert.apply_adapter_freeze()
    model.summary()
    bert.load_stock_weights(l_bert, bert_ckpt_file)
    # l_bert.embeddings_layer.trainable = False
    model.summary()

    orig_weight_values = []
    for weight in l_bert.weights:
        orig_weight_values.append(weight.numpy())

    model.compile(optimizer=keras.optimizers.Adam(),
                  loss=keras.losses.mean_squared_error,
                  run_eagerly=True)

    trainable_count = len(l_bert.trainable_weights)

    orig_pred = model.predict(input_ids)
    model.fit(x=input_ids, y=np.zeros_like(orig_pred),
              batch_size=2,
              epochs=4)

    trained_count = 0
    for ndx, weight in enumerate(l_bert.weights):
        weight_equal = np.array_equal(weight.numpy(), orig_weight_values[ndx])
        print("{}: {}".format(weight_equal, weight.name))
        if not weight_equal:
            trained_count += 1

    print(" trained weights:", trained_count)
    print("trainable weights:", trainable_count)
    self.assertEqual(trained_count, trainable_count)

    model.summary()
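# The test above calls a `prepare_input_batch` helper that is not shown in this
# snippet. A minimal sketch of what such a helper could look like (an assumption,
# not the original): tokenize, wrap with [CLS]/[SEP], convert to ids, zero-pad.
def prepare_input_batch(self, input_str_batch, tokenizer, max_seq_len):
    input_ids_batch = []
    token_type_ids_batch = []
    for input_str in input_str_batch:
        tokens = ["[CLS]"] + tokenizer.tokenize(input_str) + ["[SEP]"]
        ids = tokenizer.convert_tokens_to_ids(tokens)
        ids = ids + [0] * (max_seq_len - len(ids))      # pad to max_seq_len
        input_ids_batch.append(ids)
        token_type_ids_batch.append([0] * max_seq_len)  # single segment: all zeros
    return (np.array(input_ids_batch, dtype=np.int32),
            np.array(token_type_ids_batch, dtype=np.int32))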
def test_load_pretrained(self):
    print("Eager Execution:", tf.executing_eagerly())

    bert_params = bert.loader.params_from_pretrained_ckpt(self.bert_ckpt_dir)
    bert_params.adapter_size = 32
    l_bert = bert.BertModelLayer.from_params(bert_params, name="bert")

    model = keras.models.Sequential([
        keras.layers.InputLayer(input_shape=(128,)),
        l_bert,
        keras.layers.Lambda(lambda x: x[:, 0, :]),
        keras.layers.Dense(2)
    ])

    # we need to freeze before build/compile - otherwise keras counts the params twice
    if bert_params.adapter_size is not None:
        freeze_bert_layers(l_bert)

    model.build(input_shape=(None, 128))
    model.compile(
        optimizer=keras.optimizers.Adam(),
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
        metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")])

    bert.load_stock_weights(l_bert, self.bert_ckpt_file)

    model.summary()
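# `freeze_bert_layers` is not part of the bert-for-tf2 API. The sketch below
# follows the helper used in the library's example notebooks: with adapter-BERT
# (https://arxiv.org/abs/1902.00751) only adapters and LayerNorms stay trainable.
def flatten_layers(root_layer):
    if isinstance(root_layer, keras.layers.Layer):
        yield root_layer
    for layer in root_layer._layers:
        for sub_layer in flatten_layers(layer):
            yield sub_layer


def freeze_bert_layers(l_bert):
    for layer in flatten_layers(l_bert):
        if layer.name in ["LayerNorm", "adapter-down", "adapter-up"]:
            layer.trainable = True
        elif len(layer._layers) == 0:
            layer.trainable = False
    l_bert.embeddings_layer.trainable = False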
def predict_on_keras_model(self, input_ids, input_mask, token_type_ids):
    max_seq_len = input_ids.shape[-1]

    model, l_bert, k_inputs = self.create_bert_model(max_seq_len)
    model.build(input_shape=[(None, max_seq_len), (None, max_seq_len)])
    bert.load_stock_weights(l_bert, self.bert_ckpt_file)

    k_res = model.predict([input_ids, token_type_ids])
    return k_res
def create_model(max_seq_len, model_dir, model_ckpt, freeze=True, adapter_size=4):
    bert_params = bert.params_from_pretrained_ckpt(model_dir)
    print(f'bert params: {bert_params}')
    bert_params.adapter_size = adapter_size
    bert_params.adapter_init_scale = 1e-5
    l_bert = bert.BertModelLayer.from_params(bert_params, name="bert")

    input_ids = keras.layers.Input(shape=(max_seq_len,), dtype='int32', name="input_ids")
    bert_output = l_bert(input_ids)
    print("bert shape", bert_output.shape)

    cls_out = keras.layers.Lambda(lambda seq: seq[:, 0, :], name='lambda')(bert_output)
    cls_out = keras.layers.Dropout(0.5)(cls_out)
    logits = keras.layers.Dense(name='dense_sin', units=768, activation=tf.math.sin)(cls_out)
    # logits = keras.layers.Dense(name='dense_tanh', units=768, activation="tanh")(cls_out)
    # logits = keras.layers.Dense(name='dense_relu', units=256, activation="relu")(cls_out)
    # logits = keras.layers.Dense(name='dense_gelu', units=256, activation="gelu")(cls_out)
    logits = keras.layers.BatchNormalization()(logits)
    logits = keras.layers.Dropout(0.5)(logits)
    logits = keras.layers.Dense(name='initial_predictions', units=len(classes), activation="softmax")(logits)

    model = keras.Model(inputs=input_ids, outputs=logits)
    model.build(input_shape=(None, max_seq_len))
    model.summary()

    if freeze:
        l_bert.apply_adapter_freeze()
        l_bert.embeddings_layer.trainable = False
        model.summary()

    # additional info: https://arxiv.org/abs/1902.00751
    # apply global regularization on all trainable dense layers
    pf.utils.add_dense_layer_loss(model,
                                  kernel_regularizer=keras.regularizers.l2(0.01),
                                  bias_regularizer=keras.regularizers.l2(0.01))

    model.compile(
        optimizer=pf.optimizers.RAdam(),
        # loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),  # for some reason it does not work at all with logits
        loss=keras.losses.SparseCategoricalCrossentropy(from_logits=False),
        metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")])

    bert.load_stock_weights(l_bert, model_ckpt)
    # bert.load_bert_weights(l_bert, model_ckpt)

    return model
def create_bert_model(self):
    bert_params = bert.params_from_pretrained_ckpt(self.model_dir)
    l_bert = bert.BertModelLayer.from_params(bert_params, name="bert")

    l_input_ids = tf.keras.layers.Input(shape=(self.max_seq_length,), dtype='int32')
    output = l_bert(l_input_ids)

    model = tf.keras.Model(inputs=l_input_ids, outputs=output)
    model.build(input_shape=(None, self.max_seq_length))
    bert.load_stock_weights(l_bert, self.model_ckpt)
    return model
def Bert_feature_extraction(ids, texts, max_seq_len, feature_file_name):
    # https://github.com/kpe/bert-for-tf2
    model_dir = ".models/uncased_L-12_H-768_A-12/uncased_L-12_H-768_A-12"
    bert_ckpt_file = os.path.join(model_dir, "bert_model.ckpt")

    bert_params = bert.params_from_pretrained_ckpt(model_dir)
    l_bert = bert.BertModelLayer.from_params(bert_params, name="bert")

    l_input_ids = keras.layers.Input(shape=(max_seq_len,), dtype='int32')
    l_token_type_ids = keras.layers.Input(shape=(max_seq_len,), dtype='int32')

    # using the default token_type/segment id 0
    output = l_bert(l_input_ids)  # output: [batch_size, max_seq_len, hidden_size]
    output = keras.layers.GlobalAveragePooling1D()(output)
    model = keras.Model(inputs=l_input_ids, outputs=output)
    model.build(input_shape=(None, max_seq_len))

    bert.load_stock_weights(l_bert, bert_ckpt_file)

    do_lower_case = not (model_dir.find("cased") == 0 or model_dir.find("multi_cased") == 0)
    bert.bert_tokenization.validate_case_matches_checkpoint(do_lower_case, bert_ckpt_file)
    vocab_file = os.path.join(model_dir, "vocab.txt")
    tokenizer = bert.bert_tokenization.FullTokenizer(vocab_file, do_lower_case)

    feature_dict = {}
    for i in range(len(ids)):
        id = ids[i]
        print(id)
        title = texts[i]
        tokens = tokenizer.tokenize(title)
        print(tokens)
        tokens = ["[CLS]"] + tokens + ["[SEP]"]
        token_ids = tokenizer.convert_tokens_to_ids(tokens)
        while len(token_ids) < max_seq_len:
            token_ids.append(0)
        if len(token_ids) > max_seq_len:
            token_ids = token_ids[:max_seq_len]
        print(token_ids)
        token_ids = np.array([token_ids], dtype=np.int32)
        feature = model.predict(token_ids)
        feature_dict[id] = feature.tolist()[0]

    np.save(feature_file_name, feature_dict)
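# Example usage of the feature extractor above (hypothetical ids/texts; the
# features are stored as a dict inside a .npy file):
# ids = ["doc-1", "doc-2"]
# texts = ["first example sentence", "and a second one"]
# Bert_feature_extraction(ids, texts, max_seq_len=32, feature_file_name="features.npy")
# features = np.load("features.npy", allow_pickle=True).item()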
def test_extend_pretrained_tokens(self):
    model_dir = tempfile.TemporaryDirectory().name
    os.makedirs(model_dir)
    save_path = MiniBertFactory.create_mini_bert_weights(model_dir)
    tokenizer = bert.FullTokenizer(vocab_file=os.path.join(model_dir, "vocab.txt"), do_lower_case=True)

    ckpt_dir = os.path.dirname(save_path)
    bert_params = bert.params_from_pretrained_ckpt(ckpt_dir)
    self.assertEqual(bert_params.token_type_vocab_size, 2)

    bert_params.extra_tokens_vocab_size = 3

    l_bert = bert.BertModelLayer.from_params(bert_params)

    # we dummy call the layer once in order to instantiate the weights
    l_bert([np.array([[1, 1, 0]]), np.array([[1, 0, 0]])], mask=[[True, True, False]])

    mismatched = bert.load_stock_weights(l_bert, save_path)
    self.assertEqual(0, len(mismatched), "no weights should have mismatched shapes")

    l_bert([np.array([[1, -3, 0]]), np.array([[1, 0, 0]])], mask=[[True, True, False]])
def get_bert_model(max_length: int,
                   freeze_bert_layers: bool = False,
                   load_bert_weights: bool = True) -> tf.keras.Model:
    """
    Requires a bert folder downloaded from https://github.com/google-research/bert
    :param max_length: maximum size of a sentence
    :param freeze_bert_layers: freeze the BERT weights and train only the classification head
    :param load_bert_weights: load the pre-trained weights from the checkpoint
    :return: tensorflow model object
    """
    bert_params: BertModelLayer.Params = params_from_pretrained_ckpt(model_dir)
    l_bert: BertModelLayer = BertModelLayer.from_params(bert_params, name='bert')
    if freeze_bert_layers:
        # With all bert weights frozen, the performance is not very good
        l_bert.apply_adapter_freeze()
        l_bert.trainable = False

    l_input_ids: tf.Tensor = tf.keras.layers.Input(shape=(max_length,), dtype='int32')
    # If needed, usage of token_type_ids is described here:
    # https://github.com/kpe/bert-for-tf2/blob/master/examples/gpu_movie_reviews.ipynb
    output: tf.Tensor = l_bert(l_input_ids)  # [batch_size, max_seq_len, hidden_size]
    output = tf.keras.layers.GlobalAveragePooling1D()(output)  # [batch_size, hidden_size]

    # Fine-tune for task
    output = tf.keras.layers.Dense(class_count, activation='softmax')(output)  # [batch_size, class_count]
    model: tf.keras.Model = tf.keras.Model(inputs=[l_input_ids], outputs=output)

    # Comment from bert repo: The learning rate we used in the paper was 1e-4.
    # However, if you are doing additional steps of pre-training starting from an existing
    # BERT checkpoint, you should use a smaller learning rate (e.g., 2e-5)
    model.build(input_shape=(None, max_length))
    model.compile(loss=tf.keras.losses.SparseCategoricalCrossentropy(),
                  optimizer=tf.keras.optimizers.Adam(lr=1e-5),
                  metrics=['accuracy'])
    model.summary()

    if load_bert_weights:
        bert_ckpt_file: str = os.path.join(model_dir, "bert_model.ckpt")
        load_stock_weights(l_bert, bert_ckpt_file)

    return model
def test_extend_pretrained_segments(self):
    model_dir = tempfile.TemporaryDirectory().name
    os.makedirs(model_dir)
    save_path = MiniBertFactory.create_mini_bert_weights(model_dir)
    tokenizer = bert.FullTokenizer(vocab_file=os.path.join(model_dir, "vocab.txt"), do_lower_case=True)

    ckpt_dir = os.path.dirname(save_path)
    bert_params = bert.params_from_pretrained_ckpt(ckpt_dir)
    self.assertEqual(bert_params.token_type_vocab_size, 2)

    bert_params.token_type_vocab_size = 4

    l_bert = bert.BertModelLayer.from_params(bert_params)

    # we dummy call the layer once in order to instantiate the weights
    l_bert([np.array([[1, 1, 0]]), np.array([[1, 0, 0]])])  # , mask=[[True, True, False]])

    #
    # - load the weights from a pre-trained model,
    # - expect a mismatch for the token_type embeddings
    # - use the segment/token type id=0 embedding for the missing token types
    #
    mismatched = bert.load_stock_weights(l_bert, save_path)

    self.assertEqual(1, len(mismatched), "token_type embeddings should have mismatched shape")
    for weight, value in mismatched:
        if re.match("(.*)embeddings/token_type_embeddings/embeddings:0", weight.name):
            seg0_emb = value[:1, :]
            new_segment_embeddings = np.repeat(seg0_emb, (weight.shape[0] - value.shape[0]), axis=0)
            new_value = np.concatenate([value, new_segment_embeddings], axis=0)
            keras.backend.batch_set_value([(weight, new_value)])

    tte = l_bert.embeddings_layer.token_type_embeddings_layer.weights[0]
    if not tf.executing_eagerly():
        with tf.keras.backend.get_session() as sess:
            tte, = sess.run((tte,))

    self.assertTrue(np.allclose(seg0_emb, tte[0], 1e-6))
    self.assertFalse(np.allclose(seg0_emb, tte[1], 1e-6))
    self.assertTrue(np.allclose(seg0_emb, tte[2], 1e-6))
    self.assertTrue(np.allclose(seg0_emb, tte[3], 1e-6))

    bert_params.token_type_vocab_size = 4
    print("token_type_vocab_size", bert_params.token_type_vocab_size)
    print(l_bert.embeddings_layer.trainable_weights[1])
def __init__(self, model_dir, max_length, bert_params, num_layers, trainable):
    super(EncoderBert, self).__init__()
    assert isinstance(max_length, int)
    assert bert_params is not None or model_dir is not None
    if bert_params is None:
        assert os.path.exists(model_dir)
        bert_params = params_from_pretrained_ckpt(model_dir)
    if isinstance(num_layers, int):
        bert_params.num_layers = num_layers
    if bert_params.max_position_embeddings < max_length:
        bert_params.max_position_embeddings = max_length
    l_bert = BertModelLayer.from_params(bert_params, name="bert")
    l_input_ids = tf.keras.layers.Input(shape=(max_length,), dtype='int32')
    output = l_bert(l_input_ids)
    model = tf.keras.Model(inputs=l_input_ids, outputs=output)
    model.build(input_shape=(None, max_length))

    def flatten_layers(root_layer):
        if isinstance(root_layer, tf.keras.layers.Layer):
            yield root_layer
        for layer in root_layer._layers:
            for sub_layer in flatten_layers(layer):
                yield sub_layer

    if not trainable:
        for layer in flatten_layers(l_bert):
            layer.trainable = False

    self.model = model
    if model_dir is not None:
        bert_ckpt_file = os.path.join(model_dir, "bert_model.ckpt")
        load_stock_weights(l_bert, bert_ckpt_file)
def BertModel(bertTokensShape):
    config = configparser.ConfigParser()
    config.read('conf.txt')
    bert_model_dir = config['GENERAL']['BERT_MODEL_DIR']
    bert_ckpt = config['GENERAL']['BERT_CKPT']
    current_dir = os.path.dirname(os.path.dirname(os.path.realpath(__file__)))
    bert_model_dir = os.path.join(current_dir, "bert_model", bert_model_dir)

    inputs = keras.Input(shape=bertTokensShape, name='bert_token_ids')
    bert_layer = get_bert_layer(bert_model_dir)
    bert_vectors = bert_layer(inputs)
    bert_vectors = keras.layers.Lambda(lambda seq: seq[:, 0, :])(bert_vectors)
    model = keras.Model(inputs=inputs, outputs=bert_vectors, name="bert_vectors")

    bert_ckpt_file = os.path.join(bert_model_dir, bert_ckpt)
    bert.load_stock_weights(bert_layer, bert_ckpt_file)
    return model
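# `get_bert_layer` is not defined in this snippet. A plausible sketch, following
# the params_from_pretrained_ckpt / from_params pattern used throughout these examples:
def get_bert_layer(bert_model_dir):
    bert_params = bert.params_from_pretrained_ckpt(bert_model_dir)
    return bert.BertModelLayer.from_params(bert_params, name="bert")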
def create_model() -> k.Sequential:
    bert_layer = create_bert_layer()

    model = k.Sequential([
        k.layers.Input(shape=(MAX_LEN,), dtype='int32', name='input_ids'),
        bert_layer,
        k.layers.TimeDistributed(k.layers.Dense(768 * 3, activation=tf.nn.relu)),
        k.layers.TimeDistributed(k.layers.Dense(len(CLASSES), activation=tf.nn.softmax))
    ])

    model.build()
    bert_layer.apply_adapter_freeze()
    bert.load_stock_weights(bert_layer, BERT_WEIGHTS_PATH)

    model.compile(loss='categorical_crossentropy',
                  optimizer=tf.optimizers.Adam(learning_rate=1e-4),
                  metrics=['categorical_accuracy'])

    return model
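# `create_bert_layer` and the module-level constants (MAX_LEN, CLASSES,
# BERT_WEIGHTS_PATH) are assumed by the function above. A sketch of the layer
# factory, with BERT_MODEL_DIR as a hypothetical checkpoint directory:
def create_bert_layer() -> bert.BertModelLayer:
    bert_params = bert.params_from_pretrained_ckpt(BERT_MODEL_DIR)
    # optional adapter-BERT (arXiv:1902.00751): with adapters enabled,
    # apply_adapter_freeze() keeps only adapters and LayerNorms trainable
    bert_params.adapter_size = 4
    return bert.BertModelLayer.from_params(bert_params, name='bert')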
def test_multi(self):
    print(self.bert_ckpt_dir)

    bert_params = bert.loader.params_from_pretrained_ckpt(self.bert_ckpt_dir)
    bert_params.adapter_size = 32
    l_bert = bert.BertModelLayer.from_params(bert_params, name="bert")

    max_seq_len = 128
    l_input_ids = tf.keras.layers.Input(shape=(max_seq_len,), dtype='int32', name="input_ids")
    l_token_type_ids = tf.keras.layers.Input(shape=(max_seq_len,), dtype='int32', name="token_type_ids")

    output = l_bert([l_input_ids, l_token_type_ids])
    model = tf.keras.Model(inputs=[l_input_ids, l_token_type_ids], outputs=output)
    model.build(input_shape=[(None, max_seq_len), (None, max_seq_len)])

    bert.load_stock_weights(l_bert, self.bert_ckpt_file)

    model.summary()
def get_bert_model():
    bert_params = params_from_pretrained_ckpt(model_dir)
    l_bert = BertModelLayer.from_params(bert_params, name='bert')

    # Freeze bert layers
    l_bert.apply_adapter_freeze()
    l_bert.trainable = False

    l_input_ids = tf.keras.layers.Input(shape=(max_length,), dtype='int32')
    l_token_type_ids = tf.keras.layers.Input(shape=(max_length,), dtype='int32')

    # provide a custom token_type/segment id as a layer input
    intermediate_output = l_bert([l_input_ids, l_token_type_ids])  # [batch_size, max_seq_len, hidden_size]
    averaged_output = tf.keras.layers.GlobalAveragePooling1D()(intermediate_output)

    l_middle_output = tf.keras.layers.Dense(16, activation='relu')
    l_output = tf.keras.layers.Dense(1, activation='sigmoid')
    m_output = l_middle_output(averaged_output)
    output = l_output(m_output)

    model = tf.keras.Model(inputs=[l_input_ids, l_token_type_ids], outputs=output)

    optimizer = tf.keras.optimizers.Adam()
    model.build(input_shape=[(None, max_length), (None, max_length)])
    model.compile(loss='binary_crossentropy',
                  optimizer=optimizer,
                  metrics=['accuracy'])
    model.summary()

    bert_ckpt_file = os.path.join(model_dir, "bert_model.ckpt")
    load_stock_weights(l_bert, bert_ckpt_file)

    return model
def test_finetuning_workflow(self):
    # create a BERT layer with config from the checkpoint
    bert_params = bert.params_from_pretrained_ckpt(self.ckpt_dir)

    max_seq_len = 12
    model, l_bert = self.build_model(bert_params, max_seq_len=max_seq_len)
    model.summary()

    # freeze non-adapter weights
    l_bert.apply_adapter_freeze()
    model.summary()

    # load the BERT weights from the pre-trained model
    bert.load_stock_weights(l_bert, self.ckpt_path)

    # prepare the data
    inputs, targets = ["hello world", "goodbye"], [1, 2]
    tokens = [self.tokenizer.tokenize(toks) for toks in inputs]
    tokens = [self.tokenizer.convert_tokens_to_ids(toks) for toks in tokens]
    tokens = [toks + [0] * (max_seq_len - len(toks)) for toks in tokens]
    x = np.array(tokens)
    y = np.array(targets)

    # fine-tune
    model.fit(x, y, epochs=3)

    # preserve the logits for comparison before and after restoring the fine-tuned model
    logits = model.predict(x)

    # now store the adapter weights only

    # old fashion - using saver
    # finetuned_weights = {w.name: w.value() for w in model.trainable_weights}
    # saver = tf.compat.v1.train.Saver(finetuned_weights)
    # fine_path = saver.save(tf.compat.v1.keras.backend.get_session(), fine_ckpt)

    fine_ckpt = os.path.join(self.ckpt_dir, "fine-tuned.ckpt")
    finetuned_weights = {w.name: w for w in model.trainable_weights}
    checkpoint = tf.train.Checkpoint(**finetuned_weights)
    fine_path = checkpoint.save(file_prefix=fine_ckpt)
    print("fine tuned ckpt:", fine_path)

    # build a new model
    tf.compat.v1.keras.backend.clear_session()
    model, l_bert = self.build_model(bert_params, max_seq_len=max_seq_len)
    l_bert.apply_adapter_freeze()

    # load the BERT weights from the pre-trained checkpoint
    bert.load_stock_weights(l_bert, self.ckpt_path)

    # load the fine-tuned classifier model weights
    finetuned_weights = {w.name: w for w in model.trainable_weights}
    checkpoint = tf.train.Checkpoint(**finetuned_weights)
    load_status = checkpoint.restore(fine_path)
    load_status.assert_consumed().run_restore_ops()

    logits_restored = model.predict(x)

    # check the predictions of the restored model
    self.assertTrue(np.allclose(logits_restored, logits, 1e-6))
def get_model(
    lang,
    model_type,
    bert_model_path,
    max_length=300,
    num_feature=2,
    saved_epoch_path=None,
    configs=None,
):
    if model_type == "vi_attentive_reader":
        question_size = configs["question_size"]
        text_size = configs["text_size"]
        question_input = tf.keras.layers.Input(shape=(question_size,))
        text_input = tf.keras.layers.Input(shape=(text_size,))
        inputs = [question_input, text_input]

        attentive_reader = AttentiveReader(
            vocab_size=configs["vocab_size"],
            embedding_dim=200,
            q_units=200,
            p_units=200,
            num_rnn_layer=2,
        )
        output = attentive_reader(inputs)
        model = tf.keras.Model(inputs=inputs, outputs=output)

        if saved_epoch_path:
            # load the saved model
            # TODO: we will not save bert weights later
            print("Loading saved_epoch_path: {}".format(saved_epoch_path))
            model.load_weights(saved_epoch_path)
        return model
    elif model_type == "en_bert_bidaf":
        input_features = [tf.keras.layers.Input(shape=(num_feature, max_length))]
        bert_bidaf = EnBertBidaf(
            bert_model_path=bert_model_path,
            max_length=max_length,
        )
        output = bert_bidaf(input_features)
        model = tf.keras.Model(inputs=input_features, outputs=output)

        if saved_epoch_path:
            # load the saved model
            # TODO: we will not save bert weights later
            print("Loading saved_epoch_path: {}".format(saved_epoch_path))
            model.load_weights(saved_epoch_path)
        else:
            # load weights for the bert model
            weights_file = "{}/bert_model.ckpt".format(bert_model_path)
            print("Loading bert weights_file: {}".format(weights_file))
            load_stock_weights(bert_bidaf.bert_layer, weights_file)
        return model
# l_token_type_ids = keras.layers.Input(shape=(max_seq_len,), dtype='int32')
# using the default token_type/segment id 0
# output = l_bert([l_input_ids, l_token_type_ids])  # output: [batch_size, max_seq_len, hidden_size]
output = l_bert(l_input_ids)  # output: [batch_size, max_seq_len, hidden_size]

cls_out = keras.layers.Lambda(lambda seq: seq[:, 0, :])(output)
# cls_out = keras.layers.Dropout(0.5)(cls_out)
# logits = keras.layers.Dense(units=768, activation="tanh")(cls_out)
# logits = keras.layers.Dropout(0.5)(logits)
logits = keras.layers.Dense(units=3, activation="softmax")(cls_out)

model = keras.Model(inputs=l_input_ids, outputs=logits)
model.build(input_shape=(None, max_seq_len))

bert_ckpt_file = os.path.join(model_dir, "bert_model.ckpt")
bert.load_stock_weights(l_bert, bert_ckpt_file)

model.compile(
    optimizer=keras.optimizers.Adam(),
    # note: the final Dense already applies softmax, so from_logits=False would
    # be the consistent setting here
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=[keras.metrics.SparseCategoricalAccuracy(name="acc")])

model.summary()


def create_learning_rate_scheduler(max_learn_rate=5e-5,
                                   end_learn_rate=1e-7,
                                   warmup_epoch_count=10,
                                   total_epoch_count=90):
    def lr_scheduler(epoch):
        # the snippet was truncated here; completed with the usual linear-warmup /
        # exponential-decay schedule from the bert-for-tf2 examples (requires `import math`)
        if epoch < warmup_epoch_count:
            res = (max_learn_rate / warmup_epoch_count) * (epoch + 1)
        else:
            res = max_learn_rate * math.exp(
                math.log(end_learn_rate / max_learn_rate)
                * (epoch - warmup_epoch_count + 1)
                / (total_epoch_count - warmup_epoch_count + 1))
        return float(res)

    return tf.keras.callbacks.LearningRateScheduler(lr_scheduler, verbose=1)
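# Example wiring of the scheduler into training (a sketch; total_epoch_count
# should match the epochs passed to fit):
# lr_callback = create_learning_rate_scheduler(warmup_epoch_count=2, total_epoch_count=10)
# model.fit(train_x, train_y, epochs=10, callbacks=[lr_callback])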
def _load_bert(config):
    """
    Loads bert model using bert-for-tf2
    Args:
        config:
    Returns:
        bert-for-tf2 model
    """
    model_ckpt = config["embedder"]["bert"]["model_ckpt"]
    bert_params = bert_for_tf2.params_from_pretrained_ckpt(config["embedder"]["bert"]["model_dir"])
    max_seq_len = config["embedder"]["bert"]["max_seq_len"]
    # max_seq_len = bert_params.max_position_embeddings
    l_bert = bert_for_tf2.BertModelLayer.from_params(bert_params)

    l_input_ids = keras.layers.Input(shape=(max_seq_len,), dtype='int32')
    # l_token_type_ids = keras.layers.Input(shape=(max_seq_len,), dtype='int32')
    # (1 segment) using the default token_type/segment id 0
    bert_output = l_bert(l_input_ids)  # output: [batch_size, max_seq_len, hidden_size]

    # Pooling layer for sentence vector
    # if pooling == "default":
    #     # First token ([CLS]) "This output is usually not a good summary of the semantic content ..."
    #     first_token_tensor = tf.squeeze(bert_output[:, 0:1, :], axis=1)
    #     output = tf.keras.layers.Dense(bert_params.hidden_size,
    #                                    activation=tf.tanh,
    #                                    kernel_initializer=tf.keras.initializers.TruncatedNormal(
    #                                        stddev=bert_params.initializer_range))(first_token_tensor)
    # elif pooling == "average":
    #     output = tf.squeeze(
    #         tf.keras.layers.AveragePooling1D(pool_size=max_seq_len, data_format='channels_last')(bert_output),
    #         axis=1)
    # elif pooling == "max":
    #     output = tf.squeeze(
    #         tf.keras.layers.MaxPool1D(pool_size=self.max_seq_len, data_format='channels_last')(bert_output),
    #         axis=1)
    # # elif pooling == "median": remove zeros and do something
    # elif pooling == "none":
    #     output = bert_output
    #
    # model = keras.Model(inputs=l_input_ids, outputs=output)
    # model.build(input_shape=(None, max_seq_len))

    first_token_tensor = tf.squeeze(bert_output[:, 0:1, :], axis=1)
    pooled_output = tf.keras.layers.Dense(
        bert_params.hidden_size,
        activation=tf.tanh,
        kernel_initializer=tf.keras.initializers.TruncatedNormal(
            stddev=bert_params.initializer_range))(first_token_tensor)
    pooled_model = keras.Model(inputs=l_input_ids, outputs=pooled_output)
    pooled_model.build(input_shape=(None, max_seq_len))

    model = keras.Model(inputs=l_input_ids, outputs=bert_output)
    model.build(input_shape=(None, max_seq_len))

    l_bert.apply_adapter_freeze()
    bert_for_tf2.load_stock_weights(l_bert, model_ckpt)

    return model, pooled_model
def loadBertCheckpoint():
    modelsFolder = os.path.join(modelBertDir, "uncased_L-2_H-128_A-2")
    checkpointName = os.path.join(modelsFolder, "bert_model.ckpt")
    bert.load_stock_weights(bert_layer, checkpointName)
def load_bert_checkpoint():
    # checkpoint_name = os.path.join(models_folder, "bert_model.ckpt")
    bert.load_stock_weights(bert_layer, checkpoint_name)
def create_estimator(steps=None, warmup_steps=None, model_dir=args.model_dir,
                     num_labels=args.num_labels, max_seq_len=args.max_seq_len,
                     learning_rate=args.learning_rate, name='bert'):

    def my_auc(labels, predictions):
        auc_metric = tf.keras.metrics.AUC(name="my_auc")
        auc_metric.update_state(y_true=labels, y_pred=tf.argmax(predictions, 1))
        return {'auc': auc_metric}

    if name == 'bert':
        if warmup_steps is None:
            custom_objects = {
                'BertModelLayer': bert.BertModelLayer,
                'AdamW': AdamW,
                'PruneLowMagnitude': PruneLowMagnitude
            }
            if args.prune_enabled:
                with sparsity.prune_scope():
                    model = tf.keras.models.load_model(h5py.File(args.keras_model_path),
                                                       custom_objects=custom_objects)
            else:
                model = tf.keras.models.load_model(h5py.File(args.keras_model_path),
                                                   custom_objects=custom_objects)
            estimator = tf.keras.estimator.model_to_estimator(model, model_dir=args.output_dir)
            return estimator, model

        input_token_ids = tf.keras.Input((max_seq_len,), dtype=tf.int32, name='input_ids')
        input_segment_ids = tf.keras.Input((max_seq_len,), dtype=tf.int32, name='segment_ids')
        input_mask = tf.keras.Input((max_seq_len,), dtype=tf.int32, name='input_mask')

        bert_params = bert.params_from_pretrained_ckpt(model_dir)
        l_bert = bert.BertModelLayer.from_params(bert_params)
        bert_output = l_bert(inputs=[input_token_ids, input_segment_ids], mask=input_mask)

        if args.pool_strategy == 'cls':
            first_token = tf.keras.layers.Lambda(lambda seq: seq[:, 0, :])(bert_output)
            pooled_output = tf.keras.layers.Dense(units=first_token.shape[-1],
                                                  activation=tf.math.tanh)(first_token)
            dropout = tf.keras.layers.Dropout(rate=0.1)(pooled_output)
        elif args.pool_strategy == 'avg':
            seq1_tokens = tf.keras.layers.Lambda(lambda seq: seq[:, 1:args.max_seq_len - 1, :])(bert_output)
            seq2_tokens = tf.keras.layers.Lambda(lambda seq: seq[:, args.max_seq_len:2 * args.max_seq_len])(bert_output)
            # assumption: average-pool the sliced tokens and apply dropout, mirroring
            # the 'cls' branch (the 'avg' branch as given never defined `dropout`)
            pooled_output = tf.keras.layers.GlobalAveragePooling1D()(
                tf.keras.layers.Concatenate(axis=1)([seq1_tokens, seq2_tokens]))
            dropout = tf.keras.layers.Dropout(rate=0.1)(pooled_output)

        pruning_params = {
            'pruning_schedule': sparsity.PolynomialDecay(initial_sparsity=0.50,
                                                         final_sparsity=0.90,
                                                         begin_step=1000,
                                                         end_step=2000,
                                                         frequency=100)
        }

        dense = tf.keras.layers.Dense(units=num_labels, name='label_ids')
        if args.prune_enabled:
            pruned_dense = sparsity.prune_low_magnitude(dense, **pruning_params)
            logits = pruned_dense(dropout)
        else:
            logits = dense(dropout)
        output_prob = tf.keras.layers.Softmax(name='output_prob')(logits)

        model = tf.keras.Model(inputs=[input_token_ids, input_segment_ids, input_mask],
                               outputs=[logits])
        model.build(input_shape=[(None, max_seq_len,), (None, max_seq_len,), (None, max_seq_len,)])
        # freeze_bert_layers(l_bert)
        bert.load_stock_weights(l_bert, op.join(model_dir, 'bert_model.ckpt'))

        weight_decays = get_weight_decays(model)
        # iterate over a copy, since entries may be deleted while iterating
        for k, v in list(weight_decays.items()):
            if use_weight_decay(k):
                weight_decays[k] = 0.01
            else:
                del weight_decays[k]

        opt = create_optimizer(
            init_lr=learning_rate,
            steps=steps,
            weight_decays=weight_decays,
            warmup_steps=warmup_steps,
        )

        model.compile(
            optimizer=opt,
            loss={"{}label_ids".format('prune_low_magnitude_' if args.prune_enabled else ''):
                  tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)},  # for numerical stability
            metrics=[tf.keras.metrics.SparseCategoricalAccuracy()]
        )
        model.summary()

        config = tf.compat.v1.ConfigProto()
        config.gpu_options.allow_growth = True
        config.gpu_options.per_process_gpu_memory_fraction = args.gpu_memory_fraction
        config.log_device_placement = False

        exclude_optimizer_variables = r'^((?!(iter_updates|eta_t)).)*$'
        ws = tf.estimator.WarmStartSettings(
            ckpt_to_initialize_from=op.join(args.output_dir, 'keras'),
            vars_to_warm_start=exclude_optimizer_variables
        )
        estimator = tf.keras.estimator.model_to_estimator(keras_model=model,
                                                          config=tf.estimator.RunConfig(
                                                              model_dir=args.output_dir,
                                                              session_config=config,
                                                          ))
        estimator._warm_start_settings = ws
        return estimator, model

    raise NotImplementedError("* available models: [ bert, ]")
def compile_model(max_seq_len=max_seq_len, adapter_size=adapter_size, batch_size=None,
                  init_ckpt_file=None, init_bert_ckpt_file=bert_ckpt_file):
    """
    :rtype: keras sequential model
    :param max_seq_len:
    :param adapter_size:
    :type batch_size: integer
    :param init_ckpt_file:
    :param init_bert_ckpt_file:
    """
    # initializing Sequential model
    model = Sequential()

    # adding input_layer
    model.add(InputLayer(input_shape=(max_seq_len,), batch_size=batch_size,
                         dtype="int32", name="input_ids"))

    # adding BERT layer
    bert_params = params_from_pretrained_ckpt(dirname(join(model_dir, 'bert_model.ckpt')))

    # create the bert layer
    bert_params.adapter_size = adapter_size
    bert_params.adapter_init_scale = 1e-5
    bert_layer = BertModelLayer.from_params(bert_params, name="bert")
    model.add(bert_layer)

    # adding temporal Dense, Normalization and Activation layers
    model.add(TimeDistributed(Dense(bert_params.hidden_size // 32)))
    model.add(TimeDistributed(LayerNormalization()))
    model.add(TimeDistributed(Activation("tanh")))

    model.add(Concat([
        Lambda(lambda x: tf.math.reduce_max(x, axis=1, keepdims=False)),
        GlobalAveragePooling1D()
    ]))

    # dense_hidden_layer
    model.add(Dense(units=bert_params.hidden_size // 16))
    # normalization_layer
    model.add(LayerNormalization())
    # activation_layer
    model.add(Activation("tanh"))
    # dense_layer
    model.add(Dense(units=2))

    model.build(input_shape=(batch_size, max_seq_len))

    # freeze non-adapter-BERT layers for the case adapter_size is set
    bert_layer.apply_adapter_freeze()
    bert_layer.embeddings_layer.trainable = False  # True for unfreezing emb LayerNorms

    # apply global regularization on all trainable dense layers
    pf.utils.add_dense_layer_loss(model,
                                  kernel_regularizer=regularizers.l2(0.01),
                                  bias_regularizer=regularizers.l2(0.01))

    model.compile(optimizer=RAdam(),
                  loss=SparseCategoricalCrossentropy(from_logits=True),
                  metrics=[SparseCategoricalAccuracy(name="acc")])

    # load the pre-trained model weights (once the input_shape is known)
    if init_ckpt_file:
        print("Loading model weights from:", init_ckpt_file)
        model.load_weights(init_ckpt_file)
    elif init_bert_ckpt_file:
        print("Loading pre-trained BERT layer from:", init_bert_ckpt_file)
        load_stock_weights(bert_layer, init_bert_ckpt_file)

    return model
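# `Concat` above is not a stock Keras layer. A minimal sketch (an assumption):
# apply each branch (the max-pool Lambda and the average pooling) to the same
# input and concatenate the results along the feature axis.
class Concat(tf.keras.layers.Layer):
    def __init__(self, branches, **kwargs):
        super().__init__(**kwargs)
        self.branches = branches

    def call(self, inputs):
        return tf.concat([branch(inputs) for branch in self.branches], axis=-1)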
def loadBertCheckpoint():
    modelsFolder = os.path.join('./model/', "multi_cased_L-12_H-768_A-12")
    checkpointName = os.path.join(modelsFolder, "bert_model.ckpt")
    bert.load_stock_weights(bert_layer, checkpointName)
def loadBertCheckpoint():
    checkpointName = os.path.join(pTrain_dir, "bert_model.ckpt")
    bert.load_stock_weights(bert_layer, checkpointName)
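# The loadBertCheckpoint/load_bert_checkpoint variants above all assume a
# module-level `bert_layer` built beforehand, e.g. (a sketch):
# bert_params = bert.params_from_pretrained_ckpt(model_dir)
# bert_layer = bert.BertModelLayer.from_params(bert_params, name="bert")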
def build_transformer(transformer, max_seq_length, num_labels, tagging=True, tokenizer_only=False):
    spm_model_file = None
    if transformer in zh_albert_models_google:
        from bert.tokenization.albert_tokenization import FullTokenizer
        model_url = zh_albert_models_google[transformer]
        albert = True
    elif transformer in albert_models_tfhub:
        from bert.tokenization.albert_tokenization import FullTokenizer
        with stdout_redirected(to=os.devnull):
            model_url = fetch_tfhub_albert_model(transformer,
                                                 os.path.join(hanlp_home(), 'thirdparty', 'tfhub.dev',
                                                              'google', transformer))
        albert = True
        spm_model_file = glob.glob(os.path.join(model_url, 'assets', '*.model'))
        assert len(spm_model_file) == 1, 'No vocab found or ambiguous vocabs found'
        spm_model_file = spm_model_file[0]
    elif transformer in bert_models_google:
        from bert.tokenization.bert_tokenization import FullTokenizer
        model_url = bert_models_google[transformer]
        albert = False
    else:
        raise ValueError(
            f'Unknown model {transformer}, available ones: '
            f'{list(bert_models_google.keys()) + list(zh_albert_models_google.keys()) + list(albert_models_tfhub.keys())}')
    bert_dir = get_resource(model_url)
    if spm_model_file:
        vocab = glob.glob(os.path.join(bert_dir, 'assets', '*.vocab'))
    else:
        vocab = glob.glob(os.path.join(bert_dir, '*vocab*.txt'))
    assert len(vocab) == 1, 'No vocab found or ambiguous vocabs found'
    vocab = vocab[0]
    lower_case = any(key in transformer for key in ['uncased', 'multilingual', 'chinese', 'albert'])
    if spm_model_file:
        # noinspection PyTypeChecker
        tokenizer = FullTokenizer(vocab_file=vocab, spm_model_file=spm_model_file, do_lower_case=lower_case)
    else:
        tokenizer = FullTokenizer(vocab_file=vocab, do_lower_case=lower_case)
    if tokenizer_only:
        return tokenizer
    if spm_model_file:
        bert_params = albert_params(bert_dir)
    else:
        bert_params = bert.params_from_pretrained_ckpt(bert_dir)
    l_bert = bert.BertModelLayer.from_params(bert_params, name='albert' if albert else "bert")
    l_input_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype='int32', name="input_ids")
    l_mask_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype='int32', name="mask_ids")
    l_token_type_ids = tf.keras.layers.Input(shape=(max_seq_length,), dtype='int32', name="token_type_ids")
    output = l_bert([l_input_ids, l_token_type_ids], mask=l_mask_ids)
    if not tagging:
        output = tf.keras.layers.Lambda(lambda seq: seq[:, 0, :])(output)
    if bert_params.hidden_dropout:
        output = tf.keras.layers.Dropout(bert_params.hidden_dropout, name='hidden_dropout')(output)
    logits = tf.keras.layers.Dense(num_labels,
                                   kernel_initializer=tf.keras.initializers.TruncatedNormal(
                                       bert_params.initializer_range))(output)
    model = tf.keras.Model(inputs=[l_input_ids, l_mask_ids, l_token_type_ids], outputs=logits)
    model.build(input_shape=(None, max_seq_length))
    if not spm_model_file:
        ckpt = glob.glob(os.path.join(bert_dir, '*.index'))
        assert ckpt, f'No checkpoint found under {bert_dir}'
        ckpt, _ = os.path.splitext(ckpt[0])
    with stdout_redirected(to=os.devnull):
        if albert:
            if spm_model_file:
                skipped_weight_value_tuples = bert.load_albert_weights(l_bert, bert_dir)
            else:
                # noinspection PyUnboundLocalVariable
                skipped_weight_value_tuples = load_stock_weights(l_bert, ckpt)
        else:
            # noinspection PyUnboundLocalVariable
            skipped_weight_value_tuples = bert.load_bert_weights(l_bert, ckpt)
    assert 0 == len(skipped_weight_value_tuples), f'failed to load pretrained {transformer}'
    return model, tokenizer