def test_same_result(self):
    base_location = './google_bert/downloads/multilingual_L-12_H-768_A-12/'
    bert_config = BertConfig.from_json_file(base_location + 'bert_config.json')
    init_checkpoint = base_location + 'bert_model.ckpt'

    def model_fn_builder(bert_config, init_checkpoint):
        """Returns `model_fn` closure for TPUEstimator."""

        def model_fn(features, labels, mode, params):  # pylint: disable=unused-argument
            """The `model_fn` for TPUEstimator."""
            unique_ids = features["unique_ids"]
            input_ids = features["input_ids"]
            input_mask = features["input_mask"]
            input_type_ids = features["input_type_ids"]
            model = BertModel(config=bert_config,
                              is_training=False,
                              input_ids=input_ids,
                              input_mask=input_mask,
                              token_type_ids=input_type_ids,
                              use_one_hot_embeddings=False)
            if mode != tf.estimator.ModeKeys.PREDICT:
                raise ValueError("Only PREDICT modes are supported: %s" % mode)
            tvars = tf.trainable_variables()
            scaffold_fn = None
            (assignment_map, _) = get_assignment_map_from_checkpoint(tvars, init_checkpoint)
            tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
            predictions = {
                "unique_id": unique_ids,
                "seq_out": model.get_sequence_output()
            }
            output_spec = tf.contrib.tpu.TPUEstimatorSpec(
                mode=mode, predictions=predictions, scaffold_fn=scaffold_fn)
            return output_spec

        return model_fn

    batch_size = 8
    seq_len = 5
    # Random token ids away from the special-token range, plus random
    # attention masks and segment ids.
    xmb = np.random.randint(106, bert_config.vocab_size - 106, (batch_size, seq_len))
    xmb2 = np.random.randint(0, 2, (batch_size, seq_len), dtype=np.int32)
    xmb3 = np.random.randint(0, 2, (batch_size, seq_len), dtype=np.int32)

    def input_fn(params):
        # Note: tf.constant fills the remaining entries with the last list
        # value when the requested shape is larger than the input.
        d = tf.data.Dataset.from_tensor_slices({
            "unique_ids": tf.constant([0, 1, 2], shape=[batch_size], dtype=tf.int32),
            "input_ids": tf.constant(xmb, shape=[batch_size, seq_len], dtype=tf.int32),
            "input_mask": tf.constant(xmb2, shape=[batch_size, seq_len], dtype=tf.int32),
            "input_type_ids": tf.constant(xmb3, shape=[batch_size, seq_len], dtype=tf.int32),
        })
        d = d.batch(batch_size=batch_size, drop_remainder=False)
        return d

    # Reference activations from the Google TF implementation.
    model_fn = model_fn_builder(bert_config=bert_config, init_checkpoint=init_checkpoint)
    is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2
    run_config = tf.contrib.tpu.RunConfig(
        master=None,
        tpu_config=tf.contrib.tpu.TPUConfig(
            num_shards=8, per_host_input_for_training=is_per_host))
    estimator = tf.contrib.tpu.TPUEstimator(use_tpu=False,
                                            model_fn=model_fn,
                                            config=run_config,
                                            predict_batch_size=batch_size)
    tf_result = [r for r in estimator.predict(input_fn)]

    # Activations from the Keras port, fed the same inputs (with token ids
    # remapped to the port's vocabulary layout).
    import tensorflow.keras.backend as K
    K.set_learning_phase(0)
    my_model = load_google_bert(base_location, max_len=seq_len)
    from data.dataset import create_attention_mask, generate_pos_ids
    pos = generate_pos_ids(batch_size, seq_len)
    k_mask = create_attention_mask(xmb2, False, None, None, True)
    bert_encoder = BERTTextEncoder(base_location + 'vocab.txt')
    for b in range(len(xmb)):
        xmb[b] = np.array(bert_encoder.standardize_ids(xmb[b].tolist()))
    k_output = my_model.predict([xmb, xmb3, pos, k_mask])

    max_max = 0
    for i in range(batch_size):
        if k_mask[i].mean() != 0:  # TODO (when mask == full zero, keras_res != tf_res)
            new_max = np.abs(k_output[i] - tf_result[i]['seq_out']).max()
            if new_max > max_max:
                max_max = new_max
    assert max_max < 5e-5, max_max  # TODO reduce the error (I think it's because of the LayerNorm)
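# Aside (sketch, not from the repo): the TPUEstimator plumbing above is only
# needed to reuse Google's predict pipeline; the same reference activations
# can be computed with a plain TF 1.x session. Hypothetical helper, assuming
# the same BertModel / get_assignment_map_from_checkpoint imports as above,
# called before the ids are re-standardized for the Keras model.
def _tf_reference_without_estimator(bert_config, init_checkpoint, xmb, xmb2, xmb3):
    batch_size, seq_len = xmb.shape
    tf.reset_default_graph()
    input_ids = tf.placeholder(tf.int32, (batch_size, seq_len))
    input_mask = tf.placeholder(tf.int32, (batch_size, seq_len))
    type_ids = tf.placeholder(tf.int32, (batch_size, seq_len))
    model = BertModel(config=bert_config, is_training=False,
                      input_ids=input_ids, input_mask=input_mask,
                      token_type_ids=type_ids, use_one_hot_embeddings=False)
    (assignment_map, _) = get_assignment_map_from_checkpoint(
        tf.trainable_variables(), init_checkpoint)
    tf.train.init_from_checkpoint(init_checkpoint, assignment_map)
    with tf.Session() as sess:
        # the initializer picks up the values set by init_from_checkpoint
        sess.run(tf.global_variables_initializer())
        return sess.run(model.get_sequence_output(),
                        {input_ids: xmb, input_mask: xmb2, type_ids: xmb3})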
    if args.const_folding:
        outputs = [
            tf.identity(tf.identity(logits, name="logits"),
                        name="logits_identity")
        ]
    else:
        outputs = [tf.identity(logits, name="logits")]
elif args.model_name == 'bert':
    print('>> Converting graph bert')
    batch_size = 1
    seq_len = 128
    num_layers = 2
    bert_config = BertConfig(
        vocab_size=30522,
        hidden_size=1024,  # BERT-base: 768
        num_hidden_layers=num_layers,  # BERT-base: 12
        num_attention_heads=16,  # BERT-base: 12
        intermediate_size=4096,  # BERT-base: 3072
        type_vocab_size=2,
    )
    input_ids = tf.placeholder(tf.int32, shape=(batch_size, seq_len))
    input_mask = tf.placeholder(tf.int32, shape=(batch_size, seq_len))
    segment_ids = tf.placeholder(tf.int32, shape=(batch_size, seq_len))
    model = BertModel(config=bert_config,
                      is_training=False,
                      input_ids=input_ids,
                      input_mask=input_mask,
                      token_type_ids=segment_ids,
                      use_one_hot_embeddings=False)
    output_layer = model.get_pooled_output()
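    # Sketch (not from the original script): exporting this graph would still
    # require wrapping the pooled output in a named op and freezing variables
    # to constants. Hypothetical TF 1.x step; the node and file names below
    # are illustrative, not taken from the source.
    logits = tf.identity(output_layer, name="logits")
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        frozen_graph_def = tf.graph_util.convert_variables_to_constants(
            sess, sess.graph.as_graph_def(), output_node_names=["logits"])
        with tf.gfile.GFile("bert_frozen.pb", "wb") as f:
            f.write(frozen_graph_def.SerializeToString())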
def load_bert(base_location: str = './google_bert/model/uncased_L-12_H-768_A-12/',
              use_attn_mask: bool = True,
              max_len: int = 512) -> keras.Model:
    import tensorflow as tf
    from google_bert.modeling import BertConfig
    bert_config = BertConfig.from_json_file(base_location + 'bert_config.json')
    init_checkpoint = base_location + 'bert_model.ckpt'
    var_names = tf.train.list_variables(init_checkpoint)
    check_point = tf.train.load_checkpoint(init_checkpoint)
    model = create_transformer(
        embedding_layer_norm=True,
        neg_inf=-10000.0,
        use_attn_mask=use_attn_mask,
        vocab_size=bert_config.vocab_size - TextEncoder.SPECIAL_COUNT,
        accurate_gelu=True,
        ln_epsilon=1e-12,
        max_len=max_len,
        use_one_embedding_dropout=True,
        d_hid=bert_config.intermediate_size,
        embedding_dim=bert_config.hidden_size,
        num_layers=bert_config.num_hidden_layers,  # was bert_config.hidden_size, a bug
        num_heads=bert_config.num_attention_heads,
        residual_dropout=bert_config.hidden_dropout_prob,
        attention_dropout=bert_config.attention_probs_dropout_prob)
    if K.backend() == 'tensorflow':
        weights = [np.zeros(w.shape) for w in model.weights]
    else:
        weights = [np.zeros(w.get_value().shape) for w in model.weights]
    for var_name, _ in var_names:
        w_id = None
        qkv = None
        is_pos_embedding = False
        unsqueeze = False
        parts = var_name.split('/')
        first_vars_size = 5
        if parts[1] == 'embeddings':
            n = parts[-1]
            if n == 'token_type_embeddings':  # TODO handle special_tokens
                w_id = 0
            elif n == 'position_embeddings':
                w_id = 1
                is_pos_embedding = True
            elif n == 'word_embeddings':
                w_id = 2
            elif n == 'gamma':
                w_id = 3
            elif n == 'beta':
                w_id = 4
            else:
                raise ValueError()
        elif parts[2].startswith('layer_'):
            layer_number = int(parts[2][len('layer_'):])
            if parts[3] == 'attention':
                if parts[-1] == 'beta':
                    w_id = first_vars_size + layer_number * 12 + 5
                elif parts[-1] == 'gamma':
                    w_id = first_vars_size + layer_number * 12 + 4
                elif parts[-2] == 'dense':
                    if parts[-1] == 'bias':
                        w_id = first_vars_size + layer_number * 12 + 3
                    elif parts[-1] == 'kernel':
                        w_id = first_vars_size + layer_number * 12 + 2
                        unsqueeze = True
                    else:
                        raise ValueError()
                elif parts[-2] in ('key', 'query', 'value'):
                    w_id = first_vars_size + layer_number * 12 + (0 if parts[-1] == 'kernel' else 1)
                    unsqueeze = parts[-1] == 'kernel'
                    qkv = parts[-2][0]
                else:
                    raise ValueError()
            elif parts[3] == 'intermediate':
                if parts[-1] == 'bias':
                    w_id = first_vars_size + layer_number * 12 + 7
                elif parts[-1] == 'kernel':
                    w_id = first_vars_size + layer_number * 12 + 6
                    unsqueeze = True
                else:
                    raise ValueError()
            elif parts[3] == 'output':
                if parts[-1] == 'beta':
                    w_id = first_vars_size + layer_number * 12 + 11
                elif parts[-1] == 'gamma':
                    w_id = first_vars_size + layer_number * 12 + 10
                elif parts[-1] == 'bias':
                    w_id = first_vars_size + layer_number * 12 + 9
                elif parts[-1] == 'kernel':
                    w_id = first_vars_size + layer_number * 12 + 8
                    unsqueeze = True
                else:
                    raise ValueError()
        if w_id is not None and qkv is None:
            print(var_name, ' -> ', model.weights[w_id].name)
            if is_pos_embedding:
                weights[w_id][:max_len, :] = check_point.get_tensor(var_name)[:max_len, :] \
                    if not unsqueeze else check_point.get_tensor(var_name)[None, :max_len, :]
            else:
                weights[w_id][:] = check_point.get_tensor(var_name) if not unsqueeze else \
                    check_point.get_tensor(var_name)[None, ...]
        elif w_id is not None:
            print(var_name, ' -> ', model.weights[w_id].name, '::', qkv)
            # q/k/v are packed into a single qkv kernel/bias in the Keras
            # model, so copy each checkpoint tensor into its slice.
            p = {'q': 0, 'k': 1, 'v': 2}[qkv]
            if weights[w_id].ndim == 3:  # qkv kernel: (1, hidden, 3 * hidden)
                dim_size = weights[w_id].shape[1]
                weights[w_id][0, :, p * dim_size:(p + 1) * dim_size] = \
                    check_point.get_tensor(var_name) if not unsqueeze else \
                    check_point.get_tensor(var_name)[None, ...]
            else:  # qkv bias: (3 * hidden,)
                dim_size = weights[w_id].shape[0] // 3
                weights[w_id][p * dim_size:(p + 1) * dim_size] = check_point.get_tensor(var_name)
        else:
            print('not mapped: ', var_name)  # TODO pooler, cls/predictions, cls/seq_relationship
    model.set_weights(weights)
    return model
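# Reference for the w_id arithmetic used by both loaders, reconstructed from
# the branches above: the first five model weights are the embedding block,
# then each transformer layer contributes twelve weights at
# first_vars_size + layer_number * 12 + offset. The label strings are
# descriptive only, not identifiers from the codebase.
EMBEDDING_OFFSETS = {
    0: 'token_type_embeddings',
    1: 'position_embeddings',
    2: 'word_embeddings',
    3: 'embedding LayerNorm gamma',
    4: 'embedding LayerNorm beta',
}
PER_LAYER_OFFSETS = {
    0: 'attention qkv kernel (q, k, v packed along the last axis)',
    1: 'attention qkv bias (q, k, v packed along the first axis)',
    2: 'attention projection kernel (.../attention/output/dense/kernel)',
    3: 'attention projection bias',
    4: 'attention LayerNorm gamma',
    5: 'attention LayerNorm beta',
    6: 'feed-forward kernel (.../intermediate/dense/kernel)',
    7: 'feed-forward bias',
    8: 'output kernel (.../output/dense/kernel)',
    9: 'output bias',
    10: 'output LayerNorm gamma',
    11: 'output LayerNorm beta',
}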
def load_google_bert(base_location: str = './google_bert/downloads/multilingual_L-12_H-768_A-12/',
                     use_attn_mask: bool = True,
                     max_len: int = 512,
                     verbose: bool = False) -> keras.Model:
    bert_config = BertConfig.from_json_file(base_location + 'bert_config.json')
    init_checkpoint = base_location + 'bert_model.ckpt'
    var_names = tf.train.list_variables(init_checkpoint)
    check_point = tf.train.load_checkpoint(init_checkpoint)
    vocab_size = (bert_config.vocab_size - TextEncoder.BERT_SPECIAL_COUNT -
                  TextEncoder.BERT_UNUSED_COUNT)
    model = create_transformer(
        embedding_layer_norm=True,
        neg_inf=-10000.0,
        use_attn_mask=use_attn_mask,
        vocab_size=vocab_size,
        accurate_gelu=True,
        layer_norm_epsilon=1e-12,
        max_len=max_len,
        use_one_embedding_dropout=True,
        d_hid=bert_config.intermediate_size,
        embedding_dim=bert_config.hidden_size,
        num_layers=bert_config.num_hidden_layers,
        num_heads=bert_config.num_attention_heads,
        residual_dropout=bert_config.hidden_dropout_prob,
        attention_dropout=bert_config.attention_probs_dropout_prob)
    if K.backend() == 'tensorflow':
        weights = [np.zeros(w.shape) for w in model.weights]
    else:
        weights = [np.zeros(w.get_value().shape) for w in model.weights]
    for var_name, _ in var_names:
        w_id = None
        qkv = None
        unsqueeze = False
        parts = var_name.split('/')
        first_vars_size = 5
        if parts[1] == 'embeddings':
            n = parts[-1]
            if n == 'token_type_embeddings':
                w_id = 0
            elif n == 'position_embeddings':
                w_id = 1
            elif n == 'word_embeddings':
                w_id = 2
            elif n == 'gamma':
                w_id = 3
            elif n == 'beta':
                w_id = 4
            else:
                raise ValueError()
        elif parts[2].startswith('layer_'):
            layer_number = int(parts[2][len('layer_'):])
            if parts[3] == 'attention':
                if parts[-1] == 'beta':
                    w_id = first_vars_size + layer_number * 12 + 5
                elif parts[-1] == 'gamma':
                    w_id = first_vars_size + layer_number * 12 + 4
                elif parts[-2] == 'dense':
                    if parts[-1] == 'bias':
                        w_id = first_vars_size + layer_number * 12 + 3
                    elif parts[-1] == 'kernel':
                        w_id = first_vars_size + layer_number * 12 + 2
                        unsqueeze = True
                    else:
                        raise ValueError()
                elif parts[-2] in ('key', 'query', 'value'):
                    w_id = first_vars_size + layer_number * 12 + (0 if parts[-1] == 'kernel' else 1)
                    unsqueeze = parts[-1] == 'kernel'
                    qkv = parts[-2][0]
                else:
                    raise ValueError()
            elif parts[3] == 'intermediate':
                if parts[-1] == 'bias':
                    w_id = first_vars_size + layer_number * 12 + 7
                elif parts[-1] == 'kernel':
                    w_id = first_vars_size + layer_number * 12 + 6
                    unsqueeze = True
                else:
                    raise ValueError()
            elif parts[3] == 'output':
                if parts[-1] == 'beta':
                    w_id = first_vars_size + layer_number * 12 + 11
                elif parts[-1] == 'gamma':
                    w_id = first_vars_size + layer_number * 12 + 10
                elif parts[-1] == 'bias':
                    w_id = first_vars_size + layer_number * 12 + 9
                elif parts[-1] == 'kernel':
                    w_id = first_vars_size + layer_number * 12 + 8
                    unsqueeze = True
                else:
                    raise ValueError()
        if w_id is not None and qkv is None:
            if verbose:
                print(var_name, ' -> ', model.weights[w_id].name)
            if w_id == 1:  # pos embedding
                weights[w_id][:max_len, :] = check_point.get_tensor(var_name)[:max_len, :] \
                    if not unsqueeze else check_point.get_tensor(var_name)[None, :max_len, :]
            elif w_id == 2:  # word embedding
                # ours: unk, [vocab], pad, msk(mask), bos(cls), del(use sep again), eos(sep)
                # theirs: pad, 99 unused, unk, cls, sep, mask, [vocab]
                saved = check_point.get_tensor(var_name)  # vocab_size, emb_size
                # weights[our_position] = saved[their_position]
                weights[w_id][0] = saved[1 + TextEncoder.BERT_UNUSED_COUNT]  # unk
                weights[w_id][1:vocab_size] = saved[-vocab_size + 1:]
                weights[w_id][vocab_size + TextEncoder.PAD_OFFSET] = saved[0]
                weights[w_id][vocab_size + TextEncoder.MSK_OFFSET] = saved[4 + TextEncoder.BERT_UNUSED_COUNT]
                weights[w_id][vocab_size + TextEncoder.BOS_OFFSET] = saved[2 + TextEncoder.BERT_UNUSED_COUNT]
                weights[w_id][vocab_size + TextEncoder.DEL_OFFSET] = saved[3 + TextEncoder.BERT_UNUSED_COUNT]
                weights[w_id][vocab_size + TextEncoder.EOS_OFFSET] = saved[3 + TextEncoder.BERT_UNUSED_COUNT]
            else:
                weights[w_id][:] = check_point.get_tensor(var_name) if not unsqueeze else \
                    check_point.get_tensor(var_name)[None, ...]
        elif w_id is not None:
            if verbose:
                print(var_name, ' -> ', model.weights[w_id].name, '::', qkv)
            p = {'q': 0, 'k': 1, 'v': 2}[qkv]
            if weights[w_id].ndim == 3:  # qkv kernel
                dim_size = weights[w_id].shape[1]
                weights[w_id][0, :, p * dim_size:(p + 1) * dim_size] = \
                    check_point.get_tensor(var_name) if not unsqueeze else \
                    check_point.get_tensor(var_name)[None, ...]
            else:  # qkv bias
                dim_size = weights[w_id].shape[0] // 3
                weights[w_id][p * dim_size:(p + 1) * dim_size] = check_point.get_tensor(var_name)
        else:
            if verbose:
                print('not mapped: ', var_name)  # TODO pooler, cls/predictions, cls/seq_relationship
    model.set_weights(weights)
    return model
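# Usage sketch (illustrative, not from the repo): assumes the multilingual
# checkpoint exists at the default base_location, and reuses the four-input
# convention (tokens, segment_ids, pos_ids, attn_mask) and the
# create_attention_mask / generate_pos_ids helpers exactly as they are
# called in test_same_result above.
if __name__ == '__main__':
    from data.dataset import create_attention_mask, generate_pos_ids
    batch_size, seq_len = 2, 128
    model = load_google_bert(max_len=seq_len, verbose=True)
    # Dummy ids; real inputs should go through BERTTextEncoder.standardize_ids
    # so they match the remapped vocabulary layout of the loaded embeddings.
    tokens = np.random.randint(0, 100, (batch_size, seq_len))
    segments = np.zeros((batch_size, seq_len), dtype=np.int32)
    pos = generate_pos_ids(batch_size, seq_len)
    mask = create_attention_mask(np.ones((batch_size, seq_len), dtype=np.int32),
                                 False, None, None, True)
    seq_out = model.predict([tokens, segments, pos, mask])  # (batch, seq_len, hidden)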