def __init__(self, old_config: JsonConfig):
    self.n_layers = 3
    self.all_layer_outputs = []
    self.last_key_layer = None
    self.old_config = old_config
    self.inner_config = self.build_config(old_config, old_config.mid_expanding_factor)
    self.embedding = None
    self.layer_list = []
    self.initializer = base.create_initializer(self.inner_config.initializer_range)
    self.token_type_table = tf.compat.v1.get_variable(
        name="token_type_embeddings",
        shape=[self.inner_config.type_vocab_size, self.inner_config.hidden_size],
        initializer=self.initializer)
    self.full_position_embeddings = tf.compat.v1.get_variable(
        name="position_embeddings",
        shape=[self.inner_config.max_position_embeddings, self.inner_config.hidden_size],
        initializer=self.initializer)
    with tf.compat.v1.variable_scope("mid"):
        for layer_idx in range(self.n_layers):
            with tf.compat.v1.variable_scope("layer_%d" % layer_idx):
                layer = ForwardLayer(self.inner_config, self.initializer)
                self.layer_list.append(layer)
def __init__(self, config, n_layers, use_one_hot_embeddings):
    super(TopicVectorBert, self).__init__()
    self.n_layers = n_layers
    self.all_layer_outputs = []
    self.last_key_layer = None
    self.config = config
    self.embedding = None
    self.layer_list = []
    self.initializer = base.create_initializer(config.initializer_range)
    self.attention_mask = None
    self.use_one_hot_embeddings = use_one_hot_embeddings
    for layer_idx in range(self.n_layers):
        layer = ForwardLayer(self.config, self.initializer)
        self.layer_list.append(layer)
    self.n_topics = config.n_topics
    self.use_topic_all_layer = config.use_topic_all_layer
    self.hidden_size = config.hidden_size
    # Each topic's embedding spans topic_emb_len hidden-size-wide slots.
    topic_emb_len = 4
    self.topic_emb_len = topic_emb_len
    self.topic_embedding_size = self.hidden_size * topic_emb_len
    self.topic_embedding = tf.Variable(
        lambda: self.initializer(
            shape=(self.n_topics, self.topic_embedding_size), dtype=tf.float32),
        name="topic_embedding")
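# A minimal shape sketch for the topic embedding table above. The lookup and
# reshape are an assumption about how the table is consumed (the constructor
# only creates it); names prefixed with example_ are hypothetical.
def _example_topic_embedding_shapes():
    import tensorflow as tf
    n_topics, hidden_size, topic_emb_len = 50, 768, 4
    table = tf.zeros([n_topics, hidden_size * topic_emb_len])      # stands in for topic_embedding
    example_topic_ids = tf.constant([3, 7])                        # [batch]
    flat = tf.gather(table, example_topic_ids)                     # [batch, hidden_size * topic_emb_len]
    per_slot = tf.reshape(flat, [-1, topic_emb_len, hidden_size])  # [batch, 4, hidden_size]
    assert per_slot.shape == (2, 4, 768)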
def init(self, config, is_training, input_ids, input_ids2, input_mask, input_mask2,
         token_type_ids, segment_ids2, use_one_hot_embeddings):
    with tf.compat.v1.variable_scope(dual_model_prefix1):
        model_1 = BertModel(
            config=config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=token_type_ids,
            use_one_hot_embeddings=use_one_hot_embeddings,
        )
    with tf.compat.v1.variable_scope(dual_model_prefix2):
        model_2 = BertModel(
            config=config,
            is_training=is_training,
            input_ids=input_ids2,
            input_mask=input_mask2,
            token_type_ids=segment_ids2,
            use_one_hot_embeddings=use_one_hot_embeddings,
        )
    # Pool the two encoders: concatenate their first-token vectors, then
    # project back to hidden_size with a tanh dense layer.
    model_1_first_token = model_1.get_sequence_output()[:, 0, :]
    model_2_first_token = model_2.get_sequence_output()[:, 0, :]
    rep = tf.concat([model_1_first_token, model_2_first_token], axis=1)
    self.sequence_output = tf.concat(
        [model_1.get_sequence_output(), model_2.get_sequence_output()], axis=2)
    dense_layer = tf.keras.layers.Dense(
        config.hidden_size,
        activation=tf.keras.activations.tanh,
        kernel_initializer=create_initializer(config.initializer_range))
    pooled_output = dense_layer(rep)
    self.pooled_output = pooled_output
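# A minimal, self-contained shape sketch of the dual-encoder pooling above:
# each encoder's [CLS] vector is sliced out, concatenated, and projected back
# to hidden_size. The zero tensors stand in for real sequence outputs; the
# sizes are illustrative assumptions.
def _example_dual_pooling():
    import tensorflow as tf
    batch, seq_len, hidden = 8, 128, 768
    seq_out_1 = tf.zeros([batch, seq_len, hidden])  # model_1.get_sequence_output()
    seq_out_2 = tf.zeros([batch, seq_len, hidden])  # model_2.get_sequence_output()
    rep = tf.concat([seq_out_1[:, 0, :], seq_out_2[:, 0, :]], axis=1)  # [batch, 2*hidden]
    dense = tf.keras.layers.Dense(hidden, activation="tanh")
    pooled = dense(rep)                                                # [batch, hidden]
    assert pooled.shape == (batch, hidden)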
def apply_binary_dense(vector):
    # Two-way softmax head; `config` comes from the enclosing scope.
    output = tf.keras.layers.Dense(
        2,
        activation=tf.keras.activations.softmax,
        name="cls_dense",
        kernel_initializer=create_initializer(config.initializer_range))(vector)
    return output
def __init__(self, config, input_ids, input_mask, segment_ids, use_one_hot_embeddings):
    self.config = config
    self.use_one_hot_embeddings = use_one_hot_embeddings
    self.input_ids = input_ids
    self.input_mask = input_mask
    self.segment_ids = segment_ids
    self.batch_size, self.seq_length = get_batch_and_seq_length(input_ids, 2)
    self.initializer = base.create_initializer(config.initializer_range)
    self.attention_mask = bc.create_attention_mask_from_input_mask(
        input_ids, self.input_mask)
def __init__(self, config, use_one_hot_embeddings, **kwargs):
    kwargs['autocast'] = False
    super(SharedTransformer, self).__init__(**kwargs)
    self.all_layer_outputs = []
    self.last_key_layer = None
    self.config = config
    self.initializer = base.create_initializer(config.initializer_range)
    self.attention_mask = None
    self.use_one_hot_embeddings = use_one_hot_embeddings
    # A single ForwardLayer instance, so its weights are shared across every
    # step at which this transformer is applied.
    with tf.compat.v1.variable_scope("layer"):
        self.layer = ForwardLayer(self.config, self.initializer)
def __init__(self, config, is_training, input_ids, input_mask=None,
             token_type_ids=None, use_one_hot_embeddings=True, features=None,
             scope=None):
    super(DualBertTwoInputIgnoreSecondModel, self).__init__()
    # The second encoder is fed all-zero inputs, so only model_1 sees the
    # actual example; model_2's contribution is input-independent.
    input_ids2 = tf.zeros_like(input_ids)
    input_mask2 = tf.zeros_like(input_mask)
    segment_ids2 = tf.zeros_like(token_type_ids)
    with tf.compat.v1.variable_scope(dual_model_prefix1):
        model_1 = BertModel(
            config=config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=token_type_ids,
            use_one_hot_embeddings=use_one_hot_embeddings,
        )
    with tf.compat.v1.variable_scope(dual_model_prefix2):
        model_2 = BertModel(
            config=config,
            is_training=is_training,
            input_ids=input_ids2,
            input_mask=input_mask2,
            token_type_ids=segment_ids2,
            use_one_hot_embeddings=use_one_hot_embeddings,
        )
    model_1_first_token = model_1.get_sequence_output()[:, 0, :]
    model_2_first_token = model_2.get_sequence_output()[:, 0, :]
    rep = tf.concat([model_1_first_token, model_2_first_token], axis=1)
    self.sequence_output = tf.concat(
        [model_1.get_sequence_output(), model_2.get_sequence_output()], axis=2)
    dense_layer = tf.keras.layers.Dense(
        config.hidden_size,
        activation=tf.keras.activations.tanh,
        kernel_initializer=create_initializer(config.initializer_range))
    pooled_output = dense_layer(rep)
    self.pooled_output = pooled_output
def __init__(self, config, n_layers, **kwargs):
    kwargs['autocast'] = False
    super(UpperTransformer, self).__init__(**kwargs)
    self.n_layers = n_layers
    self.all_layer_outputs = []
    self.last_key_layer = None
    self.config = config
    self.embedding = None
    self.layer_list = []
    self.initializer = base.create_initializer(config.initializer_range)
    self.layer_idx_base = 0
    # Unlike SharedTransformer above, each layer gets its own variables.
    for layer_idx in range(self.n_layers):
        with tf.compat.v1.variable_scope("layer_%d" % layer_idx):
            layer = ForwardLayer(self.config, self.initializer)
            self.layer_list.append(layer)
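# A minimal sketch of the difference between SharedTransformer and
# UpperTransformer, using tf.keras.layers.Dense as a stand-in for
# ForwardLayer (an assumption for illustration only): reusing one layer
# object shares weights across depth, while a list of layers does not.
def _example_shared_vs_stacked():
    import tensorflow as tf
    x = tf.zeros([2, 8])
    shared = tf.keras.layers.Dense(8)
    y = x
    for _ in range(3):
        y = shared(y)             # 3 applications, 1 weight set
    stacked = [tf.keras.layers.Dense(8) for _ in range(3)]
    z = x
    for layer in stacked:
        z = layer(z)              # 3 applications, 3 weight sets
    assert len(shared.weights) == 2                        # kernel + bias
    assert sum(len(l.weights) for l in stacked) == 6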
def __init__(self,
             config,  # This is different from BERT config
             is_training,
             input_ids,
             input_mask,
             token_type_ids,
             use_one_hot_embeddings,
             features,
             ):
    super(MultiContextEncoder, self).__init__()
    self.config = config
    if not is_training:
        config.set_attrib("hidden_dropout_prob", 0.0)
        config.set_attrib("attention_probs_dropout_prob", 0.0)

    def reform_context(context):
        return tf.reshape(context, [-1, config.max_context, config.max_context_length])

    batch_size, _ = get_shape_list(input_ids)

    def combine(input_ids, context_input_ids):
        # Pair the main sequence with each of its max_context contexts, then
        # flatten so every (example, context) pair becomes one BERT input row.
        a = tf.tile(tf.expand_dims(input_ids, 1), [1, config.max_context, 1])
        b = reform_context(context_input_ids)
        rep_3d = tf.concat([a, b], 2)
        return tf.reshape(rep_3d, [batch_size * config.max_context, -1])

    context_input_ids = features["context_input_ids"]
    context_input_mask = features["context_input_mask"]
    context_segment_ids = features["context_segment_ids"]
    context_segment_ids = tf.ones_like(context_segment_ids, tf.int32) * 2
    self.module = BertModel(config=config,
                            is_training=is_training,
                            input_ids=combine(input_ids, context_input_ids),
                            input_mask=combine(input_mask, context_input_mask),
                            token_type_ids=combine(token_type_ids, context_segment_ids),
                            use_one_hot_embeddings=use_one_hot_embeddings,
                            )
    dense_layer_setup = tf.keras.layers.Dense(
        config.hidden_size,
        activation=tf.keras.activations.tanh,
        kernel_initializer=create_initializer(config.initializer_range))
    h1 = self.module.get_pooled_output()
    h2 = dense_layer_setup(h1)  # the same dense layer is reused below, sharing weights
    h2 = tf.reshape(h2, [batch_size, config.max_context, -1])
    h2 = h2[:, :config.num_context]
    h3 = tf.reduce_mean(h2, axis=1)
    h4 = dense_layer_setup(h3)
    self.pooled_output = h4
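# A minimal shape sketch of combine() above: the main input_ids are tiled
# once per context and concatenated with that context, so the batch axis
# grows by a factor of max_context. All sizes are illustrative assumptions.
def _example_combine_shapes():
    import tensorflow as tf
    batch, seq_len, max_context, ctx_len = 2, 6, 3, 4
    input_ids = tf.zeros([batch, seq_len], tf.int32)
    context_ids = tf.zeros([batch, max_context * ctx_len], tf.int32)
    a = tf.tile(tf.expand_dims(input_ids, 1), [1, max_context, 1])  # [2, 3, 6]
    b = tf.reshape(context_ids, [-1, max_context, ctx_len])         # [2, 3, 4]
    rep_3d = tf.concat([a, b], 2)                                   # [2, 3, 10]
    combined = tf.reshape(rep_3d, [batch * max_context, -1])        # [6, 10]
    assert combined.shape == (6, 10)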
def __init__(self, sero_config, config, is_training, input_ids, input_mask=None,
             token_type_ids=None, use_one_hot_embeddings=True, scope=None):
    super(DualSeroBertModel, self).__init__()
    with tf.compat.v1.variable_scope(dual_model_prefix1):
        model_1 = BertModel(
            config=config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=token_type_ids,
            use_one_hot_embeddings=use_one_hot_embeddings,
        )
    with tf.compat.v1.variable_scope(dual_model_prefix2):
        with tf.compat.v1.variable_scope("sero"):
            model = SeroEpsilon(sero_config, is_training, use_one_hot_embeddings)
        batch_size, _ = get_shape_list(input_mask)
        use_context = tf.ones([batch_size, 1], tf.int32)
        # network_stacked consumes window-shaped inputs; add a size-1 window axis.
        input_ids = tf.expand_dims(input_ids, 1)
        input_mask = tf.expand_dims(input_mask, 1)
        segment_ids = tf.expand_dims(token_type_ids, 1)
        sequence_output2 = model.network_stacked(
            input_ids, input_mask, segment_ids, use_context)
    model_1_first_token = model_1.get_sequence_output()[:, 0, :]
    model_2_first_token = sequence_output2[:, 0, :]
    rep = tf.concat([model_1_first_token, model_2_first_token], axis=1)
    dense_layer = tf.keras.layers.Dense(
        config.hidden_size,
        activation=tf.keras.activations.tanh,
        kernel_initializer=create_initializer(config.initializer_range))
    pooled_output = dense_layer(rep)
    self.pooled_output = pooled_output
def __init__(
        self,
        config,
        use_one_hot_embeddings,
        is_training,
        masked_input_ids,
        input_mask,
        segment_ids,
        tt_input_ids,
        tt_input_mask,
        tt_segment_ids,
):
    # Run the masked-LM batch and the target-task batch through one BERT by
    # concatenating them along the batch axis.
    all_input_ids = tf.concat([masked_input_ids, tt_input_ids], axis=0)
    all_input_mask = tf.concat([input_mask, tt_input_mask], axis=0)
    all_segment_ids = tf.concat([segment_ids, tt_segment_ids], axis=0)
    self.config = config
    self.lm_batch_size, _ = get_shape_list2(masked_input_ids)
    self.model = BertModel(config, is_training, all_input_ids, all_input_mask,
                           all_segment_ids, use_one_hot_embeddings)
    initializer = base.create_initializer(config.initializer_range)
    self.tt_layer = ForwardLayer(config, initializer)
    self.tt_input_mask = tt_input_mask
    # Keep only the target-task rows of the shared sequence output.
    seq_output = self.model.get_sequence_output()[self.lm_batch_size:]
    tt_batch_size, seq_length = get_shape_list2(tt_input_ids)
    tt_attention_mask = create_attention_mask_from_input_mask2(
        seq_output, self.tt_input_mask)
    print('tt_attention_mask', tt_attention_mask.shape)
    print("seq_output", seq_output.shape)
    seq_output = self.tt_layer.apply_3d(seq_output, tt_batch_size, seq_length,
                                        tt_attention_mask)
    self.tt_feature = mimic_pooling(seq_output, self.config.hidden_size,
                                    self.config.initializer_range)
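# A minimal sketch of the batch-packing trick above: two batches share one
# encoder pass by concatenating on axis 0, and the second batch's outputs
# are recovered by slicing from lm_batch_size onward. Shapes are
# illustrative assumptions.
def _example_batch_packing():
    import tensorflow as tf
    lm_batch, tt_batch, seq_len = 4, 2, 16
    lm_ids = tf.zeros([lm_batch, seq_len], tf.int32)
    tt_ids = tf.ones([tt_batch, seq_len], tf.int32)
    all_ids = tf.concat([lm_ids, tt_ids], axis=0)  # [6, 16]; encoder runs once on this
    tt_rows = all_ids[lm_batch:]                   # rows belonging to the tt batch
    assert tt_rows.shape == (tt_batch, seq_len)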
def __init__(self, config, is_training, input_ids, input_mask=None,
             token_type_ids=None, use_one_hot_embeddings=True, features=None,
             scope=None):
    super(TripleBertMasking, self).__init__()
    input_ids2 = features["input_ids2"]
    input_mask2 = features["input_mask2"]
    segment_ids2 = features["segment_ids2"]
    input_ids3 = features["input_ids3"]
    input_mask3 = features["input_mask3"]
    segment_ids3 = features["segment_ids3"]
    with tf.compat.v1.variable_scope(triple_model_prefix1):
        model_1 = BertModel(
            config=config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=token_type_ids,
            use_one_hot_embeddings=use_one_hot_embeddings,
        )
    with tf.compat.v1.variable_scope(triple_model_prefix2):
        model_2 = BertModel(
            config=config,
            is_training=is_training,
            input_ids=input_ids2,
            input_mask=input_mask2,
            token_type_ids=segment_ids2,
            use_one_hot_embeddings=use_one_hot_embeddings,
        )
    with tf.compat.v1.variable_scope(triple_model_prefix3):
        model_3 = BertModel(
            config=config,
            is_training=is_training,
            input_ids=input_ids3,
            input_mask=input_mask3,
            token_type_ids=segment_ids3,
            use_one_hot_embeddings=use_one_hot_embeddings,
        )
    model_1_first_token = model_1.get_sequence_output()[:, 0, :]
    model_2_first_token = model_2.get_sequence_output()[:, 0, :]
    # model_3 scores relevance; its positive-class probability gates
    # model_2's [CLS] vector.
    pooled3 = model_3.get_pooled_output()
    probs3 = tf.keras.layers.Dense(
        2,
        activation=tf.keras.activations.softmax,
        kernel_initializer=create_initializer(config.initializer_range))(pooled3)
    mask_scalar = probs3[:, 1:2]
    self.rel_score = mask_scalar
    model_2_first_token = mask_scalar * model_2_first_token
    rep = tf.concat([model_1_first_token, model_2_first_token], axis=1)
    self.sequence_output = tf.concat(
        [model_1.get_sequence_output(), model_2.get_sequence_output()], axis=2)
    dense_layer = tf.keras.layers.Dense(
        config.hidden_size,
        activation=tf.keras.activations.tanh,
        kernel_initializer=create_initializer(config.initializer_range))
    pooled_output = dense_layer(rep)
    self.pooled_output = pooled_output
def __init__(self, config, is_training, input_ids, input_mask=None,
             token_type_ids=None, use_one_hot_embeddings=True, features=None,
             scope=None):
    super(TripleBertWeighted, self).__init__()
    input_ids2 = features["input_ids2"]
    input_mask2 = features["input_mask2"]
    segment_ids2 = features["segment_ids2"]
    input_ids3 = features["input_ids3"]
    input_mask3 = features["input_mask3"]
    segment_ids3 = features["segment_ids3"]

    def apply_binary_dense(vector):
        output = tf.keras.layers.Dense(
            2,
            activation=tf.keras.activations.softmax,
            name="cls_dense",
            kernel_initializer=create_initializer(
                config.initializer_range))(vector)
        return output

    with tf.compat.v1.variable_scope(triple_model_prefix1):
        model_1 = BertModel(
            config=config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=token_type_ids,
            use_one_hot_embeddings=use_one_hot_embeddings,
        )
        # Three-way head; only the first two classes are kept for combination.
        model_1_pred = tf.keras.layers.Dense(
            3,
            activation=tf.keras.activations.softmax,
            name="cls_dense",
            kernel_initializer=create_initializer(
                config.initializer_range))(model_1.get_pooled_output())
        model_1_pred = model_1_pred[:, :2]
    with tf.compat.v1.variable_scope(triple_model_prefix2):
        model_2 = BertModel(
            config=config,
            is_training=is_training,
            input_ids=input_ids2,
            input_mask=input_mask2,
            token_type_ids=segment_ids2,
            use_one_hot_embeddings=use_one_hot_embeddings,
        )
        model_2_pred = apply_binary_dense(model_2.get_pooled_output())
    with tf.compat.v1.variable_scope(triple_model_prefix3):
        model_3 = BertModel(
            config=config,
            is_training=is_training,
            input_ids=input_ids3,
            input_mask=input_mask3,
            token_type_ids=segment_ids3,
            use_one_hot_embeddings=use_one_hot_embeddings,
        )
        model_3_pred = apply_binary_dense(model_3.get_pooled_output())
    # Option : initialize dense
    # model_3's softmax gates between model_1's and model_2's predictions.
    combined_pred = model_1_pred * model_3_pred[:, 0:1] \
                    + model_2_pred * model_3_pred[:, 1:2]
    self.rel_score = model_3_pred[:, 1:2]
    self.pooled_output = combined_pred
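# A small numeric sketch of the weighting above: model_3's two-way softmax
# acts as a gate, so combined_pred is a convex combination of model_1's and
# model_2's class distributions. The values below are made up for illustration.
def _example_weighted_combination():
    import tensorflow as tf
    model_1_pred = tf.constant([[0.9, 0.1]])
    model_2_pred = tf.constant([[0.2, 0.8]])
    model_3_pred = tf.constant([[0.25, 0.75]])  # gate weights; sum to 1
    combined = model_1_pred * model_3_pred[:, 0:1] \
               + model_2_pred * model_3_pred[:, 1:2]
    # 0.25 * [0.9, 0.1] + 0.75 * [0.2, 0.8] = [0.375, 0.625]
    tf.debugging.assert_near(combined, tf.constant([[0.375, 0.625]]))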
def __init__(self, config, is_training, input_ids, input_mask=None,
             token_type_ids=None, use_one_hot_embeddings=True, features=None,
             scope=None):
    super(DualBertTwoInputModelEx, self).__init__()
    input_ids2 = features["input_ids2"]
    input_mask2 = features["input_mask2"]
    segment_ids2 = features["segment_ids2"]
    modeling_option = config.model_option
    with tf.compat.v1.variable_scope(dual_model_prefix1):
        model_1 = BertModel(
            config=config,
            is_training=is_training,
            input_ids=input_ids,
            input_mask=input_mask,
            token_type_ids=token_type_ids,
            use_one_hot_embeddings=use_one_hot_embeddings,
        )
    with tf.compat.v1.variable_scope(dual_model_prefix2):
        model_2 = BertModel(
            config=config,
            is_training=is_training,
            input_ids=input_ids2,
            input_mask=input_mask2,
            token_type_ids=segment_ids2,
            use_one_hot_embeddings=use_one_hot_embeddings,
        )
    model_1_first_token = model_1.get_sequence_output()[:, 0, :]
    model_2_first_token = model_2.get_sequence_output()[:, 0, :]
    print('model_2_first_token', model_2_first_token)
    mask_scalar = {
        "0": 0.,
        "1": 1.,
        "random": tf.random.uniform(shape=[], minval=0., maxval=1.)
    }[modeling_option]
    print("Mask_scalar:", mask_scalar)
    model_2_first_token = mask_scalar * model_2_first_token
    print('model_2_first_token', model_2_first_token)
    rep = tf.concat([model_1_first_token, model_2_first_token], axis=1)
    self.sequence_output = tf.concat(
        [model_1.get_sequence_output(), model_2.get_sequence_output()], axis=2)
    dense_layer = tf.keras.layers.Dense(
        config.hidden_size,
        activation=tf.keras.activations.tanh,
        kernel_initializer=create_initializer(config.initializer_range))
    pooled_output = dense_layer(rep)
    self.pooled_output = pooled_output
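# A minimal sketch of the modeling_option switch above: "0" silences the
# second encoder's [CLS] vector, "1" passes it through unchanged, and
# "random" scales it by one scalar drawn uniformly from [0, 1) at graph
# construction. Values are illustrative only.
def _example_mask_scalar(modeling_option="random"):
    import tensorflow as tf
    cls_vec = tf.ones([2, 4])
    mask_scalar = {
        "0": 0.,
        "1": 1.,
        "random": tf.random.uniform(shape=[], minval=0., maxval=1.),
    }[modeling_option]
    return mask_scalar * cls_vec  # broadcasts the scalar over [batch, hidden]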
def __init__(self):
    config = BertConfig.from_json_file(
        os.path.join(data_path, "bert_config.json"))
    self.attention_probs_list = []
    input_ids = tf.constant([[101] + [100] * 511])
    token_type_ids = tf.constant([[0] * 512])
    input_mask = tf.constant([[1] * 512])
    attention_mask = create_attention_mask_from_input_mask(
        input_ids, input_mask)
    initializer = create_initializer(config.initializer_range)
    scope = None
    with tf.compat.v1.variable_scope(scope, default_name="bert"):
        with tf.compat.v1.variable_scope("embeddings"):
            # Perform embedding lookup on the word ids.
            (self.embedding_output, self.embedding_table) = embedding_lookup(
                input_ids=input_ids,
                vocab_size=config.vocab_size,
                embedding_size=config.hidden_size,
                initializer_range=config.initializer_range,
                word_embedding_name="word_embeddings",
                use_one_hot_embeddings=False)

            # Add positional embeddings and token type embeddings, then layer
            # normalize and perform dropout.
            self.embedding_output = embedding_postprocessor(
                input_tensor=self.embedding_output,
                use_token_type=True,
                token_type_ids=token_type_ids,
                token_type_vocab_size=config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=config.initializer_range,
                max_position_embeddings=config.max_position_embeddings,
                dropout_prob=config.hidden_dropout_prob)

        prev_output = reshape_to_matrix(self.embedding_output)
        with tf.compat.v1.variable_scope("encoder"):
            for layer_idx in range(12):
                with tf.compat.v1.variable_scope("layer_%d" % layer_idx):
                    layer_input = prev_output
                    with tf.compat.v1.variable_scope("attention"):
                        attention_heads = []
                        with tf.compat.v1.variable_scope("self"):
                            attention_head = self.attention_fn(layer_input)
                            attention_heads.append(attention_head)

                        attention_output = None
                        if len(attention_heads) == 1:
                            attention_output = attention_heads[0]
                        else:
                            # In the case where we have other sequences, we just concatenate
                            # them to the self-attention head before the projection.
                            attention_output = tf.concat(attention_heads, axis=-1)

                        # Run a linear projection of `hidden_size` then add a residual
                        # with `layer_input`.
                        with tf.compat.v1.variable_scope("output"):
                            attention_output = dense(
                                config.hidden_size, initializer)(attention_output)
                            attention_output = layer_norm(
                                attention_output + layer_input)

                    # The activation is only applied to the "intermediate" hidden layer.
                    with tf.compat.v1.variable_scope("intermediate"):
                        intermediate_output = dense(
                            config.intermediate_size, initializer,
                            activation=gelu)(attention_output)

                    # Down-project back to `hidden_size` then add the residual.
                    with tf.compat.v1.variable_scope("output"):
                        layer_output = dense(
                            config.hidden_size, initializer)(intermediate_output)
                        layer_output = layer_norm(layer_output + attention_output)
                        prev_output = layer_output
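# A minimal sketch of what create_attention_mask_from_input_mask computes in
# the original BERT codebase (reimplemented here only for illustration): the
# [batch, seq] padding mask is broadcast to a [batch, seq, seq] attention
# mask in which every query position may attend to every non-padding key.
def _example_attention_mask():
    import tensorflow as tf
    input_mask = tf.constant([[1, 1, 1, 0]])                      # [batch=1, seq=4]
    to_mask = tf.cast(tf.expand_dims(input_mask, 1), tf.float32)  # [1, 1, 4]
    broadcast_ones = tf.ones([1, 4, 1], tf.float32)               # [1, seq, 1]
    attention_mask = broadcast_ones * to_mask                     # [1, 4, 4]
    # Each row allows attention to key positions 0..2 and blocks the pad at 3.
    assert attention_mask.shape == (1, 4, 4)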