Example #1
def get_diff_loss(bert_config, input_tensor, masked_lm_positions,
                  masked_lm_weights, loss_base, loss_target):
    base_prob = tf.exp(-loss_base)
    target_prob = tf.exp(-loss_target)

    prob_diff = base_prob - target_prob

    input_tensor = bc.gather_indexes(input_tensor, masked_lm_positions)
    with tf.compat.v1.variable_scope("diff_loss"):

        hidden = bc.dense(bert_config.hidden_size,
                          bc.create_initializer(bert_config.initializer_range),
                          bc.get_activation(
                              bert_config.hidden_act))(input_tensor)

        logits = bc.dense(1,
                          bc.create_initializer(
                              bert_config.initializer_range))(hidden)
        logits = tf.reshape(logits, prob_diff.shape)

    per_example_loss = tf.abs(prob_diff - logits)
    per_example_loss = tf.cast(masked_lm_weights,
                               tf.float32) * per_example_loss
    losses = tf.reduce_sum(per_example_loss, axis=1)
    loss = tf.reduce_mean(losses)

    return loss, per_example_loss, logits
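
A minimal, self-contained sketch (plain TensorFlow, made-up shapes; not code from the project) of the regression target in Example #1: per-position losses are mapped to probabilities with exp(-loss), and the 1-unit head is trained to match their difference under a masked L1 loss.

import tensorflow as tf

loss_base = tf.constant([[0.5, 2.0, 1.0]])          # [batch, n_positions]
loss_target = tf.constant([[0.3, 2.5, 1.0]])
masked_lm_weights = tf.constant([[1.0, 1.0, 0.0]])

prob_diff = tf.exp(-loss_base) - tf.exp(-loss_target)   # regression target
logits = tf.zeros_like(prob_diff)                       # stand-in for the dense(1) output
per_example_loss = masked_lm_weights * tf.abs(prob_diff - logits)
loss = tf.reduce_mean(tf.reduce_sum(per_example_loss, axis=1))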
Example #2
File: horizon.py  Project: clover3/Chair
    def __init__(self, hidden_size, intermediate_size, hidden_act,
                 hidden_dropout_prob, initializer):
        super(ResidualFeedforward, self).__init__()
        self.intermediate_ff = bc.dense(
            intermediate_size,
            initializer,
            activation=bc.get_activation(hidden_act))
        self.hidden_dropout_prob = hidden_dropout_prob
        self.output_ff = bc.dense(hidden_size, initializer)
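
The examples call bc.dense(units, initializer, activation) and bc.create_initializer(range) from the project's bert_common module. A plausible stand-in, assuming they follow the usual BERT conventions (a Keras Dense layer plus a truncated-normal initializer); this is a hypothetical sketch, not the repository's actual implementation:

import tensorflow as tf

def dense(units, initializer, activation=None):
    # Assumed equivalent: a Dense layer with the given kernel initializer.
    return tf.keras.layers.Dense(units,
                                 activation=activation,
                                 kernel_initializer=initializer)

def create_initializer(initializer_range=0.02):
    # BERT-style truncated normal initializer (assumption).
    return tf.keras.initializers.TruncatedNormal(stddev=initializer_range)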
Example #3
    def __init__(self, bert_config):

        initializer = bc.create_initializer(bert_config.initializer_range)
        self.layer1 = bc.dense(bert_config.hidden_size, initializer,
                               bc.get_activation(bert_config.hidden_act))

        self.logit_dense1 = bc.dense(2, initializer)
        self.logit_dense2 = bc.dense(2, initializer)

        self.graph_built = False
Example #4
File: units.py  Project: clover3/Chair
    def __init__(self, config, initializer):
        self.config = config
        self.self_attention = SelfAttentionLayer(config)
        with tf.compat.v1.variable_scope("intermediate"):
            self.intermediate_ff = bc.dense(self.config.intermediate_size,
                                            initializer,
                                            activation=bc.get_activation(
                                                self.config.hidden_act))
        with tf.compat.v1.variable_scope("output"):
            self.output_ff = bc.dense(config.hidden_size, initializer)
    def build_by_attention(self, key):
        hidden_size = self.config.hidden_size
        with tf.compat.v1.variable_scope("embeddings"):
            lexical_tensor = self.get_lexical_lookup()
            self.embedding_output = self.embedding_postprocessor(
                d_input_ids=self.input_ids,
                input_tensor=lexical_tensor,
                use_token_type=True,
                token_type_ids=self.segment_ids,
                token_type_vocab_size=self.config.type_vocab_size,
                token_type_embedding_name="token_type_embeddings",
                use_position_embeddings=True,
                position_embedding_name="position_embeddings",
                initializer_range=self.config.initializer_range,
                max_position_embeddings=self.config.max_position_embeddings,
                dropout_prob=self.config.hidden_dropout_prob)
            input_tensor = self.embedding_output
            #[ def_per_batch, seq_length, hidden_size]

        with tf.compat.v1.variable_scope("encoder"):
            num_key_tokens = self.ssdr_config.num_key_tokens
            project_dim = hidden_size * num_key_tokens
            raw_key = bc.dense(project_dim, self.initializer)(key)
            key_tokens = tf.reshape(
                raw_key, [self.batch_size, num_key_tokens, hidden_size])

            input_tensor = tf.concat([key_tokens, input_tensor], axis=1)
            input_shape = bc.get_shape_list(input_tensor, expected_rank=3)

            mask_for_key = tf.ones([self.batch_size, num_key_tokens],
                                   dtype=tf.int64)
            self.input_mask = tf.cast(self.input_mask, tf.int64)
            self.input_mask = tf.concat([mask_for_key, self.input_mask],
                                        axis=1)
            self.seq_length = self.seq_length + num_key_tokens

            self.attention_mask = bc.create_attention_mask_from_input_mask(
                input_tensor, self.input_mask)
            prev_output = bc.reshape_to_matrix(input_tensor)
            for layer_idx in range(self.ssdr_config.num_hidden_layers):
                with tf.compat.v1.variable_scope("layer_%d" % layer_idx):
                    intermediate_output, prev_output = self.forward_layer(
                        prev_output)
                    self.all_layer_outputs.append(prev_output)

            final_output = bc.reshape_from_matrix(prev_output, input_shape)
            self.scores = bc.dense(1, self.initializer)(final_output[:, 0, :])

            if self.ssdr_config.info_pooling_method == "first_tokens":
                self.info_output = final_output[:, :num_key_tokens, :]
            elif self.ssdr_config.info_pooling_method == "max_pooling":
                self.info_output = tf.reduce_max(final_output, axis=1)

        return self.scores, self.info_output
    def build_key(self):
        with tf.compat.v1.variable_scope("embeddings"):
            input_tensor = self.get_embeddings(self.input_ids,
                                               self.segment_ids)
            self.input_shape = bc.get_shape_list(input_tensor, expected_rank=3)

        with tf.compat.v1.variable_scope("encoder"):
            self.attention_mask = bc.create_attention_mask_from_input_mask(
                input_tensor, self.input_mask)
            prev_output = bc.reshape_to_matrix(input_tensor)

            for layer_idx in range(self.layers_before_key_pooling):
                with tf.compat.v1.variable_scope("layer_%d" % layer_idx):
                    intermediate_output, prev_output = self.forward_layer(
                        prev_output)
                    intermediate_output = tf.reshape(intermediate_output, [
                        self.batch_size * self.seq_length,
                        self.config.intermediate_size
                    ])
                    final_output = bc.reshape_from_matrix(
                        prev_output, self.input_shape)
                    self.all_layer_outputs.append(final_output)

        self.last_intermediate_output = intermediate_output

        self.last_key_layer = prev_output
        with tf.compat.v1.variable_scope("mr_key"):
            key_vectors = bc.dense(self.key_dimension,
                                   self.initializer)(intermediate_output)
            self.debug1 = key_vectors
            key_vectors = tf.reshape(
                key_vectors,
                [self.batch_size, self.seq_length, self.key_dimension])
            key_output = self.key_pooling(key_vectors)
        return key_output
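
build_by_attention and build_key above rely on the BERT-style helpers bc.reshape_to_matrix and bc.reshape_from_matrix, which flatten [batch, seq_length, hidden] activations to 2-D for the per-layer dense ops and restore the shape afterwards. A plain-TensorFlow sketch of that round trip (made-up sizes, assuming the helpers are simple reshapes):

import tensorflow as tf

input_tensor = tf.random.normal([2, 5, 8])                    # [batch, seq_length, hidden]
input_shape = input_tensor.shape.as_list()

as_matrix = tf.reshape(input_tensor, [-1, input_shape[-1]])   # [batch*seq_length, hidden]
restored = tf.reshape(as_matrix, input_shape)                 # back to [batch, seq_length, hidden]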
Example #7
def sequence_index_prediction(bert_config, lookup_idx, input_tensor):
    logits = bert_common.dense(2, bert_common.create_initializer(bert_config.initializer_range))(input_tensor)
    log_probs = tf.nn.softmax(logits, axis=2)
    losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logits, labels=lookup_idx)
    per_example_loss = tf.reduce_sum(losses, axis=1)
    loss = tf.reduce_mean(per_example_loss)

    return loss, per_example_loss, log_probs
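
A self-contained sketch of the loss in sequence_index_prediction above, with made-up shapes: per-position 2-way logits are scored against integer labels via sparse softmax cross-entropy, summed over the sequence, and averaged over the batch.

import tensorflow as tf

logits = tf.random.normal([2, 4, 2])                      # [batch, seq_length, 2]
lookup_idx = tf.constant([[0, 1, 1, 0], [1, 0, 0, 0]])    # [batch, seq_length]

losses = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=lookup_idx, logits=logits)
per_example_loss = tf.reduce_sum(losses, axis=1)
loss = tf.reduce_mean(per_example_loss)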
    def forward_layer_with_added(self, prev_output, added_value, locations):
        hidden_size = self.config.hidden_size
        layer_input = prev_output
        attention_output = self_attention_with_add(
            layer_input, self.attention_mask, self.config, self.batch_size,
            self.seq_length, hidden_size, self.initializer, added_value,
            locations)

        with tf.compat.v1.variable_scope("intermediate"):
            intermediate_output = bc.dense(
                self.config.intermediate_size,
                self.initializer,
                activation=bc.get_activation(
                    self.config.hidden_act))(attention_output)

        with tf.compat.v1.variable_scope("output"):
            layer_output = bc.dense(hidden_size,
                                    self.initializer)(intermediate_output)
            layer_output = bc.dropout(layer_output,
                                      self.config.hidden_dropout_prob)
            layer_output = bc.layer_norm(layer_output + attention_output)
            prev_output = layer_output
        return intermediate_output, layer_output
Example #9
    def get_regression_and_loss(hidden_vector, loss_label):
        logits = bc.dense(2,
                          bc.create_initializer(
                              bert_config.initializer_range))(hidden_vector)
        gold_prob = loss_to_prob_pair(loss_label)
        logits = tf.reshape(logits, gold_prob.shape)

        per_example_loss = tf.nn.softmax_cross_entropy_with_logits(gold_prob,
                                                                   logits,
                                                                   axis=-1,
                                                                   name=None)
        per_example_loss = tf.cast(masked_lm_weights,
                                   tf.float32) * per_example_loss
        losses = tf.reduce_sum(per_example_loss, axis=1)
        loss = tf.reduce_mean(losses)

        return loss, per_example_loss, logits
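
Unlike the sparse version above, get_regression_and_loss scores its 2-way logits against a soft target distribution (gold_prob). A minimal plain-TensorFlow sketch of that step, with made-up values:

import tensorflow as tf

gold_prob = tf.constant([[[0.7, 0.3], [0.2, 0.8]]])   # [batch, n_positions, 2]
logits = tf.random.normal([1, 2, 2])

per_example_loss = tf.nn.softmax_cross_entropy_with_logits(labels=gold_prob,
                                                           logits=logits,
                                                           axis=-1)
# Masking by masked_lm_weights and the reductions then proceed as in the example.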
def self_attention_with_add(layer_input, attention_mask, config, batch_size,
                            seq_length, hidden_size, initializer, values,
                            add_locations):
    attention_head_size = int(hidden_size / config.num_attention_heads)
    with tf.compat.v1.variable_scope("attention"):
        attention_heads = []
        with tf.compat.v1.variable_scope("self"):
            attention_head = bc.attention_layer(
                from_tensor=layer_input,
                to_tensor=layer_input,
                attention_mask=attention_mask,
                num_attention_heads=config.num_attention_heads,
                size_per_head=attention_head_size,
                attention_probs_dropout_prob=config.
                attention_probs_dropout_prob,
                initializer_range=config.initializer_range,
                do_return_2d_tensor=True,
                batch_size=batch_size,
                from_seq_length=seq_length,
                to_seq_length=seq_length)
            attention_heads.append(attention_head)

        attention_output = None
        if len(attention_heads) == 1:
            attention_output = attention_heads[0]
        else:
            # In the case where we have other sequences, we just concatenate
            # them to the self-attention head before the projection.
            attention_output = tf.concat(attention_heads, axis=-1)

        # [batch*seq_length, hidden_dim] , [batch, n_locations]
        attention_output = tf.tensor_scatter_nd_add(attention_output,
                                                    add_locations, values)

        # Run a linear projection of `hidden_size` then add a residual
        # with `layer_input`.
        with tf.compat.v1.variable_scope("output"):
            attention_output = bc.dense(hidden_size,
                                        initializer)(attention_output)
            attention_output = bc.dropout(attention_output,
                                          config.hidden_dropout_prob)
            attention_output = bc.layer_norm(attention_output + layer_input)
    return attention_output
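
The non-standard step in self_attention_with_add is tf.tensor_scatter_nd_add, which adds the supplied value vectors into selected rows of the flattened [batch*seq_length, hidden] attention output. A minimal sketch with made-up sizes (the index layout here is illustrative and may differ from the shapes used in the project):

import tensorflow as tf

attention_output = tf.zeros([6, 4])        # [batch*seq_length, hidden]
add_locations = tf.constant([[1], [4]])    # rows to modify
values = tf.ones([2, 4])                   # vectors added at those rows

updated = tf.tensor_scatter_nd_add(attention_output, add_locations, values)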
Example #11
File: horizon.py  Project: clover3/Chair
    def __init__(self, config, is_training, use_one_hot_embeddings):
        super(HorizontalAlpha, self).__init__()
        if not is_training:
            config.set_attrib("hidden_dropout_prob", 0.0)
            config.set_attrib("attention_probs_dropout_prob", 0.0)

        initializer = bc.create_initializer(config.initializer_range)
        self.embedding_layer = Embedding2()
        self.embedding_projector = bc.dense(config.hidden_size, initializer)
        self.config = config
        num_columns = config.num_columns
        self.column_list = []
        for tower_idx in range(num_columns):
            column = ForwardColumn(config)
            self.column_list.append(column)

        self.num_layers = config.num_hidden_layers
        self.num_columns = config.num_columns
        self.num_column_tokens = config.num_column_tokens
        self.column_embedding_list = []
        self.use_one_hot_embeddings = use_one_hot_embeddings
        self.config = config
        column_mask = []
        for column_idx in range(1, self.num_columns):
            column_embedding = tf.Variable(
                lambda: initializer(shape=(self.num_column_tokens, config.
                                           hidden_size),
                                    dtype=tf.float32),
                name="column_embedding_{}".format(column_idx))
            self.column_embedding_list.append(column_embedding)
            column_mask += [1] * self.num_column_tokens

        self.column_mask = tf.constant(column_mask)
        self.all_raw_layers = []
        self.all_main_layers = []
        self.sequence_output = None
        self.pooled_output = None
Example #12
    def build(self):
        with tf.compat.v1.variable_scope("dict"):
            with tf.compat.v1.variable_scope("embeddings"):
                input_tensor = self.get_embeddings(self.input_ids,
                                                   self.segment_ids)

            with tf.compat.v1.variable_scope("encoder"):
                num_key_tokens = self.ssdr_config.num_key_tokens
                input_shape = bc.get_shape_list(input_tensor, expected_rank=3)

                mask_for_key = tf.ones([self.batch_size, num_key_tokens],
                                       dtype=tf.int64)
                self.input_mask = tf.cast(self.input_mask, tf.int64)
                self.input_mask = tf.concat([mask_for_key, self.input_mask],
                                            axis=1)
                self.seq_length = self.seq_length + num_key_tokens

                self.attention_mask = bc.create_attention_mask_from_input_mask(
                    input_tensor, self.input_mask)
                prev_output = bc.reshape_to_matrix(input_tensor)
                for layer_idx in range(self.ssdr_config.num_hidden_layers):
                    with tf.compat.v1.variable_scope("layer_%d" % layer_idx):
                        intermediate_output, prev_output = self.forward_layer(
                            prev_output)
                        self.all_layer_outputs.append(prev_output)

                final_output = bc.reshape_from_matrix(prev_output, input_shape)
                self.scores = bc.dense(1, self.initializer)(final_output[:,
                                                                         0, :])

                if self.ssdr_config.info_pooling_method == "first_tokens":
                    self.info_output = final_output[:, :num_key_tokens, :]
                elif self.ssdr_config.info_pooling_method == "max_pooling":
                    self.info_output = tf.reduce_max(final_output, axis=1)

            return self.scores, self.info_output
Example #13
def get_loss_independently(bert_config, input_tensor, masked_lm_positions,
                           masked_lm_weights, loss_base, loss_target):
    input_tensor = bc.gather_indexes(input_tensor, masked_lm_positions)

    hidden = bc.dense(bert_config.hidden_size,
                      bc.create_initializer(bert_config.initializer_range),
                      bc.get_activation(bert_config.hidden_act))(input_tensor)

    def get_regression_and_loss(hidden_vector, loss_label):
        logits = bc.dense(2,
                          bc.create_initializer(
                              bert_config.initializer_range))(hidden_vector)
        gold_prob = loss_to_prob_pair(loss_label)
        logits = tf.reshape(logits, gold_prob.shape)

        per_example_loss = tf.nn.softmax_cross_entropy_with_logits(gold_prob,
                                                                   logits,
                                                                   axis=-1,
                                                                   name=None)
        per_example_loss = tf.cast(masked_lm_weights,
                                   tf.float32) * per_example_loss
        losses = tf.reduce_sum(per_example_loss, axis=1)
        loss = tf.reduce_mean(losses)

        return loss, per_example_loss, logits

    loss1, per_example_loss1, logits1 = get_regression_and_loss(
        hidden, loss_base)
    loss2, per_example_loss2, logits2 = get_regression_and_loss(
        hidden, loss_target)

    prob1 = tf.nn.softmax(logits1)[:, :, 0]
    prob2 = tf.nn.softmax(logits2)[:, :, 0]

    total_loss = loss1 + loss2
    return total_loss, loss1, loss2, per_example_loss1, per_example_loss2, prob1, prob2
Example #14
    def model_fn(features, labels, mode, params):    # pylint: disable=unused-argument
        """The `model_fn` for TPUEstimator."""
        logging.info("*** Features ***")
        for name in sorted(features.keys()):
            logging.info("    name = %s, shape = %s" % (name, features[name].shape))

        input_ids = features["input_ids"]
        input_mask = features["input_mask"]
        segment_ids = features["segment_ids"]
        d_input_ids = features["d_input_ids"]
        d_input_mask = features["d_input_mask"]
        d_location_ids = features["d_location_ids"]
        next_sentence_labels = features["next_sentence_labels"]

        if dict_run_config.prediction_op == "loss":
            seed = 0
        else:
            seed = None

        if dict_run_config.prediction_op == "loss_fixed_mask" or train_config.fixed_mask:
            masked_input_ids = input_ids
            masked_lm_positions = features["masked_lm_positions"]
            masked_lm_ids = features["masked_lm_ids"]
            masked_lm_weights = tf.ones_like(masked_lm_positions, dtype=tf.float32)
        else:
            masked_input_ids, masked_lm_positions, masked_lm_ids, masked_lm_weights \
                = random_masking(input_ids, input_mask, train_config.max_predictions_per_seq, MASK_ID, seed)

        if dict_run_config.use_d_segment_ids:
            d_segment_ids = features["d_segment_ids"]
        else:
            d_segment_ids = None

        is_training = (mode == tf.estimator.ModeKeys.TRAIN)

        model = model_class(
                config=bert_config,
                d_config=dbert_config,
                is_training=is_training,
                input_ids=masked_input_ids,
                input_mask=input_mask,
                d_input_ids=d_input_ids,
                d_input_mask=d_input_mask,
                d_location_ids=d_location_ids,
                use_target_pos_emb=dict_run_config.use_target_pos_emb,
                token_type_ids=segment_ids,
                use_one_hot_embeddings=train_config.use_one_hot_embeddings,
                d_segment_ids=d_segment_ids,
                pool_dict_output=dict_run_config.pool_dict_output,
        )

        (masked_lm_loss,
         masked_lm_example_loss, masked_lm_log_probs) = get_masked_lm_output(
                 bert_config, model.get_sequence_output(), model.get_embedding_table(),
                 masked_lm_positions, masked_lm_ids, masked_lm_weights)
        (next_sentence_loss, next_sentence_example_loss,
         next_sentence_log_probs) = get_next_sentence_output(
                 bert_config, model.get_pooled_output(), next_sentence_labels)

        total_loss = masked_lm_loss

        if dict_run_config.train_op == "entry_prediction":
            score_label = features["useful_entry"] # [batch, 1]
            score_label = tf.reshape(score_label, [-1])
            entry_logits = bert_common.dense(2, bert_common.create_initializer(bert_config.initializer_range))\
                (model.get_dict_pooled_output())
            print("entry_logits: ", entry_logits.shape)
            losses = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=entry_logits, labels=score_label)
            loss = tf.reduce_mean(losses)
            total_loss = loss

        if dict_run_config.train_op == "lookup":
            lookup_idx = features["lookup_idx"]
            lookup_loss, lookup_example_loss, lookup_score = \
                sequence_index_prediction(bert_config, lookup_idx, model.get_sequence_output())

            total_loss += lookup_loss

        tvars = tf.compat.v1.trainable_variables()

        init_vars = {}
        scaffold_fn = None
        if train_config.init_checkpoint:
            if dict_run_config.is_bert_checkpoint:
                map1, map2, init_vars = get_bert_assignment_map_for_dict(tvars, train_config.init_checkpoint)

                def load_fn():
                    tf.compat.v1.train.init_from_checkpoint(train_config.init_checkpoint, map1)
                    tf.compat.v1.train.init_from_checkpoint(train_config.init_checkpoint, map2)
            else:
                map1, init_vars = get_assignment_map_as_is(tvars, train_config.init_checkpoint)

                def load_fn():
                    tf.compat.v1.train.init_from_checkpoint(train_config.init_checkpoint, map1)

            if train_config.use_tpu:
                def tpu_scaffold():
                    load_fn()
                    return tf.compat.v1.train.Scaffold()

                scaffold_fn = tpu_scaffold
            else:
                load_fn()

        logging.info("**** Trainable Variables ****")
        for var in tvars:
            init_string = ""
            if var.name in init_vars:
                init_string = ", *INIT_FROM_CKPT*"
            logging.info("    name = %s, shape = %s%s", var.name, var.shape, init_string)
        logging.info("Total parameters : %d" % get_param_num())

        output_spec = None
        if mode == tf.estimator.ModeKeys.TRAIN:
            if train_config.gradient_accumulation == 1:
                train_op = optimization.create_optimizer_from_config(total_loss, train_config)
            else:
                logging.info("Using gradient accumulation : %d" % train_config.gradient_accumulation)
                train_op = get_accumulated_optimizer_from_config(total_loss, train_config,
                                                                 tvars, train_config.gradient_accumulation)
            output_spec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec(
                    mode=mode,
                    loss=total_loss,
                    train_op=train_op,
                    scaffold_fn=scaffold_fn)
        elif mode == tf.estimator.ModeKeys.EVAL:
            eval_metrics = (metric_fn, [
                    masked_lm_example_loss, masked_lm_log_probs, masked_lm_ids,
                    masked_lm_weights, next_sentence_example_loss,
                    next_sentence_log_probs, next_sentence_labels
            ])
            output_spec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec(
                    mode=mode,
                    loss=total_loss,
                    eval_metrics=eval_metrics,
                    scaffold_fn=scaffold_fn)
        else:
            if dict_run_config.prediction_op == "gradient":
                logging.info("Fetching gradient")
                gradient = get_gradients(model, masked_lm_log_probs,
                                         train_config.max_predictions_per_seq, bert_config.vocab_size)
                predictions = {
                        "masked_input_ids": masked_input_ids,
                        #"input_ids": input_ids,
                        "d_input_ids": d_input_ids,
                        "masked_lm_positions": masked_lm_positions,
                        "gradients": gradient,
                }
            elif dict_run_config.prediction_op == "loss" or dict_run_config.prediction_op == "loss_fixed_mask":
                logging.info("Fetching loss")
                predictions = {
                    "masked_lm_example_loss": masked_lm_example_loss,
                }
            else:
                raise Exception("prediction target not specified")

            output_spec = tf.compat.v1.estimator.tpu.TPUEstimatorSpec(
                    mode=mode,
                    loss=total_loss,
                    predictions=predictions,
                    scaffold_fn=scaffold_fn)

        return output_spec
Example #15
def binary_prediction(bert_config, input_tensor):
    logits = bert_common.dense(2, bert_common.create_initializer(bert_config.initializer_range))(input_tensor)
    log_probs = tf.nn.softmax(logits, axis=2)
    return logits, log_probs
Example #16
    def embedding_projection(self, input_tensor):
        with tf.compat.v1.variable_scope("embedding_projection", reuse=True):
            return bc.dense(self.config.hidden_size,
                            self.initializer)(input_tensor)