def classifier(config, pooled_output, num_labels, labels,
               dropout_prob, ratio_weight=None, **kargs):
    output_layer = pooled_output
    hidden_size = output_layer.shape[-1].value

    output_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    output_bias = tf.get_variable(
        "output_bias", [num_labels],
        initializer=tf.zeros_initializer())

    output_layer = tf.nn.dropout(output_layer, keep_prob=1 - dropout_prob)

    # [batch, num_labels]
    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)

    if config.get("label_type", "single_label") == "single_label":
        if config.get("loss", "entropy") == "entropy":
            print("==standard cross entropy==")
            per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits,
                labels=tf.stop_gradient(labels))
        elif config.get("loss", "entropy") == "focal_loss":
            print("==multi_label focal loss==")
            per_example_loss, _ = loss_utils.focal_loss_multi_v1(
                config, logits=logits, labels=labels)

        # Re-weight the per-example loss by class ratio when possible;
        # otherwise fall back to a plain mean.
        try:
            per_example_loss = loss_utils.weighted_loss_ratio(
                config, per_example_loss, labels, ratio_weight)
            loss = tf.reduce_sum(per_example_loss)
            print(" == applying weighted loss == ")
        except Exception:
            loss = tf.reduce_mean(per_example_loss)

        if config.get("with_center_loss", "no") == "center_loss":
            print("==apply with center loss==")
            center_loss, _ = loss_utils.center_loss_v2(
                config, features=pooled_output, labels=labels)
            loss += center_loss * config.get("center_loss_coef", 1e-3)

        return (loss, per_example_loss, logits)

    elif config.get("label_type", "single_label") == "multi_label":
        logits = tf.log_sigmoid(logits)
        per_example_loss = tf.nn.sigmoid_cross_entropy_with_logits(
            logits=logits,
            labels=tf.stop_gradient(labels))
        per_example_loss = tf.reduce_sum(per_example_loss, axis=-1)
        loss = tf.reduce_mean(per_example_loss)
        return (loss, per_example_loss, logits)

    else:
        raise NotImplementedError()
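# Hedged usage sketch (illustrative only, not part of the original module):
# how the single-label branch of `classifier` is typically wired onto a
# BERT-style pooled [CLS] output. The placeholder shapes, the 768 hidden size,
# and the variable-scope name are assumptions for illustration.
def _classifier_usage_sketch():
    config = {"label_type": "single_label", "loss": "entropy"}
    pooled_output = tf.placeholder(tf.float32, [None, 768])  # [batch, hidden]
    labels = tf.placeholder(tf.int32, [None])                # sparse class ids
    with tf.variable_scope("cls_head"):
        loss, per_example_loss, logits = classifier(
            config, pooled_output, num_labels=2,
            labels=labels, dropout_prob=0.1)
    return loss, per_example_loss, logits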
def multi_choice_classifier(config, pooled_output, num_labels, labels, dropout_prob):
    output_layer = pooled_output
    final_hidden_shape = bert_utils.get_shape_list(output_layer, expected_rank=2)
    print(final_hidden_shape, "====multi-choice shape====")

    # [batch, num_choices, hidden_dim]
    output_layer = tf.reshape(output_layer,
                              [-1, num_labels, final_hidden_shape[-1]])

    hidden_size = output_layer.shape[-1].value

    output_weights = tf.get_variable(
        "output_weights", [hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    output_bias = tf.get_variable(
        "output_bias", [num_labels],
        initializer=tf.zeros_initializer())

    output_layer = tf.nn.dropout(output_layer, keep_prob=1 - dropout_prob)

    # Score each choice with a shared weight vector: [batch, num_labels]
    logits = tf.einsum("abc,c->ab", output_layer, output_weights)
    logits = tf.nn.bias_add(logits, output_bias)

    if config.get("loss_type", "entropy") == "focal_loss":
        per_example_loss, _ = loss_utils.focal_loss_multi_v1(
            config, logits=logits, labels=labels)
    else:
        per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            logits=logits,
            labels=tf.stop_gradient(labels))
    loss = tf.reduce_mean(per_example_loss)

    return (loss, per_example_loss, logits)
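# Hedged usage sketch (illustrative only, not part of the original module):
# for multi-choice tasks the pooled [CLS] vectors of all choices are packed
# along the batch axis, i.e. pooled_output is [batch * num_choices, hidden],
# and `labels` holds the index of the correct choice per example. The shapes
# and scope name below are assumptions for illustration.
def _multi_choice_usage_sketch(num_choices=4, hidden=768):
    config = {"loss_type": "entropy"}
    pooled_output = tf.placeholder(tf.float32, [None, hidden])  # [batch * num_choices, hidden]
    labels = tf.placeholder(tf.int32, [None])                   # correct-choice index per example
    with tf.variable_scope("multi_choice_head"):
        return multi_choice_classifier(config, pooled_output,
                                       num_choices, labels, dropout_prob=0.1)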
def order_classifier_v1(config, output_lst, num_labels, labels,
                        dropout_prob, ratio_weight=None):
    assert len(output_lst) == 2

    seq_output_a = output_lst[0]
    seq_output_b = output_lst[1]

    # batch x (hidden x 2)
    # repres = tf.concat([seq_output_a, seq_output_b],
    #                    axis=-1)
    repres = seq_output_a + seq_output_b
    hidden_size = repres.shape[-1].value

    repres = tf.layers.dense(repres, hidden_size,
                             activation=tf.nn.tanh,
                             name="output_dense")

    output_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    output_bias = tf.get_variable(
        "output_bias", [num_labels],
        initializer=tf.zeros_initializer())

    output_layer = tf.nn.dropout(repres, keep_prob=1 - dropout_prob)

    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)

    if config.get("label_type", "single_label") == "single_label":
        if config.get("loss", "entropy") == "entropy":
            per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits,
                labels=tf.stop_gradient(labels))
        elif config.get("loss", "entropy") == "focal_loss":
            tf.logging.info("===apply multi-class focal loss===")
            print("===apply multi-class focal loss===")
            per_example_loss, _ = loss_utils.focal_loss_multi_v1(
                config, logits=logits, labels=labels)

        # Re-weight by class ratio when possible; otherwise fall back to a mean.
        try:
            per_example_loss = loss_utils.weighted_loss_ratio(
                config, per_example_loss, labels, ratio_weight)
            loss = tf.reduce_sum(per_example_loss)
        except Exception:
            loss = tf.reduce_mean(per_example_loss)

    return (loss, per_example_loss, logits)
def siamese_classifier(config, pooled_output, num_labels, labels,
                       dropout_prob, ratio_weight=None):
    if config.get("output_layer", "interaction") == "interaction":
        print("==apply interaction layer==")
        repres_a = pooled_output[0]
        repres_b = pooled_output[1]
        # Standard siamese interaction features: [a, b, |a - b|, a * b]
        output_layer = tf.concat([repres_a,
                                  repres_b,
                                  tf.abs(repres_a - repres_b),
                                  repres_a * repres_b],
                                 axis=-1)

    hidden_size = output_layer.shape[-1].value

    output_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    output_bias = tf.get_variable(
        "output_bias", [num_labels],
        initializer=tf.zeros_initializer())

    output_layer = tf.nn.dropout(output_layer, keep_prob=1 - dropout_prob)

    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)
    print("==logits shape==", logits.get_shape())

    if config.get("label_type", "single_label") == "single_label":
        if config.get("loss", "entropy") == "entropy":
            per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits,
                labels=tf.stop_gradient(labels))
        elif config.get("loss", "entropy") == "focal_loss":
            per_example_loss, _ = loss_utils.focal_loss_multi_v1(
                config, logits=logits, labels=labels)
        print("==per_example_loss shape==", per_example_loss.get_shape())
        loss = tf.reduce_mean(per_example_loss)
        return (loss, per_example_loss, logits)

    elif config.get("label_type", "single_label") == "multi_label":
        logits = tf.log_sigmoid(logits)
        per_example_loss = tf.nn.sigmoid_cross_entropy_with_logits(
            logits=logits,
            labels=tf.stop_gradient(labels))
        per_example_loss = tf.reduce_mean(per_example_loss, axis=-1)
        loss = tf.reduce_mean(per_example_loss)
        return (loss, per_example_loss, logits)

    else:
        raise NotImplementedError()
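# Hedged usage sketch (illustrative only, not part of the original module):
# the siamese head takes a pair of pooled representations, one per tower,
# and builds the [a, b, |a - b|, a * b] interaction features internally.
# Shapes and the scope name below are assumptions for illustration.
def _siamese_usage_sketch(hidden=768, num_labels=2):
    config = {"output_layer": "interaction",
              "label_type": "single_label",
              "loss": "entropy"}
    repres_a = tf.placeholder(tf.float32, [None, hidden])  # tower A pooled output
    repres_b = tf.placeholder(tf.float32, [None, hidden])  # tower B pooled output
    labels = tf.placeholder(tf.int32, [None])
    with tf.variable_scope("siamese_head"):
        return siamese_classifier(config, [repres_a, repres_b], num_labels,
                                  labels, dropout_prob=0.1)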
def distributed_classifier(config, pooled_output, num_labels, labels,
                           dropout_prob, ratio_weight=None):
    output_layer = pooled_output
    hidden_size = output_layer.shape[-1].value

    output_weights = tf.get_variable(
        "output_weights", [num_labels, hidden_size],
        initializer=tf.truncated_normal_initializer(stddev=0.02))

    output_bias = tf.get_variable(
        "output_bias", [num_labels],
        initializer=tf.zeros_initializer())

    output_layer = tf.nn.dropout(output_layer, keep_prob=1 - dropout_prob)

    logits = tf.matmul(output_layer, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)

    if config.get("label_type", "single_label") == "single_label":
        if config.get("loss", "entropy") == "entropy":
            per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
                logits=logits,
                labels=tf.stop_gradient(labels))
        elif config.get("loss", "entropy") == "focal_loss":
            per_example_loss, _ = loss_utils.focal_loss_multi_v1(
                config, logits=logits, labels=labels)
        loss = tf.reduce_mean(per_example_loss)
        return (loss, per_example_loss, logits)

    elif config.get("label_type", "single_label") == "multi_label":
        logits = tf.log_sigmoid(logits)
        per_example_loss = tf.nn.sigmoid_cross_entropy_with_logits(
            logits=logits,
            labels=tf.stop_gradient(labels))
        per_example_loss = tf.reduce_mean(per_example_loss, axis=-1)
        loss = tf.reduce_mean(per_example_loss)
        return (loss, per_example_loss, logits)

    else:
        raise NotImplementedError()
def multi_position_classifier(config, features, sequence_output,
                              num_labels, dropout_prob):
    final_hidden_shape = bert_utils.get_shape_list(sequence_output, expected_rank=3)
    print(final_hidden_shape, "====multi-position shape====")

    answer_pos = tf.cast(features['label_positions'], tf.int32)
    cls_pos = tf.zeros_like(answer_pos)

    # Gather the hidden states at the labeled positions and at [CLS] (position 0).
    input_tensor = bert_utils.gather_indexes(sequence_output, answer_pos)
    cls_tensor = bert_utils.gather_indexes(sequence_output, cls_pos)

    answer_cls_tensor = tf.concat([cls_tensor, input_tensor], axis=-1)

    input_tensor = tf.layers.dense(
        answer_cls_tensor,
        units=config.hidden_size,
        activation=bert_modules.get_activation(config.hidden_act),
        kernel_initializer=bert_modules.create_initializer(
            config.initializer_range))
    input_tensor = bert_modules.layer_norm(input_tensor)

    output_weights = tf.get_variable(
        "output_weights", [num_labels, final_hidden_shape[-1]],
        initializer=tf.truncated_normal_initializer(stddev=0.02))
    output_bias = tf.get_variable(
        "output_bias", shape=[num_labels],
        initializer=tf.zeros_initializer())

    logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
    logits = tf.nn.bias_add(logits, output_bias)

    label_ids = tf.reshape(tf.cast(features['label_ids'], tf.int32), [-1])
    label_weights = tf.reshape(tf.cast(features['label_weights'], tf.float32), [-1])

    if config.get('class_weights', None):
        class_weights = tf.constant(
            np.array(config.class_weights).astype(np.float32))

    if config.get("loss", "entropy") == "focal_loss":
        per_example_loss, _ = loss_utils.focal_loss_multi_v1(
            config, logits=logits, labels=tf.stop_gradient(label_ids))
    elif config.get("loss", "smoothed_ce") == 'smoothed_ce':
        per_example_loss = loss_utils.ce_label_smoothing(
            config, logits=logits, labels=tf.stop_gradient(label_ids))
    elif config.get('loss', 'class_balanced_focal') == 'class_balanced_focal':
        per_example_loss, _ = loss_utils.class_balanced_focal_loss_multi_v1(
            config, logits=logits, labels=label_ids,
            label_weights=class_weights)
    else:
        per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
            labels=tf.stop_gradient(label_ids),
            logits=logits)

    # Average over real (non-padding) prediction positions only.
    numerator = tf.reduce_sum(label_weights * per_example_loss)
    denominator = tf.reduce_sum(label_weights) + 1e-5
    loss = numerator / denominator

    return (loss, per_example_loss, logits)
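# Hedged usage sketch (illustrative only, not part of the original module):
# `multi_position_classifier` expects a `features` dict with flat
# position/label/weight tensors and a config that supports both dict-style
# `.get()` and attribute access (hidden_size, hidden_act, initializer_range).
# The `_SketchConfig` helper and every shape/value below are assumptions.
def _multi_position_usage_sketch(num_labels=5):
    class _SketchConfig(dict):
        __getattr__ = dict.__getitem__

    config = _SketchConfig(hidden_size=768, hidden_act="gelu",
                           initializer_range=0.02, loss="entropy")
    sequence_output = tf.placeholder(tf.float32, [None, 128, 768])  # [batch, seq_len, hidden]
    features = {
        "label_positions": tf.placeholder(tf.int32, [None, 4]),   # positions to classify
        "label_ids": tf.placeholder(tf.int32, [None, 4]),         # per-position labels
        "label_weights": tf.placeholder(tf.float32, [None, 4]),   # 1.0 real, 0.0 padding
    }
    with tf.variable_scope("multi_position_head"):
        return multi_position_classifier(config, features, sequence_output,
                                         num_labels, dropout_prob=0.1)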
def get_masked_lm_output(config, input_tensor, output_weights, positions,
                         label_ids, label_weights, reuse=None):
    """Get loss and log probs for the masked LM."""
    input_tensor = tf.cast(input_tensor, tf.float32)
    positions = tf.cast(positions, tf.int32)
    label_ids = tf.cast(label_ids, tf.int32)
    label_weights = tf.cast(label_weights, tf.float32)

    # Flatten the masked-LM predictions: gather hidden states at the masked positions.
    input_tensor = bert_utils.gather_indexes(input_tensor, positions)

    with tf.variable_scope("cls/predictions", reuse=reuse):
        # We apply one more non-linear transformation before the output layer.
        # This matrix is not used after pre-training.
        with tf.variable_scope("transform"):
            input_tensor = tf.layers.dense(
                input_tensor,
                units=config.hidden_size,
                activation=bert_modules.get_activation(config.hidden_act),
                kernel_initializer=bert_modules.create_initializer(
                    config.initializer_range))
            input_tensor = bert_modules.layer_norm(input_tensor)

        # The output weights are the same as the input embeddings, but there is
        # an output-only bias for each token.
        output_bias = tf.get_variable(
            "output_bias",
            shape=[config.vocab_size],
            initializer=tf.zeros_initializer())
        logits = tf.matmul(input_tensor, output_weights, transpose_b=True)
        logits = tf.nn.bias_add(logits, output_bias)
        log_probs = tf.nn.log_softmax(logits, axis=-1)

        label_ids = tf.reshape(label_ids, [-1])
        label_weights = tf.cast(tf.reshape(label_weights, [-1]), tf.float32)

        per_example_loss, target_predictions = loss_utils.focal_loss_multi_v1(
            config, logits, label_ids)

        # Alternatives kept for reference: plain one-hot cross entropy and a
        # TSA-thresholded loss mask over confident predictions.
        # one_hot_labels = tf.one_hot(
        #     label_ids, depth=config.vocab_size, dtype=tf.float32)
        # tsa_start = 0.5 / config.vocab_size
        # tsa_threshold = tsa.get_tsa_threshold(
        #     config.tsa,
        #     tf.train.get_or_create_global_step(),
        #     config.num_train_steps,
        #     tsa_start, end=1)
        # larger_than_threshold = tf.greater(
        #     target_predictions, tsa_threshold)
        # loss_mask = label_weights * (1 - tf.cast(larger_than_threshold, tf.float32))
        # per_example_loss = tf.nn.sparse_softmax_cross_entropy_with_logits(
        #     labels=tf.stop_gradient(label_ids),
        #     logits=logits)
        # per_example_loss = -tf.reduce_sum(log_probs * one_hot_labels, axis=[-1])

        # The `positions` tensor might be zero-padded (if the sequence is too
        # short to have the maximum number of predictions). The `label_weights`
        # tensor has a value of 1.0 for every real prediction and 0.0 for the
        # padding predictions.
        numerator = tf.reduce_sum(label_weights * per_example_loss)
        denominator = tf.reduce_sum(label_weights) + 1e-5
        loss = numerator / denominator

    return (loss, per_example_loss, log_probs, label_weights)