def testPredict(self):
    """Tests predict()"""
    dataset = data_fn.input_fn_tfrecord(input_pattern=self.data_dir,
                                        batch_size=self.batch_size,
                                        mode=tf.estimator.ModeKeys.EVAL,
                                        feature_type2name=self.feature_type2name,
                                        feature_name2num=self.feature_name2num,
                                        input_pipeline_context=None)
    detext_model = model.create_detext_model(self.feature_type2name,
                                             task_type=self.task_type,
                                             **self.deep_match_param)
    predicted_output = train_flow_helper.predict_with_additional_info(dataset, detext_model, self.feature_type2name)

    for output in predicted_output:
        for key in [train_flow_helper._SCORES,
                    self.feature_type2name.get(InputFtrType.WEIGHT_COLUMN_NAME, Constant()._DEFAULT_WEIGHT_FTR_NAME),
                    self.feature_type2name.get(InputFtrType.UID_COLUMN_NAME, Constant()._DEFAULT_UID_FTR_NAME),
                    self.feature_type2name[InputFtrType.LABEL_COLUMN_NAME]]:
            self.assertIn(key, output)
def _predict_with_additional_info(inputs, label):
    """Predicts scores with additional info (uid, weight, label)"""
    uid_ftr_name = feature_type2name.get(InputFtrType.UID_COLUMN_NAME, Constant()._DEFAULT_UID_FTR_NAME)
    weight_ftr_name = feature_type2name.get(InputFtrType.WEIGHT_COLUMN_NAME, Constant()._DEFAULT_WEIGHT_FTR_NAME)
    label_ftr_name = feature_type2name[InputFtrType.LABEL_COLUMN_NAME]
    return {
        _SCORES: predict_step_fn(inputs, model, feature_type2name),
        uid_ftr_name: label[uid_ftr_name],
        weight_ftr_name: label[weight_ftr_name],
        label_ftr_name: label[label_ftr_name]
    }
def get_weight(features, labels, feature_type2name, task_ids, task_weights):
    """Returns the weights adjusted with task_weights

    :param features: dict containing the features in data
    :param labels: dict containing the labels in data
    :param feature_type2name: dict containing mapping from feature types to feature names
    :param task_ids: Task ids, e.g. [0, 1, 2]
    :param task_weights: Task weights, e.g. [0.1, 0.3, 0.6]
    """
    # For multitask training
    weight_ftr_name = feature_type2name.get(InputFtrType.WEIGHT_COLUMN_NAME, Constant()._DEFAULT_WEIGHT_FTR_NAME)
    weight = labels[weight_ftr_name]

    # Update the weight with each task's weight such that weight per document = weight * task_weight
    if task_ids is not None:
        task_id_field = features[feature_type2name[InputFtrType.TASK_ID_COLUMN_NAME]]  # shape=[batch_size,]

        # Expand task_id_field to shape [batch_size, num_tasks]
        expanded_task_id_field = tf.transpose(tf.broadcast_to(task_id_field, [len(task_ids), tf.shape(task_id_field)[0]]))
        task_mask = tf.cast(tf.equal(expanded_task_id_field, task_ids), dtype=tf.float32)
        weight *= tf.reduce_sum(task_mask * task_weights, 1)  # shape=[batch_size,]
    return weight
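# A minimal standalone sketch (not part of the library) of the broadcast-and-mask
# trick used in get_weight above: each example's task id selects its task weight
# via a one-hot mask. All tensor values below are hypothetical.
import tensorflow as tf

task_id_field = tf.constant([0, 2, 1, 0])  # per-example task ids, shape=[4]
task_ids = [0, 1, 2]
task_weights = tf.constant([0.1, 0.3, 0.6])
# Tile the ids to [num_tasks, batch_size], then transpose to [batch_size, num_tasks]
expanded = tf.transpose(tf.broadcast_to(task_id_field, [len(task_ids), 4]))
task_mask = tf.cast(tf.equal(expanded, task_ids), tf.float32)   # one-hot rows
per_example_w = tf.reduce_sum(task_mask * task_weights, 1)      # [0.1, 0.6, 0.3, 0.1]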
def ranking_transform_fn(dataset, batch_size, mode, feature_type2name, feature_name2num, output_buffer_size,
                         prefetch_size=tf.data.experimental.AUTOTUNE,
                         num_parallel_calls=tf.data.experimental.AUTOTUNE):
    """Preprocesses datasets for ranking task including
        1. dataset shuffling
        2. record parsing
        3. padding and batching
    """
    if mode == tf.estimator.ModeKeys.TRAIN:
        dataset = dataset.shuffle(output_buffer_size)
        dataset = dataset.repeat()

    def _process_data(record, features_schema, feature_name2num):
        example = tf.io.parse_single_example(serialized=record, features=features_schema)
        example = _cast_features_to_smaller_dtype(example, feature_type2name)
        example = _convert_ftrs_to_dense_tensor(example, feature_type2name, Constant()._RANKING_FTR_TYPE_TO_DENSE_DEFAULT_VAL)
        example = _assemble_sparse_ftrs_ranking(example, feature_type2name, feature_name2num)

        feature_type_to_squeeze = [InputFtrType.QUERY_COLUMN_NAME, InputFtrType.USER_ID_COLUMN_NAMES,
                                   InputFtrType.USER_TEXT_COLUMN_NAMES, InputFtrType.WEIGHT_COLUMN_NAME,
                                   InputFtrType.TASK_ID_COLUMN_NAME, InputFtrType.UID_COLUMN_NAME]
        example = _squeeze_ftrs(example, feature_type2name, feature_type_to_squeeze)
        example = _reshape_ftrs_to_group_wise(example, feature_type2name, feature_name2num)
        example = _read_specified_features(example, feature_type2name)

        features, labels = _split_features_and_labels(example, feature_type2name)
        return features, labels

    features_schema = _get_tfrecord_feature_parsing_schema(feature_type2name, Constant()._RANKING_FTR_TYPE_TO_SCHEMA, TaskType.RANKING)
    dataset = dataset.map(partial(_process_data, features_schema=features_schema, feature_name2num=feature_name2num),
                          num_parallel_calls=num_parallel_calls)
    dataset = batch_dataset(dataset, feature_type2name, feature_name2num, batch_size).map(
        partial(_add_default_ftr_field, feature_type2name=feature_type2name),
        num_parallel_calls=num_parallel_calls)
    dataset = dataset.prefetch(prefetch_size)
    return dataset
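# Hypothetical wiring of the ranking pipeline above. `files`, `feature_type2name`
# and `feature_name2num` are assumed to be configured as elsewhere in this module;
# this is a usage sketch, not library code.
import tensorflow as tf

dataset = tf.data.TFRecordDataset(files)
dataset = ranking_transform_fn(dataset,
                               batch_size=32,
                               mode=tf.estimator.ModeKeys.TRAIN,
                               feature_type2name=feature_type2name,
                               feature_name2num=feature_name2num,
                               output_buffer_size=1000)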
def classification_transform_fn(dataset, batch_size, mode, feature_type2name, feature_name2num, output_buffer_size,
                                prefetch_size=tf.data.experimental.AUTOTUNE,
                                num_parallel_calls=tf.data.experimental.AUTOTUNE):
    """Preprocesses datasets for classification task including
        1. dataset shuffling
        2. record parsing
        3. padding and batching
    """
    if mode == tf.estimator.ModeKeys.TRAIN:
        dataset = dataset.shuffle(output_buffer_size)
        dataset = dataset.repeat()

    def _process_data(record, features_schema):
        example = tf.io.parse_single_example(serialized=record, features=features_schema)
        example = _cast_features_to_smaller_dtype(example, feature_type2name)
        example = _convert_ftrs_to_dense_tensor(example, feature_type2name, Constant()._CLASSIFICATION_FTR_TYPE_TO_DENSE_DEFAULT_VAL)
        example = _assemble_sparse_ftrs_classification(example, feature_type2name, feature_name2num)

        feature_type_to_squeeze = [InputFtrType.QUERY_COLUMN_NAME, InputFtrType.USER_ID_COLUMN_NAMES,
                                   InputFtrType.USER_TEXT_COLUMN_NAMES, InputFtrType.DOC_TEXT_COLUMN_NAMES,
                                   InputFtrType.DOC_ID_COLUMN_NAMES, InputFtrType.WEIGHT_COLUMN_NAME,
                                   InputFtrType.TASK_ID_COLUMN_NAME, InputFtrType.UID_COLUMN_NAME]
        example = _squeeze_ftrs(example, feature_type2name, feature_type_to_squeeze)
        example = _read_specified_features(example, feature_type2name)

        features, labels = _split_features_and_labels(example, feature_type2name)
        return features, labels

    features_schema = _get_tfrecord_feature_parsing_schema(feature_type2name, Constant()._CLASSIFICATION_FTR_TYPE_TO_SCHEMA, TaskType.CLASSIFICATION)
    dataset = dataset.map(partial(_process_data, features_schema=features_schema), num_parallel_calls=num_parallel_calls)

    # drop_remainder=True to avoid input batch_size=0 issue in evaluation mode in multi gpu training
    dataset = dataset.batch(batch_size, drop_remainder=True).map(
        partial(_add_default_ftr_field, feature_type2name=feature_type2name),
        num_parallel_calls=num_parallel_calls).prefetch(prefetch_size)
    return dataset
def _add_default_ftr_field(features, labels, feature_type2name: dict):
    """Adds default feature fields if they do not exist"""
    # Default weight as feature. Set to 1.0 if not present in data
    if InputFtrType.WEIGHT_COLUMN_NAME not in feature_type2name:
        labels.setdefault(Constant()._DEFAULT_WEIGHT_FTR_NAME,
                          tf.ones(tf.shape(labels[feature_type2name[InputFtrType.LABEL_COLUMN_NAME]])[0], dtype=tf.float32))

    # Default uid as feature for detext integration. Set to -1 if not present in data
    if InputFtrType.UID_COLUMN_NAME not in feature_type2name:
        labels.setdefault(Constant()._DEFAULT_UID_FTR_NAME,
                          -tf.ones(tf.shape(labels[feature_type2name[InputFtrType.LABEL_COLUMN_NAME]])[0], dtype=tf.int64))
    return features, labels
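# A hypothetical illustration of the defaulting behavior above: when the data has
# no weight/uid columns, every example in the batch gets weight 1.0 and uid -1.
# The 'label' name and tensor values are made up for the example.
import tensorflow as tf

labels = {'label': tf.constant([[1.0, 0.0], [0.0, 1.0]])}  # batch of 2
feature_type2name = {InputFtrType.LABEL_COLUMN_NAME: 'label'}
_, labels = _add_default_ftr_field({}, labels, feature_type2name)
# labels now also maps Constant()._DEFAULT_WEIGHT_FTR_NAME -> [1., 1.]
#                  and Constant()._DEFAULT_UID_FTR_NAME    -> [-1, -1]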
def compute_softmax_loss(scores, labels):
    """Computes the negative log softmax loss:
        -sum_i labels_i * log(exp(scores_i) / (exp(scores_1) + ... + exp(scores_n)))
    """
    # Mask the padded documents
    mask = tf.cast(labels != Constant()._LABEL_PADDING, dtype=tf.float32)

    # Softmax loss
    loss = mask * labels * (-scores + tf.expand_dims(compute_logsumexp_mask(scores, mask), axis=1))
    return loss
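# A minimal self-contained check of the identity above (assuming
# compute_logsumexp_mask computes logsumexp over the unpadded positions).
# For an unpadded row the per-document loss reduces to labels_i * (logsumexp(scores) - scores_i),
# which agrees with TensorFlow's built-in softmax cross entropy.
import tensorflow as tf

scores = tf.constant([[2.0, 1.0, 3.0]])
labels = tf.constant([[1.0, 0.0, 0.0]])
logsumexp = tf.reduce_logsumexp(scores, axis=-1)           # no padding, so plain logsumexp
loss = labels * (-scores + tf.expand_dims(logsumexp, 1))   # mirrors compute_softmax_loss
xent = tf.nn.softmax_cross_entropy_with_logits(labels=labels, logits=scores)
tf.debugging.assert_near(tf.reduce_sum(loss, axis=-1), xent)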
def compute_sigmoid_cross_entropy_loss(scores, labels):
    """Computes loss for pointwise ranking

    :param scores: Tensor Shape=[batch_size, max_group_size]
    :param labels: Tensor Shape=[batch_size, max_group_size]
    :return: Tensor Shape=[batch_size]
    """
    mask = tf.cast(labels != Constant()._LABEL_PADDING, dtype=tf.float32)
    loss = mask * tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=scores)
    return tf.reduce_sum(input_tensor=loss, axis=-1)
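# Hypothetical example of the masking above: the second document in the row is
# padding (marked with the library's Constant()._LABEL_PADDING value, assumed to
# be a numeric constant), so its cross-entropy contribution is zeroed out.
import tensorflow as tf

scores = tf.constant([[0.5, 2.0]])
labels = tf.constant([[1.0, float(Constant()._LABEL_PADDING)]])
loss = compute_sigmoid_cross_entropy_loss(scores, labels)  # shape=[1]; only the first doc contributes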
def testComputeSoftmaxLoss(self):
    scores_lst = [tf.constant([[0, 1]], dtype=tf.dtypes.float32),
                  tf.constant([[2, 1, 3, 0], [5, 2, 7, 9]], dtype=tf.dtypes.float32)]
    labels_lst = [tf.constant([[0, 1]], dtype=tf.dtypes.float32),
                  tf.constant([[1, 0, Constant()._LABEL_PADDING, Constant()._LABEL_PADDING],
                               [1, 3, 2, Constant()._LABEL_PADDING]], dtype=tf.dtypes.float32)]
    group_size_lst = [tf.constant([2]),
                      tf.constant([2, 3])]
    self.assertTrue(len(scores_lst) == len(labels_lst) == len(group_size_lst))

    for scores, labels, group_size in zip(scores_lst, labels_lst, group_size_lst):
        self._testComputeSoftmaxLoss(scores, labels, group_size)
def call(self, scores, labels):
    """Computes the pairwise loss

    :param scores: A tensor with shape [batch_size, max_group_size]. For each batch, the first element is the score
        of the correct answer
    :param labels: A matrix with shape [batch_size, max_group_size]. The true scores of each document
    :return: lambdarank loss and mask, each with shape [batch_size, max_group_size, max_group_size]
    """
    # For a query, compute the pairwise doc score diff of any two documents
    pair_score_diff = tf.expand_dims(scores, axis=2) - tf.expand_dims(scores, axis=1)

    # Compute the loss
    loss = -1 * tf.math.log_sigmoid(pair_score_diff)

    # Now loss is a [batch_size, max_group_size, max_group_size] tensor that contains all pairwise losses.
    # We only need to keep a subset of the pairs.
    # The first mask is from group_size
    group_size_mask = tf.cast(labels != Constant()._LABEL_PADDING, dtype=tf.float32)
    group_size = tf.reduce_sum(group_size_mask, axis=-1)
    group_size_mask = tf.expand_dims(group_size_mask, axis=2) * tf.expand_dims(group_size_mask, axis=1)

    # The second mask is from labels; only keep the pairs whose 1st label value is larger than the 2nd
    label_mask = tf.expand_dims(labels, axis=2) - tf.expand_dims(labels, axis=1)
    label_mask = tf.cast(tf.greater(label_mask, tf.zeros_like(label_mask)), dtype=tf.float32)

    pairwise_mask = group_size_mask * label_mask
    loss *= pairwise_mask

    if self.lambda_metric:
        # Compute each element's rank
        rank_mat = self.compute_rank(scores, group_size)
        if self.lambda_metric['metric'] == 'ndcg':
            # Ideal dcg
            idcg = self.compute_dcg(labels, labels, group_size, self.lambda_metric['topk'])
            # delta_score
            delta_score = tf.expand_dims(labels, axis=2) - tf.expand_dims(labels, axis=1)
            # delta rank
            reci_log_rank = tf.math.log(2.0) / tf.math.log(tf.cast(rank_mat, dtype=tf.float32) + 1)
            delta_rank = tf.expand_dims(reci_log_rank, axis=2) - tf.expand_dims(reci_log_rank, axis=1)
            # delta_ndcg = |delta_score * delta_rank| / idcg
            delta_ndcg = tf.abs(delta_score * delta_rank) / tf.expand_dims(tf.expand_dims(idcg, 1), 1)
            # Lambda loss
            loss *= delta_ndcg
    return loss, pairwise_mask
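# A minimal sketch of the pairwise broadcasting above (hypothetical values):
# expanding along axes 2 and 1 turns a [batch, n] score vector into an
# [batch, n, n] matrix of all pairwise differences scores[i] - scores[j].
import tensorflow as tf

scores = tf.constant([[3.0, 1.0, 2.0]])
diff = tf.expand_dims(scores, axis=2) - tf.expand_dims(scores, axis=1)
# diff[0] == [[ 0.,  2.,  1.],
#             [-2.,  0., -1.],
#             [-1.,  1.,  0.]]
pair_loss = -tf.math.log_sigmoid(diff)  # large where a doc is scored lower than the one it is paired against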
def _get_padded_shapes_and_values(feature_type2name: dict, feature_name2num: dict):
    """Returns padded_shapes and padded_values for each feature

    :param feature_type2name: Map from feature types to feature names EXCLUDING 'label'
    """
    ftr_name2padded_shapes = dict()
    ftr_name2padded_values = dict()
    for ftr_type, ftr_name_lst in iterate_items_with_list_val(feature_type2name):
        # Do not handle sparse features. They will be handled separately in the sparse batch function
        if ftr_type in [InputFtrType.SPARSE_FTRS_COLUMN_NAMES, InputFtrType.SHALLOW_TOWER_SPARSE_FTRS_COLUMN_NAMES]:
            continue

        # Padded shapes and values already known and initialized
        if ftr_type in (InputFtrType.LABEL_COLUMN_NAME, InputFtrType.WEIGHT_COLUMN_NAME, InputFtrType.UID_COLUMN_NAME):
            continue

        # The last dimension of dense features is known and could differ across columns. Therefore, we set the
        # padded shape for each dense feature column separately
        if ftr_type == InputFtrType.DENSE_FTRS_COLUMN_NAMES:
            for ftr_name in ftr_name_lst:
                ftr_name2padded_shapes[ftr_name] = tf.TensorShape([None, feature_name2num[ftr_name]])
                ftr_name2padded_values[ftr_name] = Constant()._FTR_TYPE2PADDED_VALUE[ftr_type]
            continue

        for ftr_name in ftr_name_lst:
            ftr_name2padded_shapes[ftr_name] = Constant()._FTR_TYPE2PADDED_SHAPE[ftr_type]
            ftr_name2padded_values[ftr_name] = Constant()._FTR_TYPE2PADDED_VALUE[ftr_type]

    label_padded_shapes = {feature_type2name[InputFtrType.LABEL_COLUMN_NAME]: tf.TensorShape([None])}
    label_padded_values = {feature_type2name[InputFtrType.LABEL_COLUMN_NAME]: Constant()._LABEL_PADDING}

    if InputFtrType.WEIGHT_COLUMN_NAME in feature_type2name:
        label_padded_shapes[feature_type2name[InputFtrType.WEIGHT_COLUMN_NAME]] = tf.TensorShape([])
        label_padded_values[feature_type2name[InputFtrType.WEIGHT_COLUMN_NAME]] = 1.0
    if InputFtrType.UID_COLUMN_NAME in feature_type2name:
        label_padded_shapes[feature_type2name[InputFtrType.UID_COLUMN_NAME]] = tf.TensorShape([])
        label_padded_values[feature_type2name[InputFtrType.UID_COLUMN_NAME]] = tf.cast(0, tf.int64)

    ftr_name2padded_shapes = (ftr_name2padded_shapes, label_padded_shapes)
    ftr_name2padded_values = (ftr_name2padded_values, label_padded_values)
    return ftr_name2padded_shapes, ftr_name2padded_values
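# A hypothetical usage sketch: the (features, labels) tuples returned above match
# the element structure produced by _split_features_and_labels, so they plug
# directly into tf.data's padded_batch. `dataset` and `batch_size` are assumed
# to come from the surrounding pipeline.
padded_shapes, padded_values = _get_padded_shapes_and_values(feature_type2name, feature_name2num)
dataset = dataset.padded_batch(batch_size,
                               padded_shapes=padded_shapes,
                               padding_values=padded_values)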