Example #1
    def testPredict(self):
        """Tests predict()"""
        dataset = data_fn.input_fn_tfrecord(
            input_pattern=self.data_dir,
            batch_size=self.batch_size,
            mode=tf.estimator.ModeKeys.EVAL,
            feature_type2name=self.feature_type2name,
            feature_name2num=self.feature_name2num,
            input_pipeline_context=None,
        )

        detext_model = model.create_detext_model(self.feature_type2name,
                                                 task_type=self.task_type,
                                                 **self.deep_match_param)
        predicted_output = train_flow_helper.predict_with_additional_info(
            dataset, detext_model, self.feature_type2name)

        for output in predicted_output:
            for key in [
                    train_flow_helper._SCORES,
                    self.feature_type2name.get(
                        InputFtrType.WEIGHT_COLUMN_NAME,
                        Constant()._DEFAULT_WEIGHT_FTR_NAME),
                    self.feature_type2name.get(
                        InputFtrType.UID_COLUMN_NAME,
                        Constant()._DEFAULT_UID_FTR_NAME),
                    self.feature_type2name[InputFtrType.LABEL_COLUMN_NAME]
            ]:
                self.assertIn(key, output)
Example #2
 def _predict_with_additional_info(inputs, label):
     """Predicts scores with additional info (uid, weight, label) """
     uid_ftr_name = feature_type2name.get(InputFtrType.UID_COLUMN_NAME,
                                          Constant()._DEFAULT_UID_FTR_NAME)
     weight_ftr_name = feature_type2name.get(
         InputFtrType.WEIGHT_COLUMN_NAME,
         Constant()._DEFAULT_WEIGHT_FTR_NAME)
     label_ftr_name = feature_type2name[InputFtrType.LABEL_COLUMN_NAME]
     return {
         _SCORES: predict_step_fn(inputs, model, feature_type2name),
         uid_ftr_name: label[uid_ftr_name],
         weight_ftr_name: label[weight_ftr_name],
         label_ftr_name: label[label_ftr_name]
     }
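_predict_with_additional_info reads like an inner closure: model, predict_step_fn, feature_type2name, and _SCORES come from the enclosing scope of train_flow_helper.predict_with_additional_info used in Example #1. A minimal sketch of consuming its output, assuming the mapped dataset yields one such dict per batch (the collection pattern below is illustrative, not from the source):

import numpy as np

scores, uids = [], []
for output in predicted_output:  # each `output` is the dict built by the closure above
    scores.append(output[_SCORES].numpy())
    uids.append(output[uid_ftr_name].numpy())
all_scores = np.concatenate(scores, axis=0)  # scores for the whole dataset
all_uids = np.concatenate(uids, axis=0)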
Example #3
    def _process_data(record, features_schema):
        example = tf.io.parse_single_example(serialized=record,
                                             features=features_schema)

        example = _cast_features_to_smaller_dtype(example, feature_type2name)
        example = _convert_ftrs_to_dense_tensor(
            example, feature_type2name,
            Constant()._CLASSIFICATION_FTR_TYPE_TO_DENSE_DEFAULT_VAL)
        example = _assemble_sparse_ftrs_classification(example,
                                                       feature_type2name,
                                                       feature_name2num)

        feature_type_to_squeeze = [
            InputFtrType.QUERY_COLUMN_NAME, InputFtrType.USER_ID_COLUMN_NAMES,
            InputFtrType.USER_TEXT_COLUMN_NAMES,
            InputFtrType.DOC_TEXT_COLUMN_NAMES,
            InputFtrType.DOC_ID_COLUMN_NAMES, InputFtrType.WEIGHT_COLUMN_NAME,
            InputFtrType.TASK_ID_COLUMN_NAME, InputFtrType.UID_COLUMN_NAME
        ]
        example = _squeeze_ftrs(example, feature_type2name,
                                feature_type_to_squeeze)
        example = _read_specified_features(example, feature_type2name)
        features, labels = _split_features_and_labels(example,
                                                      feature_type2name)
        return features, labels
Example #4
def get_weight(features, labels, feature_type2name, task_ids, task_weights):
    """ Returns the weights adjusted with task_weights

    :param features: dict containing the features in data
    :param labels: dict containing the labels in data
    :param feature_type2name: dict mapping feature types to feature names
    :param task_ids: Task ids, e.g. [0, 1, 2]
    :param task_weights: Task weights, e.g. [0.1, 0.3, 0.6]
    """
    # For multitask training
    weight_ftr_name = feature_type2name.get(
        InputFtrType.WEIGHT_COLUMN_NAME,
        Constant()._DEFAULT_WEIGHT_FTR_NAME)
    weight = labels[weight_ftr_name]

    # Update the weight with each task's weight such that weight per document = weight * task_weight
    if task_ids is not None:
        task_id_field = features[feature_type2name[
            InputFtrType.TASK_ID_COLUMN_NAME]]  # shape=[batch_size,]
        # Expand task_id_field to shape [batch_size, num_tasks]
        expanded_task_id_field = tf.transpose(
            tf.broadcast_to(
                task_id_field,
                [len(task_ids), tf.shape(task_id_field)[0]]))
        task_mask = tf.cast(tf.equal(expanded_task_id_field, task_ids),
                            dtype=tf.float32)
        weight *= tf.reduce_sum(task_mask * task_weights,
                                1)  # shape=[batch_size,]

    return weight
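A standalone toy run of the broadcast-and-mask logic above (values are illustrative only, not from the source):

import tensorflow as tf

task_id_field = tf.constant([0, 2, 1, 0])  # per-example task ids, shape=[batch_size=4]
task_ids = [0, 1, 2]
task_weights = [0.1, 0.3, 0.6]

# Broadcast to [num_tasks, batch_size], then transpose to [batch_size, num_tasks]
expanded_task_id_field = tf.transpose(
    tf.broadcast_to(task_id_field, [len(task_ids), tf.shape(task_id_field)[0]]))
# Each row of task_mask is one-hot at the position of that example's task id
task_mask = tf.cast(tf.equal(expanded_task_id_field, task_ids), dtype=tf.float32)
per_example_task_weight = tf.reduce_sum(task_mask * task_weights, 1)
print(per_example_task_weight.numpy())  # [0.1 0.6 0.3 0.1]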
Example #5
def ranking_transform_fn(dataset,
                         batch_size,
                         mode,
                         feature_type2name,
                         feature_name2num,
                         output_buffer_size,
                         prefetch_size=tf.data.experimental.AUTOTUNE,
                         num_parallel_calls=tf.data.experimental.AUTOTUNE):
    """ Preprocesses datasets for ranking task including
        1. dataset shuffling
        2. record parsing
        3. padding and batching
    """
    if mode == tf.estimator.ModeKeys.TRAIN:
        dataset = dataset.shuffle(output_buffer_size)
        dataset = dataset.repeat()

    def _process_data(record, features_schema, feature_name2num):
        example = tf.io.parse_single_example(serialized=record,
                                             features=features_schema)

        example = _cast_features_to_smaller_dtype(example, feature_type2name)
        example = _convert_ftrs_to_dense_tensor(
            example, feature_type2name,
            Constant()._RANKING_FTR_TYPE_TO_DENSE_DEFAULT_VAL)
        example = _assemble_sparse_ftrs_ranking(example, feature_type2name,
                                                feature_name2num)

        feature_type_to_squeeze = [
            InputFtrType.QUERY_COLUMN_NAME, InputFtrType.USER_ID_COLUMN_NAMES,
            InputFtrType.USER_TEXT_COLUMN_NAMES,
            InputFtrType.WEIGHT_COLUMN_NAME, InputFtrType.TASK_ID_COLUMN_NAME,
            InputFtrType.UID_COLUMN_NAME
        ]
        example = _squeeze_ftrs(example, feature_type2name,
                                feature_type_to_squeeze)
        example = _reshape_ftrs_to_group_wise(example, feature_type2name,
                                              feature_name2num)

        example = _read_specified_features(example, feature_type2name)
        features, labels = _split_features_and_labels(example,
                                                      feature_type2name)
        return features, labels

    features_schema = _get_tfrecord_feature_parsing_schema(
        feature_type2name,
        Constant()._RANKING_FTR_TYPE_TO_SCHEMA, TaskType.RANKING)
    dataset = dataset.map(partial(_process_data,
                                  features_schema=features_schema,
                                  feature_name2num=feature_name2num),
                          num_parallel_calls=num_parallel_calls)

    dataset = batch_dataset(dataset, feature_type2name, feature_name2num,
                            batch_size).map(
                                partial(_add_default_ftr_field,
                                        feature_type2name=feature_type2name),
                                num_parallel_calls=num_parallel_calls)
    dataset = dataset.prefetch(prefetch_size)
    return dataset
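A hedged usage sketch for ranking_transform_fn (the file pattern and the two feature maps are placeholders; the production entry point appears to be data_fn.input_fn_tfrecord from Example #1):

raw_dataset = tf.data.TFRecordDataset(tf.io.gfile.glob('/path/to/train-*.tfrecord'))
train_dataset = ranking_transform_fn(dataset=raw_dataset,
                                     batch_size=32,
                                     mode=tf.estimator.ModeKeys.TRAIN,
                                     feature_type2name=feature_type2name,
                                     feature_name2num=feature_name2num,
                                     output_buffer_size=10000)
for features, labels in train_dataset.take(1):
    pass  # each element is a padded (features dict, labels dict) pair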
Example #6
def classification_transform_fn(
        dataset,
        batch_size,
        mode,
        feature_type2name,
        feature_name2num,
        output_buffer_size,
        prefetch_size=tf.data.experimental.AUTOTUNE,
        num_parallel_calls=tf.data.experimental.AUTOTUNE):
    """ Preprocesses datasets for classification task including
        1. dataset shuffling
        2. record parsing
        3. padding and batching
    """
    if mode == tf.estimator.ModeKeys.TRAIN:
        dataset = dataset.shuffle(output_buffer_size)
        dataset = dataset.repeat()

    def _process_data(record, features_schema):
        example = tf.io.parse_single_example(serialized=record,
                                             features=features_schema)

        example = _cast_features_to_smaller_dtype(example, feature_type2name)
        example = _convert_ftrs_to_dense_tensor(
            example, feature_type2name,
            Constant()._CLASSIFICATION_FTR_TYPE_TO_DENSE_DEFAULT_VAL)
        example = _assemble_sparse_ftrs_classification(example,
                                                       feature_type2name,
                                                       feature_name2num)

        feature_type_to_squeeze = [
            InputFtrType.QUERY_COLUMN_NAME, InputFtrType.USER_ID_COLUMN_NAMES,
            InputFtrType.USER_TEXT_COLUMN_NAMES,
            InputFtrType.DOC_TEXT_COLUMN_NAMES,
            InputFtrType.DOC_ID_COLUMN_NAMES, InputFtrType.WEIGHT_COLUMN_NAME,
            InputFtrType.TASK_ID_COLUMN_NAME, InputFtrType.UID_COLUMN_NAME
        ]
        example = _squeeze_ftrs(example, feature_type2name,
                                feature_type_to_squeeze)
        example = _read_specified_features(example, feature_type2name)
        features, labels = _split_features_and_labels(example,
                                                      feature_type2name)
        return features, labels

    features_schema = _get_tfrecord_feature_parsing_schema(
        feature_type2name,
        Constant()._CLASSIFICATION_FTR_TYPE_TO_SCHEMA, TaskType.CLASSIFICATION)
    dataset = dataset.map(partial(_process_data,
                                  features_schema=features_schema),
                          num_parallel_calls=num_parallel_calls)

    # drop_remainder=True avoids an input batch_size=0 issue in evaluation mode with multi-GPU training
    dataset = dataset.batch(batch_size, drop_remainder=True).map(
        partial(_add_default_ftr_field, feature_type2name=feature_type2name),
        num_parallel_calls=num_parallel_calls).prefetch(prefetch_size)
    return dataset
Example #7
def _add_default_ftr_field(features, labels, feature_type2name: dict):
    """ Adds default feature fields if not exist"""

    # Default weight as feature. Set to 1.0 if not present in data
    if InputFtrType.WEIGHT_COLUMN_NAME not in feature_type2name:
        labels.setdefault(
            Constant()._DEFAULT_WEIGHT_FTR_NAME,
            tf.ones(tf.shape(
                labels[feature_type2name[InputFtrType.LABEL_COLUMN_NAME]])[0],
                    dtype=tf.float32))

    # Default uid as feature for detext integration. Set to -1 if not present in data
    if InputFtrType.UID_COLUMN_NAME not in feature_type2name:
        labels.setdefault(
            Constant()._DEFAULT_UID_FTR_NAME, -tf.ones(tf.shape(
                labels[feature_type2name[InputFtrType.LABEL_COLUMN_NAME]])[0],
                                                       dtype=tf.int64))

    return features, labels
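A toy illustration of the defaulting behavior (assumes neither weight nor uid is present in feature_type2name, so both defaults kick in; 'label' is an illustrative feature name):

features = {}
labels = {'label': tf.constant([[1.0, 0.0], [0.0, 1.0]])}
feature_type2name = {InputFtrType.LABEL_COLUMN_NAME: 'label'}

features, labels = _add_default_ftr_field(features, labels, feature_type2name)
# labels now also maps Constant()._DEFAULT_WEIGHT_FTR_NAME to [1.0, 1.0] (float32)
# and Constant()._DEFAULT_UID_FTR_NAME to [-1, -1] (int64), one entry per example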
Example #8
def compute_softmax_loss(scores, labels):
    """
    It computes the sum of negative log softmax loss:
    -sum_i lables_i * log(exp(scores_i) / (exp(scores_1) + ... + exp(scores_n)))
    """
    # mask the padded documents
    mask = tf.cast(labels != Constant()._LABEL_PADDING, dtype=tf.float32)
    # softmax loss
    loss = mask * labels * (-scores + tf.expand_dims(compute_logsumexp_mask(scores, mask), axis=1))
    return loss
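compute_logsumexp_mask is referenced but not shown. A minimal sketch consistent with its usage above (one logsumexp per query, padded documents excluded); this is an assumed implementation, not necessarily the project's actual one:

def compute_logsumexp_mask(scores, mask):
    """Computes logsumexp over the group axis, ignoring padded positions"""
    # Push padded scores far negative so their exp(.) contributes ~0 to the sum
    masked_scores = scores + (1.0 - mask) * -1e9
    return tf.reduce_logsumexp(masked_scores, axis=-1)  # shape=[batch_size]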
Example #9
def compute_sigmoid_cross_entropy_loss(scores, labels):
    """ Compute loss for pointwise ranking

    :param scores: Tensor  Shape=[batch_size, max_group_size]
    :param labels: Tensor  Shape=[batch_size, max_group_size]
    :param group_size: Tensor  Shape=[batch_size]
    :return: Tensor  Shape=[batch_size, max_group_size]
   """
    mask = tf.cast(labels != Constant()._LABEL_PADDING, dtype=tf.float32)
    loss = mask * tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=scores)
    return tf.reduce_sum(input_tensor=loss, axis=-1)
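A quick toy check of the padding mask (values are illustrative; _LABEL_PADDING is assumed to be a sentinel such as -1 that never appears as a real label):

scores = tf.constant([[2.0, -1.0, 0.0]])
labels = tf.constant([[1.0, 0.0, Constant()._LABEL_PADDING]])
loss = compute_sigmoid_cross_entropy_loss(scores, labels)
# The third document contributes zero loss because its mask entry is 0; loss has shape [1]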
Example #10
 def testComputeSoftmaxLoss(self):
     scores_lst = [
         tf.constant([[0, 1]], dtype=tf.dtypes.float32),
         tf.constant([[2, 1, 3, 0], [5, 2, 7, 9]], dtype=tf.dtypes.float32),
     ]
     labels_lst = [
         tf.constant([[0, 1]], dtype=tf.dtypes.float32),
         tf.constant(
             [[1, 0,
               Constant()._LABEL_PADDING,
               Constant()._LABEL_PADDING],
              [1, 3, 2, Constant()._LABEL_PADDING]],
             dtype=tf.dtypes.float32),
     ]
     group_size_lst = [
         tf.constant([2]),
         tf.constant([2, 3]),
     ]
     self.assertTrue(
         len(scores_lst) == len(labels_lst) == len(group_size_lst))
     for scores, labels, group_size in zip(scores_lst, labels_lst,
                                           group_size_lst):
         self._testComputeSoftmaxLoss(scores, labels, group_size)
Example #11
    def call(self, scores, labels):
        """ Compute the pairwise loss.

        :param scores: A tensor with shape [batch_size, max_group_size]. For each batch, the first element is the score of
        the correct answer.
        :param labels: A matrix with shape [batch_size, max_group_size].  The true scores of each document.
        :return: lambdarank loss and mask. Each with shape [batch_size, max_group_size, max_group_size]
        """

        # For a query, compute the pairwise score difference between any two documents.
        pair_score_diff = tf.expand_dims(scores, axis=2) - tf.expand_dims(scores, axis=1)
        # compute the loss
        loss = -1 * tf.math.log_sigmoid(pair_score_diff)
        # now loss is a [batch_size, max_group_size, max_group_size] tensor that contains all pairwise loss.
        # we only need to keep a subset of the pairs.
        # the first mask is from group_size
        group_size_mask = tf.cast(labels != Constant()._LABEL_PADDING, dtype=tf.float32)
        group_size = tf.reduce_sum(group_size_mask, axis=-1)
        group_size_mask = tf.expand_dims(group_size_mask, axis=2) * tf.expand_dims(group_size_mask, axis=1)
        # the second mask is from labels; only keep the pairs whose 1st label value is larger than the 2nd
        label_mask = tf.expand_dims(labels, axis=2) - tf.expand_dims(labels, axis=1)
        label_mask = tf.cast(tf.greater(label_mask, tf.zeros_like(label_mask)), dtype=tf.float32)
        pairwise_mask = group_size_mask * label_mask
        loss *= pairwise_mask

        if self.lambda_metric:
            # compute each element's rank
            rank_mat = self.compute_rank(scores, group_size)
            if self.lambda_metric['metric'] == 'ndcg':
                # ideal dcg
                idcg = self.compute_dcg(labels, labels, group_size, self.lambda_metric['topk'])
                # delta_score
                delta_score = tf.expand_dims(labels, axis=2) - tf.expand_dims(labels, axis=1)
                # delta rank
                reci_log_rank = tf.math.log(2.0) / tf.math.log(tf.cast(rank_mat, dtype=tf.float32) + 1)
                delta_rank = tf.expand_dims(reci_log_rank, axis=2) - tf.expand_dims(reci_log_rank, axis=1)
                # delta_ndcg = |delta_score * delta_rank| / idcg
                delta_ndcg = tf.abs(delta_score * delta_rank) / tf.expand_dims(tf.expand_dims(idcg, 1), 1)
                # lambda loss
                loss *= delta_ndcg

        return loss, pairwise_mask
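A standalone toy run of the pairwise label mask (one query, three documents; padding omitted for brevity):

labels = tf.constant([[2.0, 0.0, 1.0]])
diff = tf.expand_dims(labels, axis=2) - tf.expand_dims(labels, axis=1)
label_mask = tf.cast(tf.greater(diff, tf.zeros_like(diff)), dtype=tf.float32)
# label_mask[0][i][j] == 1 exactly when doc i's label beats doc j's; here the
# kept pairs are (0, 1), (0, 2) and (2, 1), so three pairwise losses survive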
Example #12
def _get_padded_shapes_and_values(feature_type2name: dict,
                                  feature_name2num: dict):
    """Returns padded_shape and padd_values for each feature

    :param feature_type2name Map from feature types to feature names EXCLUDING 'label'
    """

    ftr_name2padded_shapes = dict()
    ftr_name2padded_values = dict()

    for ftr_type, ftr_name_lst in iterate_items_with_list_val(
            feature_type2name):
        # Do not handle sparse features here; they are handled separately in the sparse batch function
        if ftr_type in [
                InputFtrType.SPARSE_FTRS_COLUMN_NAMES,
                InputFtrType.SHALLOW_TOWER_SPARSE_FTRS_COLUMN_NAMES
        ]:
            continue

        # Padded shapes and values already known and initialized
        if ftr_type in (InputFtrType.LABEL_COLUMN_NAME,
                        InputFtrType.WEIGHT_COLUMN_NAME,
                        InputFtrType.UID_COLUMN_NAME):
            continue

        # The last dimension of each dense feature is known and may differ across features. Therefore, we set the
        #   padded shape for each dense feature column separately
        if ftr_type == InputFtrType.DENSE_FTRS_COLUMN_NAMES:
            for ftr_name in ftr_name_lst:
                ftr_name2padded_shapes[ftr_name] = tf.TensorShape(
                    [None, feature_name2num[ftr_name]])
                ftr_name2padded_values[ftr_name] = Constant(
                )._FTR_TYPE2PADDED_VALUE[ftr_type]
            continue

        for ftr_name in ftr_name_lst:
            ftr_name2padded_shapes[ftr_name] = Constant(
            )._FTR_TYPE2PADDED_SHAPE[ftr_type]
            ftr_name2padded_values[ftr_name] = Constant(
            )._FTR_TYPE2PADDED_VALUE[ftr_type]

    label_padded_shapes = {
        feature_type2name[InputFtrType.LABEL_COLUMN_NAME]:
        tf.TensorShape([None]),
    }
    label_padded_values = {
        feature_type2name[InputFtrType.LABEL_COLUMN_NAME]:
        Constant()._LABEL_PADDING,
    }

    if InputFtrType.WEIGHT_COLUMN_NAME in feature_type2name:
        label_padded_shapes[feature_type2name[
            InputFtrType.WEIGHT_COLUMN_NAME]] = tf.TensorShape([])
        label_padded_values[feature_type2name[
            InputFtrType.WEIGHT_COLUMN_NAME]] = 1.0

    if InputFtrType.UID_COLUMN_NAME in feature_type2name:
        label_padded_shapes[feature_type2name[
            InputFtrType.UID_COLUMN_NAME]] = tf.TensorShape([])
        label_padded_values[feature_type2name[
            InputFtrType.UID_COLUMN_NAME]] = tf.cast(0, tf.int64)

    ftr_name2padded_shapes = (ftr_name2padded_shapes, label_padded_shapes)
    ftr_name2padded_values = (ftr_name2padded_values, label_padded_values)
    return ftr_name2padded_shapes, ftr_name2padded_values
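A hedged sketch of the likely call site (padding with these tuples presumably happens inside batch_dataset from Example #5; the direct call below is an assumption):

padded_shapes, padded_values = _get_padded_shapes_and_values(feature_type2name,
                                                             feature_name2num)
# The (features, labels) tuple structure mirrors the dataset's element structure
dataset = dataset.padded_batch(batch_size,
                               padded_shapes=padded_shapes,
                               padding_values=padded_values)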