Example #1
def _dssm_model(features, labels, mode, params):
    """
    Returns:
        构建双塔模型
    """
    user_emb = tf.feature_column.input_layer(
        features, params['feature_columns']['user_columns'])
    good_emb = tf.feature_column.input_layer(
        features, params['feature_columns']['good_columns'])

    with tf.name_scope('user'):
        user_emb = build_deep_layers(user_emb, params, mode, name='user')
    with tf.name_scope('goods'):
        good_emb = build_deep_layers(good_emb, params, mode, name='good')

    head = head_lib._binary_logistic_or_multi_class_head(
        n_classes=2,
        weight_column=None,
        label_vocabulary=None,
        loss_reduction=losses.Reduction.SUM)
    logits = tf.layers.dense(
        tf.multiply(user_emb, good_emb),
        units=head.logits_dimension,
        kernel_initializer=tf.glorot_uniform_initializer())
    preds = tf.sigmoid(logits)

    # similarity = tf.reduce_sum(tf.multiply(user_emb, good_emb), axis=-1)
    # predictions = tf.nn.sigmoid(similarity)

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {'probabilities': preds}
        return tf.estimator.EstimatorSpec(mode, predictions=predictions)

    loss = tf.reduce_sum(
        tf.nn.sigmoid_cross_entropy_with_logits(
            labels=tf.cast(labels['ctr_label'], tf.float32),
            logits=tf.cast(logits, tf.float32)))

    auc = tf.metrics.auc(labels['ctr_label'], preds)
    metrics = {'auc': auc}
    tf.summary.scalar('auc', auc[1])

    if mode == tf.estimator.ModeKeys.EVAL:
        return tf.estimator.EstimatorSpec(mode,
                                          loss=loss,
                                          eval_metric_ops=metrics)

    assert mode == tf.estimator.ModeKeys.TRAIN
    start_learning_rate = params['learning_rate']
    global_step = tf.train.get_global_step()
    learning_rate = tf.train.exponential_decay(
        learning_rate=start_learning_rate,
        global_step=global_step,
        decay_steps=params['decay_steps'],
        decay_rate=params['decay_rate'],
        staircase=False)
    tf.summary.scalar('learning_rate', learning_rate)
    optimizer = tf.train.AdagradOptimizer(learning_rate=learning_rate)
    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
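
A minimal sketch of how _dssm_model could be wired into a tf.estimator.Estimator. The params keys (a nested feature_columns dict with 'user_columns'/'good_columns', learning_rate, decay_steps, decay_rate) are taken from the model_fn above; the concrete column definitions, hidden_units, and paths are illustrative assumptions.

# Sketch only: wiring _dssm_model into an Estimator (values are assumptions).
import tensorflow as tf

user_columns = [tf.feature_column.embedding_column(
    tf.feature_column.categorical_column_with_hash_bucket('user_id', 100000), 20)]
good_columns = [tf.feature_column.embedding_column(
    tf.feature_column.categorical_column_with_hash_bucket('good_id', 100000), 20)]

estimator = tf.estimator.Estimator(
    model_fn=_dssm_model,
    model_dir='./dssm_model',       # assumed path
    params={
        'feature_columns': {'user_columns': user_columns,
                            'good_columns': good_columns},
        'hidden_units': [128, 64],  # assumed to be consumed by build_deep_layers
        'learning_rate': 0.01,
        'decay_steps': 10000,
        'decay_rate': 0.96,
    })
# The input_fn must yield (features, labels) with labels['ctr_label'].
# estimator.train(input_fn=train_input_fn, steps=10000)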
Example #2
def xdeepfm_model_fn(features, labels, mode, params):
    net = tf.feature_column.input_layer(features, params['feature_columns'])

    last_deep_layer = _build_deep_layers(net, params)
    last_xdeepfm_layer = _build_xdeepfm_layers(net, params)

    if params['use_xdeepfm']:
        print('--use xdeepfm layer--')
        last_layer = tf.concat([last_deep_layer, last_xdeepfm_layer], 1)
    else:
        last_layer = last_deep_layer

    # head = tf.contrib.estimator.binary_classification_head(loss_reduction=losses.Reduction.SUM)
    head = head_lib._binary_logistic_or_multi_class_head(  # pylint: disable=protected-access
        n_classes=2,
        weight_column=None,
        label_vocabulary=None,
        loss_reduction=losses.Reduction.SUM)
    logits = tf.layers.dense(
        last_layer,
        units=head.logits_dimension,
        kernel_initializer=tf.glorot_uniform_initializer())
    optimizer = tf.train.AdagradOptimizer(
        learning_rate=params['learning_rate'])
    # optimizer = tf.train.AdamOptimizer(learning_rate=params['learning_rate'])
    preds = tf.sigmoid(logits)
    user_id = features['user_id']
    label = features['label']

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'probabilities': preds,
            'user_id': user_id,
            'label': label
        }
        export_outputs = {
            'regression':
            tf.estimator.export.RegressionOutput(predictions['probabilities'])
        }
        return tf.estimator.EstimatorSpec(mode,
                                          predictions=predictions,
                                          export_outputs=export_outputs)

    loss = tf.reduce_sum(
        tf.nn.sigmoid_cross_entropy_with_logits(logits=logits, labels=labels))
    # loss = focal_loss(logits=logits, labels=labels, alpha=0.5, gamma=6, beta=1)
    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
    if mode == tf.estimator.ModeKeys.TRAIN:
        return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)

    return head.create_estimator_spec(
        features=features,
        mode=mode,
        labels=labels,
        logits=logits,
        train_op_fn=lambda loss: optimizer.minimize(
            loss, global_step=tf.train.get_global_step()))
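
Because the PREDICT branch defines export_outputs, the trained estimator can be exported for serving. A minimal sketch, assuming the serving request carries serialized tf.Example protos and that estimator / feature_columns refer to the objects built from the same params; note the model_fn also reads features['user_id'] and features['label'], so those must be covered by the parsing spec as well.

# Sketch only: exporting the xDeepFM estimator for online serving.
feature_spec = tf.feature_column.make_parse_example_spec(feature_columns)
# 'user_id' and 'label' are read unconditionally by the model_fn, so the
# serving request must provide them too (add them to feature_spec if they
# are not already covered by the feature columns).
serving_input_receiver_fn = (
    tf.estimator.export.build_parsing_serving_input_receiver_fn(feature_spec))
estimator.export_savedmodel('./export', serving_input_receiver_fn)  # assumed dir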
Example #3
def din_model_fn(features, labels, mode, params):
    net = tf.feature_column.input_layer(features, params['feature_columns'])

    attention_keyword = tf.string_to_hash_bucket_fast(
        features["keyword_attention"], 500000)
    attention_keyword_embeddings = tf.get_variable(
        name="attention_keyword_embeddings", dtype=tf.float32,
        shape=[500000, 20])
    # shape (batch_size, len, embedding_size)
    attention_keyword_emb = tf.nn.embedding_lookup(attention_keyword_embeddings,
                                                   attention_keyword)

    attention_creativeid = tf.string_to_hash_bucket_fast(
        tf.as_string(features["creative_id"]), 200000)
    attention_creativeid_embeddings = tf.get_variable(
        name="attention_creativeid_embeddings", dtype=tf.float32,
        shape=[200000, 20])
    # shape (batch_size, 1, embedding_size)
    attention_creativeid_emb = tf.nn.embedding_lookup(attention_creativeid_embeddings,
                                                      attention_creativeid)

    keyword_creativeid_attention = attention_layer(
        attention_creativeid_emb, attention_keyword_emb)  # (batch_size, embedding_size)

    last_deep_layer = build_deep_layers(net, params)
    last_cross_layer = build_cross_layers(net, params)

    last_layer = tf.concat([last_deep_layer, last_cross_layer, keyword_creativeid_attention], 1)


    # head = tf.contrib.estimator.binary_classification_head(loss_reduction=losses.Reduction.SUM)
    head = head_lib._binary_logistic_or_multi_class_head(  # pylint: disable=protected-access
        n_classes=2, weight_column=None, label_vocabulary=None, loss_reduction=losses.Reduction.SUM)
    logits = tf.layers.dense(last_layer, units=head.logits_dimension,
                             kernel_initializer=tf.glorot_uniform_initializer())
    optimizer = tf.train.AdagradOptimizer(learning_rate=params['learning_rate'])
    preds = tf.sigmoid(logits)
    user_id = features['user_id']
    label = features['label']

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'probabilities': preds,
            'user_id': user_id,
            'label': label
        }
        export_outputs = {
            'regression': tf.estimator.export.RegressionOutput(predictions['probabilities'])
        }
        return tf.estimator.EstimatorSpec(mode, predictions=predictions, export_outputs=export_outputs)

    return head.create_estimator_spec(
        features=features,
        mode=mode,
        labels=labels,
        logits=logits,
        train_op_fn=lambda loss: optimizer.minimize(loss, global_step=tf.train.get_global_step())
    )
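
attention_layer is not shown in this example. Below is a minimal sketch of a DIN-style attention pooling that matches the shape comments above (query (batch_size, 1, embedding_size), keys (batch_size, len, embedding_size), output (batch_size, embedding_size)); this is an assumed implementation, not the original helper, and it does not mask padded positions.

def attention_layer(query, keys):
    """Sketch of DIN-style attention pooling (assumed implementation)."""
    seq_len = tf.shape(keys)[1]
    queries = tf.tile(query, [1, seq_len, 1])                 # (batch, len, emb)
    # Standard DIN attention input: query, key, and their interactions.
    din_input = tf.concat(
        [queries, keys, queries - keys, queries * keys], axis=-1)
    att = tf.layers.dense(din_input, 32, activation=tf.nn.relu)
    att = tf.layers.dense(att, 1, activation=None)            # (batch, len, 1)
    weights = tf.nn.softmax(att, axis=1)                      # normalize over the sequence
    return tf.reduce_sum(weights * keys, axis=1)              # (batch, emb)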
Example #4
def esmm_model_fn(features, labels, mode, params):
  net = tf.feature_column.input_layer(features, params['feature_columns'])
  last_ctr_layer = build_deep_layers(net, params)
  last_cvr_layer = build_deep_layers(net, params)

  #head = tf.contrib.estimator.binary_classification_head(loss_reduction=losses.Reduction.SUM)
  head = head_lib._binary_logistic_or_multi_class_head(  # pylint: disable=protected-access
                  n_classes=2, weight_column=None, label_vocabulary=None, loss_reduction=losses.Reduction.SUM)
  ctr_logits = tf.layers.dense(last_ctr_layer, units=head.logits_dimension,
                               kernel_initializer=tf.glorot_uniform_initializer())
  cvr_logits = tf.layers.dense(last_cvr_layer, units=head.logits_dimension,
                               kernel_initializer=tf.glorot_uniform_initializer())
  ctr_preds = tf.sigmoid(ctr_logits)
  cvr_preds = tf.sigmoid(cvr_logits)
  ctcvr_preds = tf.multiply(ctr_preds, cvr_preds)

  optimizer = tf.train.AdagradOptimizer(learning_rate=params['learning_rate'])
  ctr_label = labels['ctr_label']
  cvr_label = labels['cvr_label']
  ctr_loss = tf.reduce_sum(tf.nn.sigmoid_cross_entropy_with_logits(
      labels=tf.cast(ctr_label, tf.float32), logits=ctr_logits))
  ctcvr_loss = tf.reduce_sum(tf.losses.log_loss(
      labels=tf.cast(cvr_label, tf.float32), predictions=ctcvr_preds))
  loss = ctr_loss + ctcvr_loss  # a weighting factor could be added here, as in multi-task loss formulations

  user_id = features['user_id']
  click_label = features['label']
  conversion_label = features['is_conversion']


  if mode == tf.estimator.ModeKeys.PREDICT:
    predictions = {
      'ctr_preds': ctr_preds,
      'cvr_preds': cvr_preds,
      'ctcvr_preds': ctcvr_preds,
      'user_id': user_id,
      'click_label': click_label,
      'conversion_label': conversion_label
    }
    export_outputs = {
      'regression': tf.estimator.export.RegressionOutput(predictions['cvr_preds'])  # required for online serving
    }
    return tf.estimator.EstimatorSpec(mode, predictions=predictions, export_outputs=export_outputs)

  elif mode == tf.estimator.ModeKeys.EVAL:
    return tf.estimator.EstimatorSpec(mode, loss=loss)

  else:
    train_op = optimizer.minimize(loss, global_step=tf.train.get_global_step())
    return tf.estimator.EstimatorSpec(mode, loss=loss, train_op=train_op)
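
ESMM trains pCTR against the click label and pCTCVR = pCTR * pCVR against the conversion label, both over the full impression space, which is why this model_fn expects labels as a dict with 'ctr_label' and 'cvr_label'. A small sketch of the mapping an input_fn could apply; taking 'label' as the click column and 'is_conversion' as the conversion column is an assumption inferred from the feature names used in the PREDICT branch.

# Sketch only: building the labels dict esmm_model_fn expects.
def _to_esmm_labels(features):
    labels = {
        'ctr_label': tf.cast(features['label'], tf.float32),
        'cvr_label': tf.cast(features['is_conversion'], tf.float32),
    }
    return features, labels
# dataset = dataset.map(_to_esmm_labels)  # applied inside the input_fn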
  """
Example #5
def afm_model_fn(features, labels, mode, params):
    net = tf.feature_column.input_layer(features, params['feature_columns'])

    last_deep_layer = build_deep_layers(net, params)

    last_layer = build_afm_layers(net, params)
    # head = tf.contrib.estimator.binary_classification_head(loss_reduction=losses.Reduction.SUM)
    head = head_lib._binary_logistic_or_multi_class_head(  # pylint: disable=protected-access
        n_classes=2,
        weight_column=None,
        label_vocabulary=None,
        loss_reduction=losses.Reduction.SUM)
    logits = tf.layers.dense(
        last_layer,
        units=head.logits_dimension,
        kernel_initializer=tf.glorot_uniform_initializer())
    optimizer = tf.train.AdagradOptimizer(
        learning_rate=params['learning_rate'])
    preds = tf.sigmoid(logits)
    user_id = features['user_id']
    label = features['label']

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'probabilities': preds,
            'user_id': user_id,
            'label': label
        }
        export_outputs = {
            'regression':
            tf.estimator.export.RegressionOutput(predictions['probabilities'])
        }
        return tf.estimator.EstimatorSpec(mode,
                                          predictions=predictions,
                                          export_outputs=export_outputs)

    return head.create_estimator_spec(
        features=features,
        mode=mode,
        labels=labels,
        logits=logits,
        train_op_fn=lambda loss: optimizer.minimize(
            loss, global_step=tf.train.get_global_step()))
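
build_afm_layers is not defined in this snippet. Below is a minimal sketch of the attentional-FM idea it presumably implements: form all pairwise element-wise interactions of the field embeddings, score them with a small attention network, and return their weighted sum. Names and sizes here are assumptions; _check_fm_columns is the helper used in Example #6 below.

def build_afm_layers(net, params):
    """Sketch of an attentional FM pooling layer (assumed implementation)."""
    column_num, dimension = _check_fm_columns(params['feature_columns'])
    emb = tf.reshape(net, (-1, column_num, dimension))

    # All pairwise element-wise products v_i * v_j, i < j.
    interactions = []
    for i in range(column_num):
        for j in range(i + 1, column_num):
            interactions.append(emb[:, i, :] * emb[:, j, :])
    interactions = tf.stack(interactions, axis=1)              # (batch, num_pairs, dim)

    # Attention network scores each pair; softmax normalizes over pairs.
    att = tf.layers.dense(interactions, params.get('attention_factor', 16),
                          activation=tf.nn.relu)
    att = tf.layers.dense(att, 1, activation=None)             # (batch, num_pairs, 1)
    weights = tf.nn.softmax(att, axis=1)
    return tf.reduce_sum(weights * interactions, axis=1)       # (batch, dim)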
Example #6
def dfm_model_fn(features, labels, mode, params):
    net = tf.feature_column.input_layer(
        features, params['feature_columns']
    )  # shape(batch_size, column_num * embedding_size)
    last_deep_layer = build_deep_layers(net, params)

    column_num, dimension = _check_fm_columns(params['feature_columns'])
    feature_embeddings = tf.reshape(
        net,
        (-1, column_num, dimension))  # (batch_size,column_num, embedding_size)

    # sum_square part
    summed_feature_embeddings = tf.reduce_sum(feature_embeddings,
                                              1)  # (batch_size,embedding_size)
    summed_square_feature_embeddings = tf.square(summed_feature_embeddings)

    # square-sum part
    squared_feature_embeddings = tf.square(feature_embeddings)
    squared_sum_feature_embeddings = tf.reduce_sum(squared_feature_embeddings,
                                                   1)

    fm_second_order = 0.5 * tf.subtract(summed_square_feature_embeddings,
                                        squared_sum_feature_embeddings)
    # print(tf.shape(fm_second_order))
    # print(fm_second_order.get_shape())

    if params['use_fm']:
        print('--use fm--')
        last_layer = tf.concat([fm_second_order, last_deep_layer], 1)
    else:
        last_layer = last_deep_layer
    # head = tf.contrib.estimator.binary_classification_head(loss_reduction=losses.Reduction.SUM)
    head = head_lib._binary_logistic_or_multi_class_head(
        # pylint: disable=protected-access
        n_classes=2,
        weight_column=None,
        label_vocabulary=None,
        loss_reduction=losses.Reduction.SUM)
    logits = tf.layers.dense(
        last_layer,
        units=head.logits_dimension,
        kernel_initializer=tf.glorot_uniform_initializer())
    optimizer = tf.train.AdagradOptimizer(
        learning_rate=params['learning_rate'])

    preds = tf.sigmoid(logits)
    # print(tf.shape(preds))
    # print(preds.get_shape())
    user_id = features['user_id']
    label = features['label']
    if mode == tf.estimator.ModeKeys.EVAL:
        accuracy = tf.metrics.accuracy(labels=labels['class'],
                                       predictions=tf.to_float(
                                           tf.greater_equal(preds, 0.5)))
        auc = tf.metrics.auc(labels['class'], preds)
        label_mean = metrics_lib.mean(labels['class'])
        prediction_mean = metrics_lib.mean(preds)

        prediction_squared_difference = tf.math.squared_difference(
            preds, prediction_mean[0])
        prediction_squared_sum = tf.reduce_sum(prediction_squared_difference)
        num_predictions = tf.to_float(tf.size(preds))
        s_deviation = tf.sqrt(prediction_squared_sum / num_predictions), \
                      accuracy[0]  # standard deviation

        c_variation = tf.to_float(s_deviation[0] / prediction_mean[0]), \
                      accuracy[0]  # coefficient of variation

        # group_auc = tf.to_float(cal_group_auc(labels['class'], preds, labels['user_id'])), accuracy[0] # group auc

        metrics = {
            'accuracy': accuracy,
            'auc': auc,
            'label/mean': label_mean,
            'prediction/mean': prediction_mean,
            'standard deviation': s_deviation,
            'coefficient of variation': c_variation
        }
        #          'group auc': group_auc}
        tf.summary.scalar('accuracy', accuracy[1])
        tf.summary.scalar('auc', auc[1])
        tf.summary.scalar('label/mean', label_mean[1])
        tf.summary.scalar('prediction/mean', prediction_mean[1])
        tf.summary.scalar('s_deviation', s_deviation[1])
        tf.summary.scalar('c_variation', c_variation[1])
        # tf.summary.scalar('group_auc', group_auc[1])

        loss = tf.reduce_sum(
            tf.nn.sigmoid_cross_entropy_with_logits(labels=labels['class'],
                                                    logits=logits))
        # print(tf.shape(loss))
        # print(loss.get_shape())
        return tf.estimator.EstimatorSpec(mode,
                                          loss=loss,
                                          eval_metric_ops=metrics)

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'probabilities': preds,
            'user_id': user_id,
            'label': label
        }
        export_outputs = {
            'prediction': tf.estimator.export.PredictOutput(predictions)
        }
        return tf.estimator.EstimatorSpec(mode,
                                          predictions=predictions,
                                          export_outputs=export_outputs)

    return head.create_estimator_spec(
        features=features,
        mode=mode,
        labels=labels,
        logits=logits,
        train_op_fn=lambda loss: optimizer.minimize(
            loss, global_step=tf.train.get_global_step()))
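
The sum_square / square_sum construction above is the standard FM trick: for field embeddings v_1..v_m, 0.5 * ((v_1 + ... + v_m)^2 - (v_1^2 + ... + v_m^2)), taken element-wise, equals the sum of all pairwise products v_i * v_j with i < j, so every second-order interaction is obtained in O(m * d) instead of O(m^2 * d) work. A quick NumPy check of the identity (illustrative only):

# Sanity check of the FM second-order identity used in dfm_model_fn.
import numpy as np

v = np.random.randn(5, 4)   # 5 fields, embedding size 4
fast = 0.5 * (v.sum(axis=0) ** 2 - (v ** 2).sum(axis=0))
slow = sum(v[i] * v[j] for i in range(5) for j in range(i + 1, 5))
assert np.allclose(fast, slow)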
Example #7
  def __init__(
      self,
      hidden_units,
      feature_columns,
      model_dir=None,
      n_classes=2,
      weight_column=None,
      label_vocabulary=None,
      optimizer='Adagrad',
      activation_fn=nn.relu,
      dropout=None,
      input_layer_partitioner=None,
      config=None,
      warm_start_from=None,
      loss_reduction=losses.Reduction.SUM,
  ):
    """Initializes a `DNNClassifier` instance.

    Args:
      hidden_units: Iterable of number of hidden units per layer. All layers are
        fully connected. Ex. `[64, 32]` means first layer has 64 nodes and
        second one has 32.
      feature_columns: An iterable containing all the feature columns used by
        the model. All items in the set should be instances of classes derived
        from `_FeatureColumn`.
      model_dir: Directory to save model parameters, graph, etc. This can
        also be used to load checkpoints from the directory into an estimator to
        continue training a previously saved model.
      n_classes: Number of label classes. Defaults to 2, namely binary
        classification. Must be > 1.
      weight_column: A string or a `_NumericColumn` created by
        `tf.feature_column.numeric_column` defining feature column representing
        weights. It is used to down weight or boost examples during training. It
        will be multiplied by the loss of the example. If it is a string, it is
        used as a key to fetch weight tensor from the `features`. If it is a
        `_NumericColumn`, raw tensor is fetched by key `weight_column.key`,
        then weight_column.normalizer_fn is applied on it to get weight tensor.
      label_vocabulary: A list of strings representing possible label values. If
        given, labels must be string type and have any value in
        `label_vocabulary`. If it is not given, that means labels are
        already encoded as integer or float within [0, 1] for `n_classes=2` and
        encoded as integer values in {0, 1,..., n_classes-1} for `n_classes`>2 .
        Also there will be errors if vocabulary is not provided and labels are
        string.
      optimizer: An instance of `tf.Optimizer` used to train the model. Can also
        be a string (one of 'Adagrad', 'Adam', 'Ftrl', 'RMSProp', 'SGD'), or
        callable. Defaults to Adagrad optimizer.
      activation_fn: Activation function applied to each layer. If `None`, will
        use `tf.nn.relu`.
      dropout: When not `None`, the probability we will drop out a given
        coordinate.
      input_layer_partitioner: Optional. Partitioner for input layer. Defaults
        to `min_max_variable_partitioner` with `min_slice_size` 64 << 20.
      config: `RunConfig` object to configure the runtime settings.
      warm_start_from: A string filepath to a checkpoint to warm-start from, or
        a `WarmStartSettings` object to fully configure warm-starting.  If the
        string filepath is provided instead of a `WarmStartSettings`, then all
        weights are warm-started, and it is assumed that vocabularies and Tensor
        names are unchanged.
      loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how
        to reduce training loss over batch. Defaults to `SUM`.
    """
    head = head_lib._binary_logistic_or_multi_class_head(  # pylint: disable=protected-access
        n_classes, weight_column, label_vocabulary, loss_reduction)
    def _model_fn(features, labels, mode, config):
      """Call the defined shared _dnn_model_fn."""
      return _dnn_model_fn(
          features=features,
          labels=labels,
          mode=mode,
          head=head,
          hidden_units=hidden_units,
          feature_columns=tuple(feature_columns or []),
          optimizer=optimizer,
          activation_fn=activation_fn,
          dropout=dropout,
          input_layer_partitioner=input_layer_partitioner,
          config=config)

    super(DNNClassifier, self).__init__(
        model_fn=_model_fn, model_dir=model_dir, config=config,
        warm_start_from=warm_start_from)
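
A minimal usage sketch for the DNNClassifier constructor above; the column definitions, layer sizes, and model_dir are illustrative assumptions.

# Sketch only: constructing the DNNClassifier defined above.
age = tf.feature_column.numeric_column('age')
occupation = tf.feature_column.embedding_column(
    tf.feature_column.categorical_column_with_hash_bucket('occupation', 1000), 16)

classifier = DNNClassifier(
    hidden_units=[128, 64],           # two fully connected layers
    feature_columns=[age, occupation],
    model_dir='./dnn_model',          # assumed path
    n_classes=2,
    optimizer='Adagrad',
    dropout=0.1)
# classifier.train(input_fn=train_input_fn, steps=1000)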
Example #8
    def __init__(
        self,
        hidden_units,
        feature_columns,
        model_dir=None,
        n_classes=2,
        weight_column=None,
        label_vocabulary=None,
        optimizer='Adagrad',
        activation_fn=nn.relu,
        dropout=None,
        input_layer_partitioner=None,
        config=None,
        warm_start_from=None,
        loss_reduction=losses.Reduction.SUM,
        batch_norm=False,
    ):
        """Initializes a `DNNClassifier` instance.

    Args:
      hidden_units: Iterable of number of hidden units per layer. All layers are
        fully connected. Ex. `[64, 32]` means first layer has 64 nodes and
        second one has 32.
      feature_columns: An iterable containing all the feature columns used by
        the model. All items in the set should be instances of classes derived
        from `_FeatureColumn`.
      model_dir: Directory to save model parameters, graph, etc. This can
        also be used to load checkpoints from the directory into an estimator to
        continue training a previously saved model.
      n_classes: Number of label classes. Defaults to 2, namely binary
        classification. Must be > 1.
      weight_column: A string or a `_NumericColumn` created by
        `tf.feature_column.numeric_column` defining feature column representing
        weights. It is used to down weight or boost examples during training. It
        will be multiplied by the loss of the example. If it is a string, it is
        used as a key to fetch weight tensor from the `features`. If it is a
        `_NumericColumn`, raw tensor is fetched by key `weight_column.key`,
        then weight_column.normalizer_fn is applied on it to get weight tensor.
      label_vocabulary: A list of strings representing possible label values. If
        given, labels must be string type and have any value in
        `label_vocabulary`. If it is not given, that means labels are
        already encoded as integer or float within [0, 1] for `n_classes=2` and
        encoded as integer values in {0, 1,..., n_classes-1} for `n_classes`>2 .
        Also there will be errors if vocabulary is not provided and labels are
        string.
      optimizer: An instance of `tf.Optimizer` used to train the model. Can also
        be a string (one of 'Adagrad', 'Adam', 'Ftrl', 'RMSProp', 'SGD'), or
        callable. Defaults to Adagrad optimizer.
      activation_fn: Activation function applied to each layer. If `None`, will
        use `tf.nn.relu`.
      dropout: When not `None`, the probability we will drop out a given
        coordinate.
      input_layer_partitioner: Optional. Partitioner for input layer. Defaults
        to `min_max_variable_partitioner` with `min_slice_size` 64 << 20.
      config: `RunConfig` object to configure the runtime settings.
      warm_start_from: A string filepath to a checkpoint to warm-start from, or
        a `WarmStartSettings` object to fully configure warm-starting.  If the
        string filepath is provided instead of a `WarmStartSettings`, then all
        weights are warm-started, and it is assumed that vocabularies and Tensor
        names are unchanged.
      loss_reduction: One of `tf.losses.Reduction` except `NONE`. Describes how
        to reduce training loss over batch. Defaults to `SUM`.
      batch_norm: Whether to use batch normalization after each hidden layer.
    """
        head = head_lib._binary_logistic_or_multi_class_head(  # pylint: disable=protected-access
            n_classes, weight_column, label_vocabulary, loss_reduction)

        shared_state_manager = feature_column_v2.maybe_create_shared_state_manager(
            feature_columns)

        def _model_fn(features, labels, mode, config):
            """Call the defined shared _dnn_model_fn."""
            return _dnn_model_fn(
                features=features,
                labels=labels,
                mode=mode,
                head=head,
                hidden_units=hidden_units,
                feature_columns=tuple(feature_columns or []),
                optimizer=optimizer,
                activation_fn=activation_fn,
                dropout=dropout,
                input_layer_partitioner=input_layer_partitioner,
                config=config,
                batch_norm=batch_norm,
                shared_state_manager=shared_state_manager)

        super(DNNClassifier, self).__init__(model_fn=_model_fn,
                                            model_dir=model_dir,
                                            config=config,
                                            warm_start_from=warm_start_from)
Example #9
def transformer_model_fn(features, labels, mode, params):
    net = tf.feature_column.input_layer(features, params['feature_columns'])

    last_click_creativeid = tf.string_to_hash_bucket_fast(
        features["user_click_creatives_att"], 200000)
    creativeid_embeddings = tf.get_variable(
        name="attention_creativeid_embeddings",
        dtype=tf.float32,
        shape=[200000, 20])
    last_click_creativeid_emb = tf.nn.embedding_lookup(creativeid_embeddings,
                                                       last_click_creativeid)

    last_click_productid = tf.string_to_hash_bucket_fast(
        features["user_click_products_att"], 40000)
    productid_embeddings = tf.get_variable(
        name="attention_productid_embeddings",
        dtype=tf.float32,
        shape=[40000, 20])
    last_click_productid_emb = tf.nn.embedding_lookup(productid_embeddings,
                                                      last_click_productid)

    his_click_emb = tf.concat(
        [last_click_creativeid_emb, last_click_productid_emb],
        2)  # (batch_size,10,emb_size*2)

    transformerNetwork_click = TransformerNetwork(
        params['transformer_num_units'],
        params['num_blocks'],
        params['num_heads'],
        max_len=10,
        dropout_rate=params['dropout_rate'],
        pos_fixed=True)
    mask_click = tf.expand_dims(
        tf.cast(tf.not_equal(features["user_click_creatives_att"], "0"),
                tf.float32), -1)  # (batch_size, 10, 1)

    transformer_click_outputs = transformerNetwork_click(
        his_click_emb, mask_click)  # (batch_size, max_len, num_units)
    transformer_click_outputs = tf.reshape(
        tf.reduce_sum(transformer_click_outputs, 1),
        shape=[-1, params['transformer_num_units']])

    last_deep_layer = build_deep_layers(net, params)
    last_cross_layer = build_cross_layers(net, params)

    last_layer = tf.concat(
        [last_deep_layer, last_cross_layer, transformer_click_outputs], 1)

    # head = tf.contrib.estimator.binary_classification_head(loss_reduction=losses.Reduction.SUM)
    head = head_lib._binary_logistic_or_multi_class_head(  # pylint: disable=protected-access
        n_classes=2,
        weight_column=None,
        label_vocabulary=None,
        loss_reduction=losses.Reduction.SUM)
    logits = tf.layers.dense(
        last_layer,
        units=head.logits_dimension,
        kernel_initializer=tf.glorot_uniform_initializer())
    optimizer = tf.train.AdagradOptimizer(
        learning_rate=params['learning_rate'])
    preds = tf.sigmoid(logits)
    user_id = features['user_id']
    label = features['label']

    if mode == tf.estimator.ModeKeys.PREDICT:
        predictions = {
            'probabilities': preds,
            'user_id': user_id,
            'label': label
        }
        export_outputs = {
            'regression':
            tf.estimator.export.RegressionOutput(predictions['probabilities'])
        }
        return tf.estimator.EstimatorSpec(mode,
                                          predictions=predictions,
                                          export_outputs=export_outputs)

    return head.create_estimator_spec(
        features=features,
        mode=mode,
        labels=labels,
        logits=logits,
        train_op_fn=lambda loss: optimizer.minimize(
            loss, global_step=tf.train.get_global_step()))
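
The mask above implies that user_click_creatives_att (and user_click_products_att) arrive as fixed-length string tensors of shape (batch_size, 10), padded with "0" where a user has fewer than 10 clicks. A small sketch of producing such a feature; batch_click_histories is a hypothetical Python list of per-user click-id lists, and the padding scheme is inferred from the mask.

# Sketch only: padding click histories to the fixed length the mask expects.
def pad_click_history(ids, max_len=10):
    ids = list(ids)[:max_len]
    return ids + ['0'] * (max_len - len(ids))   # "0" marks padded positions

features['user_click_creatives_att'] = tf.constant(
    [pad_click_history(h) for h in batch_click_histories])   # (batch_size, 10)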