Example #1
def test_long_dense_vector():
    # build the feature columns
    feature_columns = [
        SparseFeat(
            'user_id',
            4,
        ),
        SparseFeat(
            'item_id',
            5,
        ),
        DenseFeat("pic_vec", 5)
    ]
    fixlen_feature_names = get_feature_names(feature_columns)

    # build the sample data
    user_id = np.array([[1], [0], [1]])
    item_id = np.array([[3], [2], [1]])
    pic_vec = np.array([[0.1, 0.5, 0.4, 0.3, 0.2], [0.1, 0.5, 0.4, 0.3, 0.2],
                        [0.1, 0.5, 0.4, 0.3, 0.2]])
    label = np.array([1, 0, 1])
    input_dict = {'user_id': user_id, 'item_id': item_id, 'pic_vec': pic_vec}
    model_input = [input_dict[name] for name in fixlen_feature_names]

    # create the model
    model = DeepFM(feature_columns, feature_columns[:-1])

    # model.summary()
    #tf.keras.utils.plot_model(model, "test_compu")

    # train the model
    model.compile('adagrad', 'binary_crossentropy')
    model.fit(model_input, label)
Example #2
    def make_feature_cols(self, dataset, embedding_dim):
        '''Return deepctr feature columns.

        Parameters
        ----------
        dataset :
            A dataset instance.
        embedding_dim : int
            Embedding dimension for the sparse features.

        Returns
        -------
        dnn_feature_columns :
            A list of feature_column instances for the DNN inputs.
        linear_feature_columns :
            A list of feature_column instances for the linear inputs.
        '''
        fixlen_feature_columns = [
            SparseFeat(feat,
                       vocabulary_size=dataset.nunique[feat],
                       embedding_dim=embedding_dim)
            for feat in dataset.sparse_features
        ]
        fixlen_feature_columns += [
            DenseFeat(feat, 1) for feat in dataset.dense_features
        ]

        dnn_feature_columns = fixlen_feature_columns
        linear_feature_columns = fixlen_feature_columns
        return dnn_feature_columns, linear_feature_columns
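
A minimal usage sketch for the method above; `builder` and `dataset` are hypothetical stand-ins for the owning object and a dataset exposing `nunique`, `sparse_features` and `dense_features` as described in the docstring:

# Hedged sketch: build the columns and hand them to a DeepCTR model.
dnn_cols, linear_cols = builder.make_feature_cols(dataset, embedding_dim=4)
feature_names = get_feature_names(linear_cols + dnn_cols)
model = DeepFM(linear_cols, dnn_cols, task='binary')
model.compile('adam', 'binary_crossentropy')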
Example #3
def test_long_dense_vector():
    feature_columns = [
        SparseFeat(
            'user_id',
            4,
        ),
        SparseFeat(
            'item_id',
            5,
        ),
        DenseFeat("pic_vec", 5)
    ]
    fixlen_feature_names = get_feature_names(feature_columns)

    user_id = np.array([[1], [0], [1]])
    item_id = np.array([[3], [2], [1]])
    pic_vec = np.array([[0.1, 0.5, 0.4, 0.3, 0.2], [0.1, 0.5, 0.4, 0.3, 0.2],
                        [0.1, 0.5, 0.4, 0.3, 0.2]])
    label = np.array([1, 0, 1])

    input_dict = {'user_id': user_id, 'item_id': item_id, 'pic_vec': pic_vec}
    model_input = [input_dict[name] for name in fixlen_feature_names]

    model = DeepFM(feature_columns, feature_columns[:-1])
    model.compile('adagrad', 'binary_crossentropy')
    model.fit(model_input, label)
Example #4
def get_xy_random2(X, y, cols_family={}):
    # X = np.random.rand(100,30)
    # y = np.random.binomial(n=1, p=0.5, size=[100])

    ## PREPROCESSING STEPS
    # convert into a DataFrame; pandas column names must be strings
    cols = [str(i) for i in range(X.shape[1])]
    data = pd.DataFrame(X, columns=cols)

    # decide which feature columns are sparse and which are dense,
    # as declared by the caller in cols_family
    cols_sparse_features = cols_family['colsparse']
    cols_dense_features = cols_family['coldense']


    # wrap each feature in the SparseFeat or DenseFeat type that DeepCTR expects
    sparse_feat_l = [SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4)
                     for feat in cols_sparse_features]
    dense_feat_l = [DenseFeat(feat, dimension=1) for feat in cols_dense_features]
    feature_col = sparse_feat_l + dense_feat_l

    linear_feat_col = feature_col  # all features used by the linear part of the model
    dnn_feat_col = feature_col  # all features used by the deep part of the model
    feature_names = get_feature_names(linear_feat_col + dnn_feat_col)

    train_model_input = {name: data[name] for name in feature_names}
    X_train, y_train = train_model_input, np.asarray(y)  # accept a Series or an ndarray

    return X_train, y_train, linear_feat_col, dnn_feat_col
Example #5
def get_xy_fd(hash_flag=False):
    feature_columns = [
        SparseFeat('user', 3, embedding_dim=10),
        SparseFeat('gender', 2, embedding_dim=4),
        SparseFeat('item_id', 3 + 1, embedding_dim=8),
        SparseFeat('cate_id', 2 + 1, embedding_dim=4),
        DenseFeat('pay_score', 1)
    ]
    feature_columns += [
        VarLenSparseFeat(SparseFeat('hist_item_id', vocabulary_size=3 + 1,
                                    embedding_dim=8, embedding_name='item_id'),
                         maxlen=4, length_name="seq_length"),
        VarLenSparseFeat(SparseFeat('hist_cate_id', 2 + 1, embedding_dim=4,
                                    embedding_name='cate_id'),
                         maxlen=4, length_name="seq_length")
    ]
    # Notice: History behavior sequence feature name must start with "hist_".
    behavior_feature_list = ["item_id", "cate_id"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    cate_id = np.array([1, 2, 2])  # 0 is mask value
    pay_score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[1, 2, 3, 0], [3, 2, 1, 0], [1, 2, 0, 0]])
    hist_cate_id = np.array([[1, 2, 2, 0], [2, 2, 1, 0], [1, 2, 0, 0]])
    seq_length = np.array([3, 3, 2])  # the actual length of the behavior sequence

    feature_dict = {'user': uid, 'gender': ugender, 'item_id': iid, 'cate_id': cate_id,
                    'hist_item_id': hist_iid, 'hist_cate_id': hist_cate_id,
                    'pay_score': pay_score, 'seq_length': seq_length}
    x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    y = np.array([1, 0, 1])
    return x, y, feature_columns, behavior_feature_list
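
The columns above (history features prefixed with "hist_" plus a shared "seq_length") match what DeepCTR's behavior-sequence models consume; a hedged training sketch, assuming BST is imported from deepctr.models:

x, y, feature_columns, behavior_feature_list = get_xy_fd()
model = BST(feature_columns, behavior_feature_list, att_head_num=4)
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
history = model.fit(x, y, verbose=1, epochs=1, validation_split=0.5)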
Example #6
def get_mtl_test_data(sample_size=10,
                      embedding_size=4,
                      sparse_feature_num=1,
                      dense_feature_num=1,
                      task_types=('binary', 'binary'),
                      hash_flag=False,
                      prefix='',
                      use_group=False):
    feature_columns = []
    model_input = {}

    for i in range(sparse_feature_num):
        if use_group:
            group_name = str(i % 3)
        else:
            group_name = DEFAULT_GROUP_NAME
        dim = np.random.randint(1, 10)
        feature_columns.append(
            SparseFeat(prefix + 'sparse_feature_' + str(i),
                       dim,
                       embedding_size,
                       use_hash=hash_flag,
                       dtype=tf.int32,
                       group_name=group_name))

    for i in range(dense_feature_num):

        def transform_fn(x):
            return (x - 0.0) / 1.0

        feature_columns.append(
            DenseFeat(prefix + 'dense_feature_' + str(i),
                      1,
                      dtype=tf.float32,
                      transform_fn=transform_fn))

    for fc in feature_columns:
        if isinstance(fc, SparseFeat):
            model_input[fc.name] = np.random.randint(0, fc.vocabulary_size,
                                                     sample_size)
        elif isinstance(fc, DenseFeat):
            model_input[fc.name] = np.random.random(sample_size)
    y_list = []  # multi label
    for task in task_types:
        if task == 'binary':
            y = np.random.randint(0, 2, sample_size)
            y_list.append(y)
        else:
            y = np.random.random(sample_size)
            y_list.append(y)

    return model_input, y_list, feature_columns
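
A hedged sketch of how this generator is typically consumed, assuming a multi-task model such as MMOE imported from deepctr.models and losses matched to the task types:

model_input, y_list, feature_columns = get_mtl_test_data(
    sample_size=100, task_types=('binary', 'regression'))
model = MMOE(feature_columns, task_types=('binary', 'regression'),
             task_names=('task_a', 'task_b'))  # hypothetical task names
model.compile('adam', loss=['binary_crossentropy', 'mae'])
model.fit(model_input, y_list, epochs=1)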
Example #7
def get_xy_random():
    X = np.random.rand(100, 30)
    y = np.random.binomial(n=1, p=0.5, size=[100])

    ## PREPROCESSING STEPS
    # convert into a DataFrame; pandas column names must be strings
    cols = [str(i) for i in range(X.shape[1])]
    data = pd.DataFrame(X, columns=cols)
    data['y'] = y

    # decide which feature columns are sparse and which are dense;
    # all columns here are dense features, so the sparse list is empty
    cols_sparse_features = []
    cols_dense_features = [str(i) for i in range(X.shape[1])]

    # wrap each feature in the SparseFeat or DenseFeat type that DeepCTR expects
    sparse_feat_l = [
        SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4)
        for feat in cols_sparse_features
    ]

    dense_feat_l = [
        DenseFeat(feat, dimension=1) for feat in cols_dense_features
    ]
    feature_col = sparse_feat_l + dense_feat_l

    linear_feat_col = feature_col  # all features used by the linear part of the model
    dnn_feat_col = feature_col  # all features used by the deep part of the model
    feature_names = get_feature_names(linear_feat_col + dnn_feat_col)

    train_full, test = train_test_split(data,
                                        random_state=2021,
                                        stratify=data['y'])
    train, val = train_test_split(train_full,
                                  random_state=2021,
                                  stratify=train_full['y'])

    train_model_input = {name: train[name] for name in feature_names}
    val_model_input = {name: val[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}
    target = 'y'
    ## END OF PREPROCESSING STEPS

    X_train, y_train = train_model_input, train[target].values
    X_val, y_val = val_model_input, val[target].values
    X_test, y_test = test_model_input, test[target].values
    return X_train, X_val, X_test, y_train, y_val, y_test, linear_feat_col, dnn_feat_col
Example #8
    def fit(self, X, y):
        X_ = X.copy()
        self.dense_features = list(X_.columns.difference(self.cat_features))

        logger.debug("MinMaxScaler")
        self.min_max_scaler.fit(X_[self.dense_features])
        X_[self.dense_features] = self.min_max_scaler.transform(
            X_[self.dense_features])

        self._column_mapping(X_)
        X_.columns = [self.columns_mapping[col] for col in X_.columns]

        self.fixlen_feature_columns = [
            SparseFeat(
                self.columns_mapping[feat],
                vocabulary_size=X_[self.columns_mapping[feat]].max() + 1,
                embedding_dim=4,
            ) for feat in self.cat_features
        ] + [
            DenseFeat(
                self.columns_mapping[feat],
                1,
            ) for feat in self.dense_features
        ]
        self.feature_names = get_feature_names(self.fixlen_feature_columns)

        logger.debug("Compile DeepFM model")
        self.model = DeepFM(self.fixlen_feature_columns,
                            self.fixlen_feature_columns,
                            task="binary")
        self.model.compile(
            "adam",
            "binary_crossentropy",
            metrics=["binary_crossentropy"],
        )

        logger.debug("Fit DeepFM")
        train_model_input = {
            name: X_[name].values
            for name in self.feature_names
        }
        self.model.fit(
            train_model_input,
            y,
            batch_size=256,
            epochs=3,
            verbose=2,
            validation_split=0.2,
        )
Example #9
def get_xy_fd():

    feature_columns = [
        SparseFeat('user', 3, embedding_dim=10),
        SparseFeat('gender', 2, embedding_dim=4),
        SparseFeat('item_id', 3 + 1, embedding_dim=8),
        SparseFeat('cate_id', 2 + 1, embedding_dim=4),
        DenseFeat('pay_score', 1)
    ]
    feature_columns += [
        VarLenSparseFeat(SparseFeat('hist_item_id',
                                    vocabulary_size=3 + 1,
                                    embedding_dim=8,
                                    embedding_name='item_id'),
                         maxlen=4),
        VarLenSparseFeat(SparseFeat('hist_cate_id',
                                    2 + 1,
                                    embedding_dim=4,
                                    embedding_name='cate_id'),
                         maxlen=4)
    ]

    behavior_feature_list = ["item_id", "cate_id"]  # base sparse features used by the variable-length features
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    cate_id = np.array([1, 2, 2])  # 0 is mask value
    pay_score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[1, 2, 3, 0], [3, 2, 1, 0], [1, 2, 0, 0]])
    hist_cate_id = np.array([[1, 2, 2, 0], [2, 2, 1, 0], [1, 2, 0, 0]])

    # feature name -> input data
    feature_dict = {
        'user': uid,
        'gender': ugender,
        'item_id': iid,
        'cate_id': cate_id,
        'hist_item_id': hist_iid,
        'hist_cate_id': hist_cate_id,
        'pay_score': pay_score
    }
    x = {
        name: feature_dict[name]
        for name in get_feature_names(feature_columns)
    }
    y = np.array([1, 0, 1])
    return x, y, feature_columns, behavior_feature_list
Example #10
def run_base_experiment(data_path, dataset_type, model_params, model_type,
                        opt):
    if dataset_type == 'critero':
        data_df, sparse_features, dense_features, target = load_citero_dataset(
            data_path)
    else:
        data_df, sparse_features, dense_features, target = load_taboola_dataset(
            data_path)
    data_df = prepare_data_for_train(data_df, sparse_features, dense_features)
    fixlen_feature_columns = [
        SparseFeat(feat,
                   vocabulary_size=data_df[feat].nunique(),
                   embedding_dim=10) for feat in sparse_features
    ] + [DenseFeat(feat, 1) for feat in dense_features]
    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns
    feature_names = get_feature_names(linear_feature_columns +
                                      dnn_feature_columns)
    # 3.generate input data for model
    train, test = train_test_split(data_df, test_size=0.2)
    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}
    batch_size = 1024
    # 4.Define Model,train,predict and evaluate
    model = model_type(linear_feature_columns,
                       dnn_feature_columns,
                       seed=1024,
                       **model_params)
    model.compile(
        optimizer=opt,
        loss="binary_crossentropy",
        metrics=['binary_crossentropy', 'accuracy'],
    )
    history = model.fit(
        train_model_input,
        train[target].values,
        batch_size=batch_size,
        epochs=10,
        verbose=1,
        validation_split=0.2,
    )
    pred_ans = model.predict(test_model_input, batch_size=batch_size)
    print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
    print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))
Example #11
def get_xy_fd(hash_flag=False):
    feature_columns = [
        SparseFeat('user', 3),
        SparseFeat('gender', 2),
        SparseFeat('item', 3 + 1),
        SparseFeat('item_gender', 2 + 1),
        DenseFeat('score', 1)
    ]
    feature_columns += [
        VarLenSparseFeat(SparseFeat('hist_item',
                                    vocabulary_size=3 + 1,
                                    embedding_dim=8,
                                    embedding_name='item'),
                         maxlen=4),
        VarLenSparseFeat(SparseFeat('hist_item_gender',
                                    2 + 1,
                                    embedding_dim=4,
                                    embedding_name='item_gender'),
                         maxlen=4)
    ]

    behavior_feature_list = ["item", "item_gender"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    igender = np.array([1, 2, 1])  # 0 is mask value
    score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
    hist_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])

    feature_dict = {
        'user': uid,
        'gender': ugender,
        'item': iid,
        'item_gender': igender,
        'hist_item': hist_iid,
        'hist_item_gender': hist_igender,
        'score': score
    }

    feature_names = get_feature_names(feature_columns)
    x = {name: feature_dict[name] for name in feature_names}
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
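
This mirrors DeepCTR's run_din example; a hedged training sketch, assuming DIN is imported from deepctr.models:

x, y, feature_columns, behavior_feature_list = get_xy_fd()
model = DIN(feature_columns, behavior_feature_list)
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
history = model.fit(x, y, verbose=1, epochs=1, validation_split=0.5)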
Example #12
def read_data_as_model():
    data = pd.read_csv('GiveMeSomeCredit/cs-training.csv')
    sparse_features = [
        'NumberOfTime30-59DaysPastDueNotWorse', 'NumberOfTimes90DaysLate',
        'NumberOfTime60-89DaysPastDueNotWorse', 'NumberOfDependents'
    ]
    dense_features = [
        'RevolvingUtilizationOfUnsecuredLines', 'age', 'DebtRatio',
        'MonthlyIncome', 'NumberOfOpenCreditLinesAndLoans',
        'NumberRealEstateLoansOrLines'
    ]

    data[sparse_features] = data[sparse_features].fillna(-1, )
    data[dense_features] = data[dense_features].fillna(-1, )
    target = ['SeriousDlqin2yrs']

    # 1.Label Encoding for sparse features,and do simple Transformation for dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2.count #unique features for each sparse field,and record dense feature field name

    fixlen_feature_columns = [
        SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4)
        for feat in sparse_features
    ] + [DenseFeat(feat, 1) for feat in dense_features]

    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns
    feature_names = get_feature_names(linear_feature_columns +
                                      dnn_feature_columns)

    # 3.generate input data for model

    train, test = train_test_split(data, test_size=0.2, random_state=1234)
    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}

    return train, test, train_model_input, test_model_input, dnn_feature_columns, linear_feature_columns, feature_names, target
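
A hedged follow-up mirroring the other snippets on this page: define, train and evaluate a DeepFM on the returned inputs (roc_auc_score assumed imported from sklearn.metrics):

train, test, train_model_input, test_model_input, dnn_feature_columns, \
    linear_feature_columns, feature_names, target = read_data_as_model()
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
model.fit(train_model_input, train[target].values, batch_size=256, epochs=10,
          verbose=2, validation_split=0.2)
pred_ans = model.predict(test_model_input, batch_size=256)
print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))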
Example #13
def train_youtube_model(train_model_input, train_label, embedding_dim,
                        feature_max_idx, his_seq_maxlen, batch_size, epochs,
                        verbose, validation_split):
    """构建youtubednn并完成训练"""
    # 特征封装
    user_feature_columns = [
        SparseFeat('user_id', feature_max_idx['user_id'], embedding_dim),
        VarLenSparseFeat(
            SparseFeat('hist_doc_ids',
                       feature_max_idx['article_id'],
                       embedding_dim,
                       embedding_name="click_doc_id"), his_seq_maxlen, 'mean',
            'hist_len'),
        SparseFeat('u_city', feature_max_idx['city'], embedding_dim),
        SparseFeat('u_age', feature_max_idx['age'], embedding_dim),
        SparseFeat('u_gender', feature_max_idx['gender'], embedding_dim),
        DenseFeat(
            'u_example_age',
            1,
        )
    ]
    doc_feature_columns = [
        SparseFeat('click_doc_id', feature_max_idx['article_id'],
                   embedding_dim)
        # article category/profile features could also be added here later
    ]

    # define the model
    model = YoutubeDNN(user_feature_columns,
                       doc_feature_columns,
                       num_sampled=5,
                       user_dnn_hidden_units=(64, embedding_dim))

    # compile the model
    model.compile(optimizer="adam", loss=sampledsoftmaxloss)

    # train the model; validation_split sets the validation fraction (0 trains on the full data)
    history = model.fit(train_model_input,
                        train_label,
                        batch_size=batch_size,
                        epochs=epochs,
                        verbose=verbose,
                        validation_split=validation_split)

    return model
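
YoutubeDNN (from the companion DeepMatch library) is usually followed by extracting the two towers for retrieval. A hedged sketch: test_user_model_input and all_item_model_input are hypothetical dicts shaped like train_model_input, and user_input/user_embedding (plus the item counterparts) are the attributes DeepMatch exposes on the trained model:

from tensorflow.keras.models import Model

user_embedding_model = Model(inputs=model.user_input, outputs=model.user_embedding)
item_embedding_model = Model(inputs=model.item_input, outputs=model.item_embedding)
user_embs = user_embedding_model.predict(test_user_model_input, batch_size=2 ** 12)
item_embs = item_embedding_model.predict(all_item_model_input, batch_size=2 ** 12)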
Example #14
def custom_model():
    sparse_features = ["C" + str(i) for i in range(1, 27)]
    dense_features = ["I" + str(i) for i in range(1, 14)]
    fixlen_feature_columns = [
        SparseFeat(
            feat,
            vocabulary_size=10000,
            embedding_dim=4,
            dtype="string",
            use_hash=True,
        ) for feat in sparse_features
    ] + [DenseFeat(feat, 1) for feat in dense_features]

    model = WDL(fixlen_feature_columns, fixlen_feature_columns, task="binary")
    return model
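
Because the sparse columns use use_hash=True with dtype="string", raw category strings are hashed on the fly and no LabelEncoder pass is needed. A hedged sketch with toy three-row inputs:

import numpy as np

model = custom_model()
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
# toy input: raw strings for the C* fields, floats for the I* fields
x = {f: np.array(['a', 'b', 'c']) for f in ['C' + str(i) for i in range(1, 27)]}
x.update({f: np.array([0.1, 0.2, 0.3]) for f in ['I' + str(i) for i in range(1, 14)]})
y = np.array([1, 0, 1])
model.fit(x, y, epochs=1)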
Example #15
    def _build_model(self):
        to_drop = config.Keywords_Categories[self.params['category']]
        self._build_category_dict(drop_categories=to_drop)
        attrs_matrix, attrs_max_len = self._get_category_matrix(self.data)
        
        vars_fixlen = [SparseFeat(var, self.data[var].nunique(),
                                  embedding_dim=4)
                       for var in self.features_sparse]
        vars_fixlen += [DenseFeat(var, 1) for var in self.features_dense]
        vars_varlen = [VarLenSparseFeat(
            SparseFeat('categories',
                       vocabulary_size=len(self.attr2index) + 1,
                       embedding_dim=4),
            maxlen=attrs_max_len, combiner='mean',
            weight_name='attrs_weight' if self.params['weight'] else None)]

        self.features_linear = vars_fixlen + vars_varlen
        self.features_dnn = vars_fixlen + vars_varlen

        self.model = DeepFM(self.features_linear, self.features_dnn,
                            task='regression', **self.params_deepfm)
        return attrs_matrix, attrs_max_len
Example #16
def get_xy_fd(use_neg=False, hash_flag=False):
    feature_columns = [
        SparseFeat('user', 3, embedding_dim=10, use_hash=hash_flag),
        SparseFeat('gender', 2, embedding_dim=4, use_hash=hash_flag),
        SparseFeat('item_id', 3 + 1, embedding_dim=8, use_hash=hash_flag),
        SparseFeat('cate_id', 2 + 1, embedding_dim=4, use_hash=hash_flag),
        DenseFeat('pay_score', 1)
    ]

    feature_columns += [
        VarLenSparseFeat(SparseFeat('hist_item_id',
                                    vocabulary_size=3 + 1,
                                    embedding_dim=8,
                                    embedding_name='item_id'),
                         maxlen=4,
                         length_name="seq_length"),
        VarLenSparseFeat(SparseFeat('hist_cate_id',
                                    2 + 1,
                                    embedding_dim=4,
                                    embedding_name='cate_id'),
                         maxlen=4,
                         length_name="seq_length")
    ]

    behavior_feature_list = ["item_id", "cate_id"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    cate_id = np.array([1, 2, 2])  # 0 is mask value
    score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
    hist_cate_id = np.array([[1, 2, 2, 0], [1, 2, 2, 0], [1, 2, 0, 0]])

    behavior_length = np.array([3, 3, 2])

    feature_dict = {
        'user': uid,
        'gender': ugender,
        'item_id': iid,
        'cate_id': cate_id,
        'hist_item_id': hist_iid,
        'hist_cate_id': hist_cate_id,
        'pay_score': score,
        "seq_length": behavior_length
    }

    if use_neg:
        feature_dict['neg_hist_item_id'] = np.array([[1, 2, 3, 0], [1, 2, 3, 0],
                                                     [1, 2, 0, 0]])
        feature_dict['neg_hist_cate_id'] = np.array([[1, 2, 2, 0], [1, 2, 2, 0],
                                                     [1, 2, 0, 0]])
        feature_columns += [
            VarLenSparseFeat(SparseFeat('neg_hist_item_id',
                                        vocabulary_size=3 + 1,
                                        embedding_dim=8,
                                        embedding_name='item_id'),
                             maxlen=4,
                             length_name="seq_length"),
            VarLenSparseFeat(SparseFeat('neg_hist_cate_id',
                                        2 + 1,
                                        embedding_dim=4,
                                        embedding_name='cate_id'),
                             maxlen=4,
                             length_name="seq_length")
        ]

    x = {
        name: feature_dict[name]
        for name in get_feature_names(feature_columns)
    }
    y = np.array([1, 0, 1])
    return x, y, feature_columns, behavior_feature_list
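
This matches the data layout of DeepCTR's run_dien example; a hedged training sketch, assuming DIEN is imported from deepctr.models:

x, y, feature_columns, behavior_feature_list = get_xy_fd(use_neg=True)
model = DIEN(feature_columns, behavior_feature_list,
             dnn_hidden_units=[4, 4, 4], dnn_dropout=0.6,
             gru_type="AUGRU", use_negsampling=True)
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
history = model.fit(x, y, verbose=1, epochs=1, validation_split=0.5)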
Example #17
size_dict = {'Medical Claims Features': 2,
             'Condition Related Features': 2,
             'Lab Claims Features': 2,
             'Pharmacy Claims Features': 2,
             'CMS Features': 2,
             'Demographics': 16,
             'Other features': 16}

dense_features = set(train_data.columns) - set(sparse_features) - set(['transportation_issues', 'person_id_syn'])
dense_features = list(dense_features)

target = ['transportation_issues']

fixlen_feature_columns = [
    SparseFeat(feat,
               vocabulary_size=train_data[feat].nunique(),
               embedding_dim=size_dict[field_info[feat]],
               dtype='int32',
               group_name=field_info[feat]) for feat in sparse_features
] + [DenseFeat(feat, 1) for feat in dense_features]

dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

train, test = train_test_split(train_data, test_size=0.2, random_state=2020)
train_model_input = {name: train[name] for name in feature_names}
test_model_input = {name: test[name] for name in feature_names}

model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary',
               dnn_hidden_units=(2, 256), dnn_dropout=0.0)
opt = keras.optimizers.Adam(learning_rate=0.001)
model.compile(loss="binary_crossentropy", metrics=['binary_crossentropy'],
              optimizer=opt)

history = model.fit(train_model_input, train[target].values, batch_size=64,
                    epochs=10, verbose=1, validation_split=0.2,
                    class_weight={0: 1, 1: 3})
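
A hedged follow-up mirroring the other snippets on this page (log_loss and roc_auc_score assumed imported from sklearn.metrics):

pred_ans = model.predict(test_model_input, batch_size=64)
print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))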
Example #18
        if num > 10000:
            dim = 10
        else:
            if num > 1000:
                dim = 8
            else:
                dim = 4

        if column == 'user_id':
            feature_columns += [SparseFeat(column, 212062 + 1, embedding_dim=dim)]
        elif column == 'merchant_id':
            feature_columns += [SparseFeat(column, 1993 + 1, embedding_dim=dim)]
        elif column == 'action_type':
            feature_columns += [SparseFeat(column, 4 + 1, embedding_dim=dim)]
        else:
            feature_columns += [DenseFeat(column, 1)]

# maxlen is the length of the behavior history; vocabulary_size is the one-hot vocabulary size
feature_columns += [
    VarLenSparseFeat(sparsefeat=SparseFeat('hist_merchant_id', vocabulary_size=1993, embedding_dim=8,
                                           embedding_name='merchant_id'), maxlen=M),
    VarLenSparseFeat(sparsefeat=SparseFeat('hist_action_type', vocabulary_size=4, embedding_dim=4,
                                           embedding_name='action_type'), maxlen=M)]
history_features = ['merchant_id', 'action_type']
print(len(feature_columns))

# build the DIN model
model = DIN(feature_columns, history_features)
# Adam optimizer with binary cross-entropy loss
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
# model.compile(loss=[binary_focal_loss(alpha=.25, gamma=2)], metrics=["accuracy"])
Example #19
def get_xy_fd(hash_flag=False):
    feature_columns = [
        SparseFeat('user', 3, use_hash=hash_flag),
        SparseFeat('gender', 2, use_hash=hash_flag),
        SparseFeat('item', 3 + 1, use_hash=hash_flag),
        SparseFeat('item_gender', 2 + 1, use_hash=hash_flag),
        DenseFeat('score', 1)
    ]
    feature_columns += [
        VarLenSparseFeat(SparseFeat('sess_0_item',
                                    3 + 1,
                                    embedding_dim=4,
                                    use_hash=hash_flag,
                                    embedding_name='item'),
                         maxlen=4),
        VarLenSparseFeat(SparseFeat('sess_0_item_gender',
                                    2 + 1,
                                    embedding_dim=4,
                                    use_hash=hash_flag,
                                    embedding_name='item_gender'),
                         maxlen=4)
    ]
    feature_columns += [
        VarLenSparseFeat(SparseFeat('sess_1_item',
                                    3 + 1,
                                    embedding_dim=4,
                                    use_hash=hash_flag,
                                    embedding_name='item'),
                         maxlen=4),
        VarLenSparseFeat(SparseFeat('sess_1_item_gender',
                                    2 + 1,
                                    embedding_dim=4,
                                    use_hash=hash_flag,
                                    embedding_name='item_gender'),
                         maxlen=4)
    ]

    behavior_feature_list = ["item", "item_gender"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    igender = np.array([1, 2, 1])  # 0 is mask value
    score = np.array([0.1, 0.2, 0.3])

    sess1_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [0, 0, 0, 0]])
    sess1_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [0, 0, 0, 0]])

    sess2_iid = np.array([[1, 2, 3, 0], [0, 0, 0, 0], [0, 0, 0, 0]])
    sess2_igender = np.array([[1, 1, 2, 0], [0, 0, 0, 0], [0, 0, 0, 0]])

    sess_number = np.array([2, 1, 0])

    feature_dict = {
        'user': uid,
        'gender': ugender,
        'item': iid,
        'item_gender': igender,
        'sess_0_item': sess1_iid,
        'sess_0_item_gender': sess1_igender,
        'score': score,
        'sess_1_item': sess2_iid,
        'sess_1_item_gender': sess2_igender,
    }

    x = {
        name: feature_dict[name]
        for name in get_feature_names(feature_columns)
    }
    x["sess_length"] = sess_number

    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
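
The sess_0_/sess_1_ features plus the extra "sess_length" key match what DSIN consumes; a hedged sketch, assuming DSIN is imported from deepctr.models:

x, y, feature_columns, behavior_feature_list = get_xy_fd(hash_flag=False)
model = DSIN(feature_columns, behavior_feature_list, sess_max_count=2,
             dnn_hidden_units=[4, 4, 4])
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
history = model.fit(x, y, verbose=1, epochs=1, validation_split=0.5)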
Example #20
def get_xy_dataset(data_sample=None):
    if data_sample == "avazu":
        df = pd.read_csv(
            'https://raw.githubusercontent.com/shenweichen/DeepCTR/master/examples/avazu_sample.txt'
        )
        df['day'] = df['hour'].apply(lambda x: str(x)[4:6])
        df['hour'] = df['hour'].apply(lambda x: str(x)[6:])

        sparse_features = [
            'hour',
            'C1',
            'banner_pos',
            'site_id',
            'site_domain',
            'site_category',
            'app_id',
            'app_domain',
            'app_category',
            'device_id',
            'device_model',
            'device_type',
            'device_conn_type',  # 'device_ip',
            'C14',
            'C15',
            'C16',
            'C17',
            'C18',
            'C19',
            'C20',
            'C21',
        ]

        df[sparse_features] = df[sparse_features].fillna('-1', )
        target = ['click']

        # 1.Label Encoding for sparse features,and do simple Transformation for dense features
        for feat in sparse_features:
            lbe = LabelEncoder()
            df[feat] = lbe.fit_transform(df[feat])

        # 2.count #unique features for each sparse field,and record dense feature field name
        field_info = dict(C14='user',
                          C15='user',
                          C16='user',
                          C17='user',
                          C18='user',
                          C19='user',
                          C20='user',
                          C21='user',
                          C1='user',
                          banner_pos='context',
                          site_id='context',
                          site_domain='context',
                          site_category='context',
                          app_id='item',
                          app_domain='item',
                          app_category='item',
                          device_model='user',
                          device_type='user',
                          device_conn_type='context',
                          hour='context',
                          device_id='user')

        fixlen_feat_col = [
            SparseFeat(name,
                       vocabulary_size=df[name].nunique(),
                       embedding_dim=16,
                       use_hash=False,
                       dtype='int32',
                       group_name=field_info[name]) for name in sparse_features
        ]

        dnn_feat_col = fixlen_feat_col
        linear_feat_col = fixlen_feat_col
        feature_names = get_feature_names(linear_feat_col + dnn_feat_col)

    elif data_sample == "criteo":
        df = pd.read_csv(
            'https://raw.githubusercontent.com/shenweichen/DeepCTR/master/examples/criteo_sample.txt'
        )
        sparse_features = ['C' + str(i) for i in range(1, 27)]
        dense_features = ['I' + str(i) for i in range(1, 14)]

        df[sparse_features] = df[sparse_features].fillna('-1', )
        df[dense_features] = df[dense_features].fillna(0, )
        target = ['label']

        # 1.Label Encoding for sparse features,and do simple Transformation for dense features
        for feat in sparse_features:
            lbe = LabelEncoder()
            df[feat] = lbe.fit_transform(df[feat])
        mms = MinMaxScaler(feature_range=(0, 1))
        df[dense_features] = mms.fit_transform(df[dense_features])

        # 2.count #unique features for each sparse field,and record dense feature field name
        fixlen_feat_col = [
            SparseFeat(
                feat, vocabulary_size=df[feat].nunique(), embedding_dim=4)
            for feat in sparse_features
        ] + [DenseFeat(feat, 1) for feat in dense_features]

        dnn_feat_col = fixlen_feat_col
        linear_feat_col = fixlen_feat_col
        feature_names = get_feature_names(linear_feat_col + dnn_feat_col)

    elif data_sample == "movielens":
        df = pd.read_csv(
            "https://raw.githubusercontent.com/shenweichen/DeepCTR/master/examples/movielens_sample.txt"
        )
        sparse_features = [
            "movie_id", "user_id", "gender", "age", "occupation", "zip"
        ]
        target = ['rating']

        # 1.Label Encoding for sparse features,and do simple Transformation for dense features
        for feat in sparse_features:
            lbe = LabelEncoder()
            df[feat] = lbe.fit_transform(df[feat])

        # 2.count #unique features for each sparse field
        fixlen_feat_col = [
            SparseFeat(feat, df[feat].nunique(), embedding_dim=4)
            for feat in sparse_features
        ]
        linear_feat_col = fixlen_feat_col
        dnn_feat_col = fixlen_feat_col
        feature_names = get_feature_names(linear_feat_col + dnn_feat_col)

    # 3.generate input data for model
    train_full, test = train_test_split(df,
                                        random_state=2021,
                                        stratify=df[target])
    train, val = train_test_split(train_full,
                                  random_state=2021,
                                  stratify=train_full[target])

    train_model_input = {name: train[name] for name in feature_names}
    val_model_input = {name: val[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}

    X_train, y_train = train_model_input, train[target].values
    X_val, y_val = val_model_input, val[target].values
    X_test, y_test = test_model_input, test[target].values
    return X_train, X_val, X_test, y_train, y_val, y_test, linear_feat_col, dnn_feat_col
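
A hedged usage sketch: train a DeepFM on the returned splits (criteo branch shown):

X_train, X_val, X_test, y_train, y_val, y_test, linear_feat_col, dnn_feat_col = \
    get_xy_dataset(data_sample='criteo')
model = DeepFM(linear_feat_col, dnn_feat_col, task='binary')
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=1)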
Example #21
# label-encode the categorical (sparse) features
for feat in sparse_features:
    lbe = LabelEncoder()
    data[feat] = lbe.fit_transform(data[feat])

mms = MinMaxScaler(feature_range=(0, 1))
data[dense_features] = mms.fit_transform(data[dense_features])

fixlen_feature_columns = [
    SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4)
    for feat in sparse_features
] + [DenseFeat(feat, 1) for feat in dense_features]

dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

train, test = train_test_split(data, test_size=0.2)

train_model_input = {name: train[name].values for name in feature_names}
test_model_input = {name: test[name].values for name in feature_names}

model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
model.compile(
    "adam",
    "binary_crossentropy",
)
Example #22
    data[sparse_features] = data[sparse_features].fillna('-1', )
    data[dense_features] = data[dense_features].fillna(0, )
    target = ['label']

    # 1.Label Encoding for sparse features,and do simple Transformation for dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2.count #unique features for each sparse field,and record dense feature field name

    fixlen_feature_columns = [
        SparseFeat(feat, vocabulary_size=data[feat].max() + 1, embedding_dim=4)
        for feat in sparse_features
    ] + [DenseFeat(feat, 1) for feat in dense_features]

    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns

    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)  # list of feature-name strings

    # 3.generate input data for model
    train, test = train_test_split(data, test_size=0.2, random_state=2020)
    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}

    # 4.Define Model,train,predict and evaluate
    model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
    model.compile("adam", "binary_crossentropy", metrics=['binary_crossentropy'])
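
The snippet stops at compile; a typical continuation, mirroring Example #24 on this page (log_loss and roc_auc_score assumed imported from sklearn.metrics):

    history = model.fit(train_model_input, train[target].values,
                        batch_size=256, epochs=10, verbose=2, validation_split=0.2)
    pred_ans = model.predict(test_model_input, batch_size=256)
    print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
    print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))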
Example #23
          eval_set=[(valid_X, valid_df[label])],
          eval_metric=['auc'],
          early_stopping_rounds=None)
model.save_model(MODEL_PATH_XGB)

print('LightGBM model')
model = lgb.LGBMClassifier(n_estimators=200, random_state=RANDOM_SEED)
model.fit(train_X,
          train_df[label],
          eval_set=[(valid_X, valid_df[label])],
          eval_metric=['auc'],
          early_stopping_rounds=None)
model.booster_.save_model(MODEL_PATH_LGB)

print('DCN model')
feature_columns = [DenseFeat(c, 1) for c in num_feat] + [
    SparseFeat(c, n + 1, 'auto') for c, n in train_X[cat_feat].max().items()
]
model = DCN(feature_columns,
            feature_columns,
            cross_num=4,
            dnn_use_bn=True,
            cross_parameterization='matrix',
            seed=RANDOM_SEED)
model.compile('adam', 'binary_crossentropy', metrics=['AUC'])
model.fit([x for _, x in train_X.items()],
          train_df[label],
          validation_data=([x for _, x in valid_X.items()], valid_df[label]),
          shuffle=False,
          epochs=1)
model.save(MODEL_PATH_DCN)
Example #24
def main(model_dir, data_dir, train_steps, model_name):
    data = pd.read_csv(os.path.join(data_dir, 'criteo_sample.txt'))

    sparse_features = ['C' + str(i) for i in range(1, 27)]
    dense_features = ['I' + str(i) for i in range(1, 14)]

    data[sparse_features] = data[sparse_features].fillna('-1', )
    data[dense_features] = data[dense_features].fillna(0, )
    target = ['label']

    # 1.Label Encoding for sparse features,and do simple Transformation for dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2.count #unique features for each sparse field,and record dense feature field name

    fixlen_feature_columns = [
        SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4)
        for feat in sparse_features
    ] + [DenseFeat(feat, 1) for feat in dense_features]

    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns

    feature_names = get_feature_names(linear_feature_columns +
                                      dnn_feature_columns)

    # 3.generate input data for model

    train, test = train_test_split(data, test_size=0.2, random_state=2020)
    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}

    # 4.Define Model,train,predict and evaluate
    if model_name == 'DeepFM':
        model = DeepFM(linear_feature_columns,
                       dnn_feature_columns,
                       task='binary')
    elif model_name == 'FNN':
        model = FNN(linear_feature_columns, dnn_feature_columns, task='binary')
    elif model_name == 'WDL':
        model = WDL(linear_feature_columns, dnn_feature_columns, task='binary')
    elif model_name == 'MLR':
        model = MLR(linear_feature_columns, dnn_feature_columns, task='binary')
    elif model_name == 'NFM':
        model = NFM(linear_feature_columns, dnn_feature_columns, task='binary')
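    # NOTE: DIN, DIEN and DSIN actually take (dnn_feature_columns, behavior_feature_list)
    # rather than the generic (linear, dnn) pair used below, so those branches
    # likely need a behavior feature list to run.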
    elif model_name == 'DIN':
        model = DIN(linear_feature_columns, dnn_feature_columns, task='binary')
    elif model_name == 'CCPM':
        model = CCPM(linear_feature_columns,
                     dnn_feature_columns,
                     task='binary')
    elif model_name == 'PNN':
        model = PNN(linear_feature_columns, dnn_feature_columns, task='binary')
    elif model_name == 'AFM':
        model = AFM(linear_feature_columns, dnn_feature_columns, task='binary')
    elif model_name == 'DCN':
        model = DCN(linear_feature_columns, dnn_feature_columns, task='binary')
    elif model_name == 'DIEN':
        model = DIEN(linear_feature_columns,
                     dnn_feature_columns,
                     task='binary')
    elif model_name == 'DSIN':
        model = DSIN(linear_feature_columns,
                     dnn_feature_columns,
                     task='binary')
    elif model_name == 'xDeepFM':
        model = xDeepFM(linear_feature_columns,
                        dnn_feature_columns,
                        task='binary')
    elif model_name == 'AutoInt':
        model = AutoInt(linear_feature_columns,
                        dnn_feature_columns,
                        task='binary')
    elif model_name == 'ONN':
        model = ONN(linear_feature_columns, dnn_feature_columns, task='binary')
    elif model_name == 'FGCNN':
        model = FGCNN(linear_feature_columns,
                      dnn_feature_columns,
                      task='binary')
    elif model_name == 'FiBiNET':
        model = FiBiNET(linear_feature_columns,
                        dnn_feature_columns,
                        task='binary')
    elif model_name == 'FLEN':
        model = FLEN(linear_feature_columns,
                     dnn_feature_columns,
                     task='binary')
    else:
        print(model_name + ' is not supported now.')
        return

    gpus = int(os.getenv('SM_NUM_GPUS', '0'))
    print('gpus:', gpus)
    if gpus > 1:
        from tensorflow.keras.utils import multi_gpu_model
        model = multi_gpu_model(model, gpus=gpus)

    model.compile(
        "adam",
        "binary_crossentropy",
        metrics=['binary_crossentropy'],
    )

    history = model.fit(
        train_model_input,
        train[target].values,
        batch_size=256,
        epochs=train_steps,
        verbose=2,
        validation_split=0.2,
    )
    pred_ans = model.predict(test_model_input, batch_size=256)
    try:
        print("test LogLoss", round(log_loss(test[target].values, pred_ans),
                                    4))
    except Exception as e:
        print(e)
    try:
        print("test AUC", round(roc_auc_score(test[target].values, pred_ans),
                                4))
    except Exception as e:
        print(e)

    model.save_weights(os.path.join(model_dir, 'DeepFM_w.h5'))
Example #25
    data[dense_features] = np.log(data[dense_features] + 1.0)
    test[dense_features] = np.log(test[dense_features] + 1.0)

    print('data.shape', data.shape)
    print('data.columns', data.columns.tolist())
    print('unique date_: ', data['date_'].unique())

    train = data[data['date_'] < 14]
    val = data[data['date_'] == 14]  # day-14 samples serve as the validation set
    # initialize the feedid embedding from the pretrained matrix
    pretrained_feed_embedding_initializer = tf.keras.initializers.Constant(feed_embedding)

    # 2.count #unique features for each sparse field,and record dense feature field name
    fixlen_feature_columns = [
        SparseFeat('feedid', vocabulary_size=data['feedid'].max() + 1, embedding_dim=512,
                   embeddings_initializer=pretrained_feed_embedding_initializer)
    ] + [
        SparseFeat(feat, vocabulary_size=data[feat].max() + 1, embedding_dim=embedding_dim)
        for feat in sparse_features if feat != 'feedid'
    ] + [DenseFeat(feat, 1) for feat in dense_features]

    dnn_feature_columns = fixlen_feature_columns
    feature_names = get_feature_names(dnn_feature_columns)

    # 3.generate input data for model
    train_model_input = {name: train[name] for name in feature_names}
    val_model_input = {name: val[name] for name in feature_names}
    userid_list = val['userid'].astype(str).tolist()
    test_model_input = {name: test[name] for name in feature_names}

    train_labels = [train[y].values for y in target]
    val_labels = [val[y].values for y in target]

    # 4.Define Model,train,predict and evaluate
Example #26
def get_test_data(sample_size=1000,
                  embedding_size=4,
                  sparse_feature_num=1,
                  dense_feature_num=1,
                  sequence_feature=('sum', 'mean', 'max', 'weight'),
                  classification=True,
                  include_length=False,
                  hash_flag=False,
                  prefix='',
                  use_group=False):
    feature_columns = []
    model_input = {}
    sequence_feature = list(sequence_feature)  # local copy; 'weight' is popped below

    if 'weight' in sequence_feature:
        feature_columns.append(
            VarLenSparseFeat(SparseFeat(prefix + "weighted_seq",
                                        vocabulary_size=2,
                                        embedding_dim=embedding_size),
                             maxlen=3,
                             length_name=prefix + "weighted_seq" +
                             "_seq_length",
                             weight_name=prefix + "weight"))
        s_input, s_len_input = gen_sequence(2, 3, sample_size)

        model_input[prefix + "weighted_seq"] = s_input
        model_input[prefix + 'weight'] = np.random.randn(sample_size, 3, 1)
        model_input[prefix + "weighted_seq" + "_seq_length"] = s_len_input
        sequence_feature.pop(sequence_feature.index('weight'))

    for i in range(sparse_feature_num):
        if use_group:
            group_name = str(i % 3)
        else:
            group_name = DEFAULT_GROUP_NAME
        dim = np.random.randint(1, 10)
        feature_columns.append(
            SparseFeat(prefix + 'sparse_feature_' + str(i),
                       dim,
                       embedding_size,
                       use_hash=hash_flag,
                       dtype=tf.int32,
                       group_name=group_name))

    for i in range(dense_feature_num):
        feature_columns.append(
            DenseFeat(prefix + 'dense_feature_' + str(i), 1, dtype=tf.float32))
    for i, mode in enumerate(sequence_feature):
        dim = np.random.randint(1, 10)
        maxlen = np.random.randint(1, 10)
        feature_columns.append(
            VarLenSparseFeat(SparseFeat(prefix + 'sequence_' + mode,
                                        vocabulary_size=dim,
                                        embedding_dim=embedding_size),
                             maxlen=maxlen,
                             combiner=mode))

    for fc in feature_columns:
        if isinstance(fc, SparseFeat):
            model_input[fc.name] = np.random.randint(0, fc.vocabulary_size,
                                                     sample_size)
        elif isinstance(fc, DenseFeat):
            model_input[fc.name] = np.random.random(sample_size)
        else:
            s_input, s_len_input = gen_sequence(fc.vocabulary_size, fc.maxlen,
                                                sample_size)
            model_input[fc.name] = s_input
            if include_length:
                fc.length_name = prefix + "sequence_" + str(i) + '_seq_length'
                model_input[prefix + "sequence_" + str(i) +
                            '_seq_length'] = s_len_input

    if classification:
        y = np.random.randint(0, 2, sample_size)
    else:
        y = np.random.random(sample_size)

    return model_input, y, feature_columns
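
A hedged usage sketch: the generated inputs can smoke-test any DeepCTR model, e.g. DeepFM:

model_input, y, feature_columns = get_test_data(
    sample_size=100, sparse_feature_num=2, dense_feature_num=2)
model = DeepFM(feature_columns, feature_columns)
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
model.fit(model_input, y, batch_size=100, epochs=1, validation_split=0.5)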
Example #27
                continue
            select_columns_name.append(feat_name)
            for key in vocabulary_size.keys():
                if key in feat_name:
                    vocabulary_size_val = vocabulary_size[key]
                    embedding_name = key
                    break
            varlen_feature_columns.append(VarLenSparseFeat(
                SparseFeat(feat_name, vocabulary_size=vocabulary_size_val + 1, embedding_dim=4,
                           use_hash=False, embedding_name=embedding_name),
                maxlen=1,
                combiner='mean', weight_name=feat_name + '_weight', weight_norm=False))
        else:  # dense feature
            if feat_name[-6:] == 'weight':
                select_columns_name.append(feat_name)
                fixed_feature_columns.append(DenseFeat(feat_name, 1))  # dense feature
            else:
                continue
# if use_hour_features:  # reproduce the best result
#     for feat_name in all_columns:
#         if feat_name[-6:] == 'weight' or feat_name in ['ctr_label', 'cvr_label']:
#             select_columns_name.append(feat_name)
#             continue
#         for key in vocabulary_size.keys():
#             if key in feat_name:
#                 vocabulary_size_val = vocabulary_size[key]
#                 print("key:{0},size:{1},feature name:{2}".format(key, vocabulary_size_val, feat_name))
#                 break
#         print("size:{0},feature name:{1}".format(vocabulary_size_val, feat_name))
#         varlen_feature_columns.append(VarLenSparseFeat(
#             SparseFeat(feat_name, vocabulary_size=vocabulary_size_val + 1, embedding_dim=4, use_hash=False),
Example #28
    data[dense_features] = data[dense_features].fillna(0, )
    test[dense_features] = test[dense_features].fillna(0, )

    data[dense_features] = np.log(data[dense_features] + 1.0)
    test[dense_features] = np.log(test[dense_features] + 1.0)

    logging.info('data.shape: {}'.format(data.shape))
    # logging.info('data.columns', data.columns.tolist())
    # logging.info('unique date_: ', data['date_'].unique())

    train = data[data['date_'] < 14]
    val = data[data['date_'] == 14]  # day-14 samples serve as the validation set

    # 2.count #unique features for each sparse field,and record dense feature field name
    fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].max() + 1, embedding_dim=embedding_dim)
                              for feat in sparse_features] + [DenseFeat(feat, 1) for feat in dense_features]

    dnn_feature_columns = fixlen_feature_columns
    feature_names = get_feature_names(dnn_feature_columns)

    # 3.generate input data for model
    if args.stage == 'offline':
        train_model_input = {name: train[name] for name in feature_names}
        train_labels = [train[y].values for y in target]
    else:
        train_model_input = {name: data[name] for name in feature_names}
        train_labels = [data[y].values for y in target]
    val_model_input = {name: val[name] for name in feature_names}
    userid_list = val['userid'].astype(str).tolist()
    test_model_input = {name: test[name] for name in feature_names}
Example #29
    user_item_list = data.groupby("user_id")['movie_id'].apply(list)

    train_set, test_set = gen_data_set(data, 20)

    train_model_input, train_label = gen_model_input(train_set, user_profile,
                                                     SEQ_LEN)
    test_model_input, test_label = gen_model_input(test_set, user_profile,
                                                   SEQ_LEN)

    # 2.count #unique features for each sparse field and generate feature config for sequence feature

    embedding_dim = 16

    user_feature_columns = [
        SparseFeat('user_id', feature_max_idx['user_id'], embedding_dim),
        DenseFeat("gender"),
        SparseFeat("age", feature_max_idx['age'], embedding_dim),
        SparseFeat("occupation", feature_max_idx['occupation'], embedding_dim),
        SparseFeat("zip", feature_max_idx['zip'], embedding_dim),
        VarLenSparseFeat(
            SparseFeat('hist_movie_id',
                       feature_max_idx['movie_id'],
                       embedding_dim,
                       embedding_name="movie_id"), SEQ_LEN, 'mean',
            'hist_len'),
    ]
    # feature_max_idx['gender'], embedding_dim
    item_feature_columns = [
        SparseFeat('movie_id', feature_max_idx['movie_id'], embedding_dim)
    ]
Example #30
    def _model_fn(features, labels, mode, config):
        train_flag = (mode == tf.estimator.ModeKeys.TRAIN)
        with variable_scope(DNN_SCOPE_NAME):
            sparse_feature_columns = []
            dense_feature_columns = []
            varlen_sparse_feature_columns = []

            for feat in dnn_feature_columns:

                new_feat_name = list(feat.parse_example_spec.keys())[0]
                if new_feat_name in ['hist_price_id', 'hist_des_id']:
                    varlen_sparse_feature_columns.append(
                        VarLenSparseFeat(SparseFeat(new_feat_name,
                                                    vocabulary_size=100,
                                                    embedding_dim=32,
                                                    use_hash=False),
                                         maxlen=3))
                elif is_embedding(feat):
                    sparse_feature_columns.append(
                        SparseFeat(new_feat_name,
                                   vocabulary_size=feat[0]._num_buckets + 1,
                                   embedding_dim=feat.dimension))
                else:
                    dense_feature_columns.append(DenseFeat(new_feat_name))

            history_feature_columns = []
            sparse_varlen_feature_columns = []
            history_fc_names = list(
                map(lambda x: "hist_" + x, history_feature_list))
            for fc in varlen_sparse_feature_columns:
                feature_name = fc.name
                if feature_name in history_fc_names:
                    history_feature_columns.append(fc)
                else:
                    sparse_varlen_feature_columns.append(fc)
            my_feature_columns = sparse_feature_columns + dense_feature_columns + varlen_sparse_feature_columns
            embedding_dict = create_embedding_matrix(my_feature_columns,
                                                     l2_reg_embedding,
                                                     seed,
                                                     prefix="")

            query_emb_list = embedding_lookup(embedding_dict,
                                              features,
                                              sparse_feature_columns,
                                              history_feature_list,
                                              history_feature_list,
                                              to_list=True)
            print('query_emb_list', query_emb_list)
            print('embedding_dict', embedding_dict)
            print('history_feature_columns', history_feature_columns)
            keys_emb_list = embedding_lookup(embedding_dict,
                                             features,
                                             history_feature_columns,
                                             history_fc_names,
                                             history_fc_names,
                                             to_list=True)
            print('keys_emb_list', keys_emb_list)
            dnn_input_emb_list = embedding_lookup(
                embedding_dict,
                features,
                sparse_feature_columns,
                mask_feat_list=history_feature_list,
                to_list=True)
            print('dnn_input_emb_list', dnn_input_emb_list)
            dense_value_list = get_dense_input(features, dense_feature_columns)
            sequence_embed_dict = varlen_embedding_lookup(
                embedding_dict, features, sparse_varlen_feature_columns)
            sequence_embed_list = get_varlen_pooling_list(
                sequence_embed_dict,
                features,
                sparse_varlen_feature_columns,
                to_list=True)

            dnn_input_emb_list += sequence_embed_list

            keys_emb = concat_func(keys_emb_list, mask=True)
            deep_input_emb = concat_func(dnn_input_emb_list)
            query_emb = concat_func(query_emb_list, mask=True)
            hist = AttentionSequencePoolingLayer(
                att_hidden_size,
                att_activation,
                weight_normalization=att_weight_normalization,
                supports_masking=True)([query_emb, keys_emb])

            deep_input_emb = tf.keras.layers.Concatenate()(
                [NoMask()(deep_input_emb), hist])
            deep_input_emb = tf.keras.layers.Flatten()(deep_input_emb)
            dnn_input = combined_dnn_input([deep_input_emb], dense_value_list)
            output = DNN(dnn_hidden_units,
                         dnn_activation,
                         l2_reg_dnn,
                         dnn_dropout,
                         dnn_use_bn,
                         seed=seed)(dnn_input)
            final_logit = tf.keras.layers.Dense(
                1,
                use_bias=False,
                kernel_initializer=tf.keras.initializers.glorot_normal(seed))(
                    output)
        #             logits_list.append(final_logit)
        #         logits = add_func(logits_list)
        #             print(labels)
        #             tf.summary.histogram(final_logit + '/final_logit', final_logit)
        return deepctr_model_fn(features,
                                mode,
                                final_logit,
                                labels,
                                task,
                                linear_optimizer,
                                dnn_optimizer,
                                training_chief_hooks=training_chief_hooks)