Example #1
def generate_din_feature_columns(data, sparse_features, dense_features):
    feat_lbe_dict = get_glv('feat_lbe_dict')

    sparse_feature_columns = [
        SparseFeat(feat,
                   vocabulary_size=len(feat_lbe_dict[feat].classes_) + 1,
                   embedding_dim=EMBED_DIM)
        for feat in sparse_features if feat not in time_feat
    ]

    dense_feature_columns = [DenseFeat(
        feat,
        1,
    ) for feat in dense_features]

    var_feature_columns = [
        VarLenSparseFeat(SparseFeat(
            'hist_item_id',
            vocabulary_size=len(feat_lbe_dict['item_id'].classes_) + 1,
            embedding_dim=EMBED_DIM,
            embedding_name='item_id'),
                         maxlen=max_seq_len)
    ]

    # DNN side
    dnn_feature_columns = sparse_feature_columns + dense_feature_columns + var_feature_columns
    # FM side
    linear_feature_columns = sparse_feature_columns + dense_feature_columns + var_feature_columns
    # all feature names
    feature_names = get_feature_names(dnn_feature_columns +
                                      linear_feature_columns)

    return feature_names, linear_feature_columns, dnn_feature_columns
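The DNN and linear sides share one behavior-aware column set here. A hedged sketch of feeding the helper's output into DIN (a sketch only: data, sparse_features, dense_features and the actual model inputs are assumed to come from the surrounding module):

from deepctr.models import DIN

feature_names, linear_cols, dnn_cols = generate_din_feature_columns(
    data, sparse_features, dense_features)

# 'item_id' is the behavior key whose history 'hist_item_id' encodes
model = DIN(dnn_cols, ['item_id'])
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])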
Example #2
def simple_pre(df):

    # Label Encoding for sparse features, and normalization for dense numerical features
    for feat in config.sparse_features:
        lbe = LabelEncoder()
        df[feat] = lbe.fit_transform(df[feat])

    mms = MinMaxScaler(feature_range=(0, 1))
    df[config.dense_features] = mms.fit_transform(df[config.dense_features])

    #Generate feature columns
    #For sparse features, we transform them into dense vectors by embedding techniques.
    #For dense numerical features, we concatenate them to the input tensors of fully connected layer.

    # count #unique features for each sparse field
    fixlen_feature_columns = [
        SparseFeat(feat, df[feat].nunique(), embedding_dim=4)
        for feat in config.sparse_features
    ]

    linear_feature_columns = fixlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns

    feature_names = get_feature_names(linear_feature_columns +
                                      dnn_feature_columns)
    return linear_feature_columns, dnn_feature_columns, feature_names
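simple_pre only prepares the columns; a minimal sketch of the training step that typically follows, assuming config.target names the label column and the DeepCTR/sklearn imports used elsewhere on this page:

linear_cols, dnn_cols, feature_names = simple_pre(df)
train, test = train_test_split(df, test_size=0.2)
train_input = {name: train[name] for name in feature_names}
test_input = {name: test[name] for name in feature_names}

model = DeepFM(linear_cols, dnn_cols, task='binary')
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
model.fit(train_input, train[config.target].values, batch_size=256, epochs=10,
          validation_split=0.2)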
Example #3
def get_xy_fd(hash_flag=False):

    # Note: this example uses the older DeepCTR feature-column API, where
    # SparseFeat takes (name, dimension, use_hash) and VarLenSparseFeat takes
    # (name, dimension, maxlen) positionally.
    feature_columns = [SparseFeat('user', 3, hash_flag),
                       SparseFeat('gender', 2, hash_flag),
                       SparseFeat('item', 3 + 1, hash_flag),
                       SparseFeat('item_gender', 2 + 1, hash_flag),
                       DenseFeat('score', 1)]
    feature_columns += [
        VarLenSparseFeat('sess_0_item', 3 + 1, 4, use_hash=hash_flag, embedding_name='item'),
        VarLenSparseFeat('sess_0_item_gender', 2 + 1, 4, use_hash=hash_flag, embedding_name='item_gender'),
        VarLenSparseFeat('sess_1_item', 3 + 1, 4, use_hash=hash_flag, embedding_name='item'),
        VarLenSparseFeat('sess_1_item_gender', 2 + 1, 4, use_hash=hash_flag, embedding_name='item_gender'),
    ]

    behavior_feature_list = ["item", "item_gender"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    igender = np.array([1, 2, 1])  # 0 is mask value
    score = np.array([0.1, 0.2, 0.3])

    sess1_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [0, 0, 0, 0]])
    sess1_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [0, 0, 0, 0]])

    sess2_iid = np.array([[1, 2, 3, 0], [0, 0, 0, 0], [0, 0, 0, 0]])
    sess2_igender = np.array([[1, 1, 2, 0], [0, 0, 0, 0], [0, 0, 0, 0]])

    sess_number = np.array([2, 1, 0])

    feature_dict = {'user': uid, 'gender': ugender, 'item': iid, 'item_gender': igender,
                    'sess_0_item': sess1_iid, 'sess_0_item_gender': sess1_igender, 'score': score,
                    'sess_1_item': sess2_iid, 'sess_1_item_gender': sess2_igender, }

    x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    x["sess_length"] = sess_number
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
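The sess_0_*/sess_1_* naming plus the sess_length input is the shape DSIN (Deep Session Interest Network) expects; a hedged usage sketch, assuming DeepCTR's DSIN with two sessions:

from deepctr.models import DSIN

x, y, feature_columns, behavior_feature_list = get_xy_fd()
model = DSIN(feature_columns, behavior_feature_list, sess_max_count=2)
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
history = model.fit(x, y, verbose=1, epochs=1, validation_split=0.5)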
Example #4
def test_long_dense_vector():

    feature_columns = [
        SparseFeat('user_id', 4),
        SparseFeat('item_id', 5),
        DenseFeat("pic_vec", 5)
    ]
    fixlen_feature_names = get_feature_names(feature_columns)

    user_id = np.array([[1], [0], [1]])
    item_id = np.array([[3], [2], [1]])
    pic_vec = np.array([[0.1, 0.5, 0.4, 0.3, 0.2], [0.1, 0.5, 0.4, 0.3, 0.2],
                        [0.1, 0.5, 0.4, 0.3, 0.2]])
    label = np.array([1, 0, 1])

    input_dict = {'user_id': user_id, 'item_id': item_id, 'pic_vec': pic_vec}
    model_input = [input_dict[name] for name in fixlen_feature_names]

    model = DeepFM(feature_columns, feature_columns[:-1])
    model.compile('adagrad', 'binary_crossentropy')
    model.fit(model_input, label)
Example #5
def train_deepFM():
    k = featureengineer.k
    # Fill missing values, then encode
    data, appsnum, tags_nums = trainmodel.data, trainmodel.appsnum, trainmodel.tags_nums
    data[trainmodel.sparse_features] = data[trainmodel.sparse_features].fillna('-1')
    for feat in trainmodel.dense_features:
        data[feat].fillna(data[feat].dropna().mean(), inplace=True)

    for feat in trainmodel.sparse_features:
        data[feat] = data[feat].astype(str)
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[trainmodel.dense_features] = mms.fit_transform(data[trainmodel.dense_features])


    # Convert to DeepCTR feature columns
    fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].max() + 1, embedding_dim=8)
                              for feat in trainmodel.sparse_features] + \
                             [DenseFeat(feat, 1) for feat in trainmodel.dense_features]

    lgbOut_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].max() + 1, embedding_dim=1)
                              for feat in trainmodel.lgbOut_Features]

    key2index_len = {'applist': appsnum+1, 'new_tag': tags_nums}
    varlen_features = [VarLenSparseFeat(feat, vocabulary_size=key2index_len[feat], maxlen=k,
                                        embedding_dim=8, combiner='mean', weight_name=None)
                       for feat in trainmodel.var_features]

    dnn_feature_columns = fixlen_feature_columns + varlen_features
    linear_feature_columns = fixlen_feature_columns + varlen_features + lgbOut_feature_columns

    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    sparse_dense_features = trainmodel.sparse_features + trainmodel.dense_features + trainmodel.lgbOut_Features

    train, test = train_test_split(data, test_size=0.2)


    train_model_input = {name: train[name] for name in sparse_dense_features}
    test_model_input = {name: test[name] for name in sparse_dense_features}
    for x in trainmodel.var_features:
        if x == 'applist':
            train_model_input[x] = np.array(train[x].tolist())
            test_model_input[x] = np.array(test[x].tolist())
        if x == 'new_tag':
            train_model_input[x] = np.array(train[x].tolist())-appsnum
            test_model_input[x] = np.array(test[x].tolist())-appsnum
    # Build and train the model
    model = DeepFM(linear_feature_columns=linear_feature_columns, dnn_feature_columns=dnn_feature_columns,
                   dnn_hidden_units=(50, 30, 30), l2_reg_linear=0.001, l2_reg_embedding=0.001,
                   l2_reg_dnn=0, init_std=0.0001, seed=1024, dnn_dropout=0.1, dnn_activation='relu', dnn_use_bn=True,
                   task='binary')
    model.compile("adam", "binary_crossentropy",metrics=['AUC'], )

    history = model.fit(train_model_input, train['target'].values,
                        batch_size=256, epochs=1, verbose=2, validation_split=0.2, )

    pred_ans = model.predict(test_model_input, batch_size=256)
    print("test AUC", round(roc_auc_score(test['target'].values, pred_ans), 4))
Example #6
def get_xy_fd():
    feature_columns = [
        SparseFeat('user', 3, embedding_dim=10),
        SparseFeat('gender', 2, embedding_dim=4),
        SparseFeat('item', 3 + 1, embedding_dim=8),
        SparseFeat('item_gender', 2 + 1, embedding_dim=4),
        DenseFeat('score', 1)
    ]
    feature_columns += [
        VarLenSparseFeat(SparseFeat('hist_item',
                                    vocabulary_size=3 + 1,
                                    embedding_dim=20,
                                    embedding_name='item'),
                         maxlen=4),
        VarLenSparseFeat(SparseFeat('hist_item_gender',
                                    2 + 1,
                                    embedding_dim=4,
                                    embedding_name='item_gender'),
                         maxlen=4)
    ]

    feature_columns += [DenseFeat('hist_len', 1, dtype="int64")]

    behavior_feature_list = ["item"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    igender = np.array([1, 2, 1])  # 0 is mask value
    score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
    hist_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])
    hist_len = np.array([3, 3, 2])

    feature_dict = {
        'user': uid,
        'gender': ugender,
        'item': iid,
        'item_gender': igender,
        'hist_item': hist_iid,
        'hist_item_gender': hist_igender,
        'hist_len': hist_len,
        'score': score
    }
    x = {
        name: feature_dict[name]
        for name in get_feature_names(feature_columns)
    }
    y = [1, 1, 1]
    return x, y, feature_columns, behavior_feature_list
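This variant feeds hist_len through a DenseFeat instead of a length_name; the output plugs straight into DIN, as in DeepCTR's documented example (imports assumed):

from deepctr.models import DIN

x, y, feature_columns, behavior_feature_list = get_xy_fd()
model = DIN(feature_columns, behavior_feature_list)
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
history = model.fit(x, y, verbose=1, epochs=10, validation_split=0.5)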
Example #7
def get_xy_fd():
    # fixed-length sparse features
    feature_columns = [
        SparseFeat('user', 3, embedding_dim=10),
        SparseFeat('gender', 2, embedding_dim=4),
        SparseFeat('item_id', 3 + 1, embedding_dim=8),
        SparseFeat('cate_id', 2 + 1, embedding_dim=4),
        DenseFeat('pay_score', 1)
    ]
    # variable-length (sequence) sparse features
    feature_columns += [
        VarLenSparseFeat(SparseFeat('hist_item_id',
                                    vocabulary_size=3 + 1,
                                    embedding_dim=8,
                                    embedding_name='item_id'),
                         maxlen=4),
        VarLenSparseFeat(SparseFeat('hist_cate_id',
                                    2 + 1,
                                    embedding_dim=4,
                                    embedding_name='cate_id'),
                         maxlen=4)
    ]

    behavior_feature_list = ["item", "cate_id"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    cate_id = np.array([1, 2, 1])  # 0 is mask value
    pay_score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
    hist_cate_id = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])

    feature_dict = {
        'user': uid,
        'gender': ugender,
        'item_id': iid,
        'cate_id': cate_id,
        'hist_item_id': hist_iid,
        'hist_cate_id': hist_cate_id,
        'pay_score': pay_score
    }
    x = {
        name: feature_dict[name]
        for name in get_feature_names(feature_columns)
    }
    print('x=', x)
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
Example #8
def get_xy_fd(hash_flag=False):

    feature_columns = [
        SparseFeat('user', 3),
        SparseFeat('gender', 2),
        SparseFeat('item', 3 + 1),
        SparseFeat('item_gender', 2 + 1),
        DenseFeat('score', 1)
    ]
    feature_columns += [
        VarLenSparseFeat(SparseFeat('hist_item',
                                    vocabulary_size=3 + 1,
                                    embedding_dim=8,
                                    embedding_name='item'),
                         maxlen=4),
        VarLenSparseFeat(SparseFeat('hist_item_gender',
                                    2 + 1,
                                    embedding_dim=4,
                                    embedding_name='item_gender'),
                         maxlen=4)
    ]

    behavior_feature_list = ["item", "item_gender"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    igender = np.array([1, 2, 1])  # 0 is mask value
    score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
    hist_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])

    feature_dict = {
        'user': uid,
        'gender': ugender,
        'item': iid,
        'item_gender': igender,
        'hist_item': hist_iid,
        'hist_item_gender': hist_igender,
        'score': score
    }

    feature_names = get_feature_names(feature_columns)
    x = {name: feature_dict[name] for name in feature_names}
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
Example #9
    def prepare(self):
        if self.data_format == "deepctr":
            # 1.Label Encoding for sparse features, and
            # simple Transformation for dense features
            for feat in self.sparse_features:
                lbe = LabelEncoder()
                self.input[feat] = lbe.fit_transform(self.input[feat])
                self.encoders[feat] = lbe

            # 2.count #unique features for each sparse field
            fixlen_feature_columns = [
                SparseFeat(feat, self.input[feat].nunique(), embedding_dim=4)
                for feat in self.sparse_features
            ]

            self.linear_feature_columns = fixlen_feature_columns
            self.dnn_feature_columns = fixlen_feature_columns

            self.feature_names = get_feature_names(
                self.linear_feature_columns + self.dnn_feature_columns)

            # 3.generate input data for model
            train, test = train_test_split(self.input,
                                           test_size=self.test_size)

            self.X_train = {
                name: train[name].values
                for name in self.feature_names
            }
            self.y_train = train[self.target].values

            self.X_test = {
                name: test[name].values
                for name in self.feature_names
            }
            self.y_test = test[self.target].values
        else:
            raise ValueError("Unsupported data format: " + self.data_format)
Example #10
def get_xy_fd(use_neg=False, hash_flag=False):

    # Note: older DeepCTR API again; SparseFeat is (name, dimension, use_hash) positionally.
    feature_columns = [SparseFeat('user', 3, hash_flag),
                       SparseFeat('gender', 2, hash_flag),
                       SparseFeat('item', 3 + 1, hash_flag),
                       SparseFeat('item_gender', 2 + 1, hash_flag),
                       DenseFeat('score', 1)]

    # hist_item_gender shares the 'item_gender' embedding, so its vocabulary is 2 + 1
    feature_columns += [VarLenSparseFeat('hist_item', 3 + 1, maxlen=4, embedding_name='item'),
                        VarLenSparseFeat('hist_item_gender', 2 + 1, maxlen=4, embedding_name='item_gender')]

    behavior_feature_list = ["item","item_gender"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])#0 is mask value
    igender = np.array([1, 2, 1])# 0 is mask value
    score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[ 1, 2, 3,0], [ 1, 2, 3,0], [ 1, 2, 0,0]])
    hist_igender = np.array([[1, 1, 2,0 ], [2, 1, 1, 0], [2, 1, 0, 0]])

    behavior_length = np.array([3, 3, 2])

    feature_dict = {'user': uid, 'gender': ugender, 'item': iid, 'item_gender': igender,
                    'hist_item': hist_iid, 'hist_item_gender': hist_igender,
                    'score': score}

    if use_neg:
        feature_dict['neg_hist_item'] = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
        feature_dict['neg_hist_item_gender'] = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])
        # the neg_* sequences reuse the same shared embeddings and vocabularies
        feature_columns += [VarLenSparseFeat('neg_hist_item', 3 + 1, maxlen=4, embedding_name='item'),
                            VarLenSparseFeat('neg_hist_item_gender', 2 + 1, maxlen=4, embedding_name='item_gender')]

    x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
    x["seq_length"] = behavior_length
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
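use_neg plus a seq_length input is the layout DIEN uses for negative sampling; a hedged sketch along the lines of DeepCTR's DIEN example:

from deepctr.models import DIEN

x, y, feature_columns, behavior_feature_list = get_xy_fd(use_neg=True)
model = DIEN(feature_columns, behavior_feature_list,
             gru_type="AUGRU", use_negsampling=True)
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
history = model.fit(x, y, verbose=1, epochs=1, validation_split=0.5)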
Example #11
def client_restful_criteo():
    data = pd.read_csv('./data/criteo_sample.txt')

    sparse_features = ['C' + str(i) for i in range(1, 27)]
    dense_features = ['I' + str(i) for i in range(1, 14)]

    data[sparse_features] = data[sparse_features].fillna('-1', )
    data[dense_features] = data[dense_features].fillna(0, )
    target = ['label']

    # 1.Label Encoding for sparse features, and do simple Transformation for dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2.count #unique features for each sparse field, and record dense feature field name

    fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].nunique(), embedding_dim=4)
                              for feat in sparse_features] + \
                             [DenseFeat(feat, 1) for feat in dense_features]

    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns

    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
    
    # Build a single-instance payload for the TF Serving REST API
    model_input = [{name: data[name].iloc[0] for name in feature_names}]
    print(model_input)
    data = json.dumps({"signature_name": "serving_default", "instances": model_input}, cls=NpEncoder)
    headers = {"content-type": "application/json"}
    json_response = requests.post('http://localhost:8501/v1/models/criteo:predict', data=data, headers=headers)
    json_response = json.loads(json_response.text)
    print(json_response)
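NpEncoder is referenced but not defined in this snippet; a common implementation (an assumption, not the author's code) downcasts NumPy types so json.dumps can serialize the payload:

import json
import numpy as np

class NpEncoder(json.JSONEncoder):
    # Downcast NumPy scalars and arrays to native Python types
    def default(self, obj):
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        return super().default(obj)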
Example #12
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2.count #unique features for each sparse field, and record dense feature field name

    fixlen_feature_columns = [
        SparseFeat(feat, data[feat].nunique()) for feat in sparse_features
    ] + [DenseFeat(feat, 1) for feat in dense_features]

    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns

    feature_names = get_feature_names(linear_feature_columns +
                                      dnn_feature_columns)

    # 3.generate input data for model

    train, test = train_test_split(data, test_size=0.2)

    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}

    # 4.Define Model,train,predict and evaluate
    model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
    model = multi_gpu_model(model, gpus=2)

    model.compile(
        "adam",
        "binary_crossentropy",
        metrics=['binary_crossentropy'],
    )
Example #13
def get_xy_fd(use_neg=False, hash_flag=False):
    feature_columns = [
        SparseFeat('user', 3, embedding_dim=10, use_hash=hash_flag),
        SparseFeat('gender', 2, embedding_dim=4, use_hash=hash_flag),
        SparseFeat('item_id', 3 + 1, embedding_dim=8, use_hash=hash_flag),
        SparseFeat('cate_id', 2 + 1, embedding_dim=4, use_hash=hash_flag),
        DenseFeat('pay_score', 1)
    ]

    feature_columns += [
        VarLenSparseFeat(SparseFeat('hist_item_id',
                                    vocabulary_size=3 + 1,
                                    embedding_dim=8,
                                    embedding_name='item_id'),
                         maxlen=4,
                         length_name="seq_length"),
        VarLenSparseFeat(SparseFeat('hist_cate_id',
                                    2 + 1,
                                    embedding_dim=4,
                                    embedding_name='cate_id'),
                         maxlen=4,
                         length_name="seq_length")
    ]

    behavior_feature_list = ["item_id", "cate_id"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    cate_id = np.array([1, 2, 2])  # 0 is mask value
    score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
    hist_cate_id = np.array([[1, 2, 2, 0], [1, 2, 2, 0], [1, 2, 0, 0]])

    behavior_length = np.array([3, 3, 2])

    feature_dict = {
        'user': uid,
        'gender': ugender,
        'item_id': iid,
        'cate_id': cate_id,
        'hist_item_id': hist_iid,
        'hist_cate_id': hist_cate_id,
        'pay_score': score,
        "seq_length": behavior_length
    }

    if use_neg:
        feature_dict['neg_hist_item_id'] = np.array([[1, 2, 3, 0], [1, 2, 3, 0],
                                                     [1, 2, 0, 0]])
        feature_dict['neg_hist_cate_id'] = np.array([[1, 2, 2, 0], [1, 2, 2, 0],
                                                     [1, 2, 0, 0]])
        feature_columns += [
            VarLenSparseFeat(SparseFeat('neg_hist_item_id',
                                        vocabulary_size=3 + 1,
                                        embedding_dim=8,
                                        embedding_name='item_id'),
                             maxlen=4,
                             length_name="seq_length"),
            VarLenSparseFeat(SparseFeat('neg_hist_cate_id',
                                        2 + 1,
                                        embedding_dim=4,
                                        embedding_name='cate_id'),
                             maxlen=4,
                             length_name="seq_length")
        ]

    x = {
        name: feature_dict[name]
        for name in get_feature_names(feature_columns)
    }
    y = [1, 0, 1]
    return x, y, feature_columns, behavior_feature_list
Example #14
def feature_construct(path,
                      embedding_dim=16,
                      data_sample=100000,
                      test_size=0.2):
    data = load_data(path)
    data = data.sample(data_sample, random_state=SEED)
    # note: extract 'day' before overwriting 'hour' (avazu-style YYMMDDHH format)
    data['day'] = data['hour'].apply(lambda x: str(x)[4:6])
    data['hour'] = data['hour'].apply(lambda x: str(x)[6:])
    target = ['click']
    sparse_features = [
        'hour', 'C1', 'banner_pos', 'site_id', 'site_domain', 'site_category',
        'app_id', 'app_domain', 'app_category', 'device_id', 'device_model',
        'device_type', 'device_conn_type', 'C14', 'C15', 'C16', 'C17', 'C18',
        'C19', 'C20', 'C21'
    ]
    field_info = dict(
        C14='user',
        C15='user',
        C16='user',
        C17='user',
        C18='user',
        C19='user',
        C20='user',
        C21='user',
        C1='user',
        device_model='user',
        device_type='user',
        device_id='user',
        banner_pos='context',
        site_id='context',
        site_domain='context',
        site_category='context',
        device_conn_type='context',
        hour='context',
        app_id='item',
        app_domain='item',
        app_category='item',
    )

    for feature in sparse_features:
        lbe = LabelEncoder()
        data[feature] = lbe.fit_transform(data[feature])

    fixlen_feature_columns = [
        SparseFeat(feature,
                   data[feature].nunique(),
                   embedding_dim=embedding_dim,
                   group_name=field_info[feature])
        for feature in sparse_features
    ]
    linear_feature_columns = fixlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns
    feature_names = get_feature_names(linear_feature_columns +
                                      dnn_feature_columns)

    data_train, data_test = train_test_split(data, test_size=test_size)
    target_train = data_train[target].values
    target_test = data_test[target].values

    train_model_input = {
        name: data_train[name].values
        for name in feature_names
    }
    test_model_input = {name: data_test[name].values for name in feature_names}

    return (train_model_input, target_train), (
        test_model_input,
        target_test), linear_feature_columns, dnn_feature_columns
Example #15
def structural_feature(train, test):
    test['label'] = -1
    data = pd.concat([train, test], axis=0)
    '''Feature engineering >>>>>'''
    # data['year'] = data['date'].dt.year
    # data['month'] = data['date'].dt.month
    # data['day'] = data['date'].dt.day
    data['hour'] = data['date'].dt.hour
    del data['date']

    data['D1+D2'] = data['D1'] + data['D2']
    data['D1-D2'] = data['D1'] - data['D2']
    data['D1/D2'] = data['D1'] / data['D2']

    # data['A_sum'] = data['A1'] + data['A2'] + data['A3']
    data['B_sum'] = data['B1'] + data['B2'] + data['B3']
    # data['C_sum'] = data['C1'] + data['C2'] + data['C3']

    data['A_*'] = data['A1'] * data['A2'] * data['A3']
    data['B_*'] = data['B1'] * data['B2'] * data['B3']
    # data['C_*'] = data['C1'] * data['C2'] * data['C3']

    data['A_+'] = data['A1'] + data['A2'] + data['A3']
    data['B_+'] = data['B1'] + data['B2'] + data['B3']
    data['C_+'] = data['C1'] + data['C2'] + data['C3']

    normalization_columns = [
        'A1', 'A2', 'A3', 'B1', 'B2', 'B3', 'C1', 'C2', 'C3', 'E2', 'E3', 'E5',
        'E7', 'E9', 'E10', 'E13', 'E16', 'E17', 'E19', 'E21', 'E22'
    ]
    for column in normalization_columns:
        data[column] = (data[column] - data[column].min(axis=0)) / (
            data[column].max(axis=0) - data[column].min(axis=0))

    sparse_features = [
        'D1', 'D2', 'E4', 'E8', 'E11', 'E15', 'E18', 'E25', 'hour'
    ]
    dense_features = [
        'E1', 'E2', 'E3', 'E5', 'E6', 'E7', 'E9', 'E10', 'E12', 'E13', 'E14',
        'E16', 'E17', 'E19', 'E20', 'E21', 'E22', 'E23', 'E24',
        'A1', 'A2', 'A3', 'B1', 'B2', 'B3', 'C1', 'C2', 'C3'
    ]

    data[sparse_features] = data[sparse_features].fillna('-1')
    data[dense_features] = data[dense_features].fillna(0)

    # 1.Label Encoding for sparse features, and do simple Transformation for dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2.count #unique features for each sparse field, and record dense feature field name

    fixlen_feature_columns = [
        SparseFeat(feat, data[feat].nunique()) for feat in sparse_features
    ] + [DenseFeat(feat, 1) for feat in dense_features]

    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns

    feature_names = get_feature_names(linear_feature_columns +
                                      dnn_feature_columns)
    '''Feature engineering done <<<<'''

    train = data[data.label != -1]
    test = data[data.label == -1]
    del test['label']
    '''Move the label column to the end'''
    label = train['label']
    del train['label']
    train['label'] = label
    return train, test, feature_names, linear_feature_columns, dnn_feature_columns
Example #16
feats = [i for i in data.columns if i != 'Rating']
X = data[feats]
y = data['Rating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

sparse_features = [
    'UserID', 'MovieID', 'Gender', 'Occupation', 'day', 'weekday'
]
dense_features = ['hour', 'Age']

fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].nunique(),embedding_dim=4) for feat in sparse_features] + \
                         [DenseFeat(feat, 1) for feat in dense_features]

dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

# Predicting Rating with an MSE loss is a regression task
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')
model.compile('adam', 'mse', metrics=['mse'])

feature_names = get_feature_names(fixlen_feature_columns)

train_feed_dict = {name: X_train[name] for name in feature_names}
test_feed_dict = {name: X_test[name] for name in feature_names}

model.fit(train_feed_dict,
          y_train,
          batch_size=256,
          epochs=10,
          validation_split=0.2)
pred_ans = model.predict(test_feed_dict, batch_size=256)
Example #17
def _preprocess_movielens(df, **kw):
    multiple_value = kw.get('multiple_value')
    sparse_col = ["movie_id", "user_id", "gender", "age", "occupation", "zip"]
    target = ['rating']

    # 1.Label Encoding for sparse features, and do simple Transformation for dense features
    for feat in sparse_col:
        lbe = LabelEncoder()
        df[feat] = lbe.fit_transform(df[feat])

    if not multiple_value:
        # 2.count #unique features for each sparse field
        fixlen_cols = [SparseFeat(feat, df[feat].nunique(), embedding_dim=4) for feat in sparse_col]
        linear_cols = fixlen_cols
        dnn_cols = fixlen_cols
        train, test = train_test_split(df, test_size=0.2)
        ytrue = test[target].values
    else:
        ytrue = df[target].values
        hash_feature = kw.get('hash_feature', False)
        if not hash_feature:
            def split(x):
                key_ans = x.split('|')
                for key in key_ans:
                    if key not in key2index:
                        # Notice: input value 0 is a special "padding", so we do not
                        # use 0 to encode valid features for sequence input
                        key2index[key] = len(key2index) + 1
                return [key2index[k] for k in key_ans]

            # preprocess the sequence feature
            key2index = {}
            genres_list = list(map(split, df['genres'].values))
            genres_length = np.array(list(map(len, genres_list)))
            max_len = max(genres_length)
            # Notice : padding=`post`
            genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post', )
            fixlen_cols = [SparseFeat(feat, df[feat].nunique(), embedding_dim=4) for feat in sparse_col]

            use_weighted_sequence = False
            if use_weighted_sequence:
                varlen_cols = [VarLenSparseFeat(SparseFeat('genres', vocabulary_size=len(
                    key2index) + 1, embedding_dim=4), maxlen=max_len, combiner='mean',
                                                weight_name='genres_weight')]  # Notice : value 0 is for padding for sequence input feature
            else:
                varlen_cols = [VarLenSparseFeat(SparseFeat('genres', vocabulary_size=len(
                    key2index) + 1, embedding_dim=4), maxlen=max_len, combiner='mean',
                                                weight_name=None)]  # Notice : value 0 is for padding for sequence input feature

            linear_cols = fixlen_cols + varlen_cols
            dnn_cols = fixlen_cols + varlen_cols

            # generate input data for model
            model_input = {name: df[name] for name in sparse_col}
            model_input["genres"] = genres_list
            model_input["genres_weight"] = np.random.randn(df.shape[0], max_len, 1)


        else:
            df[sparse_col] = df[sparse_col].astype(str)

            # 1.Use hashing encoding on the fly for sparse features, and process sequence features
            genres_list = list(map(lambda x: x.split('|'), df['genres'].values))
            genres_length = np.array(list(map(len, genres_list)))
            max_len = max(genres_length)

            # Notice : padding=`post`
            genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post', dtype=str, value=0)

            # 2.set hashing space for each sparse field and generate feature config for sequence feature
            fixlen_cols = [
                SparseFeat(feat, df[feat].nunique() * 5, embedding_dim=4, use_hash=True, dtype='string')
                for feat in sparse_col]
            varlen_cols = [
                VarLenSparseFeat(
                    SparseFeat('genres', vocabulary_size=100, embedding_dim=4, use_hash=True, dtype="string"),
                    maxlen=max_len, combiner='mean',
                )]  # Notice : value 0 is for padding for sequence input feature

            linear_cols = fixlen_cols + varlen_cols
            dnn_cols = fixlen_cols + varlen_cols
            feature_names = get_feature_names(linear_cols + dnn_cols)

            # 3.generate input data for model
            model_input = {name: df[name] for name in feature_names}
            model_input['genres'] = genres_list

        train, test = model_input, model_input

    return df, linear_cols, dnn_cols, train, test, target, ytrue
Example #18
def loadData(trainFile,
             testFile,
             embedding_dim,
             multivalue_len,
             multiClass=False):
    train = pd.read_csv(trainFile)
    test = pd.read_csv(testFile)

    ## 1. feature type declaration
    sparse_features = [
        "BaseAdGroupId", "Criteria", 'placementType', 'Week', 'IsRestrict',
        'IsNegative', 'AccountTimeZone', 'AccountCurrencyCode',
        'BiddingStrategyType', 'CampaignId', 'Month'
    ]

    dense_features = [
        'adClicks', 'adConversions', 'adCtr', 'adConversionRate',
        'adActiveViewImpressions', 'adActiveViewMeasurability',
        'adActiveViewMeasurableCost', 'adActiveViewViewability',
        'adImpressions', 'adActiveViewCpm', 'adAverageCpc', 'adAverageCpe',
        'adCpcBid', 'adActiveViewMeasurableImpressions', 'adActiveViewCtr',
        'adAverageCpm', 'adAverageCpv', 'adCost', 'plaClicks',
        'plaConversions', 'plaCtr', 'plaConversionRate',
        'plaActiveViewImpressions', 'plaActiveViewMeasurability',
        'plaActiveViewMeasurableCost', 'plaActiveViewViewability',
        'plaImpressions', 'plaCpcBid', 'plaActiveViewMeasurableImpressions',
        'plaActiveViewCtr', 'plaActiveViewCpm', 'plaAverageCpc',
        'plaAverageCpe', 'plaAverageCpm', 'plaAverageCpv', 'plaCost',
        'histListLen'
    ]
    multivalue_features = [
        'locationName', 'languageCode', 'hist_BaseAdGroupId'
    ]
    sparse_features = ["BaseAdGroupId", "Criteria", 'placementType']
    target = ['Ctr']

    # 2. Missing value processing
    train[sparse_features + multivalue_features] = train[sparse_features + multivalue_features].fillna('-1')
    train[dense_features + target] = train[dense_features + target].fillna(0)
    test[sparse_features + multivalue_features] = test[sparse_features + multivalue_features].fillna('-1')
    test[dense_features + target] = test[dense_features + target].fillna(0)

    train["BaseAdGroupId"] = train["BaseAdGroupId"].apply(lambda x: str(
        (int(x))))

    test["BaseAdGroupId"] = test["BaseAdGroupId"].apply(lambda x: str(
        (int(x))))

    # 3. sparse features transformation
    # Fit one encoder per feature on train+test so both splits share a consistent mapping
    for feat in sparse_features:
        lbe = LabelEncoder()
        lbe.fit(pd.concat([train[feat], test[feat]], ignore_index=True))
        train[feat] = lbe.transform(train[feat])
        test[feat] = lbe.transform(test[feat])

    # 4. dense features transformation
    for numFeature in dense_features:
        train[numFeature] = train[numFeature].apply(
            lambda x: x if x < 2 else math.sqrt(math.log(x)))
        test[numFeature] = test[numFeature].apply(
            lambda x: x if x < 2 else math.sqrt(math.log(x)))

    # 5. multivalue features transformation
    # (split() and the per-feature vocabularies such as locationNameDict are
    # assumed to be defined at module level)
    train_seq, test_seq = {}, {}
    for feat in multivalue_features:
        vocab = globals()[feat + 'Dict']
        train_seq[feat] = pad_sequences([split(x, vocab) for x in train[feat].values],
                                        maxlen=multivalue_len, padding='post')
        test_seq[feat] = pad_sequences([split(x, vocab) for x in test[feat].values],
                                       maxlen=multivalue_len, padding='post')

    # 6. feature columns
    fixlen_feature_columns = [
        SparseFeat(feat,
                   vocabulary_size=train[feat].append(test[feat], ignore_index=True).nunique(),
                   embedding_dim=embedding_dim)
        for feat in sparse_features
    ] + [DenseFeat(feat, 1) for feat in dense_features]

    varlen_feature_columns = [
        VarLenSparseFeat(feat, maxlen=multivalue_len,
                         vocabulary_size=len(globals()[feat + 'Dict']) + 1,
                         embedding_dim=embedding_dim, combiner='mean', weight_name=None)
        for feat in multivalue_features
    ]

    dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
    linear_feature_columns = fixlen_feature_columns

    feature_names = get_feature_names(linear_feature_columns +
                                      dnn_feature_columns)

    # 7.generate input data for model
    train_model_input = {
        name: train[name]
        for name in sparse_features + dense_features
    }
    test_model_input = {
        name: test[name]
        for name in sparse_features + dense_features
    }

    for feat in multivalue_features:
        train_model_input[feat] = train_seq[feat]
        test_model_input[feat] = test_seq[feat]

    behavior_feature_list = ["BaseAdGroupId"]
    return train_model_input, train, test_model_input, test, dnn_feature_columns, linear_feature_columns, behavior_feature_list
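The returned behavior_feature_list suggests these inputs target a behavior-sequence model such as DIN; a hedged sketch (file names and hyperparameters are placeholders, and Ctr is the continuous target declared above):

train_input, train, test_input, test, dnn_cols, linear_cols, behavior_list = \
    loadData('train.csv', 'test.csv', embedding_dim=8, multivalue_len=5)

model = DIN(dnn_cols, behavior_list, task='regression')
model.compile('adam', 'mse', metrics=['mse'])
model.fit(train_input, train['Ctr'].values, batch_size=256, epochs=10,
          validation_split=0.2)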
Example #19
def test_DFM_avazu(data, train, test):
    print("\nTesting DFM on avazu dataset...\n")

    results_activation_function = {"auc": [], "logloss": [], "rmse": []}
    results_dropout = {"auc": [], "logloss": [], "rmse": []}
    results_number_of_neurons = {"auc": [], "logloss": [], "rmse": []}

    auc = 0
    logloss = 0
    rmse = 0

    features_labels = train.columns

    sparse_features_labels = features_labels[1:23]
    target_label = features_labels[0]

    dnn_feature_columns = [
        SparseFeat(
            feat,
            vocabulary_size=data[feat].nunique(),
            embedding_dim=4,
        ) for feat in sparse_features_labels
    ]
    linear_feature_columns = [
        SparseFeat(
            feat,
            vocabulary_size=data[feat].nunique(),
            embedding_dim=4,
        ) for feat in sparse_features_labels
    ]

    feature_names = get_feature_names(linear_feature_columns +
                                      dnn_feature_columns)

    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}

    true_y = test[target_label].values

    print("\t\t-- ACTIVATION FUNCTIONS --\t\t")
    for dnn_activation in dnn_activation_list:
        print("\nTesting {dnn_activation}...".format(
            dnn_activation=dnn_activation))

        model = DeepFM(linear_feature_columns,
                       dnn_feature_columns,
                       dnn_activation=dnn_activation,
                       task='binary')
        model.compile(
            "adam",
            "binary_crossentropy",
            metrics=['binary_crossentropy'],
        )
        model.fit(
            train_model_input,
            train[target_label].values,
            batch_size=256,
            epochs=10,
            verbose=0,
            validation_split=TEST_PROPORTION,
        )
        pred_y = model.predict(test_model_input, batch_size=256)

        auc = compute_auc(true_y, pred_y)
        logloss = compute_log_loss(true_y, pred_y)
        rmse = compute_rmse(true_y, pred_y)

        results_activation_function["auc"].append(auc)
        results_activation_function["logloss"].append(logloss)
        results_activation_function["rmse"].append(rmse)

    print("\t\t-- DROPOUT RATES --\t\t")
    for dnn_dropout in dnn_dropout_list:
        print("\nTesting {dnn_dropout}...".format(dnn_dropout=dnn_dropout))

        model = DeepFM(linear_feature_columns,
                       dnn_feature_columns,
                       dnn_dropout=dnn_dropout,
                       task='binary')
        model.compile(
            "adam",
            "binary_crossentropy",
            metrics=['binary_crossentropy'],
        )
        model.fit(
            train_model_input,
            train[target_label].values,
            batch_size=256,
            epochs=10,
            verbose=0,
            validation_split=TEST_PROPORTION,
        )
        pred_y = model.predict(test_model_input, batch_size=256)

        auc = compute_auc(true_y, pred_y)
        logloss = compute_log_loss(true_y, pred_y)
        rmse = compute_rmse(true_y, pred_y)

        results_dropout["auc"].append(auc)
        results_dropout["logloss"].append(logloss)
        results_dropout["rmse"].append(rmse)

    print("\t\t-- HIDDEN UNITS --\t\t")
    for dnn_hidden_units in dnn_hidden_units_list:
        print("\nTesting {dnn_hidden_units}...".format(
            dnn_hidden_units=dnn_hidden_units))

        model = DeepFM(linear_feature_columns,
                       dnn_feature_columns,
                       dnn_hidden_units=dnn_hidden_units,
                       task='binary')
        model.compile(
            "adam",
            "binary_crossentropy",
            metrics=['binary_crossentropy'],
        )
        model.fit(
            train_model_input,
            train[target_label].values,
            batch_size=256,
            epochs=10,
            verbose=0,
            validation_split=TEST_PROPORTION,
        )
        pred_y = model.predict(test_model_input, batch_size=256)

        auc = compute_auc(true_y, pred_y)
        logloss = compute_log_loss(true_y, pred_y)
        rmse = compute_rmse(true_y, pred_y)

        results_number_of_neurons["auc"].append(auc)
        results_number_of_neurons["logloss"].append(logloss)
        results_number_of_neurons["rmse"].append(rmse)

    if PLOT:
        create_plots("DFM", "avazu", results_activation_function,
                     "Activation Function", "activation_func",
                     dnn_activation_list)
        create_plots("DFM", "avazu", results_dropout, "Dropout Rate",
                     "dropout", dnn_dropout_list)
        create_plots("DFM", "avazu", results_number_of_neurons,
                     "Number of Neurons per layer", "nr_neurons",
                     dnn_hidden_units_list)
Example #20
def run(data, ziel, line0, grid, loop):
    poi_feature_transfer = []
    print('++++', '\n', grid)
    for a in range(len(poi_feature)):
        poi_feature_transfer.append('poi_feature_%d' % a)
        data = data.rename(columns={poi_feature[a]: 'poi_feature_%d' % a})

    features = ['provname', 'prefname', 'cntyname', 'townname', 'villname',
                'dispincm', 'urbcode_1', 'hauslvl'] + poi_feature_transfer
    sparse_features = []
    dense_features = []
    for f in features:
        if f not in x_category or x_category[f] == 1:
            dense_features.append(f)
        else:
            sparse_features.append(f)
    data[sparse_features] = data[sparse_features].fillna(-1)
    data[dense_features] = data[dense_features].fillna(0)

    y = []
    # bin the target column (ziel, e.g. villmean or income) into classes by the line0 cut points
    y_limit = [np.min(data[ziel]) - 1] + line0 + [np.max(data[ziel])]
    for index, row in data.iterrows():
        for i in range(1, len(y_limit)):
            if y_limit[i - 1] < row[ziel] <= y_limit[i]:
                y.append(i-1)
                break
    data['income_0'] = y
    target = ['income_0']

    # 1.Label Encoding for sparse features, and do simple Transformation for dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2.count #unique features for each sparse field, and record dense feature field name
    fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique()) for feat in sparse_features] + \
                             [DenseFeat(feat, 1) for feat in dense_features]
    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns
    fixlen_feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    # 3.generate input data for model
    train, test = train_test_split(data, test_size=0.2)

    # try to oversample the training split
    # (train_x, train_y) = over_sampling(train[features], train[ziel], 3)
    # train = np.column_stack((train_x, train_y))
    train_model_input = [train[name] for name in fixlen_feature_names]
    test_model_input = [test[name] for name in fixlen_feature_names]
    # 4. Define model, train, predict and evaluate
    models, model_names, xlabel = model_gridsearch(linear_feature_columns, dnn_feature_columns, grid)
    logloss, auc1 = [], []
    print(ziel, line0, len(data))
    for name, model in zip(model_names, models):
        ll_avg, auc_avg = [], []
        for i in range(loop):
            model.compile("adam",'binary_crossentropy',
                          metrics=['binary_crossentropy'])
            history = model.fit(train_model_input, train[target].values,
                                batch_size=256, epochs=10, verbose=0, validation_split=0.2, )
            pred_ans = model.predict(test_model_input, batch_size=256)

            true = test[target].values
            '''
            f = open("pred.csv", 'a', encoding='utf_8_sig')
            f.write('%s\n'%(ziel))
            for i in range(len(pred_ans)):
                f.write('%s, %s\n' % (pred_ans[i],true[i] ))
            f.close()'''


            ll = round(log_loss(test[target].values, pred_ans), 4)
            auc = round(roc_auc_score(test[target].values, pred_ans), 4)
            #acc = round(accuracy_score(test[target].values, pred_ans.round()), 4)
            #pre = round(precision_score(test[target].values, pred_ans.round()), 4)
            #recall = round(recall_score(test[target].values, pred_ans.round()), 4)
            #f1 = round(f1_score(test[target].values, pred_ans.round(), average='weighted'),4)
            #spec = round(specificity_score(test[target].values, pred_ans.round(), average='weighted'),4)
            #sens = round(sensitivity_score(test[target].values, pred_ans.round(), average='weighted'),4)
            print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
            print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))
            ll_avg.append(ll), auc_avg.append(auc)
        logloss.append(np.mean(ll_avg)), auc1.append(np.mean(auc_avg))#, acc1.append(acc), pre1.append(pre), recall1.append(recall), f11.append(f1)

        '''
        cm = confusion_matrix(test[target].values, pred_ans.round())
        cm_normalized = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        cm = []
        for m in range(len(line0)+1):
            cm.append([])
            for n in range(len(line0)+1):
                cm[m].append(round(cm_normalized[m][n],4))
        '''
        '''
        print(name)
        print("LogLoss", ll, end=' ')
        print("AUC", auc, end=' ')
        print("accuracy", acc, end=' ')
        #print("precision" , pre, end=' ')
        #print("recall", recall, end=' ')
        print("f1" , f1, end=' ')
        print("spec", spec, end=' ')
        print("sens" , sens, end=' ')
        print(cm)
        #f = open("DeepFM.csv", 'a', encoding='utf_8_sig')
        #f.write('%s,%s\n'%(ziel,line0))
        #f.write('%s, %s, %s, %s, %s, %s, %s,' % (name, ll, auc, acc, f1, spec, sens))
        #f.write('%s\n' % str(cm).replace(',',';'))
        #f.close()
        '''
    return (logloss, auc1, xlabel)