Example #1
    def __init__(self, dim, maxlen, indexes):
        self.dim = dim
        self.maxlen = maxlen

        self.item_index = indexes[0]
        self.city_index = indexes[1]
        self.action_index = indexes[2]

        hash_flag = True
        iFeature = SingleFeat('item', len(self.item_index) + 1, hash_flag)
        cFeature = SingleFeat('city', len(self.city_index) + 1, hash_flag)
        pFeature = SingleFeat('position', 25 + 1, hash_flag)
        aFeature = SingleFeat('action', len(self.action_index) + 1, hash_flag)

        self.feature_dim_dict = {"sparse": [iFeature, cFeature, pFeature, aFeature],
                                 "dense": [SingleFeat('price', False)]}

        self.behavior_feature_list = ["item", "city", "position", "action"]

        self.model = DSIN(self.feature_dim_dict, self.behavior_feature_list, sess_max_count=1, sess_len_max=self.maxlen,
                          embedding_size=self.dim,
                          att_head_num=1,
                          att_embedding_size=self.dim * len(self.behavior_feature_list),
                          dnn_hidden_units=[self.dim, self.dim, self.dim], dnn_dropout=0.5)

        self.model.compile('adam', 'binary_crossentropy',
                           metrics=['acc'])
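A minimal usage sketch for the wrapper above, assuming the enclosing class is called something like SessionRanker and that the index dicts, training inputs, and labels are prepared elsewhere in the order DSIN expects (sparse inputs, dense inputs, per-session behavior inputs, session count). All of those names are hypothetical, not from the source.

# Hypothetical usage sketch; class and variable names are assumptions.
ranker = SessionRanker(dim=8, maxlen=25,
                       indexes=(item_index, city_index, action_index))  # hypothetical lookup dicts
history = ranker.model.fit(train_input, labels,
                           batch_size=256, epochs=1, validation_split=0.1, verbose=1)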
Example #2
def get_xy_fd(use_neg=False, hash_flag=False):
    feature_dim_dict = {"sparse": [SingleFeat('user', 3,hash_flag), SingleFeat(
        'gender', 2,hash_flag), SingleFeat('item', 3+1,hash_flag), SingleFeat('item_gender', 2+1,hash_flag)], "dense": [SingleFeat('score', 0)]}
    behavior_feature_list = ["item","item_gender"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    igender = np.array([1, 2, 1])  # 0 is mask value
    score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
    hist_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])

    behavior_length = np.array([3, 3, 2])

    feature_dict = {'user': uid, 'gender': ugender, 'item': iid, 'item_gender': igender,
                    'hist_item': hist_iid, 'hist_item_gender': hist_igender,
                    'score': score}

    x = [feature_dict[feat.name] for feat in feature_dim_dict["sparse"]] + \
        [feature_dict[feat.name] for feat in feature_dim_dict["dense"]] + \
        [feature_dict['hist_' + feat] for feat in behavior_feature_list]
    if use_neg:
        feature_dict['neg_hist_item'] = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
        feature_dict['neg_hist_item_gender'] = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])
        x += [feature_dict['neg_hist_'+feat] for feat in behavior_feature_list]

    x += [behavior_length]
    y = [1, 0, 1]
    return x, y, feature_dim_dict, behavior_feature_list
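The fixture above only builds the inputs; a hedged sketch of feeding it to DIEN follows. The DIEN keyword arguments (hist_len_max, use_negsampling, and so on) are assumed from the matching DeepCTR release rather than taken from this snippet.

# Hedged sketch (assuming the DIEN signature of the matching DeepCTR release).
from deepctr.models import DIEN

x, y, feature_dim_dict, behavior_feature_list = get_xy_fd(use_neg=True, hash_flag=False)
model = DIEN(feature_dim_dict, behavior_feature_list,
             hist_len_max=4, embedding_size=8, use_negsampling=True,
             dnn_hidden_units=[4, 4, 4], dnn_dropout=0.6)
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
history = model.fit(x, y, verbose=1, validation_split=0.5)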
Example #3
def get_xy_fd():
    feature_dim_dict = {
        "sparse": [
            SingleFeat('user', 4),
            SingleFeat('gender', 2),
            SingleFeat('item', 4),
            SingleFeat('item_gender', 2)
        ],
        "dense": []
    }
    behavior_feature_list = ["item"]
    uid = np.array([1, 2, 3])
    ugender = np.array([0, 1, 0])
    iid = np.array([0, 1, 2])
    igender = np.array([0, 1, 0])

    hist_iid = np.array([[0, 1, 2, 3], [0, 1, 2, 3], [0, 1, 2, 3]])
    hist_igender = np.array([[0, 1, 0, 1], [0, 1, 1, 1], [0, 0, 1, 0]])
    hist_length = np.array([4, 4, 4])

    feature_dict = {
        'user': uid,
        'gender': ugender,
        'item': iid,
        'item_gender': igender,
        'hist_item': hist_iid,
        'hist_item_gender': hist_igender,
    }
    x = [feature_dict[feat.name] for feat in feature_dim_dict["sparse"]] \
        + [feature_dict['hist_'+feat] for feat in behavior_feature_list]\
        + [hist_length]
    y = [1, 0, 1]
    return x, y, feature_dim_dict, behavior_feature_list
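A hedged sketch of consuming this fixture with DIN; the keyword names are assumed from the matching DeepCTR release, not shown in the snippet itself.

# Hedged sketch (assuming the DIN signature of the matching DeepCTR release).
from deepctr.models import DIN

x, y, feature_dim_dict, behavior_feature_list = get_xy_fd()
model = DIN(feature_dim_dict, behavior_feature_list,
            hist_len_max=4, embedding_size=8,
            dnn_hidden_units=[4, 4, 4], dnn_dropout=0.6)
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
history = model.fit(x, y, verbose=1, validation_split=0.5)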
Example #4
def sparse_feat_list_gen(data, sparse_features, hashing):
    if hashing:
        sparse_feat_list = [
            SingleFeat(feat,
                       data[feat].nunique() * 5,
                       hash_flag=True,
                       dtype='string') for feat in sparse_features
        ]
    else:
        sparse_feat_list = [
            SingleFeat(feat, data[feat].nunique()) for feat in sparse_features
        ]
    return sparse_feat_list
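A short, hedged usage sketch for the helper above; the DataFrame contents and column names are illustrative only.

# Illustrative usage; the DataFrame and column names are assumptions.
import pandas as pd

data = pd.DataFrame({'user_id': ['u1', 'u2', 'u1'],
                     'item_id': ['i9', 'i7', 'i9']})
sparse_features = ['user_id', 'item_id']

plain_feats = sparse_feat_list_gen(data, sparse_features, hashing=False)   # vocab size = nunique
hashed_feats = sparse_feat_list_gen(data, sparse_features, hashing=True)   # hash space = 5 * nunique, string dtype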
Example #5
def get_train_and_feature_list(data, sparse_features, multivalue_cols, name_col='C4'):
    sparse_feature_list = [SingleFeat(feat, 1e3, hash_flag=True, dtype='float32')  # hashing space of 1000 buckets
                           for feat in sparse_features]
    sequence_feature = []
    sequence_input = []
    sequence_input_lens = []
    for f in multivalue_cols:
        print(data.iloc[0][f])
        print(len(data.columns))
        data[f] = data[f] + "|" + data[name_col].map(str)
        print(data.iloc[0][f])
        genres_list = list(map(lambda x: list(reversed(x.split('|'))), data[f].values))
        genres_length = np.array(list(map(len, genres_list)))
        print("{0}: mean len {1}, max len {2}".format(f, np.mean(genres_length), np.max(genres_length)))
        max_len = max(genres_length)
        max_len = max(max_len, 51)
        # print(max_len)
        # Notice : padding=`post`
        genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post', dtype=str, value=0)
        # print(genres_list)
        # sequence_feature += [VarLenFeat(f, len(key2index) + 1, max_len, 'mean')]
        sequence_feature += [VarLenFeat(f, 1e3, max_len, 'mean', hash_flag=True, dtype="string")]
        sequence_input.append(genres_list)

        sequence_input_lens.append(max_len)
    data[name_col] = data[name_col].map(float)
    sparse_input = [data[feat.name].values for feat in sparse_feature_list]

    model_input = sparse_input + sequence_input + [genres_length]
    # print("eseseswes {0}".format(sequence_input))
    return model_input, sparse_feature_list, sequence_feature, sequence_input_lens
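A hedged sketch of how the returned pieces might be wired into a model with multi-value features, following the DeepFM pattern used later on this page; the "sequence" key in the feature dict and the label array `labels` are assumptions.

# Hedged sketch; `labels` is a hypothetical 0/1 target array.
from deepctr.models import DeepFM

model_input, sparse_feature_list, sequence_feature, seq_lens = get_train_and_feature_list(
    data, sparse_features, multivalue_cols)
model = DeepFM({"sparse": sparse_feature_list, "sequence": sequence_feature},
               task='binary')
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
history = model.fit(model_input, labels, batch_size=256, epochs=1, validation_split=0.2)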
Example #6
def get_xy_fd(hash_flag=False):
    feature_dim_dict = {
        "sparse": [
            SingleFeat('user', 3, hash_flag),
            SingleFeat('gender', 2, hash_flag),
            SingleFeat('item', 3 + 1, hash_flag),
            SingleFeat('item_gender', 2 + 1, hash_flag)
        ],
        "dense": [SingleFeat('score', 0)]
    }
    behavior_feature_list = ["item", "item_gender"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    igender = np.array([1, 2, 1])  # 0 is mask value
    score = np.array([0.1, 0.2, 0.3])

    sess1_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [0, 0, 0, 0]])
    sess1_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [0, 0, 0, 0]])

    sess2_iid = np.array([[1, 2, 3, 0], [0, 0, 0, 0], [0, 0, 0, 0]])
    sess2_igender = np.array([[1, 1, 2, 0], [0, 0, 0, 0], [0, 0, 0, 0]])

    sess_number = np.array([2, 1, 0])

    feature_dict = {
        'user': uid,
        'gender': ugender,
        'item': iid,
        'item_gender': igender,
        'sess1_item': sess1_iid,
        'sess1_item_gender': sess1_igender,
        'score': score,
        'sess2_item': sess2_iid,
        'sess2_item_gender': sess2_igender,
    }

    x = [feature_dict[feat.name] for feat in feature_dim_dict["sparse"]] + [
        feature_dict[feat.name] for feat in feature_dim_dict["dense"]
    ] + [feature_dict['sess1_' + feat] for feat in behavior_feature_list
         ] + [feature_dict['sess2_' + feat] for feat in behavior_feature_list]

    x += [sess_number]

    y = [1, 0, 1]
    return x, y, feature_dim_dict, behavior_feature_list
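This fixture matches the DSIN constructor used in Example #1 (two sessions of length 4). A hedged sketch of consuming it; the keyword names mirror that earlier call, while the hyperparameter values here are assumptions.

# Hedged sketch; keyword names follow the DSIN call in Example #1.
from deepctr.models import DSIN

x, y, feature_dim_dict, behavior_feature_list = get_xy_fd(hash_flag=False)
model = DSIN(feature_dim_dict, behavior_feature_list,
             sess_max_count=2, sess_len_max=4, embedding_size=4,
             att_head_num=1, att_embedding_size=4 * len(behavior_feature_list),
             dnn_hidden_units=[4, 4, 4], dnn_dropout=0.5)
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
history = model.fit(x, y, verbose=1, validation_split=0.5)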
Example #7
def get_test_data(sample_size=1000,
                  sparse_feature_num=1,
                  dense_feature_num=1,
                  sequence_feature=('max', 'mean', 'sum'),
                  classification=True,
                  include_length=False):

    feature_dim_dict = {"sparse": [], 'dense': [], 'sequence': []}

    for i in range(sparse_feature_num):
        dim = np.random.randint(1, 10)
        feature_dim_dict['sparse'].append(SingleFeat('sparse_' + str(i), dim))
    for i in range(dense_feature_num):
        feature_dim_dict['dense'].append(SingleFeat('dense_' + str(i), 0))
    for i, mode in enumerate(sequence_feature):
        dim = np.random.randint(1, 10)
        maxlen = np.random.randint(1, 10)
        feature_dim_dict['sequence'].append(
            VarLenFeat('sequence_' + str(i), dim, maxlen, mode))

    sparse_input = [
        np.random.randint(0, feat.dimension, sample_size)
        for feat in feature_dim_dict['sparse']
    ]
    dense_input = [
        np.random.random(sample_size) for name in feature_dim_dict['dense']
    ]
    sequence_input = []
    sequence_len_input = []
    for var in feature_dim_dict['sequence']:
        s_input, s_len_input = gen_sequence(var.dimension, var.maxlen,
                                            sample_size)
        sequence_input.append(s_input)
        sequence_len_input.append(s_len_input)
    if classification:
        y = np.random.randint(0, 2, sample_size)
    else:
        y = np.random.random(sample_size)

    x = sparse_input + dense_input + sequence_input
    if include_length:
        x += sequence_len_input

    return x, y, feature_dim_dict
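gen_sequence is referenced above but not shown; a minimal sketch of the contract it needs to satisfy (a padded index matrix plus a per-sample length vector) is given below. The body is an assumption, not the library's implementation.

import numpy as np

def gen_sequence(dim, max_len, sample_size):
    # Sketch only: random padded sequences with values in [0, dim) and a
    # random valid length in [1, max_len] for every sample.
    seqs = np.array([np.random.randint(0, dim, max_len) for _ in range(sample_size)])
    lengths = np.random.randint(1, max_len + 1, sample_size)
    return seqs, lengths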
Example #8
def get_xy_fd():
    feature_dim_dict = {"sparse": [SingleFeat('user', 3), SingleFeat(
        'gender', 2), SingleFeat('item', 3+1), SingleFeat('item_gender', 2+1)], "dense": [SingleFeat('score', 0)]}
    behavior_feature_list = ["item", "item_gender"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    igender = np.array([1, 2, 1])  # 0 is mask value
    score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
    hist_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])

    feature_dict = {'user': uid, 'gender': ugender, 'item': iid, 'item_gender': igender,
                    'hist_item': hist_iid, 'hist_item_gender': hist_igender, 'score': score}

    x = [feature_dict[feat.name] for feat in feature_dim_dict["sparse"]] + \
        [feature_dict[feat.name] for feat in feature_dim_dict["dense"]] + \
        [feature_dict['hist_' + feat] for feat in behavior_feature_list]

    y = [1, 0, 1]
    return x, y, feature_dim_dict, behavior_feature_list
Example #9
def get_feature_list(sparse_features, multivalue_cols, max_len=50):
    sparse_feature_list = [
        SingleFeat(feat, 1000, hash_flag=True,
                   dtype='string')  # since the input is string
        for feat in sparse_features
    ]
    sequence_feature = []
    for f in multivalue_cols:
        sequence_feature += [
            VarLenFeat(f,
                       1000,
                       max_len,
                       'mean',
                       hash_flag=True,
                       dtype="string")
        ]
    return sparse_feature_list, sequence_feature
Example #10
def test_DCN_invalid(embedding_size=8, cross_num=0, hidden_size=()):
    feature_dim_dict = {
        'sparse': [
            SingleFeat('sparse_1', 2),
            SingleFeat('sparse_2', 5),
            SingleFeat('sparse_3', 10)
        ],
        'dense': [
            SingleFeat('dense_1', 1),
            SingleFeat('dense_2', 1),
            SingleFeat('dense_3', 1)
        ]
    }
    with pytest.raises(ValueError):
        _ = DCN(
            feature_dim_dict,
            embedding_size=embedding_size,
            cross_num=cross_num,
            dnn_hidden_units=hidden_size,
            dnn_dropout=0.5,
        )
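For contrast, a hedged sketch of a configuration the same ValueError check should accept (non-zero cross_num and/or non-empty dnn_hidden_units); only the keyword names are taken from the call above, the values are assumptions.

# Hedged sketch of a valid DCN configuration; hyperparameter values are assumptions.
def test_DCN_valid_sketch(embedding_size=8, cross_num=2, hidden_size=(32, 32)):
    feature_dim_dict = {
        'sparse': [SingleFeat('sparse_1', 2), SingleFeat('sparse_2', 5), SingleFeat('sparse_3', 10)],
        'dense': [SingleFeat('dense_1', 1), SingleFeat('dense_2', 1), SingleFeat('dense_3', 1)]
    }
    _ = DCN(feature_dim_dict,
            embedding_size=embedding_size,
            cross_num=cross_num,
            dnn_hidden_units=hidden_size,
            dnn_dropout=0.5)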
Example #11
    sparse_features = [
        'userid', 'adgroup_id', 'pid', 'cms_segid', 'cms_group_id',
        'final_gender_code', 'age_level', 'pvalue_level', 'shopping_level',
        'occupation', 'new_user_class_level', 'campaign_id', 'customer'
    ]
    dense_features = ['price']

    for feat in tqdm(sparse_features):
        lbe = LabelEncoder()  # or Hash
        data[feat] = lbe.fit_transform(data[feat])
    mms = StandardScaler()
    data[dense_features] = mms.fit_transform(data[dense_features])

    sparse_feature_list = [
        SingleFeat(feat, data[feat].nunique() + 1)
        for feat in sparse_features + ['cate_id', 'brand']
    ]

    dense_feature_list = [SingleFeat(feat, 1) for feat in dense_features]
    sess_feature = ['cate_id', 'brand']

    sess_input = [
        pad_sequences(sess_input_dict[feat],
                      maxlen=DIN_SESS_MAX_LEN,
                      padding='post') for feat in sess_feature
    ]
    neg_sess_input = [
        pad_sequences(neg_sess_input_dict[feat],
                      maxlen=DIN_SESS_MAX_LEN,
                      padding='post') for feat in sess_feature
    ]
Example #12
    data[sparse_features] = data[sparse_features].fillna('-1', )
    data[dense_features] = data[dense_features].fillna(0, )
    target = ['label']

    # 1. Label Encoding for sparse features, and simple transformation for dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2. Count #unique values for each sparse field, and record dense feature field names

    sparse_feature_list = [
        SingleFeat(feat, data[feat].nunique()) for feat in sparse_features
    ]
    dense_feature_list = [SingleFeat(feat, 0) for feat in dense_features]

    # 3.generate input data for model

    train, test = train_test_split(data, test_size=0.2)
    train_model_input = [train[feat.name].values for feat in sparse_feature_list] + \
                        [train[feat.name].values for feat in dense_feature_list]
    test_model_input = [test[feat.name].values for feat in sparse_feature_list] + \
                       [test[feat.name].values for feat in dense_feature_list]

    # 4. Define Model, train, predict and evaluate
Example #13
#     key2index = {}
#     df_train[col] = df_train[col].apply(str)
#     # all_data[col] = all_data[col].apply(str)
#     lst = list(map(split, df_train[col].values))
#     lst_all = list(map(split, df_train[col].values))
#     lst_length = np.array(list(map(len, lst_all)))
#     max_len = max(lst_length)
#     # Notice : padding=`post`
#     lst = pad_sequences(lst, maxlen=max_len, padding='post')
#     multi_values_input += [lst]
#     # 2.count #unique features for each sparse field and generate feature config for sequence feature
#     sequence_feature += [VarLenFeat(col, len(key2index) + 1, max_len, 'mean')]
#     # Notice : value 0 is for padding for sequence input feature

sparse_feat_list = [
    SingleFeat(feat, df_train[feat].nunique()) for feat in sparse_features
]

dense_feat_list = [SingleFeat(feat, 0) for feat in dense_features]
# 3.generate input data for model
sparse_input = [df_train[feat.name].values for feat in sparse_feat_list]
dense_input = [df_train[feat.name].values for feat in dense_feat_list]

model_input = sparse_input + dense_input + multi_values_input
print(model_input)
# print(model_input.shape)
# 4. Define Model, compile and train
model = DeepFM(
    {
        "sparse": sparse_feat_list,
        "dense": dense_feat_list,
Example #14
    data = pd.merge(sample_sub, user, how='left', on='userid', )
    data = pd.merge(data, ad, how='left', on='adgroup_id')

    sparse_features = ['userid', 'adgroup_id', 'pid', 'cms_segid', 'cms_group_id', 'final_gender_code', 'age_level',
                       'pvalue_level', 'shopping_level', 'occupation', 'new_user_class_level', 'campaign_id',
                       'customer']
    dense_features = ['price']

    for feat in tqdm(sparse_features):
        lbe = LabelEncoder()  # or Hash
        data[feat] = lbe.fit_transform(data[feat])
    mms = StandardScaler()
    data[dense_features] = mms.fit_transform(data[dense_features])

    sparse_feature_list = [SingleFeat(feat, data[feat].max() + 1)
                           for feat in sparse_features + ['cate_id', 'brand']]

    dense_feature_list = [SingleFeat(feat, 1) for feat in dense_features]
    sess_feature = ['cate_id', 'brand']

    sess_input = [pad_sequences(
        sess_input_dict[feat], maxlen=DIN_SESS_MAX_LEN, padding='post') for feat in sess_feature]
    neg_sess_input = [pad_sequences(neg_sess_input_dict[feat], maxlen=DIN_SESS_MAX_LEN, padding='post') for feat in
                      sess_feature]

    model_input = [data[feat.name].values for feat in sparse_feature_list] + \
                  [data[feat.name].values for feat in dense_feature_list]
    sess_lists = sess_input + neg_sess_input + [np.array(sess_input_length)]
    model_input += sess_lists

    if not os.path.exists('../model_input/'):
Example #15
genres_list = list(map(lambda x: x.split('|'), data['genres'].values))
genres_length = np.array(list(map(len, genres_list)))
max_len = max(genres_length)

# Notice : padding=`post`
genres_list = pad_sequences(genres_list,
                            maxlen=max_len,
                            padding='post',
                            dtype=str,
                            value=0)

# 2.set hashing space for each sparse field and generate feature config for sequence feature

sparse_feat_list = [
    SingleFeat(feat, data[feat].nunique() * 5, hash_flag=True, dtype='string')
    for feat in sparse_features
]
sequence_feature = [
    VarLenFeat('genres', 100, max_len, 'mean', hash_flag=True, dtype="string")
]  # Notice : value 0 is for padding for sequence input feature

# 3.generate input data for model
sparse_input = [data[feat.name].values for feat in sparse_feat_list]
dense_input = []
sequence_input = [genres_list]
model_input = sparse_input + dense_input + \
              sequence_input  # make sure the order is right

# 4. Define Model, compile and train
model = DeepFM({
Example #16
    data[sparse_features] = data[sparse_features].fillna('-1', )
    data[dense_features] = data[dense_features].fillna(0, )

    target = ['label']

    # 1. Label Encoding for sparse features, and simple transformation for dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat]).astype(np.int32)
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features]).astype(np.float32)

    # 2. Count #unique values for each sparse field, and record dense feature field names

    sparse_feature_list = [SingleFeat(feat, data[feat].nunique())
                           for feat in sparse_features]
    dense_feature_list = [SingleFeat(feat, 0,)
                          for feat in dense_features]

    # 3.generate input data for model

    train, test = train_test_split(data, test_size=0.2)
    train_model_input = [train[feat.name].values for feat in sparse_feature_list] + \
                        [train[feat.name].values for feat in dense_feature_list]
    test_model_input = [test[feat.name].values for feat in sparse_feature_list] + \
                       [test[feat.name].values for feat in dense_feature_list]

    # 4. Define Model, train, predict and evaluate
    model = DeepFM({"sparse": sparse_feature_list,
                    "dense": dense_feature_list}, task='binary', embedding_size=4, dnn_hidden_units=(64, 64))
Example #17
if __name__ == "__main__":

    data = pd.read_csv("./movielens_sample.txt")
    sparse_features = [
        "movie_id", "user_id", "gender", "age", "occupation", "zip"
    ]
    target = ['rating']

    # 1. Label Encoding for sparse features, and simple transformation for dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    # 2.count #unique features for each sparse field
    sparse_feat_list = [
        SingleFeat(feat, data[feat].nunique()) for feat in sparse_features
    ]

    # 3.generate input data for model
    train, test = train_test_split(data, test_size=0.2)
    train_model_input = [train[feat.name].values for feat in sparse_feat_list]
    test_model_input = [test[feat.name].values for feat in sparse_feat_list]
    # 4. Define Model, train, predict and evaluate
    model = DeepFM({"sparse": sparse_feat_list}, task='regression')
    model.compile(
        "adam",
        "mse",
        metrics=['mse'],
    )

    history = model.fit(
Example #18
    data = pd.read_csv(file, sep='\t', header=None, names=names, dtype=dtypes)

    data[sparse_features] = data[sparse_features].fillna('-1', )
    data[dense_features] = data[dense_features].fillna(0, )

    target = ['label']

    # 1.do simple Transformation for dense features
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2. Set hashing space for each sparse field, and record dense feature field names

    sparse_feature_list = [
        SingleFeat(feat, 1000, hash_flag=True,
                   dtype='string')  # since the input is string
        for feat in sparse_features
    ]
    dense_feature_list = [SingleFeat(feat, 0) for feat in dense_features]

    # 3.generate input data for model

    train, test = train_test_split(data, test_size=0.2)
    train_model_input = [train[feat.name].values for feat in sparse_feature_list] + \
                        [train[feat.name].values for feat in dense_feature_list]
    test_model_input = [test[feat.name].values for feat in sparse_feature_list] + \
                       [test[feat.name].values for feat in dense_feature_list]
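The model definition is cut off here; a hedged sketch of the usual next step, mirroring Example #16's DeepFM setup (embedding size and training hyperparameters are assumptions).

    # Hedged continuation; the feature lists and inputs are the ones built above.
    model = DeepFM({"sparse": sparse_feature_list, "dense": dense_feature_list},
                   task='binary')
    model.compile("adam", "binary_crossentropy", metrics=['binary_crossentropy'])
    history = model.fit(train_model_input, train[target].values,
                        batch_size=256, epochs=10, verbose=2, validation_split=0.2)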
Example #19
from deepctr.models import DeepFM, DCN
from deepctr.utils import SingleFeat

# Normalize the dense features
mms = MinMaxScaler(feature_range=(0, 1))
X_train[dense_features] = mms.fit_transform(X_train[dense_features])
X_test[dense_features] = mms.transform(X_test[dense_features])

# Label-encode the sparse features
for feat in sparse_features:
    lbe = LabelEncoder()
    X_train[feat] = lbe.fit_transform(X_train[feat])
    X_test[feat] = lbe.transform(X_test[feat])

sparse_feature_list = [
    SingleFeat(feat, X_train[feat].nunique())  # vocabulary size from the training split
    for feat in sparse_features
]
dense_feature_list = [SingleFeat(feat, 0) for feat in dense_features]

train_model_input = [X_train[feat.name].values for feat in sparse_feature_list] + \
                    [X_train[feat.name].values for feat in dense_feature_list]
test_model_input = [X_test[feat.name].values for feat in sparse_feature_list] + \
                   [X_test[feat.name].values for feat in dense_feature_list]

model = DeepFM({
    "sparse": sparse_feature_list,
    "dense": dense_feature_list