Example #1
def get_xy_fd(hash_flag=False):
    feature_columns = [
        SparseFeat('user', 4, embedding_dim=4, use_hash=hash_flag),
        SparseFeat('gender', 2, embedding_dim=4, use_hash=hash_flag),
        SparseFeat('item_id', 3 + 1, embedding_dim=8, use_hash=hash_flag),
        SparseFeat('cate_id', 2 + 1, embedding_dim=4, use_hash=hash_flag),
        DenseFeat('pay_score', 1)
    ]

    feature_columns += [
        VarLenSparseFeat(SparseFeat('hist_item_id',
                                    vocabulary_size=3 + 1,
                                    embedding_dim=8,
                                    embedding_name='item_id'),
                         maxlen=4,
                         length_name="seq_length"),
        VarLenSparseFeat(SparseFeat('hist_cate_id',
                                    vocabulary_size=2 + 1,
                                    embedding_dim=4,
                                    embedding_name='cate_id'),
                         maxlen=4,
                         length_name="seq_length")
    ]

    behavior_feature_list = ["item_id", "cate_id"]
    uid = np.array([0, 1, 2, 3])
    gender = np.array([0, 1, 0, 1])
    item_id = np.array([1, 2, 3, 2])  # 0 is mask value
    cate_id = np.array([1, 2, 1, 2])  # 0 is mask value
    score = np.array([0.1, 0.2, 0.3, 0.2])

    hist_item_id = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0],
                             [1, 2, 0, 0]])
    hist_cate_id = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0],
                             [1, 2, 0, 0]])

    behavior_length = np.array([3, 3, 2, 2])

    feature_dict = {
        'user': uid,
        'gender': gender,
        'item_id': item_id,
        'cate_id': cate_id,
        'hist_item_id': hist_item_id,
        'hist_cate_id': hist_cate_id,
        'pay_score': score,
        "seq_length": behavior_length
    }

    x = {
        name: feature_dict[name]
        for name in get_feature_names(feature_columns)
    }
    y = np.array([1, 0, 1, 0])
    return x, y, feature_columns, behavior_feature_list
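
The returned x, y, feature_columns, and behavior_feature_list plug directly into an attention-based sequence model. A minimal sketch, assuming deepctr_torch's DIN (the training hyperparameters are illustrative only):

from deepctr_torch.models import DIN

x, y, feature_columns, behavior_feature_list = get_xy_fd()
# behavior_feature_list names the base features whose hist_* sequences DIN attends over
model = DIN(feature_columns, behavior_feature_list, device='cpu')
model.compile('adagrad', 'binary_crossentropy', metrics=['binary_crossentropy'])
history = model.fit(x, y, batch_size=3, epochs=10, verbose=2, validation_split=0.0)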
Example #2
def get_test_data(sample_size=1000, embedding_size=4, sparse_feature_num=1, dense_feature_num=1,
                  sequence_feature=['sum', 'mean', 'max'], classification=True, include_length=False,
                  hash_flag=False, prefix=''):
    # work on a copy so pop() below never mutates the mutable default or the caller's list
    sequence_feature = list(sequence_feature)

    feature_columns = []
    model_input = {}

    if 'weight' in sequence_feature:
        feature_columns.append(
            VarLenSparseFeat(SparseFeat(prefix + "weighted_seq", vocabulary_size=2,
                                        embedding_dim=embedding_size),
                             maxlen=3, length_name=prefix + "weighted_seq" + "_seq_length",
                             weight_name=prefix + "weight"))
        s_input, s_len_input = gen_sequence(2, 3, sample_size)

        model_input[prefix + "weighted_seq"] = s_input
        model_input[prefix + 'weight'] = np.random.randn(sample_size, 3, 1)
        model_input[prefix + "weighted_seq" + "_seq_length"] = s_len_input
        sequence_feature.pop(sequence_feature.index('weight'))

    for i in range(sparse_feature_num):
        dim = np.random.randint(1, 10)
        feature_columns.append(
            SparseFeat(prefix + 'sparse_feature_' + str(i), dim, embedding_size, dtype=torch.int32))
    for i in range(dense_feature_num):
        feature_columns.append(
            DenseFeat(prefix + 'dense_feature_' + str(i), 1, dtype=torch.float32))
    for i, mode in enumerate(sequence_feature):
        dim = np.random.randint(1, 10)
        maxlen = np.random.randint(1, 10)
        feature_columns.append(
            VarLenSparseFeat(SparseFeat(prefix + 'sequence_' + mode, vocabulary_size=dim,
                                        embedding_dim=embedding_size),
                             maxlen=maxlen, combiner=mode))

    for fc in feature_columns:
        if isinstance(fc, SparseFeat):
            model_input[fc.name] = np.random.randint(0, fc.vocabulary_size, sample_size)
        elif isinstance(fc, DenseFeat):
            model_input[fc.name] = np.random.random(sample_size)
        else:
            s_input, s_len_input = gen_sequence(
                fc.vocabulary_size, fc.maxlen, sample_size)
            model_input[fc.name] = s_input
            if include_length:
                fc.length_name = prefix + "sequence_" + str(i) + '_seq_length'
                model_input[prefix + "sequence_" + str(i) + '_seq_length'] = s_len_input

    if classification:
        y = np.random.randint(0, 2, sample_size)
    else:
        y = np.random.random(sample_size)

    return model_input, y, feature_columns
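
The model_input, y, and feature_columns produced by this helper can be used to smoke-test any deepctr_torch model. A minimal sketch, assuming DeepFM and reusing the same columns for both the linear and DNN parts (sizes and hyperparameters are illustrative only):

from deepctr_torch.models import DeepFM

model_input, y, feature_columns = get_test_data(sample_size=64, sparse_feature_num=2, dense_feature_num=2)
model = DeepFM(linear_feature_columns=feature_columns, dnn_feature_columns=feature_columns,
               task='binary', device='cpu')
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
model.fit(model_input, y, batch_size=16, epochs=1, verbose=2, validation_split=0.25)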
Example #3
def get_test_data(sample_size=1000,
                  sparse_feature_num=1,
                  dense_feature_num=1,
                  sequence_feature=('sum', 'mean', 'max'),
                  classification=True,
                  include_length=False,
                  hash_flag=False,
                  prefix=''):

    feature_columns = []

    for i in range(sparse_feature_num):
        dim = np.random.randint(1, 10)
        feature_columns.append(
            SparseFeat(prefix + 'sparse_feature_' + str(i), dim, hash_flag,
                       torch.int32))
    for i in range(dense_feature_num):
        feature_columns.append(
            DenseFeat(prefix + 'dense_feature_' + str(i), 1, torch.float32))
    for i, mode in enumerate(sequence_feature):
        dim = np.random.randint(1, 10)
        maxlen = np.random.randint(1, 10)
        feature_columns.append(
            VarLenSparseFeat(prefix + 'sequence_' + str(i), dim, maxlen, mode))

    model_input = []
    sequence_input = []
    sequence_len_input = []
    for fc in feature_columns:
        if isinstance(fc, SparseFeat):
            model_input.append(np.random.randint(0, fc.dimension, sample_size))
        elif isinstance(fc, DenseFeat):
            model_input.append(np.random.random(sample_size))
        else:
            s_input, s_len_input = gen_sequence(fc.dimension, fc.maxlen,
                                                sample_size)
            sequence_input.append(s_input)
            sequence_len_input.append(s_len_input)

    if classification:
        y = np.random.randint(0, 2, sample_size)
        while sum(y) < 0.3 * sample_size:
            y = np.random.randint(0, 2, sample_size)
    else:
        y = np.random.random(sample_size)

    x = model_input + sequence_input
    if include_length:
        for i, mode in enumerate(sequence_feature):
            dim = np.random.randint(1, 10)
            maxlen = np.random.randint(1, 10)
            feature_columns.append(
                SparseFeat(prefix + 'sequence_' + str(i) + '_seq_length',
                           1,
                           embedding=False))

        x += sequence_len_input

    return x, y, feature_columns
Example #4
def get_xy_fd():
    feature_columns = [
        SparseFeat('user', 3, embedding_dim=8),
        SparseFeat('gender', 2, embedding_dim=8),
        SparseFeat('item', 3 + 1, embedding_dim=8),
        SparseFeat('item_gender', 2 + 1, embedding_dim=8),
        DenseFeat('score', 1)
    ]

    feature_columns += [
        VarLenSparseFeat(SparseFeat('hist_item', 3 + 1, embedding_dim=8),
                         4,
                         length_name="seq_length"),
        VarLenSparseFeat(SparseFeat('hist_item_gender', 2 + 1,
                                    embedding_dim=8),
                         4,
                         length_name="seq_length")
    ]
    behavior_feature_list = ["item", "item_gender"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    igender = np.array([1, 2, 1])  # 0 is mask value
    score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
    hist_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])
    behavior_length = np.array([3, 3, 2])

    feature_dict = {
        'user': uid,
        'gender': ugender,
        'item': iid,
        'item_gender': igender,
        'hist_item': hist_iid,
        'hist_item_gender': hist_igender,
        'score': score,
        "seq_length": behavior_length
    }
    x = {
        name: feature_dict[name]
        for name in get_feature_names(feature_columns)
    }
    y = np.array([1, 0, 1])

    return x, y, feature_columns, behavior_feature_list
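
Because every history feature also carries its seq_length, the same dictionary works for interest-evolution models as well. A minimal sketch, assuming deepctr_torch's DIEN with negative sampling disabled (hyperparameters are illustrative only):

from deepctr_torch.models import DIEN

x, y, feature_columns, behavior_feature_list = get_xy_fd()
# use_negsampling=False because this toy data has no negative history sequences
model = DIEN(feature_columns, behavior_feature_list, use_negsampling=False, device='cpu')
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
history = model.fit(x, y, batch_size=3, epochs=5, verbose=2, validation_split=0.0)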
Example #5
    def build_model(
        self,
        embedding_dim=4,
        task='binary',
        optimizer='adam',
        loss='binary_crossentropy',
        metrics=['accuracy'],
        device='cpu',
    ):
        fixlen_feature_columns = [
            SparseFeat(
                feat,
                vocabulary_size=self.vocabulary_size_dict[feat],
                embedding_dim=embedding_dim,
            ) for feat in self.sparse_features
        ]

        if self.variable_length_features:
            varlen_feature_columns = [
                VarLenSparseFeat(
                    SparseFeat(
                        feat,
                        vocabulary_size=self.vocabulary_size_dict[feat],
                        embedding_dim=embedding_dim,
                    ),
                    maxlen=self.variable_length_features_max_len[feat],
                    combiner='mean',
                ) for feat in self.variable_length_features
            ]
        else:
            varlen_feature_columns = []

        linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
        dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns

        model = DeepFM(linear_feature_columns,
                       dnn_feature_columns,
                       task=task,
                       device=device)
        model.compile(optimizer, loss, metrics)
        return model
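
The compiled DeepFM returned by build_model follows deepctr_torch's usual fit/predict interface. A hedged usage sketch; the enclosing class is not shown here, so trainer, train_model_input, and train_labels are hypothetical names:

# `trainer` stands in for an instance of the (unshown) class that defines build_model above
model = trainer.build_model(embedding_dim=8, device='cuda:0' if torch.cuda.is_available() else 'cpu')
model.fit(trainer.train_model_input, trainer.train_labels,
          batch_size=256, epochs=10, verbose=2, validation_split=0.2)
pred_ans = model.predict(trainer.train_model_input, batch_size=256)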
Example #6
    genres_list = pad_sequences(
        genres_list,
        maxlen=max_len,
        padding='post',
    )

    # 2.count #unique features for each sparse field and generate feature config for sequence feature

    fixlen_feature_columns = [
        SparseFeat(feat, data[feat].nunique(), embedding_dim=4)
        for feat in sparse_features
    ]

    varlen_feature_columns = [
        VarLenSparseFeat(SparseFeat('genres',
                                    vocabulary_size=len(key2index) + 1,
                                    embedding_dim=4),
                         maxlen=max_len,
                         combiner='mean',
                         weight_name=None)
    ]  # Notice : value 0 is for padding for sequence input feature

    linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns

    feature_names = get_feature_names(linear_feature_columns +
                                      dnn_feature_columns)

    # 3.generate input data for model
    model_input = {name: data[name] for name in sparse_features}  #
    model_input["genres"] = genres_list

    # 4.Define Model,compile and train
Example #7
genres_length = np.array(list(map(len, genres_list)))
max_len = max(genres_length)
# Notice : padding=`post`
genres_list = pad_sequences(
    genres_list,
    maxlen=max_len,
    padding='post',
)

# 2.count #unique features for each sparse field and generate feature config for sequence feature

fixlen_feature_columns = [
    SparseFeat(feat, data[feat].nunique()) for feat in sparse_features
]
varlen_feature_columns = [
    VarLenSparseFeat('genres',
                     len(key2index) + 1, max_len, 'mean')
]  # Notice : value 0 is for padding for sequence input feature

linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

# 3.generate input data for model
model_input = {name: data[name] for name in feature_names}
model_input['genres'] = genres_list

# 4.Define Model,compile and train
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')

model.compile(
    "adam",
Example #8
        data[feat] = lbe.fit_transform(data[feat])
    # preprocess the sequence feature

    key2index = {}
    genres_list = list(map(split, data['Genres'].values))
    genres_length = np.array(list(map(len, genres_list)))
    max_len = max(genres_length)
    # Notice : padding=`post`
    genres_list = pad_sequences(genres_list, maxlen=max_len, padding='post', )

    # 2.count #unique features for each sparse field and generate feature config for sequence feature

    fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique(), embedding_dim=4)
                              for feat in sparse_features]

    varlen_feature_columns = [
        VarLenSparseFeat(SparseFeat('Genres', vocabulary_size=len(key2index) + 1, embedding_dim=4),
                         maxlen=max_len, combiner='mean')
    ]  # Notice : value 0 is for padding for sequence input feature

    linear_feature_columns = fixlen_feature_columns + varlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns + varlen_feature_columns

    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    # 3.generate input data for model
    model_input = {name: data[name] for name in sparse_features}  #
    model_input["Genres"] = genres_list

    # 4.Define Model,compile and train

    device = 'cpu'
    use_cuda = True
    if use_cuda and torch.cuda.is_available():
Example #9
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from deepctr_torch.models import *
from deepctr_torch.inputs import SparseFeat, DenseFeat, VarLenSparseFeat, get_feature_names
import torch
import torch.nn.functional as F

feature_columns = [
    SparseFeat('user', 3),
    SparseFeat('gender', 2),
    SparseFeat('item', 3 + 1),
    SparseFeat('item_gender', 2 + 1),
    DenseFeat('score', 1)
]
feature_columns += [
    VarLenSparseFeat('hist_item', 3 + 1, maxlen=4, embedding_name='item'),
    VarLenSparseFeat('hist_item_gender',
                     2 + 1,
                     maxlen=4,
                     embedding_name='item_gender')
]

behavior_feature_list = ["item", "item_gender"]
uid = np.array([0, 1, 2])
ugender = np.array([0, 1, 0])
iid = np.array([1, 2, 3])  # 0 is mask value
igender = np.array([1, 2, 1])  # 0 is mask value
score = np.array([0.1, 0.2, 0.3])

hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
hist_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])