Example #1
    def get_feature_columns(self):
        '''
        Get feature columns.
        '''
        file_name = "{stage}_{action}_{day}_concate_sample.csv".format(stage=self.stage, action=self.action,
                                                                       day=STAGE_END_DAY[self.stage])
        stage_dir = os.path.join(FLAGS.root_path, self.stage, file_name)
        self.df = pd.read_csv(stage_dir)
        sparse_features = ["userid", "feedid", "authorid", "bgm_singer_id", "bgm_song_id"]
        self.df[sparse_features] = self.df[sparse_features].fillna('-1', )
        for feat in sparse_features:
            lbe = LabelEncoder()
            self.df[feat] = lbe.fit_transform(self.df[feat])
        # mms = MinMaxScaler(feature_range=(0, 1))
        # data[dense_features] = mms.fit_transform(data[dense_features])

        # df[dense_features] = df[dense_features].fillna(0, )
        linear_feature_columns = list()
        dnn_feature_columns = [SparseFeat(feat, self.df[feat].nunique(), FLAGS.embed_dim, dtype=str) for feat in sparse_features]

        video_seconds = DenseFeat(name='videoplayseconds')
        device = DenseFeat(name='device')
        linear_feature_columns.append(video_seconds)
        linear_feature_columns.append(device)
        # behavior statistic features
        for b in FEA_COLUMN_LIST:
            feed_b = DenseFeat(b + "sum")
            linear_feature_columns.append(feed_b)
            user_b = DenseFeat(b + "sum_user")
            linear_feature_columns.append(user_b)
        return dnn_feature_columns, linear_feature_columns
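A minimal sketch of consuming the returned column lists, assuming a deepctr_torch-style DeepFM and that the (unshown) class keeps its label column in `df[action]`; `runner` is a hypothetical instance name:

# `runner` stands in for an instance of the class defining get_feature_columns().
dnn_feature_columns, linear_feature_columns = runner.get_feature_columns()
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
train_model_input = {name: runner.df[name] for name in feature_names}

model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
model.compile("adagrad", "binary_crossentropy", metrics=["auc"])
model.fit(train_model_input, runner.df[runner.action].values,
          batch_size=512, epochs=1, verbose=1)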
Example #2
def get_test_data(sample_size=1000,
                  sparse_feature_num=1,
                  dense_feature_num=1,
                  sequence_feature=('sum', 'mean', 'max'),
                  classification=True,
                  include_length=False,
                  hash_flag=False,
                  prefix=''):

    feature_columns = []

    for i in range(sparse_feature_num):
        dim = np.random.randint(1, 10)
        feature_columns.append(
            SparseFeat(prefix + 'sparse_feature_' + str(i), dim, hash_flag,
                       torch.int32))
    for i in range(dense_feature_num):
        feature_columns.append(
            DenseFeat(prefix + 'dense_feature_' + str(i), 1, torch.float32))
    for i, mode in enumerate(sequence_feature):
        dim = np.random.randint(1, 10)
        maxlen = np.random.randint(1, 10)
        feature_columns.append(
            VarLenSparseFeat(prefix + 'sequence_' + str(i), dim, maxlen, mode))

    model_input = []
    sequence_input = []
    sequence_len_input = []
    for fc in feature_columns:
        if isinstance(fc, SparseFeat):
            model_input.append(np.random.randint(0, fc.dimension, sample_size))
        elif isinstance(fc, DenseFeat):
            model_input.append(np.random.random(sample_size))
        else:
            s_input, s_len_input = gen_sequence(fc.dimension, fc.maxlen,
                                                sample_size)
            sequence_input.append(s_input)
            sequence_len_input.append(s_len_input)

    if classification:
        y = np.random.randint(0, 2, sample_size)
        while sum(y) < 0.3 * sample_size:
            y = np.random.randint(0, 2, sample_size)
    else:
        y = np.random.random(sample_size)

    x = model_input + sequence_input
    if include_length:
        for i, mode in enumerate(sequence_feature):
            feature_columns.append(
                SparseFeat(prefix + 'sequence_' + str(i) + '_seq_length',
                           1,
                           embedding=False))

        x += sequence_len_input

    return x, y, feature_columns
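A hypothetical call exercising this helper the way a model test might; the choice of DeepFM and the hyperparameters are assumptions, and note that the feature-column constructors above follow an older positional API:

x, y, feature_columns = get_test_data(sample_size=100, sparse_feature_num=2,
                                      dense_feature_num=2, sequence_feature=())
model = DeepFM(feature_columns, feature_columns, task='binary', device='cpu')
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
model.fit(x, y, batch_size=32, epochs=1, validation_split=0.2, verbose=0)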
Example #3
 def fit_test(self, train_X, train_Y, val_X, val_Y, test_X, test_Y,
              cat_cols):
     sparse_features = cat_cols
     dense_features = [
         idx for idx in range(train_X.shape[1]) if idx not in cat_cols
     ]
     sparse_feature_columns = [
         SparseFeat(str(feat),
                    vocabulary_size=len(set(train_X[:, feat])) + 1,
                    embedding_dim=4)
         for i, feat in enumerate(sparse_features)
     ]
     dense_feature_columns = [
         DenseFeat(
             str(feat),
             1,
         ) for feat in dense_features
     ]
     dnn_feature_columns = sparse_feature_columns + dense_feature_columns
     linear_feature_columns = sparse_feature_columns + dense_feature_columns
     feature_names = get_feature_names(linear_feature_columns +
                                       dnn_feature_columns)
     train_model_input = {
         name: train_X[:, int(name)]
         for name in feature_names
     }
     val_model_input = {name: val_X[:, int(name)] for name in feature_names}
     test_model_input = {
         name: test_X[:, int(name)]
         for name in feature_names
     }
     self.device = 'cpu'  # CPU fallback so self.device is always defined
     use_cuda = True
     if use_cuda and torch.cuda.is_available():
         print('cuda ready...')
         self.device = 'cuda:0'
     self.model = xDeepFM(linear_feature_columns,
                          dnn_feature_columns,
                          task='binary',
                          device=self.device)
     self.model.compile(
         Adam(self.model.parameters(), 0.0001),
         "binary_crossentropy",
         metrics=['binary_crossentropy'],
     )
     es = EarlyStopping(monitor='val_binary_crossentropy',
                        min_delta=0,
                        verbose=1,
                        patience=30,
                        mode='min')
     lbe = LabelEncoder()
     self.model.fit(train_model_input,
                    lbe.fit_transform(train_Y),
                    batch_size=512,
                    epochs=21,
                    verbose=2,
                    validation_data=(val_model_input, lbe.transform(val_Y)))
     pred_ans = self.model.predict(test_model_input, batch_size=256)
     print(f'{log_loss(test_Y, pred_ans):.5f}')
Example #4
def get_xy_fd(hash_flag=False):
    feature_columns = [
        SparseFeat('user', 4, embedding_dim=4, use_hash=hash_flag),
        SparseFeat('gender', 2, embedding_dim=4, use_hash=hash_flag),
        SparseFeat('item_id', 3 + 1, embedding_dim=8, use_hash=hash_flag),
        SparseFeat('cate_id', 2 + 1, embedding_dim=4, use_hash=hash_flag),
        DenseFeat('pay_score', 1)
    ]

    feature_columns += [
        VarLenSparseFeat(SparseFeat('hist_item_id',
                                    vocabulary_size=3 + 1,
                                    embedding_dim=8,
                                    embedding_name='item_id'),
                         maxlen=4,
                         length_name="seq_length"),
        VarLenSparseFeat(SparseFeat('hist_cate_id',
                                    vocabulary_size=2 + 1,
                                    embedding_dim=4,
                                    embedding_name='cate_id'),
                         maxlen=4,
                         length_name="seq_length")
    ]

    behavior_feature_list = ["item_id", "cate_id"]
    uid = np.array([0, 1, 2, 3])
    gender = np.array([0, 1, 0, 1])
    item_id = np.array([1, 2, 3, 2])  # 0 is mask value
    cate_id = np.array([1, 2, 1, 2])  # 0 is mask value
    score = np.array([0.1, 0.2, 0.3, 0.2])

    hist_item_id = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0],
                             [1, 2, 0, 0]])
    hist_cate_id = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0],
                             [1, 2, 0, 0]])

    behavior_length = np.array([3, 3, 2, 2])

    feature_dict = {
        'user': uid,
        'gender': gender,
        'item_id': item_id,
        'cate_id': cate_id,
        'hist_item_id': hist_item_id,
        'hist_cate_id': hist_cate_id,
        'pay_score': score,
        "seq_length": behavior_length
    }

    x = {
        name: feature_dict[name]
        for name in get_feature_names(feature_columns)
    }
    y = np.array([1, 0, 1, 0])
    return x, y, feature_columns, behavior_feature_list
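The returned tuple matches what deepctr_torch's run_din.py example consumes; a minimal training sketch in that spirit (hyperparameters are illustrative):

from deepctr_torch.models import DIN

x, y, feature_columns, behavior_feature_list = get_xy_fd()
model = DIN(feature_columns, behavior_feature_list, device='cpu',
            att_weight_normalization=True)
model.compile('adagrad', 'binary_crossentropy',
              metrics=['binary_crossentropy'])
history = model.fit(x, y, batch_size=3, epochs=10, verbose=2,
                    validation_split=0.0)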
Example #5
def get_test_data(sample_size=1000, embedding_size=4, sparse_feature_num=1, dense_feature_num=1,
                  sequence_feature=('sum', 'mean', 'max'), classification=True, include_length=False,
                  hash_flag=False, prefix=''):

    # Copy first: the pop() below would otherwise mutate the caller's list
    # (or a shared mutable default argument).
    sequence_feature = list(sequence_feature)

    feature_columns = []
    model_input = {}

    if 'weight' in sequence_feature:
        feature_columns.append(
            VarLenSparseFeat(SparseFeat(prefix + "weighted_seq", vocabulary_size=2, embedding_dim=embedding_size),
                             maxlen=3, length_name=prefix + "weighted_seq" + "_seq_length",
                             weight_name=prefix + "weight"))
        s_input, s_len_input = gen_sequence(2, 3, sample_size)

        model_input[prefix + "weighted_seq"] = s_input
        model_input[prefix + 'weight'] = np.random.randn(sample_size, 3, 1)
        model_input[prefix + "weighted_seq" + "_seq_length"] = s_len_input
        sequence_feature.pop(sequence_feature.index('weight'))


    for i in range(sparse_feature_num):
        dim = np.random.randint(1, 10)
        feature_columns.append(
            SparseFeat(prefix + 'sparse_feature_' + str(i), dim, embedding_size, dtype=torch.int32))
    for i in range(dense_feature_num):
        feature_columns.append(
            DenseFeat(prefix + 'dense_feature_' + str(i), 1, dtype=torch.float32))
    for i, mode in enumerate(sequence_feature):
        dim = np.random.randint(1, 10)
        maxlen = np.random.randint(1, 10)
        # Set length_name at construction time: VarLenSparseFeat is a namedtuple,
        # so assigning fc.length_name after the fact would raise AttributeError.
        length_name = prefix + 'sequence_' + str(i) + '_seq_length' if include_length else None
        feature_columns.append(
            VarLenSparseFeat(SparseFeat(prefix + 'sequence_' + mode, vocabulary_size=dim,
                                        embedding_dim=embedding_size),
                             maxlen=maxlen, combiner=mode, length_name=length_name))

    for fc in feature_columns:
        if isinstance(fc, SparseFeat):
            model_input[fc.name] = np.random.randint(0, fc.vocabulary_size, sample_size)
        elif isinstance(fc, DenseFeat):
            model_input[fc.name] = np.random.random(sample_size)
        else:
            s_input, s_len_input = gen_sequence(fc.vocabulary_size, fc.maxlen, sample_size)
            model_input[fc.name] = s_input
            if include_length:
                model_input[fc.length_name] = s_len_input

    if classification:
        y = np.random.randint(0, 2, sample_size)
    else:
        y = np.random.random(sample_size)

    return model_input, y, feature_columns
Example #6
def get_xy_fd():
    feature_columns = [
        SparseFeat('user', 3, embedding_dim=8),
        SparseFeat('gender', 2, embedding_dim=8),
        SparseFeat('item', 3 + 1, embedding_dim=8),
        SparseFeat('item_gender', 2 + 1, embedding_dim=8),
        DenseFeat('score', 1)
    ]

    feature_columns += [
        VarLenSparseFeat(SparseFeat('hist_item', 3 + 1, embedding_dim=8),
                         4,
                         length_name="seq_length"),
        VarLenSparseFeat(SparseFeat('hist_item_gender', 2 + 1,
                                    embedding_dim=8),
                         4,
                         length_name="seq_length")
    ]
    behavior_feature_list = ["item", "item_gender"]
    uid = np.array([0, 1, 2])
    ugender = np.array([0, 1, 0])
    iid = np.array([1, 2, 3])  # 0 is mask value
    igender = np.array([1, 2, 1])  # 0 is mask value
    score = np.array([0.1, 0.2, 0.3])

    hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
    hist_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])
    behavior_length = np.array([3, 3, 2])

    feature_dict = {
        'user': uid,
        'gender': ugender,
        'item': iid,
        'item_gender': igender,
        'hist_item': hist_iid,
        'hist_item_gender': hist_igender,
        'score': score,
        "seq_length": behavior_length
    }
    x = {
        name: feature_dict[name]
        for name in get_feature_names(feature_columns)
    }
    y = np.array([1, 0, 1])

    return x, y, feature_columns, behavior_feature_list
Example #7
    data[dense_features] = data[dense_features].fillna(0, )
    target = ['label']

    # 1.Label Encoding for sparse features,and do simple Transformation for dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2.count #unique features for each sparse field,and record dense feature field name

    fixlen_feature_columns = [
        SparseFeat(feat, data[feat].nunique()) for feat in sparse_features
    ] + [DenseFeat(
        feat,
        1,
    ) for feat in dense_features]
    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns

    feature_names = get_feature_names(linear_feature_columns +
                                      dnn_feature_columns)

    # 3.generate input data for model

    train, test = train_test_split(data,
                                   test_size=0.2,
                                   random_state=2020,
                                   shuffle=False)
    # litez: a dictionary of pd.Series
    train_model_input = {name: train[name] for name in feature_names}
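The example is truncated after the train input; the matching test input would presumably be built the same way (a sketch, following the pattern of the surrounding examples):

    test_model_input = {name: test[name] for name in feature_names}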
Example #8
def task(action):
    print('-----------action-----------', action)
    USE_FEAT = [action] + SELECT_FRTS
    train = pd.read_csv(ROOT_PATH + f'/train_data_for_{action}.csv')[USE_FEAT]
    train = train.sample(frac=1, random_state=42).reset_index(drop=True)
    print("posi prop:")
    print(sum((train[action] == 1) * 1) / train.shape[0])
    test = pd.read_csv(ROOT_PATH + '/test_data.csv')[SELECT_FRTS]
    target = [action]
    test[target[0]] = 0
    test = test[USE_FEAT]
    data = pd.concat((train, test)).reset_index(drop=True)
    print(train.shape, test.shape, data.shape)
    dense_features = DENSE_FEATURE
    sparse_features = [
        i for i in USE_FEAT if i not in dense_features and i not in target
    ]

    data[sparse_features] = data[sparse_features].fillna(0)
    data[dense_features] = data[dense_features].fillna(0)

    # 1.Label Encoding for sparse features,and do simple Transformation for dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2.count #unique features for each sparse field,and record dense feature field name
    fixlen_feature_columns = [
        SparseFeat(feat, data[feat].nunique()) for feat in sparse_features
    ] + [DenseFeat(
        feat,
        1,
    ) for feat in dense_features]
    #
    dnn_feature_columns = fixlen_feature_columns
    #linear_feature_columns = [SparseFeat(feat, data[feat].nunique())
    #                         for feat in sparse_features]
    linear_feature_columns = fixlen_feature_columns

    feature_names = get_feature_names(linear_feature_columns +
                                      dnn_feature_columns)

    # 3.generate input data for model
    train, test = data.iloc[:train.shape[0]].reset_index(
        drop=True), data.iloc[train.shape[0]:].reset_index(drop=True)
    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}
    #-------
    eval_ratio = 0.
    eval_df = train[int((1 - eval_ratio) *
                        train.shape[0]):].reset_index(drop=True)
    userid_list = eval_df['userid'].astype(str).tolist()
    print('val len:', len(userid_list))

    # 4.Define Model,train,predict and evaluate
    device = 'cpu'
    use_cuda = True
    if use_cuda and torch.cuda.is_available():
        print('cuda ready...')
        device = 'cuda:0'

    model = MyDeepFM(linear_feature_columns=linear_feature_columns,
                     dnn_feature_columns=dnn_feature_columns,
                     use_fm=True,
                     dnn_hidden_units=(256, 128),
                     l2_reg_linear=1e-1,
                     l2_reg_embedding=0.00001,
                     l2_reg_dnn=0,
                     init_std=0.0001,
                     seed=1024,
                     dnn_dropout=0.,
                     dnn_activation='relu',
                     dnn_use_bn=False,
                     task='binary',
                     device=device)

    model.compile("adagrad", "binary_crossentropy", metrics=["auc"])

    history = model.fit(train_model_input,
                        train[target].values,
                        batch_size=1024,
                        epochs=NUM_EPOCH_DICT[action],
                        verbose=1,
                        validation_split=eval_ratio,
                        userid_list=userid_list)
    pred_ans = model.predict(test_model_input, 128)
    #submit[action] = pred_ans
    torch.cuda.empty_cache()
    return pred_ans
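A hypothetical driver for the function above; `ACTION_LIST` and the submission columns are assumptions inferred from the variable names used:

submit = pd.read_csv(ROOT_PATH + '/test_data.csv')[['userid', 'feedid']]
for action in ACTION_LIST:  # e.g. the per-action labels this dataset defines
    submit[action] = task(action)
submit.to_csv('submission.csv', index=False)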
Example #9
INNER_DIM = args.inner_dim
if INNER_DIM <= 0:
    INNER_DIM = None
BATCH = args.batch
OUTER_DIM = args.embd_dim
#data = pd.read_csv('../../preprocessed/criteo_train.csv')
data = pickle.load(open('../preprocessed/preprocessed_avazu.pkl','rb'))
header_names = ['id', 'click', 'hour', 'C1', 'banner_pos', 'site_id', 'site_domain',
       'site_category', 'app_id', 'app_domain', 'app_category', 'device_id',
       'device_ip', 'device_model', 'device_type', 'device_conn_type', 'C14',
       'C15', 'C16', 'C17', 'C18', 'C19', 'C20', 'C21']
sparse_features = header_names[3:]
dense_features = ['hour']
target = ['click']
# 2.count #unique features for each sparse field,and record dense feature field name
fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique(),embedding_dim=OUTER_DIM) for feat in sparse_features] + [DenseFeat(feat, 1, ) for feat in dense_features]
dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns
feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)
# 3.generate input data for model
train, test = train_test_split(data, test_size=0.1,random_state=42)
train_model_input = {name: train[name] for name in feature_names}
test_model_input = {name: test[name] for name in feature_names}
train_model_labels = train[target].values
test_model_labels = test[target].values
# memory optimization
import gc
del data
data = None
gc.collect()
# 4.Define Model,train,predict and evaluate
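The snippet stops at step 4; a minimal sketch of that step with DeepFM (the model choice and hyperparameters are assumptions, and `torch`, `DeepFM`, `log_loss`, `roc_auc_score` are assumed imported as in the other examples):

device = 'cuda:0' if torch.cuda.is_available() else 'cpu'
model = DeepFM(linear_feature_columns, dnn_feature_columns,
               task='binary', device=device)
model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy', 'auc'])
model.fit(train_model_input, train_model_labels,
          batch_size=BATCH, epochs=3, verbose=2, validation_split=0.1)
pred_ans = model.predict(test_model_input, batch_size=BATCH)
print('test LogLoss', round(log_loss(test_model_labels, pred_ans), 4))
print('test AUC', round(roc_auc_score(test_model_labels, pred_ans), 4))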
Example #10
    data[sparse_features] = data[sparse_features].fillna('-1', )
    data[dense_features] = data[dense_features].fillna(0, )
    target = ['label']

    # 1.Label Encoding for sparse features,and do simple Transformation for dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2.count #unique features for each sparse field,and record dense feature field name

    fixlen_feature_columns =  [SparseFeat(feat, data[feat].nunique())
                           for feat in sparse_features] + [DenseFeat(feat, 1,)
                          for feat in dense_features]



    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns

    fixlen_feature_names = get_fixlen_feature_names(linear_feature_columns + dnn_feature_columns)

    # 3.generate input data for model

    train, test = train_test_split(data, test_size=0.2)
    train_model_input = [train[name] for name in fixlen_feature_names]
    test_model_input = [test[name] for name in fixlen_feature_names]
Example #11
    dense_features = ['I' + str(i) for i in range(1, 14)]

    data[sparse_features] = data[sparse_features].fillna('-1', )
    data[dense_features] = data[dense_features].fillna(0, )
    target = ['label']

    # 1.Label Encoding for sparse features,and do simple Transformation for dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[dense_features] = mms.fit_transform(data[dense_features])

    # 2.count #unique features for each sparse field,and record dense feature field name

    fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique()) for feat in sparse_features] + [DenseFeat(feat, 1, ) for feat in dense_features]

    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns

    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    # 3.generate input data for model

    train, test = train_test_split(data, test_size=0.2)

    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}

    # 4.Define Model,train,predict and evaluate
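Step 4 is cut off here; a minimal sketch consistent with deepctr_torch's Criteo example (the choice of DeepFM is an assumption):

    model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
    model.compile('adam', 'binary_crossentropy', metrics=['binary_crossentropy'])
    model.fit(train_model_input, train[target].values,
              batch_size=256, epochs=10, verbose=2, validation_split=0.2)
    pred_ans = model.predict(test_model_input, batch_size=256)
    print('test LogLoss', round(log_loss(test[target].values, pred_ans), 4))
    print('test AUC', round(roc_auc_score(test[target].values, pred_ans), 4))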
Example #12
import numpy as np
import pandas as pd
from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from deepctr_torch.models import *
from deepctr_torch.inputs import SparseFeat, DenseFeat, VarLenSparseFeat, get_feature_names
import torch
import torch.nn.functional as F

feature_columns = [
    SparseFeat('user', 3),
    SparseFeat('gender', 2),
    SparseFeat('item', 3 + 1),
    SparseFeat('item_gender', 2 + 1),
    DenseFeat('score', 1)
]
feature_columns += [
    VarLenSparseFeat('hist_item', 3 + 1, maxlen=4, embedding_name='item'),
    VarLenSparseFeat('hist_item_gender',
                     2 + 1,  # must match item_gender's vocabulary, since the embedding is shared
                     maxlen=4,
                     embedding_name='item_gender')
]

behavior_feature_list = ["item", "item_gender"]
uid = np.array([0, 1, 2])
ugender = np.array([0, 1, 0])
iid = np.array([1, 2, 3])  # 0 is mask value
igender = np.array([1, 2, 1])  # 0 is mask value
score = np.array([0.1, 0.2, 0.3])
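The example breaks off here; a plausible continuation, mirroring Example #6 above (the history arrays, lengths, and labels below are illustrative assumptions):

hist_iid = np.array([[1, 2, 3, 0], [1, 2, 3, 0], [1, 2, 0, 0]])
hist_igender = np.array([[1, 1, 2, 0], [2, 1, 1, 0], [2, 1, 0, 0]])

feature_dict = {'user': uid, 'gender': ugender, 'item': iid,
                'item_gender': igender, 'hist_item': hist_iid,
                'hist_item_gender': hist_igender, 'score': score}
x = {name: feature_dict[name] for name in get_feature_names(feature_columns)}
y = np.array([1, 0, 1])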