Example #1
def run_deepfm_model():
    train, test, train_model_input, test_model_input, dnn_feature_columns, linear_feature_columns, feature_names, target = read_data_as_model()

    # Define the model, then train, predict, and evaluate
    model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
    model.compile(
        "adam",
        "binary_crossentropy",
        metrics=['binary_crossentropy'],
    )

    model.fit(
        train_model_input,
        train[target].values,
        batch_size=256,
        epochs=10,
        verbose=2,
        validation_split=0.2,
    )
    pred_ans = model.predict(test_model_input, batch_size=256)
    print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
    print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))
    return pred_ans, test[target].values, round(
        roc_auc_score(test[target].values, pred_ans), 4), 'deepfm'
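
Example #1 relies on a read_data_as_model() helper that is not shown. A minimal sketch of what such a helper could look like, assuming a Criteo-style CSV and the standard deepctr preprocessing; the file path, column names, and 'label' target are assumptions, not part of the original snippet:

# Hypothetical read_data_as_model(); everything below is an assumed sketch.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from deepctr.feature_column import SparseFeat, DenseFeat, get_feature_names

def read_data_as_model(path='criteo_sample.txt'):
    data = pd.read_csv(path)
    sparse_features = ['C' + str(i) for i in range(1, 27)]  # assumed column names
    dense_features = ['I' + str(i) for i in range(1, 14)]   # assumed column names
    target = ['label']

    data[sparse_features] = data[sparse_features].fillna('-1')
    data[dense_features] = data[dense_features].fillna(0)
    for feat in sparse_features:
        data[feat] = LabelEncoder().fit_transform(data[feat])
    data[dense_features] = MinMaxScaler().fit_transform(data[dense_features])

    fixlen_feature_columns = [SparseFeat(feat, data[feat].nunique(), embedding_dim=4)
                              for feat in sparse_features] + \
                             [DenseFeat(feat, 1) for feat in dense_features]
    linear_feature_columns = dnn_feature_columns = fixlen_feature_columns
    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    train, test = train_test_split(data, test_size=0.2)
    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}
    return (train, test, train_model_input, test_model_input,
            dnn_feature_columns, linear_feature_columns, feature_names, target)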
Example #2
def train_deepFM():
    k = featureengineer.k
    # Fill missing values and encode features
    data, appsnum, tags_nums = trainmodel.data, trainmodel.appsnum, trainmodel.tags_nums
    data[trainmodel.sparse_features] = data[trainmodel.sparse_features].fillna('-1')
    for feat in trainmodel.dense_features:
        data[feat] = data[feat].fillna(data[feat].mean())

    for feat in trainmodel.sparse_features:
        data[feat] = data[feat].astype(str)
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    mms = MinMaxScaler(feature_range=(0, 1))
    data[trainmodel.dense_features] = mms.fit_transform(data[trainmodel.dense_features])


    # Convert the data into deepctr's input format
    fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].max()+1, embedding_dim=8)
                              for feat in trainmodel.sparse_features] + \
                             [DenseFeat(feat, 1, ) for feat in trainmodel.dense_features]

    lgbOut_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].max()+1, embedding_dim=1)
                              for feat in trainmodel.lgbOut_Features]

    key2index_len = {'applist': appsnum+1, 'new_tag': tags_nums}
    varlen_features = [VarLenSparseFeat(feat, vocabulary_size=key2index_len[feat], maxlen=k,
                                        embedding_dim=8, combiner='mean', weight_name=None)
                       for feat in trainmodel.var_features]

    dnn_feature_columns = fixlen_feature_columns + varlen_features
    linear_feature_columns = fixlen_feature_columns + varlen_features + lgbOut_feature_columns

    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    sparse_dense_features = trainmodel.sparse_features + trainmodel.dense_features + trainmodel.lgbOut_Features

    train, test = train_test_split(data, test_size=0.2)


    train_model_input = {name: train[name] for name in sparse_dense_features}
    test_model_input = {name: test[name] for name in sparse_dense_features}
    for x in trainmodel.var_features:
        if x == 'applist':
            train_model_input[x] = np.array(train[x].tolist())
            test_model_input[x] = np.array(test[x].tolist())
        if x == 'new_tag':
            train_model_input[x] = np.array(train[x].tolist())-appsnum
            test_model_input[x] = np.array(test[x].tolist())-appsnum
    # Model
    model = DeepFM(linear_feature_columns=linear_feature_columns, dnn_feature_columns=dnn_feature_columns,
                   dnn_hidden_units=(50, 30, 30), l2_reg_linear=0.001, l2_reg_embedding=0.001,
                   l2_reg_dnn=0, init_std=0.0001, seed=1024, dnn_dropout=0.1, dnn_activation='relu', dnn_use_bn=True,
                   task='binary')
    model.compile("adam", "binary_crossentropy",metrics=['AUC'], )

    history = model.fit(train_model_input, train['target'].values,
                        batch_size=256, epochs=1, verbose=2, validation_split=0.2, )

    pred_ans = model.predict(test_model_input, batch_size=256)
    print("test AUC", round(roc_auc_score(test['target'].values, pred_ans), 4))
Example #3
def train_model(train, test, linear_feature, dnn_feature):

    model = DeepFM(linear_feature, dnn_feature, task='binary')
    model.compile(
        "adam",
        "binary_crossentropy",
        metrics=['AUC'],
    )
    history = model.fit(
        *train,
        batch_size=512,
        epochs=5,
        verbose=2,
        validation_split=0.1,
    )
    pred_ans = model.predict(test[0], batch_size=512)
    print("test LogLoss", round(log_loss(test[1], pred_ans), 4))
    print("test AUC", round(roc_auc_score(test[1], pred_ans), 4))
Example #4
def deepfm_model(linear_feature_columns, dnn_feature_columns,
                 train_model_input, train, test_model_input, test):
    cols = ['model', 'RMSE', 'MAE', 'MSE', 'AUC', 'score']
    df_result = pd.DataFrame(columns=cols, index=range(1))
    model = DeepFM(linear_feature_columns,
                   dnn_feature_columns,
                   dnn_hidden_units=config.deepfm_att["dnn_hidden_units"],
                   init_std=config.deepfm_att["init_std"],
                   seed=config.deepfm_att["seed"],
                   dnn_dropout=config.deepfm_att["dnn_dropout"],
                   dnn_activation=config.deepfm_att["dnn_activation"],
                   task=config.deepfm_att["task"],
                   fm_group=config.deepfm_att["fm_group"],
                   dnn_use_bn=config.deepfm_att["dnn_use_bn"])

    model.compile("adam", "mse", metrics=['mse'])

    history = model.fit(train_model_input,
                        train[target].values,
                        batch_size=256,
                        epochs=config.model_epoch['epoch'],
                        verbose=2,
                        validation_split=0.2)

    pred_ans = model.predict(test_model_input, batch_size=256)
    save_model(model, 'saved_deepfm.h5')  # persist the trained model
    auc = roc_auc_score(test[target].values, pred_ans)

    df_result.loc[0, 'model'] = "DeepFM"
    df_result.loc[0, 'RMSE'] = np.round(
        math.sqrt(mean_squared_error(test[target].values, pred_ans)), 3)
    df_result.loc[0, 'MAE'] = np.round(
        mean_absolute_error(test[target].values, pred_ans), 3)
    df_result.loc[0, 'MSE'] = np.round(
        mean_squared_error(test[target].values, pred_ans), 3)
    df_result.loc[0, 'AUC'] = np.round(auc, 3)
    #df_result.loc[0].score=(1/df_result.iloc[0]['RMSE'])*(1/df_result.iloc[0]['MAE'])*(2*df_result.iloc[0]['AUC'])
    return df_result
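
Example #4 pulls its hyperparameters from a config module that is not shown. A plausible sketch of the referenced entries, filled in with DeepFM's documented defaults; every value here is an assumption:

# Hypothetical config module; values mirror DeepFM's defaults, task is assumed.
deepfm_att = {
    "dnn_hidden_units": (256, 128, 64),
    "init_std": 0.0001,
    "seed": 1024,
    "dnn_dropout": 0,
    "dnn_activation": "relu",
    "task": "binary",
    "fm_group": ["default_group"],
    "dnn_use_bn": False,
}
model_epoch = {"epoch": 10}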
Example #5
model.compile(optimizer="adam",
              loss="binary_crossentropy",
              metrics=['accuracy', 'AUC'])
history = model.fit(train_model_input,
                    train[target],
                    batch_size=8192,
                    epochs=5,
                    verbose=1,
                    shuffle=True,
                    callbacks=[cp_callback],
                    validation_data=(val_model_input, val[target]))

data['predict'] = 0
data.loc[train_index, 'predict'] = model.predict(train_model_input,
                                                 batch_size=8192)
data.loc[val_index, 'predict'] = model.predict(val_model_input,
                                               batch_size=8192)
data.loc[test_index, 'predict'] = model.predict(test_model_input,
                                                batch_size=8192)

p = 88.5
pred_val = data.loc[val_index, 'predict']
print("val LogLoss", round(log_loss(val[target], pred_val), 4))
threshold_val = round(np.percentile(pred_val, p), 4)
pred_val2 = [1 if i > threshold_val else 0 for i in pred_val]
print("val F1 >%s" % threshold_val, round(f1_score(val[target], pred_val2), 4))

pred_train_val = data.loc[data['isTest'] != 1, 'predict']
print(
    "train_val LogLoss",
    round(log_loss(data.loc[data['isTest'] != 1, target], pred_train_val), 4))
Example #6
    feature_names = get_feature_names(linear_feature_columns +
                                      dnn_feature_columns)

    # 3. Generate input data for the model

    train, test = train_test_split(data, test_size=0.2)

    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}

    # 4. Define the model, then train, predict, and evaluate
    model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary')
    model = multi_gpu_model(model, gpus=2)

    model.compile(
        "adam",
        "binary_crossentropy",
        metrics=['binary_crossentropy'],
    )

    history = model.fit(
        train_model_input,
        train[target].values,
        batch_size=256,
        epochs=10,
        verbose=2,
        validation_split=0.2,
    )
    pred_ans = model.predict(test_model_input, batch_size=256)
    print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
    print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))
Example #7
hist = model.fit(train_and_val_model_input,
                 y_train_val,
                 batch_size=BATCH_SIZE,
                 epochs=EPOCHS,
                 verbose=2,
                 validation_split=0.1,
                 shuffle=True,
                 callbacks=get_callbacks())

# In[ ]:

best_epoch = np.argmax(hist.history["val_auroc"]) + 1
model.load_weights('checkpoint/epoch_{:02d}.hdf5'.format(best_epoch))
print(hist.history["val_auroc"])
print('loading epoch_{:02d}.hdf5'.format(best_epoch))

y_pre = model.predict(online_test_model_input,
                      verbose=1,
                      batch_size=BATCH_SIZE)
res = pd.DataFrame()
res['id'] = test_id
res['probability'] = y_pre
res.to_csv('submission_DeepFM_fibinet_feature.csv', index=False)

# pred_ans = pred_ans.flatten()
# ans = pd.DataFrame(data={'id': np.array(
#     [i for i in range(1, pred_ans.shape[0]+1)]), 'probability': pred_ans})
# ans.to_csv('submission_DeepFM.csv', index=False, header=True)

# del model
# gc.collect()
# # In[ ]:
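
get_callbacks() and the val_auroc history key come from code that is not shown. A sketch consistent with how the snippet uses them: the model would be compiled with an AUC metric named 'auroc', and a checkpoint written per epoch under checkpoint/ (the exact settings are assumptions):

# Hypothetical get_callbacks(); consistent with hist.history["val_auroc"] and
# the checkpoint/epoch_{:02d}.hdf5 files loaded above. Assumes the model was
# compiled with metrics=[tf.keras.metrics.AUC(name='auroc')].
import tensorflow as tf

def get_callbacks():
    return [
        tf.keras.callbacks.ModelCheckpoint(
            'checkpoint/epoch_{epoch:02d}.hdf5',
            save_weights_only=True),  # one weights file per epoch
    ]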
Example #8
        model.load_weights('model_save/deep_fm_sample-ep001-loss0.184-val_loss0.172.h5')

        # model = load_model('model_save/deep_fm_sample-ep001-loss0.192-val_loss0.176.h5')

        data = pd.read_csv("./data/sample/validation.txt")

        # 1. Label encoding for sparse features, and simple transformation for dense features
        for feat in sparse_features:
            lbe = LabelEncoder()
            data[feat] = lbe.fit_transform(data[feat])
        # 2.count #unique features for each sparse field
        sparse_feature_dim = {feat: data[feat].nunique()
                              for feat in sparse_features}
        # 3.generate input data for model
        model_input = [data[feat].values for feat in sparse_feature_dim]

        pred = model.predict(model_input, batch_size=batch_size, verbose=1)
        label = data[target].values.flatten().tolist()
        pred = pred.flatten().tolist()
        with open('data/pctr', 'w') as fw:
            for i in range(len(pred)):
                if i % 10000 == 0:
                    print('label: %f, pred: %f' % (label[i], pred[i]))
                to_write = str(i+1)+','+str(label[i])+','+str(pred[i])+'\n'
                fw.write(to_write)
        AUC = auc.auc(label, pred)
        print('auc: %f' % AUC)

    print("demo done")
Example #9
print(model_input)
# print(model_input.shape)
# 4. Define the model, compile, and train
model = DeepFM(
    {
        "sparse": sparse_feat_list,
        "dense": dense_feat_list,
        "sequence": sequence_feature
    },
    final_activation='linear',
    embedding_size=8,
    use_fm=False,
    hidden_size=(64, 64))

model.compile(
    "adam",
    "mape",
    metrics=['mape'],
)
history = model.fit(
    model_input,
    df_train[target].values,
    batch_size=2048,
    epochs=200,
    verbose=2,
    validation_split=0.2,
)
pred = model.predict(model_input)
print(pred)
print(smape(df_train[target].values, pred))
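
smape() is not defined in the snippet. A common definition of symmetric MAPE, shown here as an assumption about what the helper computes:

# Assumed SMAPE helper; the original smape() is defined elsewhere.
import numpy as np

def smape(y_true, y_pred):
    y_true, y_pred = np.ravel(y_true), np.ravel(y_pred)
    return 100.0 * np.mean(2.0 * np.abs(y_pred - y_true) /
                           (np.abs(y_true) + np.abs(y_pred)))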
Example #10
X = data[feats]
y = data['Rating']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

sparse_features = [
    'UserID', 'MovieID', 'Gender', 'Occupation', 'day', 'weekday'
]
dense_features = ['hour', 'Age']

fixlen_feature_columns = [SparseFeat(feat, vocabulary_size=data[feat].nunique(),embedding_dim=4) for feat in sparse_features] + \
                         [DenseFeat(feat, 1) for feat in dense_features]

dnn_feature_columns = fixlen_feature_columns
linear_feature_columns = fixlen_feature_columns

# Rating prediction is a regression task, so use task='regression' with an MSE metric
model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression')
model.compile('adam', 'mse', metrics=['mse'])

feature_names = get_feature_names(fixlen_feature_columns)

train_feed_dict = {name: X_train[name] for name in feature_names}
test_feed_dict = {name: X_test[name] for name in feature_names}

model.fit(train_feed_dict,
          y_train,
          batch_size=256,
          epochs=10,
          validation_split=0.2)
pred_ans = model.predict(test_feed_dict, batch_size=256)
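
The snippet stops right after predict. An evaluation step in the style of the other examples, added here as a sketch:

# Evaluation sketch for the rating predictions above; not in the original.
from sklearn.metrics import mean_absolute_error, mean_squared_error
print("test MSE", round(mean_squared_error(y_test.values, pred_ans), 4))
print("test MAE", round(mean_absolute_error(y_test.values, pred_ans), 4))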
Example #11
    dnn_feature_columns = fixlen_feature_columns
    linear_feature_columns = fixlen_feature_columns

    feature_names = get_feature_names(linear_feature_columns + dnn_feature_columns)

    # 3.generate input data for model

    train, valid = train_test_split(data, test_size=0.2, random_state=10)

    train_model_input = {name:train[name] for name in feature_names}
    valid_model_input = {name:valid[name] for name in feature_names}
    test_model_input = {name:test[name] for name in feature_names}
    
    # 4. Define the model, then train, predict, and evaluate
    # dnn_hidden_units sets the number of hidden layers and the units per layer
    model = DeepFM(linear_feature_columns, dnn_feature_columns, task='binary',
                   dnn_hidden_units=[100, 100], dnn_dropout=0.2)
    model.compile("adam", "binary_crossentropy",
                  metrics=['binary_crossentropy'])

    history = model.fit(train_model_input, train[target].values,
                        batch_size=256, epochs=2, verbose=2,
                        validation_data=(valid_model_input, valid[target].values))
    pred_ans = model.predict(valid_model_input, batch_size=256)
    print("valid LogLoss", round(log_loss(valid[target].values, pred_ans), 4))
    print("valid AUC", round(roc_auc_score(valid[target].values, pred_ans), 4))
    # Predict on the test set and write the results to a CSV file
    result = model.predict(test_model_input, batch_size=256)

    result = pd.DataFrame(result, columns=['label'])
    submit = pd.DataFrame(test['ID'], columns=['ID'])
    submit = submit.join(result)
    submit.to_csv(os.path.join(sys.path[0], 'tem', 'result.csv'), index=False)
Example #12
from deepctr.models import DeepFM

if __name__ == "__main__":

    data = pd.read_csv("./movielens_sample.txt")
    sparse_features = ["movie_id", "user_id",
                       "gender", "age", "occupation", "zip"]
    target = ['rating']

    # 1. Label encoding for sparse features, and simple transformation for dense features
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])
    # 2.count #unique features for each sparse field
    sparse_feature_dim = {feat: data[feat].nunique()
                          for feat in sparse_features}
    # 3.generate input data for model
    train, test = train_test_split(data, test_size=0.2)
    train_model_input = [train[feat].values for feat in sparse_feature_dim]
    test_model_input = [test[feat].values for feat in sparse_feature_dim]
    # 4. Define the model, then train, predict, and evaluate
    model = DeepFM({"sparse": sparse_feature_dim, "dense": []},
                   final_activation='linear')
    model.compile("adam", "mse", metrics=['mse'],)

    history = model.fit(train_model_input, train[target].values,
                        batch_size=256, epochs=1, verbose=2, validation_split=0.2,)
    pred_ans = model.predict(test_model_input, batch_size=256)
    print("test MSE", round(mean_squared_error(
        test[target].values, pred_ans), 4))
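
Example #12 uses the legacy deepctr interface (feature dicts plus final_activation), which later releases replaced with feature-column lists. A sketch of the equivalent model definition under the newer API (deepctr 0.8+ is an assumption):

# Modern deepctr equivalent of the legacy dict-based call above (sketch).
from deepctr.feature_column import SparseFeat
feature_columns = [SparseFeat(feat, vocabulary_size=dim, embedding_dim=4)
                   for feat, dim in sparse_feature_dim.items()]
model = DeepFM(feature_columns, feature_columns, task='regression')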
Example #13
class RecommenderDeepNN:
    '''
    Recommender for Yelp dataset using the deepFM model.

    Parameters
    ----------
    category: 'restaurants', Keep only businesses of a certain category
        - Options: 'restaurants', 'automotive', 'shopping'
    min_review: 5, Keep only businesses with a review_count above this value
    min_category: 50, Keep only categories that apply to more than this number of businesses
    weight:  False, Whether or not to use weights for the attribute matrix in the DeepFM
    scaler: 'minmax', Scaler for dense features
    optimizer: "adam", Optimizer for the DeepFM
    loss: 'mse', Loss function for the DeepFM
    batch_size: 256, 
    epochs: 10, 
    train_size: 0.8,
    deepfm__dnn_hidden_units: (128, 128),
    deepfm__l2_reg_linear: 1e-05,
    deepfm__l2_reg_embedding: 1e-05,
    deepfm__l2_reg_dnn: 0,
    deepfm__seed: 1024,
    deepfm__dnn_dropout: 0,
    deepfm__dnn_activation: 'relu'

    Example
    -------
    deepnn = RecommenderDeepNN(deepfm__seed=2048)
    deepnn.load_data(config.JSON_BUSINESS, config.CSV_RATINGS)
    deepnn.fit()
    deepnn.topN(260, n=5)

    deepnn = RecommenderDeepNN(scaler='standard', train_size=0.99)    
    deepnn.fit(config.JSON_BUSINESS, config.CSV_RATINGS)
    '''    

    def __init__(self, **kwargs):
        '''
        Parameters
        ----------
        path_business: Path to the business.json file that contains 'attributes' and 'categories' as dictionaries for all businesses
        path_ratings: Path to the ratings.csv file that contains 'user_id', 'business_id' and 'stars'. The review text is not needed here.
        '''
        self.path_business = ""
        self.path_ratings = ""
        self.features_sparse = features_sparse
        self.features_dense = features_dense
        
        self.params = params_deepnn
        self.params_deepfm = {}
        self.business = None
        self.data = None

        self.attr2index = {}
        self.raw_to_iid = {}
        self.iid_to_raw = {}
        self.raw_to_uid = {}
        self.uid_to_raw = {}

        # Label encoders
        self.lbe_user = None
        self.lbe_item = None

        self.model = None
        self.features_linear = []
        self.features_dnn = []
        self.model_input = {}


        self.update_params(**kwargs)

    def load_data(self, path_business, path_ratings):
        '''
        Load data and transform it to usable format.
        '''
        print("Loading data ...")
        
        self.path_business = path_business
        self.path_ratings = path_ratings
        
        df = pd.read_json(self.path_business, lines=True, encoding='utf-8')
        df_ratings = pd.read_csv(self.path_ratings)
        df_ratings.rename({'stars':'rating'}, axis=1, inplace=True)

        to_keep = config.Keywords_Categories[self.params['category']]
        keeprows = utils.filter_business_with_categories(df, to_keep)
        df = df[keeprows]

        # Map user_id and business_id encodings to integers
        self.uid_to_raw = dict(df_ratings['user_id'].drop_duplicates().reset_index()['user_id'])
        self.raw_to_uid = {k:v for v, k in self.uid_to_raw.items()}
        self.iid_to_raw = dict(df['business_id'])
        self.raw_to_iid = {k:v for v, k in self.iid_to_raw.items()}

        self.business = df[['business_id', 'name', 'stars', 'review_count', 'categories']]

        df = df[df['review_count'] > self.params['min_review']]
        df = df_ratings.join(df[['business_id', 'stars', 'review_count', 'categories']].set_index('business_id'), on='business_id', how='right')
        # Has to be "right"... otherwise there will be NaNs
        # Also, use df.set_index() because df is smaller in size

        df['user_id'] = df['user_id'].map(self.raw_to_uid)
        df['business_id'] = df['business_id'].map(self.raw_to_iid)
        
        self.lbe_user = LabelEncoder()
        self.lbe_item = LabelEncoder()
        df['user_id'] = self.lbe_user.fit_transform(df['user_id'])
        df['business_id'] = self.lbe_item.fit_transform(df['business_id'])
        # x = lbe_user.inverse_transform(df_ratings['user_id'])
        # y = lbe_item.inverse_transform(df_ratings['business_id'])
        
        if(self.params['scaler'] == 'minmax'):
            scaler = MinMaxScaler(feature_range=(0,1))
        elif(self.params['scaler'] == 'standard'):
            scaler = StandardScaler()
        df[self.features_dense] = scaler.fit_transform(df[self.features_dense])

        lbe = LabelEncoder()
        for var in self.features_sparse:
            if(var not in ['business_id', 'user_id']):
                df[var] = lbe.fit_transform(df[var])

        self.data = df
        
        del df, df_ratings

    def _compile_business_categories(self, df_business):
        '''
        Find all the categories that apply to the businesses in the DataFrame df_business
        '''
        categories = Counter()
        for line in df_business['categories']:
            if(isinstance(line, str)):
                categories.update(re.split(', ', line))
        categories = pd.DataFrame.from_dict(categories, orient='index', columns=['count'])
        return categories

    def _build_category_dict(self, drop_categories=[]):
        attrs = self._compile_business_categories(self.data)
        attrs = attrs[attrs['count'] > self.params['min_category']].sort_values(by='count', ascending=False)
        for cat in drop_categories:
            attrs.drop(cat, inplace=True)
        self.attr2index = {k: v + 1 for v, k in enumerate(attrs.index.to_list())}
        del attrs

    def _category_vectorizer(self, x):
        '''
        Label encode the categories of any business x into a list of indices. The mapping is given by the dictionary attr2index {category: index}.
        '''
        if(isinstance(x, str)):
            spt = re.split(', ', x)
            return list(map(lambda x: self.attr2index[x] if x in self.attr2index else 0, spt))
        else: return []

    def _get_category_matrix(self, df):
        attrs_matrix = [self._category_vectorizer(x) for x in df['categories'].values]
        attrs_max_len = max(len(x) for x in attrs_matrix)
        attrs_matrix = pad_sequences(attrs_matrix, maxlen=attrs_max_len, padding='post',)

        print("Matrix takes {:5.2f} MB".format(attrs_matrix.nbytes/1024./1024.))
        return attrs_matrix, attrs_max_len

    def _build_model(self):
        to_drop = config.Keywords_Categories[self.params['category']]
        self._build_category_dict(drop_categories=to_drop)
        attrs_matrix, attrs_max_len = self._get_category_matrix(self.data)
        
        vars_fixlen = [SparseFeat(var, self.data[var].nunique(),
                                  embedding_dim=4)
                       for var in self.features_sparse]
        vars_fixlen += [DenseFeat(var, 1,) for var in self.features_dense]
        vars_varlen = [VarLenSparseFeat(SparseFeat('categories',
                        vocabulary_size=len(self.attr2index) + 1,
                        embedding_dim=4),
                        maxlen=attrs_max_len, combiner='mean',
                        weight_name='attrs_weight' if self.params['weight'] else None)]

        self.features_linear = vars_fixlen + vars_varlen
        self.features_dnn = vars_fixlen + vars_varlen

        self.model = DeepFM(self.features_linear, self.features_dnn,
                            task='regression', **self.params_deepfm)
        return attrs_matrix, attrs_max_len

    def get_feature_names(self):
        return get_feature_names(self.features_linear + self.features_dnn)

    def _set_params_deepfm(self):
        for k, v in self.params.items():
            spt = k.split('__')
            if(len(spt) > 1): self.params_deepfm[spt[1]] = v

    def update_params(self, recompile=True, **kwargs):
        '''
        Update parameters for the recommender and re-compile the DeepFM model unless recompile is set to False.

        Example
        -------
        deepnn.update_params(epochs=20, deepfm__l2_reg_linear=2e-4)
        '''
        for (k, v) in kwargs.items():
            if(k in self.params):
                self.params[k] = v
            else:
                raise ValueError('{0} is not a valid parameter for RecommenderDeepNN.'.format(k))
        self._set_params_deepfm()
        if(recompile == True and self.model is not None):
            self.model = DeepFM(self.features_linear, self.features_dnn,
                                task='regression', **self.params_deepfm)

    def fit(self, path_business=None, path_ratings=None):
        if(self.data is None):
            self.load_data(path_business, path_ratings)

        model_input = self._get_model_input(self.data)

        self.model.compile(self.params['optimizer'],
                           self.params['loss'],
                           metrics=[self.params['loss']],)
        self.model.fit(model_input, self.data['rating'].values,
                       batch_size=self.params['batch_size'],
                       epochs=self.params['epochs'], 
                       validation_split=1-self.params['train_size'],
                       verbose=2)

    def _get_model_input(self, df):
        if(self.model is None):
            attrs_matrix, attrs_max_len = self._build_model()
        else:
            attrs_matrix, attrs_max_len = self._get_category_matrix(df)

        features = self.get_feature_names()

        model_input = {name: df[name] for name in features}
        model_input['categories'] = attrs_matrix
        if(self.params['weight']):
            model_input['attrs_weight'] = np.random.randn(df.shape[0], attrs_max_len, 1)
        return model_input

    def predictAllItemsForUser(self, uid):
        '''
        Returns predicted ratings of all businesses for any user (uid)
        '''
        df = self.data.drop_duplicates('business_id').drop('user_id', axis=1)
        df['user_id'] = uid

        model_input = self._get_model_input(df)
        pred = self.model.predict(model_input, 
                                  batch_size=self.params['batch_size'])
        return pd.DataFrame(pred,index=df['business_id'],columns=['pred'])

    def topN(self, uid, n=5):
        inner_uid = self.lbe_user.transform([uid])[0]
        pred = self.predictAllItemsForUser(inner_uid)
        topn = pred.nlargest(n, columns='pred')
        top_n_iid = self.lbe_item.inverse_transform(topn.index)
        predictions = topn['pred'].to_list()
        n_reviews = self.data['user_id'].value_counts()[inner_uid]
        print()
        print("UserID: {0},  Rated: {1}".format(uid, n_reviews))
        print("--------------------------------")
        topN_business = self.business.loc[top_n_iid]
        for i, (_, business) in enumerate(topN_business.iterrows()):
            print(business['name'])
            print(business['categories'])
            print("Pred: %4.2f  Avg: %3.1f out of %d reviews\n" % \
                  (predictions[i], business['stars'], business['review_count']))
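
RecommenderDeepNN reads its defaults from a module-level params_deepnn dict that is not shown; the class docstring implies contents like the following (a sketch reconstructed from the documented defaults):

# Reconstructed from the RecommenderDeepNN docstring; not in the original file.
params_deepnn = {
    'category': 'restaurants', 'min_review': 5, 'min_category': 50,
    'weight': False, 'scaler': 'minmax', 'optimizer': 'adam', 'loss': 'mse',
    'batch_size': 256, 'epochs': 10, 'train_size': 0.8,
    'deepfm__dnn_hidden_units': (128, 128), 'deepfm__l2_reg_linear': 1e-05,
    'deepfm__l2_reg_embedding': 1e-05, 'deepfm__l2_reg_dnn': 0,
    'deepfm__seed': 1024, 'deepfm__dnn_dropout': 0,
    'deepfm__dnn_activation': 'relu',
}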
Example #14
class DeepModel:
    def __init__(self, model_name, model_architecture="DeepFM"):
        self.model_name = model_name
        self.model_architecture = model_architecture

        self.model = None
        self.history = None
        self.data = None
        self.callbacks = []

    # requires tf2
    # def set_notebook_mode(self):
    #    progress_bar_cb = tfa.callbacks.TQDMProgressBar() #TQDMNotebookCallback(leave_inner=True, leave_outer=True)
    #    self.callbacks.append(progress_bar_cb)

    def prepare_data(self,
                     data_source,
                     sparse_features,
                     target,
                     test_size=0.1):
        self.data = Data(sparse_features,
                         target,
                         data_format="deepctr",
                         test_size=test_size)
        self.data.ingest(data_source)
        self.data.prepare()

    def build(self, task):
        assert task in ['regression', 'binary']
        if self.model_architecture == "DeepFM":
            self.model = DeepFM(
                self.data.linear_feature_columns,
                self.data.dnn_feature_columns,
                task=task,
            )
        else:
            raise NotImplementedError(
                'At the current stage of the development, only a DeepFM is supported'
            )

        # Map the task to its loss and metric
        task_attr = {
            'regression': {
                'loss': 'mse',
                'metrics': 'mse'
            },
            'binary': {
                'loss': 'binary_crossentropy',
                'metrics': 'accuracy'
            }
        }

        self.model.compile(optimizer="adam",
                           loss=task_attr[task]['loss'],
                           metrics=[task_attr[task]['metrics']])

    def train(self, batch_size=256, epochs=10, validation_split=0.1):
        #class_weights = class_weight.compute_class_weight(
        #    "balanced", np.unique(self.data.y_train[:, 0]), self.data.y_train[:, 0]
        #)
        self.history = self.model.fit(
            self.data.X_train,
            self.data.y_train,
            batch_size=batch_size,
            epochs=epochs,
            validation_split=validation_split,
            verbose=2,
            #class_weight=class_weights,
            callbacks=self.callbacks,
        )

    def evaluate(self):
        self.model.evaluate(self.data.X_test,
                            self.data.y_test,
                            batch_size=4096)

    def prepare_input(self, df):
        df = df.copy()
        for feat in self.data.sparse_features:
            lbe = self.data.encoders[feat]
            df[feat] = lbe.transform(df[feat])

        X = {name: df[name].values for name in self.data.feature_names}
        return X

    def predict(self, X, batch_size=256):
        return self.model.predict(X, batch_size=batch_size)
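
A usage sketch for DeepModel, following the call order the class expects; the data source and column names are hypothetical:

# Hypothetical usage; 'ratings.csv' and the column names are placeholders.
dm = DeepModel(model_name="demo", model_architecture="DeepFM")
dm.prepare_data("ratings.csv",
                sparse_features=["user_id", "item_id"],
                target=["rating"])
dm.build(task="regression")
dm.train(batch_size=256, epochs=10)
dm.evaluate()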
Example #15
hist = model.fit(train_and_val_model_input,
                 y_train_val,
                 batch_size=BATCH_SIZE,
                 epochs=EPOCHS,
                 verbose=2,
                 validation_split=0.1,
                 callbacks=get_callbacks())

# In[ ]:

best_epoch = np.argmax(hist.history["val_auroc"]) + 1
model.load_weights('checkpoint/epoch_{:02d}.hdf5'.format(best_epoch))
print(hist.history["val_auroc"])
print('loading epoch_{:02d}.hdf5'.format(best_epoch))

pred_ans = model.predict(test_model_input, verbose=1, batch_size=BATCH_SIZE)

pred_ans = pred_ans.flatten()
ans = pd.DataFrame(
    data={
        'id': np.array([i for i in range(1, pred_ans.shape[0] + 1)]),
        'probability': pred_ans
    })
ans.to_csv('submission_DeepFM_countFeature.csv', index=False, header=True)

# del model
# gc.collect()
# # In[ ]:

# # EPOCHS = np.argmax(hist.history["val_auroc"])+1
Example #16
def test_DFM_avazu(data, train, test):
    print("\nTesting DFM on avazu dataset...\n")

    results_activation_function = {"auc": [], "logloss": [], "rmse": []}
    results_dropout = {"auc": [], "logloss": [], "rmse": []}
    results_number_of_neurons = {"auc": [], "logloss": [], "rmse": []}

    auc = 0
    logloss = 0
    rmse = 0

    features_labels = train.columns

    sparse_features_labels = features_labels[1:23]
    target_label = features_labels[0]

    dnn_feature_columns = [
        SparseFeat(
            feat,
            vocabulary_size=data[feat].nunique(),
            embedding_dim=4,
        ) for feat in sparse_features_labels
    ]
    linear_feature_columns = [
        SparseFeat(
            feat,
            vocabulary_size=data[feat].nunique(),
            embedding_dim=4,
        ) for feat in sparse_features_labels
    ]

    feature_names = get_feature_names(linear_feature_columns +
                                      dnn_feature_columns)

    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}

    true_y = test[target_label].values

    print("\t\t-- ACTIVATION FUNCTIONS --\t\t")
    for dnn_activation in dnn_activation_list:
        print("\nTesting {dnn_activation}...".format(
            dnn_activation=dnn_activation))

        model = DeepFM(linear_feature_columns,
                       dnn_feature_columns,
                       dnn_activation=dnn_activation,
                       task='binary')
        model.compile(
            "adam",
            "binary_crossentropy",
            metrics=['binary_crossentropy'],
        )
        model.fit(
            train_model_input,
            train[target_label].values,
            batch_size=256,
            epochs=10,
            verbose=0,
            validation_split=TEST_PROPORTION,
        )
        pred_y = model.predict(test_model_input, batch_size=256)

        auc = compute_auc(true_y, pred_y)
        logloss = compute_log_loss(true_y, pred_y)
        rmse = compute_rmse(true_y, pred_y)

        results_activation_function["auc"].append(auc)
        results_activation_function["logloss"].append(logloss)
        results_activation_function["rmse"].append(rmse)

    print("\t\t-- DROPOUT RATES --\t\t")
    for dnn_dropout in dnn_dropout_list:
        print("\nTesting {dnn_dropout}...".format(dnn_dropout=dnn_dropout))

        model = DeepFM(linear_feature_columns,
                       dnn_feature_columns,
                       dnn_dropout=dnn_dropout,
                       task='binary')
        model.compile(
            "adam",
            "binary_crossentropy",
            metrics=['binary_crossentropy'],
        )
        model.fit(
            train_model_input,
            train[target_label].values,
            batch_size=256,
            epochs=10,
            verbose=0,
            validation_split=TEST_PROPORTION,
        )
        pred_y = model.predict(test_model_input, batch_size=256)

        auc = compute_auc(true_y, pred_y)
        logloss = compute_log_loss(true_y, pred_y)
        rmse = compute_rmse(true_y, pred_y)

        results_dropout["auc"].append(auc)
        results_dropout["logloss"].append(logloss)
        results_dropout["rmse"].append(rmse)

    print("\t\t-- HIDDEN UNITS --\t\t")
    for dnn_hidden_units in dnn_hidden_units_list:
        print("\nTesting {dnn_hidden_units}...".format(
            dnn_hidden_units=dnn_hidden_units))

        model = DeepFM(linear_feature_columns,
                       dnn_feature_columns,
                       dnn_hidden_units=dnn_hidden_units,
                       task='binary')
        model.compile(
            "adam",
            "binary_crossentropy",
            metrics=['binary_crossentropy'],
        )
        model.fit(
            train_model_input,
            train[target_label].values,
            batch_size=256,
            epochs=10,
            verbose=0,
            validation_split=TEST_PROPORTION,
        )
        pred_y = model.predict(test_model_input, batch_size=256)

        auc = compute_auc(true_y, pred_y)
        logloss = compute_log_loss(true_y, pred_y)
        rmse = compute_rmse(true_y, pred_y)

        results_number_of_neurons["auc"].append(auc)
        results_number_of_neurons["logloss"].append(logloss)
        results_number_of_neurons["rmse"].append(rmse)

    if PLOT:
        create_plots("DFM", "avazu", results_activation_function,
                     "Activation Function", "activation_func",
                     dnn_activation_list)
        create_plots("DFM", "avazu", results_dropout, "Dropout Rate",
                     "dropout", dnn_dropout_list)
        create_plots("DFM", "avazu", results_number_of_neurons,
                     "Number of Neurons per layer", "nr_neurons",
                     dnn_hidden_units_list)
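
The sweep grids dnn_activation_list, dnn_dropout_list, and dnn_hidden_units_list, plus TEST_PROPORTION and PLOT, are module-level globals not shown here. Plausible definitions, chosen only to make the sweep concrete; all values are assumptions:

# Assumed sweep configuration; the originals are defined elsewhere in the module.
dnn_activation_list = ['relu', 'sigmoid', 'tanh']
dnn_dropout_list = [0.0, 0.25, 0.5]
dnn_hidden_units_list = [(128,), (128, 128), (256, 128, 64)]
TEST_PROPORTION = 0.2
PLOT = True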
Example #17
def deepctr_cv(X_train,
               y_train,
               folds,
               logger,
               cv_path,
               X_test=None,
               optional_data=None,
               prep=True,
               split_conf=None):

    scores = []
    preds = []

    meta = np.zeros_like(y_train).astype("float64")
    if split_conf is None:
        X_tr, X_te, main_conf, _ = prep_for_embedding(X_train,
                                                      X_test,
                                                      conf,
                                                      prep=prep)
        X_train, X_test = X_tr, X_te
    else:
        main_conf = split_conf

    cat_cols = [c for c, _, _ in main_conf[0]]
    cat_fs = [SingleFeat(c, d) for c, d, _ in main_conf[0]]
    num_fs = [SingleFeat(c, 0) for c in conf.num_cols]

    X_test = split_df(X_test, cat_cols, conf.num_cols)

    for num_fold, (tr_ind, tes_ind) in enumerate(folds):
        if num_fold > 0:
            break
        logger.info(f"fold_{num_fold}")

        fold_path = cv_path / f"fold{num_fold}"
        seed_path = fold_path
        Path(fold_path).mkdir(exist_ok=True, parents=True)

        callbacks = [CSVLogger(str(fold_path / 'epochs.csv'))]

        X_cv_train, X_cv_test = X_train.iloc[tr_ind], X_train.iloc[tes_ind]
        y_cv_train, y_cv_test = y_train.iloc[tr_ind], y_train.iloc[tes_ind]
        X_cv_train = split_df(X_cv_train, cat_cols, conf.num_cols)
        X_cv_test = split_df(X_cv_test, cat_cols, conf.num_cols)

        model = DeepFM({
            'sparse': cat_fs,
            'dense': num_fs
        },
                       final_activation='sigmoid')
        model.compile("adam", "binary_crossentropy", metrics=['accuracy'])
        model.fit(X_cv_train,
                  y_cv_train,
                  callbacks=callbacks,
                  batch_size=2048,
                  epochs=10,
                  verbose=1,
                  validation_data=(X_cv_test, y_cv_test))
        model.save_weights(str(seed_path / 'weights.h5'), save_format='hdf5')
        gc.collect()

        if X_test is not None:
            pred = model.predict(X_test, batch_size=2048)
            pred = pred[:, 0]
            np.save(seed_path / f"pred.npy", pred)

        train_oof = model.predict(X_cv_test, batch_size=2048)
        train_oof = train_oof[:, 0]
        auc = roc_auc_score(y_cv_test.values, train_oof)
        logger.info(f"{num_fold}: auc {auc}")
        np.save(seed_path / f"train_oof.npy", train_oof)

        # auc = roc_auc_score(y_cv_test, train_oof)
        # logger.info(f"seed_average: auc {auc}")
        scores.append(auc)
        np.save(fold_path / f"tes_ind.npy", tes_ind)
        meta[tes_ind] += train_oof
        del X_cv_train, y_cv_train, X_cv_test, y_cv_test

        if X_test is not None:
            preds.append(pred)

    scores = np.array(scores)
    preds = np.array(preds)
    pred = rank_average(preds)
    logger.info(f"{scores.mean()}, {scores.std()}")
    return scores, pred, meta
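
rank_average() blends the per-fold test predictions; a common implementation averages each sample's rank across models and rescales to [0, 1]. A sketch of the assumed helper:

# Assumed rank-average ensembling helper; the original is defined elsewhere.
import numpy as np
from scipy.stats import rankdata

def rank_average(preds):
    # preds: array of shape (n_models, n_samples)
    ranks = np.vstack([rankdata(p) for p in preds])
    avg = ranks.mean(axis=0)
    return (avg - avg.min()) / (avg.max() - avg.min())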