Example #1
 def fit_test(self, train_X, train_Y, val_X, val_Y, test_X, test_Y,
              cat_cols):
     sparse_features = cat_cols
     dense_features = [
         idx for idx in range(train_X.shape[1]) if idx not in cat_cols
     ]
     sparse_feature_columns = [
         SparseFeat(str(feat),
                    vocabulary_size=len(set(train_X[:, feat])) + 1,
                    embedding_dim=4)
         for feat in sparse_features
     ]
     dense_feature_columns = [
         DenseFeat(
             str(feat),
             1,
         ) for feat in dense_features
     ]
     dnn_feature_columns = sparse_feature_columns + dense_feature_columns
     linear_feature_columns = sparse_feature_columns + dense_feature_columns
     feature_names = get_feature_names(linear_feature_columns +
                                       dnn_feature_columns)
     train_model_input = {
         name: train_X[:, int(name)]
         for name in feature_names
     }
     val_model_input = {name: val_X[:, int(name)] for name in feature_names}
     test_model_input = {
         name: test_X[:, int(name)]
         for name in feature_names
     }
     self.device = 'cpu'
     use_cuda = True
     if use_cuda and torch.cuda.is_available():
         print('cuda ready...')
         self.device = 'cuda:0'
     self.model = xDeepFM(linear_feature_columns,
                          dnn_feature_columns,
                          task='binary',
                          device=self.device)
     self.model.compile(
         Adam(self.model.parameters(), 0.0001),
         "binary_crossentropy",
         metrics=['binary_crossentropy'],
     )
     es = EarlyStopping(monitor='val_binary_crossentropy',
                        min_delta=0,
                        verbose=1,
                        patience=30,
                        mode='min')
     lbe = LabelEncoder()
     self.model.fit(train_model_input,
                    lbe.fit_transform(train_Y),
                    batch_size=512,
                    epochs=21,
                    verbose=2,
                    validation_data=(val_model_input, lbe.transform(val_Y)),
                    callbacks=[es])  # early stopping on val_binary_crossentropy
     pred_ans = self.model.predict(test_model_input, batch_size=256)
     print(f'{log_loss(test_Y, pred_ans):.5f}')
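The snippet above omits its import block; a likely reconstruction, assuming DeepCTR-Torch and scikit-learn:

# Imports the example above relies on (reconstructed; not shown in the original snippet).
import torch
from torch.optim import Adam
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import log_loss
from deepctr_torch.inputs import SparseFeat, DenseFeat, get_feature_names
from deepctr_torch.models import xDeepFM
from deepctr_torch.callbacks import EarlyStopping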
Example #2
def test_xDeepFM(dnn_hidden_units, cin_layer_size, cin_split_half,
                 cin_activation, sparse_feature_num, dense_feature_dim):

    model_name = 'xDeepFM'

    sample_size = SAMPLE_SIZE
    x, y, feature_columns = get_test_data(
        sample_size, sparse_feature_num, sparse_feature_num)
    model = xDeepFM(feature_columns, feature_columns, dnn_hidden_units=dnn_hidden_units, cin_layer_size=cin_layer_size,
                    cin_split_half=cin_split_half, cin_activation=cin_activation, dnn_dropout=0.5)
    check_model(model, model_name, x, y)
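In the DeepCTR-Torch test suite a test like this is driven by a pytest parametrize decorator placed directly above the function; the grid below is illustrative, not the project's actual values, and SAMPLE_SIZE, get_test_data and check_model are assumed to come from the shared test utilities:

# Illustrative pytest parametrization (the value grid is an assumption, not the
# project's actual test matrix).
import pytest

@pytest.mark.parametrize(
    'dnn_hidden_units,cin_layer_size,cin_split_half,cin_activation,'
    'sparse_feature_num,dense_feature_dim',
    [
        ((256, 256), (128, 128), False, 'relu', 3, 3),
        ((), (128,), True, 'linear', 2, 2),
    ],
)
def test_xDeepFM(dnn_hidden_units, cin_layer_size, cin_split_half,
                 cin_activation, sparse_feature_num, dense_feature_dim):
    ...  # body as defined above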
def test_recommend_movies(pretrained, inputs, linear_feature_columns,
                          dnn_feature_columns, DEVICE):
    model = xDeepFM(linear_feature_columns,
                    dnn_feature_columns,
                    task='regression',
                    device=DEVICE)
    model.load_state_dict(torch.load(pretrained))
    pred_ans = model.predict(inputs, batch_size=256)

    #     print(f'Predict rating: {pred_ans}')

    pred_movie_list = []
    idx = np.argsort(pred_ans, axis=0)[::-1]  # row indices sorted by predicted rating, descending
    for i, ans_idx in enumerate(idx[:, 0]):
        if i < 5:
            print(
                f"Top {i + 1} predict rating: {pred_ans[ans_idx][0]:.3f}, movie_id: {data.iloc[ans_idx]['movie_id']}, gender: {data.iloc[ans_idx]['gender']}, age: {data.iloc[ans_idx]['age']}, user_id: {data.iloc[ans_idx]['user_id']}"
            )
            pred_movie_list.append(data.iloc[ans_idx]['movie_id'])
            # print(inputs['movie_id'].iloc[i])

    # print('movie_max',data.loc[:, ['movie_id']].max(axis=0)) # 1682

    return pred_movie_list
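A sketch of how this function might be called after training; the checkpoint filename is illustrative, and the label-encoded DataFrame `data` and the feature-column lists are assumed to be available at module scope from train_recommend_movies (defined next):

# Hypothetical invocation (checkpoint path is illustrative, not from the original).
inputs = {name: data[name] for name in ["movie_id", "gender", "age"]}
top_movies = test_recommend_movies(pretrained='save_model/xDeepFM_MSE0.8764.h5',
                                   inputs=inputs,
                                   linear_feature_columns=linear_feature_columns,
                                   dnn_feature_columns=dnn_feature_columns,
                                   DEVICE='cuda:0')
print(top_movies)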
def train_recommend_movies(csv_file, DEVICE):
    """
        Description:
            Train recommend system on: 
                Model: "xDeepFM", 
                Target: "rating",
                Input features: ["movie_id", "gender", "age"],
                Save model to: "save_model/xDeepFM_MSE{}.h5"

        Parameters: 
            csv_file: "path to *.csv"
            DEVICE: "cuda:0"
    """
    data = pd.read_csv(csv_file)
    sparse_features = ["movie_id", "gender", "age"]
    target = ['rating']

    # 1. Label-encode the sparse features (dense features, if any, would get a simple transformation)
    for feat in sparse_features:
        lbe = LabelEncoder()
        data[feat] = lbe.fit_transform(data[feat])

    # 2. Count the number of unique values for each sparse field
    fixlen_feature_columns = [
        SparseFeat(feat, data[feat].nunique()) for feat in sparse_features
    ]

    # The library's own feature descriptor, e.g.:
    # SparseFeat(name='movie_id', vocabulary_size=187, embedding_dim=4, use_hash=False, dtype='int32', embedding_name='movie_id', group_name='default_group')
    linear_feature_columns = fixlen_feature_columns
    dnn_feature_columns = fixlen_feature_columns
    feature_names = get_feature_names(
        linear_feature_columns + dnn_feature_columns
    )  # ['movie_id', 'gender', 'age']

    # 3. Generate the input data for the model
    train, test = train_test_split(data, test_size=0.2)
    train_model_input = {name: train[name] for name in feature_names}
    # dicts keyed by feature name: movie_id, gender, age
    test_model_input = {name: test[name] for name in feature_names}

    # 4. Define the model, then train, predict and evaluate
    device = 'cpu'
    use_cuda = True
    if use_cuda and torch.cuda.is_available():
        print('cuda ready...')
        device = DEVICE

    # model = DeepFM(linear_feature_columns, dnn_feature_columns, task='regression', device=device)
    # model = FiBiNET(linear_feature_columns, dnn_feature_columns, task='regression', device=device)
    model = xDeepFM(linear_feature_columns,
                    dnn_feature_columns,
                    task='regression',
                    device=device)
    model.compile(
        "adam",
        "mse",
        metrics=['mse'],
    )

    history = model.fit(
        train_model_input,
        train[target].values,
        batch_size=256,
        epochs=10,
        verbose=2,
        validation_split=0.2,
    )
    pred_ans = model.predict(test_model_input, batch_size=256)

    print("test MSE",
          round(mean_squared_error(test[target].values, pred_ans), 4))

    torch.save(
        model.state_dict(), 'save_model/xDeepFM_MSE{}.h5'.format(
            round(mean_squared_error(test[target].values, pred_ans), 4)))
    train, test = train_test_split(data, test_size=0.2)

    train_model_input = {name: train[name] for name in feature_names}
    test_model_input = {name: test[name] for name in feature_names}

    # 4. Define the model, then train, predict and evaluate

    device = 'cpu'
    use_cuda = True
    if use_cuda and torch.cuda.is_available():
        print('cuda ready...')
        device = 'cuda:0'

    model = xDeepFM(linear_feature_columns=linear_feature_columns,
                    dnn_feature_columns=dnn_feature_columns,
                    dnn_hidden_units=(16, ),  # NOTE: the original 'mlet_dim' is not an xDeepFM argument; a small DNN tower is assumed here
                    task='binary',
                    l2_reg_embedding=1e-5,
                    device=device)

    model.compile(
        "adagrad",
        "binary_crossentropy",
        metrics=["binary_crossentropy", "auc"],
    )
    model.fit(train_model_input,
              train[target].values,
              batch_size=32,
              epochs=10,
              validation_split=0.0,
              verbose=2)
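The binary model above is never scored; a natural follow-up on the held-out split would be the sketch below, which assumes the target column has been binarized (the snippet reuses the raw 'rating' values) and that log_loss and roc_auc_score are imported from sklearn.metrics:

    # Sketch (assumption): evaluate on the held-out split; requires 0/1 labels
    # and sklearn.metrics.log_loss / roc_auc_score.
    pred_ans = model.predict(test_model_input, batch_size=256)
    print("test LogLoss", round(log_loss(test[target].values, pred_ans), 4))
    print("test AUC", round(roc_auc_score(test[target].values, pred_ans), 4))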
Example #6
# X_train, `train`, `feature_name` and the feature-column lists are defined
# earlier in the (elided) script.
X_valid = model_feed_dict(valid[feature_name])
X_test = model_feed_dict(test[feature_name])

Y = train['label'].values
valid_Y = valid['label'].values

torch.cuda.empty_cache()

device = 'cpu'
use_cuda = True
if use_cuda and torch.cuda.is_available():
    print('cuda ready...')
    device = 'cuda:0'

# torch.autograd.set_detect_anomaly(True)

model = xDeepFM(linear_feature_columns, dnn_feature_columns, device=device)
model.compile("adam", 'binary_crossentropy', ["auc"])
model.fit(
    X_train,
    Y,
    batch_size=4096,
    epochs=1,
    validation_data=(X_valid, valid_Y),
    verbose=1,
)

model.fit(X_valid, valid_Y, batch_size=4096)  # fine-tune on the validation split before the final prediction
answer = model.predict(X_test, batch_size=8192)
submit = pd.DataFrame()
submit['id'] = test['id'].astype(int)
submit['probability'] = np.round(answer.flatten(), 6)
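The example stops before persisting the submission; a likely final step (the filename is an assumption):

# Write the submission file (filename is illustrative; not part of the original snippet).
submit.to_csv('submission.csv', index=False)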