示例#1
0
                       sep=",",
                       header=0)
    # Data preparation for a feature-based ranking model: label, split,
    # declare column roles, build datasets, then sample negatives.

    # convert to implicit data and do negative sampling afterwards
    # (every row gets label 1, i.e. all visible records count as positives)
    data["label"] = 1

    # split into train and test data based on time
    # (test_size=0.2 is forwarded to the helper; presumably the test fraction)
    train_data, test_data = split_by_ratio_chrono(data, test_size=0.2)

    # specify complete columns information
    # Each feature column is listed twice: once by value type
    # (sparse_col / dense_col) and once by owner (user_col / item_col).
    sparse_col = ["sex", "occupation", "genre1", "genre2", "genre3"]
    dense_col = ["age"]
    user_col = ["sex", "age", "occupation"]
    item_col = ["genre1", "genre2", "genre3"]

    # build_trainset returns two values: the training dataset and a
    # data_info object that is reused by every later step in this snippet.
    train_data, data_info = DatasetFeat.build_trainset(train_data, user_col,
                                                       item_col, sparse_col,
                                                       dense_col)
    # NOTE(review): this call passes sparse_col/dense_col to build_testset,
    # which depends on the library version in use -- confirm the signature.
    test_data = DatasetFeat.build_testset(test_data, sparse_col, dense_col)

    # sample negative items for each record
    # (needed because the data above contains only label-1 rows)
    train_data.build_negative_samples(data_info)
    test_data.build_negative_samples(data_info)
    print(data_info)  # n_users: 5962, n_items: 3226, data sparsity: 0.4185 %
    ytb_ranking = YouTubeRanking(task="ranking",
                                 data_info=data_info,
                                 embed_size=16,
                                 n_epochs=3,
                                 lr=1e-4,
                                 batch_size=512,
                                 use_bn=True,
    # Data preparation for a DeepFM ranking example: load the merged
    # MovieLens sample, split it, build datasets and sample negatives.

    # NOTE(review): col_names is not passed to read_csv below (header=0 takes
    # the column names from the file's first row), so it looks unused in the
    # visible lines -- confirm against the rest of the script before removing.
    col_names = [
        "user", "item", "label", "time", "sex", "age", "occupation", "genre1",
        "genre2", "genre3"
    ]
    data = pd.read_csv("sample_data/sample_movielens_merged.csv",
                       sep=",",
                       header=0)
    # presumably a chronological split, per the helper's name -- confirm;
    # test_size=0.2 is forwarded as-is
    train, test = split_by_ratio_chrono(data, test_size=0.2)

    # Each feature column is listed by value type (sparse/dense) and by
    # owner (user/item).
    sparse_col = ["sex", "occupation", "genre1", "genre2", "genre3"]
    dense_col = ["age"]
    user_col = ["sex", "age", "occupation"]
    item_col = ["genre1", "genre2", "genre3"]
    # build_trainset returns the training dataset together with data_info,
    # which the negative-sampling calls below take as input.
    # shuffle=False is passed to both builders.
    train_data, data_info = DatasetFeat.build_trainset(train,
                                                       user_col,
                                                       item_col,
                                                       sparse_col,
                                                       dense_col,
                                                       shuffle=False)
    test_data = DatasetFeat.build_testset(test, shuffle=False)
    print(data_info)
    # Negative sampling with num_neg=1 (presumably one sampled negative per
    # positive record -- confirm). Train and test use different fixed seeds.
    train_data.build_negative_samples(data_info,
                                      num_neg=1,
                                      item_gen_mode="random",
                                      seed=2020)
    test_data.build_negative_samples(data_info,
                                     num_neg=1,
                                     item_gen_mode="random",
                                     seed=2222)

    deepfm = DeepFM("ranking",
                    data_info,
                       sep=",",
                       header=0)
    # Data preparation using a multi-sparse feature group: split, declare
    # column roles, build datasets, then sample negatives.

    # presumably a chronological split, per the helper's name -- confirm;
    # test_size=0.2 is forwarded as-is
    train_data, eval_data = split_by_ratio_chrono(data, test_size=0.2)

    # specify complete columns information
    # The genre columns are not listed in sparse_col here; they are grouped
    # into a single entry of multi_sparse_col instead.
    sparse_col = ["sex", "occupation"]
    multi_sparse_col = [["genre1", "genre2",
                         "genre3"]]  # should be list of list
    dense_col = ["age"]
    user_col = ["sex", "age", "occupation"]
    item_col = ["genre1", "genre2", "genre3"]

    # build_trainset returns the training dataset together with data_info,
    # which the negative-sampling calls below take as input.
    train_data, data_info = DatasetFeat.build_trainset(
        train_data=train_data,
        user_col=user_col,
        item_col=item_col,
        sparse_col=sparse_col,
        dense_col=dense_col,
        multi_sparse_col=multi_sparse_col,
    )
    eval_data = DatasetFeat.build_testset(eval_data)
    print(data_info)
    # do negative sampling, assume the data only contains positive feedback
    # (train and eval use different fixed seeds)
    train_data.build_negative_samples(data_info,
                                      item_gen_mode="random",
                                      num_neg=1,
                                      seed=2020)
    eval_data.build_negative_samples(data_info,
                                     item_gen_mode="random",
                                     num_neg=1,
                                     seed=2222)
if __name__ == "__main__":
    # Script entry point: prepare the first half of the sample data and
    # construct a DeepFM ranking model on it.

    # NOTE(review): col_names is not passed to read_csv below (header=0 takes
    # the column names from the file's first row), so it looks unused in the
    # visible lines -- confirm against the rest of the script before removing.
    col_names = ["user", "item", "label", "time", "sex",
                 "age", "occupation", "genre1", "genre2", "genre3"]
    all_data = pd.read_csv("sample_data/sample_movielens_merged.csv",
                       sep=",", header=0)

    # use first half data as first training part
    # (positional slice: the first len(all_data) // 2 rows in file order)
    first_half_data = all_data[:(len(all_data) // 2)]
    train, test = split_by_ratio_chrono(first_half_data, test_size=0.2)

    # Each feature column is listed by value type (sparse/dense) and by
    # owner (user/item).
    sparse_col = ["sex", "occupation", "genre1", "genre2", "genre3"]
    dense_col = ["age"]
    user_col = ["sex", "age", "occupation"]
    item_col = ["genre1", "genre2", "genre3"]
    # build_trainset returns the training dataset together with data_info,
    # which is reused by negative sampling and by the model constructor.
    train_data, data_info = DatasetFeat.build_trainset(train, user_col, item_col,
                                                       sparse_col, dense_col,
                                                       shuffle=False)
    test_data = DatasetFeat.build_testset(test, shuffle=False)
    print(data_info)
    # Negative sampling with different fixed seeds for train and test.
    train_data.build_negative_samples(data_info, num_neg=1,
                                      item_gen_mode="random", seed=2020)
    test_data.build_negative_samples(data_info, num_neg=1,
                                     item_gen_mode="random", seed=2222)

    # Model configuration; hidden_units is given as a comma-separated string.
    # num_neg=1 matches the value used for negative sampling above.
    deepfm = DeepFM("ranking", data_info, embed_size=16, n_epochs=2,
                    lr=1e-4, lr_decay=False, reg=None, batch_size=2048,
                    num_neg=1, use_bn=False, dropout_rate=None,
                    hidden_units="128,64,32", tf_sess_config=None)
    deepfm.fit(train_data, verbose=2, shuffle=True, eval_data=test_data,
               metrics=["loss", "balanced_accuracy", "roc_auc", "pr_auc",
                        "precision", "recall", "map", "ndcg"],
示例#5
0
    # Data preparation using a multi-sparse feature group with an explicit
    # padding value: load, split, build datasets, then sample negatives.

    # header=0: column names come from the first row of the CSV file
    data = pd.read_csv("sample_data/sample_movielens_merged.csv",
                       sep=",", header=0)
    # presumably a chronological split, per the helper's name -- confirm;
    # test_size=0.2 is forwarded as-is
    train_data, eval_data = split_by_ratio_chrono(data, test_size=0.2)

    # specify complete columns information
    # The genre columns are not listed in sparse_col here; they are grouped
    # into a single entry of multi_sparse_col instead.
    sparse_col = ["sex", "occupation"]
    multi_sparse_col = [["genre1", "genre2", "genre3"]]  # should be list of list
    dense_col = ["age"]
    user_col = ["sex", "age", "occupation"]
    item_col = ["genre1", "genre2", "genre3"]

    # build_trainset returns the training dataset together with data_info,
    # which the negative-sampling calls below take as input.
    train_data, data_info = DatasetFeat.build_trainset(
        train_data=train_data,
        user_col=user_col,
        item_col=item_col,
        sparse_col=sparse_col,
        dense_col=dense_col,
        multi_sparse_col=multi_sparse_col,
        # pad_val is a one-element list here, alongside the single
        # multi-sparse group above (presumably one value per group -- confirm)
        pad_val=["missing"]  # specify padding value
    )
    eval_data = DatasetFeat.build_testset(eval_data)
    print(data_info)
    # do negative sampling, assume the data only contains positive feedback
    # (train and eval use different fixed seeds)
    train_data.build_negative_samples(data_info, item_gen_mode="random",
                                      num_neg=1, seed=2020)
    eval_data.build_negative_samples(data_info, item_gen_mode="random",
                                     num_neg=1, seed=2222)

    deepfm = DeepFM("ranking", data_info, embed_size=16, n_epochs=2,
                    lr=1e-4, lr_decay=False, reg=None, batch_size=2048,
                    num_neg=1, use_bn=False, dropout_rate=None,