예제 #1
0
def run_test_exp(model_name=None,
                 evaluator=None,
                 raw_data=None,
                 user_per=1.0,
                 keep_days=1,
                 l2_reg=0.001,
                 test_date=None,
                 outdir=None,
                 num_itr=1e4 + 1):
    """Train a model on filtered history and evaluate it on the test week.

    The ratings in ``raw_data`` are split at ``test_date``: everything
    strictly before it is training data, and the seven days starting at
    ``test_date`` form the test pool, from which one row per user is drawn
    at random (NumPy global seed 10). For a random subset of training
    users, rows older than ``keep_days`` days before ``test_date`` are
    dropped before training.

    Relies on module-level names that are not parameters: ``LOGGING``,
    ``batch_size`` and ``test_batch_size``.

    Args:
        model_name: One of ``'PMF'``, ``'CML'`` or ``'BPR'``.
        evaluator: ``'Recall'`` or ``'NDCG'``.
        raw_data: Array-like indexable by the fields ``"user_id"``,
            ``"item_id"`` and ``"timestamp"`` and by boolean masks.
            NOTE(review): timestamps are compared against
            ``datetime.date`` objects, so they are presumably dates --
            confirm against the loader.
        user_per: Fraction of training users whose history is truncated.
        keep_days: Number of days before ``test_date`` that truncated
            users keep.
        l2_reg: L2 regularisation weight passed to the model.
        test_date: Split date as a ``"%Y-%m-%d"`` string.
        outdir: String prefix for saved arrays (when ``LOGGING`` is truthy)
            and for the trainer's ``eval_save_prefix``. It is concatenated
            directly with the file names, so include any trailing
            separator.
        num_itr: Iteration count; coerced to ``int``. The trainer is run
            for ``num_itr + 1`` iterations with ``display_itr=num_itr``.

    Returns:
        None. Prints a message and returns early when ``model_name`` or
        ``evaluator`` is not recognised.
    """
    # The default is a float (1e4 + 1); iteration counts must be integers
    # by the time they reach the trainer, so normalise once here.
    num_itr = int(num_itr)

    # parse dataset into incremental training and testing set
    data = raw_data
    # NOTE(review): using the number of distinct ids as the id-space size
    # assumes ids are contiguous and zero-based -- confirm.
    max_user = len(np.unique(data["user_id"]))
    max_item = len(np.unique(data["item_id"]))
    print("max_user:{}, max_item:{}".format(max_user, max_item))

    test_date = datetime.datetime.strptime(test_date, "%Y-%m-%d").date()
    print("test date:%s" % test_date)
    train_data = data[data["timestamp"] < test_date]
    test_data = data[(data["timestamp"] >= test_date) & (
        data["timestamp"] < (test_date + datetime.timedelta(days=7)))]
    np.random.seed(10)
    # One randomly chosen test row per user that appears in the test week.
    test_data = np.asarray([
        np.random.choice(test_data[test_data["user_id"] == uid], 1)[0]
        for uid in np.unique(test_data["user_id"])
    ])

    # filter training data, for selected users keep only the latest n days of data
    print("filter user percentage:%f" % user_per)
    print("ratings before filter:%d" % len(train_data))
    user_list = np.unique(train_data["user_id"])
    filter_user = np.random.choice(user_list,
                                   int(len(user_list) * user_per),
                                   replace=False)
    # True for rows that get dropped: selected user AND older than the
    # keep_days window.
    filter_mask = (np.isin(train_data["user_id"], filter_user)) & (
        train_data["timestamp"] <
        (test_date - datetime.timedelta(days=keep_days)))

    # output filtered data and test data
    if LOGGING:
        np.save(outdir + "filtered_data.npy", train_data[filter_mask])
        np.save(outdir + "train_data.npy", train_data[~filter_mask])
        np.save(outdir + "test_data.npy", test_data)

    train_data = train_data[~filter_mask]
    print("ratings after filter:%d" % len(train_data))

    train_dataset = ImplicitDataset(train_data,
                                    max_user,
                                    max_item,
                                    name='Train')
    test_dataset = ImplicitDataset(test_data, max_user, max_item, name='Test')

    # Resolve the model/sampler pairing first. Names are looked up only in
    # the branch that is taken, exactly as in a plain if/elif chain.
    if model_name == 'PMF':
        model_cls, sampler_cls = PMF, PointwiseSampler
        sampler_extra = {'pos_ratio': 0.5}
    elif model_name == 'CML':
        model_cls, sampler_cls = CML, PairwiseSampler
        sampler_extra = {}
    elif model_name == 'BPR':
        model_cls, sampler_cls = BPR, PairwiseSampler
        sampler_extra = {}
    else:
        print("Wrong model assigned")
        return

    # Reject an unknown evaluator before the model and the multi-process
    # sampler are built, so nothing is constructed only to be thrown away.
    if evaluator not in ('Recall', 'NDCG'):
        print("Wrong evaluator assisgned")
        return

    num_process = 8
    dim_embed = 50
    model = model_cls(batch_size=batch_size,
                      max_user=train_dataset.max_user(),
                      max_item=train_dataset.max_item(),
                      dim_embed=dim_embed,
                      opt='Adam',
                      l2_reg=l2_reg)
    sampler = sampler_cls(batch_size=batch_size,
                          dataset=train_dataset,
                          num_process=num_process,
                          **sampler_extra)

    cutoffs = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    if evaluator == 'Recall':
        test_evaluator = Recall(recall_at=cutoffs)
    else:
        test_evaluator = NDCG(ndcg_at=cutoffs)

    model_trainer = ImplicitModelTrainer(batch_size=batch_size,
                                         test_batch_size=test_batch_size,
                                         train_dataset=train_dataset,
                                         model=model,
                                         sampler=sampler,
                                         item_serving_size=1,
                                         eval_save_prefix=outdir)
    # One extra iteration beyond display_itr; presumably so the final
    # display/evaluation step at itr == num_itr is reached -- confirm
    # against ImplicitModelTrainer.train.
    model_trainer.train(num_itr=num_itr + 1,
                        display_itr=num_itr,
                        eval_datasets=[test_dataset],
                        evaluators=[test_evaluator],
                        num_negatives=200)
예제 #2
0
                max_item=train_dataset.max_item(),
                dim_embed=50,
                opt='Adam',
                sess_config=None,
                l2_reg=0.1)
# Evaluate a previously saved CML model on test_dataset.
# NOTE(review): batch_size, test_batch_size, train_dataset, test_dataset and
# cml_model are defined outside this excerpt.

# Pairwise sampler over the training set, using 5 worker processes.
sampler = PairwiseSampler(batch_size=batch_size,
                          dataset=train_dataset,
                          num_process=5)
# The trainer object is only used here as a host for its evaluation
# machinery; train() is never called in this excerpt.
model_trainer = ImplicitModelTrainer(batch_size=batch_size,
                                     test_batch_size=test_batch_size,
                                     train_dataset=train_dataset,
                                     model=cml_model,
                                     sampler=sampler,
                                     eval_save_prefix="cml-citeulike")
# Metrics to report; the ranking metrics use cut-offs 10, 20, ..., 100.
auc_evaluator = AUC()
recall_evaluator = Recall(recall_at=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
precision_evaluator = Precision(
    precision_at=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100])
ndcg_evaluator = NDCG(ndcg_at=[10, 20, 30, 40, 50, 60, 70, 80, 90, 100])

# Load model state stored under the "cml-citeulike" name (presumably written
# by an earlier training run -- confirm).
cml_model.load("cml-citeulike")

# Configure evaluation by setting the trainer's private attributes directly
# and calling its private helpers in sequence.
# NOTE(review): this looks like it replicates the setup that
# ImplicitModelTrainer.train() would normally perform; the call order is
# presumably significant -- confirm against the trainer implementation.
model_trainer._eval_manager = ImplicitEvalManager(evaluators=[
    auc_evaluator, recall_evaluator, ndcg_evaluator, precision_evaluator
])
model_trainer._num_negatives = 200
model_trainer._exclude_positives([test_dataset])
model_trainer._sample_negatives(seed=10)
# Use a different prefix from the one given at construction for this
# evaluation run.
model_trainer._eval_save_prefix = "cml-citeulike-test"
model_trainer._evaluate_partial(test_dataset)
예제 #3
0
def run_exp(model_name=None,
            raw_data=None,
            user_per=1.0,
            keep_days=1,
            l2_reg=0.001,
            test_date=None,
            outdir=None):
    """Train a model on filtered history with a per-user validation hold-out.

    Ratings strictly before ``test_date`` form the training pool. For a
    random subset of users (NumPy global seed 10), rows older than
    ``keep_days`` days before ``test_date`` are dropped. One row per
    remaining user is then moved from the training set into a validation
    set, and the model is trained and evaluated with Recall and NDCG.

    Relies on module-level names that are not parameters: ``batch_size``,
    ``test_batch_size``, ``num_itr`` and ``display_itr``.

    Args:
        model_name: One of ``'PMF'``, ``'CML'`` or ``'BPR'``.
        raw_data: NumPy array indexable by the fields ``"user_id"``,
            ``"item_id"`` and ``"timestamp"``.
            NOTE(review): timestamps are compared against
            ``datetime.date`` objects, so they are presumably dates --
            confirm against the loader.
        user_per: Fraction of training users whose history is truncated.
        keep_days: Number of days before ``test_date`` that truncated
            users keep.
        l2_reg: L2 regularisation weight passed to the model.
        test_date: Split date as a ``"%Y-%m-%d"`` string.
        outdir: Accepted for signature parity; not used by this function.

    Returns:
        None. Prints a message and returns early when ``model_name`` is
        not recognised.
    """
    # parse dataset into incremental training and testing set
    data = raw_data
    # NOTE(review): using the number of distinct ids as the id-space size
    # assumes ids are contiguous and zero-based -- confirm.
    max_user = len(np.unique(data["user_id"]))
    max_item = len(np.unique(data["item_id"]))
    print("max_user:{}, max_item:{}".format(max_user, max_item))

    test_date = datetime.datetime.strptime(test_date, "%Y-%m-%d").date()
    print("test date:%s" % test_date)
    train_data = data[data["timestamp"] < test_date]

    np.random.seed(10)

    # filter training data, for selected users keep only the most recent n days of data
    print("filter user percentage:%f" % user_per)
    print("ratings before filter:%d" % len(train_data))
    user_list = np.unique(train_data["user_id"])
    filter_user = np.random.choice(user_list,
                                   int(len(user_list) * user_per),
                                   replace=False)
    # True for rows that get dropped: selected user AND older than the
    # keep_days window.
    mask = (np.isin(train_data["user_id"], filter_user)) & (
        train_data["timestamp"] <
        (test_date - datetime.timedelta(days=keep_days)))
    train_data = train_data[~mask]
    print("ratings after filter:%d" % len(train_data))

    # Hold out one row per user for validation: the FIRST row of each user
    # in the current array order. return_index gives exactly those
    # first-occurrence positions (ordered by sorted user id) in a single
    # pass instead of one full scan per user.
    # NOTE(review): this is the user's most recent rating only if raw_data
    # is ordered newest-first -- confirm the input ordering.
    _, val_index = np.unique(train_data["user_id"], return_index=True)
    val_data = train_data[val_index]
    train_data = np.delete(train_data, val_index)
    print("trian data: %d, validation data %d" %
          (len(train_data), len(val_data)))

    train_dataset = ImplicitDataset(train_data,
                                    max_user,
                                    max_item,
                                    name='Train')
    val_dataset = ImplicitDataset(val_data, max_user, max_item, name='Val')

    # Resolve the model/sampler pairing. Names are looked up only in the
    # branch that is taken, exactly as in a plain if/elif chain.
    if model_name == 'PMF':
        model_cls, sampler_cls = PMF, PointwiseSampler
        sampler_extra = {'pos_ratio': 0.5}
    elif model_name == 'CML':
        model_cls, sampler_cls = CML, PairwiseSampler
        sampler_extra = {}
    elif model_name == 'BPR':
        model_cls, sampler_cls = BPR, PairwiseSampler
        sampler_extra = {}
    else:
        print("Wrong model assigned")
        return

    num_process = 8
    dim_embed = 50
    model = model_cls(batch_size=batch_size,
                      max_user=train_dataset.max_user(),
                      max_item=train_dataset.max_item(),
                      dim_embed=dim_embed,
                      opt='Adam',
                      l2_reg=l2_reg)
    sampler = sampler_cls(batch_size=batch_size,
                          dataset=train_dataset,
                          num_process=num_process,
                          **sampler_extra)

    cutoffs = [10, 20, 30, 40, 50, 60, 70, 80, 90, 100]
    recall_evaluator = Recall(recall_at=cutoffs)
    # Separate list object per evaluator, as in the original literals.
    ndcg_evaluator = NDCG(ndcg_at=list(cutoffs))

    model_trainer = ImplicitModelTrainer(batch_size=batch_size,
                                         test_batch_size=test_batch_size,
                                         train_dataset=train_dataset,
                                         model=model,
                                         sampler=sampler,
                                         item_serving_size=1)
    # num_itr and display_itr come from module scope, not from parameters.
    model_trainer.train(num_itr=num_itr,
                        display_itr=display_itr,
                        eval_datasets=[val_dataset],
                        evaluators=[recall_evaluator, ndcg_evaluator],
                        num_negatives=200)