Example #1
def affinity_matrix(test_specs):
    """Generate a random user/item affinity matrix. By increasing the likehood of 0 elements we simulate
    a typical recommending situation where the input matrix is highly sparse.

    Args:
        users (int): number of users (rows).
        items (int): number of items (columns).
        ratings (int): rating scale, e.g. 5 meaning rates are from 1 to 5.
        spars: probability of obtaining zero. This roughly corresponds to the sparseness.
               of the generated matrix. If spars = 0 then the affinity matrix is dense.

    Returns:
        np.array: sparse user/affinity matrix of integers.

    """

    np.random.seed(test_specs["seed"])

    # uniform probability mass for each nonzero rating; the remaining mass ("spars") goes to 0 (unrated)
    s = [(1 - test_specs["spars"]) / test_specs["ratings"]
         ] * test_specs["ratings"]
    s.append(test_specs["spars"])
    P = s[::-1]

    # generate the user/item affinity matrix; ratings range from 1 to "ratings", with 0 denoting an unrated item
    X = np.random.choice(test_specs["ratings"] + 1,
                         (test_specs["users"], test_specs["items"]),
                         p=P)

    Xtr, Xtst = numpy_stratified_split(X,
                                       ratio=test_specs["ratio"],
                                       seed=test_specs["seed"])

    return (Xtr, Xtst)
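For reference, here is a minimal sketch of a `test_specs` pytest fixture that could drive this helper. The keys are the ones dereferenced above and in the test in Example #2; the values are illustrative assumptions, not the ones used in the source repository.

import numpy as np
import pytest


@pytest.fixture(scope="module")
def test_specs():
    # Hypothetical values; only the keys read by affinity_matrix() and the
    # stratified-split test are assumed here.
    return {
        "users": 30,
        "items": 53,
        "ratings": 5,
        "seed": 123,
        "spars": 0.8,
        "ratio": 0.7,
        "tolerance": 0.01,
        "fluctuation": 0.02,
    }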
Example #2
def test_int_numpy_stratified_splitter(test_specs, python_int_dataset):
    # generate a synthetic dataset
    X = python_int_dataset

    # the splitter returns (in order) the train and test user/item affinity matrices
    Xtr, Xtst = numpy_stratified_split(X,
                                       ratio=test_specs["ratio"],
                                       seed=test_specs["seed"])

    # check that the generated matrices have the correct dimensions
    assert (Xtr.shape[0] == X.shape[0]) & (Xtr.shape[1] == X.shape[1])
    assert (Xtst.shape[0] == X.shape[0]) & (Xtst.shape[1] == X.shape[1])

    # number of rated items per user: overall, in the train split, and in the test split
    X_rated = np.sum(X != 0, axis=1)
    Xtr_rated = np.sum(Xtr != 0, axis=1)
    Xtst_rated = np.sum(Xtst != 0, axis=1)

    # global split: check that the whole dataset is split in the correct ratio
    assert Xtr_rated.sum() / (X_rated.sum()) == pytest.approx(
        test_specs["ratio"], test_specs["tolerance"])

    assert Xtst_rated.sum() / (X_rated.sum()) == pytest.approx(
        1 - test_specs["ratio"], test_specs["tolerance"])

    # This implementation of the stratified splitter performs a random split at the single-user level. Here we
    # check that this more stringent condition also holds. Note that user-to-user fluctuations in the split
    # ratio are stronger than for the entire dataset, due to the random nature of the per-user splitting.
    # For this reason we allow a slightly larger tolerance, as specified in test_specs.

    assert (Xtr_rated / X_rated <=
            test_specs["ratio"] + test_specs["fluctuation"]).all() & (
                Xtr_rated / X_rated >=
                test_specs["ratio"] - test_specs["fluctuation"]).all()

    assert (Xtst_rated / X_rated <=
            (1 - test_specs["ratio"]) + test_specs["fluctuation"]).all() & (
                Xtst_rated / X_rated >=
                (1 - test_specs["ratio"]) - test_specs["fluctuation"]).all()
Example #3
def main():
    args = read_args()
    model = args['model']
    top_k = args['max_len']
    latent_dim = args['LATENT_DIM']
    encoder_dims = []
    encoder_dims.append(args['ENCODER_DIMS'])
    intermediate_dim = args['INTERMEDIATE_DIM']
    num_epochs = args['NUM_EPOCHS']
    batch_size = args['BATCH_SIZE']
    holdout_pct = args['HOLDOUT_PCT']
    thresh = args['BINARIZATION_THRESHOLD']

    training_path = args['train']
    test_path = args['test']
    result_file_path = Path(args['result'])
    temp_weights_path = result_file_path.parent / 'svae_weights.hdf5'

    if model == 'StandardVAE':
        print("Running StandardVAE")
        tensorflow.compat.v1.disable_eager_execution()
        #tensorflow.python.framework_ops.disable_eager_execution()
        train_df = pd.read_csv(training_path, sep="\t", header=None)
        train_df.columns = ["userID", "itemID", "rating"]
        unique_train_items = pd.unique(train_df['itemID'])
        # Create train/validation user splits

        unique_users = sorted(train_df.userID.unique())
        np.random.seed(SEED)
        unique_users = np.random.permutation(unique_users)

        n_users = len(unique_users)
        heldout_users = int((n_users * holdout_pct) / 100)

        train_users = unique_users[:(n_users - heldout_users)]

        val_users = unique_users[(n_users - heldout_users):]

        # For training set keep only users that are in train_users list
        train_set = train_df.loc[train_df['userID'].isin(train_users)]

        # For validation set keep only users that are in val_users list
        val_set = train_df.loc[train_df['userID'].isin(val_users)]

        # For the validation set keep only items that appear in the training set
        val_set = val_set.loc[val_set['itemID'].isin(unique_train_items)]

        # Theoretically we could use this for predicting rating values.
        # Right now it only works for list-wise predictions.
        if top_k <= 0:
            test = pd.read_csv(test_path, sep="\t", header=None)
            test.columns = ["userID", "itemID", "rating"]

        am_train = AffinityMatrix(df=train_set, items_list=unique_train_items)
        am_val = AffinityMatrix(df=val_set, items_list=unique_train_items)

        train_set, _, _ = am_train.gen_affinity_matrix()
        val_data, val_map_users, val_map_items = am_val.gen_affinity_matrix()
        val_data_tr, val_data_te = numpy_stratified_split(val_data, ratio=0.75, seed=SEED)
        # Binarize the training data and the validation data (full matrix and its train split)
        train_data = binarize(a=train_set, threshold=thresh)
        val_data = binarize(a=val_data, threshold=thresh)
        val_data_tr = binarize(a=val_data_tr, threshold=thresh)

        # Binarize the test part of the validation data (keep the non-binary ratings in a separate object; they are used to compute NDCG)
        val_data_te_ratings = val_data_te.copy()
        val_data_te = binarize(a=val_data_te, threshold=3.5)

        print('Number of users: {}'.format(train_set.shape[0]))
        print('Number of items: {}'.format(train_set.shape[1]))

        # Train the model
        # svae = StandardVAE(
        #     k=latent_dim,
        #     encoder_structure=encoder_dims,
        #     act_fn=act_func,
        #     likelihood=likelihood,
        #     n_epochs=num_epochs,
        #     batch_size=batch_size,
        #     learning_rate=learning_rate,
        #     seed=SEED,
        #     use_gpu=torch.cuda.is_available(),
        #     verbose=True
        # )

        svae = StandardVAE(n_users=train_set.shape[0],  # Number of unique users in the training set
                    original_dim=train_set.shape[1],  # Number of unique items in the training set
                    intermediate_dim=intermediate_dim,
                    latent_dim=latent_dim,
                    n_epochs=num_epochs,
                    batch_size=batch_size,
                    k=top_k,
                    verbose=0,
                    seed=SEED,
                    save_path=str(temp_weights_path),
                    drop_encoder=0.5,
                    drop_decoder=0.5,
                    annealing=False,
                    beta=1.0
                    )

        svae.fit(
            x_train=train_data,
            x_valid=val_data,
            x_val_tr=val_data_tr,
            x_val_te=val_data_te_ratings,  # with the original ratings
            mapper=am_val,
        )

        # Recommend the top-k items for each user, removing items already seen in training
        final_result = svae.recommend_k_items(train_data, top_k, remove_seen=True)
        #all_predictions = predict_ranking(svae, train, usercol='userID', itemcol='itemID', remove_seen=True)
        #final_result = get_top_k(all_predictions, top_k)
        top_k_df = am_train.map_back_sparse(final_result, kind='prediction')
        top_k_df.columns = ['userID', 'itemID', 'rating']

        top_k_sorted = top_k_df.sort_values(by=['userID', 'rating'], ascending=[True, False])

        top_k_sorted.to_csv(result_file_path, header=False, index=False, sep=',')
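`read_args()` is not shown in this example. A hypothetical argparse-based version, assuming only the dictionary keys read in main(), might look like this:

import argparse


def read_args():
    # Hypothetical sketch: the real script may define these arguments differently;
    # the default values below are placeholders.
    parser = argparse.ArgumentParser(description="Train a Standard VAE recommender.")
    parser.add_argument('--model', default='StandardVAE')
    parser.add_argument('--train', required=True, help='path to the tab-separated training file')
    parser.add_argument('--test', required=True, help='path to the tab-separated test file')
    parser.add_argument('--result', required=True, help='output CSV with the top-k recommendations')
    parser.add_argument('--max_len', type=int, default=10, help='top-k cutoff')
    parser.add_argument('--LATENT_DIM', type=int, default=70)
    parser.add_argument('--ENCODER_DIMS', type=int, default=200)
    parser.add_argument('--INTERMEDIATE_DIM', type=int, default=200)
    parser.add_argument('--NUM_EPOCHS', type=int, default=400)
    parser.add_argument('--BATCH_SIZE', type=int, default=100)
    parser.add_argument('--HOLDOUT_PCT', type=int, default=10, help='percentage of users held out for validation')
    parser.add_argument('--BINARIZATION_THRESHOLD', type=float, default=3.5)
    return vars(parser.parse_args())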