def affinity_matrix(test_specs):
    """Build a random user/item affinity matrix and split it into train/test parts.

    Zero entries (unrated items) are drawn with probability
    ``test_specs["spars"]``; the remaining probability mass is shared
    uniformly among the ratings ``1..test_specs["ratings"]``. Raising the
    likelihood of zeros mimics a typical recommendation setting in which the
    input matrix is highly sparse. With ``spars = 0`` the matrix is dense.

    Args:
        test_specs (dict): test configuration. The keys read here are:
            "seed" (seed for the numpy global random state and the splitter),
            "spars" (probability of drawing a zero),
            "ratings" (rating scale, e.g. 5 means ratings from 1 to 5),
            "users" (number of rows), "items" (number of columns) and
            "ratio" (forwarded to ``numpy_stratified_split``).

    Returns:
        tuple: ``(Xtr, Xtst)``, the two matrices produced by
        ``numpy_stratified_split`` from the generated affinity matrix.
    """
    # Seed the global numpy generator so the drawn matrix is reproducible.
    np.random.seed(test_specs["seed"])

    zero_prob = test_specs["spars"]
    n_ratings = test_specs["ratings"]

    # Index 0 (unrated) gets `zero_prob`; every rating value gets an equal
    # share of what is left.
    rating_prob = (1 - zero_prob) / n_ratings
    probabilities = [zero_prob] + [rating_prob] * n_ratings

    # Draw values in 0..n_ratings, where 0 denotes an unrated item.
    shape = (test_specs["users"], test_specs["items"])
    affinity = np.random.choice(n_ratings + 1, shape, p=probabilities)

    train, test = numpy_stratified_split(
        affinity, ratio=test_specs["ratio"], seed=test_specs["seed"]
    )
    return (train, test)
def test_int_numpy_stratified_splitter(test_specs, python_int_dataset):
    """Check shapes and split ratios of ``numpy_stratified_split`` on integer data.

    Args:
        test_specs (dict): test configuration; reads the keys "ratio", "seed",
            "tolerance" and "fluctuation".
        python_int_dataset: synthetic user/item matrix to split (rows are
            reduced along axis 1, zero entries are treated as unrated).
    """

    def rated_per_user(matrix):
        # Number of non-zero (i.e. rated) entries in every row.
        return (matrix != 0).sum(axis=1)

    split_ratio = test_specs["ratio"]
    full = python_int_dataset

    # The splitter returns the train and test user/item matrices.
    train, test = numpy_stratified_split(
        full, ratio=split_ratio, seed=test_specs["seed"]
    )

    # Both parts must keep the dimensions of the input matrix.
    assert train.shape[0] == full.shape[0] and train.shape[1] == full.shape[1]
    assert test.shape[0] == full.shape[0] and test.shape[1] == full.shape[1]

    full_rated = rated_per_user(full)
    train_rated = rated_per_user(train)
    test_rated = rated_per_user(test)

    # Global split: over the whole dataset the rated items must be divided
    # in the requested ratio, within the relative tolerance.
    tolerance = test_specs["tolerance"]
    total_rated = full_rated.sum()
    assert train_rated.sum() / total_rated == pytest.approx(
        split_ratio, rel=tolerance
    )
    assert test_rated.sum() / total_rated == pytest.approx(
        1 - split_ratio, rel=tolerance
    )

    # Per-user split: the splitter works at the single-user level, so the
    # ratio is also checked row by row. User-to-user fluctuations are stronger
    # than for the whole dataset, hence the wider "fluctuation" band.
    fluctuation = test_specs["fluctuation"]

    train_share = train_rated / full_rated
    assert (
        (train_share <= split_ratio + fluctuation)
        & (train_share >= split_ratio - fluctuation)
    ).all()

    test_share = test_rated / full_rated
    assert (
        (test_share <= (1 - split_ratio) + fluctuation)
        & (test_share >= (1 - split_ratio) - fluctuation)
    ).all()
def RBMtrain(data_path="SnacksData100.csv", output_path="testdata"):
    """Train an RBM on a ratings CSV and dump its top-k predictions to disk.

    The CSV is loaded into a data frame, converted to a user/item affinity
    matrix, split into train and test matrices and used to fit an RBM. The
    recommendations produced for the test matrix are mapped back to a data
    frame and written with ``joblib.dump``.

    Args:
        data_path (str): path of the ratings CSV to read. Defaults to
            "SnacksData100.csv", the value that was previously hard-coded.
        output_path (str): path the prediction data frame is dumped to.
            Defaults to "testdata", the value that was previously hard-coded.

    Returns:
        None
    """
    data = pd.read_csv(data_path)

    # Column names handed to AffinityMatrix as keyword arguments.
    header = {
        "col_user": "******",
        "col_item": "Product_Id",
        "col_rating": "Ratings",
    }

    am = AffinityMatrix(DF=data, **header)
    X = am.gen_affinity_matrix()

    # Split with the splitter's default arguments.
    Xtr, Xtst = numpy_stratified_split(X)

    model = RBM(
        hidden_units=600,
        training_epoch=30,
        minibatch_size=60,
        keep_prob=0.9,
        with_metrics=True,
    )
    # The test matrix is passed to fit() together with the training matrix.
    model.fit(Xtr, Xtst)

    # recommend_k_items returns two values; the second one is not needed here.
    top_k, _test_time = model.recommend_k_items(Xtst)

    top_k_df = am.map_back_sparse(top_k, kind='prediction')
    # NOTE(review): test_df is computed but never used or persisted — confirm
    # whether it should be dumped alongside the predictions or dropped.
    test_df = am.map_back_sparse(Xtst, kind='ratings')

    joblib.dump(top_k_df, output_path)
header = { "col_user": "******", "col_item": "MovieID", "col_rating": "Rating", } # Use a sparse matrix representation rather than a pandas data frame # for significant performance gain. am = AffinityMatrix(DF=data, **header) X = am.gen_affinity_matrix() # Contstruct the training and test datasets. Xtr, Xtst = numpy_stratified_split(X) print('\nTraining matrix size (users, movies) is:', Xtr.shape) print('Testing matrix size is: ', Xtst.shape) # Initialize the model class. Note that through random variation we # can get a much better performing model with seed=1! model = RBM( hidden_units=600, training_epoch=30, minibatch_size=60, keep_prob=0.9, with_metrics=True, # seed = 1, )