def fit_transform(self, X):
    """Fit the WMF model to ``X`` and return the row-side factors.

    Builds the log-surplus confidence matrix from ``X``, factorizes it,
    stores the column-side factors on ``self.components_``, and returns
    the row-side (transformed) representation.
    """
    # NOTE(review): `self.n_ter` looks like a typo for `n_iter` — confirm
    # against the attribute actually set in __init__ before renaming.
    confidence = log_surplus_confidence_matrix(X, self.alpha, self.epsilon)
    row_factors, self.components_ = factorize(
        confidence, self.k, self.lambda_reg, self.n_ter,
        self.init_std, self.verbose)
    return row_factors
def train(self, data, num_factors=25, lambda_reg=1e-3, num_iterations=2, init_std=0.01, verbose=True):
    """Factorize ``data`` with WMF (bias variant) and cache the results.

    Keeps the raw data on ``self.data``, stores the two factor matrices on
    ``self.W`` / ``self.H``, and marks the model as trained.
    """
    self.data = data
    self.W, self.H = wmf.factorize(
        data,
        num_factors=num_factors,
        lambda_reg=lambda_reg,
        num_iterations=num_iterations,
        init_std=init_std,
        verbose=verbose,
        dtype=np.float64,
        recompute_factors=wmf.recompute_factors_bias,
    )
    self.trained = True
# NOTE(review): this chunk appears truncated at the top — `np` and `wmf` are
# used below but their imports are not visible here (compare the sibling
# script that starts with `import numpy as np`); confirm they are imported
# above this point.
import batched_inv_precompute
import solve_mp
import solve_gpu

# Fixed seed so the column shuffle and factor init are reproducible.
np.random.seed(123)

B = np.load("test_matrix_large.pkl")

# shuffle columns of B so the dense parts are evenly distributed
indices = np.arange(B.shape[1])
np.random.shuffle(indices)
B = B[:, indices]

S = wmf.log_surplus_confidence_matrix(B, alpha=2.0, epsilon=1e-6)

# presumably 40 latent factors + 1 bias dimension (recompute_factors_bias*) — confirm
num_factors = 40 + 1
num_iterations = 1
batch_size = 10000

# solve = batched_inv.solve_sequential
# solve = solve_mp.solve_mp
solve = solve_gpu.solve_gpu

# U, V = wmf.factorize(S, num_factors=num_factors, lambda_reg=1e-5, num_iterations=num_iterations, init_std=0.01, verbose=True, dtype='float32',
#                      recompute_factors=batched_inv.recompute_factors_bias_batched, batch_size=batch_size, solve=solve)

U, V = wmf.factorize(S, num_factors=num_factors, lambda_reg=1e-5, num_iterations=num_iterations, init_std=0.01, verbose=True, dtype='float32',
                     recompute_factors=batched_inv_precompute.recompute_factors_bias_batched_precompute, batch_size=batch_size, solve=solve)
import numpy as np
import wmf
import batched_inv
import batched_inv_precompute
import solve_mp
import solve_gpu

# Fixed seed so factor initialization is reproducible.
np.random.seed(123)

B = np.load("test_matrix.pkl")
S = wmf.log_surplus_confidence_matrix(B, alpha=2.0, epsilon=1e-6)

# presumably 40 latent factors + 1 bias dimension (recompute_factors_bias*) — confirm
num_factors = 40 + 1
num_iterations = 1
batch_size = 10000

# solve = batched_inv.solve_sequential
# solve = solve_mp.solve_mp
solve = solve_gpu.solve_gpu

# U, V = wmf.factorize(S, num_factors=num_factors, lambda_reg=1e-5, num_iterations=num_iterations, init_std=0.01, verbose=True, dtype='float32',
#                      recompute_factors=batched_inv.recompute_factors_bias_batched, batch_size=batch_size, solve=solve)

U, V = wmf.factorize(S, num_factors=num_factors, lambda_reg=1e-5, num_iterations=num_iterations, init_std=0.01, verbose=True, dtype='float32',
                     recompute_factors=batched_inv_precompute.recompute_factors_bias_batched_precompute, batch_size=batch_size, solve=solve)
import numpy as np
import wmf

# Load the interaction matrix and run plain WMF with the bias-aware update.
B = np.load("test_matrix.pkl")
S = wmf.log_surplus_confidence_matrix(B, alpha=2.0, epsilon=1e-6)
# num_factors=41: presumably 40 latent factors + 1 bias dimension — confirm
U, V = wmf.factorize(S, num_factors=41, lambda_reg=1e-5, num_iterations=2, init_std=0.01, verbose=True, dtype='float32', recompute_factors=wmf.recompute_factors_bias)
import numpy as np
import pandas as pd
import scipy.sparse as sparse
import json
import wmf

if __name__ == "__main__":
    # Sparse song x user matrix previously saved with scipy.sparse.save_npz.
    song_user_sparse_matrix = sparse.load_npz('..\\Metadata\\song_user_matrix.npz')
    #dense_matrix = song_user_sparse_matrix.todense()
    confidence_matrix = wmf.log_surplus_confidence_matrix(song_user_sparse_matrix,alpha=40,epsilon=10**-8)
    song_latent_factors,user_latent_factors = wmf.factorize(confidence_matrix,num_factors=100)
    # NOTE(review): np.savez_compressed with a positional array stores it under
    # the default key 'arr_0' — loaders must index accordingly; confirm intended.
    np.savez_compressed('..\\Metadata\\song_latent_factors.npz',song_latent_factors)
    np.savez_compressed('..\\Metadata\\user_latent_factors.npz',user_latent_factors)
    print(user_latent_factors.shape,song_latent_factors.shape)
    #print(user_latent_factors)
def main(data, p, q, K, M, num_iterations, alpha, lambda_reg, init_std):
    """Train/lookup/test evaluation of WMF with fixed item factors.

    1. Trains WMF on the training users (all items).
    2. Fixes the learned item factors and fits factors for the held-out
       (lookup == test) users on the lookup items.
    3. Scores the test users on the test items and prints nDCG and
       precision@M.

    Parameters (as used below — semantics of p/q live in pq_samp_split):
        data: (n_edges, 3) array of (user, item, weight) triples.
        p, q: split fractions passed to pq_samp_split.
        K: number of latent factors.
        M: cutoff for precision@M.
    """
    graph = (data[:, :2], data[:, 2])
    graph, _ = zero_index_sparse_graph(graph, axis=0)
    graph, _ = zero_index_sparse_graph(graph, axis=1)

    U = np.unique(graph[0][:, 0])
    nU = U.shape[0]
    I = np.unique(graph[0][:, 1])
    nI = I.shape[0]

    # Data split: train graph, lookup graph, test graph, plus id sets.
    tr_graph, lu_graph, ts_graph, tr_U, lu_I = pq_samp_split(graph, p, q)

    # (The original recomputed U/nU/I/nI here from the same, unchanged graph;
    # that duplicate — marked "THIS IS CONFUSING. FIX IT!" — was removed.)
    tr_U_zero_indexer = zero_index(tr_U, True)

    n_tr_U = tr_U.shape[0]
    tr_I = I
    n_tr_I = nI

    # Users absent from the training split serve as both lookup and test users.
    lu_U = ts_U = np.setdiff1d(U, tr_U, assume_unique=True)
    n_lu_U = n_ts_U = lu_U.shape[0]
    lu_U_zero_indexer = ts_U_zero_indexer = zero_index(lu_U, True)

    n_lu_I = lu_I.shape[0]
    lu_I_zero_indexer = zero_index(lu_I, True)

    # Items absent from the lookup split are the test items.
    ts_I = np.setdiff1d(I, lu_I, assume_unique=True)
    n_ts_I = ts_I.shape[0]
    ts_I_zero_indexer = zero_index(ts_I, True)

    ## 1. Train WMF on training users x all items.
    zi_tr_graph = zero_index_sparse_graph(tr_graph, axis=0, convert=tr_U_zero_indexer)
    zi_tr_graph_sparse = csr_matrix(
        (zi_tr_graph[1], (zi_tr_graph[0][:, 0], zi_tr_graph[0][:, 1])),
        shape=(n_tr_U, n_tr_I))
    S_tr_sparse = wmf.log_surplus_confidence_matrix(zi_tr_graph_sparse, alpha=alpha, epsilon=TINY)
    tr_U_f, tr_I_f = wmf.factorize(
        S_tr_sparse, num_factors=K, lambda_reg=lambda_reg,
        num_iterations=num_iterations, init_std=init_std, verbose=True,
        dtype='float32', recompute_factors=wmf.recompute_factors_bias)
    tr_I_f = tr_I_f.T  # columns are now item factor vectors

    ## 2. Train on graph_lookup: fix item_feat.
    zi_lu_graph = zero_index_sparse_graph(lu_graph, axis=1, convert=lu_I_zero_indexer)
    zi_lu_graph = zero_index_sparse_graph(zi_lu_graph, axis=0, convert=lu_U_zero_indexer)
    zi_lu_graph_sparse = csr_matrix(
        (zi_lu_graph[1], (zi_lu_graph[0][:, 0], zi_lu_graph[0][:, 1])),
        shape=(n_lu_U, n_lu_I))
    S_lu_sparse = wmf.log_surplus_confidence_matrix(zi_lu_graph_sparse, alpha=alpha, epsilon=TINY)
    # FIX: factorize the confidence matrix S_lu_sparse; the original computed
    # it and then passed the raw count matrix zi_lu_graph_sparse instead,
    # leaving S_lu_sparse unused.
    lu_U_f, _ = wmf.factorize(
        S_lu_sparse, num_factors=K, lambda_reg=lambda_reg,
        num_iterations=num_iterations, init_std=init_std, verbose=True,
        dtype='float32', recompute_factors=wmf.recompute_factors_bias,
        V=tr_I_f[:, lu_I].T)

    ## 3. Evaluate test users x test items.
    ts_U_f = lu_U_f
    ts_I_f = tr_I_f[:, ts_I]
    predictions = np.matmul(ts_U_f, ts_I_f)

    zi_ts_graph = zero_index_sparse_graph(ts_graph, axis=0, convert=ts_U_zero_indexer)
    zi_ts_graph = zero_index_sparse_graph(zi_ts_graph, axis=1, convert=ts_I_zero_indexer)

    # FIX: `topk` was used below while its computation (a commented-out
    # torch.topk call) never ran, raising NameError. Rank every test item per
    # user by descending predicted score with numpy instead — equivalent to
    # torch.topk(..., n_ts_I).indices for distinct scores, with no torch
    # dependency.
    topk_indices = np.argsort(-predictions, axis=1)

    nDCG_score = nDCG(np.r_[0:ts_U.shape[0]], topk_indices, zi_ts_graph[0])
    precision_score = precision_at_m(np.r_[0:ts_U.shape[0]], topk_indices, zi_ts_graph[0], m=M)
    print("nDCG Score for is {}".format(np.mean(nDCG_score)))
    print("Precision at {} is {}".format(M, np.mean(precision_score)))
nI = I.shape[0] # Split data train, test = edge_samp_split(graph, 0.8) train_sparse = csr_matrix((train[1], (train[0][:, 0], train[0][:, 1])), shape=(nU, nI)) ## Train NMF K = 10 print("Using {} loss".format(loss)) # model = NMF(n_components=K, init='random', random_state=0, beta_loss=loss, solver='mu', max_iter=1000) S = wmf.log_surplus_confidence_matrix(train_sparse, alpha=2.0, epsilon=1e-6) user_features, item_features = wmf.factorize(S, num_factors=K, lambda_reg=1e-5, num_iterations=20, init_std=0.01, verbose=True, dtype='float32', recompute_factors=wmf.recompute_factors_bias) ts_U = np.unique(test[0][:, 0]) zi_test, test_convert = zero_index_sparse_graph(test) mask_edges = train[0][np.in1d(train[0][:, 0], ts_U)] mask_edges[:, 0] = test_convert[mask_edges[:, 0]] test_user_features = user_features[ts_U] predictions = np.matmul(test_user_features, item_features.T) # Scatter update might be faster but for correctness for edge in mask_edges: predictions[tuple(edge)] = 0. # Recommend top_k
def main(data, p, K, M, num_iterations, alpha, lambda_reg, init_std):
    """Edge-sampled train/test evaluation of WMF.

    Splits the interaction graph into train/test edges, trains WMF on the
    training edges, zeroes out the test users' training interactions in the
    prediction matrix, and prints nDCG and precision@M over the full ranking.

    Parameters (as used below):
        data: (n_edges, 3) array of (user, item, weight) triples.
        p: train fraction passed to edge_samp_split.
        K: number of latent factors.
        M: cutoff for precision@M.
    """
    graph = (data[:, :2], data[:, 2])
    graph, _ = zero_index_sparse_graph(graph, axis=0)
    graph, _ = zero_index_sparse_graph(graph, axis=1)

    U = np.unique(graph[0][:, 0])
    nU = U.shape[0]
    I = np.unique(graph[0][:, 1])
    nI = I.shape[0]

    # Split data into train/test edge sets.
    train, test = edge_samp_split(graph, p)
    train_sparse = csr_matrix((train[1], (train[0][:, 0], train[0][:, 1])), shape=(nU, nI))

    ## Train WMF (bias-aware update).
    S = wmf.log_surplus_confidence_matrix(train_sparse, alpha=alpha, epsilon=TINY)
    user_features, item_features = wmf.factorize(
        S, num_factors=K, lambda_reg=lambda_reg,
        num_iterations=num_iterations, init_std=init_std, verbose=True,
        dtype='float32', recompute_factors=wmf.recompute_factors_bias)

    ts_U = np.unique(test[0][:, 0])
    zi_test, test_convert = zero_index_sparse_graph(test)

    # Mask the test users' training interactions so they are not re-recommended.
    mask_edges = train[0][np.in1d(train[0][:, 0], ts_U)]
    mask_edges[:, 0] = test_convert[mask_edges[:, 0]]

    test_user_features = user_features[ts_U]
    predictions = np.matmul(test_user_features, item_features.T)
    # Scatter update might be faster but for correctness
    for edge in mask_edges:
        predictions[tuple(edge)] = 0.

    # Recommend top_k: every item ranked by descending score per test user.
    topk = torch.topk(torch.tensor(predictions), I.shape[0])[1].numpy()

    # Evaluate: More metrics?
    nDCG_score = nDCG(np.r_[0:ts_U.shape[0]], topk, zi_test[0])
    precision_score = precision_at_m(np.r_[0:ts_U.shape[0]], topk, zi_test[0], M)
    print("nDCG Score for is {}".format(np.mean(nDCG_score)))
    # FIX: was `format(m, ...)` — `m` is undefined in this scope (NameError);
    # the cutoff parameter is `M`, as used in precision_at_m above.
    print("Precision at {} is {}".format(M, np.mean(precision_score)))
# NOTE(review): this fragment starts mid-function — `indx`, `user_dict`, `i`,
# `l`, `R_test`, `R_train`, `num_train`, `num_test` and the enclosing loop and
# `def` are above this chunk; indentation below is reconstructed.
    np.random.shuffle(indx)
    temp = np.asarray(user_dict[i])
    # NOTE(review): `l / 2` as a slice bound is float division on Python 3
    # (TypeError when slicing) — looks Python 2 era; confirm whether `l // 2`
    # is intended before running under Python 3.
    R_test[i, temp[indx[:l / 2], 0]] = temp[indx[:l / 2], 1]
    R_train[i, temp[indx[l / 2:], 0]] = temp[indx[l / 2:], 1]
    num_train += len(indx[l / 2:])
    num_test += len(indx[:l / 2])
    # NOTE(review): only the train matrix is returned; R_test / num_train /
    # num_test appear to be populated for a caller not visible here — confirm.
    return R_train.tocsr()


#path = 'ratings.csv'
#R = get_data(path)
R = load_data()
# Drop the tail rows/columns and binarize the remaining ratings.
R = R[:-1000000, :-100000]
R.data = np.ones_like(R.data)

S = wmf.log_surplus_confidence_matrix(R, alpha=20.0, epsilon=1e-6)
num_iters = 10
num_factors = 50
U, V = wmf.factorize(S, num_factors, R=R, num_iterations=num_iters, verbose=True)
print('rmse', rmse(R, U, V))
np.save('U_wmf_2', U)
np.save('V_wmf_2', V)