def testImplicit(self):
    regularization = 1e-9
    tolerance = 0.001
    counts = csr_matrix([[1, 1, 0, 1, 0, 0],
                         [0, 1, 1, 1, 0, 0],
                         [1, 0, 1, 0, 0, 0],
                         [1, 1, 0, 0, 0, 0],
                         [0, 0, 1, 1, 0, 0],
                         [0, 1, 0, 0, 0, 1],
                         [0, 0, 0, 0, 1, 0]], dtype=np.float64)

    def check_solution(rows, cols, counts):
        reconstructed = rows.dot(cols.T)
        for i in range(counts.shape[0]):
            for j in range(counts.shape[1]):
                self.assertTrue(abs(counts[i, j] - reconstructed[i, j]) < tolerance)

    # check cython version
    rows, cols = implicit.alternating_least_squares(counts * 2, 7, regularization,
                                                    use_native=True)
    check_solution(rows, cols, counts.todense())

    # try out pure python version
    rows, cols = implicit.alternating_least_squares(counts, 7, regularization,
                                                    use_native=False)
    check_solution(rows, cols, counts.todense())
def testImplicit(self):
    regularization = 1e-9
    tolerance = 0.001
    counts = csr_matrix(
        [[1, 1, 0, 1, 0, 0],
         [0, 1, 1, 1, 0, 0],
         [1, 0, 1, 0, 0, 0],
         [1, 1, 0, 0, 0, 0],
         [0, 0, 1, 1, 0, 0],
         [0, 1, 0, 0, 0, 1],
         [0, 0, 0, 0, 1, 0]], dtype=np.float64)

    def check_solution(rows, cols, counts):
        reconstructed = rows.dot(cols.T)
        for i in range(counts.shape[0]):
            for j in range(counts.shape[1]):
                self.assertTrue(
                    abs(counts[i, j] - reconstructed[i, j]) < tolerance)

    # try all 8 variants of native/python, cg/cholesky, and
    # 64 vs 32 bit factors
    for dtype in (np.float32, np.float64):
        for use_cg in (True, False):
            for use_native in (True, False):
                rows, cols = implicit.alternating_least_squares(
                    counts * 2, 7, regularization,
                    use_native=use_native, use_cg=use_cg, dtype=dtype)
                check_solution(rows, cols, counts.todense())
def alsRec(self):
    score = self.preProcessData()
    data, living = self.matrixData(score)
    weighted = self.bm25_weight(living)
    print(weighted.shape)
    user1_factors, user2_factors = implicit.alternating_least_squares(
        weighted, factors=5)
    print("save to redis")
    self.saveToRedis(data, user1_factors, user2_factors)
def fit(self, X, y=None):
    M = self.construct_sparse_matrix(X).tocoo()
    items = np.int32(M.col)
    self.popularity_ = self.count_popularity(items)
    M = M.toarray()
    M = M * self.alpha
    M = csr_matrix(M).astype('double')
    self.U, self.I = implicit.alternating_least_squares(
        M, factors=self.n_factors,
        regularization=self.regularization,
        iterations=self.n_epochs)
    return self
def recommend_boats(self):
    countrys_arr = np.array(self.countrys)
    boats_arr = np.array(self.boats)
    countrys_vecs, boats_vecs = implicit.alternating_least_squares(
        (self.boat_train * self.alpha).astype('double'),
        factors=20, regularization=0.1, iterations=50)
    return countrys_vecs, boats_vecs, boats_arr, countrys_arr
def recommend_destination(self):
    countrys_arr = np.array(self.countrys)  # Array of destination IDs from the ratings matrix
    distinations_arr = np.array(self.distinations)
    user_vecs, item_vecs = implicit.alternating_least_squares(
        (self.distination_train * self.alpha).astype('double'),
        factors=20, regularization=0.1, iterations=50)
    return user_vecs, item_vecs, distinations_arr, countrys_arr
def get_aucs_vs_factors_als():
    factors = [8, 16, 32, 64, 128]
    aucs = []
    for factor in factors:
        subreddit_factors, user_factors = alternating_least_squares(
            bm25_weight(comments), factor)
        aucs.append(auc(test_set[:20000], user_factors, subreddit_factors,
                        subreddits, users))
    return aucs
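# bm25_weight shows up in many of these examples as a preprocessing step
# before factorizing. A minimal sketch of that call on a toy matrix,
# assuming the bm25_weight helper shipped in implicit.nearest_neighbours
# (the matrix values and parameters here are illustrative only):
import numpy as np
from scipy.sparse import csr_matrix
from implicit.nearest_neighbours import bm25_weight

plays = csr_matrix(np.array([[5, 0, 1],
                             [0, 3, 0],
                             [2, 2, 0]], dtype=np.float64))
# BM25 damps the influence of very active users and very popular items,
# so raw counts don't dominate the factorization
weighted = bm25_weight(plays, K1=100, B=0.8).tocsr()
print(weighted.toarray())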
def calculate_similar_artists(input_filename, output_filename,
                              factors=50, regularization=0.01,
                              iterations=15, exact=False, trees=20,
                              use_native=True, dtype=numpy.float64):
    print("Calculating similar artists. This might take a while")
    print("reading data from %s" % input_filename)
    start = time.time()
    df, transfers = read_data(input_filename)
    print("read data file in %s" % (time.time() - start))

    print("weighting matrix by bm25")
    weighted = bm25_weight(transfers)

    print("calculating factors")
    start = time.time()
    artist_factors, user_factors = alternating_least_squares(
        weighted, factors=factors, regularization=regularization,
        iterations=iterations, use_native=use_native, dtype=dtype)
    print("calculated factors in %s" % (time.time() - start))

    # write out artists by popularity
    print("calculating top artists")
    user_count = df.groupby('artist').size()
    artists = dict(enumerate(df['artist'].cat.categories))
    to_generate = sorted(list(artists), key=lambda x: -user_count[x])

    if exact:
        calc = TopRelated(artist_factors)
    else:
        calc = ApproximateTopRelated(artist_factors, trees)

    print("writing top related to %s" % output_filename)
    with open(output_filename, "w") as o:
        for artistid in to_generate:
            artist = artists[artistid]
            for other, score in calc.get_related(artistid):
                o.write("%s\t%s\t%s\n" % (artist, artists[other], score))
                recommendedClients = RecommendedClients()
                recommendedClients.empresa = Empresa.objects.get(fiscal_id=artist)
                recommendedClients.clientes_recomendados = Empresa.objects.get(
                    fiscal_id=artists[other])
                recommendedClients.similarity = score
                recommendedClients.save()
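# TopRelated and ApproximateTopRelated are used throughout these examples
# but never defined in full here (a partial TopRelated appears in a later
# snippet). A sketch of the Annoy-backed approximate variant, assuming the
# annoy package; treat it as illustrative rather than the original class:
import annoy

class ApproximateTopRelated(object):
    def __init__(self, artist_factors, treecount=20):
        # build an approximate nearest-neighbour index over the factors
        index = annoy.AnnoyIndex(artist_factors.shape[1], 'angular')
        for i, row in enumerate(artist_factors):
            index.add_item(i, row)
        index.build(treecount)
        self.index = index

    def get_related(self, artistid, N=10):
        neighbours = self.index.get_nns_by_item(artistid, N)
        # convert angular distance into a rough similarity score
        return sorted(((other, 1 - self.index.get_distance(artistid, other))
                       for other in neighbours), key=lambda x: -x[1])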
def fit(model, train_target, out_dir):
    """
    Factorize the training matrix of playlist-song pairs. Return nothing.

    Parameters
    ----------
    model: model file
        Model specification.
    train_target: numpy array, shape (num playlists, num songs)
        Matrix of playlist-song co-occurrences at the train split.
    out_dir: string
        Path to the results directory
    """
    print('\nSetting up fit...')

    # identify dimensions
    num_playlists, num_songs = train_target.shape

    # initialize weights
    playlists = np.random.rand(num_playlists, model.num_factors) * 0.01
    songs = np.random.rand(num_songs, model.num_factors) * 0.01

    print('\nFitting...')
    for epoch in xrange(1, model.max_epochs + 1):
        # keep track of time
        start_time = time.time()
        playlists, songs = implicit.alternating_least_squares(
            Cui=model.positive_weight * train_target,
            factors=model.num_factors,
            X=playlists,
            Y=songs,
            iterations=1,
            regularization=model.L2_weight,
            use_cg=False)
        print('\tEpoch {} of {} took {:.3f}s'.format(
            epoch, model.max_epochs, time.time() - start_time))

    # save the fit model
    print('\nSaving model weights...')
    params = (playlists, songs)
    params_file = '{}_params.pkl'.format(model.name)
    with open(os.path.join(out_dir, params_file), 'w') as f:
        cPickle.dump(params, f)
def als(trainSparse):
    """Training of the ALS algorithm

    Args:
        trainSparse: train sparse matrix

    Returns:
        user_vecs_arr: user matrix (users x latent_factors)
        item_vecs_arr: item matrix (items x latent_factors)
    """
    print("-> Training ALS algorithm ...")
    k = 130
    user_vecs_arr, item_vecs_arr = implicit.alternating_least_squares(
        trainSparse, factors=k, regularization=0.01, iterations=30)
    return (user_vecs_arr, item_vecs_arr)
def run_user(self, pid, k):
    """ Return a set of recommendations for the user """
    user_vecs, item_vecs = implicit.alternating_least_squares(
        (self.inter_df * self.alpha).astype('double'),
        factors=64, regularization=0.1, iterations=50)
    rec_list = self.rec_items(pid, self.inter_df, user_vecs, item_vecs,
                              num_items=k)
    return set(rec_list)
def train(self, training_set, alpha, factors, regularization, iterations):
    """
    Use the implicit library to recommend packages for the user

    :param training_set: training data
    :param alpha: linear scaling factor alpha
    :param factors: number of latent vectors for each user and item
    :param regularization: parameter for avoiding overfitting
    :param iterations: number of iterations of the algorithm
    :return: initializes the instance variables user_vecs and item_vecs
    """
    # Train with the implicit library and initialize the instance
    # variables user_vecs and item_vecs
    self.user_vecs, self.item_vecs = \
        implicit.alternating_least_squares((training_set * alpha).astype('double'),
                                           factors=factors,
                                           regularization=regularization,
                                           iterations=iterations)
def calculate_similar_artists(input_filename, output_filename,
                              factors=50, regularization=0.01,
                              iterations=15, exact=False, trees=20,
                              use_native=True, dtype=numpy.float64, cg=False):
    logging.debug("Calculating similar artists. This might take a while")
    logging.debug("reading data from %s", input_filename)
    start = time.time()
    df, plays = read_data(input_filename)
    logging.debug("read data file in %s", time.time() - start)

    logging.debug("weighting matrix by bm25")
    weighted = bm25_weight(plays)

    logging.debug("calculating factors")
    start = time.time()
    artist_factors, user_factors = alternating_least_squares(
        weighted, factors=factors, regularization=regularization,
        iterations=iterations, use_native=use_native, dtype=dtype, use_cg=cg)
    logging.debug("calculated factors in %s", time.time() - start)

    # write out artists by popularity
    logging.debug("calculating top artists")
    user_count = df.groupby('artist').size()
    artists = dict(enumerate(df['artist'].cat.categories))
    to_generate = sorted(list(artists), key=lambda x: -user_count[x])

    if exact:
        calc = TopRelated(artist_factors)
    else:
        calc = ApproximateTopRelated(artist_factors, trees)

    logging.debug("writing top related to %s", output_filename)
    with open(output_filename, "w") as o:
        for artistid in to_generate:
            artist = artists[artistid]
            for other, score in calc.get_related(artistid):
                o.write("%s\t%s\t%s\n" % (artist, artists[other], score))
def train_model(input_filename, output_filename, factors=50,
                regularization=0.01, iterations=15, use_native=True, cg=True):
    logging.debug("Reading data from %s", input_filename)
    start = time.time()
    df, plays = load_data(input_filename)
    logging.debug("Read data file in %s", time.time() - start)

    logging.debug("Weighting matrix by bm25")
    weighted, params = bm25_weight(plays)
    params["regularization"] = regularization

    logging.debug("Calculating factors")
    start = time.time()
    subr_factors, user_factors = alternating_least_squares(
        weighted, factors=factors, regularization=regularization,
        iterations=iterations, use_native=use_native,
        dtype=np.float64, use_cg=cg)
    logging.debug("Calculated factors in %s", time.time() - start)

    logging.debug("Writing model to disk")
    with open("params.pickle", "wb") as b:
        pickle.dump(params, b)
    subreddits = dict(enumerate(df['subreddit'].cat.categories))
    with open("dict.pickle", "wb") as d:
        pickle.dump(subreddits, d)
    with open("factors.pickle", "wb") as f:
        pickle.dump(subr_factors, f)

    model = TopRelated(subr_factors)
    # Print 10 most similar subreddits for each subreddit to evaluate the model
    with open(output_filename, "w") as out:
        for i, name in subreddits.items():
            related = model.get_related(i)
            for other, score in related:
                out.write("{}\t{}\t{}\n".format(name, subreddits[other], score))
    logging.debug("Training complete")
def main():
    data = preprocess_data_rec_engine(status=False)
    item_table = data[0]
    p_sparse = data[1]
    customers = data[2]
    products = data[3]
    quantity = data[4]

    tti = split_data_mask(p_sparse, pct_test=0.2)
    product_training_set = tti[0]
    product_test_set = tti[1]
    product_user_altered = tti[2]

    alpha = 15
    vecs = implicit.alternating_least_squares(
        (product_training_set * alpha).astype('double'),
        factors=20, regularization=0.1, iterations=50)
    user_vecs = vecs[0]
    item_vecs = vecs[1]

    customers_arr = np.array(customers)  # Array of customer IDs from the ratings matrix
    products_arr = np.array(products)    # Array of product IDs from the ratings matrix

    rf = rec_items(12346, product_training_set, user_vecs, item_vecs,
                   customers_arr, products_arr, item_table, num_items=10)
    print(get_items_purchased(12346, product_training_set, customers_arr,
                              products_arr, item_table))
    print(rf)

    l = list_rec(rf)
    print(l)
    print(lookup_customer_id(4338))

    # df = pd.read_pickle('../data/final/df_final.pkl')
    # table_pickle_file = open('../data/final/df_customer_table.pkl', "rb")
    # customer_table = pickle.load(table_pickle_file)
    # table_pickle_file.close()
    # search_customer(3, df, customer_table)

    print("Done")
def calculate_similar_artists(input_filename, output_filename,
                              factors=50, regularization=0.01,
                              iterations=15, exact=False, trees=20,
                              use_native=True):
    logging.debug("Calculating similar artists. This might take a while")
    logging.debug("reading data from %s", input_filename)
    start = time.time()
    df, plays = read_data(input_filename)
    logging.debug("read data file in %s", time.time() - start)

    logging.debug("weighting matrix by bm25")
    weighted = bm25_weight(plays)

    logging.debug("calculating factors")
    start = time.time()
    artist_factors, user_factors = alternating_least_squares(
        weighted, factors=factors, regularization=regularization,
        iterations=iterations, use_native=use_native)
    logging.debug("calculated factors in %s", time.time() - start)

    # write out artists by popularity
    logging.debug("calculating top artists")
    user_count = df.groupby('artist').size()
    artists = dict(enumerate(df['artist'].cat.categories))
    to_generate = sorted(list(artists), key=lambda x: -user_count[x])

    if exact:
        calc = TopRelated(artist_factors)
    else:
        calc = ApproximateTopRelated(artist_factors, trees)

    logging.debug("writing top related to %s", output_filename)
    with open(output_filename, "w") as o:
        for artistid in to_generate:
            artist = artists[artistid]
            for other, score in calc.get_related(artistid):
                o.write("%s\t%s\t%s\n" % (artist, artists[other], score))
def testALS(self):
    counts = csr_matrix(
        [[1, 1, 0, 1, 0, 0],
         [0, 1, 1, 1, 0, 0],
         [1, 0, 1, 0, 0, 0],
         [1, 1, 0, 0, 0, 0],
         [0, 0, 1, 1, 0, 1],
         [0, 1, 0, 0, 0, 1],
         [0, 0, 0, 0, 1, 1]], dtype=np.float64)

    # try all 8 variants of native/python, cg/cholesky, and
    # 64 vs 32 bit factors
    for dtype in (np.float32, np.float64):
        for use_cg in (False, True):
            for use_native in (True, False):
                try:
                    np.random.seed(23)
                    rows, cols = implicit.alternating_least_squares(
                        counts * 2, 6, regularization=1e-10,
                        use_native=use_native, use_cg=use_cg, dtype=dtype)
                except Exception as e:
                    self.fail(msg="failed to factorize matrix. Error=%s"
                                  " dtype=%s, cg=%s, native=%s"
                                  % (e, dtype, use_cg, use_native))

                reconstructed = rows.dot(cols.T)
                for i in range(counts.shape[0]):
                    for j in range(counts.shape[1]):
                        self.assertAlmostEqual(
                            counts[i, j], reconstructed[i, j],
                            delta=0.0001,
                            msg="failed to reconstruct row=%s, col=%s,"
                                " value=%.5f, dtype=%s, cg=%s, native=%s"
                                % (i, j, reconstructed[i, j], dtype,
                                   use_cg, use_native))
def calculate_similar_artists(input_filename, output_filename,
                              factors=50, regularization=0.01,
                              iterations=15, exact=False, trees=20,
                              use_native=True, dtype=numpy.float64):
    # Compute recommended clients ---
    print("Calculating similar clients. This might take a while")
    print("reading data from %s" % input_filename)
    start = time.time()
    df, transfers = read_data(input_filename, inv=False)
    print("read data file in %s" % (time.time() - start))

    print("weighting matrix by bm25")
    weighted = bm25_weight(transfers)

    print("calculating factors")
    start = time.time()
    artist_factors, user_factors = alternating_least_squares(
        weighted, factors=factors, regularization=regularization,
        iterations=iterations, use_native=use_native, dtype=dtype)
    print("calculated factors in %s" % (time.time() - start))

    # write out artists by popularity
    print("calculating top clients")
    user_count = df.groupby('artist').size()
    artists = dict(enumerate(df['artist'].cat.categories))
    to_generate = sorted(list(artists), key=lambda x: -user_count[x])

    if exact:
        calc = TopRelated(artist_factors)
    else:
        calc = ApproximateTopRelated(artist_factors, trees)

    list_of_recommended_clients = []
    print("writing top related to %s" % output_filename)
    for i, artistid in enumerate(to_generate):
        print(i)
        artist = artists[artistid]
        for other, score in calc.get_related(artistid):
            if artist != artists[other]:
                recommendedClients = RecommendedClients()
                recommendedClients.empresa = Empresa.objects.get(fiscal_id=artist)
                recommendedClients.clientes_recomendados = Empresa.objects.get(
                    fiscal_id=artists[other])
                recommendedClients.similarity = score
                list_of_recommended_clients.append(recommendedClients)

    print('All client recommendations have been stored in a list, saving them to DB')
    RecommendedClients.objects.bulk_create(list_of_recommended_clients,
                                           batch_size=20000)

    # Compute recommended providers ---
    print("Calculating similar providers. This might take a while")
    print("reading data from %s" % input_filename)
    start = time.time()
    df, transfers = read_data(input_filename, inv=True)
    print("read data file in %s" % (time.time() - start))

    print("weighting matrix by bm25")
    weighted = bm25_weight(transfers)

    print("calculating factors")
    start = time.time()
    artist_factors, user_factors = alternating_least_squares(
        weighted, factors=factors, regularization=regularization,
        iterations=iterations, use_native=use_native, dtype=dtype)
    print("calculated factors in %s" % (time.time() - start))

    # write out artists by popularity
    print("calculating top providers")
    user_count = df.groupby('artist').size()
    artists = dict(enumerate(df['artist'].cat.categories))
    to_generate = sorted(list(artists), key=lambda x: -user_count[x])

    if exact:
        calc = TopRelated(artist_factors)
    else:
        calc = ApproximateTopRelated(artist_factors, trees)

    list_of_recommended_providers = []
    print("writing top related to %s" % output_filename)
    for i, artistid in enumerate(to_generate):
        print(i)
        artist = artists[artistid]
        for other, score in calc.get_related(artistid):
            if artist != artists[other]:
                # NOTE: the original appended an undefined recommendedProviders;
                # this mirrors the client branch above, and the field name
                # proveedores_recomendados is an assumption
                recommendedProviders = RecommendedProviders()
                recommendedProviders.empresa = Empresa.objects.get(fiscal_id=artist)
                recommendedProviders.proveedores_recomendados = Empresa.objects.get(
                    fiscal_id=artists[other])
                recommendedProviders.similarity = score
                list_of_recommended_providers.append(recommendedProviders)

    print('All provider recommendations have been stored in a list, saving them to DB')
    RecommendedProviders.objects.bulk_create(list_of_recommended_providers,
                                             batch_size=20000)
def main():
    purchase_input = sys.argv[1]
    cold_start_input = sys.argv[2]
    K = int(sys.argv[3])
    if K > 10:
        print("ERROR: Please recommend <= 10 products")
        exit(1)

    # read guest start, item start
    cold_start = pd.read_csv(cold_start_input)

    # read purchase data
    df = pd.read_csv(purchase_input)
    df.columns = ['qty', 'item_id', 'guest_id', 'purchase_date']
    df = df[['guest_id', 'item_id', 'qty']]

    # drop missing values and negative qty
    df = df.dropna()
    df = df[df.qty > 0]
    df = df[df.guest_id.map(lambda x: x.isdigit())
            & df.item_id.map(lambda x: x.isdigit())]

    # merge the two dataframes
    df['guest_id'] = df['guest_id'].astype(int)
    df['item_id'] = df['item_id'].astype(int)
    all_data = pd.concat([df, cold_start])

    # construct utility matrix
    guests = list(np.sort(all_data.guest_id.unique()))
    items = list(np.sort(all_data.item_id.unique()))
    quantity = list(all_data.qty)
    rows = all_data.guest_id.astype(
        pd.CategoricalDtype(categories=guests)).cat.codes
    cols = all_data.item_id.astype(
        pd.CategoricalDtype(categories=items)).cat.codes
    ori_rows = df.guest_id.astype(
        pd.CategoricalDtype(categories=guests)).cat.codes
    ori_cols = df.item_id.astype(
        pd.CategoricalDtype(categories=items)).cat.codes
    utility_mat = sparse.csr_matrix((quantity, (rows, cols)),
                                    shape=(len(guests), len(items)))

    # check sparsity
    sparsity = 100 * (1 - 1.0 * len(all_data) /
                      (utility_mat.shape[0] * utility_mat.shape[1]))
    print("Sparsity after content based initialization is: {}".format(sparsity))

    # split training and testing data
    train_set, test_index = train_test_split(utility_mat, 0.1)

    # run ALS for implicit feedback to generate hidden features
    alpha = 40
    guest_feature, item_feature = implicit.alternating_least_squares(
        (train_set * alpha).astype('double'),
        factors=10, regularization=0.1, iterations=50)

    # collect predicted values
    predict_matrix = guest_feature.dot(item_feature.T)

    # evaluate performance using average rank
    hidden_rank, all_rank = average_rank(predict_matrix, test_index, rows, cols)
    print('Expected percentile ranking on testing set: {}\n'
          'Expected percentile ranking on total set: {}'.format(
              hidden_rank, all_rank))

    # recommend top K items for each guest and output
    guest_array = np.array(guests)
    items_array = np.array(items)
    rcd_df = top_rcmd(predict_matrix, guest_array, items_array,
                      ori_rows, ori_cols, k=K)
    rcd_df.to_csv('recommendations.csv', index=False)
def benchmark_implicit(matrix, factors, reg, iterations):
    start = time.time()
    alternating_least_squares(matrix, factors, reg, iterations)
    return time.time() - start
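# A possible driver for benchmark_implicit above. The matrix shape,
# density, and factor counts are illustrative assumptions, and the
# top-level alternating_least_squares import only exists in older
# implicit releases:
import time
import numpy as np
import scipy.sparse
from implicit import alternating_least_squares

plays = scipy.sparse.random(5000, 2000, density=0.01,
                            format='csr', dtype=np.float64)
for factors in (32, 64, 128):
    elapsed = benchmark_implicit(plays, factors, 0.01, 15)
    print("factors=%d: %.2fs" % (factors, elapsed))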
import implicit
from scipy.sparse import coo_matrix
import pandas as pd
import heapq

dataFile = ".\\data\\ml-100k\\u.data"
data = pd.read_csv(dataFile, sep="\t", header=None, usecols=[0, 1, 2],
                   names=["userId", "itemId", "rating"])
data["userId"] = data["userId"].astype("category")
data["itemId"] = data["itemId"].astype("category")

# rows are itemId codes, columns are userId codes
rating_matrix = coo_matrix(
    (data["rating"].astype(float),
     (data["itemId"].cat.codes.copy(),
      data["userId"].cat.codes.copy())))

user_factors, item_factors = implicit.alternating_least_squares(
    rating_matrix, factors=10, regularization=0.01)
print(user_factors[196])

user196 = item_factors.dot(user_factors[196])
recommendations = heapq.nlargest(3, range(len(user196)), user196.take)
print(recommendations)
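# implicit.alternating_least_squares is the library's old functional API;
# newer releases expose a class instead. A sketch of the same MovieLens
# flow with the class-based API, assuming implicit >= 0.5 where fit() and
# recommend() expect a user-by-item matrix:
import implicit
import pandas as pd
from scipy.sparse import coo_matrix

data = pd.read_csv(".\\data\\ml-100k\\u.data", sep="\t", header=None,
                   usecols=[0, 1, 2], names=["userId", "itemId", "rating"])
data["userId"] = data["userId"].astype("category")
data["itemId"] = data["itemId"].astype("category")

# users on the rows this time, items on the columns
user_items = coo_matrix(
    (data["rating"].astype(float),
     (data["userId"].cat.codes, data["itemId"].cat.codes))).tocsr()

model = implicit.als.AlternatingLeastSquares(factors=10, regularization=0.01)
model.fit(user_items)

# top 3 items for the user in row 196
ids, scores = model.recommend(196, user_items[196], N=3)
print(ids, scores)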
comments = coo_matrix((data['comments'].astype(float),
                       (data['subreddit'].cat.codes,
                        data['user'].cat.codes)))

#%% [markdown]
# ### Latent Semantic Analysis

#%%
# toggle this variable if you want to recalculate the als factors
read_als_factors_from_file = True

#%%
if read_als_factors_from_file:
    subreddit_factors = np.load('subreddit_factors_als.npy')
    user_factors = np.load('user_factors_als.npy')
else:
    subreddit_factors, user_factors = alternating_least_squares(
        bm25_weight(comments), 20)

#%%
class TopRelated(object):
    def __init__(self, subreddit_factors):
        norms = np.linalg.norm(subreddit_factors, axis=-1)
        self.factors = subreddit_factors / norms[:, np.newaxis]
        self.subreddits = data['subreddit'].cat.categories.array.to_numpy()

    def get_related(self, subreddit, N=10):
        subredditid = np.where(self.subreddits == subreddit)[0][0]
        # cosine similarity of every subreddit against the query
        # (completion of the truncated original, based on the
        # normalized factors set up in __init__)
        scores = self.factors.dot(self.factors[subredditid])
        best = np.argpartition(scores, -N)[-N:]
        return sorted(zip(best, scores[best]), key=lambda x: -x[1])
def save_sparse_csr(filename, array):
    # header assumed: the snippet begins mid-call, but the np.savez-style
    # keywords and the save_sparse_csr('ALS4k', ...) call below make the
    # intent clear
    np.savez(filename, data=array.data, indices=array.indices,
             indptr=array.indptr, shape=array.shape)
    return

# ALS: Alternating Least Squares
alpha = 40
factors = 300
regularization = 0.01
iterations = 20

matr = sps.lil_matrix((len(users), urm.shape[1]))
min_max = MinMaxScaler()
user_vecs, item_vecs = impl.alternating_least_squares(
    (urm * alpha).astype('double'), factors, regularization, iterations)

l = len(users)
for u in range(l):
    # dot product of user vector with all item vectors
    rec_vector = user_vecs[u, :].dot(item_vecs.T)
    rec_vector[items_nact] = 0
    # scale recommendation vector rec_vector between 0 and 1
    rec = min_max.fit_transform(rec_vector.reshape(-1, 1))[:, 0]
    cols = np.argsort(rec)[::-1][:1000]
    matr[u, cols] = rec[cols]
    print(u)

save_sparse_csr('ALS4k', matr.tocsr())
print("done")
# Get the associated column indices
cols = data['movieId'].astype(
    pd.CategoricalDtype(categories=products, ordered=True)).cat.codes

train, test = train_test_split(rows.values, cols.values, quantity)
del quantity, rows, cols

train_sparse = sparse.csr_matrix((train[2], (train[0], train[1])),
                                 shape=(len(customers), len(products)))
print("IO done in %f" % io_time.interval)

alpha = 15
with Timer() as cython_als_t:
    user_vecs, item_vecs = implicit.alternating_least_squares(
        (train_sparse * alpha).astype('double'),
        factors=64, regularization=0.1, iterations=10, use_gpu=False)
print(f"Time spent in implicit: {cython_als_t.interval}")

evaluator = Evaluator(test[0], test[1], test[2], threshold=3.0)
baseline_model = BaselinePredictor(train[1], train[2])
baseline_fpr, baseline_tpr, baseline_roc = evaluator.roc(
    lambda user, item: baseline_model.pred(item))
fpr, tpr, roc = evaluator.roc(
    lambda user, item: np.sum(user_vecs[user, :] * item_vecs[item, :]))
print("AUC: %f" % roc)

plt.clf()
plt.plot(baseline_fpr, baseline_tpr, label='baseline')
    return training_set, test_set, list(set(user_inds))  # Output the unique list of user rows that were altered


places_train1, places_test1, places_users_altered1 = make_train(visits_sparse1, pct_test=0.2)
places_train2, places_test2, places_users_altered2 = make_train(visits_sparse2, pct_test=0.2)
places_train3, places_test3, places_users_altered3 = make_train(visits_sparse3, pct_test=0.2)
places_train4, places_test4, places_users_altered4 = make_train(visits_sparse4, pct_test=0.2)

######################################

alpha = 40
user_vecs1, place_vecs1 = implicit.alternating_least_squares(
    (places_train1 * alpha).astype('double'),
    factors=100, regularization=0.1, iterations=80)
user_vecs2, place_vecs2 = implicit.alternating_least_squares(
    (places_train2 * alpha).astype('double'),
    factors=100, regularization=0.1, iterations=80)
user_vecs3, place_vecs3 = implicit.alternating_least_squares(
    (places_train3 * alpha).astype('double'),
    factors=100, regularization=0.1, iterations=80)
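# make_train is called here and in several later snippets, but only its
# return statement survives above. A minimal sketch of the usual masking
# split, assuming the conventional signature (hide pct_test of the nonzero
# entries from training, keep a binarized copy as ground truth):
import random
import numpy as np

def make_train(ratings, pct_test=0.2):
    test_set = ratings.copy()
    test_set[test_set != 0] = 1                  # binarized ground truth
    training_set = ratings.copy()
    nonzero_inds = training_set.nonzero()
    nonzero_pairs = list(zip(nonzero_inds[0], nonzero_inds[1]))
    random.seed(0)                               # reproducible split
    n_samples = int(np.ceil(pct_test * len(nonzero_pairs)))
    samples = random.sample(nonzero_pairs, n_samples)
    user_inds = [pair[0] for pair in samples]
    item_inds = [pair[1] for pair in samples]
    training_set[user_inds, item_inds] = 0       # hide sampled interactions
    training_set.eliminate_zeros()               # keep the matrix truly sparse
    return training_set, test_set, list(set(user_inds))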
reg_list = [.01, .1, 1, 10]
factor_list = [64, 128, 256]

# store outcomes
out_file = open("als_hyperparameters.txt", "a+")
out_file.write("alpha\treg\tfactors\trec_auc\tpop_auc\n")

# train test split
u_to_a_train, u_to_a_test, altered_users = mflib.make_train(
    a_u_matrix.T.tocsr(), pct_test=0.2)

for alpha_idx in range(len(alpha_list)):
    for reg_idx in range(len(reg_list)):
        for factor_idx in range(len(factor_list)):
            print(alpha_idx, reg_idx, factor_idx)
            # split original matrix into user matrix and artist matrix through ALS
            user_vecs, artists_vecs = implicit.alternating_least_squares(
                (u_to_a_train * alpha_list[alpha_idx]).astype('double'),
                factors=factor_list[factor_idx],
                regularization=reg_list[reg_idx],
                iterations=50,
                use_gpu=True)
            rec_auc, pop_auc = mflib.calc_mean_auc(
                u_to_a_train, altered_users,
                [sparse.csr_matrix(user_vecs), sparse.csr_matrix(artists_vecs.T)],
                u_to_a_test)
            out_file.write(str(alpha_list[alpha_idx]) + "\t" +
                           str(reg_list[reg_idx]) + "\t" +
                           str(factor_list[factor_idx]) + "\t" +
                           str(rec_auc) + "\t" + str(pop_auc) + "\n")

out_file.close()
def get_alternate_least_squares(self):
    alpha = 15
    return implicit.alternating_least_squares(
        (self.product_train * alpha).astype('double'),
        factors=20, regularization=0.1, iterations=50)
with open('Y_training_pid_trackid_new_rating_csr.pkl', 'rb') as f:
    Y_training_pid_track_id_rating_sparse_csr = pickle.load(f)
with open('Y_challenge_1_5_10_25_100track_pidnew_trackid_rating_csr.pkl', 'rb') as f:
    Y_challenge_track_pidnew_rating_sparse_csr = pickle.load(f)

Y_training_pid_track_id_rating_sparse_csr = Y_training_pid_track_id_rating_sparse_csr.T
Y_training = sparse.vstack([
    Y_challenge_track_pidnew_rating_sparse_csr,
    Y_training_pid_track_id_rating_sparse_csr
], 'csr')

W, X = implicit.alternating_least_squares(
    (Y_training * 50).astype('double'),
    factors=400, regularization=0.01, iterations=50, use_gpu=False)


def rec_items(pid):
    pref_vec = Y_training[pid].toarray()
    pref_vec = pref_vec.reshape(-1) + 1
    pref_vec[pref_vec > 1] = 0          # zero out tracks already in the playlist
    rec_vector = W[pid].dot(X.T)
    min_max = MinMaxScaler()
    rec_vector_scaled = min_max.fit_transform(rec_vector.reshape(-1, 1))[:, 0]
    recommend_vector = pref_vec * rec_vector_scaled
    idx = np.argsort(recommend_vector)[::-1][:500]
    recommendation = pd.DataFrame(idx)
    return recommendation.T
def calculate_similar_artists(input_filename, output_filename, model="als",
                              factors=50, regularization=0.01,
                              iterations=15, exact=False, trees=20,
                              use_native=True, dtype=numpy.float64, cg=False):
    logging.debug("Calculating similar artists. This might take a while")
    logging.debug("reading data from %s", input_filename)
    start = time.time()
    df, plays = read_data(input_filename)
    logging.debug("read data file in %s", time.time() - start)

    # write out artists by popularity
    logging.debug("calculating top artists")
    user_count = df.groupby('artist').size()
    artists = dict(enumerate(df['artist'].cat.categories))
    to_generate = sorted(list(artists), key=lambda x: -user_count[x])

    start = time.time()
    if model == "als":
        logging.debug("weighting matrix by bm25")
        weighted = bm25_weight(plays, K1=100, B=0.8)

        logging.debug("calculating factors")
        artist_factors, user_factors = alternating_least_squares(
            weighted, factors=factors, regularization=regularization,
            iterations=iterations, use_native=use_native,
            dtype=dtype, use_cg=cg)
        logging.debug("calculated factors in %s", time.time() - start)

        if exact:
            calc = TopRelated(artist_factors)
        else:
            calc = ApproximateTopRelated(artist_factors, trees)

        logging.debug("writing top related to %s", output_filename)
        with open(output_filename, "w") as o:
            for artistid in to_generate:
                artist = artists[artistid]
                for other, score in calc.get_related(artistid):
                    o.write("%s\t%s\t%s\n" % (artist, artists[other], score))

    elif model in ("bm25", "tfidf", "cosine", "smoothed_cosine",
                   "ochiai", "overlap"):
        if model == "bm25":
            scorer = BM25Recommender(K1=100, B=0.5)
        elif model == "tfidf":
            scorer = TFIDFRecommender()
        elif model == "cosine":
            scorer = CosineRecommender()
        else:
            raise NotImplementedError("TODO: model %s" % model)

        logging.debug("calculating similar items")
        start = time.time()
        scorer.fit(plays, K=11)
        logging.debug("calculated all_pairs_knn in %s", time.time() - start)

        with open(output_filename, "w") as o:
            for artistid in to_generate:
                artist = artists[artistid]
                for other, score in scorer.similar_items(artistid):
                    o.write("%s\t%s\t%s\n" % (artist, artists[other], score))
            Y[i] = spsolve(xTx + xTCiIX + lambda_eye, xTCiPi)
            # Solve for Yi = ((xTx + xT(Cu-I)X) + lambda*I)^-1 xTCiPi,
            # equation 5 from the paper
    # End iterations
    return X, Y.T


user_vecs, item_vecs = implicit_weighted_ALS(product_train, lambda_val=0.1,
                                             alpha=15, iterations=1,
                                             rank_size=20)
user_vecs[0, :].dot(item_vecs).toarray()[0, :5]

alpha = 15
user_vecs, item_vecs = implicit.alternating_least_squares(
    (product_train * alpha).astype('double'),
    factors=20, regularization=0.1, iterations=50)


def auc_score(predictions, test):
    fpr, tpr, thresholds = metrics.roc_curve(test, predictions)
    return metrics.auc(fpr, tpr)


def calc_mean_auc(training_set, altered_users, predictions, test_set):
    store_auc = []       # AUC for each user that had an item removed from the training set
    popularity_auc = []  # to store popularity-baseline AUC scores
    pop_items = np.array(test_set.sum(axis=0)).reshape(-1)  # sum of item interactions to find the most popular
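# calc_mean_auc is cut off above. A sketch of how this evaluation usually
# finishes, inferred from the variables already set up (for each altered
# user, score only the items hidden from training and compare against a
# popularity baseline); the body below is an assumption, not the original:
import numpy as np
from sklearn import metrics

def calc_mean_auc_sketch(training_set, altered_users, predictions, test_set):
    store_auc = []
    popularity_auc = []
    pop_items = np.array(test_set.sum(axis=0)).reshape(-1)
    item_vecs = predictions[1]
    for user in altered_users:
        training_row = training_set[user, :].toarray().reshape(-1)
        zero_inds = np.where(training_row == 0)      # items unseen in training
        user_vec = predictions[0][user, :]
        pred = user_vec.dot(item_vecs).toarray()[0, zero_inds].reshape(-1)
        actual = test_set[user, :].toarray()[0, zero_inds].reshape(-1)
        pop = pop_items[zero_inds]
        store_auc.append(auc_score(pred, actual))    # auc_score defined above
        popularity_auc.append(auc_score(pop, actual))
    return float(np.mean(store_auc)), float(np.mean(popularity_auc))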
dataset = pd.read_csv('./data/dataset_users_match.csv', sep=',', index_col=0)
interact = pd.read_csv('./data/inter_matr.csv', sep=',', index_col=0)

colnames = list(interact.columns.values)
# for i in range(len(colnames)):
#     colnames[i] = colnames[i] + str('_chosen')
# interact.columns = colnames

interact_sparse = sparse.csr_matrix(interact)

users_chosen_train, users_chosen_test, users_users_altered = make_train(
    interact_sparse, pct_test=0.2)

alpha = 15
user_vecs, users_chosen_vecs = implicit.alternating_least_squares(
    (users_chosen_train * alpha).astype('double'),
    factors=20, regularization=0.1, iterations=200)

user_vecs = pd.DataFrame(user_vecs)
user_vecs.index = list(interact.index.values)
users_chosen_vecs = pd.DataFrame(users_chosen_vecs)
users_chosen_vecs.index = list(interact.columns.values)

"""Make recommendations"""
user_id = 141
position = list(interact.index.values).index(user_id)
num_items = 10
pref_vec = users_chosen_train[position, :].toarray()  # Get the ratings from the training set ratings matrix
pref_vec = pref_vec.reshape(-1)
def recommender(customer_id, status):
    # Start time
    start = time.time()
    if status:
        printGreen('✔ RetailBox started..\t\t{0:.1f}s'.format(time.time() - start))
        start = time.time()

    # Validate User Input
    validate_customer_id(customer_id)

    # Load Dataframe and create item_table, purchase matrix, etc.
    data = preprocess_data_rec_engine(status=True)
    item_table = data[0]
    purchase_sparse_matrix = data[1]
    customers = data[2]
    products = data[3]
    quantity = data[4]
    if status:
        printGreen('✔ Processed Data..\t\t{0:.1f}s'.format(time.time() - start))
        start = time.time()

    # Split Data (Training/Test Split)
    training_test_split_data = split_data_mask(purchase_sparse_matrix, pct_test=0.2)
    product_training_set = training_test_split_data[0]
    product_test_set = training_test_split_data[1]
    product_user_altered = training_test_split_data[2]
    if status:
        printGreen('✔ Split Data into Training and Test Sets..\t\t{0:.1f}s'.format(
            time.time() - start))
        start = time.time()

    # Train Recommendation Engine on given algorithm
    alpha = 15
    recommender_vecs = implicit.alternating_least_squares(
        (product_training_set * alpha).astype('double'),
        factors=20, regularization=0.1, iterations=50)
    user_vecs = recommender_vecs[0]
    item_vecs = recommender_vecs[1]

    customers_arr = np.array(customers)
    products_arr = np.array(products)
    if status:
        printGreen('✔ Recommender System Training Done..\t\t{0:.1f}s'.format(
            time.time() - start))
        start = time.time()

    # Lookup customer id
    cid = lookup_customer_id(customer_id)

    # Generate Recommendations for Customer
    rec_output = rec_items(cid, product_training_set, user_vecs, item_vecs,
                           customers_arr, products_arr, item_table)

    # Display Customer
    df = pd.read_pickle('../data/final/df_final.pkl')
    table_pickle_file = open('../data/final/df_customer_table.pkl', "rb")
    customer_table = pickle.load(table_pickle_file)
    table_pickle_file.close()
    search_customer(customer_id, df, customer_table)

    # Display Item Recommendations
    recommended_items_list = list_rec(rec_output)
    display_recommender_items(recommended_items_list)