def calculate_recommendations(output_filename, model_name="als"):
    """Generate artist recommendations for each user in the dataset.

    Loads the last.fm data via ``get_lastfm()``, trains the model returned by
    ``get_model(model_name)`` and writes one tab-separated line per
    recommendation (``username``, ``artist``, ``score``) to
    ``output_filename`` (UTF-8).

    Args:
        output_filename: path of the TSV file to (over)write.
        model_name: name handed to ``get_model`` to pick the model.
    """
    # train the model based off input params
    artists, users, plays = get_lastfm()

    # NOTE: a leftover debugging block (prints + bare ``return``) used to sit
    # here and made everything below unreachable; it is replaced by a single
    # debug log line so the function does what its docstring promises.
    logging.debug("loaded %d users, %d artists, plays matrix %s",
                  len(users), len(artists), plays.shape)

    # create a model from the input data
    model = get_model(model_name)

    # if we're training an ALS based model, weight input for last.fm by bm25
    if isinstance(model, AlternatingLeastSquares):
        logging.debug("weighting matrix by bm25_weight")
        plays = bm25_weight(plays, K1=100, B=0.8)

        # presumably skips building the approximate similar-items index,
        # which is not used when only calling recommend() - TODO confirm
        model.approximate_similar_items = False

    # this is actually disturbingly expensive:
    plays = plays.tocsr()

    logging.debug("training model %s", model_name)
    start = time.time()
    model.fit(plays)
    logging.debug("trained model '%s' in %0.2fs", model_name, time.time() - start)

    # generate recommendations for each user and write out to a file
    start = time.time()
    user_plays = plays.T.tocsr()
    with tqdm.tqdm(total=len(users)) as progress, \
            codecs.open(output_filename, "w", "utf8") as o:
        for userid, username in enumerate(users):
            for artistid, score in model.recommend(userid, user_plays):
                o.write("%s\t%s\t%s\n" % (username, artists[artistid], score))
            progress.update(1)
    logging.debug("generated recommendations in %0.2fs", time.time() - start)
def calculate_similar_artists(output_filename, model_name="als"):
    """Write a TSV of similar last.fm artists using the model's ``similar_items`` API.

    The model named by ``model_name`` is trained on the last.fm plays matrix,
    then for every artist (busiest rows of the plays matrix first) the result
    of ``model.similar_items(artistid, 11)`` is written to ``output_filename``
    as ``artist<TAB>other artist<TAB>score`` lines (UTF-8).
    """
    artists, users, plays = get_lastfm()

    # build the requested model
    model = get_model(model_name)

    # ALS-based models get their input re-weighted with bm25
    if isinstance(model, AlternatingLeastSquares):
        logging.debug("weighting matrix by bm25_weight")
        plays = bm25_weight(plays, K1=100, B=0.8)

        # no need for the approximate recommend index here
        model.approximate_recommend = False

    # this conversion is surprisingly costly
    plays = plays.tocsr()

    logging.debug("training model %s", model_name)
    fit_started = time.time()
    model.fit(plays)
    fit_elapsed = time.time() - fit_started
    logging.debug("trained model '%s' in %0.2fs", model_name, fit_elapsed)

    # order artists by popularity before writing them out
    write_started = time.time()
    logging.debug("calculating top artists")

    # stored entries per CSR row; presumably the number of users per artist
    entries_per_row = np.ediff1d(plays.indptr)
    artist_order = sorted(
        np.arange(len(artists)),
        key=lambda idx: -entries_per_row[idx],
    )

    # one line per (artist, similar artist, score)
    logging.debug("writing similar items")
    with tqdm.tqdm(total=len(artist_order)) as progress, \
            codecs.open(output_filename, "w", "utf8") as out:
        for artistid in artist_order:
            artist_name = artists[artistid]
            for other_id, similarity in model.similar_items(artistid, 11):
                out.write("%s\t%s\t%s\n" % (artist_name, artists[other_id], similarity))
            progress.update(1)

    logging.debug("generated similar artists in %0.2fs", time.time() - write_started)
def calculate_similar_artists(output_filename, model_name="als"):
    """Generate similar-artist lists for last.fm via the model's ``similar_items`` API.

    Trains the model selected by ``model_name`` and writes, for each artist in
    descending order of stored entries in its plays-matrix row, the pairs
    returned by ``model.similar_items(artistid, 11)`` to ``output_filename``.
    Each output line is ``artist``, ``other artist`` and ``score`` separated
    by tabs, encoded as UTF-8.
    """
    artists, users, plays = get_lastfm()
    model = get_model(model_name)

    # Only ALS-derived models get the bm25 re-weighting of the input.
    is_als = isinstance(model, AlternatingLeastSquares)
    if is_als:
        logging.debug("weighting matrix by bm25_weight")
        plays = bm25_weight(plays, K1=100, B=0.8)
        # the approximate recommend index is switched off for this task
        model.approximate_recommend = False

    # Expensive, but the CSR layout is what is fed to fit() and indptr below.
    plays = plays.tocsr()

    logging.debug("training model %s", model_name)
    t0 = time.time()
    model.fit(plays)
    logging.debug("trained model '%s' in %0.2fs", model_name, time.time() - t0)

    t0 = time.time()
    logging.debug("calculating top artists")
    # differences of consecutive indptr values = stored entries per row
    row_counts = np.ediff1d(plays.indptr)

    def by_descending_count(artist_index):
        # negate so that sorted() puts the largest counts first
        return -row_counts[artist_index]

    ranked_artists = sorted(np.arange(len(artists)), key=by_descending_count)

    # TSV output: artist, similar artist, score
    logging.debug("writing similar items")
    with tqdm.tqdm(total=len(ranked_artists)) as bar:
        with codecs.open(output_filename, "w", "utf8") as sink:
            for current in ranked_artists:
                name = artists[current]
                for neighbour, score in model.similar_items(current, 11):
                    sink.write("%s\t%s\t%s\n" % (name, artists[neighbour], score))
                bar.update(1)

    logging.debug("generated similar artists in %0.2fs", time.time() - t0)
def lastfm(out_fn, n_dimensions, test_size=50000):
    """Build an ANN benchmark dataset from last.fm matrix-factorization factors.

    An ALS model with ``n_dimensions`` factors is trained on the last.fm play
    counts; the transformed item factors become the train set and the first
    ``test_size`` (zero-padded) user factors become the queries, written to
    ``out_fn`` with the 'angular' distance.
    """
    # Benchmarks ANN retrieval for simple matrix-factorization recommenders:
    # queries are user factors, the indexed set is item factors.
    # The predictor is a dot product, so the factors are first transformed as
    # described in:
    # https://www.microsoft.com/en-us/research/wp-content/uploads/2016/02/XboxInnerProduct.pdf # noqa
    # This is meant to replicate the experiments from:
    # http://www.benfrederickson.com/approximate-nearest-neighbours-for-recommender-systems/ # noqa
    # Dataset: "Last.fm Dataset - 360K users":
    # http://www.dtic.upf.edu/~ocelma/MusicRecommendationDataset/lastfm-360K.html # noqa

    # The implicit package produces the factors (a few seconds on a
    # desktop GPU per the original author, possibly 1-2 minutes on a laptop).
    from implicit.datasets.lastfm import get_lastfm
    from implicit.approximate_als import augment_inner_product_matrix
    import implicit

    # fit ALS on bm25-weighted play counts
    _, _, play_counts = get_lastfm()
    weighted_counts = implicit.nearest_neighbours.bm25_weight(
        play_counts, K1=100, B=0.8)
    model = implicit.als.AlternatingLeastSquares(factors=n_dimensions)
    model.fit(weighted_counts)

    # give every item factor the same norm ...
    _, item_factors = augment_inner_product_matrix(model.item_factors)

    # ... and pad the user factors with a matching column of zeros
    n_users = model.user_factors.shape[0]
    zero_column = numpy.zeros((n_users, 1))
    user_factors = numpy.append(model.user_factors, zero_column, axis=1)

    # only the first `test_size` users are queried (50k by default), which
    # speeds things up significantly without changing results
    queries = user_factors[:test_size]

    # after the transformation a cosine lookup returns the same results as
    # the inner product on the untransformed data
    write_output(item_factors, queries, out_fn, 'angular')
def calculate_recommendations(output_filename, model_name="als"):
    """Produce artist recommendations for every user in the last.fm dataset.

    Trains the model selected by ``model_name`` and writes each
    recommendation returned by ``model.recommend`` to ``output_filename`` as a
    UTF-8 line of ``username<TAB>artist<TAB>score``.
    """
    # load the data and build the requested model
    artists, users, plays = get_lastfm()
    model = get_model(model_name)

    # ALS-based models get their last.fm input re-weighted with bm25
    if isinstance(model, AlternatingLeastSquares):
        logging.debug("weighting matrix by bm25_weight")
        plays = bm25_weight(plays, K1=100, B=0.8)

        # presumably turns off the approximate similar-items index, which
        # this function never queries - TODO confirm against the model class
        model.approximate_similar_items = False

    # this conversion is surprisingly costly
    plays = plays.tocsr()

    logging.debug("training model %s", model_name)
    fit_started = time.time()
    model.fit(plays)
    fit_elapsed = time.time() - fit_started
    logging.debug("trained model '%s' in %0.2fs", model_name, fit_elapsed)

    # write out the recommendations of each user in turn
    rec_started = time.time()
    user_plays = plays.T.tocsr()
    with tqdm.tqdm(total=len(users)) as bar, \
            codecs.open(output_filename, "w", "utf8") as out:
        for userid, username in enumerate(users):
            recommendations = model.recommend(userid, user_plays)
            for artistid, score in recommendations:
                out.write("%s\t%s\t%s\n" % (username, artists[artistid], score))
            bar.update(1)

    logging.debug("generated recommendations in %0.2fs", time.time() - rec_started)