def main():
    """Fit a BayesianPoissonFactorization model using command-line settings.

    Positional arguments read from ``sys.argv``:
        1. number of topics (int)
        2. total iterations handed to ``sample`` (int)
        3. burn-in value handed to ``sample`` (float)
        4. thinning value handed to ``sample`` (int)
    """
    argv = sys.argv
    # Index one argument at a time so a missing argument surfaces as an
    # IndexError on the first absent position.
    num_topics = int(argv[1])
    iterations = int(argv[2])
    burn = float(argv[3])
    thin = int(argv[4])

    # Only the first matrix of the split is needed for fitting.
    train_ratings, _ = get_split_review_mats()

    model = BayesianPoissonFactorization(
        0.3, 0.3, 1.0, 0.3, 0.3, 1.0, num_topics, train_ratings)
    model.sample(iterations, burn, thin)
def __init__(self, numTopics, alpha, beta, gamma):
    """Configure logging, load the ratings and randomly initialise topics.

    Every nonzero (user, movie) entry of the first matrix returned by
    ``get_split_review_mats()`` is assigned a random topic, and all count
    tables are incremented to reflect that assignment.

    Args:
        numTopics: number of topics; sizes the topic axis of each count table.
        alpha: stored unchanged as ``self.alpha``.
        beta: stored unchanged as ``self.beta``.
        gamma: stored unchanged as ``self.gamma``.
    """
    # Setup logger
    self.log = logging.getLogger("Gibbs")
    self.log.setLevel(logging.DEBUG)
    # logging.getLogger returns the same logger object for the same name, so
    # attach handlers only once; otherwise every additional instance would
    # duplicate each log line.
    if not self.log.handlers:
        formatter = logging.Formatter("%(asctime)s %(message)s",
                                      datefmt="%m/%d/%Y %I:%M:%S %p")
        fh = logging.handlers.TimedRotatingFileHandler(
            "logs/gibbs.log", when="D", interval=1, backupCount=10)
        ch = logging.StreamHandler()
        fh.setFormatter(formatter)
        ch.setFormatter(formatter)
        self.log.addHandler(fh)
        self.log.addHandler(ch)

    self.numTopics = numTopics
    self.alpha = alpha
    self.beta = beta
    self.gamma = gamma

    self.info = getMeta()
    self.user_movies, _ = get_split_review_mats()
    user_indices, movie_indices = self.user_movies.nonzero()
    # Materialise the pairs: on Python 3 a bare zip() is a one-shot iterator
    # that the initialisation loop below would exhaust, leaving the attribute
    # empty for any later use.
    self.user_movie_indices = list(zip(user_indices, movie_indices))

    num_users = self.info["users"]
    num_movies = self.info["movies"]

    # Builtin int replaces the np.int alias, which newer NumPy releases no
    # longer provide; the resulting dtype is the same.
    self.CountMT = np.zeros((num_movies, numTopics), dtype=int)
    self.CountRUT = np.zeros((6, num_users, numTopics), dtype=int)  # ratings 1-5 and 0
    self.CountUT = np.zeros((num_users, numTopics), dtype=int)
    self.topic_assignments = np.zeros((num_users, num_movies), dtype=int)

    # Normalization factors
    self.CountT = np.zeros(numTopics, dtype=int)
    self.CountU = np.zeros(num_users, dtype=int)
    self.CountRU = np.zeros((6, num_users), dtype=int)

    for userid, movieid in self.user_movie_indices:
        # NOTE(review): the upper bound is inclusive for random.randint but
        # exclusive for numpy.random.randint -- confirm which one is imported.
        topic = randint(0, numTopics - 1)
        # The rating is used directly as an index into the 6-slot rating axis.
        rating = self.user_movies[userid, movieid]

        self.CountMT[movieid, topic] += 1
        self.CountRUT[rating, userid, topic] += 1
        self.CountUT[userid, topic] += 1
        self.topic_assignments[userid, movieid] = topic

        self.CountT[topic] += 1
        self.CountU[userid] += 1
        self.CountRU[rating, userid] += 1
def __init__(self, numTopics, alpha, beta, gamma):
    """Configure logging, load the ratings and randomly initialise topics.

    Every nonzero (user, movie) entry of the first matrix returned by
    ``get_split_review_mats()`` is assigned a random topic, and all count
    tables are incremented to reflect that assignment.

    Args:
        numTopics: number of topics; sizes the topic axis of each count table.
        alpha: stored unchanged as ``self.alpha``.
        beta: stored unchanged as ``self.beta``.
        gamma: stored unchanged as ``self.gamma``.
    """
    # Setup logger
    self.log = logging.getLogger("Gibbs")
    self.log.setLevel(logging.DEBUG)
    # logging.getLogger returns the same logger object for the same name, so
    # attach handlers only once; otherwise every additional instance would
    # duplicate each log line.
    if not self.log.handlers:
        formatter = logging.Formatter("%(asctime)s %(message)s",
                                      datefmt="%m/%d/%Y %I:%M:%S %p")
        fh = logging.handlers.TimedRotatingFileHandler(
            "logs/gibbs.log", when="D", interval=1, backupCount=10)
        ch = logging.StreamHandler()
        fh.setFormatter(formatter)
        ch.setFormatter(formatter)
        self.log.addHandler(fh)
        self.log.addHandler(ch)

    self.numTopics = numTopics
    self.alpha = alpha
    self.beta = beta
    self.gamma = gamma

    self.info = getMeta()
    self.user_movies, _ = get_split_review_mats()
    user_indices, movie_indices = self.user_movies.nonzero()
    # Materialise the pairs: on Python 3 a bare zip() is a one-shot iterator
    # that the initialisation loop below would exhaust, leaving the attribute
    # empty for any later use.
    self.user_movie_indices = list(zip(user_indices, movie_indices))

    num_users = self.info["users"]
    num_movies = self.info["movies"]

    # Builtin int replaces the np.int alias, which newer NumPy releases no
    # longer provide; the resulting dtype is the same.
    self.CountMT = np.zeros((num_movies, numTopics), dtype=int)
    self.CountRUT = np.zeros((6, num_users, numTopics), dtype=int)  # ratings 1-5 and 0
    self.CountUT = np.zeros((num_users, numTopics), dtype=int)
    self.topic_assignments = np.zeros((num_users, num_movies), dtype=int)

    # Normalization factors
    self.CountT = np.zeros(numTopics, dtype=int)
    self.CountU = np.zeros(num_users, dtype=int)
    self.CountRU = np.zeros((6, num_users), dtype=int)

    for userid, movieid in self.user_movie_indices:
        # NOTE(review): the upper bound is inclusive for random.randint but
        # exclusive for numpy.random.randint -- confirm which one is imported.
        topic = randint(0, numTopics - 1)
        # The rating is used directly as an index into the 6-slot rating axis.
        rating = self.user_movies[userid, movieid]

        self.CountMT[movieid, topic] += 1
        self.CountRUT[rating, userid, topic] += 1
        self.CountUT[userid, topic] += 1
        self.topic_assignments[userid, movieid] = topic

        self.CountT[topic] += 1
        self.CountU[userid] += 1
        self.CountRU[rating, userid] += 1
def test_iid_users():
    """Return the RMSE of a per-movie-average baseline on the test ratings.

    Each nonzero test rating is predicted as the mean of that movie's nonzero
    training ratings (zero entries are treated as "not rated"). Test ratings
    for movies with no training ratings are skipped.

    Returns:
        float: root-mean-square error over the test ratings that were scored.

    Raises:
        ZeroDivisionError: if no test rating could be scored.
    """
    train, test = get_split_review_mats()

    # Builtin float replaces the np.float alias, which newer NumPy releases
    # no longer provide.
    rating_counts = (train != 0).sum(axis=0).astype(float)
    # Movies without training ratings yield 0/0 = NaN on purpose; silence the
    # resulting warnings because those entries are skipped explicitly below.
    with np.errstate(divide="ignore", invalid="ignore"):
        avg_ratings = train.sum(axis=0) / rating_counts

    rmse = 0.0
    count = 0
    # Builtin zip works on both Python 2 and 3, unlike itertools.izip.
    for user, movie in zip(*test.nonzero()):
        true_rating = test[user, movie]
        predicted = avg_ratings[movie]
        if np.isnan(predicted):
            # The movie wasn't rated by any users in the training data set
            continue
        rmse += (predicted - true_rating) ** 2
        count += 1
    return math.sqrt(rmse / count)
def test_iid_users():
    """Return the RMSE of a per-movie-average baseline on the test ratings.

    Each nonzero test rating is predicted as the mean of that movie's nonzero
    training ratings (zero entries are treated as "not rated"). Test ratings
    for movies with no training ratings are skipped.

    Returns:
        float: root-mean-square error over the test ratings that were scored.

    Raises:
        ZeroDivisionError: if no test rating could be scored.
    """
    train, test = get_split_review_mats()

    # Builtin float replaces the np.float alias, which newer NumPy releases
    # no longer provide.
    rating_counts = (train != 0).sum(axis=0).astype(float)
    # Movies without training ratings yield 0/0 = NaN on purpose; silence the
    # resulting warnings because those entries are skipped explicitly below.
    with np.errstate(divide="ignore", invalid="ignore"):
        avg_ratings = train.sum(axis=0) / rating_counts

    rmse = 0.0
    count = 0
    # Builtin zip works on both Python 2 and 3, unlike itertools.izip.
    for user, movie in zip(*test.nonzero()):
        true_rating = test[user, movie]
        predicted = avg_ratings[movie]
        if np.isnan(predicted):
            # The movie wasn't rated by any users in the training data set
            continue
        rmse += (predicted - true_rating) ** 2
        count += 1
    return math.sqrt(rmse / count)
def top_recommendations_iid(top_n=1000):
    """Score a "recommend the best-rated movies to everyone" baseline.

    Movies are ranked by the mean of their nonzero training ratings and the
    ``top_n`` highest form the recommendation set. For every user with at
    least one nonzero entry in the second (held-out) matrix, the fraction of
    that user's held-out movies that fall in the set is computed; the result
    is the mean of those fractions.

    Args:
        top_n: size of the recommendation set (default 1000).

    Returns:
        float: mean per-user fraction of held-out movies found in the set.

    Raises:
        ZeroDivisionError: if no user has any held-out review.
    """
    info = getMeta()
    train, reviews = get_split_review_mats()

    # Builtin float replaces the np.float alias, which newer NumPy releases
    # no longer provide.
    rating_counts = (train != 0).sum(axis=0).astype(float)
    # Movies without training ratings yield 0/0 = NaN; they are filtered out
    # below, so the warnings carry no information.
    with np.errstate(divide="ignore", invalid="ignore"):
        avg_ratings = train.sum(axis=0) / rating_counts

    # NaN compares False against everything, so leaving unrated movies in the
    # list would make the sort order (and hence the top set) unreliable.
    ranked = sorted(
        ((movie, rating) for movie, rating in enumerate(avg_ratings)
         if not np.isnan(rating)),
        key=lambda pair: pair[1])
    # Explicit start index so top_n == 0 gives an empty set and an oversized
    # top_n simply keeps every rated movie.
    first_kept = max(len(ranked) - top_n, 0)
    top_movies = set(movie for movie, _ in ranked[first_kept:])

    precision = 0.0
    num_users = 0
    for user in range(info["users"]):
        movies = reviews[user, :].nonzero()[0]
        if len(movies) == 0:
            # Users with no held-out reviews do not contribute to the mean.
            continue
        hits = sum(1 for movie in movies if movie in top_movies)
        num_users += 1
        # float() keeps this a true division on Python 2 as well.
        precision += float(hits) / len(movies)
    return precision / num_users
def top_recommendations_iid(top_n=1000):
    """Score a "recommend the best-rated movies to everyone" baseline.

    Movies are ranked by the mean of their nonzero training ratings and the
    ``top_n`` highest form the recommendation set. For every user with at
    least one nonzero entry in the second (held-out) matrix, the fraction of
    that user's held-out movies that fall in the set is computed; the result
    is the mean of those fractions.

    Args:
        top_n: size of the recommendation set (default 1000).

    Returns:
        float: mean per-user fraction of held-out movies found in the set.

    Raises:
        ZeroDivisionError: if no user has any held-out review.
    """
    info = getMeta()
    train, reviews = get_split_review_mats()

    # Builtin float replaces the np.float alias, which newer NumPy releases
    # no longer provide.
    rating_counts = (train != 0).sum(axis=0).astype(float)
    # Movies without training ratings yield 0/0 = NaN; they are filtered out
    # below, so the warnings carry no information.
    with np.errstate(divide="ignore", invalid="ignore"):
        avg_ratings = train.sum(axis=0) / rating_counts

    # NaN compares False against everything, so leaving unrated movies in the
    # list would make the sort order (and hence the top set) unreliable.
    ranked = sorted(
        ((movie, rating) for movie, rating in enumerate(avg_ratings)
         if not np.isnan(rating)),
        key=lambda pair: pair[1])
    # Explicit start index so top_n == 0 gives an empty set and an oversized
    # top_n simply keeps every rated movie.
    first_kept = max(len(ranked) - top_n, 0)
    top_movies = set(movie for movie, _ in ranked[first_kept:])

    precision = 0.0
    num_users = 0
    for user in range(info["users"]):
        movies = reviews[user, :].nonzero()[0]
        if len(movies) == 0:
            # Users with no held-out reviews do not contribute to the mean.
            continue
        hits = sum(1 for movie in movies if movie in top_movies)
        num_users += 1
        # float() keeps this a true division on Python 2 as well.
        precision += float(hits) / len(movies)
    return precision / num_users