def put_user_in_cluster(user):
    """Assign *user* to the most similar existing cluster and persist it.

    Builds the user's rating profile (keyed '<place_urlsafe>-<purpose>'),
    rebuilds every cluster centroid from the stored ClusterRatings, then
    picks the centroid with the highest similarity score.

    Returns the chosen cluster id (may be None when no centroids exist);
    the user entity is updated and saved in either case.
    """
    # Rating profile of this user: only known, positive ratings count.
    profile = {}
    for r in Rating.get_list({'user': user.key.id()}):
        if r.not_known is False and r.value > 0:
            profile['%s-%s' % (r.place.urlsafe(), r.purpose)] = r.value
    wrapped_user = {'ratings': profile}

    # Rebuild centroids from all stored cluster ratings.
    centers = {}
    for cr in ClusterRating.get_list({}):
        center = centers.setdefault(cr.cluster_id,
                                    {'key': cr.cluster_id, 'ratings': {}})
        if cr.avg_value > 0:
            center['ratings']['%s-%s' % (cr.place.urlsafe(), cr.purpose)] = cr.avg_value

    # Pick the most similar centroid (>= keeps the last tie, as before).
    best_id = None
    best_sim = 0
    for cid in centers:
        sim = logic.similarity(wrapped_user, centers[cid])
        if sim >= best_sim:
            best_sim = sim
            best_id = cid

    user.cluster_id = best_id
    user.put()
    return best_id
def rating_list_get(filters):
    """Retrieve the Ratings matching *filters*.

    Parameters:
    - filters: dict describing the wanted ratings. Supported keys:
        'user'    -- user key (string): all ratings by this user
        'place'   -- place key (string): all ratings of this place
        'purpose' -- purpose string, usually combined with other keys
        'users'   -- list of user ids of interest
        'places'  -- list of place ids of interest

    Returns a (ratings, status, http_code) tuple:
    - the list of matching Ratings, or None when the filters are invalid
    - a status string ("OK" or the error message)
    - the HTTP status code (200 on success, 400 on bad input)
    """
    try:
        ratings = Rating.get_list(filters)
    except (TypeError, ValueError) as err:
        # Invalid filter structure or values: report as a client error.
        return None, str(err), 400
    else:
        return ratings, "OK", 200
def place_delete(place_id, place_key_str):
    """Delete a Place together with every rating that references it.

    Removes all ClusterRatings and user Ratings pointing at the place
    before deleting the place itself.

    Parameters:
    - place_id: numeric id of the place (may be None if key string given)
    - place_key_str: urlsafe key string of the place

    Returns a (result, status, http_code) tuple, mirroring the contract
    of rating_list_get: (None, message, 400) on invalid input.
    """
    try:
        pkey = Place.make_key(place_id, place_key_str)
        # Cascade: drop cluster-level ratings for this place first.
        cluster_ratings = ClusterRating.get_list({'place': pkey.urlsafe()})
        for cr in cluster_ratings:
            ClusterRating.delete(cr.key)
        # Then the individual user ratings.
        user_ratings = Rating.get_list({'place': pkey.urlsafe()})
        for r in user_ratings:
            Rating.delete(r.key)
        res = Place.delete(pkey)
    # 'except E as e' instead of the Python-2-only 'except E, e' syntax,
    # consistent with the other functions in this module.
    except TypeError as e:
        return None, str(e), 400
    # BUG FIX: the original fell off the end and returned a bare None on
    # success, breaking the (result, status, code) contract callers expect.
    return res, "OK", 200
def get(self): logging.info('kmeans.RecomputeClustersHandler.get START') # if 'X-AppEngine-QueueName' not in self.request.headers: # logging.info('recommender.RecomputeClustersHandler.get END called not from queue - 403') # # the request is not coming from a queue!! # self.response.set_status(403) # self.response.write("You cannot access this method!!!") # return client = memcache.Client() do_recompute = client.gets('recompute_clusters') # if do_recompute == True: if True: logging.info('KmeansComputeHandler -- recompute needed') #new ratings have been added, so clusters need to be recomputed # logging.info("Starting pipeline for kmeans computation!") ClusterRating.delete_all() logging.info("Removed old cluster ratings") # identify and store k random centroids centroids = {} users = PFuser().query().fetch(NUM_CLUSTERS, offset=10) num = 0 # logging.info("USERS: " + str(len(users))) for user in users: ratings = Rating.get_list({'user': user.key.id()}) rlist = {} for rating in ratings: if rating.not_known is False and rating.value > 0: place = rating.place.urlsafe() rlist['%s-%s' % (place, rating.purpose)] = rating.value user = {'key': 'center_%s' % str(num), 'ratings': rlist} centroids['center_%s' % str(num)] = user num = num + 1 logging.info("Centroids at start: %s" % str(centroids)) store_to_dfbucket(CENTROIDS_FILE, str(centroids)) pipe = KmeansPipeline() pipe.start() i =0 while i<20: i+=1 if client.cas('recompute_clusters', False): break; logging.info('recommender.RecomputeClustersHandler.get END') self.response.write('OK')
def map(data):
    """K-means map function.

    Loads (or rebuilds) the rating profile for the user entity *data*,
    finds the closest centroid, caches the enriched profile back to the
    bucket, and yields (centroid_key, profile_string) for the reducer.
    Yields ("None", profile_string) when no centroid exists.
    """
    user_key = data.key.urlsafe()
    # Retrieve past user info and use it instead of going to datastore again.
    user = {}
    try:
        # NOTE(review): eval() on bucket contents — the data is written by
        # this pipeline itself, but eval on any external storage is risky;
        # consider json or ast.literal_eval.
        user = eval(read_from_dfbucket(USERS_FOLDER + user_key))
    except Exception:
        # Cache miss (or unreadable entry): rebuild the profile from the
        # datastore ratings, keyed '<place_urlsafe>-<purpose>'.
        ratings = Rating.get_list({'user': data.key.id()})
        rlist = {}
        for rating in ratings:
            if rating.not_known is False and rating.value > 0:
                place = rating.place.urlsafe()
                rlist['%s-%s' % (place, rating.purpose)] = rating.value
        user = {'key': user_key, 'ratings': rlist}
    centroids = eval(read_from_dfbucket(CENTROIDS_FILE))
    # Pick the most similar centroid (>= keeps the last tie).
    max_sim = 0
    closest_centroid = None
    for key in centroids:
        centroid = centroids[key]
        sim = logic.similarity(user, centroid)
        if sim >= max_sim:
            max_sim = sim
            closest_centroid = centroid
    user['sim'] = max_sim
    # BUG FIX: the original assigned user['cluster'] before checking for
    # None, crashing with TypeError when no centroid exists even though a
    # None branch was written below. Assign only when a centroid was found.
    if closest_centroid is not None:
        user['cluster'] = closest_centroid['key']
    # Save the profile where the next run can fetch it quickly.
    store_to_dfbucket(USERS_FOLDER + user_key, str(user))
    if closest_centroid is not None:
        res = (closest_centroid['key'], str(user))
        logging.warning("map ended!")
        yield res
    else:
        yield ("None", str(user))
def cluster_based(user_id, places, purpose='dinner with tourists', np=5, loc_filters=None):
    """
    It computes cluster-based recommendations.
    Clusters have been already computed, so only the user's cluster information
    is needed to compute predictions for the user.

    Input:
    - user_id: the id of the requester
    - places: the list of places that can be recommended
    - purpose: the purpose the user is interested in
    - np: the number of recommendations the user needs
    - loc_filters: optional dict with 'lat', 'lon' and 'max_dist' used to
      validate the memcache entry against the requested location

    Result:
    - list of np tuples (score, place_key), where score is the predicted rating
      for the user and place_key is the key of the place it refers to
    - None if the clusters cannot be computed (no ratings in the system)
    """
    logging.info('kmeans.cluster_based START - user=' + str(user_id) + ', places: ' +
                 str(len(places)) + ', purpose:' + str(purpose) + ', np=' + str(np))
    client = memcache.Client()
    purpose_str = purpose.replace(' ', '-')
    # Cache key for this user's per-purpose scores. BUG FIX: the write path
    # below used str(user) (the entity) while this read path used
    # str(user_id), so the cache could never be hit; both now use user_id.
    rec_name = 'cluster-scores_' + str(user_id) + '_' + purpose_str
    # memcache_scores is a dict containing:
    #  - scores: list of (score, place_key) tuples
    #  - purpose, lat, lon, max_dist (location the scores were computed for)
    memcache_scores = client.get(rec_name)
    logging.info("CLUSTER SCORES from memcache: " + str(memcache_scores) + ' -- ' + str(loc_filters))
    # The cached entry is valid only for the same purpose and (almost) the
    # same location, with a covering search radius.
    memcache_valid = False
    if memcache_scores is not None and 'purpose' in memcache_scores and memcache_scores['purpose'] == purpose:
        if loc_filters is not None and 'lat' in loc_filters:
            if 'lat' in memcache_scores and 'lon' in memcache_scores and 'max_dist' in memcache_scores:
                diff_lat = memcache_scores['lat'] - loc_filters['lat']
                diff_lon = memcache_scores['lon'] - loc_filters['lon']
                if (-0.0002 < diff_lat < 0.0002 and -0.0002 < diff_lon < 0.0002
                        and memcache_scores['max_dist'] >= loc_filters['max_dist']):
                    memcache_valid = True
    if memcache_valid:
        logging.info("CLUSTER SCORES loaded from memcache")
        scores = memcache_scores['scores']
        scores = sorted(scores, key=lambda x: x[0], reverse=True)
    else:
        user = PFuser.get_by_key(PFuser.make_key(user_id, None))
        # Make sure the user belongs to a cluster; bail out if impossible.
        if user.cluster_id is None or len(user.cluster_id) < 1:
            user.cluster_id = put_user_in_cluster(user)
        if user.cluster_id is None or len(user.cluster_id) < 1:
            logging.error("The system is not able to put the user in a cluster!")
            return None
        logging.info("USER %s is in cluster %s" % (user_id, user.cluster_id))
        filters = {'cluster_id': user.cluster_id, 'purpose': purpose}
        if places is not None:
            filters['places'] = places
        avg_ratings = ClusterRating.get_list(filters)
        logging.info("Loaded cluster ratings: " + str(len(avg_ratings)))
        # Reuse the same filters for the user's own ratings.
        del filters['cluster_id']
        filters['user'] = user_id
        user_ratings = Rating.get_list(filters)
        logging.info("Loaded user ratings: " + str(len(user_ratings)))
        scores = []
        for cr in avg_ratings:
            if cr.avg_value < 3.0:
                # skip this place, too low rating
                continue
            # BUG FIX: the original nested all of this inside a loop over
            # user_ratings, so no score was ever produced when the user had
            # no ratings for the purpose; the user-dislike test is now a
            # separate membership check.
            disliked = False
            for ur in user_ratings:
                if cr.place == ur.place and ur.value < 3.0:
                    # skip this place, user doesn't like it
                    disliked = True
                    break
            if disliked:
                continue
            cr_key = cr.place.urlsafe()
            prev_value = None
            for value, key in scores:
                if key == cr_key:
                    prev_value = value
                    break
            if prev_value is not None:
                # BUG FIXES: compare against the NEW value (the original used
                # a stale loop variable), use list.remove (lists have no
                # .delete), and build the log message with % (the original
                # concatenated a string with a tuple, a TypeError).
                if cr.avg_value > prev_value:
                    logging.info("Found same place with two different values!! (%s, %d, %d)"
                                 % (cr_key, prev_value, cr.avg_value))
                    scores.remove((prev_value, cr_key))
                    scores.append((cr.avg_value, cr_key))
                continue
            scores.append((cr.avg_value, cr_key))
        scores = sorted(scores, key=lambda x: x[0], reverse=True)
        logging.info("Scores: " + str(len(scores)))
        # Save scores in memcache under the SAME key that was read above.
        memcache_scores = {}
        memcache_scores['scores'] = scores
        memcache_scores['purpose'] = purpose
        if loc_filters is not None and 'lat' in loc_filters:
            memcache_scores['lat'] = loc_filters['lat']
            memcache_scores['lon'] = loc_filters['lon']
            memcache_scores['max_dist'] = loc_filters['max_dist']
        logging.info("CLUSTER SCORES saving in memcache ")
        client.set(rec_name, memcache_scores)
    res = scores[0:np]
    logging.info('kmeans.cluster_based END - res: ' + str(res))
    return res
def recommend(user_id, filters, purpose='dinner with tourists', n=5):
    """
    It computes the recommendations for the user, according to specified filters and parameters.

    When possible, the recommendation list is personalized, using the cluster-based
    algorithm. If the personalized algorithm fails to find the required number of
    recommended places, an average-based non-personalized recommendation algorithm
    is used. If still other places are needed, the recommendation list is filled
    with places ordered by distance from the user.

    Input:
    - user_id: id of the requester
    - filters: filters for places of interest for the user
    - purpose: the purpose the user is interested in
    - n: number of recommended places requested by the user

    Available filters:
    - 'city': 'city!province!state!country'
      The 'city' filter contains the full description of the city, with values
      separated with a '!'. This string is split and used to retrieve only the
      places that are in the specified city. 'null' is used if part of the full
      city description is not available [example: 'Trento!TN!null!Italy'] or if
      a bigger region is considered [example: 'null!TN!null!Italy' retrieves all
      places in the province of Trento]
    - 'lat', 'lon' and 'max_dist': lat and lon indicate the user position, while
      max_dist is a measure expressed in meters and represents the radius of the
      circular region the user is interested in.

    Returns a list of n places in json format.
    """
    logging.info("recommender.recommend START - user_id=" + str(user_id) + ', filters=' + str(filters) + ', purpose=' + str(purpose) + ', n=' + str(n))
    # places is already a json list
    start = datetime.now()
    user_max_dist = None
    if filters is not None and 'max_dist' in filters and filters['max_dist'] is not None and filters['max_dist'] > 0:
        user_max_dist = filters['max_dist']
        # get places for a larger area, so near-misses can fill the list later
        filters['max_dist'] = 2 * user_max_dist
    places, status, errcode = logic.place_list_get(filters, user_id)
    logging.info("RECOMMEND places loaded ")
    if status != "OK" or places is None or len(places) < 1:
        # the system does not know any place within these filters
        logging.info("recommender.recommend END - no places")
        logging.error(str(errcode) + ": " + status)
        return None
    logging.warning("Loaded places for double distance: " + str(datetime.now() - start))
    start = datetime.now()
    # Split places into those within the user's requested radius and those
    # only within the doubled search radius.
    closest = []
    out_distance = []
    for p in places:
        if 'lat' in filters and 'lon' in filters and filters['lat'] is not None and filters['lon'] is not None:
            # add distance to user for each place
            p['distance'] = distance(
                p['address']['lat'], p['address']['lon'], filters['lat'], filters['lon'])
            if p['distance'] is not None and user_max_dist is not None and p['distance'] <= user_max_dist:
                closest.append(p)
            else:
                out_distance.append(p)
    if len(closest) >= n:
        places = closest
    elif len(closest) == 0:
        # NOTE(review): when no lat/lon filter is given, both lists stay
        # empty and this branch clears 'places' — confirm that is intended.
        places = out_distance
    else:
        # TODO: fill missing spaces with outliers?
        places = closest
    logging.warning("removing places that are too far: " + str(datetime.now() - start))
    place_ids = []
    if places is not None:
        place_ids = [Place.make_key(None, place['key']).id() for place in places]
    scores = None
    purpose_list = ["dinner with tourists", "romantic dinner", "dinner with friends", "best price/quality ratio"]
    start = datetime.now()
    # Compute scores synchronously for the requested purpose; queue tasks to
    # pre-compute (and cache) the other purposes in the background.
    for p in purpose_list:
        if p == purpose:
            start2 = datetime.now()
            scores = cluster_based(user_id, place_ids, p, n, loc_filters=filters)
            logging.warning("RECOMMEND END get cluster-based predictions: " + str(datetime.now()-start2))
        else:
            q = taskqueue.Queue('recommendations')
            task = taskqueue.Task(params={'user_id': user_id, 'place_ids': place_ids, 'purpose': p, 'n': n, 'loc_filters': str(filters)},
                                  url='/recommender/compute_cluster_based', method='POST', countdown=10)
            q.add(task)
    logging.warning("Getting recommendations from cluster and starting computation for other purposes: " + str(datetime.now() - start))
    log_text = "RECOMMEND scores from cluster-based : "
    if scores is None:
        log_text += "None"
    else:
        log_text += str(len(scores))
    logging.info(log_text)
    start = datetime.now()
    if scores is None or (len(scores) < n and len(scores) < len(places)):
        # cluster-based recommendation failed or is incomplete:
        # fall back to non-personalized, average-based recommendation
        rating_filters = {}
        if places is not None:
            rating_filters['places'] = place_ids
        rating_filters['purpose'] = purpose
        ratings = load_data(rating_filters)
        if ratings is None:
            logging.info("ratings for places: None")
        else:
            logging.info("ratings for places: " + str(len(ratings)))
        # items maps place -> list of other users' ratings for this purpose
        items = {}
        if ratings is not None:
            for other in ratings:
                if other != user_id:
                    for item in ratings[other]:
                        if purpose in ratings[other][item]:
                            if item not in items.keys():
                                items[item] = []
                            items[item].append(ratings[other][item][purpose])
        avg_scores = [(sum(items[item]) / len(items[item]), item) for item in items]
        logging.info("avg_scores: " + str(len(avg_scores)))
        filters = {'purpose': purpose, 'user': user_id}
        if places is not None:
            filters['places'] = place_ids
        user_ratings = Rating.get_list(filters)
        logging.info("Loaded user ratings: " + str(len(user_ratings)))
        if scores is None:
            scores = []
        # Fill the remaining slots with average scores, skipping low-rated
        # places, places the user dislikes, and places already scored.
        for value, key in avg_scores:
            toadd = True
            for ur in user_ratings:
                # NOTE(review): the value < 3.0 test only runs when the user
                # has at least one rating — low-averaged places slip through
                # otherwise; confirm whether that is intended.
                if value < 3.0:
                    # skip this place, too low rating
                    toadd = False
                    continue
                if key == ur.place.urlsafe() and ur.value < 3.0:
                    # skip this place, user doesn't like it
                    toadd = False
                    continue
            for svalue, skey in scores:
                if key == skey:
                    # already in list because of cluster
                    toadd = False
                    break
            if toadd:
                scores.append((value, key))
                logging.info("Appending place with value " + str(value))
            if len(scores) >= n:
                # we have enough recommended places
                break
        scores = sorted(scores, key=lambda x: x[0], reverse = True)
        if len(scores) > n:
            scores = scores[0:n]
        # (removed a large commented-out block that padded the list with
        # zero-score places when both algorithms failed)
    logging.warning("Filling empty space with full average predictions: " + str(datetime.now() - start))
    start = datetime.now()
    # Map scored place ids back to the full place dicts; places without a
    # score are dropped here.
    places_scores = []
    for p in places:
        for (score, item) in scores:
            if item == p['key']:
                places_scores.append((score, p))
    logging.info('places_scores: ' + str(len(places_scores)))
    places_scores = sorted(places_scores, key=lambda x: x[0], reverse = True)
    logging.warning("Moving mapping from place ids to full place data: " + str(datetime.now() - start))
    if len(places_scores) > n:
        places_scores = places_scores[0:n]
    items = []
    start = datetime.now()
    for (score, place) in places_scores:
        # TODO: make discount loading asynchronous in javascript page, after visualization of places!!!
        disc_filters = {'place': place['key'], 'published': 'True', 'passed': 'False'}
        discounts, status, errcode = logic.discount_list_get(disc_filters, user_id)
        logging.info("discounts loaded: " + str(errcode) + " - " + status)
        if discounts is not None and status == "OK":
            try:
                json_discounts = [Discount.to_json(d, None, None) for d in discounts]
                place['discounts'] = json_discounts
            except (TypeError, ValueError) as e:
                # best-effort: recommendations are still returned without discounts
                logging.error('Discounts not loaded: ' + str(e))
                pass
        place['predicted'] = score
        items.append(place)
    logging.warning("Time for loading discounts: " + str(datetime.now() - start))
    logging.info("recommender.recommend END ")
    return items