Example #1
def put_user_in_cluster(user):
    """Assign the user to the most similar cluster and store the choice.

    Builds the user's sparse rating vector, compares it with every cluster
    centroid through logic.similarity and saves the closest cluster id on
    the user entity.
    """
    ratings = Rating.get_list({'user': user.key.id()})
    rlist = {}
    for rating in ratings:
        if rating.not_known is False and rating.value > 0:
            place = rating.place.urlsafe()
            rlist['%s-%s' % (place, rating.purpose)] = rating.value
    ruser = {'ratings': rlist}
    
    centroids = {}
    cratings = ClusterRating.get_list({})
    for rating in cratings:
        if rating.cluster_id not in centroids:
            centroids[rating.cluster_id] = {'key': rating.cluster_id, 'ratings': {}}
        if rating.avg_value > 0:
            place = rating.place.urlsafe()
            centroids[rating.cluster_id]['ratings']['%s-%s' % (place, rating.purpose)] = rating.avg_value
    max_sim = 0
    cluster_id = None
    for clid in centroids:
        sim = logic.similarity(ruser, centroids[clid])
        if sim >= max_sim:
            max_sim = sim
            cluster_id = clid
    user.cluster_id = cluster_id
    user.put()
    return cluster_id
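The helper logic.similarity is referenced but not shown in these examples. Below is a minimal sketch of a compatible implementation, assuming it computes a cosine similarity over the sparse 'ratings' dictionaries keyed by 'place-purpose' strings; the actual function in the project may differ.

import math

def similarity(user_a, user_b):
    # Hypothetical cosine similarity between two sparse rating dicts,
    # each shaped like {'ratings': {'<place>-<purpose>': value, ...}}.
    ratings_a = user_a.get('ratings', {})
    ratings_b = user_b.get('ratings', {})
    common = set(ratings_a) & set(ratings_b)
    dot = sum(ratings_a[k] * ratings_b[k] for k in common)
    norm_a = math.sqrt(sum(v * v for v in ratings_a.values()))
    norm_b = math.sqrt(sum(v * v for v in ratings_b.values()))
    if norm_a == 0 or norm_b == 0:
        return 0.0
    return dot / (norm_a * norm_b)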
Example #2
def rating_list_get(filters):
    """
    It retrieves a list of Ratings satisfying the characteristics described in filter.

    Parameters:
    - filters: a dict containing the characteristics the objects in the resulting list should have.

    Available filters:
    - 'user': the user key in string format
        setting only 'user', the function retrieves all the ratings of this user
    - 'place': the place key in string format
        setting only 'place', the function retrieves all the ratings of this place
    - 'purpose': the purpose
        setting only 'purpose', the function retrieves all the ratings added to any place by any user about this purpose
        usually it is used in combination with other filters
    - 'users' : list of user ids we are interested in
    - 'places' : list of place ids we are interested in

    It returns a tuple: 
    - the list of Ratings that satisfy the filters (or None in case of errors in the input),
    - the status (a string indicating whether an error occurred)
    - the http code indicating the type of error, if any
    """
    try:
        res = Rating.get_list(filters)
    except (TypeError, ValueError) as e:
        return None, str(e), 400

    return res, "OK", 200
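A minimal usage sketch of the filter API described in the docstring; the key strings below are placeholders, not real datastore keys.

# hypothetical keys, shown only to illustrate the filter format
ratings, status, code = rating_list_get({
    'user': 'example-user-key',    # all ratings by this user...
    'purpose': 'romantic dinner',  # ...restricted to one purpose
})
if status != "OK":
    logging.error("rating_list_get failed (%d): %s" % (code, status))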
Example #3
def place_delete(place_id, place_key_str):
    try:
        pkey = Place.make_key(place_id, place_key_str)
        cluster_ratings = ClusterRating.get_list({'place': pkey.urlsafe()})
        for cr in cluster_ratings:
            ClusterRating.delete(cr.key)
        user_ratings = Rating.get_list({'place': pkey.urlsafe()})
        for r in user_ratings:
            Rating.delete(r.key)
        res = Place.delete(pkey)
    except TypeError as e:
        return None, str(e), 400
    return res, "OK", 200
Example #4
    def get(self):
        
        logging.info('kmeans.RecomputeClustersHandler.get START')
#         if 'X-AppEngine-QueueName' not in self.request.headers:
#             logging.info('recommender.RecomputeClustersHandler.get END called not from queue - 403')
#             # the request is not coming from a queue!!
#             self.response.set_status(403)
#             self.response.write("You cannot access this method!!!")
#             return
         
        client = memcache.Client()
        do_recompute = client.gets('recompute_clusters')
#         if do_recompute == True:
        if True:
            logging.info('KmeansComputeHandler -- recompute needed')
            #new ratings have been added, so clusters need to be recomputed
#             logging.info("Starting pipeline for kmeans computation!")
            ClusterRating.delete_all()
            logging.info("Removed old cluster ratings")
            # identify and store k random centroids
            centroids = {}
            users = PFuser().query().fetch(NUM_CLUSTERS, offset=10)
            num = 0
#         logging.info("USERS: " + str(len(users)))
            for user in users:
                ratings = Rating.get_list({'user': user.key.id()})
                rlist = {}
                for rating in ratings:
                    if rating.not_known is False and rating.value > 0:
                        place = rating.place.urlsafe()
                        rlist['%s-%s' % (place, rating.purpose)] = rating.value

                centroid = {'key': 'center_%s' % str(num), 'ratings': rlist}
                centroids['center_%s' % str(num)] = centroid
                num += 1
            logging.info("Centroids at start: %s" % str(centroids))
            store_to_dfbucket(CENTROIDS_FILE, str(centroids))

            pipe = KmeansPipeline()
            pipe.start()
            i = 0
            while i < 20:
                i += 1
                if client.cas('recompute_clusters', False):
                    break
        logging.info('recommender.RecomputeClustersHandler.get END')
        self.response.write('OK')
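The gets/cas calls at the end of the handler use App Engine memcache's compare-and-set support: gets() fetches the value and remembers a cas token, and cas() only writes if the value has not changed since. A minimal sketch of the same retry pattern in isolation, reusing the 'recompute_clusters' flag from the handler above:

from google.appengine.api import memcache

def reset_recompute_flag(retries=20):
    # Clear the 'recompute_clusters' flag with compare-and-set,
    # retrying a bounded number of times if another writer interferes.
    client = memcache.Client()
    for _ in range(retries):
        value = client.gets('recompute_clusters')  # stores the cas token
        if value is None:
            return client.add('recompute_clusters', False)
        if client.cas('recompute_clusters', False):
            return True
    return False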
Example #5
def map(data):
    """K-means map function."""
    user_key = data.key.urlsafe()
    # retrieve past user info and use it instead of going to datastore again
    user = {}
    try:
        user = eval(read_from_dfbucket(USERS_FOLDER + user_key))
    except Exception as e:
        #         logging.info("Exception reading from bucket: %s." % str(e))
        ratings = Rating.get_list({'user': data.key.id()})
        rlist = {}
        for rating in ratings:
            if rating.not_known is False and rating.value > 0:
                place = rating.place.urlsafe()
                rlist['%s-%s' % (place, rating.purpose)] = rating.value

        user = {'key': user_key, 'ratings': rlist}

    centroids = eval(read_from_dfbucket(CENTROIDS_FILE))
#     logging.warning("map centroids: %s" % str(centroids))

    max_sim = 0
    closest_centroid = None
    for key in centroids:
        centroid = centroids[key]
        sim = logic.similarity(user, centroid)
        if sim >= max_sim:
            max_sim = sim
            closest_centroid = centroid
    user['sim'] = max_sim
    if closest_centroid is not None:
        user['cluster'] = closest_centroid['key']
    
    # save user in a place that is easy and quick to access!!
    store_to_dfbucket(USERS_FOLDER + user_key, str(user))
    if closest_centroid is not None:
        res = (closest_centroid['key'], str(user))

#         logging.warning("map result: %s" % str(res))
        logging.warning("map ended!")
        yield res
    else:
        yield ("None", str(user))
Example #6
def cluster_based(user_id, places, purpose='dinner with tourists', np=5, loc_filters=None):
    """
    It computes cluster-based recommendations.
    Clusters have been already computed, so only user's cluster information is needed to compute predictions for the user.

    Input:
    - user_id: the id of the requester
    - places: the list of places that can be recommended
    - purpose: the purpose the user is interested in
    - np: the number of recommendations the user needs
    - loc_filters: optional location filters ('lat', 'lon', 'max_dist') used to decide whether cached scores are still valid

    Result:
    - list of np tuples (score, place_key), where score is the predicted rating for the user and place_key is the key of the place it refers to
    - None if the clusters cannot be computed (no ratings in the system)
    """
    logging.info('kmeans.cluster_based START - user=' + str(user_id) + ', places: ' + str(len(places)) + ', purpose: ' + str(purpose) + ', np=' + str(np))
    
    client = memcache.Client()
    purpose_str = purpose.replace(' ', '-')
    rec_name = 'cluster-scores_' + str(user_id) + '_' + purpose_str 
    # memcache_scores is a dict containing: 
    # - scores: list of items and scores
    # - purpose
    # - lat
    # - lon
    # - max_dist
    memcache_scores = client.get(rec_name)
    logging.info("CLUSTER SCORES from memcache: " + str(memcache_scores) + ' -- ' + str(loc_filters))
    memcache_valid = False
    if memcache_scores is not None and 'purpose' in memcache_scores and memcache_scores['purpose'] == purpose:
        if loc_filters is not None and 'lat' in loc_filters:
            if 'lat' in memcache_scores and 'lon' in memcache_scores and 'max_dist' in memcache_scores:
                diff_lat = memcache_scores['lat'] - loc_filters['lat']
                diff_lon = memcache_scores['lon'] - loc_filters['lon']
                if diff_lat < 0.0002 and diff_lat > -0.0002 and diff_lon < 0.0002 and diff_lon > -0.0002  and memcache_scores['max_dist'] >= loc_filters['max_dist']:
                    memcache_valid = True
#         else:
#             memcache_valid = True
    
    if  memcache_valid:
        logging.info("CLUSTER SCORES loaded from memcache")
        scores = memcache_scores['scores']
        scores = sorted(scores, key=lambda x: x[0], reverse = True)
    else:
        user = PFuser.get_by_key(PFuser.make_key(user_id, None))
        if user.cluster_id is None or len(user.cluster_id) < 1:
            user.cluster_id = put_user_in_cluster(user)
        if user.cluster_id is None or len(user.cluster_id) < 1:
            logging.error("The system is not able to put the user in a cluster!")
            return None
        logging.info("USER %s is in cluster %s" % (user_id, user.cluster_id))
        filters = {'cluster_id': user.cluster_id, 'purpose': purpose}
        if places is not None:
            filters['places'] = places
        avg_ratings = ClusterRating.get_list(filters)
        logging.info("Loaded cluster ratings: " + str(len(avg_ratings)))
        del filters['cluster_id']
        filters['user'] = user_id
        user_ratings = Rating.get_list(filters)
        logging.info("Loaded user ratings: " + str(len(user_ratings)))
        scores = []
        for cr in avg_ratings:
            if cr.avg_value < 3.0:
                # skip this place, too low rating
                continue
            # skip this place if the user already rated it poorly
            disliked = False
            for ur in user_ratings:
                if cr.place == ur.place and ur.value < 3.0:
                    disliked = True
                    break
            if disliked:
                continue
            cr_key = cr.place.urlsafe()
            already_stored = False
            prev_value = None
            for value, key in scores:
                if key == cr_key:
                    already_stored = True
                    prev_value = value
            if already_stored:
                if cr.avg_value > prev_value:
                    logging.info("Found same place with two different values!! (%s, %s, %s)" % (cr_key, prev_value, cr.avg_value))
                    scores.remove((prev_value, cr_key))
                    scores.append((cr.avg_value, cr_key))
                continue
            scores.append((cr.avg_value, cr_key))
                
        scores = sorted(scores, key=lambda x: x[0], reverse = True)
        logging.info("Scores: " + str(len(scores)))
        #save scores in memcache
        purpose_str = purpose.replace(' ', '-')
        rec_name = 'cluster-scores_' + str(user_id) + '_' + purpose_str
        memcache_scores = {}
        memcache_scores['scores'] = scores
        memcache_scores['purpose'] = purpose
        if loc_filters is not None and 'lat' in loc_filters:
            memcache_scores['lat'] = loc_filters['lat']
            memcache_scores['lon'] = loc_filters['lon']
            memcache_scores['max_dist'] = loc_filters['max_dist']
        
        logging.info("CLUSTER SCORES saving in memcache ")# + str(memcache_scores))
        client.set(rec_name, memcache_scores)
    
    res = scores[0:np]
    logging.info('kmeans.cluster_based END - res: ' + str(res))
    return res
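A minimal usage sketch of cluster_based; the user id and coordinates are placeholders, and passing None for places leaves the candidate set unrestricted.

loc_filters = {'lat': 46.07, 'lon': 11.12, 'max_dist': 2000}
predictions = cluster_based('example-user-id', None,
                            purpose='romantic dinner', np=5,
                            loc_filters=loc_filters)
if predictions is None:
    logging.warning("no clusters available yet")
else:
    for score, place_key in predictions:
        logging.info("predicted %s for place %s" % (score, place_key))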
Example #7
def recommend(user_id, filters, purpose='dinner with tourists', n=5):
    """
    It computes the recommendations for the user, according to specified filters and parameters.
    When possible, the recommendation list is personalized, using the cluster-based algorithm.
    If the personalized algorithm fails to find the required number of recommended places, an average-based
    non-personalized recommendation algorithm is used.
    If still other places are needed, the recommendation list is filled with places ordered by distance from the user.

    Input:
    - user_id: the id of the requester
    - filters: filters for places of interest for the user
    - purpose: the purpose the user is interested in
    - n: number of recommended places requested by the user

    Available filters:
    //- 'city': 'city!province!state!country'
        The 'city' filter contains the full description of the city, with values separated by '!'.
        This string is split and used to retrieve only the places that are in the specified city.
        'null' is used if part of the full city description is not available [example: 'Trento!TN!null!Italy']
        or if a bigger region is considered [example: 'null!TN!null!Italy' retrieves all places in the province of Trento].
    - 'lat', 'lon' and 'max_dist': lat and lon indicate the user position, while max_dist is a distance expressed in meters
        and represents the radius of the circular region the user is interested in.

    Returns a list of n places in json format
    """
    logging.info("recommender.recommend START - user_id=" + str(user_id) +
                 ', filters=' + str(filters) + ', purpose=' + str(purpose) + ', n=' + str(n))
    
    # places is already a json list
    start = datetime.now()
    user_max_dist = None
    if filters is not None and 'max_dist' in filters and filters['max_dist'] is not None and filters['max_dist'] > 0:
        user_max_dist = filters['max_dist']
        #get places for a larger area
        filters['max_dist'] = 2 * user_max_dist
    places, status, errcode = logic.place_list_get(filters, user_id)
    logging.info("RECOMMEND places loaded ")

    if status != "OK" or places is None or len(places) < 1:
        # the system does not know any place matching these filters
        logging.info("recommender.recommend END - no places")
        logging.error(str(errcode) + ": " + status)
        return None
    logging.warning("Loaded places for double distance: " + str(datetime.now() - start))
    start = datetime.now()
    closest = []
    out_distance = []
    for p in places:
        if 'lat' in filters and 'lon' in filters and filters['lat'] is not None and filters['lon'] is not None:
            # add distance to user for each place
            p['distance'] = distance(
                    p['address']['lat'], p['address']['lon'], filters['lat'], filters['lon'])
        if p.get('distance') is not None and user_max_dist is not None and p['distance'] <= user_max_dist:
            closest.append(p)
        else:
            out_distance.append(p)
    if len(closest) >= n:
        places = closest
    elif len(closest) == 0:
        places = out_distance
    else:
        #TODO: fill missing spaces with outliers?
        places = closest
    logging.warning("removing places that are too far: " + str(datetime.now() - start))
    place_ids = []
    if places is not None:
        place_ids = [Place.make_key(None, place['key']).id() for place in places]
    scores = None
    purpose_list = ["dinner with tourists", "romantic dinner", "dinner with friends", "best price/quality ratio"]
    start = datetime.now()
#     logging.warning("RECOMMEND START get cluster-based predictions for all purposes: " + str(start))
    for p in purpose_list:
        if p == purpose:
            start2 = datetime.now()
            scores = cluster_based(user_id, place_ids, p, n, loc_filters=filters)
            logging.warning("RECOMMEND END get cluster-based predictions: " + str(datetime.now()-start2))
        else:
            q = taskqueue.Queue('recommendations')
             
            task = taskqueue.Task(params={'user_id': user_id, 'place_ids': place_ids, 'purpose': p, 'n': n, 'loc_filters': str(filters)},
                url='/recommender/compute_cluster_based', method='POST', countdown=10)
            q.add(task)

    logging.warning("Getting recommendations from cluster and starting computation for other purposes: " + str(datetime.now() - start))
    log_text = "RECOMMEND scores from cluster-based : "
    if scores is None:
        log_text += "None"
    else:
        log_text += str(len(scores))
    logging.info(log_text)
    

    start = datetime.now()
    if scores is None or (len(scores) < n and len(scores) < len(places)):
        # cluster-based recommendation failed
        # non-personalized recommendation
        rating_filters = {}
        if places is not None:
            rating_filters['places'] = place_ids
        rating_filters['purpose'] = purpose
        ratings = load_data(rating_filters)
        if ratings is None:
            logging.info("ratings for places: None")
        else:
            logging.info("ratings for places: " + str(len(ratings)))
        items = {}
        if ratings is not None:
            for other in ratings:
                if other != user_id:
                    for item in ratings[other]:
                        if purpose in ratings[other][item]:
                            if item not in items.keys():
                                items[item] = []
                            items[item].append(ratings[other][item][purpose])
 
        avg_scores = [(sum(items[item]) / len(items[item]), item)
                      for item in items]
        logging.info("avg_scores: " + str(len(avg_scores)))
        filters = {'purpose': purpose, 'user': user_id}
        if places is not None:
            filters['places'] = place_ids
        
        user_ratings = Rating.get_list(filters)
        logging.info("Loaded user ratings: " + str(len(user_ratings)))
        if scores is None:
            scores = []
        for value, key in avg_scores:
            if value < 3.0:
                # skip this place, too low rating
                continue
            toadd = True
            for ur in user_ratings:
                if key == ur.place.urlsafe() and ur.value < 3.0:
                    # skip this place, user doesn't like it
                    toadd = False
                    break
            for svalue, skey in scores:
                if key == skey:
                    # already in the list because of the cluster-based step
                    toadd = False
                    break
            if toadd:
                scores.append((value, key))
                logging.info("Appending place with value " + str(value))
            if len(scores) >= n:
                # we have enough recommended places
                break
                
                
        scores = sorted(scores, key=lambda x: x[0], reverse = True)
        if len(scores) > n:
            scores = scores[0:n]
#         if debug:
#             log_text = "RECOMMEND scores from average-based : "
#             if scores is None:
#                 log_text += "None"
#             else:
#                 log_text += str(len(scores))
#             logging.info(log_text)
# 
#     if scores is None or (len(scores) < n and len(scores) < len(places)):
#         # cluster-based and average recommendations both failed to fill the recommendation list
#         # just add some other places
#         for p in places:
#             in_list = False
#             for score, key in scores:
#                 if key == p['key']:
#                     in_list = True
#                     break
#             if not in_list:
#                 scores.append((0, p['key']))
#             if len(scores) >= n:
#                 # we have enough recommended places
#                 break
#             
#     if debug:
#         log_text = "RECOMMEND final scores : "
#         if scores is None:
#             log_text += "None"
#         else:
#             log_text += str(len(scores))
#         logging.info(log_text)

    logging.warning("Filling empty space with full average predictions: " + str(datetime.now() - start))

    start = datetime.now()
    places_scores = []
    for p in places:
#         found = False
        for (score, item) in scores:
            if item == p['key']:
                places_scores.append((score, p))
#                 found = True
#         if not found:
#             places_scores.append((0, p))
    logging.info('places_scores: ' + str(len(places_scores)))
    places_scores = sorted(places_scores, key=lambda x: x[0], reverse = True)
    logging.warning("Moving mapping from place ids to full place data: " + str(datetime.now() - start))
    if len(places_scores) > n:
        places_scores = places_scores[0:n]
#     logging.info('recommender.recommend - places_scores: ' + str(places_scores))
    items = []
    start = datetime.now()
    for (score, place) in places_scores:
        
        #TODO: make discount loading asynchronous in javascript page, after visualization of places!!!
        
        disc_filters = {'place': place['key'], 'published': 'True', 'passed': 'False'}
        discounts, status, errcode = logic.discount_list_get(disc_filters, user_id)
        logging.info("discounts loaded: " + str(errcode) + " - " + status)
        if discounts is not None and status == "OK":
            try:
                json_discounts = [Discount.to_json(d, None, None) for d in discounts]
                place['discounts'] = json_discounts
            except (TypeError, ValueError) as e:
                #do nothing
                logging.error('Discounts not loaded: ' + str(e))
                pass
        place['predicted'] = score
        items.append(place)
    logging.warning("Time for loading discounts: " + str(datetime.now() - start))
#     logging.info("Recommended items: " + str(items))
    logging.info("recommender.recommend END ")#- items: " + str(items))
    return items
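A minimal usage sketch of recommend with the location filters documented in the docstring; the user id and coordinates are placeholders.

filters = {'lat': 46.07, 'lon': 11.12, 'max_dist': 1500}
items = recommend('example-user-id', filters,
                  purpose='dinner with friends', n=5)
if items is None:
    logging.info("no places found for these filters")
else:
    for place in items:
        logging.info("%s -> predicted %s" % (place['key'], place['predicted']))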