Exemplo n.º 1
0
    def knn_textual(k, an_id, model, item_type, database, this_vector=None):
        """Return up to ``k`` items of ``item_type`` textually closest to ``an_id``.

        The text vector of ``an_id`` is compared against the text vector of
        every other item of the same type using
        ``Distance.l_p_distance(3, ...)``. Items whose text vector is empty
        are skipped.

        Args:
            k: Maximum number of neighbours to keep. Values below 1 produce an
                empty result.
            an_id: Identifier of the query item. It is converted with
                ``int()`` for the 'photo' and 'poi' types.
            model: Text model name, forwarded to ``Vectorizer.text_vector``.
            item_type: One of 'photo', 'user' or 'poi'.
            database: Object providing ``get_photo_ids()``,
                ``get_user_ids()`` and ``get_location_ids()``.
            this_vector: Optional precomputed text vector for ``an_id``. When
                it is ``None`` or empty the vector is obtained through
                ``Vectorizer.text_vector``.

        Returns:
            A dict mapping neighbour id to distance. Entries are in insertion
            order, not sorted by distance.

        Raises:
            ValueError: If ``item_type`` is not recognised, or if ``an_id`` is
                not among the ids reported by the database.
        """
        nearest = {}
        # Explicit None/length test: ``not vector`` is ambiguous for
        # array-like vectors holding more than one element.
        if this_vector is None or len(this_vector) == 0:
            this_vector = Vectorizer.text_vector(an_id, database, model,
                                                 item_type)

        # Get the ids for all items of the same type.
        if item_type == 'photo':
            an_id = int(an_id)
            others = database.get_photo_ids()
        elif item_type == 'user':
            others = database.get_user_ids()
        elif item_type == 'poi':
            an_id = int(an_id)
            others = database.get_location_ids()
        else:
            raise ValueError(
                '[ERROR] The provided type was invalid.\ntype = ' +
                str(item_type))

        # Work on a copy so the collection owned by the database is never
        # mutated, then drop the query item itself.
        candidates = list(others)
        candidates.remove(an_id)

        # Nothing can be selected for a non-positive k (and max() below would
        # fail on the empty dict).
        if k < 1:
            return nearest

        # For each other item, get its vector and calculate the distance.
        for other in candidates:
            other_vector = Vectorizer.text_vector(other, database, model,
                                                  item_type)
            if len(other_vector) == 0:
                # No textual descriptors: nothing to compare against.
                continue

            distance = Distance.l_p_distance(3, this_vector, other_vector)

            if len(nearest) < k:
                worst_key, worst_distance = None, inf
            else:
                worst_key, worst_distance = max(nearest.items(),
                                                key=itemgetter(1))

            if distance < worst_distance:
                # Evict the current worst neighbour when the result is full.
                # Compare against None explicitly: an id of 0 is falsy but is
                # still a real key that has to be removed.
                if worst_key is not None:
                    del nearest[worst_key]
                nearest[other] = distance

            # Once k neighbours are held and all are at distance 0, no later
            # candidate can be strictly closer, so stop early. The size check
            # prevents stopping after a single exact match when k > 1.
            if len(nearest) >= k and all(
                    value == 0 for value in nearest.values()):
                break

        # Return the k nearest.
        return nearest
Exemplo n.º 2
0
    def nearest_text(self, *args):
        """Print the k nearest textual neighbours of an item.

        Expected positional arguments:
            args[0] = type (user, photo, poi)
            args[1] = id
            args[2] = model (tf, df, tf-idf)
            args[3] = k

        Every validation problem is reported with ``print`` and the method
        then returns ``None`` without raising. On success the neighbours
        returned by ``Neighbor.knn_textual`` are printed, followed by the
        items returned by ``Neighbor.similarity_by_id`` under the
        'Top 3 Features' heading.
        """
        if not self.database:
            print("[ERROR] The database must be loaded for this action.")
            return

        if len(args) != 4:
            print("Nearest Text expected 4 arguments but got " + str(len(args)) + ".")
            print("\targs = " + str(args))
            return

        itype, an_id, model, raw_k = args

        # Parse k (the fourth argument); it must be a positive integer.
        try:
            k = int(raw_k)
        except (TypeError, ValueError, OverflowError):
            k = None
        if k is None or k < 1:
            print("[ERROR] K Value provided is invalid.")
            print("\tk = " + str(raw_k))
            return

        # Get the type of item we are considering.
        if itype not in self.valid_types:
            print("[ERROR] Item Type value provided was invalid.")
            print("\tItem Type = " + str(itype))
            return

        # Get the model to use. We do this before the item as it is easier
        #   to differentiate valid from invalid.
        model = str(model).lower()
        if model not in self.valid_txt_models:
            print("[ERROR] Model Type value provided was invalid.")
            print("\tModel Type = " + str(args[2]))
            return

        try:
            # Get the vector representing the item associated with the id.
            this_vector = Vectorizer.text_vector(an_id, self.database, model, itype)
        except Exception:
            # NOTE(review): the exact exception raised for an unknown id is
            # not visible here, so any ordinary failure is reported as
            # "not found" -- narrow this once the lookup contract is known.
            print("[ERROR] The ID specified was not found in the dataset.")
            print("\tID = " + str(an_id) + "; Type = " + str(itype) + "; Model = " + model)
            return

        nearest = Neighbor.knn_textual(k, an_id, model, itype, self.database, this_vector=this_vector)
        contribs = Neighbor.similarity_by_id(this_vector, nearest.keys(),
                                            self.database, model, itype)

        print(str(k) + " Nearest Neighbors:")
        for i, (neighbor_id, distance) in enumerate(nearest.items()):
            print('\t' + str(i) + ". " + str(neighbor_id) + "; Distance = " + str(distance))
        print('Top 3 Features:')
        for i, item in enumerate(contribs):
            print('\t' + str(i) + '. ' + str(item))
Exemplo n.º 3
0
 def similarity_by_id(this_vector, ids, database, model, itemtype, k=3):
     """Look up the text vector of each id and rank similarity contributions.

     A text vector is fetched through ``Vectorizer.text_vector`` for every
     entry of ``ids`` (in iteration order). The collected vectors are then
     handed, together with ``this_vector`` and ``k``, to
     ``Neighbor.similarity_contribution``, whose result is returned as-is.
     """
     neighbor_vectors = []
     for neighbor_id in ids:
         neighbor_vectors.append(
             Vectorizer.text_vector(neighbor_id, database, model, itemtype))
     return Neighbor.similarity_contribution(this_vector, neighbor_vectors, k)