def knn_textual(k, an_id, model, item_type, database, this_vector=None):
    """
    Find up to k items of the same type that are textually nearest to an_id.

    Every other id of the given type is vectorized with
    Vectorizer.text_vector and compared to this_vector using
    Distance.l_p_distance, which is called with 3 as its first argument
    (presumably the order p of the norm - confirm against Distance).

    k          = maximum number of neighbours to keep
    an_id      = id of the query item (cast to int for 'photo' and 'poi')
    model      = text model name, forwarded to Vectorizer.text_vector
    item_type  = 'photo', 'user' or 'poi'
    database   = object providing get_photo_ids / get_user_ids /
                 get_location_ids
    this_vector = precomputed vector for an_id; looked up when None

    Returns a dict mapping neighbour id -> distance. Entries are in
    insertion order, not sorted by distance. The dict is empty when k <= 0
    or when there are no other items with textual descriptors.

    Raises ValueError if item_type is not one of the supported types.
    """
    # Compare against None rather than relying on truthiness: truth-testing
    # an array-like vector is ambiguous, and a caller-supplied vector should
    # be used as given.
    if this_vector is None:
        this_vector = Vectorizer.text_vector(an_id, database, model, item_type)

    # Get the ids for all items of the same type.
    if item_type == 'photo':
        an_id = int(an_id)
        others = database.get_photo_ids()
    elif item_type == 'user':
        others = database.get_user_ids()
    elif item_type == 'poi':
        an_id = int(an_id)
        others = database.get_location_ids()
    else:
        raise ValueError(
            '[ERROR] The provided type was invalid.\ntype = ' + str(item_type))

    # Nothing can be selected; also avoids calling max() on an empty dict.
    if k <= 0:
        return {}

    nearest = {}

    # For each other item, get its vector and calculate the distance.
    for other in others:
        # Skip the query item itself. Filtering here (instead of
        # others.remove) leaves the database's list untouched and does not
        # fail when the id is absent from it.
        if other == an_id:
            continue

        other_vector = Vectorizer.text_vector(other, database, model, item_type)
        if len(other_vector) == 0:
            # Elements with no textual descriptors cannot be compared.
            continue

        distance = Distance.l_p_distance(3, this_vector, other_vector)

        if len(nearest) < k:
            # Still room: any finite distance is accepted.
            largest_key, largest_best = None, inf
        else:
            largest_key, largest_best = max(nearest.items(), key=itemgetter(1))

        if distance < largest_best:
            # Evict the current worst neighbour if there is one. Must be an
            # explicit None check: an id of 0 is a valid key but is falsy.
            if largest_key is not None:
                nearest.pop(largest_key)
            nearest[other] = distance

            # Once k neighbours all sit at distance 0 nothing can beat them.
            # Only valid when the result is already full.
            if len(nearest) >= k and all(
                    value == 0 for value in nearest.values()):
                break

    # Return your K nearest
    return nearest
def nearest_text(self, *args):
    """
    Print the k nearest neighbours of an item by textual descriptors,
    followed by the features reported by Neighbor.similarity_by_id.

    arg[0] = type (user, photo, poi)
    arg[1] = id
    arg[2] = model (tf, df, tf-idf)
    arg[3] = k

    Every validation failure prints an error message and returns None
    without raising. Nothing is returned on success either; all output
    goes to stdout.
    """
    if not self.database:
        print("[ERROR] The database must be loaded for this action.")
        return

    if len(args) != 4:
        print("Nearest Text expected 4 arguments but got " + str(len(args)) + ".")
        print("\targs = " + str(args))
        return

    # Get k (the fourth argument). It must be a positive integer.
    try:
        k = int(args[3])
    except (TypeError, ValueError):
        k = None
    if k is None or k < 1:
        print("[ERROR] K Value provided is invalid.")
        print("\tk = " + str(args[3]))
        return

    # Get the type of item we are considering.
    itype = args[0]
    if itype not in self.valid_types:
        print("[ERROR] Item Type value provided was invalid.")
        print("\tItem Type = " + str(args[0]))
        return

    # Get the model to use. We do this before the item as it is easier
    # to differentiate valid from invalid
    model = str(args[2]).lower()
    if model not in self.valid_txt_models:
        print("[ERROR] Model Type value provided was invalid.")
        print("\tModel Type = " + str(args[2]))
        return

    # Get the vector representing the item associated with the id. Any
    # failure of the lookup is reported as an unknown id.
    an_id = args[1]
    try:
        this_vector = Vectorizer.text_vector(an_id, self.database, model, itype)
    except Exception:
        print("[ERROR] The ID specified was not found in the dataset.")
        # str() so that reporting the error cannot itself raise.
        print("\tID = " + str(an_id) + "; Type = " + str(itype)
              + "; Model = " + model)
        return

    nearest = Neighbor.knn_textual(k, an_id, model, itype, self.database,
                                   this_vector=this_vector)
    contribs = Neighbor.similarity_by_id(this_vector, nearest.keys(),
                                         self.database, model, itype)

    print(str(k) + " Nearest Neighbors:")
    for i, (neighbor_id, distance) in enumerate(nearest.items()):
        print('\t' + str(i) + ". " + str(neighbor_id)
              + "; Distance = " + str(distance))

    print('Top 3 Features:')
    for i, item in enumerate(contribs):
        print('\t' + str(i) + '. ' + str(item))
def similarity_by_id(this_vector, ids, database, model, itemtype, k=3):
    """
    Look up the text vector of every id in ids and hand them, together with
    this_vector and k, to Neighbor.similarity_contribution.

    this_vector = vector of the reference item
    ids         = iterable of ids whose vectors are fetched, in order
    database, model, itemtype = forwarded unchanged to Vectorizer.text_vector
    k           = passed through as the third argument of
                  Neighbor.similarity_contribution (default 3)

    Returns whatever Neighbor.similarity_contribution returns.
    """
    neighbor_vectors = []
    for neighbor_id in ids:
        neighbor_vectors.append(
            Vectorizer.text_vector(neighbor_id, database, model, itemtype))

    contributions = Neighbor.similarity_contribution(
        this_vector, neighbor_vectors, k)
    return contributions