def real_main(database, table, smooth_func, lambda_, out_folder):
    '''
    Runs the whole pipeline over every annotation stored in `table`.

    The steps are, in order:
        1. create the graph from all annotations (`create_graph`);
        2. count how many annotations each tag has;
        3. build the tag <-> item indexes and call `compute_tag_values`;
        4. write `relevant_item.tags` inside `out_folder`, with one
           "item tag" line for every pair found in the item -> tags index.

    Arguments
    ---------
    database: passed to `AnnotReader`
    table: table selected on the reader before anything is read
    smooth_func, lambda_: forwarded untouched to `compute_tag_values`
    out_folder: folder that receives the output files
    '''
    with AnnotReader(database) as reader:
        reader.change_table(table)

        #Create Graph
        create_graph(reader.iterate(), out_folder)

        #Compute popularity (number of annotations per tag)
        tag_pop = collections.defaultdict(int)
        for annotation in reader.iterate():
            tag_pop[annotation['tag']] += 1

        #Compute tag value
        tag_to_item, item_to_tag = \
            index_creator.create_double_occurrence_index(reader.iterate(),
                                                         'tag', 'item')
        compute_tag_values(smooth_func, lambda_, reader.iterate(),
                           tag_to_item, tag_pop, out_folder)

        relevant_fpath = os.path.join(out_folder, 'relevant_item.tags')
        with io.open(relevant_fpath, 'w') as rel:
            rel.write(u'#ITEM TAG\n')
            for item in item_to_tag:
                #The index must be looked up with the current item. It used
                #to be looked up with `tag`, a leftover loop variable, which
                #queried the item -> tags index with a tag id.
                for tag in item_to_tag[item]:
                    rel.write(u'%d %d\n' % (item, tag))
def compute_for_user(database, table, user, relevant, annotated,
                     smooth_func, lambda_, user_profile_size, out_folder):
    '''
    Computes tag values for a single user and stores them in a folder
    named `user_<id>` created under `out_folder`.

    The estimator and the tag <-> item indexes are both built from the
    annotations selected by the same query, which leaves out the
    annotations `user` made on the `relevant` items. Tag values are
    computed for every item id in `range(est.num_items())` that is not in
    `annotated`. Besides whatever `compute_tag_values` writes, the folder
    receives an `info` file (user id, relevant and annotated items) and a
    `relevant_item.tags` file with the tags of each relevant item.

    Note that `os.mkdir` is used, so the user folder must not exist yet.
    '''
    with AnnotReader(database) as reader:
        reader.change_table(table)

        #Relevant items by user are left out with this query
        not_this_user = {'user': {'$ne': user}}
        not_relevant = {'item': {'$nin': relevant}}
        query = {'$or': [not_this_user, not_relevant]}

        #Probability estimator
        est = SmoothEstimator(smooth_func, lambda_,
                              reader.iterate(query=query),
                              user_profile_size=user_profile_size)
        value_calc = value_calculator.ValueCalculator(est)

        user_folder = os.path.join(out_folder, 'user_%d' % user)
        os.mkdir(user_folder)

        #Initial information
        with io.open(os.path.join(user_folder, 'info'), 'w') as info:
            info.write(u'#UID: %d\n' % user)

            relevant_str = ' '.join(str(item) for item in relevant)
            annotated_str = ' '.join(str(item) for item in annotated)

            info.write(u'# %d relevant items: %s\n'
                       % (len(relevant), relevant_str))
            info.write(u'# %d annotated items: %s\n'
                       % (len(annotated), annotated_str))

        #Indexes built from the same filtered annotations
        tag_to_item, item_to_tag = \
            index_creator.create_double_occurrence_index(
                reader.iterate(query=query), 'tag', 'item')

        #Items to consider <-> Gamma items
        items_to_consider = set(xrange(est.num_items()))
        items_to_consider.difference_update(set(annotated))

        compute_tag_values(est, value_calc, tag_to_item, user, user_folder,
                           np.array(list(items_to_consider)))

        relevant_tags_fpath = os.path.join(user_folder, 'relevant_item.tags')
        with io.open(relevant_tags_fpath, 'w') as rel:
            rel.write(u'#ITEM TAG\n')
            for item in relevant:
                for tag in item_to_tag[item]:
                    rel.write(u'%d %d\n' % (item, tag))
def user_tag_pairs_to_filter(users_to_consider, annotations, perc_tags=.1,
                             num_random_tags=100):
    '''
    Gets user tag pairs to filter. Only tags used by more than one user
    are candidates for filtering. This method also returns random tags to
    compute value for.

    Arguments
    ---------
    users_to_consider: iterable
        Users for which tags will be hidden
    annotations: iterable
        Annotations used to build the user <-> tag indexes
    perc_tags: float
        Fraction of each user's tags to hide. Fewer tags are hidden when
        the user does not have enough tags shared with other users.
    num_random_tags: int
        Maximum number of random tags to return

    Returns
    -------
    A pair `(tags_to_remove, random_tags)`. The first is a dict from user
    to the list of tags to hide for that user. The second is a list of tag
    ids, in random order, that are used by none of the considered users.
    Random tags are drawn from `range(len(tags_to_user))`, so tag ids are
    assumed to be the integers from 0 to the number of tags minus one.
    '''
    user_to_tags, tags_to_user = create_double_occurrence_index(
        annotations, 'user', 'tag')

    #Generate candidate tags for removal, they have to be used by more than
    #one user.
    tags_to_remove = {}
    for user in users_to_consider:
        possible_tags = []
        for tag in user_to_tags[user]:
            if len(tags_to_user[tag]) > 1:  #We only consider tags with >1 user
                possible_tags.append(tag)
                #Remove this user from the count. This is done for every
                #candidate (hidden in the end or not), so that the next
                #users never take a tag down to zero users.
                tags_to_user[tag].remove(user)

        #num tags to remove for this user
        num_tags = int(perc_tags * len(user_to_tags[user]))

        #Generate random candidates: shuffle first and slice afterwards.
        #Slicing before shuffling only randomizes the order of a fixed
        #prefix, it does not pick the tags at random.
        shuffle(possible_tags)  #In place
        tags_to_remove[user] = possible_tags[:num_tags]

    #Generate Random tags. A real list is needed, an in place shuffle does
    #not work on the immutable range object of Python 3.
    possible_tags = list(range(len(tags_to_user)))
    shuffle(possible_tags)

    random_tags = []
    for tag in possible_tags:
        #gets tags not used by any considered user (hidden or not)
        used_or_hidden = any(
            tag in user_to_tags[user] or tag in tags_to_remove[user]
            for user in users_to_consider)

        if not used_or_hidden:
            random_tags.append(tag)
            if len(random_tags) == num_random_tags:
                break

    return tags_to_remove, random_tags
def user_tag_pairs_to_filter(users_to_consider, annotations, perc_tags=.1,
                             num_random_tags=100):
    '''
    Gets user tag pairs to filter. A tag can only be filtered if it is
    used by more than one user. This method also returns random tags to
    compute value for.

    Arguments
    ---------
    users_to_consider: iterable
        Users that will have tags hidden
    annotations: iterable
        Annotations from which the user <-> tag indexes are created
    perc_tags: float
        Fraction of the tags of each user to hide. It is an upper bound,
        users with few tags shared with others get fewer hidden tags.
    num_random_tags: int
        Upper bound on the number of random tags returned

    Returns
    -------
    tags_to_remove: dict
        Maps each considered user to the list of tags to hide
    random_tags: list
        Tag ids, randomly ordered, which no considered user has used.
        They are taken from `range(len(tags_to_user))`, that is, tag ids
        are assumed to be consecutive integers starting at zero.
    '''
    user_to_tags, tags_to_user = create_double_occurrence_index(annotations,
                                                                'user', 'tag')

    #Generate candidate tags for removal, they have to be used by more than
    #one user.
    tags_to_remove = {}
    for user in users_to_consider:
        user_tags = user_to_tags[user]

        possible_tags = []
        for tag in user_tags:
            tag_users = tags_to_user[tag]
            if len(tag_users) > 1:  #We only consider tags with >1 user
                possible_tags.append(tag)
                #Remove this user from the count, even if the tag is not
                #hidden at the end. Keeps at least one user per tag.
                tag_users.remove(user)

        #num tags to remove for this user
        num_tags = int(perc_tags * len(user_tags))

        #Generate random candidates. The shuffle has to come before the
        #slice, otherwise the same leading candidates are always chosen
        #and only their order changes.
        shuffle(possible_tags)  #In place
        tags_to_remove[user] = possible_tags[:num_tags]

    #Generate Random tags. list() makes the in place shuffle also valid on
    #Python 3, where range() is not a list anymore.
    possible_tags = list(range(len(tags_to_user)))
    shuffle(possible_tags)

    random_tags = []
    for tag in possible_tags:
        used_or_hidden = False
        for user in users_to_consider:
            #gets tags not used by any considered user (hidden or not)
            if tag in user_to_tags[user] or tag in tags_to_remove[user]:
                used_or_hidden = True
                break

        if used_or_hidden:
            continue

        random_tags.append(tag)
        if len(random_tags) == num_random_tags:
            break

    return tags_to_remove, random_tags
def test_double_occurrence_index(self):
    '''
    Checks both indexes returned by `create_double_occurrence_index` for
    the 'user' -> 'tag' direction, including a repeated (user, tag) pair.
    '''
    no_impact = 1  #value for the fields which are not being indexed

    #(first, third) arguments of to_json. Judging by the asserts below
    #these are the user and the tag of each annotation.
    pairs = [(1, 1), (1, 2), (1, 1), (2, 2), (2, 3)]
    annotations = [data_parser.to_json(user, no_impact, tag, no_impact)
                   for user, tag in pairs]

    from_to, inv = create_double_occurrence_index(annotations, 'user', 'tag')

    #user -> tags, the repeated pair shows up only once
    self.assertEqual(from_to[1], set([1, 2, 1]))
    self.assertEqual(from_to[2], set([2, 3]))

    #tag -> users
    self.assertEqual(inv[1], set([1]))
    self.assertEqual(inv[2], set([1, 2]))
    self.assertEqual(inv[3], set([2]))
def test_double_occurrence_index(self):
    '''
    Builds five annotations (one of them repeated) and verifies the direct
    and the inverted index created for the 'user' and 'tag' fields.
    '''
    filler = 1  #used for the fields that do not matter for this test

    def annotation(first, third):
        #Only the first and third fields of to_json change in this test
        return data_parser.to_json(first, filler, third, filler)

    annots = [
        annotation(1, 1),
        annotation(1, 2),
        annotation(1, 1),
        annotation(2, 2),
        annotation(2, 3),
    ]
    direct, inverted = create_double_occurrence_index(annots, 'user', 'tag')

    expected_direct = {1: set([1, 2, 1]), 2: set([2, 3])}
    expected_inverted = {1: set([1]), 2: set([1, 2]), 3: set([2])}

    for key in (1, 2):
        self.assertEqual(direct[key], expected_direct[key])

    for key in (1, 2, 3):
        self.assertEqual(inverted[key], expected_inverted[key])
def iedge_from_annotations(annotation_it, use=1, return_sink=True):
    '''
    Builds the tag <-> sink indexes from the annotations and returns the
    result of `iedge_from_indexes` for them, i.e., the edge list of the
    navigational graph.

    Arguments
    ---------
    annotation_it: iterator
        Iterator to annotations to use
    use = int {1, 2}
        Selects what the sink nodes are:
            1: Items
            2: Users
        Any other value raises a KeyError.
    return_sink = bool (defaults to True)
        Tells whether to return tag to sink edges
    '''
    sink_field = {1: 'item', 2: 'user'}[use]
    tag_index, sink_index = create_double_occurrence_index(
        annotation_it, 'tag', sink_field)

    return iedge_from_indexes(tag_index, sink_index, return_sink)
def iedge_from_annotations(annotation_it, use=1, return_sink = True):
    '''
    Returns the edge list for the navigational graph, as computed by
    `iedge_from_indexes` over the indexes created from the annotations.

    Arguments
    ---------
    annotation_it: iterator
        Iterator to annotations to use
    use = int {1, 2}
        Indicates which annotation field is paired with the tags:
            1: Items
            2: Users
        A KeyError is raised for other values.
    return_sink = bool (defaults to True)
        Tells whether to return tag to sink edges
    '''
    field_by_choice = {1:'item', 2:'user'}
    indexes = create_double_occurrence_index(annotation_it, 'tag',
                                             field_by_choice[use])
    tag_index, sink_index = indexes
    return iedge_from_indexes(tag_index, sink_index, return_sink)
def get_user_item_pairs_to_filter(users_to_consider, annotations,
                                  perc_items=.1):
    '''
    Gets user item pairs to filter. Up to a percentage (`perc_items`) of
    the items of each user is filtered.

    The code guarantees that no item is deleted from the trace completely:
    an item is only filtered for a user while at least one other user
    still annotates it. Because of this, the code does not guarantee that
    exactly `perc_items` will be removed per user.

    Returns a dict which maps each user to the list of items to filter.

    NOTE(review): `users_to_consider` is not used, every user found in
    `annotations` is processed. Confirm if this is intended.
    '''
    user_to_items, item_to_users = create_double_occurrence_index(
        annotations, 'user', 'item')

    user_item_pairs_to_filter = {}
    for user in user_to_items:
        candidates = list(user_to_items[user])

        #num items to remove for this user
        quota = int(perc_items * len(candidates))

        #Generate random candidates
        shuffle(candidates)  #in place shuffle

        filtered = []
        for item in candidates[:quota]:
            annotators = item_to_users[item]
            if len(annotators) <= 1:
                continue  #this user is the last one with the item

            annotators.remove(user)
            filtered.append(item)

        user_item_pairs_to_filter[user] = filtered

    return user_item_pairs_to_filter
def get_user_item_pairs_to_filter(users_to_consider, annotations,
                                  perc_items=.1):
    '''
    Selects, for each user, the items which will be filtered out.

    At most `perc_items` of the items of a user are selected, at random.
    A selected item is skipped when the user is the only one left
    annotating it, so that no item ends up being annotated by zero users.
    Thus, the amount of items filtered per user can be smaller than
    `perc_items`.

    The result is a dict: user -> list of items to filter.

    NOTE(review): the `users_to_consider` argument is ignored by this
    code, all users in `annotations` get items filtered. Verify against
    the callers.
    '''
    indexes = create_double_occurrence_index(annotations, 'user', 'item')
    user_to_items, item_to_users = indexes

    pairs = {}
    for user in user_to_items:
        items = user_to_items[user]

        #num items to remove for this user
        num_item = int(perc_items * len(items))

        #Generate random candidates
        shuffled = [item for item in items]
        shuffle(shuffled)  #in place shuffle

        pairs[user] = to_remove = []
        for item in shuffled[:num_item]:
            if len(item_to_users[item]) > 1:  #at least one user left
                item_to_users[item].remove(user)
                to_remove.append(item)

    return pairs
def compute_for_user(database, table, user, relevant, annotated,
                     smooth_func, lambda_, user_profile_size, out_folder):
    '''
    Computes tag values for a single user and stores them in a folder
    named `user_<id>` created under `out_folder`.

    The estimator, the indexes and the graph are built from the
    annotations selected by one query, which leaves out the annotations
    `user` made on the `relevant` items. Tag values are computed for at
    most 10 items, the first ones in the argsort of the item values of the
    user that are not in `annotated`. The folder also receives an `info`
    file (user id, relevant and annotated items) and a
    `relevant_item.tags` file with the tags of each relevant item.

    Note that `os.mkdir` is used, so the user folder must not exist yet.
    '''
    with AnnotReader(database) as reader:
        reader.change_table(table)

        #Relevant items by user are left out with this query
        other_users = {'user': {'$ne': user}}
        other_items = {'item': {'$nin': relevant}}
        query = {'$or': [other_users, other_items]}

        #Probability estimator
        est = SmoothEstimator(smooth_func, lambda_,
                              reader.iterate(query=query),
                              user_profile_size=user_profile_size)
        value_calc = value_calculator.ValueCalculator(est)

        user_folder = os.path.join(out_folder, 'user_%d' % user)
        os.mkdir(user_folder)

        #Initial information
        info_fpath = os.path.join(user_folder, 'info')
        with io.open(info_fpath, 'w') as info:
            info.write(u'#UID: %d\n' % user)

            relevant_str = ' '.join(str(item) for item in relevant)
            annotated_str = ' '.join(str(item) for item in annotated)

            info.write(u'# %d relevant items: %s\n'
                       % (len(relevant), relevant_str))
            info.write(u'# %d annotated items: %s\n'
                       % (len(annotated), annotated_str))

        #Create Graph
        tag_to_item, item_to_tag = \
            index_creator.create_double_occurrence_index(
                reader.iterate(query=query), 'tag', 'item')
        create_graph(tag_to_item, item_to_tag, user_folder)

        #Items to consider <-> Gamma items
        annotated_set = set(annotated)
        iestimates = value_calc.item_value(user)

        #Filter top 10
        #NOTE(review): argsort is ascending for numpy arrays, so these
        #would be the 10 items with the lowest values. Confirm that this
        #is the intended order.
        items_to_consider = set()
        for item in iestimates.argsort():
            if item not in annotated_set:
                items_to_consider.add(item)
                if len(items_to_consider) == 10:
                    break

        compute_tag_values(est, value_calc, tag_to_item, user, user_folder,
                           np.array(list(items_to_consider)))

        relevant_tags_fpath = os.path.join(user_folder, 'relevant_item.tags')
        with io.open(relevant_tags_fpath, 'w') as rel:
            rel.write(u'#ITEM TAG\n')
            for item in relevant:
                for tag in item_to_tag[item]:
                    rel.write(u'%d %d\n' % (item, tag))