def get_baselines(annot_filter, reader, user_to_tags):
    '''
    Computes two baseline scores from the filtered annotations.

    Arguments
    ---------
    annot_filter: object exposing `annotations(iterable)`
        Applied to `reader.iterate()` before every pass over the data.
    reader: object exposing `iterate()`
        Source of annotations. It is iterated three times, once per pass.
    user_to_tags: dict
        Maps each user to a container of tags.

    Returns
    -------
    idf: dict
        tag -> 1.0 / (number of filtered annotations which carry the tag)
    overlap: dict
        (user, tag) -> count. For every item annotated by `user`, each tag
        of that item which is not in `user_to_tags[user]` adds one to the
        count. Tags which are in `user_to_tags[user]` appear with count 0.
    '''
    annotations = annot_filter.annotations(reader.iterate())
    user_to_item = create_occurrence_index(annotations, 'user', 'item')

    annotations = annot_filter.annotations(reader.iterate())
    item_to_tags = create_occurrence_index(annotations, 'item', 'tag')

    overlap = {}
    for user in user_to_tags:
        own_tags = user_to_tags[user]
        #Walk only the items of this user. Iterating `user_to_item` itself
        #yields user ids, which would then be used as item ids below.
        for item in user_to_item.get(user, ()):
            for tag in item_to_tags[item]:
                key = (user, tag)
                overlap.setdefault(key, 0)
                if tag not in own_tags:
                    overlap[key] += 1

    #Tag frequencies over the filtered annotations, inverted afterwards
    tag_freq = {}
    for annot in annot_filter.annotations(reader.iterate()):
        tag = annot['tag']
        tag_freq[tag] = tag_freq.get(tag, 0) + 1

    idf = dict((tag, 1.0 / freq) for tag, freq in tag_freq.items())
    return idf, overlap
def get_baselines(annot_filter, reader, user_to_tags):
    '''
    Computes two baseline scores from the filtered annotations.

    Arguments
    ---------
    annot_filter: object exposing `annotations(iterable)`
        Applied to `reader.iterate()` before every pass over the data.
    reader: object exposing `iterate()`
        Source of annotations. It is iterated three times, once per pass.
    user_to_tags: dict
        Maps each user to a container of tags.

    Returns
    -------
    idf: dict
        tag -> 1.0 / (number of filtered annotations which carry the tag)
    overlap: dict
        (user, tag) -> count. For every item annotated by `user`, each tag
        of that item which is not in `user_to_tags[user]` adds one to the
        count. Tags which are in `user_to_tags[user]` appear with count 0.
    '''
    annotations = annot_filter.annotations(reader.iterate())
    user_to_item = create_occurrence_index(annotations, 'user', 'item')

    annotations = annot_filter.annotations(reader.iterate())
    item_to_tags = create_occurrence_index(annotations, 'item', 'tag')

    overlap = {}
    for user in user_to_tags:
        own_tags = user_to_tags[user]
        #Walk only the items of this user. Iterating `user_to_item` itself
        #yields user ids, which would then be used as item ids below.
        for item in user_to_item.get(user, ()):
            for tag in item_to_tags[item]:
                key = (user, tag)
                overlap.setdefault(key, 0)
                if tag not in own_tags:
                    overlap[key] += 1

    #Tag frequencies over the filtered annotations, inverted afterwards
    tag_freq = {}
    for annot in annot_filter.annotations(reader.iterate()):
        tag = annot['tag']
        tag_freq[tag] = tag_freq.get(tag, 0) + 1

    idf = dict((tag, 1.0 / freq) for tag, freq in tag_freq.items())
    return idf, overlap
def fetch_tags_and_items(reader, min_tag_freq=1):
    '''
    Gathers item ids, the tag ids which are frequent enough, the items
    annotated by each of those tags and the popularity of every tag.

    Arguments
    ---------
    reader: `AnnotReader`
        reader which connects to DB. It is iterated twice.
    min_tag_freq: int
        Tags with fewer annotations than this value are ignored. The
        value -1 keeps every tag.

    Returns
    -------
    A tuple with four elements:
      * `np.arange(n)`, with n being the number of distinct items seen
        (presumably item ids are dense in [0, n) -- confirm with caller)
      * a sorted int64 array with the ids of the tags which were kept
      * a dict from each kept tag id to an array of its item ids
      * a defaultdict from tag id to its number of annotations
    '''
    #First pass: distinct items and number of annotations per tag
    seen_items = set()
    tag_pop = defaultdict(int)
    for annot in reader.iterate():
        seen_items.add(annot['item'])
        tag_pop[annot['tag']] += 1

    #Second pass: which items were annotated with each tag
    items_by_tag = create_occurrence_index(reader.iterate(), 'tag', 'item')

    keep_all = min_tag_freq == -1
    tag_to_item = {}
    for tag_id in items_by_tag:
        if keep_all or tag_pop[tag_id] >= min_tag_freq:
            tag_to_item[tag_id] = np.array(list(items_by_tag[tag_id]))

    item_ids = np.arange(len(seen_items))
    tag_ids = np.array(sorted(tag_to_item), dtype='int64')
    return item_ids, tag_ids, tag_to_item, tag_pop
def fetch_tags_and_items(reader, min_tag_freq=1):
    '''
    Gathers item ids, the tag ids which are frequent enough, the items
    annotated by each of those tags and the popularity of every tag.

    Arguments
    ---------
    reader: `AnnotReader`
        reader which connects to DB. It is iterated twice.
    min_tag_freq: int
        Tags with fewer annotations than this value are ignored. The
        value -1 keeps every tag.

    Returns
    -------
    A tuple with four elements:
      * `np.arange(n)`, with n being the number of distinct items seen
        (presumably item ids are dense in [0, n) -- confirm with caller)
      * a sorted int64 array with the ids of the tags which were kept
      * a dict from each kept tag id to an array of its item ids
      * a defaultdict from tag id to its number of annotations
    '''
    #First pass: distinct items and number of annotations per tag
    seen_items = set()
    tag_pop = defaultdict(int)
    for annot in reader.iterate():
        seen_items.add(annot['item'])
        tag_pop[annot['tag']] += 1

    #Second pass: which items were annotated with each tag
    items_by_tag = create_occurrence_index(reader.iterate(), 'tag', 'item')

    keep_all = min_tag_freq == -1
    tag_to_item = {}
    for tag_id in items_by_tag:
        if keep_all or tag_pop[tag_id] >= min_tag_freq:
            tag_to_item[tag_id] = np.array(list(items_by_tag[tag_id]))

    item_ids = np.arange(len(seen_items))
    tag_ids = np.array(sorted(tag_to_item), dtype='int64')
    return item_ids, tag_ids, tag_to_item, tag_pop
def main(library_thing_annotations_fpath, output_folder, num_users=20,
         perc_items=.1, estimator='lda', rand_seed=None):
    '''
    Runs the item experiment for the users with the most annotations and
    saves, per user, the item probabilities, the hidden items and the
    candidate item ids.

    Arguments
    ---------
    library_thing_annotations_fpath: str
        Path handed to `create_annotations`.
    output_folder: str
        Folder which receives the output. It must exist and be empty
        (both are asserted).
    num_users: int
        How many of the users with the most annotations to consider.
    perc_items: float
        Not read anywhere in this function.
    estimator: str
        Either 'smooth' or 'lda'. Any other value raises `Exception`.
    rand_seed: object
        Forwarded to `seed`.

    Output files (one of each per user, inside `output_folder`)
    ------------
    probs-user-<user>.dat, hidden-items-for-user-<user>.dat and
    gamma-item-ids-user-<user>.dat
    '''
    seed(rand_seed)

    #Basic asserts for the folder
    assert os.path.isdir(output_folder)
    assert len(os.listdir(output_folder)) == 0

    #Load LT file
    base_annotations, user_ids, item_ids, tag_ids = \
            create_annotations(library_thing_annotations_fpath)

    #Get most popular users
    user_pop = np.zeros(len(user_ids))
    for annot in base_annotations:
        user_pop[annot['user']] += 1
    users_to_consider = user_pop.argsort()[::-1][:num_users]

    user_item_pairs_to_filter = \
            get_user_item_pairs_to_filter(users_to_consider, base_annotations)

    #Create estimator
    filtered_annotations = FilteredAnnotations(user_item_pairs_to_filter)
    annotations = filtered_annotations.annotations(base_annotations)
    if estimator == 'smooth':
        est = create_smooth_estimator(annotations)
    elif estimator == 'lda':
        est = create_lda_estimator(annotations, len(item_ids), len(tag_ids))
    else:
        raise Exception('Unknown estimator, please choose from {lda, smooth}')

    #Run experiment!
    annotations = filtered_annotations.annotations(base_annotations)
    user_to_item = create_occurrence_index(annotations, 'user', 'item')
    for user in users_to_consider:
        #Candidates are the items this user did not annotate in the
        #filtered data. The index is keyed by user, so it has to be looked
        #up with the user id and not with the item id.
        annotated_by_user = user_to_item.get(user, ())
        gamma_items = [item for item in xrange(len(item_ids)) \
                       if item not in annotated_by_user]

        probs_i_given_u = est.prob_items_given_user(user,
                                                    np.asarray(gamma_items))

        piu_fpath = os.path.join(output_folder, 'probs-user-%d.dat' % user)
        np.savetxt(piu_fpath, probs_i_given_u)

        hidden_fpath = os.path.join(output_folder,
                                    'hidden-items-for-user-%d.dat' % user)
        np.savetxt(hidden_fpath, user_item_pairs_to_filter[user])

        item_ids_fpath = os.path.join(output_folder,
                                      'gamma-item-ids-user-%d.dat' % user)
        np.savetxt(item_ids_fpath, gamma_items)
def main(library_thing_annotations_fpath, output_folder, num_users=20,
         perc_items=.1, estimator='lda', rand_seed=None):
    '''
    Runs the item experiment for the users with the most annotations and
    saves, per user, the item probabilities, the hidden items and the
    candidate item ids.

    Arguments
    ---------
    library_thing_annotations_fpath: str
        Path handed to `create_annotations`.
    output_folder: str
        Folder which receives the output. It must exist and be empty
        (both are asserted).
    num_users: int
        How many of the users with the most annotations to consider.
    perc_items: float
        Not read anywhere in this function.
    estimator: str
        Either 'smooth' or 'lda'. Any other value raises `Exception`.
    rand_seed: object
        Forwarded to `seed`.

    Output files (one of each per user, inside `output_folder`)
    ------------
    probs-user-<user>.dat, hidden-items-for-user-<user>.dat and
    gamma-item-ids-user-<user>.dat
    '''
    seed(rand_seed)

    #Basic asserts for the folder
    assert os.path.isdir(output_folder)
    assert len(os.listdir(output_folder)) == 0

    #Load LT file
    base_annotations, user_ids, item_ids, tag_ids = \
            create_annotations(library_thing_annotations_fpath)

    #Get most popular users
    user_pop = np.zeros(len(user_ids))
    for annot in base_annotations:
        user_pop[annot['user']] += 1
    users_to_consider = user_pop.argsort()[::-1][:num_users]

    user_item_pairs_to_filter = \
            get_user_item_pairs_to_filter(users_to_consider, base_annotations)

    #Create estimator
    filtered_annotations = FilteredAnnotations(user_item_pairs_to_filter)
    annotations = filtered_annotations.annotations(base_annotations)
    if estimator == 'smooth':
        est = create_smooth_estimator(annotations)
    elif estimator == 'lda':
        est = create_lda_estimator(annotations, len(item_ids), len(tag_ids))
    else:
        raise Exception('Unknown estimator, please choose from {lda, smooth}')

    #Run experiment!
    annotations = filtered_annotations.annotations(base_annotations)
    user_to_item = create_occurrence_index(annotations, 'user', 'item')
    for user in users_to_consider:
        #Candidates are the items this user did not annotate in the
        #filtered data. The index is keyed by user, so it has to be looked
        #up with the user id and not with the item id.
        annotated_by_user = user_to_item.get(user, ())
        gamma_items = [item for item in xrange(len(item_ids)) \
                       if item not in annotated_by_user]

        probs_i_given_u = est.prob_items_given_user(user,
                                                    np.asarray(gamma_items))

        piu_fpath = os.path.join(output_folder, 'probs-user-%d.dat' % user)
        np.savetxt(piu_fpath, probs_i_given_u)

        hidden_fpath = os.path.join(output_folder,
                                    'hidden-items-for-user-%d.dat' % user)
        np.savetxt(hidden_fpath, user_item_pairs_to_filter[user])

        item_ids_fpath = os.path.join(output_folder,
                                      'gamma-item-ids-user-%d.dat' % user)
        np.savetxt(item_ids_fpath, gamma_items)
def main(library_thing_annotations_fpath, output_folder, num_users=20,
         perc_tags=.1, estimator='lda', num_random_tags=100, rand_seed=None):
    '''
    Runs the tag experiment for the users with the most annotations,
    delegating the work for each user to `run_one_user`.

    Arguments
    ---------
    library_thing_annotations_fpath: str
        Path handed to `create_annotations`.
    output_folder: str
        Folder which receives the output. It must exist and be empty
        (both are asserted).
    num_users: int
        How many of the users with the most annotations to consider.
    perc_tags: float
        Forwarded to `user_tag_pairs_to_filter`.
    estimator: str
        Either 'smooth' or 'lda'. Any other value raises `Exception`.
    num_random_tags: int
        Not read anywhere in this function.
    rand_seed: object
        Forwarded to `seed`.
    '''
    seed(rand_seed)

    #Basic asserts for the folder
    assert os.path.isdir(output_folder)
    assert len(os.listdir(output_folder)) == 0

    #Load LT file
    base_annotations, user_ids, item_ids, tag_ids = \
            create_annotations(library_thing_annotations_fpath)

    #Get most popular users
    user_pop = np.zeros(len(user_ids))
    for annot in base_annotations:
        user_pop[annot['user']] += 1
    users_to_consider = user_pop.argsort()[::-1][:num_users]

    #Get user tag pairs to filter and random tags
    user_to_hidden_tags, random_tags = \
            user_tag_pairs_to_filter(users_to_consider, base_annotations,
                                     perc_tags)

    #Create estimator
    filtered_annotations = FilteredAnnotations(user_to_hidden_tags)
    annotations = filtered_annotations.annotations(base_annotations)
    if estimator == 'smooth':
        est = create_smooth_estimator(annotations)
    elif estimator == 'lda':
        est = create_lda_estimator(annotations, len(item_ids), len(tag_ids))
    else:
        raise Exception('Unknown estimator, please choose from {lda, smooth}')

    #This next line is needed to create a new generator
    annotations = filtered_annotations.annotations(base_annotations)
    value_calculator = ValueCalculator(est, annotations)

    #Run experiment!
    annotations = filtered_annotations.annotations(base_annotations)
    user_to_item = create_occurrence_index(annotations, 'user', 'item')
    for user in users_to_consider:
        #Candidates are the items this user did not annotate in the
        #filtered data. The index is keyed by user, so it has to be looked
        #up with the user id and not with the item id.
        annotated_by_user = user_to_item.get(user, ())
        gamma_items = [item for item in xrange(len(item_ids)) \
                       if item not in annotated_by_user]

        tags_hidden = user_to_hidden_tags[user]
        run_one_user(user, value_calculator, gamma_items, tags_hidden, \
                     random_tags, output_folder)
def test_occurence_index_user_to_item(self):
    '''The user -> item index groups the distinct items of each user.'''
    #The last two fields have no impact on this test, so both of them
    #receive the same placeholder value.
    filler = 1
    user_item_pairs = [(1, 1), (1, 2), (1, 1), (2, 2), (2, 3)]

    annots = []
    for user, item in user_item_pairs:
        annots.append(data_parser.to_json(user, item, filler, filler))

    index = create_occurrence_index(annots, 'user', 'item')

    #The repeated (1, 1) pair must show up only once
    self.assertEqual(index[1], set([1, 2]))
    self.assertEqual(index[2], set([2, 3]))
def test_occurence_index_user_to_item(self):
    '''The user -> item index groups the distinct items of each user.'''
    #The last two fields have no impact on this test, so both of them
    #receive the same placeholder value.
    filler = 1
    user_item_pairs = [(1, 1), (1, 2), (1, 1), (2, 2), (2, 3)]

    annots = []
    for user, item in user_item_pairs:
        annots.append(data_parser.to_json(user, item, filler, filler))

    index = create_occurrence_index(annots, 'user', 'item')

    #The repeated (1, 1) pair must show up only once
    self.assertEqual(index[1], set([1, 2]))
    self.assertEqual(index[2], set([2, 3]))
def main(library_thing_annotations_fpath, output_folder, num_users=20,
         perc_tags=.1, estimator='lda', num_random_tags=100, rand_seed=None):
    '''
    Runs the tag experiment for the users with the most annotations,
    delegating the work for each user to `run_one_user`.

    Arguments
    ---------
    library_thing_annotations_fpath: str
        Path handed to `create_annotations`.
    output_folder: str
        Folder which receives the output. It must exist and be empty
        (both are asserted).
    num_users: int
        How many of the users with the most annotations to consider.
    perc_tags: float
        Forwarded to `user_tag_pairs_to_filter`.
    estimator: str
        Either 'smooth' or 'lda'. Any other value raises `Exception`.
    num_random_tags: int
        Not read anywhere in this function.
    rand_seed: object
        Forwarded to `seed`.
    '''
    seed(rand_seed)

    #Basic asserts for the folder
    assert os.path.isdir(output_folder)
    assert len(os.listdir(output_folder)) == 0

    #Load LT file
    base_annotations, user_ids, item_ids, tag_ids = \
            create_annotations(library_thing_annotations_fpath)

    #Get most popular users
    user_pop = np.zeros(len(user_ids))
    for annot in base_annotations:
        user_pop[annot['user']] += 1
    users_to_consider = user_pop.argsort()[::-1][:num_users]

    #Get user tag pairs to filter and random tags
    user_to_hidden_tags, random_tags = \
            user_tag_pairs_to_filter(users_to_consider, base_annotations,
                                     perc_tags)

    #Create estimator
    filtered_annotations = FilteredAnnotations(user_to_hidden_tags)
    annotations = filtered_annotations.annotations(base_annotations)
    if estimator == 'smooth':
        est = create_smooth_estimator(annotations)
    elif estimator == 'lda':
        est = create_lda_estimator(annotations, len(item_ids), len(tag_ids))
    else:
        raise Exception('Unknown estimator, please choose from {lda, smooth}')

    #This next line is needed to create a new generator
    annotations = filtered_annotations.annotations(base_annotations)
    value_calculator = ValueCalculator(est, annotations)

    #Run experiment!
    annotations = filtered_annotations.annotations(base_annotations)
    user_to_item = create_occurrence_index(annotations, 'user', 'item')
    for user in users_to_consider:
        #Candidates are the items this user did not annotate in the
        #filtered data. The index is keyed by user, so it has to be looked
        #up with the user id and not with the item id.
        annotated_by_user = user_to_item.get(user, ())
        gamma_items = [item for item in xrange(len(item_ids)) \
                       if item not in annotated_by_user]

        tags_hidden = user_to_hidden_tags[user]
        run_one_user(user, value_calculator, gamma_items, tags_hidden, \
                     random_tags, output_folder)
def generator():
    '''
    Yields parameters for each user

    Only users with at least 10 items are considered. The items of such a
    user are split at the middle of the list: the first half is yielded
    as the relevant items and the remainder as the annotated ones.
    '''
    with AnnotReader(database) as reader:
        reader.change_table(table)
        uitem_idx = index_creator.create_occurrence_index(
                reader.iterate(), 'user', 'item')

        for user in uitem_idx.iterkeys():
            user_items = list(uitem_idx[user])
            if len(user_items) < 10:
                continue

            middle = len(user_items) // 2
            yield (database, table, user,
                   user_items[:middle], user_items[middle:],
                   smooth_func, lambda_, user_profile_size, out_folder)
def generator():
    '''
    Yields parameters for each user

    Only users with at least 30 items are considered. The last
    `num_relevant` entries of the item list of such a user are yielded as
    the relevant items and everything before them as the annotated ones.
    '''
    with AnnotReader(database) as reader:
        reader.change_table(table)
        uitem_idx = index_creator.create_occurrence_index(
                reader.iterate(), 'user', 'item')

        for user in uitem_idx.iterkeys():
            user_items = list(uitem_idx[user])
            if len(user_items) < 30:
                continue

            split_at = len(user_items) - num_relevant
            yield (database, table, user,
                   user_items[split_at:], user_items[:split_at],
                   smooth_func, lambda_, user_profile_size, out_folder)
def run_exp(user_validation_tags, user_test_tags, user_test_items, est,
            annot_filter, reader):
    '''
    Prints, for every valid user and each of their non validation tags,
    the precision and recall of the items retrieved by the tag against the
    test items of the user.

    Arguments
    ---------
    user_validation_tags: mapping from user to the tags to leave out
    user_test_tags: mapping from user to tags; membership is reported in
        the `hidden` column
    user_test_items: mapping from user to the items taken as relevant
    est: object exposing `get_valid_users()` and `tags_for_user(user)`
    annot_filter: object exposing `annotations(iterable)`
    reader: object exposing `iterate()`

    One header line is printed first, followed by one line for each
    (user, tag) pair.
    '''
    #Validation tags are dropped. The script focuses on test tags
    user_to_tags = {}
    for user in est.get_valid_users():
        user_to_tags[user] = [tag for tag in est.tags_for_user(user)
                              if tag not in user_validation_tags[user]]

    tag_to_items = create_occurrence_index(
            annot_filter.annotations(reader.iterate()), 'tag', 'item')

    print('#user', 'tag', 'precision', 'recall', 'hidden')
    for user in est.get_valid_users():
        for tag in user_to_tags[user]:
            hidden = tag in user_test_tags[user]
            relevant = user_test_items[user]
            retrieved = tag_to_items[tag]

            num_hits = len(retrieved.intersection(relevant))
            precision = num_hits / len(retrieved)
            recall = num_hits / len(relevant)
            print(user, tag, precision, recall, hidden)