def main(database, table, smooth_func, lambda_, min_tag_freq):
    '''
    Prints, one dict per line, the renormalized P(item | tag) for every
    item and every tag whose popularity is at least `min_tag_freq`
    (a negative `min_tag_freq` means: consider all tags).

    Parameters
    ----------
    database, table : annotation database/table read through `AnnotReader`
    smooth_func, lambda_ : smoothing parameters forwarded to `SmoothEstimator`
    min_tag_freq : int
        minimum tag popularity; negative disables the filter
    '''
    with AnnotReader(database) as reader:
        reader.change_table(table)

        #Builds value calculator
        estimator = SmoothEstimator(smooth_func, lambda_, reader.iterate())
        calculator = ValueCalculator(estimator)

        #Determine tags which will be considered
        if min_tag_freq < 0:
            #All tags
            tags_to_consider = range(estimator.num_tags())
        else:
            counter = Counter(annot['tag'] for annot in reader.iterate())
            tags_to_consider = [tag for tag, pop in counter.iteritems()
                                if pop >= min_tag_freq]

        #Dumps probabilities.
        #(Removed dead code: a `connection` that was never created but
        # "disconnected" in a finally clause, and a rebinding of the
        # `database` parameter to None that only shadowed the argument.)
        items = np.arange(estimator.num_items())
        for tag in tags_to_consider:
            v_prob_it = calculator.rnorm_prob_items_given_tag(tag, items)
            for item in xrange(len(v_prob_it)):
                prob = float(v_prob_it[item])
                print({'tag':tag, 'item':item, 'prob_it':prob})
def test_prob_user_given_item(self):
    self.__init_test(test.SMALL_DEL_FILE)
    #Jelinek-Mercer smoothing with lambda 0.5, full profile fraction
    estimator = SmoothEstimator('JM', 0.5, self.annots, 1)
    user_prob = estimator.prob_user_given_item(0, 0)
    self.assertTrue(user_prob > 0)
def test_prob_user_given_item_profsize(self):
    self.__init_test(test.SMALL_DEL_FILE)
    #With an empty user profile fraction the probability must vanish
    estimator = SmoothEstimator('JM', 0.5, self.annots,
                                user_profile_fract_size = 0)
    self.assertEquals(estimator.prob_user_given_item(0, 0), 0.0)
def compute_for_user(database, table, user, relevant, annotated, smooth_func,
                     lambda_, user_profile_size, out_folder):
    '''
    Computes tag values for a single user and dumps the results under
    ``out_folder/user_<user>``: an ``info`` header file, per-tag values
    (via `compute_tag_values`) and the relevant item/tag pairs.

    Parameters
    ----------
    database, table : annotation database/table read through `AnnotReader`
    user : int
        id of the target user
    relevant : sequence of int
        item ids held out as relevant for this user (their annotations by
        `user` are excluded from the training query)
    annotated : sequence of int
        item ids already annotated by the user; removed from gamma items
    smooth_func, lambda_, user_profile_size :
        parameters forwarded to `SmoothEstimator`
    out_folder : str
        existing directory in which the per-user folder is created
        (os.mkdir raises if it already exists)
    '''
    with AnnotReader(database) as reader:
        reader.change_table(table)

        #Relevant items by user are left out with this query
        query = {'$or' : [
                          { 'user':{'$ne' : user} },
                          { 'item':{'$nin' : relevant} }
                         ]
                }

        #Probability estimator, trained without the held-out annotations
        est = SmoothEstimator(smooth_func, lambda_,
                              reader.iterate(query = query),
                              user_profile_size = user_profile_size)
        value_calc = value_calculator.ValueCalculator(est)

        fname = 'user_%d' % user
        user_folder = os.path.join(out_folder, fname)
        os.mkdir(user_folder)

        #Initial information
        with io.open(os.path.join(user_folder, 'info'), 'w') as info:
            info.write(u'#UID: %d\n' %user)

            relevant_str = ' '.join([str(i) for i in relevant])
            annotated_str = ' '.join([str(i) for i in annotated])

            info.write(u'# %d relevant items: %s\n' %(len(relevant),
                                                      str(relevant_str)))
            info.write(u'# %d annotated items: %s\n' %(len(annotated),
                                                       str(annotated_str)))

        #Create Graph: occurrence indices tag->items and item->tags
        iterator = reader.iterate(query = query)
        tag_to_item, item_to_tag = \
            index_creator.create_double_occurrence_index(iterator,
                                                         'tag', 'item')

        #Items to consider <-> Gamma items: every item except those the
        #user already annotated
        items_to_consider = set(xrange(est.num_items()))
        annotated_set = set(annotated)
        items_to_consider.difference_update(annotated_set)

        compute_tag_values(est, value_calc, tag_to_item, user,
                           user_folder,
                           np.array([i for i in items_to_consider]))

        #Dump the tags attached to each relevant item
        relevant_tags_fpath = os.path.join(user_folder, 'relevant_item.tags')
        with io.open(relevant_tags_fpath, 'w') as rel:
            rel.write(u'#ITEM TAG\n')
            for item in relevant:
                for tag in item_to_tag[item]:
                    rel.write(u'%d %d\n' %(item, tag))
def test_prob_user_given_item_profsize(self):
    self.__init_test(test.SMALL_DEL_FILE)
    smooth = 'JM'
    jm_lambda = 0.5
    est = SmoothEstimator(smooth, jm_lambda, self.annots,
                          user_profile_fract_size=0)
    #Zero profile fraction forces a zero user-given-item probability
    prob = est.prob_user_given_item(0, 0)
    self.assertEquals(prob, 0.0)
def test_prob_item(self):
    self.__init_test(test.SMALL_DEL_FILE)
    est = SmoothEstimator('JM', 0.5, self.annots, 1)

    #Item probabilities: per-item annotation counts over the 10 total
    expected = [5 / 10, 1 / 10, 2 / 10, 1 / 10, 1 / 10]
    for item, item_prob in enumerate(expected):
        self.assertAlmostEquals(est.prob_item(item), item_prob)
def test_prob_item_given_tag(self):
    self.__init_test(test.SMALL_DEL_FILE)
    est = SmoothEstimator('Bayes', 0.3, self.annots, 1)
    gamma_items = np.array([0, 1, 2, 3, 4])

    for tag in [0, 1, 2, 3, 4, 5]:
        #Manually build P(i) and P(t|i)P(i) over the gamma items
        priors = []
        joints = []
        for item in [0, 1, 2, 3, 4]:
            item_prob = est.prob_item(item)
            tag_prob = est.prob_tag_given_item(item, tag)
            priors.append(item_prob)
            joints.append(tag_prob * item_prob)

        #Renormalize both vectors and compare against the estimator
        priors = np.array(priors)
        priors /= priors.sum()
        joints = np.array(joints)
        joints /= joints.sum()

        assert_array_almost_equal(priors, est.prob_items(gamma_items))
        assert_array_almost_equal(
            joints, est.prob_items_given_tag(tag, gamma_items))

        self.assertAlmostEqual(1, sum(est.prob_items(gamma_items)))
        self.assertAlmostEqual(
            1, sum(est.prob_items_given_tag(tag, gamma_items)))
def test_gamma_items_prob_items(self):
    self.__init_test(test.SMALL_DEL_FILE)
    est = SmoothEstimator('Bayes', 0.3, self.annots, 1)

    #Restricting to gamma items {1, 2} must renormalize over those two
    p1 = est.prob_item(1)
    p2 = est.prob_item(2)
    gamma_probs = est.prob_items(np.array([1, 2]))

    self.assertEqual(gamma_probs[0], p1 / (p1 + p2))
    self.assertEqual(gamma_probs[1], p2 / (p1 + p2))
def test_prob_item_given_tag(self):
    self.__init_test(test.SMALL_DEL_FILE)
    prob_est = SmoothEstimator('Bayes', 0.3, self.annots, 1)
    candidate_items = [0, 1, 2, 3, 4]
    gamma_items = np.array(candidate_items)

    for tag in xrange(6):
        #Expected renormalized P(i) and P(i|t) over the gamma items
        prior = np.array([prob_est.prob_item(i) for i in candidate_items])
        joint = np.array([prob_est.prob_tag_given_item(i, tag) *
                          prob_est.prob_item(i) for i in candidate_items])
        prior /= prior.sum()
        joint /= joint.sum()

        assert_array_almost_equal(prior, prob_est.prob_items(gamma_items))
        assert_array_almost_equal(
            joint, prob_est.prob_items_given_tag(tag, gamma_items))
        self.assertAlmostEqual(1, sum(prob_est.prob_items(gamma_items)))
        self.assertAlmostEqual(
            1, sum(prob_est.prob_items_given_tag(tag, gamma_items)))
def test_tag_given_item(self):
    self.__init_test(test.SMALL_DEL_FILE)
    lamb = 0.5
    est = SmoothEstimator('JM', lamb, self.annots, 1)

    #Expected P(t=0 | i) per item, computed straight from the counts
    expected = [jelinek_mercer(2, 5, 3, 10, lamb),
                jelinek_mercer(0, 5, 3, 10, lamb),
                jelinek_mercer(1, 2, 3, 10, lamb),
                jelinek_mercer(0, 5, 3, 10, lamb),
                jelinek_mercer(0, 5, 3, 10, lamb)]

    for item, exp_prob in enumerate(expected):
        self.assertEquals(est.prob_tag_given_item(item, 0), exp_prob)
def real_main(database, table, smooth_func, lambda_, user):
    '''
    Prints every (item, value) pair of the item value estimates
    computed for `user` from the given annotation table.
    '''
    with AnnotReader(database) as reader:
        reader.change_table(table)
        estimator = SmoothEstimator(smooth_func, lambda_, reader.iterate())
        calculator = value_calculator.ValueCalculator(estimator)

        item_values = calculator.item_value(user)
        for item, value in item_values.iteritems():
            print(item, value)
def compute_for_user(database, table, user, relevant, annotated, smooth_func,
                     lambda_, user_profile_size, out_folder):
    '''
    Computes, for one user and their held-out relevant items, the
    renormalized P(i|u), personalized tag values and P(i|u,t) curves, and
    writes them as point files under ``out_folder/user_<user>``.

    Parameters
    ----------
    database, table : annotation database/table read through `AnnotReader`
    user : int
        id of the target user
    relevant : sequence of int
        held-out relevant item ids; these form the gamma items
    annotated : sequence of int
        item ids the user annotated (only reported in the info file here)
    smooth_func, lambda_, user_profile_size :
        parameters forwarded to `SmoothEstimator`
    out_folder : str
        existing directory in which the per-user folder is created
    '''
    with AnnotReader(database) as reader:
        reader.change_table(table)

        #Relevant items by user are left out with this query
        query = {'$or' : [
                          { 'user':{'$ne' : user} },
                          { 'item':{'$nin' : relevant} }
                         ]
                }

        #Probability estimator, trained without the held-out annotations
        est = SmoothEstimator(smooth_func, lambda_,
                              reader.iterate(query=query),
                              user_profile_size = user_profile_size)
        value_calc = value_calculator.ValueCalculator(est)

        fname = 'user_%d' % user
        user_folder = os.path.join(out_folder, fname)
        os.mkdir(user_folder)

        #Initial information
        with open(os.path.join(user_folder, 'info'), 'w') as info:
            print('#UID: %d' %user, file=info)

            relevant_str = ' '.join([str(i) for i in relevant])
            annotated_str = ' '.join([str(i) for i in annotated])

            print('#%d relevant: %s' %(len(relevant), str(relevant_str)),
                  file=info)
            print('#%d annotated: %s' %(len(annotated), str(annotated_str)),
                  file=info)

        #Gamma items = the held-out relevant items ('l' -> C long dtype)
        items = np.array(relevant, dtype='l')

        #P(i|u) renormalized over the gamma items
        v_piu = value_calc.rnorm_prob_items_given_user(user, items)
        #Personalized tag value over the same gamma items
        v_dkl = value_calc.tag_value_personalized(user, gamma_items=items)

        v_dkl_argsort = v_dkl.argsort()
        #NOTE(review): argsort is ascending, so `top_5_tags` holds the five
        #LOWEST-valued tags and `bottom_5_tags` the five highest — confirm
        #this naming matches the intended semantics
        top_5_tags = v_dkl_argsort[:5]
        bottom_5_tags = v_dkl_argsort[len(v_dkl) - 5:]

        write_points_file(v_piu, os.path.join(user_folder, 'v_piu.dat'))
        write_points_file(v_dkl, os.path.join(user_folder, 'v_dkl.dat'))

        #P(i|u,t) curves for each selected tag
        for i, tag in enumerate(top_5_tags):
            v_pitu = value_calc.rnorm_prob_items_given_user_tag(user, tag,
                                                                items)
            write_points_file(v_pitu,
                    os.path.join(user_folder,
                                 'v_pitu_tag_%d_top_%d.dat' % (tag, i + 1)))

        for i, tag in enumerate(bottom_5_tags):
            v_pitu = value_calc.rnorm_prob_items_given_user_tag(user, tag,
                                                                items)
            write_points_file(v_pitu,
                    os.path.join(user_folder,
                                 'v_pitu_tag_%d_bottom_%d.dat' % (tag, 5 - i)))
def test_bayes(self):
    self.__init_test(test.SMALL_DEL_FILE)
    lamb = 0.3
    est = SmoothEstimator('Bayes', lamb, self.annots, 1)

    #Expected P(t | i=0) for every tag, straight from the raw counts
    expected = [bayes(2, 5, 3, 10, lamb),
                bayes(1, 5, 3, 10, lamb),
                bayes(0, 5, 1, 10, lamb),
                bayes(1, 5, 1, 10, lamb),
                bayes(1, 5, 1, 10, lamb),
                bayes(0, 5, 1, 10, lamb)]

    for tag, exp_prob in enumerate(expected):
        self.assertAlmostEquals(est.prob_tag_given_item(0, tag), exp_prob)
def compute_tag_values(smooth_func, lambda_, annotation_it, tag_to_item,
                       tag_pops, out_folder):
    '''
    Writes one line per tag to ``out_folder/tag.values`` containing:
    tag id, tag popularity, raw tag value, mean renormalized item
    probability over the tag's items, and their product.
    '''
    estimator = SmoothEstimator(smooth_func, lambda_, annotation_it)
    calculator = value_calculator.ValueCalculator(estimator)
    tag_value = calculator.tag_value_item_search()

    out_fpath = os.path.join(out_folder, 'tag.values')
    with io.open(out_fpath, 'w') as values:
        for tag, tag_val in enumerate(tag_value):
            tagged_items = np.array([item for item in tag_to_item[tag]])
            mean_prob = calculator.rnorm_prob_items(tagged_items).mean()
            values.write(u'%d %d %.15f %.15f %.15f\n' %
                         (tag, tag_pops[tag], tag_val, mean_prob,
                          tag_val * mean_prob))
def create_bayes_estimator(annotations, lambda_, user_profile_fract_size=.4):
    '''
    Creates smooth estimator with the best Bayes parameter described in [1]_

    References
    ----------
    [1]_ Personalization of Tagging Systems, Wang, Jun, Clements Maarten,
    Yang J., de Vries Arjen P., and Reinders Marcel J. T.,
    Information Processing and Management, Volume 46, Issue 1, p.58-70, (2010)
    '''
    return SmoothEstimator('Bayes', lambda_, annotations,
                           user_profile_fract_size)
def main(database, table, smooth_func, lambda_, alpha, output_folder,
         min_tag_freq=1):
    '''
    Builds a synthetic seeker profile (averaged zipf samples) over all
    items and dumps three files in ``output_folder``: ``tag.values`` (tag
    value estimates), ``item_tag.pairs`` (tag/item co-occurrences) and
    ``item.probs`` (the seeker profile probabilities).

    Parameters
    ----------
    database, table : annotation database/table read through `AnnotReader`
    smooth_func, lambda_ : smoothing parameters for `SmoothEstimator`
    alpha : float
        zipf exponent used to sample the synthetic seeker profile
    output_folder : str
        existing directory to write the three output files to
    min_tag_freq : int
        minimum tag popularity forwarded to `fetch_tags_and_items`
    '''
    assert os.path.isdir(
        output_folder), '%s is not a directory' % output_folder

    tag_value_fpath = os.path.join(output_folder, 'tag.values')
    item_tag_fpath = os.path.join(output_folder, 'item_tag.pairs')
    item_probs_fpath = os.path.join(output_folder, 'item.probs')

    with AnnotReader(database) as reader:
        reader.change_table(table)

        #Determine the items annotated by each tag and array of all items
        items_array, tags_array, tag_to_item, tag_pop = \
            fetch_tags_and_items(reader, min_tag_freq)

        #Generates user profile based on zipf and computes value
        n_items = items_array.shape[0]
        seeker_profile = np.zeros(n_items, dtype='float64')
        n_dists = 10
        for i in xrange(n_dists):
            seeker_profile += np.random.zipf(alpha, n_items)

        #Average it out and transform to probabilities
        seeker_profile /= n_dists
        seeker_profile /= seeker_profile.sum()

        #Tag Value
        estimator = SmoothEstimator(smooth_func, lambda_, reader.iterate())
        with open(tag_value_fpath, 'w') as tag_value_file:
            tag_values(estimator, tags_array, items_array, tag_to_item,
                       seeker_profile, tag_pop, tag_value_file)

        #Item tag pairs
        with open(item_tag_fpath, 'w') as item_tag_file:
            print('#tag_id', 'item_id', file=item_tag_file)
            for tag_id in tag_to_item:
                for item_id in tag_to_item[tag_id]:
                    print(tag_id, item_id, file=item_tag_file)

        #Seeker profile probabilities, one item per line
        with open(item_probs_fpath, 'w') as item_probs_file:
            print('#item_id', 'prob', file=item_probs_file)
            for item_id, prob in enumerate(seeker_profile):
                print(item_id, prob, file=item_probs_file)
def test_prob_items_given_user_and_tag(self):
    self.__init_test(test.SMALL_DEL_FILE)
    est = SmoothEstimator('Bayes', 0.3, self.annots, 1)
    gamma_items = np.array([0, 1, 2, 3, 4])

    for user in xrange(3):
        for tag in xrange(6):
            #Manually compute P(i|u) and P(i|u,t) over the gamma items
            pius = []
            pitus = []
            for item in [0, 1, 2, 3, 4]:
                pi = est.prob_item(item)
                pti = est.prob_tag_given_item(item, tag)
                pui = est.prob_user_given_item(item, user)
                pius.append(pui * pi)
                pitus.append(pti * pui * pi)

            sum_pius = sum(pius)
            sum_pitus = sum(pitus)
            norm_pius = [p / sum_pius for p in pius]
            norm_pitus = [p / sum_pitus for p in pitus]

            #Assert
            assert_array_almost_equal(
                norm_pius, est.prob_items_given_user(user, gamma_items))
            assert_array_almost_equal(
                norm_pitus,
                est.prob_items_given_user_tag(user, tag, gamma_items))

            self.assertAlmostEqual(
                1, sum(est.prob_items_given_user(user, gamma_items)))
            self.assertAlmostEqual(
                1, sum(est.prob_items_given_user_tag(user, tag,
                                                     gamma_items)))
def test_prob_items_given_user_and_tag(self):
    self.__init_test(test.SMALL_DEL_FILE)
    smooth = 'Bayes'
    bayes_lambda = 0.3
    estimator = SmoothEstimator(smooth, bayes_lambda, self.annots, 1)

    items = [0, 1, 2, 3, 4]
    gamma_items = np.array(items)

    for user in [0, 1, 2]:
        for tag in [0, 1, 2, 3, 4, 5]:
            #Joint probabilities before renormalization
            joint_u = []
            joint_ut = []
            for item in items:
                p_item = estimator.prob_item(item)
                p_tag = estimator.prob_tag_given_item(item, tag)
                p_user = estimator.prob_user_given_item(item, user)
                joint_u.append(p_user * p_item)
                joint_ut.append(p_tag * p_user * p_item)

            total_u = sum(joint_u)
            total_ut = sum(joint_ut)
            expect_piu = [p / total_u for p in joint_u]
            expect_pitu = [p / total_ut for p in joint_ut]

            #Assert against the estimator's renormalized vectors
            assert_array_almost_equal(
                expect_piu,
                estimator.prob_items_given_user(user, gamma_items))
            assert_array_almost_equal(
                expect_pitu,
                estimator.prob_items_given_user_tag(user, tag, gamma_items))
            self.assertAlmostEqual(
                1, sum(estimator.prob_items_given_user(user, gamma_items)))
            self.assertAlmostEqual(
                1, sum(estimator.prob_items_given_user_tag(user, tag,
                                                           gamma_items)))
def compute_for_user(database, table, user, relevant, annotated, smooth_func,
                     lambda_, user_profile_size, out_folder):
    '''
    Computes tag values for a single user, restricted to ten candidate
    gamma items picked from the user's item value estimates, and dumps
    the results under ``out_folder/user_<user>`` (info file, tag/item
    graph, tag values and relevant item/tag pairs).

    Parameters
    ----------
    database, table : annotation database/table read through `AnnotReader`
    user : int
        id of the target user
    relevant : sequence of int
        item ids held out as relevant for this user (their annotations by
        `user` are excluded from the training query)
    annotated : sequence of int
        item ids already annotated by the user; skipped when picking the
        ten gamma items
    smooth_func, lambda_, user_profile_size :
        parameters forwarded to `SmoothEstimator`
    out_folder : str
        existing directory in which the per-user folder is created
    '''
    with AnnotReader(database) as reader:
        reader.change_table(table)

        #Relevant items by user are left out with this query
        query = {
            '$or': [{
                'user': {
                    '$ne': user
                }
            }, {
                'item': {
                    '$nin': relevant
                }
            }]
        }

        #Probability estimator, trained without the held-out annotations
        est = SmoothEstimator(smooth_func,
                              lambda_,
                              reader.iterate(query=query),
                              user_profile_size=user_profile_size)
        value_calc = value_calculator.ValueCalculator(est)

        fname = 'user_%d' % user
        user_folder = os.path.join(out_folder, fname)
        os.mkdir(user_folder)

        #Initial information
        with io.open(os.path.join(user_folder, 'info'), 'w') as info:
            info.write(u'#UID: %d\n' % user)

            relevant_str = ' '.join([str(i) for i in relevant])
            annotated_str = ' '.join([str(i) for i in annotated])

            info.write(u'# %d relevant items: %s\n' %
                       (len(relevant), str(relevant_str)))
            info.write(u'# %d annotated items: %s\n' %
                       (len(annotated), str(annotated_str)))

        #Create Graph: occurrence indices and the tag/item graph file
        tag_to_item, item_to_tag = \
            index_creator.create_double_occurrence_index(
                reader.iterate(query = query), 'tag', 'item')
        create_graph(tag_to_item, item_to_tag, user_folder)

        #Items to consider <-> Gamma items
        annotated_set = set(annotated)
        iestimates = value_calc.item_value(user)

        #Filter top 10
        #NOTE(review): argsort is ascending, so this walks items from the
        #LOWEST estimated value upwards — confirm "top" is the intent
        top_vals = iestimates.argsort()
        items_to_consider = set()
        for item in top_vals:
            if item in annotated_set:
                continue

            items_to_consider.add(item)
            if len(items_to_consider) == 10:
                break

        compute_tag_values(est, value_calc, tag_to_item, user, user_folder,
                           np.array([i for i in items_to_consider]))

        #Dump the tags attached to each relevant item
        with io.open(os.path.join(user_folder, 'relevant_item.tags'),
                     'w') as rel:
            rel.write(u'#ITEM TAG\n')
            for item in relevant:
                for tag in item_to_tag[item]:
                    rel.write(u'%d %d\n' % (item, tag))
def build_value_calculator(self, annots, smooth_func, lambda_):
    '''Returns an estimator/value-calculator pair built from `annots`.'''
    estimator = SmoothEstimator(smooth_func, lambda_, annots, 1)
    calculator = value_calculator.ValueCalculator(estimator, annots)
    return estimator, calculator