Пример #1
# Whole-database driver: builds the graph via create_graph, counts how many
# annotations carry each tag, calls compute_tag_values, and finally writes one
# "item tag" line per pair to <out_folder>/relevant_item.tags.
# NOTE(review): this snippet looks truncated. The signature below never closes
# and `out_folder` is used in the body without being declared in the visible
# parameters -- presumably it is on the missing line; confirm against the
# original file.
def real_main(database, table, smooth_func, lambda_,
    with AnnotReader(database) as reader:
        #Create Graph
        create_graph(reader.iterate(), out_folder)
        #Compute popularity
        # tag_pop maps each tag to the number of annotations that use it.
        tag_pop = collections.defaultdict(int)
        for annotation in reader.iterate():
            tag = annotation['tag']
            tag_pop[tag] += 1
        #Compute tag value
        # NOTE(review): the first line of the right-hand side of the next
        # assignment is missing; only its trailing 'tag', 'item' arguments
        # survive. Presumably a create_double_occurrence_index call -- confirm.
        tag_to_item, item_to_tag = \
                                                        'tag', 'item')
        compute_tag_values(smooth_func, lambda_,
                           reader.iterate(), tag_to_item, tag_pop, out_folder)

        with io.open(os.path.join(out_folder, 'relevant_item.tags'), 'w') as rel:
            rel.write(u'#ITEM TAG\n')
            for item in item_to_tag:
                # NOTE(review): item_to_tag is indexed with `tag`, not `item`.
                # On the first pass `tag` still holds the last value from the
                # popularity loop above; this looks like a bug -- the intended
                # key is presumably `item`. Confirm before relying on the output.
                for tag in item_to_tag[tag]:
                    rel.write(u'%d %d\n' %(item, tag))  
Пример #2
# Per-user driver: builds an estimator over the annotations selected by
# `query`, writes an `info` file and a `relevant_item.tags` file under
# <out_folder>/user_<user>, and calls compute_tag_values for the user.
# NOTE(review): several lines of this snippet are truncated (see the notes
# below), so it is not valid Python as shown.
def compute_for_user(database, table, user, relevant, annotated, 
                     smooth_func, lambda_, user_profile_size, out_folder):
    with AnnotReader(database) as reader:
        #Relevant items by user are left out with this query
        # The filter is an $or of "user differs from `user`" and "item is not
        # in `relevant`".
        # NOTE(review): the closing brackets of this literal are missing.
        query = {'$or' : [
                          { 'user':{'$ne'  : user} }, 
                          { 'item':{'$nin' : relevant} }
        #Probability estimator
        est = SmoothEstimator(smooth_func, lambda_, 
                              reader.iterate(query = query),
                              user_profile_size = user_profile_size)
        value_calc = value_calculator.ValueCalculator(est)
        fname = 'user_%d' % user
        user_folder = os.path.join(out_folder, fname)
        #Initial information
        with io.open(os.path.join(user_folder, 'info'), 'w') as info:
            info.write(u'#UID: %d\n' %user)
            relevant_str = ' '.join([str(i) for i in relevant])
            annotated_str = ' '.join([str(i) for i in annotated])
            # NOTE(review): both writes below are cut off after their first
            # format argument; presumably relevant_str / annotated_str follow.
            info.write(u'# %d relevant  items: %s\n' %(len(relevant), 
            info.write(u'# %d annotated items: %s\n' %(len(annotated), 
        #Create Graph
        # NOTE(review): `iterator` is not used by any visible line; it was
        # presumably an argument of the truncated call below -- confirm.
        iterator = reader.iterate(query = query)
        # NOTE(review): the head of the right-hand side is missing; only the
        # trailing 'tag', 'item' arguments survive.
        tag_to_item, item_to_tag = \
                                                         'tag', 'item')
        #Items to consider <-> Gamma items
        # Every item index from 0 to est.num_items() - 1 is considered.
        # `xrange` makes this Python 2 code.
        items_to_consider = set(xrange(est.num_items()))
        # NOTE(review): annotated_set is not read by any visible line.
        annotated_set = set(annotated)
        compute_tag_values(est, value_calc, tag_to_item, user, 
                           np.array([i for i in items_to_consider]))
        relevant_tags_fpath = os.path.join(user_folder, 'relevant_item.tags')
        # One "item tag" line for every tag of every relevant item.
        with io.open(relevant_tags_fpath, 'w') as rel:
            rel.write(u'#ITEM TAG\n')
            for item in relevant:
                for tag in item_to_tag[item]:
                    rel.write(u'%d %d\n' %(item, tag))
Пример #3
# Returns a pair (tags_to_remove, random_tags): tags_to_remove maps each user
# in users_to_consider to a list of candidate tags, and random_tags is a list
# meant to hold tags not used by any considered user.
# NOTE(review): this snippet is truncated. The signature continuation (which
# presumably declares num_random_tags, used below) and the docstring quotes
# are missing, and several statements have lost lines; see the notes below.
def user_tag_pairs_to_filter(users_to_consider, annotations, perc_tags=.1, \
    Gets use tag pairs to filter. Random tags are filtered if they are used
    by more than one user. This method also returns random tags to compute
    value for.

    user_to_tags, tags_to_user = create_double_occurrence_index(
        annotations, 'user', 'tag')

    #Generate candidate tags for removal, they have to be used by more than
    #one user.
    tags_to_remove = {}
    for user in users_to_consider:
        possible_tags = []
        for tag in user_to_tags[user]:
            # NOTE(review): the body of this `if` is incomplete; only the tail
            # `user)` of a call survives. No visible line ever appends to
            # possible_tags, so that statement is presumably missing as well.
            if len(tags_to_user[tag]) > 1:  #We only consider tags with >1 user
                    user)  #Remove this user from the count

        #num tags to remove for this user
        num_tags = int(perc_tags * len(user_to_tags[user]))

        #Generate random candidates
        # NOTE(review): the slice is taken before the shuffle, so the shuffle
        # only reorders the first num_tags entries and does not choose which
        # tags are picked. Confirm whether that is intended.
        candidate_tags = possible_tags[:num_tags]
        shuffle(candidate_tags)  #In place

        tags_to_remove[user] = candidate_tags

    #Generate Random tags
    # assumes tag ids are the integers 0..len(tags_to_user)-1 -- TODO confirm
    possible_tags = range(len(tags_to_user))
    random_tags = []

    for tag in possible_tags:
        used_or_hidden = False

        for user in users_to_consider:
            #gets tags not used by any considered user (hidden or not)
            if tag in user_to_tags[user] or tag in tags_to_remove[user]:
                used_or_hidden = True

        # NOTE(review): the bodies of the two `if` statements below are
        # missing (presumably an append to random_tags and a loop exit).
        if not used_or_hidden:

        if len(random_tags) == num_random_tags:

    return tags_to_remove, random_tags
Пример #4
# Returns a pair (tags_to_remove, random_tags): tags_to_remove maps each user
# in users_to_consider to a list of candidate tags, and random_tags is a list
# meant to hold tags not used by any considered user.
# NOTE(review): this snippet is truncated. The signature continuation (which
# presumably declares num_random_tags, used below) and the docstring quotes
# are missing, and two `if` statements have lost their bodies.
def user_tag_pairs_to_filter(users_to_consider, annotations, perc_tags=.1, \
    Gets use tag pairs to filter. Random tags are filtered if they are used
    by more than one user. This method also returns random tags to compute
    value for.
    user_to_tags, tags_to_user = create_double_occurrence_index(annotations, 
            'user', 'tag')
    #Generate candidate tags for removal, they have to be used by more than
    #one user.
    tags_to_remove = {}
    for user in users_to_consider:
        possible_tags = []
        for tag in user_to_tags[user]:
            # NOTE(review): no visible line appends to possible_tags, so as
            # shown it stays empty and every user gets an empty candidate
            # list; an append was presumably lost here -- confirm.
            if len(tags_to_user[tag]) > 1: #We only consider tags with >1 user
                tags_to_user[tag].remove(user) #Remove this user from the count
        #num tags to remove for this user
        num_tags = int(perc_tags * len(user_to_tags[user]))
        #Generate random candidates
        # NOTE(review): the slice is taken before the shuffle, so the shuffle
        # only reorders the first num_tags entries and does not choose which
        # tags are picked. Confirm whether that is intended.
        candidate_tags = possible_tags[:num_tags]
        shuffle(candidate_tags) #In place
        tags_to_remove[user] = candidate_tags
    #Generate Random tags
    # assumes tag ids are the integers 0..len(tags_to_user)-1 -- TODO confirm
    possible_tags = range(len(tags_to_user))
    random_tags = []
    for tag in possible_tags:
        used_or_hidden = False
        for user in users_to_consider:
            #gets tags not used by any considered user (hidden or not)
            if tag in user_to_tags[user] or tag in tags_to_remove[user]: 
                used_or_hidden = True
        # NOTE(review): the bodies of the two `if` statements below are
        # missing (presumably an append to random_tags and a loop exit).
        if not used_or_hidden:
        if len(random_tags) == num_random_tags:
    return tags_to_remove, random_tags
Пример #5
 def test_double_occurrence_index(self):
     """Index five annotations by user and tag and check both directions."""
     no_impact = 1
     # (first, third) arguments handed to to_json; the assertions below imply
     # these are the user and tag ids. One pair is deliberately repeated.
     pairs = [(1, 1), (1, 2), (1, 1), (2, 2), (2, 3)]
     annotations = [data_parser.to_json(user, no_impact, tag, no_impact)
                    for user, tag in pairs]

     from_to, inv = create_double_occurrence_index(annotations, 'user', 'tag')

     # user -> tags (the duplicate annotation collapses inside the set)
     expected_forward = {1: {1, 2}, 2: {2, 3}}
     for user in sorted(expected_forward):
         self.assertEqual(from_to[user], expected_forward[user])

     # tag -> users
     expected_inverse = {1: {1}, 2: {1, 2}, 3: {2}}
     for tag in sorted(expected_inverse):
         self.assertEqual(inv[tag], expected_inverse[tag])
Пример #6
    def test_double_occurrence_index(self):
        """The double occurrence index maps users to tags and tags to users."""
        no_impact = 1

        def annotation(first, third):
            # Only the first and third to_json arguments vary in this test.
            return data_parser.to_json(first, no_impact, third, no_impact)

        annotations = [
            annotation(1, 1),
            annotation(1, 2),
            annotation(1, 1),  # repeated on purpose: must not add a new entry
            annotation(2, 2),
            annotation(2, 3),
        ]
        from_to, inv = create_double_occurrence_index(annotations,
                                                      'user', 'tag')

        # Forward direction: user -> set of tags.
        self.assertEqual(from_to[1], {1, 2})
        self.assertEqual(from_to[2], {2, 3})

        # Inverse direction: tag -> set of users.
        self.assertEqual(inv[1], {1})
        self.assertEqual(inv[2], {1, 2})
        self.assertEqual(inv[3], {2})
Пример #7
def iedge_from_annotations(annotation_it, use=1, return_sink=True):
    """
    Returns the edge list for the navigational graph.

    The annotations are indexed in both directions between tags and the
    chosen destination field ('item' or 'user'), and the two indexes are
    handed to `iedge_from_indexes`, whose result is returned unchanged.

    Arguments
    ---------
    annotation_it: iterator
        Iterator to annotations to use
    use = int {1, 2}
        Indicates whether to use items or users:
            1: Items
            2: Users
    return_sink = bool (defaults to True)
        Tells whether to return tag to sink edges

    Raises
    ------
    KeyError
        If `use` is neither 1 nor 2.
    """
    choices = {1: 'item', 2: 'user'}
    # An unsupported `use` fails here, before the annotations are consumed.
    dest = choices[use]

    tag_index, sink_index = create_double_occurrence_index(
        annotation_it, 'tag', dest)
    return iedge_from_indexes(tag_index, sink_index, return_sink)
Пример #8
def iedge_from_annotations(annotation_it, use=1, return_sink = True):
    """
    Returns the edge list for the navigational graph.

    The annotations are indexed in both directions between tags and the
    chosen destination field ('item' or 'user'), and the two indexes are
    handed to `iedge_from_indexes`, whose result is returned unchanged.

    Arguments
    ---------
    annotation_it: iterator
        Iterator to annotations to use
    use = int {1, 2}
        Indicates whether to use items or users:
            1: Items
            2: Users
    return_sink = bool (defaults to True)
        Tells whether to return tag to sink edges

    Raises
    ------
    KeyError
        If `use` is neither 1 nor 2.
    """
    # Maps the `use` selector onto the annotation field used as destination.
    choices = {1:'item',
               2:'user'}
    # An unsupported `use` fails here, before the annotations are consumed.
    dest = choices[use]
    tag_index, sink_index = create_double_occurrence_index(annotation_it, 
                                                           'tag', dest)
    return iedge_from_indexes(tag_index, sink_index, return_sink)
Пример #9
# Returns a dict mapping every user found in the annotations to a list
# (`to_remove`) of that user's items selected for filtering.
# NOTE(review): this snippet is truncated. The rest of the signature
# (presumably `annotations` and `perc_items`, both used below) and the
# docstring quotes are missing, and the inner `if` has lost its body.
def get_user_item_pairs_to_filter(users_to_consider,
    Gets user item pairs to filter. A percentage (`perc_items`) is filtered for
    each user. 

    The code to guarantees that we do not delete items from the trace 
    completely, that is, while removing items for users we guarantee that
    at we do not make an item be annotated by zero users. Thus, this code does 
    not guarantee that exactly `perc_items` will be removed per user.

    # These two empty dicts are overwritten by the assignment right below.
    user_to_items = {}
    item_to_users = {}

    user_to_items, item_to_users = create_double_occurrence_index(
        annotations, 'user', 'item')

    user_item_pairs_to_filter = {}
    # NOTE(review): the loop runs over every user in the index;
    # users_to_consider is not read by any visible line.
    for user in user_to_items:

        #num items to remove for this user
        num_item = int(perc_items * len(user_to_items[user]))

        #Generate random candidates
        user_items = [item for item in user_to_items[user]]
        shuffle(user_items)  #in place shuffle

        to_remove = []
        for item in user_items[:num_item]:
            # NOTE(review): the body of this `if` is missing, so as shown
            # to_remove is never filled. Presumably it records the item and
            # removes the user from item_to_users[item] -- confirm.
            if len(item_to_users[item]) > 1:  #at least one user left

        user_item_pairs_to_filter[user] = to_remove

    return user_item_pairs_to_filter
# Returns a dict mapping every user found in the annotations to a list
# (`to_remove`) of that user's items selected for filtering.
# NOTE(review): this snippet is truncated. The rest of the signature
# (presumably `perc_items`, used below) and the docstring quotes are missing,
# and the inner `if` has lost its body.
def get_user_item_pairs_to_filter(users_to_consider, annotations, 
    Gets user item pairs to filter. A percentage (`perc_items`) is filtered for
    each user. 

    The code to guarantees that we do not delete items from the trace 
    completely, that is, while removing items for users we guarantee that
    at we do not make an item be annotated by zero users. Thus, this code does 
    not guarantee that exactly `perc_items` will be removed per user.
    # These two empty dicts are overwritten by the assignment right below.
    user_to_items = {}
    item_to_users = {}
    user_to_items, item_to_users = create_double_occurrence_index(
            annotations, 'user', 'item')
    user_item_pairs_to_filter = {}
    # NOTE(review): the loop runs over every user in the index;
    # users_to_consider is not read by any visible line.
    for user in user_to_items:

        #num items to remove for this user
        num_item = int(perc_items * len(user_to_items[user]))
        #Generate random candidates
        user_items = [item for item in user_to_items[user]]
        shuffle(user_items) #in place shuffle
        to_remove = []
        for item in user_items[:num_item]:
            # NOTE(review): the body of this `if` is missing, so as shown
            # to_remove is never filled. Presumably it records the item and
            # removes the user from item_to_users[item] -- confirm.
            if len(item_to_users[item]) > 1: #at least one user left

        user_item_pairs_to_filter[user] = to_remove
    return user_item_pairs_to_filter
Пример #11
# Per-user driver: builds an estimator and value calculator, writes an `info`
# file, creates the graph from the tag/item indexes, selects items to
# consider from the user's item estimates, calls compute_tag_values, and
# writes <user_folder>/relevant_item.tags.
# NOTE(review): several lines of this snippet are truncated (see the notes
# below), so it is not valid Python as shown.
def compute_for_user(database, table, user, relevant, annotated, smooth_func,
                     lambda_, user_profile_size, out_folder):
    with AnnotReader(database) as reader:

        #Relevant items by user are left out with this query
        # NOTE(review): the closing braces/brackets of this literal are
        # missing. The visible keys combine a '$ne' test on 'user' with a
        # '$nin' test on 'item' under '$or'.
        query = {
            '$or': [{
                'user': {
                    '$ne': user
            }, {
                'item': {
                    '$nin': relevant

        #Probability estimator
        # NOTE(review): this call is cut off after its first argument.
        est = SmoothEstimator(smooth_func,
        value_calc = value_calculator.ValueCalculator(est)

        fname = 'user_%d' % user
        user_folder = os.path.join(out_folder, fname)

        #Initial information
        # Writes the user id plus the counts and space-separated ids of the
        # relevant and annotated items.
        with io.open(os.path.join(user_folder, 'info'), 'w') as info:
            info.write(u'#UID: %d\n' % user)

            relevant_str = ' '.join([str(i) for i in relevant])
            annotated_str = ' '.join([str(i) for i in annotated])

            info.write(u'# %d relevant  items: %s\n' %
                       (len(relevant), str(relevant_str)))
            info.write(u'# %d annotated items: %s\n' %
                       (len(annotated), str(annotated_str)))

        #Create Graph
        tag_to_item, item_to_tag = \
            index_creator.create_double_occurrence_index(reader.iterate(query = query),
                                                         'tag', 'item')

        create_graph(tag_to_item, item_to_tag, user_folder)

        #Items to consider <-> Gamma items
        annotated_set = set(annotated)
        iestimates = value_calc.item_value(user)

        #Filter top 10
        # presumably iestimates is a numpy array, so argsort() returns item
        # indices ordered by estimate -- TODO confirm the sort direction
        top_vals = iestimates.argsort()
        items_to_consider = set()
        for item in top_vals:
            # NOTE(review): the bodies of both `if` statements are missing.
            # Presumably annotated items are skipped, other items are added to
            # items_to_consider, and the loop stops at 10 items -- confirm.
            if item in annotated_set:

            if len(items_to_consider) == 10:

        compute_tag_values(est, value_calc, tag_to_item, user, user_folder,
                           np.array([i for i in items_to_consider]))

        # One "item tag" line for every tag of every relevant item.
        with io.open(os.path.join(user_folder, 'relevant_item.tags'),
                     'w') as rel:
            rel.write(u'#ITEM TAG\n')
            for item in relevant:
                for tag in item_to_tag[item]:
                    rel.write(u'%d %d\n' % (item, tag))