示例#1
0
class SRRecommendation(tdb_cassandra.View):
    """View holding recommended subreddits for a subreddit.

    Judging by ``_to_recs``, the columns of a row map a rank to the id36 of
    a recommended subreddit.
    """

    _use_db = True

    _compare_with = tdb_cassandra.LongType()

    # Short TTL on purpose: rows should disappear when no run has happened
    # lately, or when the last N runs produced no recommendations for a
    # given subreddit.
    _ttl = timedelta(days=2)

    # partial TTLs are expected here, so don't warn about them
    _warn_on_partial_ttl = False

    @classmethod
    def for_sr(cls, srid36, count=5):
        """
        Return at most ``count`` subreddit ID36s recommended for the sr whose
        id36 is passed. The sr's own id36 is never part of the result.
        """
        # ask for one column more than needed so that dropping srid36 itself
        # can still leave ``count`` results
        query = tdb_cassandra.ColumnQuery(cls, [srid36],
                                          column_count=count + 1,
                                          column_reversed=True)

        recommended = []
        for column in query:
            rec_id36 = column.values()[0]
            if rec_id36 != srid36:
                recommended.append(rec_id36)

        return recommended[:count]

    def _to_recs(self):
        """Return this row's recommended srids, ordered by ascending rank."""
        by_rank = self._values()  # {rank: srid}
        # ranks are compared numerically, whatever type they are stored as
        ranked = sorted(by_rank.items(), key=lambda pair: int(pair[0]))
        return [pair[1] for pair in ranked]
示例#2
0
class SRRecommendation(tdb_cassandra.View):
    """View holding recommended subreddits, one row per (source, subreddit).

    Row keys have the form ``'<source>.<id36>'`` (see ``for_srs``); the
    columns of a row map a rank to the id36 of a recommended subreddit.
    """

    _use_db = True

    _compare_with = tdb_cassandra.LongType()

    # don't keep these around if a run hasn't happened lately, or if the last
    # N runs didn't generate recommendations for a given subreddit
    _ttl = timedelta(days=7, hours=12)

    # we know that we mess with these but it's okay
    _warn_on_partial_ttl = False

    @classmethod
    def for_srs(cls, srid36, to_omit, count, source, match_set=True):
        """Return up to ``count`` recommended id36s for the given sr(s).

        It's usually better to use get_recommendations() than to call this
        function directly because it does privacy filtering.

        Arguments:
        srid36    -- id36(s) to get recommendations for; passed through tup()
        to_omit   -- iterable of id36s that must not appear in the result
        count     -- maximum number of id36s to return
        source    -- row key prefix identifying the recommendation source
        match_set -- if true, order recs by how many of the input srs they
                     were recommended for; otherwise interleave the
                     per-sr recs round-robin

        Returns a list of id36s without duplicates, excluding both the
        input srs and everything in ``to_omit``.
        """
        srid36s = tup(srid36)
        # work on a copy so the caller's collection is never mutated
        excluded = set(to_omit)
        excluded.update(srid36s)  # don't show the originals
        # a distinct loop name keeps the ``srid36`` argument from being
        # rebound (list comprehension variables leak in Python 2)
        rowkeys = ['%s.%s' % (source, sr_id36) for sr_id36 in srid36s]

        # fetch multiple sets of recommendations, one for each input srid36
        rows = cls._byID(rowkeys, return_dict=False)

        if match_set:
            counted_recs = cls._merge_and_sort_by_count(rows)
            # heuristic: if input set is large, rec should match more than
            # one. With fewer than 10 inputs min_count is 0, so every rec
            # passes.
            min_count = math.floor(.1 * len(srid36s))
            candidates = (rec[0] for rec in counted_recs
                          if rec[1] > min_count)
        else:
            candidates = cls._merge_roundrobin(rows)

        # remove duplicates and ids listed in to_omit, keeping order
        filtered = []
        for id36 in candidates:
            if id36 not in excluded:
                filtered.append(id36)
                excluded.add(id36)
        return filtered[:count]

    @classmethod
    def _merge_roundrobin(cls, rows):
        """Combine multiple sets of recs, preserving order.

        Picks items equally from each input sr, which can be useful for
        getting a diverse set of recommendations instead of one that matches
        a theme. Preserves ordering, so all rank 1 recs will be listed first,
        then all rank 2, etc.

        Returns the id36s as produced by roundrobin(); the result may
        contain duplicates.

        """
        return roundrobin(*[row._values().itervalues() for row in rows])

    @classmethod
    def _merge_and_sort_by_count(cls, rows):
        """Combine and sort multiple sets of recs.

        Combines multiple sets of recs and sorts by number of times each rec
        appears, the reasoning being that an item recommended for several of
        the original srs is more likely to match the "theme" of the set.

        Returns a list of ``(id36, count, rank)`` tuples, where ``count`` is
        the number of rows the rec appeared in and ``rank`` is the largest
        rank value it was given in any of them. The list is ordered by
        ``count`` descending, ties by ``rank`` ascending.

        """
        # combine recs from all input srs
        rank_id36_pairs = chain.from_iterable(row._values().iteritems()
                                              for row in rows)
        ranks_by_id36 = defaultdict(list)
        for rank, id36 in rank_id36_pairs:
            ranks_by_id36[id36].append(rank)
        # the loop variable must not reuse the dict's name: in Python 2 it
        # would leak out of the comprehension and rebind the dict
        recs = [(id36, len(rec_ranks), max(rec_ranks))
                for id36, rec_ranks in ranks_by_id36.iteritems()]
        # first, sort ascending by rank
        recs.sort(key=itemgetter(2))
        # next, sort descending by number of times the rec appeared. since
        # python sort is stable, tied items will still be ordered by rank
        recs.sort(key=itemgetter(1), reverse=True)
        return recs
示例#3
0
class SRRecommendation(tdb_cassandra.View):
    """View holding recommended subreddits, one row per (source, subreddit).

    Row keys have the form ``'<source>.<id36>'`` (see ``for_srs``); the
    columns of a row map a rank to the id36 of a recommended subreddit.
    """

    _use_db = True

    _compare_with = tdb_cassandra.LongType()

    # don't keep these around if a run hasn't happened lately, or if the last
    # N runs didn't generate recommendations for a given subreddit
    _ttl = timedelta(days=7, hours=12)

    # we know that we mess with these but it's okay
    _warn_on_partial_ttl = False

    @classmethod
    def for_srs(cls, srid36, to_omit, count=10, source=SRC_MULTIREDDITS):
        """Return up to ``count`` recommended id36s for the given sr(s).

        It's usually better to use get_recommendations() than to call this
        function directly because it does privacy filtering.

        Arguments:
        srid36  -- id36(s) to get recommendations for; passed through tup()
        to_omit -- iterable of id36s that must not appear in the result
        count   -- maximum number of id36s to return
        source  -- row key prefix identifying the recommendation source

        Returns a list of id36s without duplicates, excluding both the
        input srs and everything in ``to_omit``. Recs that were recommended
        for more of the input srs come first.
        """
        srid36s = tup(srid36)
        # work on a copy so the caller's collection is never mutated
        excluded = set(to_omit)
        excluded.update(srid36s)  # don't show the originals
        # a distinct loop name keeps the ``srid36`` argument from being
        # rebound (list comprehension variables leak in Python 2)
        rowkeys = ['%s.%s' % (source, sr_id36) for sr_id36 in srid36s]

        # fetch multiple sets of recommendations, one for each input srid36
        # NOTE(review): sgm() is given g.cache and the 'srr.' prefix, so this
        # is presumably a cached multi-get -- confirm against sgm()
        rows_by_key = sgm(g.cache, rowkeys, SRRecommendation._byID,
                          prefix='srr.')
        rows = rows_by_key.values()

        counted_recs = SRRecommendation._merge_and_sort_by_count(rows)

        # heuristic: if the input set is large, rec should match more than
        # one. With fewer than 10 inputs min_count is 0, so every rec passes.
        min_count = math.floor(.1 * len(srid36s))
        candidates = (rec[0] for rec in counted_recs if rec[1] > min_count)

        # remove duplicates and ids listed in to_omit, keeping order
        filtered = []
        for id36 in candidates:
            if id36 not in excluded:
                filtered.append(id36)
                excluded.add(id36)
        return filtered[:count]

    @classmethod
    def _merge_and_sort_by_count(cls, rows):
        """Combine and sort multiple sets of recs.

        Combines multiple sets of recs and sorts by number of times each rec
        appears, the reasoning being that an item recommended for several of
        the original srs is more likely to match the "theme" of the set.

        Returns a list of ``(id36, count, rank)`` tuples, where ``count`` is
        the number of rows the rec appeared in and ``rank`` is the largest
        rank value it was given in any of them. The list is ordered by
        ``count`` descending, ties by ``rank`` ascending.

        """
        # combine recs from all input srs, without first building a
        # throwaway argument list
        rank_id36_pairs = itertools.chain.from_iterable(
            row._values().iteritems() for row in rows)
        ranks_by_id36 = defaultdict(list)
        for rank, id36 in rank_id36_pairs:
            ranks_by_id36[id36].append(rank)
        # the loop variable must not reuse the dict's name: in Python 2 it
        # would leak out of the comprehension and rebind the dict
        recs = [(id36, len(rec_ranks), max(rec_ranks))
                for id36, rec_ranks in ranks_by_id36.iteritems()]
        # first, sort ascending by rank
        recs.sort(key=itemgetter(2))
        # next, sort descending by number of times the rec appeared. since
        # python sort is stable, tied items will still be ordered by rank
        recs.sort(key=itemgetter(1), reverse=True)
        return recs

    def _to_recs(self):
        """Return this row's recommended srids, ordered by ascending rank."""
        by_rank = self._values()  # {rank: srid}
        # ranks are compared numerically, whatever type they are stored as
        ranked = sorted(by_rank.items(), key=lambda pair: int(pair[0]))
        return [pair[1] for pair in ranked]