Example #1
0
 def update_candidateset(self):
     """Recompute this user's candidate set from the full warm dataset.

     Scores every warm article relative to the last calculation time,
     feeds the scores plus the user's short-/long-term feature vectors
     into ``User.recompute_candidateset``, then stamps and persists the
     recomputation time.
     """
     from article.models import Article

     warm_ids = self.get_full_dataset()
     fields = ('id', 'category', 'seq_id', 'feature', 'published_at', 'quality', 'hot')
     warm_articles = Article.objects(seq_id__in=warm_ids).only(*fields)

     now = datetime.datetime.utcnow()
     # Fall back to the global calculation time when this user has never
     # had a candidate-set recomputation recorded.
     last_calc = self.candidate_updated_time or get_global_cal_time()

     scored = {}
     for art in warm_articles:
         scored[art.seq_id] = (art, calculate_scores(0, 0, art.published_at, art.quality, art.hot, last_calc, now))

     recent_feature = self.get_feature_by_name('recent_feature')
     User.recompute_candidateset(self, recent_feature, self.feature_matrix, scored, subset=warm_ids)

     # Record a fresh timestamp taken after the recompute finished.
     self.candidate_updated_time = datetime.datetime.utcnow()
     self.save()
Example #2
0
    def batch_add_to_users_candidateset(self):
        """Insert this article into the common and per-user candidate sets.

        Computes a user-independent (default) short/long score pair for
        the article, plus a per-user score weighted by the user-article
        relation, and zadd's the article into the global, per-category,
        and per-user short-/long-term redis sorted sets in one pipeline.

        Returns ``None``; no-op when the article is not usable.

        Raises:
            Exception: when the short and long relation mappings disagree
                in size (indicates an inconsistent score computation).
        """
        # Unusable articles never enter any candidate set.
        if not self.usable:
            return
        from people.models import User
        from people.mixins import default_key_user

        users = User.objects.all().only('id', 'seq_id', 'feature', 'recent_feature').order_by('id')
        short_ur_mapping = self._generate_user_relation_scores(users, 'recent_feature')
        long_ur_mapping = self._generate_user_relation_scores(users)
        if len(short_ur_mapping) != len(long_ur_mapping):
            # NOTE: logging
            raise Exception('Notice')
        update_time = get_global_cal_time()
        default_short_score, default_long_score = calculate_scores(0, 0, self.published_at, self.quality, self.hot, update_time)
        seq_id = str(self.seq_id)

        pipeline = warm_conn.pipeline()

        # Common (user-independent) long-term sets: global + per-category.
        long_key = default_key_user.rec_longterm_dataset_key
        pipeline.zadd(long_key, seq_id, default_long_score)
        long_cate_recs = default_key_user.get_longterm_dataset_key(self.category)
        for long_cate_rec in long_cate_recs:
            pipeline.zadd(long_cate_rec, seq_id, default_long_score)

        # Common short-term sets.  BUG FIX: the global short-term key was
        # previously written with default_long_score; it now gets the
        # short score, consistent with the per-category short keys below.
        short_key = default_key_user.rec_shortterm_dataset_key
        pipeline.zadd(short_key, seq_id, default_short_score)

        short_cate_recs = default_key_user.get_shortterm_dataset_key(self.category)
        for short_cate_rec in short_cate_recs:
            pipeline.zadd(short_cate_rec, seq_id, default_short_score)

        # Per-user sets: relation score dominates (x1000) the default score.
        for u in users:
            short_ur_score = short_ur_mapping[u.seq_id]
            long_ur_score = long_ur_mapping[u.seq_id]
            short_score = 1000*short_ur_score + default_short_score
            long_score = 1000*long_ur_score + default_long_score
            long_key = u.rec_longterm_dataset_key
            pipeline.zadd(long_key, seq_id, long_score)
            long_cate_recs = u.get_longterm_dataset_key(self.category)
            for long_cate_rec in long_cate_recs:
                pipeline.zadd(long_cate_rec, seq_id, long_score)
            short_key = u.rec_shortterm_dataset_key
            pipeline.zadd(short_key, seq_id, short_score)
            short_cate_recs = u.get_shortterm_dataset_key(self.category)
            for short_cate_rec in short_cate_recs:
                pipeline.zadd(short_cate_rec, seq_id, short_score)
        pipeline.execute()
def main():
    usable_ids = warm_conn.keys('b:s:a*')
    usable_ids = map(lambda _id: int(_id.split(':')[-1]), usable_ids)
    articles = Article.objects(seq_id__in=usable_ids).only('seq_id', 'id', 'category', 'hot', 'quality', 'published_at', 'feature')
    update_time = datetime.datetime.utcnow()
    set_global_cal_time(update_time)
    seq_id_article_mapping = dict(((a.seq_id, (a, calculate_scores(0, 0, a.published_at, a.quality, a.hot, update_time, update_time))) for a in articles))
    User.recompute_common_candidateset(usable_ids, seq_id_article_mapping)
    users = User.objects.only('id', 'seq_id', 'feature', 'recent_feature', 'feature_updated_time', 'candidate_updated_time').all()
    usable_ids_set = set(usable_ids)
    for user in users:
        ids, _ = user.get_seen_article_ids_and_last_datetime()
        _usable_ids = usable_ids_set - set(ids)
        print user, user.seq_id, len(_usable_ids)
        short_feature, long_feature, _ = user.update_warm_feature()
        User.recompute_candidateset(user, short_feature, long_feature, seq_id_article_mapping, subset=_usable_ids)
Example #4
0
    def calcualte_useless_articles(cls):
        """Return warm-dataset articles that should be evicted.

        Every article older than the earliest valid article is useless.
        Additionally, for each past publication day: days older than the
        validity window are evicted entirely, while days inside the
        window keep only their top-scoring three quarters (the bottom
        quarter is evicted).

        Returns:
            list: the articles to evict (may contain duplicates — see
            NOTE below).
        """
        from people.mixins import default_key_user
        full_warm_seq_ids = default_key_user.get_full_dataset()
        articles = cls.objects(seq_id__in=full_warm_seq_ids).only('id', 'seq_id', 'category', 'published_at', 'quality', 'hot')
        line_article = cls.get_earliest_valid_obj()
        outdate_articles = [article for article in articles if article.seq_id < line_article.seq_id]
        useless_articles = outdate_articles

        now = datetime.utcnow()
        valid_deadline = now - VALID_DURATION
        pub_date = lambda a: a.published_at.date()
        # NOTE(review): this loop groups the *useless* list and extends it
        # with members it already contains, producing duplicates.  It was
        # probably meant to iterate the usable (non-outdated) articles
        # instead — confirm against callers before changing.
        for pubdate, _articles in groupby(sorted(useless_articles, key=pub_date), pub_date):
            if pubdate >= now.date():
                continue
            # BUG FIX: the sorted result used to be bound to a misspelled
            # name (__aritlces) and discarded, so the tail slice below
            # dropped an arbitrary quarter instead of the lowest-scoring.
            day_articles = sorted(
                list(_articles),
                key=lambda _ar: calculate_scores(0, 0, _ar.published_at, _ar.quality, _ar.hot, 0)[0],
                reverse=True,
            )
            count = len(day_articles)
            if pubdate < valid_deadline.date():
                # Whole day expired: evict everything from it.
                useless_articles.extend(day_articles)
            else:
                # Evict the lowest-scoring quarter (py2 floor division).
                useless_articles.extend(day_articles[count * 3 / 4:])
        return useless_articles