Exemplo n.º 1
0
 def test_take_outdated(self):
     """Taking outdated feeds marks them checked, so a second take is empty."""
     feed_id = self._feed.id
     self.assertFalse(Feed.get_by_pk(feed_id).dt_checked)
     first_batch = Feed.take_outdated_feeds()
     self.assertEqual(len(first_batch), 1)
     self.assertEqual(first_batch[0]['feed_id'], feed_id)
     # the feed is now stamped as checked and must not be returned again
     self.assertTrue(Feed.get_by_pk(feed_id).dt_checked)
     second_batch = Feed.take_outdated_feeds()
     self.assertEqual(len(second_batch), 0)
Exemplo n.º 2
0
 def setUp(self):
     """Create one ready feed for the tests to operate on."""
     test_feed = Feed(
         url='https://blog.example.com/feed.xml',
         title='test feed',
         status=FeedStatus.READY,
         dt_updated=timezone.now(),
     )
     test_feed.save()
Exemplo n.º 3
0
def do_save_feed_creation_result(
        ctx: ActorContext,
        feed_creation_id: T.int,
        messages: T.list(T.str),
        feed: FeedSchema.optional,
):
    """Persist the outcome of a feed-creation attempt.

    On failure (``feed`` is empty) the creation record is marked ERROR and
    the requested url is mapped to NOT_FOUND. On success the Feed row is
    created if it does not exist yet, the creation is linked to it, the
    user subscription is ensured, url mappings are saved, and a follow-up
    ``harbor_rss.update_feed`` task is scheduled outside the transaction.
    """
    with transaction.atomic():
        feed_dict = feed
        try:
            feed_creation = FeedCreation.get_by_pk(feed_creation_id)
        except FeedCreation.DoesNotExist:
            LOG.warning(f'feed creation {feed_creation_id} not exists')
            return
        if feed_creation.status == FeedStatus.READY:
            # already processed earlier; make the handler idempotent
            LOG.info(f'feed creation {feed_creation_id} is ready')
            return
        feed_creation.message = '\n\n'.join(messages)
        feed_creation.dt_updated = timezone.now()
        if not feed_dict:
            # creation failed: record NOT_FOUND so later attempts on the
            # same url can fail fast via the url map
            feed_creation.status = FeedStatus.ERROR
            feed_creation.save()
            FeedUrlMap(source=feed_creation.url,
                       target=FeedUrlMap.NOT_FOUND).save()
            return
        url = feed_dict['url']
        feed = Feed.get_first_by_url(url)
        if not feed:
            now = timezone.now()
            feed = Feed(url=url,
                        status=FeedStatus.READY,
                        reverse_url=reverse_url(url),
                        dt_updated=now,
                        dt_checked=now,
                        dt_synced=now)
            feed.save()
        feed_creation.status = FeedStatus.READY
        feed_creation.feed_id = feed.id
        feed_creation.save()
        user_feed = UserFeed.objects.filter(user_id=feed_creation.user_id,
                                            feed_id=feed.id).first()
        if user_feed:
            LOG.info('UserFeed#{} user_id={} feed_id={} already exists'.format(
                user_feed.id, feed_creation.user_id, feed.id))
        else:
            user_feed = UserFeed(
                user_id=feed_creation.user_id,
                feed_id=feed.id,
                is_from_bookmark=feed_creation.is_from_bookmark,
            )
            user_feed.save()
        # map the requested url, and the final feed url too if it differs
        # (e.g. the fetch followed a redirect)
        FeedUrlMap(source=feed_creation.url, target=feed.url).save()
        if feed.url != feed_creation.url:
            FeedUrlMap(source=feed.url, target=feed.url).save()
    # outside the transaction: hand the fetched data to the update task
    ctx.hope('harbor_rss.update_feed',
             dict(
                 feed_id=feed.id,
                 feed=validate_feed_output(feed_dict),
             ))
Exemplo n.º 4
0
def do_update_feed(
    ctx: ActorContext,
    feed_id: T.int,
    feed: FeedSchema,
    is_refresh: T.bool.default(False).desc('Deprecated'),
):
    """Apply fetched feed data and its storys to the stored feed.

    If the feed url changed and another feed already owns the new url,
    this feed is merged into that one instead. Otherwise the fields are
    copied over, the storys saved, and each modified story is dispatched
    either for fulltext fetching or for image detection.
    """
    with transaction.atomic():
        feed_dict = feed
        storys = feed_dict.pop('storys')
        feed = Feed.get_by_pk(feed_id)
        is_feed_url_changed = feed.url != feed_dict['url']
        if is_feed_url_changed:
            target_feed = Feed.get_first_by_url(feed_dict['url'])
            if target_feed:
                LOG.info(f'merge feed#{feed.id} url={feed.url} into '
                         f'feed#{target_feed.id} url={target_feed.url}')
                target_feed.merge(feed)
                return
        # copy only non-empty fetched fields onto the model
        for k, v in feed_dict.items():
            if v != '' and v is not None:
                setattr(feed, k, v)
        now = timezone.now()
        now_sub_30d = now - timezone.timedelta(days=30)
        if not feed.dt_updated:
            feed.dt_updated = now
        feed.dt_checked = feed.dt_synced = now
        feed.status = FeedStatus.READY
        feed.save()
        for s in storys:
            if not s['dt_updated']:
                s['dt_updated'] = now
            if not s['dt_published']:
                # set dt_published to now - 30d to avoid these storys
                # take over mushroom page, i.e. Story.query_recent_by_user
                s['dt_published'] = now_sub_30d
        modified_storys = Story.bulk_save_by_feed(feed.id, storys)
        LOG.info(
            'feed#%s save storys total=%s num_modified=%s',
            feed.id, len(storys), len(modified_storys)
        )
    feed.refresh_from_db()
    if modified_storys:
        # new content arrived: lift any freeze so checks keep running
        feed.unfreeze()
    need_fetch_story = _is_feed_need_fetch_storys(feed)
    for story in modified_storys:
        if not story.link:
            continue
        if need_fetch_story and (not is_fulltext_story(feed, story)):
            ctx.tell('worker_rss.fetch_story', dict(
                url=story.link,
                story_id=str(story.id)
            ))
        else:
            _detect_story_images(ctx, story)
Exemplo n.º 5
0
def _feed_merge_duplicate(found: list):
    """Merge each group of duplicate feeds into the group's first feed.

    ``found`` is a list of id groups; the first id of each group is kept
    as the primary feed, the rest are merged into it and url-mapped.
    """
    for group in found:
        primary_id, *duplicate_ids = group
        with transaction.atomic():
            primary = Feed.get_by_pk(primary_id)
            primary_info = f'#{primary.id} url={primary.url!r}'
            for dup_id in duplicate_ids:
                duplicate = Feed.get_by_pk(dup_id)
                duplicate_info = f'#{duplicate.id} url={duplicate.url!r}'
                LOG.info('merge duplicate feed %s into %s', duplicate_info, primary_info)
                # redirect the duplicate's url to the primary before merging
                FeedUrlMap(source=duplicate.url, target=primary.url).save()
                primary.merge(duplicate)
Exemplo n.º 6
0
 def setUp(self):
     """Create one ready feed plus a superuser account for the tests."""
     test_feed = Feed(
         url='https://blog.example.com/feed1.xml',
         title='测试1',
         status=FeedStatus.READY,
         dt_updated=timezone.now(),
     )
     test_feed.save()
     self._tester = User.objects.create_superuser(
         'tester', email=None, password='******')
Exemplo n.º 7
0
def _create_test_feed(url):
    """Return the feed for *url*, creating a discarded test feed if absent."""
    existing = Feed.get_first_by_url(url)
    if existing:
        return existing
    now = timezone.now()
    new_feed = Feed(
        url=url,
        title='蚁阅测试订阅',
        status=FeedStatus.DISCARD,
        reverse_url=reverse_url(url),
        dt_updated=now,
        dt_checked=now,
        dt_synced=now,
    )
    new_feed.save()
    return new_feed
Exemplo n.º 8
0
def do_clean_by_retention(ctx: ActorContext):
    """Trim old storys from feeds that exceed the configured retention."""
    retention = CONFIG.feed_story_retention
    candidates = Feed.take_retention_feeds(retention=retention, limit=50)
    LOG.info('found {} feeds need clean by retention'.format(len(candidates)))
    for item in candidates:
        feed_id = item['feed_id']
        url = item['url']
        deleted = STORY_SERVICE.delete_by_retention(feed_id, retention=retention)
        LOG.info(f'deleted {deleted} storys of feed#{feed_id} {url} by retention')
Exemplo n.º 9
0
def do_sync_story_fulltext(
    ctx: ActorContext,
    feed_id: T.int,
    offset: T.int,
) -> T.dict(
        feed_id=T.int,
        offset=T.int.min(0),
        use_proxy=T.bool,
        url=T.url,
        response_status=T.int,
        accept=T_ACCEPT,
):
    """Fetch the full text of one story and store it if acceptable.

    Asks ``worker_rss.fetch_story`` for the page content, then runs
    ``_update_story`` to decide whether to accept it. Returns a dict with
    the fetch status and the accept decision; on timeout the decision
    stays REJECT and a CONNECTION_TIMEOUT status is reported.
    """
    with log_django_context_metric('harbor_rss.sync_story_fulltext:read'):
        feed = Feed.get_by_pk(feed_id, detail='+use_proxy')
        story = STORY_SERVICE.get_by_offset(feed_id, offset, detail=True)
    assert story, f'story#{feed_id},{offset} not found'
    story_content_info = StoryContentInfo(story.content)
    # sentence count of the current content, used by the worker to judge
    # whether the fetched page is richer than what we already have
    num_sub_sentences = len(split_sentences(story_content_info.text))
    ret = dict(
        feed_id=feed_id,
        offset=offset,
        url=story.link,
        use_proxy=feed.use_proxy,
        accept=FulltextAcceptStrategy.REJECT.value,
    )
    try:
        result = ctx.ask(
            'worker_rss.fetch_story',
            dict(
                url=story.link,
                use_proxy=feed.use_proxy,
                feed_id=feed_id,
                offset=offset,
                num_sub_sentences=num_sub_sentences,
            ))
    except _TIMEOUT_ERRORS as ex:
        LOG.error(f'Ask worker_rss.fetch_story timeout: {ex}')
        ret.update(response_status=FeedResponseStatus.CONNECTION_TIMEOUT)
        return ret
    else:
        ret.update(
            response_status=result['response_status'],
            use_proxy=result['use_proxy'],
        )
        if not result['content']:
            # nothing fetched; keep the REJECT decision
            return ret
    with log_django_context_metric('harbor_rss.sync_story_fulltext:write'):
        accept = _update_story(
            story=story,
            story_content_info=story_content_info,
            content=result['content'],
            summary=None,  # not need update summary
            url=result['url'],
            sentence_count=result['sentence_count'],
        )
        ret.update(accept=accept.value)
    return ret
Exemplo n.º 10
0
def do_feed_detect_and_merge_duplicate(ctx: ActorContext):
    """Scan all feeds for duplicates, merging each group as it is found."""
    t_begin = time.time()
    checkpoint = None
    # page through the duplicate scan until the checkpoint is exhausted
    while True:
        duplicate_groups, checkpoint = Feed.find_duplicate_feeds(checkpoint=checkpoint)
        _feed_merge_duplicate(duplicate_groups)
        if not checkpoint:
            break
    elapsed = time.time() - t_begin
    LOG.info('feed_detect_and_merge_duplicate cost {:.1f}ms'.format(elapsed * 1000))
Exemplo n.º 11
0
def do_check_feed(ctx: ActorContext):
    """Dispatch sync tasks for feeds not checked within the (jittered) window."""
    # add up to 10% random jitter so checks do not all align
    jitter = random.random() * CHECK_FEED_SECONDS / 10
    outdate_seconds = CHECK_FEED_SECONDS + jitter
    outdated = Feed.take_outdated_feeds(outdate_seconds)
    expire_at = time.time() + outdate_seconds
    LOG.info('found {} feeds need sync'.format(len(outdated)))
    for item in outdated:
        message = dict(feed_id=item['feed_id'], url=item['url'])
        ctx.hope('worker_rss.sync_feed', message, expire_at=expire_at)
Exemplo n.º 12
0
def do_update_feed_info(
    ctx: ActorContext,
    feed_id: T.int,
    feed: FeedInfoSchema,
):
    """Copy the given info fields onto the feed and bump its dt_updated."""
    with transaction.atomic():
        info = feed
        target = Feed.get_by_pk(feed_id)
        for field_name, value in info.items():
            setattr(target, field_name, value)
        target.dt_updated = timezone.now()
        target.save()
Exemplo n.º 13
0
    def setUp(self):
        """Build 200 storys, their updated variants, and a feed to own them."""
        print('setUp')
        base_time = timezone.datetime(2020, 6, 1, 12, 12, 12, tzinfo=timezone.utc)
        storys = []
        updated_storys = []
        for i in range(200):
            dt = base_time + timezone.timedelta(minutes=i)
            # every 5th story has empty content (i % 5 == 0)
            content = f'test story content {i}' * (i % 5)
            story = {
                'unique_id': f'blog.example.com/{i}',
                'title': f'test story {i}',
                'content_hash_base64': compute_hash_base64(content),
                'author': 'tester',
                'link': f'https://blog.example.com/{i}.html',
                'dt_published': dt,
                'dt_updated': dt,
                'summary': content[:30],
                'content': content,
            }
            storys.append(validate_story(story))
            # the updated variant always has non-empty, different content
            changed = dict(story)
            changed_content = f'test story content updated {i}' * (i % 5 + 1)
            changed['content'] = changed_content
            changed['content_hash_base64'] = compute_hash_base64(changed_content)
            updated_storys.append(validate_story(changed))
        self.storys = storys
        self.updated_storys = updated_storys

        feed = Feed(
            title='test feed',
            url='https://blog.example.com/feed.xml',
            status=FeedStatus.READY,
            dt_updated=timezone.now(),
        )
        feed.save()
        self.feed_id = feed.id
Exemplo n.º 14
0
Arquivo: rss.py Projeto: sun816/rssant
def update_feed_dryness(feeds=None):
    """Recompute the dryness score for every feed that has storys."""
    feed_ids = _get_feed_ids(feeds)
    LOG.info('total %s feeds', len(feed_ids))
    for fid in tqdm.tqdm(feed_ids, ncols=80, ascii=True):
        with transaction.atomic():
            feed = Feed.get_by_pk(fid)
            if feed.total_storys <= 0:
                continue
            if not feed.monthly_story_count:
                # backfill the monthly counter before computing dryness
                Story.refresh_feed_monthly_story_count(fid)
            feed.refresh_from_db()
            feed.dryness = feed.monthly_story_count.dryness()
            feed.save()
Exemplo n.º 15
0
 def _import_feeds(self, imports: list):
     """Import urls for the test user, backing each creation with a real feed."""
     result = UnionFeed.create_by_imports(
         user_id=self._tester.id, imports=imports)
     for creation in result.feed_creations:
         creation: FeedCreation
         backing_feed = Feed(
             title=creation.title,
             url=creation.url,
             status=FeedStatus.READY,
             dt_updated=timezone.now(),
         )
         backing_feed.save()
         UserFeed(
             user=self._tester,
             feed=backing_feed,
             title=creation.title,
             group=creation.group,
             dt_updated=timezone.now(),
         ).save()
         # map both the original url and a '.c' suffixed variant to the feed
         FeedUrlMap(source=creation.url, target=backing_feed.url).save()
         FeedUrlMap(source=creation.url + '.c', target=backing_feed.url).save()
     return result
Exemplo n.º 16
0
Arquivo: rss.py Projeto: sun816/rssant
def delete_feed(key):
    """Delete one feed located by primary key or by url/title substring.

    Asks for confirmation before deleting.
    """
    try:
        key = int(key)
    except ValueError:
        pass  # not numeric: fall through to text search
    if isinstance(key, int):
        feed = Feed.get_by_pk(key)
    else:
        query = Q(url__contains=key) | Q(title__contains=key)
        feed = Feed.objects.filter(query).first()
    if not feed:
        print(f'not found feed like {key}')
        return
    if click.confirm(f'delete {feed} ?'):
        feed.delete()
Exemplo n.º 17
0
Arquivo: rss.py Projeto: sun816/rssant
def update_feed_dt_first_story_published(feeds=None):
    """Backfill dt_first_story_published from each feed's oldest story."""
    feed_ids = _get_feed_ids(feeds)
    LOG.info('total %s feeds', len(feed_ids))
    for fid in tqdm.tqdm(feed_ids, ncols=80, ascii=True):
        with transaction.atomic():
            feed = Feed.get_by_pk(fid)
            # skip feeds already backfilled or with nothing to read
            if feed.dt_first_story_published:
                continue
            if feed.total_storys <= 0:
                continue
            try:
                oldest = Story.get_by_offset(fid, 0, detail=True)
            except Story.DoesNotExist:
                LOG.warning(f'story feed_id={fid} offset=0 not exists')
                continue
            feed.dt_first_story_published = oldest.dt_published
            feed.save()
Exemplo n.º 18
0
 def test_get_feed_by_url(self):
     """get_first_by_url returns the feed created in setUp."""
     feed = Feed.get_first_by_url('https://blog.example.com/feed.xml')
     self.assertEqual(feed.title, 'test feed')
Exemplo n.º 19
0
def do_feed_refresh_freeze_level(ctx: ActorContext):
    """Recompute freeze levels for all feeds, logging the time spent."""
    t0 = time.time()
    Feed.refresh_freeze_level()
    elapsed = time.time() - t0
    LOG.info('feed_refresh_freeze_level cost {:.1f}ms'.format(elapsed * 1000))
Exemplo n.º 20
0
def do_update_feed(
        ctx: ActorContext,
        feed_id: T.int,
        feed: FeedSchema,
        is_refresh: T.bool.default(False),
):
    """Apply fetched feed data and its storys to the stored feed.

    A url change that collides with an existing feed is NOT merged here
    (see FIXME below); the old url is kept instead. Storys are saved in
    a standalone transaction by STORY_SERVICE, and modified storys are
    dispatched for fulltext fetching when needed.
    """
    with transaction.atomic():
        feed_dict = feed
        storys = feed_dict.pop('storys')
        feed = Feed.get_by_pk(feed_id)
        is_feed_url_changed = feed.url != feed_dict['url']
        if is_feed_url_changed:
            target_feed = Feed.get_first_by_url(feed_dict['url'])
            # FIXME: feed merge cannot handle subscription redirects correctly.
            # For this case, keep the old subscription for now; fix it later.
            # if target_feed:
            #     LOG.info(f'merge feed#{feed.id} url={feed.url} into '
            #              f'feed#{target_feed.id} url={target_feed.url}')
            #     target_feed.merge(feed)
            #     return
            if target_feed:
                LOG.warning(
                    f'FIXME: redirect feed#{feed.id} url={feed.url!r} into '
                    f'feed#{target_feed.id} url={target_feed.url!r}')
                feed_dict.pop('url')
        # only update dt_updated if has storys or feed fields updated
        is_feed_updated = bool(storys)
        for k, v in feed_dict.items():
            if k == 'dt_updated':
                continue
            # 'warnings' may legitimately be set to an empty value
            if (v != '' and v is not None) or k in {'warnings'}:
                old_v = getattr(feed, k, None)
                if v != old_v:
                    is_feed_updated = True
                    setattr(feed, k, v)
        now = timezone.now()
        now_sub_30d = now - timezone.timedelta(days=30)
        if is_feed_updated:
            # set dt_updated to now, not trust rss date
            feed.dt_updated = now
        feed.dt_checked = feed.dt_synced = now
        feed.reverse_url = reverse_url(feed.url)
        feed.status = FeedStatus.READY
        feed.save()
    # save storys, bulk_save_by_feed has standalone transaction
    for s in storys:
        if not s['dt_updated']:
            s['dt_updated'] = now
        if not s['dt_published']:
            # set dt_published to now - 30d to avoid these storys
            # take over mushroom page, i.e. Story.query_recent_by_user
            s['dt_published'] = now_sub_30d
    modified_storys = STORY_SERVICE.bulk_save_by_feed(feed.id,
                                                      storys,
                                                      is_refresh=is_refresh)
    LOG.info('feed#%s save storys total=%s num_modified=%s', feed.id,
             len(storys), len(modified_storys))
    feed = Feed.get_by_pk(feed_id)
    # treat unknown freeze_level as frozen so new content unfreezes it
    is_freezed = feed.freeze_level is None or feed.freeze_level > 1
    if modified_storys and is_freezed:
        Feed.unfreeze_by_id(feed_id)
    need_fetch_story = _is_feed_need_fetch_storys(feed, modified_storys)
    for story in modified_storys:
        if not story.link:
            continue
        if need_fetch_story and (not _is_fulltext_story(story)):
            text = processor.story_html_to_text(story.content)
            num_sub_sentences = len(split_sentences(text))
            ctx.tell(
                'worker_rss.fetch_story',
                dict(
                    url=story.link,
                    use_proxy=feed.use_proxy,
                    feed_id=story.feed_id,
                    offset=story.offset,
                    num_sub_sentences=num_sub_sentences,
                ))
Exemplo n.º 21
0
 def assert_feed_total_storys(self, expect):
     """Assert the feed's persisted total_storys equals *expect*."""
     actual = Feed.get_by_pk(self.feed_id).total_storys
     self.assertEqual(actual, expect)
Exemplo n.º 22
0
def do_update_feed(
    ctx: ActorContext,
    feed_id: T.int,
    feed: FeedSchema,
    is_refresh: T.bool.default(False),
):
    """Apply fetched feed data and its storys to the stored feed.

    If the feed url changed and another feed already owns the new url,
    this feed is merged into that one. Otherwise changed fields are
    copied over, the storys saved, and each modified story is dispatched
    for fulltext fetching or image detection.
    """
    with transaction.atomic():
        feed_dict = feed
        storys = feed_dict.pop('storys')
        feed = Feed.get_by_pk(feed_id)
        is_feed_url_changed = feed.url != feed_dict['url']
        if is_feed_url_changed:
            target_feed = Feed.get_first_by_url(feed_dict['url'])
            if target_feed:
                LOG.info(f'merge feed#{feed.id} url={feed.url} into '
                         f'feed#{target_feed.id} url={target_feed.url}')
                target_feed.merge(feed)
                return
        # only update dt_updated if has storys or feed fields updated
        is_feed_updated = bool(storys)
        for k, v in feed_dict.items():
            if k == 'dt_updated':
                continue
            if v != '' and v is not None:
                old_v = getattr(feed, k, None)
                if v != old_v:
                    is_feed_updated = True
                    setattr(feed, k, v)
        now = timezone.now()
        now_sub_30d = now - timezone.timedelta(days=30)
        if is_feed_updated:
            # set dt_updated to now, not trust rss date
            feed.dt_updated = now
        feed.dt_checked = feed.dt_synced = now
        feed.status = FeedStatus.READY
        feed.save()
        for s in storys:
            if not s['dt_updated']:
                s['dt_updated'] = now
            if not s['dt_published']:
                # set dt_published to now - 30d to avoid these storys
                # take over mushroom page, i.e. Story.query_recent_by_user
                s['dt_published'] = now_sub_30d
        modified_storys = Story.bulk_save_by_feed(feed.id, storys, is_refresh=is_refresh)
        LOG.info(
            'feed#%s save storys total=%s num_modified=%s',
            feed.id, len(storys), len(modified_storys)
        )
    feed.refresh_from_db()
    if modified_storys:
        # new content arrived: lift any freeze so checks keep running
        feed.unfreeze()
    need_fetch_story = _is_feed_need_fetch_storys(feed, modified_storys)
    for story in modified_storys:
        if not story.link:
            continue
        if need_fetch_story and (not is_fulltext_story(story)):
            text = processor.story_html_to_text(story.content)
            num_sub_sentences = len(split_sentences(text))
            ctx.tell('worker_rss.fetch_story', dict(
                url=story.link,
                use_proxy=feed.use_proxy,
                story_id=str(story.id),
                num_sub_sentences=num_sub_sentences,
            ))
        else:
            _detect_story_images(ctx, story)