def do_save_feed_creation_result(
        ctx: ActorContext,
        feed_creation_id: T.int,
        messages: T.list(T.str),
        feed: FeedSchema.optional,
):
    """Persist the outcome of a feed-creation attempt.

    When ``feed`` is empty the creation is marked ERROR and the requested
    url is mapped to NOT_FOUND. Otherwise the Feed is fetched or created,
    the FeedCreation is marked READY and linked to it, a UserFeed is
    created for the requesting user (unless one already exists), url
    mappings are recorded, and ``harbor_rss.update_feed`` is asked to
    store the feed content.

    :param ctx: actor context used to send the follow-up message
    :param feed_creation_id: primary key of the FeedCreation being resolved
    :param messages: progress/diagnostic messages, joined into the record
    :param feed: parsed feed data dict, or empty/None on failure
    """
    with transaction.atomic():
        feed_dict = feed
        try:
            feed_creation = FeedCreation.get_by_pk(feed_creation_id)
        except FeedCreation.DoesNotExist:
            # record was deleted (or id is stale); nothing to update
            LOG.warning(f'feed creation {feed_creation_id} not exists')
            return
        if feed_creation.status == FeedStatus.READY:
            # already processed by an earlier message; treat as idempotent no-op
            LOG.info(f'feed creation {feed_creation_id} is ready')
            return
        feed_creation.message = '\n\n'.join(messages)
        feed_creation.dt_updated = timezone.now()
        if not feed_dict:
            # creation failed: remember the url as NOT_FOUND so later
            # lookups of the same url can short-circuit
            feed_creation.status = FeedStatus.ERROR
            feed_creation.save()
            FeedUrlMap(source=feed_creation.url, target=FeedUrlMap.NOT_FOUND).save()
            return
        url = feed_dict['url']
        feed = Feed.get_first_by_url(url)
        if not feed:
            now = timezone.now()
            feed = Feed(url=url, status=FeedStatus.READY,
                        reverse_url=reverse_url(url),
                        dt_updated=now, dt_checked=now, dt_synced=now)
            feed.save()
        feed_creation.status = FeedStatus.READY
        feed_creation.feed_id = feed.id
        feed_creation.save()
        user_feed = UserFeed.objects.filter(
            user_id=feed_creation.user_id, feed_id=feed.id).first()
        if user_feed:
            LOG.info('UserFeed#{} user_id={} feed_id={} already exists'.format(
                user_feed.id, feed_creation.user_id, feed.id))
        else:
            user_feed = UserFeed(
                user_id=feed_creation.user_id,
                feed_id=feed.id,
                is_from_bookmark=feed_creation.is_from_bookmark,
            )
            user_feed.save()
        # map the requested url to the canonical feed url; when they differ,
        # also add an identity mapping so the canonical url resolves to itself
        FeedUrlMap(source=feed_creation.url, target=feed.url).save()
        if feed.url != feed_creation.url:
            FeedUrlMap(source=feed.url, target=feed.url).save()
        # hand the parsed content off for storage outside this transaction
        ctx.hope('harbor_rss.update_feed', dict(
            feed_id=feed.id,
            feed=validate_feed_output(feed_dict),
        ))
def do_update_feed(
        ctx: ActorContext,
        feed_id: T.int,
        feed: FeedSchema,
        is_refresh: T.bool.default(False).desc('Deprecated'),
):
    """Apply freshly-fetched feed data and stories to an existing Feed.

    If the feed url changed and another Feed already owns the new url,
    this feed is merged into it and processing stops. Otherwise non-empty
    fields are copied onto the Feed, timestamps are refreshed, stories are
    bulk-saved, and each modified story is either sent to
    ``worker_rss.fetch_story`` (when full text is needed) or has its
    images detected.

    :param ctx: actor context used to dispatch follow-up story work
    :param feed_id: primary key of the Feed to update
    :param feed: parsed feed dict; its 'storys' key is consumed here
    :param is_refresh: deprecated flag, ignored by this variant
    """
    with transaction.atomic():
        feed_dict = feed
        # pop mutates feed_dict so the field-copy loop below skips storys
        storys = feed_dict.pop('storys')
        feed = Feed.get_by_pk(feed_id)
        is_feed_url_changed = feed.url != feed_dict['url']
        if is_feed_url_changed:
            target_feed = Feed.get_first_by_url(feed_dict['url'])
            if target_feed:
                # the new url already belongs to another feed: merge and stop
                LOG.info(f'merge feed#{feed.id} url={feed.url} into '
                         f'feed#{target_feed.id} url={target_feed.url}')
                target_feed.merge(feed)
                return
        # copy only meaningful values; empty string / None never overwrite
        for k, v in feed_dict.items():
            if v != '' and v is not None:
                setattr(feed, k, v)
        now = timezone.now()
        now_sub_30d = now - timezone.timedelta(days=30)
        if not feed.dt_updated:
            feed.dt_updated = now
        feed.dt_checked = feed.dt_synced = now
        feed.status = FeedStatus.READY
        feed.save()
        for s in storys:
            if not s['dt_updated']:
                s['dt_updated'] = now
            if not s['dt_published']:
                # set dt_published to now - 30d to avoid these storys
                # take over mushroom page, i.e. Story.query_recent_by_user
                s['dt_published'] = now_sub_30d
        modified_storys = Story.bulk_save_by_feed(feed.id, storys)
        LOG.info(
            'feed#%s save storys total=%s num_modified=%s',
            feed.id, len(storys), len(modified_storys)
        )
        # bulk save may have touched feed fields; reload before unfreeze
        feed.refresh_from_db()
        if modified_storys:
            feed.unfreeze()
        need_fetch_story = _is_feed_need_fetch_storys(feed)
        for story in modified_storys:
            if not story.link:
                continue
            if need_fetch_story and (not is_fulltext_story(feed, story)):
                # content looks truncated: fetch the full story page
                ctx.tell('worker_rss.fetch_story', dict(
                    url=story.link,
                    story_id=str(story.id)
                ))
            else:
                _detect_story_images(ctx, story)
def _create_test_feed(url):
    """Return the Feed for *url*, creating a discarded test feed when absent."""
    existing = Feed.get_first_by_url(url)
    if existing:
        return existing
    now = timezone.now()
    created = Feed(
        url=url,
        status=FeedStatus.DISCARD,
        reverse_url=reverse_url(url),
        title='蚁阅测试订阅',
        dt_updated=now,
        dt_checked=now,
        dt_synced=now,
    )
    created.save()
    return created
def do_update_feed(
        ctx: ActorContext,
        feed_id: T.int,
        feed: FeedSchema,
        is_refresh: T.bool.default(False),
):
    """Apply freshly-fetched feed data and stories to an existing Feed.

    Redirected urls that collide with an existing feed are NOT merged
    (see FIXME below); the old url is kept instead. Feed fields are
    updated only when a value actually changed, stories are bulk-saved
    via STORY_SERVICE, the feed is unfrozen when new content arrived, and
    stories needing full text are dispatched to ``worker_rss.fetch_story``.

    :param ctx: actor context used to dispatch story-fetch messages
    :param feed_id: primary key of the Feed to update
    :param feed: parsed feed dict; its 'storys' key is consumed here
    :param is_refresh: force re-save of stories in bulk_save_by_feed
    """
    with transaction.atomic():
        feed_dict = feed
        # pop mutates feed_dict so the field-copy loop below skips storys
        storys = feed_dict.pop('storys')
        feed = Feed.get_by_pk(feed_id)
        is_feed_url_changed = feed.url != feed_dict['url']
        if is_feed_url_changed:
            target_feed = Feed.get_first_by_url(feed_dict['url'])
            # FIXME: feed merge cannot correctly handle feed redirects.
            # For that case, keep the old feed for now and fix it
            # thoroughly later.
            # if target_feed:
            #     LOG.info(f'merge feed#{feed.id} url={feed.url} into '
            #              f'feed#{target_feed.id} url={target_feed.url}')
            #     target_feed.merge(feed)
            #     return
            if target_feed:
                LOG.warning(
                    f'FIXME: redirect feed#{feed.id} url={feed.url!r} into '
                    f'feed#{target_feed.id} url={target_feed.url!r}')
                # drop the redirected url so the old one is preserved
                feed_dict.pop('url')
        # only update dt_updated if has storys or feed fields updated
        is_feed_updated = bool(storys)
        for k, v in feed_dict.items():
            if k == 'dt_updated':
                continue
            # 'warnings' may legitimately be set to an empty value
            if (v != '' and v is not None) or k in {'warnings'}:
                old_v = getattr(feed, k, None)
                if v != old_v:
                    is_feed_updated = True
                setattr(feed, k, v)
        now = timezone.now()
        now_sub_30d = now - timezone.timedelta(days=30)
        if is_feed_updated:
            # set dt_updated to now, not trust rss date
            feed.dt_updated = now
        feed.dt_checked = feed.dt_synced = now
        feed.reverse_url = reverse_url(feed.url)
        feed.status = FeedStatus.READY
        feed.save()
        # save storys, bulk_save_by_feed has standalone transaction
        for s in storys:
            if not s['dt_updated']:
                s['dt_updated'] = now
            if not s['dt_published']:
                # set dt_published to now - 30d to avoid these storys
                # take over mushroom page, i.e. Story.query_recent_by_user
                s['dt_published'] = now_sub_30d
        modified_storys = STORY_SERVICE.bulk_save_by_feed(
            feed.id, storys, is_refresh=is_refresh)
        LOG.info('feed#%s save storys total=%s num_modified=%s',
                 feed.id, len(storys), len(modified_storys))
        # reload: bulk save runs in its own transaction and may change feed
        feed = Feed.get_by_pk(feed_id)
        # NOTE(review): None freeze_level appears to count as frozen here —
        # confirm against Feed.freeze_level semantics
        is_freezed = feed.freeze_level is None or feed.freeze_level > 1
        if modified_storys and is_freezed:
            Feed.unfreeze_by_id(feed_id)
        need_fetch_story = _is_feed_need_fetch_storys(feed, modified_storys)
        for story in modified_storys:
            if not story.link:
                continue
            if need_fetch_story and (not _is_fulltext_story(story)):
                # sentence count lets the fetcher judge fulltext sufficiency
                text = processor.story_html_to_text(story.content)
                num_sub_sentences = len(split_sentences(text))
                ctx.tell(
                    'worker_rss.fetch_story',
                    dict(
                        url=story.link,
                        use_proxy=feed.use_proxy,
                        feed_id=story.feed_id,
                        offset=story.offset,
                        num_sub_sentences=num_sub_sentences,
                    ))
def test_get_feed_by_url(self):
    """A seeded feed is retrievable by its exact url."""
    target = Feed.get_first_by_url('https://blog.example.com/feed.xml')
    self.assertEqual(target.title, 'test feed')
def do_update_feed(
        ctx: ActorContext,
        feed_id: T.int,
        feed: FeedSchema,
        is_refresh: T.bool.default(False),
):
    """Apply freshly-fetched feed data and stories to an existing Feed.

    If the feed url changed and another Feed already owns the new url,
    this feed is merged into it and processing stops. Feed fields are
    updated only when a value actually changed, stories are bulk-saved,
    the feed is unfrozen when new content arrived, and stories needing
    full text are dispatched to ``worker_rss.fetch_story``.

    :param ctx: actor context used to dispatch follow-up story work
    :param feed_id: primary key of the Feed to update
    :param feed: parsed feed dict; its 'storys' key is consumed here
    :param is_refresh: force re-save of stories in bulk_save_by_feed
    """
    with transaction.atomic():
        feed_dict = feed
        # pop mutates feed_dict so the field-copy loop below skips storys
        storys = feed_dict.pop('storys')
        feed = Feed.get_by_pk(feed_id)
        is_feed_url_changed = feed.url != feed_dict['url']
        if is_feed_url_changed:
            target_feed = Feed.get_first_by_url(feed_dict['url'])
            if target_feed:
                # the new url already belongs to another feed: merge and stop
                LOG.info(f'merge feed#{feed.id} url={feed.url} into '
                         f'feed#{target_feed.id} url={target_feed.url}')
                target_feed.merge(feed)
                return
        # only update dt_updated if has storys or feed fields updated
        is_feed_updated = bool(storys)
        for k, v in feed_dict.items():
            if k == 'dt_updated':
                continue
            if v != '' and v is not None:
                old_v = getattr(feed, k, None)
                if v != old_v:
                    is_feed_updated = True
                setattr(feed, k, v)
        now = timezone.now()
        now_sub_30d = now - timezone.timedelta(days=30)
        if is_feed_updated:
            # set dt_updated to now, not trust rss date
            feed.dt_updated = now
        feed.dt_checked = feed.dt_synced = now
        feed.status = FeedStatus.READY
        feed.save()
        for s in storys:
            if not s['dt_updated']:
                s['dt_updated'] = now
            if not s['dt_published']:
                # set dt_published to now - 30d to avoid these storys
                # take over mushroom page, i.e. Story.query_recent_by_user
                s['dt_published'] = now_sub_30d
        modified_storys = Story.bulk_save_by_feed(
            feed.id, storys, is_refresh=is_refresh)
        LOG.info(
            'feed#%s save storys total=%s num_modified=%s',
            feed.id, len(storys), len(modified_storys)
        )
        # bulk save may have touched feed fields; reload before unfreeze
        feed.refresh_from_db()
        if modified_storys:
            feed.unfreeze()
        need_fetch_story = _is_feed_need_fetch_storys(feed, modified_storys)
        for story in modified_storys:
            if not story.link:
                continue
            if need_fetch_story and (not is_fulltext_story(story)):
                # sentence count lets the fetcher judge fulltext sufficiency
                text = processor.story_html_to_text(story.content)
                num_sub_sentences = len(split_sentences(text))
                ctx.tell('worker_rss.fetch_story', dict(
                    url=story.link,
                    use_proxy=feed.use_proxy,
                    story_id=str(story.id),
                    num_sub_sentences=num_sub_sentences,
                ))
            else:
                _detect_story_images(ctx, story)