def test_take_outdated(self):
    self.assertFalse(Feed.get_by_pk(self._feed.id).dt_checked)
    outdated1 = Feed.take_outdated_feeds()
    self.assertEqual(len(outdated1), 1)
    self.assertEqual(outdated1[0]['feed_id'], self._feed.id)
    self.assertTrue(Feed.get_by_pk(self._feed.id).dt_checked)
    outdated2 = Feed.take_outdated_feeds()
    self.assertEqual(len(outdated2), 0)

def setUp(self):
    feed = Feed(
        title='test feed',
        url='https://blog.example.com/feed.xml',
        status=FeedStatus.READY,
        dt_updated=timezone.now(),
    )
    feed.save()

def do_save_feed_creation_result(
    ctx: ActorContext,
    feed_creation_id: T.int,
    messages: T.list(T.str),
    feed: FeedSchema.optional,
):
    with transaction.atomic():
        feed_dict = feed
        try:
            feed_creation = FeedCreation.get_by_pk(feed_creation_id)
        except FeedCreation.DoesNotExist:
            LOG.warning(f'feed creation {feed_creation_id} not exists')
            return
        if feed_creation.status == FeedStatus.READY:
            LOG.info(f'feed creation {feed_creation_id} is ready')
            return
        feed_creation.message = '\n\n'.join(messages)
        feed_creation.dt_updated = timezone.now()
        if not feed_dict:
            feed_creation.status = FeedStatus.ERROR
            feed_creation.save()
            FeedUrlMap(source=feed_creation.url, target=FeedUrlMap.NOT_FOUND).save()
            return
        url = feed_dict['url']
        feed = Feed.get_first_by_url(url)
        if not feed:
            now = timezone.now()
            feed = Feed(
                url=url,
                status=FeedStatus.READY,
                reverse_url=reverse_url(url),
                dt_updated=now,
                dt_checked=now,
                dt_synced=now,
            )
            feed.save()
        feed_creation.status = FeedStatus.READY
        feed_creation.feed_id = feed.id
        feed_creation.save()
        user_feed = UserFeed.objects.filter(
            user_id=feed_creation.user_id, feed_id=feed.id).first()
        if user_feed:
            LOG.info('UserFeed#{} user_id={} feed_id={} already exists'.format(
                user_feed.id, feed_creation.user_id, feed.id))
        else:
            user_feed = UserFeed(
                user_id=feed_creation.user_id,
                feed_id=feed.id,
                is_from_bookmark=feed_creation.is_from_bookmark,
            )
            user_feed.save()
        FeedUrlMap(source=feed_creation.url, target=feed.url).save()
        if feed.url != feed_creation.url:
            FeedUrlMap(source=feed.url, target=feed.url).save()
    ctx.hope('harbor_rss.update_feed', dict(
        feed_id=feed.id,
        feed=validate_feed_output(feed_dict),
    ))

def do_update_feed(
    ctx: ActorContext,
    feed_id: T.int,
    feed: FeedSchema,
    is_refresh: T.bool.default(False).desc('Deprecated'),
):
    with transaction.atomic():
        feed_dict = feed
        storys = feed_dict.pop('storys')
        feed = Feed.get_by_pk(feed_id)
        is_feed_url_changed = feed.url != feed_dict['url']
        if is_feed_url_changed:
            target_feed = Feed.get_first_by_url(feed_dict['url'])
            if target_feed:
                LOG.info(f'merge feed#{feed.id} url={feed.url} into '
                         f'feed#{target_feed.id} url={target_feed.url}')
                target_feed.merge(feed)
                return
        for k, v in feed_dict.items():
            if v != '' and v is not None:
                setattr(feed, k, v)
        now = timezone.now()
        now_sub_30d = now - timezone.timedelta(days=30)
        if not feed.dt_updated:
            feed.dt_updated = now
        feed.dt_checked = feed.dt_synced = now
        feed.status = FeedStatus.READY
        feed.save()
        for s in storys:
            if not s['dt_updated']:
                s['dt_updated'] = now
            if not s['dt_published']:
                # set dt_published to now - 30d to avoid these storys
                # take over mushroom page, i.e. Story.query_recent_by_user
                s['dt_published'] = now_sub_30d
        modified_storys = Story.bulk_save_by_feed(feed.id, storys)
        LOG.info(
            'feed#%s save storys total=%s num_modified=%s',
            feed.id, len(storys), len(modified_storys)
        )
        feed.refresh_from_db()
        if modified_storys:
            feed.unfreeze()
        need_fetch_story = _is_feed_need_fetch_storys(feed)
        for story in modified_storys:
            if not story.link:
                continue
            if need_fetch_story and (not is_fulltext_story(feed, story)):
                ctx.tell('worker_rss.fetch_story', dict(
                    url=story.link,
                    story_id=str(story.id)
                ))
            else:
                _detect_story_images(ctx, story)

def _feed_merge_duplicate(found: list):
    for feed_ids in found:
        primary_id, *duplicates = feed_ids
        with transaction.atomic():
            primary = Feed.get_by_pk(primary_id)
            primary_info = f'#{primary.id} url={primary.url!r}'
            for feed_id in duplicates:
                other = Feed.get_by_pk(feed_id)
                other_info = f'#{other.id} url={other.url!r}'
                LOG.info('merge duplicate feed %s into %s', other_info, primary_info)
                FeedUrlMap(source=other.url, target=primary.url).save()
                primary.merge(other)

def setUp(self):
    feed = Feed(
        title='测试1',
        url='https://blog.example.com/feed1.xml',
        status=FeedStatus.READY,
        dt_updated=timezone.now(),
    )
    feed.save()
    tester = User.objects.create_superuser('tester', email=None, password='******')
    self._tester = tester

def _create_test_feed(url):
    feed = Feed.get_first_by_url(url)
    if not feed:
        now = timezone.now()
        feed = Feed(
            url=url,
            status=FeedStatus.DISCARD,
            reverse_url=reverse_url(url),
            title='蚁阅测试订阅',
            dt_updated=now,
            dt_checked=now,
            dt_synced=now,
        )
        feed.save()
    return feed

def do_clean_by_retention(ctx: ActorContext):
    retention = CONFIG.feed_story_retention
    feeds = Feed.take_retention_feeds(retention=retention, limit=50)
    LOG.info('found {} feeds need clean by retention'.format(len(feeds)))
    for feed in feeds:
        feed_id = feed['feed_id']
        url = feed['url']
        n = STORY_SERVICE.delete_by_retention(feed_id, retention=retention)
        LOG.info(f'deleted {n} storys of feed#{feed_id} {url} by retention')

def do_sync_story_fulltext(
    ctx: ActorContext,
    feed_id: T.int,
    offset: T.int,
) -> T.dict(
    feed_id=T.int,
    offset=T.int.min(0),
    use_proxy=T.bool,
    url=T.url,
    response_status=T.int,
    accept=T_ACCEPT,
):
    with log_django_context_metric('harbor_rss.sync_story_fulltext:read'):
        feed = Feed.get_by_pk(feed_id, detail='+use_proxy')
        story = STORY_SERVICE.get_by_offset(feed_id, offset, detail=True)
        assert story, f'story#{feed_id},{offset} not found'
        story_content_info = StoryContentInfo(story.content)
        num_sub_sentences = len(split_sentences(story_content_info.text))
    ret = dict(
        feed_id=feed_id,
        offset=offset,
        url=story.link,
        use_proxy=feed.use_proxy,
        accept=FulltextAcceptStrategy.REJECT.value,
    )
    try:
        result = ctx.ask('worker_rss.fetch_story', dict(
            url=story.link,
            use_proxy=feed.use_proxy,
            feed_id=feed_id,
            offset=offset,
            num_sub_sentences=num_sub_sentences,
        ))
    except _TIMEOUT_ERRORS as ex:
        LOG.error(f'Ask worker_rss.fetch_story timeout: {ex}')
        ret.update(response_status=FeedResponseStatus.CONNECTION_TIMEOUT)
        return ret
    else:
        ret.update(
            response_status=result['response_status'],
            use_proxy=result['use_proxy'],
        )
        if not result['content']:
            return ret
    with log_django_context_metric('harbor_rss.sync_story_fulltext:write'):
        accept = _update_story(
            story=story,
            story_content_info=story_content_info,
            content=result['content'],
            summary=None,  # no need to update summary
            url=result['url'],
            sentence_count=result['sentence_count'],
        )
        ret.update(accept=accept.value)
    return ret

def do_feed_detect_and_merge_duplicate(ctx: ActorContext):
    begin_time = time.time()
    checkpoint = None
    while True:
        found, checkpoint = Feed.find_duplicate_feeds(checkpoint=checkpoint)
        _feed_merge_duplicate(found)
        if not checkpoint:
            break
    cost = time.time() - begin_time
    LOG.info('feed_detect_and_merge_duplicate cost {:.1f}ms'.format(cost * 1000))

def do_check_feed(ctx: ActorContext):
    rand_sec = random.random() * CHECK_FEED_SECONDS / 10
    outdate_seconds = CHECK_FEED_SECONDS + rand_sec
    feeds = Feed.take_outdated_feeds(outdate_seconds)
    expire_at = time.time() + outdate_seconds
    LOG.info('found {} feeds need sync'.format(len(feeds)))
    for feed in feeds:
        ctx.hope('worker_rss.sync_feed', dict(
            feed_id=feed['feed_id'],
            url=feed['url'],
        ), expire_at=expire_at)

def do_update_feed_info(
    ctx: ActorContext,
    feed_id: T.int,
    feed: FeedInfoSchema,
):
    with transaction.atomic():
        feed_dict = feed
        feed = Feed.get_by_pk(feed_id)
        for k, v in feed_dict.items():
            setattr(feed, k, v)
        feed.dt_updated = timezone.now()
        feed.save()

def setUp(self):
    print('setUp')
    storys = []
    updated_storys = []
    now = timezone.datetime(2020, 6, 1, 12, 12, 12, tzinfo=timezone.utc)
    for i in range(200):
        dt = now + timezone.timedelta(minutes=i)
        content = f'test story content {i}' * (i % 5)
        content_hash_base64 = compute_hash_base64(content)
        summary = content[:30]
        story = {
            'unique_id': f'blog.example.com/{i}',
            'title': f'test story {i}',
            'content_hash_base64': content_hash_base64,
            'author': 'tester',
            'link': f'https://blog.example.com/{i}.html',
            'dt_published': dt,
            'dt_updated': dt,
            'summary': summary,
            'content': content,
        }
        storys.append(validate_story(story))
        updated_story = dict(story)
        updated_content = f'test story content updated {i}' * (i % 5 + 1)
        updated_story.update(
            content=updated_content,
            content_hash_base64=compute_hash_base64(updated_content),
        )
        updated_storys.append(validate_story(updated_story))
    self.storys = storys
    self.updated_storys = updated_storys
    feed = Feed(
        title='test feed',
        url='https://blog.example.com/feed.xml',
        status=FeedStatus.READY,
        dt_updated=timezone.now(),
    )
    feed.save()
    self.feed_id = feed.id

def update_feed_dryness(feeds=None):
    feed_ids = _get_feed_ids(feeds)
    LOG.info('total %s feeds', len(feed_ids))
    for feed_id in tqdm.tqdm(feed_ids, ncols=80, ascii=True):
        with transaction.atomic():
            feed = Feed.get_by_pk(feed_id)
            if feed.total_storys <= 0:
                continue
            cnt = feed.monthly_story_count
            if not cnt:
                Story.refresh_feed_monthly_story_count(feed_id)
                feed.refresh_from_db()
            feed.dryness = feed.monthly_story_count.dryness()
            feed.save()

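# The _get_feed_ids helper used above (and in update_feed_dt_first_story_published
# below) is not shown in this section. A minimal sketch of its expected contract,
# assuming it accepts an optional iterable of feed ids and falls back to all feeds;
# the real helper may resolve its input differently:
def _get_feed_ids(feeds=None):
    if feeds:
        # normalize whatever ids the caller passed in
        return [int(x) for x in feeds]
    # fall back to every feed id in the database
    return list(Feed.objects.values_list('id', flat=True))
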
def _import_feeds(self, imports: list):
    result = UnionFeed.create_by_imports(user_id=self._tester.id, imports=imports)
    for creation in result.feed_creations:
        creation: FeedCreation
        feed = Feed(
            title=creation.title,
            url=creation.url,
            status=FeedStatus.READY,
            dt_updated=timezone.now(),
        )
        feed.save()
        user_feed = UserFeed(
            user=self._tester,
            feed=feed,
            title=creation.title,
            group=creation.group,
            dt_updated=timezone.now(),
        )
        user_feed.save()
        FeedUrlMap(source=creation.url, target=feed.url).save()
        FeedUrlMap(source=creation.url + '.c', target=feed.url).save()
    return result

def delete_feed(key):
    try:
        key = int(key)
    except ValueError:
        pass  # ignore
    if isinstance(key, int):
        feed = Feed.get_by_pk(key)
    else:
        feed = Feed.objects.filter(
            Q(url__contains=key) | Q(title__contains=key)).first()
    if not feed:
        print(f'not found feed like {key}')
        return
    if click.confirm(f'delete {feed} ?'):
        feed.delete()

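# delete_feed uses click.confirm, which suggests it is exposed as a CLI command.
# A minimal sketch of how it could be wired up with click; the group and command
# names here are hypothetical and not taken from the source:
import click

@click.group()
def main():
    """Feed management commands."""

@main.command('delete-feed')
@click.argument('key')
def delete_feed_command(key):
    """Delete a feed by id, or by a url/title substring."""
    delete_feed(key)

if __name__ == '__main__':
    main()
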
def update_feed_dt_first_story_published(feeds=None):
    feed_ids = _get_feed_ids(feeds)
    LOG.info('total %s feeds', len(feed_ids))
    for feed_id in tqdm.tqdm(feed_ids, ncols=80, ascii=True):
        with transaction.atomic():
            feed = Feed.get_by_pk(feed_id)
            if feed.dt_first_story_published:
                continue
            if feed.total_storys <= 0:
                continue
            try:
                story = Story.get_by_offset(feed_id, 0, detail=True)
            except Story.DoesNotExist:
                LOG.warning(f'story feed_id={feed_id} offset=0 not exists')
                continue
            feed.dt_first_story_published = story.dt_published
            feed.save()

def test_get_feed_by_url(self):
    url = 'https://blog.example.com/feed.xml'
    got = Feed.get_first_by_url(url)
    self.assertEqual(got.title, 'test feed')

def do_feed_refresh_freeze_level(ctx: ActorContext):
    begin_time = time.time()
    Feed.refresh_freeze_level()
    cost = time.time() - begin_time
    LOG.info('feed_refresh_freeze_level cost {:.1f}ms'.format(cost * 1000))

def do_update_feed(
    ctx: ActorContext,
    feed_id: T.int,
    feed: FeedSchema,
    is_refresh: T.bool.default(False),
):
    with transaction.atomic():
        feed_dict = feed
        storys = feed_dict.pop('storys')
        feed = Feed.get_by_pk(feed_id)
        is_feed_url_changed = feed.url != feed_dict['url']
        if is_feed_url_changed:
            target_feed = Feed.get_first_by_url(feed_dict['url'])
            # FIXME: feed merge can not handle feed redirects correctly.
            # For this case, keep the old feed for now and solve it thoroughly later.
            # if target_feed:
            #     LOG.info(f'merge feed#{feed.id} url={feed.url} into '
            #              f'feed#{target_feed.id} url={target_feed.url}')
            #     target_feed.merge(feed)
            #     return
            if target_feed:
                LOG.warning(
                    f'FIXME: redirect feed#{feed.id} url={feed.url!r} into '
                    f'feed#{target_feed.id} url={target_feed.url!r}')
                feed_dict.pop('url')
        # only update dt_updated if has storys or feed fields updated
        is_feed_updated = bool(storys)
        for k, v in feed_dict.items():
            if k == 'dt_updated':
                continue
            if (v != '' and v is not None) or k in {'warnings'}:
                old_v = getattr(feed, k, None)
                if v != old_v:
                    is_feed_updated = True
                    setattr(feed, k, v)
        now = timezone.now()
        now_sub_30d = now - timezone.timedelta(days=30)
        if is_feed_updated:
            # set dt_updated to now, not trust rss date
            feed.dt_updated = now
        feed.dt_checked = feed.dt_synced = now
        feed.reverse_url = reverse_url(feed.url)
        feed.status = FeedStatus.READY
        feed.save()
    # save storys, bulk_save_by_feed has standalone transaction
    for s in storys:
        if not s['dt_updated']:
            s['dt_updated'] = now
        if not s['dt_published']:
            # set dt_published to now - 30d to avoid these storys
            # take over mushroom page, i.e. Story.query_recent_by_user
            s['dt_published'] = now_sub_30d
    modified_storys = STORY_SERVICE.bulk_save_by_feed(
        feed.id, storys, is_refresh=is_refresh)
    LOG.info('feed#%s save storys total=%s num_modified=%s',
             feed.id, len(storys), len(modified_storys))
    feed = Feed.get_by_pk(feed_id)
    is_freezed = feed.freeze_level is None or feed.freeze_level > 1
    if modified_storys and is_freezed:
        Feed.unfreeze_by_id(feed_id)
    need_fetch_story = _is_feed_need_fetch_storys(feed, modified_storys)
    for story in modified_storys:
        if not story.link:
            continue
        if need_fetch_story and (not _is_fulltext_story(story)):
            text = processor.story_html_to_text(story.content)
            num_sub_sentences = len(split_sentences(text))
            ctx.tell('worker_rss.fetch_story', dict(
                url=story.link,
                use_proxy=feed.use_proxy,
                feed_id=story.feed_id,
                offset=story.offset,
                num_sub_sentences=num_sub_sentences,
            ))

def assert_feed_total_storys(self, expect):
    total_storys = Feed.get_by_pk(self.feed_id).total_storys
    self.assertEqual(total_storys, expect)

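# assert_feed_total_storys pairs with the setUp above that prepares self.storys
# and self.feed_id. A minimal sketch of a test that could use it, assuming
# Story.bulk_save_by_feed keeps Feed.total_storys up to date as the other
# snippets imply; the test name and expectations are illustrative, not from the source:
def test_bulk_save_storys_updates_total(self):
    Story.bulk_save_by_feed(self.feed_id, self.storys[:10])
    self.assert_feed_total_storys(10)
    # saving the same storys again should not change the total
    Story.bulk_save_by_feed(self.feed_id, self.storys[:10])
    self.assert_feed_total_storys(10)
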
def do_update_feed(
    ctx: ActorContext,
    feed_id: T.int,
    feed: FeedSchema,
    is_refresh: T.bool.default(False),
):
    with transaction.atomic():
        feed_dict = feed
        storys = feed_dict.pop('storys')
        feed = Feed.get_by_pk(feed_id)
        is_feed_url_changed = feed.url != feed_dict['url']
        if is_feed_url_changed:
            target_feed = Feed.get_first_by_url(feed_dict['url'])
            if target_feed:
                LOG.info(f'merge feed#{feed.id} url={feed.url} into '
                         f'feed#{target_feed.id} url={target_feed.url}')
                target_feed.merge(feed)
                return
        # only update dt_updated if has storys or feed fields updated
        is_feed_updated = bool(storys)
        for k, v in feed_dict.items():
            if k == 'dt_updated':
                continue
            if v != '' and v is not None:
                old_v = getattr(feed, k, None)
                if v != old_v:
                    is_feed_updated = True
                    setattr(feed, k, v)
        now = timezone.now()
        now_sub_30d = now - timezone.timedelta(days=30)
        if is_feed_updated:
            # set dt_updated to now, not trust rss date
            feed.dt_updated = now
        feed.dt_checked = feed.dt_synced = now
        feed.status = FeedStatus.READY
        feed.save()
        for s in storys:
            if not s['dt_updated']:
                s['dt_updated'] = now
            if not s['dt_published']:
                # set dt_published to now - 30d to avoid these storys
                # take over mushroom page, i.e. Story.query_recent_by_user
                s['dt_published'] = now_sub_30d
        modified_storys = Story.bulk_save_by_feed(feed.id, storys, is_refresh=is_refresh)
        LOG.info(
            'feed#%s save storys total=%s num_modified=%s',
            feed.id, len(storys), len(modified_storys)
        )
        feed.refresh_from_db()
        if modified_storys:
            feed.unfreeze()
        need_fetch_story = _is_feed_need_fetch_storys(feed, modified_storys)
        for story in modified_storys:
            if not story.link:
                continue
            if need_fetch_story and (not is_fulltext_story(story)):
                text = processor.story_html_to_text(story.content)
                num_sub_sentences = len(split_sentences(text))
                ctx.tell('worker_rss.fetch_story', dict(
                    url=story.link,
                    use_proxy=feed.use_proxy,
                    story_id=str(story.id),
                    num_sub_sentences=num_sub_sentences,
                ))
            else:
                _detect_story_images(ctx, story)