def run():
    with transaction.atomic():
        feed_ids = [feed.id for feed in Feed.objects.only('id').all()]
        LOG.info('total %s feeds', len(feed_ids))
        for feed_id in tqdm.tqdm(feed_ids, ncols=80, ascii=True):
            storys = query_old_storys_by_feed(feed_id)
            Story.bulk_save_by_feed(feed_id, storys)

def update_story_is_user_marked():
    # exclude() with multiple kwargs ANDs the conditions, so this drops only
    # rows where both flags are False, i.e. keeps storys that are watched
    # or favorited.
    user_storys = list(
        UserStory.objects.exclude(is_watched=False, is_favorited=False).all())
    LOG.info('total %s user marked storys', len(user_storys))
    if not user_storys:
        return
    for user_story in tqdm.tqdm(user_storys, ncols=80, ascii=True):
        Story.set_user_marked_by_id(user_story.story_id)

def update_feed_story_publish_period(feeds=None):
    """
    Deprecated since v3.1
    """
    with transaction.atomic():
        feed_ids = _get_feed_ids(feeds)
        LOG.info('total %s feeds', len(feed_ids))
        for feed_id in tqdm.tqdm(feed_ids, ncols=80, ascii=True):
            Story.update_feed_story_publish_period(feed_id)

def update_feed_dryness(feeds=None):
    feed_ids = _get_feed_ids(feeds)
    LOG.info('total %s feeds', len(feed_ids))
    for feed_id in tqdm.tqdm(feed_ids, ncols=80, ascii=True):
        with transaction.atomic():
            feed = Feed.get_by_pk(feed_id)
            if feed.total_storys <= 0:
                continue
            cnt = feed.monthly_story_count
            if not cnt:
                Story.refresh_feed_monthly_story_count(feed_id)
                feed.refresh_from_db()
            feed.dryness = feed.monthly_story_count.dryness()
            feed.save()

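# Illustrative only: a hypothetical dryness score, assuming "dryness" grows as
# a feed publishes fewer storys per month. The actual
# MonthlyStoryCount.dryness() used above may compute something different; the
# name and formula below are assumptions for illustration, not the real code.
import math


def _example_dryness(avg_storys_per_month: float, max_score: int = 1000) -> int:
    """Map average monthly story count to a 0..max_score dryness value (hypothetical)."""
    if avg_storys_per_month <= 0:
        return max_score
    score = int(max_score / (1 + math.log1p(avg_storys_per_month) * 100))
    return max(0, min(max_score, score))
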
def fix_feed_total_storys(dry_run=False):
    incorrect_feeds = Story.query_feed_incorrect_total_storys()
    LOG.info('total %s incorrect feeds', len(incorrect_feeds))
    header = ['feed_id', 'total_storys', 'correct_total_storys']
    click.echo(format_table(incorrect_feeds, header=header))
    if dry_run:
        return
    with transaction.atomic():
        num_corrected = 0
        for feed_id, *__ in tqdm.tqdm(incorrect_feeds, ncols=80, ascii=True):
            fixed = Story.fix_feed_total_storys(feed_id)
            if fixed:
                num_corrected += 1
        LOG.info('correct %s feeds', num_corrected)

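# Illustrative only: a sketch of the kind of query that could find feeds whose
# Feed.total_storys disagrees with the actual number of story rows, assuming a
# Django ORM Count aggregation over the story's feed foreign key. The real
# Story.query_feed_incorrect_total_storys may be implemented differently
# (e.g. raw SQL); this helper name is hypothetical.
from django.db.models import Count


def _example_query_incorrect_total_storys():
    """Yield (feed_id, recorded_total, actual_total) tuples (hypothetical sketch)."""
    rows = (
        Story.objects.values('feed_id')
        .annotate(actual_total=Count('id'))
        .values_list('feed_id', 'actual_total')
    )
    actual_by_feed = dict(rows)
    for feed in Feed.objects.only('id', 'total_storys'):
        actual = actual_by_feed.get(feed.id, 0)
        if feed.total_storys != actual:
            yield feed.id, feed.total_storys, actual
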
def test_delete_by_retention(self):
    storys_0_30 = self.storys[:30]
    modified = Story.bulk_save_by_feed(self.feed_id, storys_0_30, batch_size=10)
    self.assertEqual(len(modified), 30)
    self.assert_feed_total_storys(30)
    self.assert_total_story_infos(0)

    storys_20_50 = self.storys[20:50]
    modified = STORY_SERVICE.bulk_save_by_feed(self.feed_id, storys_20_50, batch_size=10)
    self.assertEqual(len(modified), 20)
    self.assert_feed_total_storys(50)
    self.assert_total_story_infos(20)

    n = STORY_SERVICE.delete_by_retention(self.feed_id, retention=10, limit=10)
    self.assertEqual(n, 10)
    self.assert_feed_total_storys(50)
    self.assert_total_story_infos(20)

    n = STORY_SERVICE.delete_by_retention(self.feed_id, retention=10, limit=50)
    self.assertEqual(n, 30)
    self.assert_feed_total_storys(50)
    self.assert_total_story_infos(10)

def do_clean_by_retention(ctx: ActorContext):
    retention = CONFIG.feed_story_retention
    feeds = Feed.take_retention_feeds(retention=retention)
    LOG.info('found {} feeds need clean by retention'.format(len(feeds)))
    for feed in feeds:
        feed_id = feed['feed_id']
        url = feed['url']
        n = Story.delete_by_retention(feed_id, retention=retention)
        LOG.info(f'deleted {n} storys of feed#{feed_id} {url} by retention')

def fix_story_offset(feeds=None):
    with transaction.atomic():
        feed_ids = _get_feed_ids(feeds)
        LOG.info('total %s feeds', len(feed_ids))
        num_fixed = 0
        for feed_id in tqdm.tqdm(feed_ids, ncols=80, ascii=True):
            num_reallocate = Story.reallocate_offset(feed_id)
            if num_reallocate > 0:
                num_fixed += 1
        LOG.info('correct %s feeds', num_fixed)

def test_story_dt_and_content_length(self):
    dt = timezone.datetime(2019, 6, 1, 12, 12, 12, tzinfo=timezone.utc)
    story = {
        'unique_id': f'blog.example.com/1',
        'title': f'test story 1',
        'dt_published': dt,
        'dt_updated': dt,
    }
    modified = Story.bulk_save_by_feed(self.feed_id, [validate_story(story)], batch_size=10)
    self.assertEqual(len(modified), 1)
    self.assert_feed_total_storys(1)
    self.assert_total_story_infos(0)
    dt_created = modified[0].dt_created
    dt_published = modified[0].dt_published
    assert modified[0].dt_updated == dt

    dt = dt + timezone.timedelta(days=1)
    updated_content = 'updated_content 1'
    story.update(
        content=updated_content,
        content_hash_base64=compute_hash_base64(updated_content),
        dt_published=dt,
        dt_updated=dt,
    )
    modified = STORY_SERVICE.bulk_save_by_feed(self.feed_id, [validate_story(story)], batch_size=10)
    self.assertEqual(len(modified), 1)
    self.assert_feed_total_storys(1)
    self.assert_total_story_infos(1)
    assert modified[0].dt_created == dt_created
    assert modified[0].dt_published == dt_published
    assert modified[0].dt_updated == dt
    assert modified[0].content_length == len(updated_content)

    dt = dt + timezone.timedelta(days=2)
    updated_content = 'updated_content 22'
    story.update(
        content=updated_content,
        content_hash_base64=compute_hash_base64(updated_content),
        dt_published=dt,
        dt_updated=dt,
    )
    modified = STORY_SERVICE.bulk_save_by_feed(self.feed_id, [validate_story(story)], batch_size=10)
    self.assertEqual(len(modified), 1)
    self.assert_feed_total_storys(1)
    self.assert_total_story_infos(1)
    assert modified[0].dt_created == dt_created
    assert modified[0].dt_published == dt_published
    assert modified[0].dt_updated == dt
    assert modified[0].content_length == len(updated_content)

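# Illustrative only: a minimal sketch of what a content-hash helper analogous
# to compute_hash_base64 could look like, assuming a SHA-1 digest encoded as
# URL-safe base64. The real helper in this repo may use a different digest or
# encoding; the function name below is a hypothetical stand-in.
import base64
import hashlib


def _example_compute_hash_base64(content: str) -> str:
    """Return a deterministic, compact hash of story content (hypothetical)."""
    digest = hashlib.sha1(content.encode('utf-8')).digest()
    return base64.urlsafe_b64encode(digest).decode('ascii')
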
def do_update_feed(
    ctx: ActorContext,
    feed_id: T.int,
    feed: FeedSchema,
    is_refresh: T.bool.default(False).desc('Deprecated'),
):
    with transaction.atomic():
        feed_dict = feed
        storys = feed_dict.pop('storys')
        feed = Feed.get_by_pk(feed_id)
        is_feed_url_changed = feed.url != feed_dict['url']
        if is_feed_url_changed:
            target_feed = Feed.get_first_by_url(feed_dict['url'])
            if target_feed:
                LOG.info(f'merge feed#{feed.id} url={feed.url} into '
                         f'feed#{target_feed.id} url={target_feed.url}')
                target_feed.merge(feed)
                return
        for k, v in feed_dict.items():
            if v != '' and v is not None:
                setattr(feed, k, v)
        now = timezone.now()
        now_sub_30d = now - timezone.timedelta(days=30)
        if not feed.dt_updated:
            feed.dt_updated = now
        feed.dt_checked = feed.dt_synced = now
        feed.status = FeedStatus.READY
        feed.save()
        for s in storys:
            if not s['dt_updated']:
                s['dt_updated'] = now
            if not s['dt_published']:
                # set dt_published to now - 30d to avoid these storys
                # take over mushroom page, i.e. Story.query_recent_by_user
                s['dt_published'] = now_sub_30d
        modified_storys = Story.bulk_save_by_feed(feed.id, storys)
        LOG.info(
            'feed#%s save storys total=%s num_modified=%s',
            feed.id, len(storys), len(modified_storys)
        )
    feed.refresh_from_db()
    if modified_storys:
        feed.unfreeze()
    need_fetch_story = _is_feed_need_fetch_storys(feed)
    for story in modified_storys:
        if not story.link:
            continue
        if need_fetch_story and (not is_fulltext_story(feed, story)):
            ctx.tell('worker_rss.fetch_story', dict(
                url=story.link,
                story_id=str(story.id),
            ))
        else:
            _detect_story_images(ctx, story)

def update_feed_dt_first_story_published(feeds=None):
    # backfill dt_first_story_published from the oldest story (offset 0)
    feed_ids = _get_feed_ids(feeds)
    LOG.info('total %s feeds', len(feed_ids))
    for feed_id in tqdm.tqdm(feed_ids, ncols=80, ascii=True):
        with transaction.atomic():
            feed = Feed.get_by_pk(feed_id)
            if feed.dt_first_story_published:
                continue
            if feed.total_storys <= 0:
                continue
            try:
                story = Story.get_by_offset(feed_id, 0, detail=True)
            except Story.DoesNotExist:
                LOG.warning(f'story feed_id={feed_id} offset=0 not exists')
                continue
            feed.dt_first_story_published = story.dt_published
            feed.save()

def test_mix_bulk_save_by_feed(self):
    storys_0_30 = self.storys[:30]
    modified = Story.bulk_save_by_feed(self.feed_id, storys_0_30, batch_size=10)
    self.assertEqual(len(modified), 30)
    self.assert_feed_total_storys(30)
    self.assert_total_story_infos(0)

    storys_10_50 = self.updated_storys[10:30] + self.storys[30:50]
    modified = STORY_SERVICE.bulk_save_by_feed(self.feed_id, storys_10_50, batch_size=10)
    self.assertEqual(len(modified), 40)
    self.assert_feed_total_storys(50)
    self.assert_total_story_infos(40)

    storys_40_60 = self.storys[40:60]
    modified = STORY_SERVICE.bulk_save_by_feed(self.feed_id, storys_40_60, batch_size=10)
    self.assertEqual(len(modified), 10)
    self.assert_feed_total_storys(60)
    self.assert_total_story_infos(50)

def update_feed_monthly_story_count(feeds=None):
    feed_ids = _get_feed_ids(feeds)
    LOG.info('total %s feeds', len(feed_ids))
    for feed_id in tqdm.tqdm(feed_ids, ncols=80, ascii=True):
        with transaction.atomic():
            Story.refresh_feed_monthly_story_count(feed_id)

def do_update_feed(
    ctx: ActorContext,
    feed_id: T.int,
    feed: FeedSchema,
    is_refresh: T.bool.default(False),
):
    with transaction.atomic():
        feed_dict = feed
        storys = feed_dict.pop('storys')
        feed = Feed.get_by_pk(feed_id)
        is_feed_url_changed = feed.url != feed_dict['url']
        if is_feed_url_changed:
            target_feed = Feed.get_first_by_url(feed_dict['url'])
            if target_feed:
                LOG.info(f'merge feed#{feed.id} url={feed.url} into '
                         f'feed#{target_feed.id} url={target_feed.url}')
                target_feed.merge(feed)
                return
        # only update dt_updated if has storys or feed fields updated
        is_feed_updated = bool(storys)
        for k, v in feed_dict.items():
            if k == 'dt_updated':
                continue
            if (v != '' and v is not None) or k in {'warnings'}:
                old_v = getattr(feed, k, None)
                if v != old_v:
                    is_feed_updated = True
                    setattr(feed, k, v)
        now = timezone.now()
        now_sub_30d = now - timezone.timedelta(days=30)
        if is_feed_updated:
            # set dt_updated to now, not trust rss date
            feed.dt_updated = now
        feed.dt_checked = feed.dt_synced = now
        feed.status = FeedStatus.READY
        feed.save()
        for s in storys:
            if not s['dt_updated']:
                s['dt_updated'] = now
            if not s['dt_published']:
                # set dt_published to now - 30d to avoid these storys
                # take over mushroom page, i.e. Story.query_recent_by_user
                s['dt_published'] = now_sub_30d
        modified_storys = Story.bulk_save_by_feed(feed.id, storys, is_refresh=is_refresh)
        LOG.info('feed#%s save storys total=%s num_modified=%s',
                 feed.id, len(storys), len(modified_storys))
    feed.refresh_from_db()
    if modified_storys:
        feed.unfreeze()
    need_fetch_story = _is_feed_need_fetch_storys(feed, modified_storys)
    for story in modified_storys:
        if not story.link:
            continue
        if need_fetch_story and (not is_fulltext_story(story)):
            text = processor.story_html_to_text(story.content)
            num_sub_sentences = len(split_sentences(text))
            ctx.tell('worker_rss.fetch_story', dict(
                url=story.link,
                use_proxy=feed.use_proxy,
                story_id=str(story.id),
                num_sub_sentences=num_sub_sentences,
            ))
        else:
            _detect_story_images(ctx, story)

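# Illustrative only: a rough sketch of how a sub-sentence count like
# num_sub_sentences could be derived from story HTML, assuming a naive
# tag-strip plus punctuation split. The real story_html_to_text and
# split_sentences helpers used above are likely more sophisticated; the
# function names below are hypothetical stand-ins, not the repo's API.
import re


def _example_story_html_to_text(html: str) -> str:
    """Strip tags and collapse whitespace (hypothetical helper)."""
    text = re.sub(r'<[^>]+>', ' ', html)
    return re.sub(r'\s+', ' ', text).strip()


def _example_split_sentences(text: str) -> list:
    """Split text on common sentence-ending punctuation (hypothetical helper)."""
    parts = re.split(r'[.!?。！？;；]+', text)
    return [p.strip() for p in parts if p.strip()]
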