def do_register(ctx: ActorContext, node: NodeSpecSchema):
    """Add *node* to the local registry, then best-effort ask for a check.

    Logs the incoming node spec, records it via ``ctx.registery.add``, and
    fires a ``registery.check`` message (``hope`` = fire-and-forget).
    """
    LOG.info(f'register node {node}')
    ctx.registery.add(node)
    check_payload = dict(node=node)
    ctx.hope('registery.check', check_payload)
def do_update_feed(
    ctx: ActorContext,
    feed_id: T.int,
    feed: FeedSchema,
    is_refresh: T.bool.default(False),
):
    """Persist a freshly synced feed and its storys.

    Inside one DB transaction: if the feed's URL changed and another feed
    already owns the new URL, merge this feed into it and stop; otherwise
    copy non-empty changed fields onto the Feed row and save it.
    After the transaction: normalize story timestamps, bulk-save storys,
    unfreeze the feed if new content arrived, and either dispatch
    ``worker_rss.fetch_story`` for stories that need full-text fetching or
    run image detection on them.
    """
    with transaction.atomic():
        feed_dict = feed
        storys = feed_dict.pop('storys')
        feed = Feed.get_by_pk(feed_id)
        is_feed_url_changed = feed.url != feed_dict['url']
        if is_feed_url_changed:
            # URL moved: if the destination URL already has a feed,
            # merge into it instead of creating a duplicate.
            target_feed = Feed.get_first_by_url(feed_dict['url'])
            if target_feed:
                LOG.info(f'merge feed#{feed.id} url={feed.url} into '
                         f'feed#{target_feed.id} url={target_feed.url}')
                target_feed.merge(feed)
                return
        # only update dt_updated if has storys or feed fields updated
        is_feed_updated = bool(storys)
        for k, v in feed_dict.items():
            if k == 'dt_updated':
                continue
            # Empty values don't overwrite existing data, except for
            # 'warnings' which may legitimately be cleared.
            if (v != '' and v is not None) or k in {'warnings'}:
                old_v = getattr(feed, k, None)
                if v != old_v:
                    is_feed_updated = True
                    setattr(feed, k, v)
        now = timezone.now()
        now_sub_30d = now - timezone.timedelta(days=30)
        if is_feed_updated:
            # set dt_updated to now, not trust rss date
            feed.dt_updated = now
        feed.dt_checked = feed.dt_synced = now
        feed.reverse_url = reverse_url(feed.url)
        feed.status = FeedStatus.READY
        feed.save()
    # save storys, bulk_save_by_feed has standalone transaction
    for s in storys:
        if not s['dt_updated']:
            s['dt_updated'] = now
        if not s['dt_published']:
            # set dt_published to now - 30d to avoid these storys
            # take over mushroom page, i.e. Story.query_recent_by_user
            s['dt_published'] = now_sub_30d
    modified_storys = STORY_SERVICE.bulk_save_by_feed(
        feed.id, storys, is_refresh=is_refresh)
    LOG.info(
        'feed#%s save storys total=%s num_modified=%s',
        feed.id, len(storys), len(modified_storys)
    )
    # NOTE(review): despite the name, this is True when the feed is NOT
    # deeply frozen (freeze_level None or > 1) — confirm against
    # Feed.unfreeze_by_id semantics.
    is_freezed = feed.freeze_level is None or feed.freeze_level > 1
    if modified_storys and is_freezed:
        Feed.unfreeze_by_id(feed_id)
    need_fetch_story = _is_feed_need_fetch_storys(feed, modified_storys)
    for story in modified_storys:
        if not story.link:
            # no link -> nothing to fetch and no page to scan for images
            continue
        if need_fetch_story and (not is_fulltext_story(story)):
            # Estimate sentence count so the fetcher can judge whether the
            # fetched page actually adds content over the summary.
            text = processor.story_html_to_text(story.content)
            num_sub_sentences = len(split_sentences(text))
            ctx.tell('worker_rss.fetch_story', dict(
                url=story.link,
                use_proxy=feed.use_proxy,
                feed_id=story.feed_id,
                offset=story.offset,
                num_sub_sentences=num_sub_sentences,
            ))
        else:
            _detect_story_images(ctx, story)
def do_sync_feed(
    ctx: ActorContext,
    feed_id: T.int,
    url: T.url,
    use_proxy: T.bool.default(False),
    checksum_data: T.bytes.maxlen(4096).optional,
    content_hash_base64: T.str.optional,
    etag: T.str.optional,
    last_modified: T.str.optional,
    is_refresh: T.bool.default(False),
):
    """Fetch and parse one feed, then hand the result to the harbor.

    Reads the feed URL (honoring etag/last-modified unless *is_refresh*),
    retries through the RSS proxy when the direct response status calls for
    it, skips unchanged content by hash comparison, parses the response,
    and finally tells ``harbor_rss.update_feed`` with the parsed feed.
    Returns early (without telling) on fetch/parse failure or no change.
    """
    params = {}
    if not is_refresh:
        # conditional-GET headers are skipped on forced refresh
        params = dict(etag=etag, last_modified=last_modified)
    options = _get_proxy_options()
    options.update(allow_private_address=CONFIG.allow_private_address)
    if DNS_SERVICE.is_resolved_url(url):
        # locally resolvable host: proxy unnecessary
        use_proxy = False
    with FeedReader(**options) as reader:
        # proxy only usable when the reader actually has one configured
        use_proxy = reader.has_rss_proxy and use_proxy
        response = reader.read(url, **params, use_proxy=use_proxy)
        LOG.info(
            f'read feed#{feed_id} url={unquote(url)} status={response.status}')
        need_proxy = FeedResponseStatus.is_need_proxy(response.status)
        if (not use_proxy) and reader.has_rss_proxy and need_proxy:
            # direct read failed in a proxy-fixable way: retry via proxy,
            # keep the proxy response only if it succeeded
            LOG.info(f'try use proxy read feed#{feed_id} url={unquote(url)}')
            proxy_response = reader.read(url, **params, use_proxy=True)
            LOG.info(
                f'proxy read feed#{feed_id} url={unquote(url)} status={proxy_response.status}'
            )
            if proxy_response.ok:
                response = proxy_response
    if (not response.ok) or (not response.content):
        return
    new_hash = compute_hash_base64(response.content)
    if (not is_refresh) and (new_hash == content_hash_base64):
        # content unchanged since last sync: nothing to do
        LOG.info(
            f'feed#{feed_id} url={unquote(url)} not modified by compare content hash!'
        )
        return
    LOG.info(f'parse feed#{feed_id} url={unquote(url)}')
    try:
        raw_result = RawFeedParser().parse(response)
    except FeedParserError as ex:
        LOG.warning('failed parse feed#%s url=%r: %s', feed_id, unquote(url), ex)
        return
    if raw_result.warnings:
        warnings = '; '.join(raw_result.warnings)
        LOG.warning('warning parse feed#%s url=%r: %s', feed_id, unquote(url), warnings)
    try:
        feed = _parse_found(
            (response, raw_result),
            checksum_data=checksum_data, is_refresh=is_refresh)
    except (Invalid, FeedParserError) as ex:
        LOG.error('invalid feed#%s url=%r: %s',
                  feed_id, unquote(url), ex, exc_info=ex)
        return
    ctx.tell('harbor_rss.update_feed', dict(
        feed_id=feed_id, feed=feed, is_refresh=is_refresh))
def do_load_registery(ctx: ActorContext):
    """Log the registry spec for this node and trigger a broadcast.

    Pretty-prints the current registry spec for the local registry node,
    then tells the scheduler to broadcast it.
    """
    node_name = ctx.registery.registery_node.name
    spec_text = pretty_format_json(ctx.registery.to_spec())
    message = f'load registery info for {node_name}:\n' + spec_text
    LOG.info(message)
    ctx.tell('scheduler.boardcast_registery')
def do_save_registery(ctx: ActorContext):
    """Log that registry info is being saved, then trigger a broadcast."""
    node_name = ctx.registery.registery_node.name
    LOG.info('save registery info for {}'.format(node_name))
    ctx.tell('scheduler.boardcast_registery')
def do_save_feed_creation_result(
    ctx: ActorContext,
    feed_creation_id: T.int,
    messages: T.list(T.str),
    feed: FeedSchema.optional,
):
    """Record the outcome of a feed-creation attempt.

    On failure (*feed* empty): mark the FeedCreation ERROR and map its URL
    to NOT_FOUND. On success: find or create the Feed row, mark the
    FeedCreation READY, create the UserFeed subscription if missing, write
    FeedUrlMap entries, and hand the parsed feed to
    ``harbor_rss.update_feed``. All DB work runs in one transaction.
    """
    with transaction.atomic():
        feed_dict = feed
        try:
            feed_creation = FeedCreation.get_by_pk(feed_creation_id)
        except FeedCreation.DoesNotExist:
            LOG.warning(f'feed creation {feed_creation_id} not exists')
            return
        if feed_creation.status == FeedStatus.READY:
            # already finished by an earlier message; idempotent no-op
            LOG.info(f'feed creation {feed_creation_id} is ready')
            return
        feed_creation.message = '\n\n'.join(messages)
        feed_creation.dt_updated = timezone.now()
        if not feed_dict:
            # creation failed: remember the URL as a dead end
            feed_creation.status = FeedStatus.ERROR
            feed_creation.save()
            FeedUrlMap(source=feed_creation.url, target=FeedUrlMap.NOT_FOUND).save()
            return
        url = feed_dict['url']
        feed = Feed.get_first_by_url(url)
        if not feed:
            now = timezone.now()
            feed = Feed(
                url=url, status=FeedStatus.READY, reverse_url=reverse_url(url),
                title=feed_dict['title'],
                dt_updated=now, dt_checked=now, dt_synced=now)
            feed.save()
        feed_creation.status = FeedStatus.READY
        feed_creation.feed_id = feed.id
        feed_creation.save()
        user_feed = UserFeed.objects.filter(
            user_id=feed_creation.user_id, feed_id=feed.id).first()
        if user_feed:
            LOG.info('UserFeed#{} user_id={} feed_id={} already exists'.format(
                user_feed.id, feed_creation.user_id, feed.id))
        else:
            # only set UserFeed.title when import title not equal feed title
            title = None
            if feed_creation.title and feed_creation.title != feed.title:
                title = feed_creation.title
            user_feed = UserFeed(
                user_id=feed_creation.user_id,
                feed_id=feed.id,
                title=title,
                group=feed_creation.group,
                is_from_bookmark=feed_creation.is_from_bookmark,
            )
            user_feed.save()
        FeedUrlMap(source=feed_creation.url, target=feed.url).save()
        if feed.url != feed_creation.url:
            # NOTE(review): maps feed.url onto itself — presumably so the
            # canonical URL also resolves via FeedUrlMap; confirm intent.
            FeedUrlMap(source=feed.url, target=feed.url).save()
    # dispatched after the transaction so the saved rows are committed
    ctx.hope('harbor_rss.update_feed', dict(
        feed_id=feed.id,
        feed=validate_feed_output(feed_dict),
    ))
def do_local_ask(ctx: ActorContext) -> T.dict(message=T.str):
    """Relay a local ask to the async worker and return its reply.

    Logs the incoming message, synchronously asks
    ``worker.async_local_ask``, logs the reply, and returns it.
    """
    LOG.info(ctx.message)
    reply = ctx.ask('worker.async_local_ask')
    LOG.info(reply)
    return reply