def do_find_feed(
        ctx: ActorContext,
        feed_creation_id: T.int,
        url: T.url,
):
    # immediately send message to update status
    ctx.ask('harbor_rss.update_feed_creation_status', dict(
        feed_creation_id=feed_creation_id,
        status=FeedStatus.UPDATING,
    ))
    messages = []

    def message_handler(msg):
        LOG.info(msg)
        messages.append(msg)

    options = dict(message_handler=message_handler, **_get_proxy_options())
    options.update(allow_private_address=CONFIG.allow_private_address)
    with FeedFinder(url, **options) as finder:
        found = finder.find()
    try:
        feed = _parse_found(found) if found else None
    except (Invalid, FeedParserError) as ex:
        LOG.error('invalid feed url=%r: %s', unquote(url), ex, exc_info=ex)
        message_handler(f'invalid feed: {ex}')
        feed = None
    ctx.tell('harbor_rss.save_feed_creation_result', dict(
        feed_creation_id=feed_creation_id,
        messages=messages,
        feed=feed,
    ))

def do_find_feed(
        ctx: ActorContext,
        feed_creation_id: T.int,
        url: T.url,
):
    # immediately send message to update status
    ctx.ask(
        'harbor_rss.update_feed_creation_status',
        dict(
            feed_creation_id=feed_creation_id,
            status=FeedStatus.UPDATING,
        ))
    messages = []

    def message_handler(msg):
        LOG.info(msg)
        messages.append(msg)

    options = dict(message_handler=message_handler, **_get_proxy_options())
    with FeedFinder(url, **options) as finder:
        found = finder.find()
    try:
        feed = _parse_found(found) if found else None
    except Invalid as ex:
        message_handler(f'invalid feed: {ex}')
        feed = None
    ctx.tell(
        'harbor_rss.save_feed_creation_result',
        dict(
            feed_creation_id=feed_creation_id,
            messages=messages,
            feed=feed,
        ))

def do_save_registery(ctx: ActorContext):
    LOG.info('save registery info for {}'.format(
        ctx.registery.registery_node.name))
    registery_node = ctx.registery.registery_node.to_spec()
    nodes = ctx.registery.to_spec()
    Registery.create_or_update(registery_node, nodes)
    ctx.tell('scheduler.boardcast_registery')

def do_sync_feed(
        ctx: ActorContext,
        feed_id: T.int,
        url: T.url,
        content_hash_base64: T.str.optional,
        etag: T.str.optional,
        last_modified: T.str.optional,
):
    params = dict(etag=etag, last_modified=last_modified)
    with FeedReader() as reader:
        status_code, response = reader.read(url, **params)
    LOG.info(f'read feed#{feed_id} url={unquote(url)} status_code={status_code}')
    if status_code != 200 or not response:
        return
    new_hash = compute_hash_base64(response.content)
    if new_hash == content_hash_base64:
        LOG.info(f'feed#{feed_id} url={unquote(url)} not modified by compare content hash!')
        return
    LOG.info(f'parse feed#{feed_id} url={unquote(url)}')
    parsed = FeedParser.parse_response(response)
    if parsed.bozo:
        LOG.warning(f'failed parse feed#{feed_id} url={unquote(url)}: {parsed.bozo_exception}')
        return
    try:
        feed = _parse_found(parsed)
    except Invalid as ex:
        LOG.warning(f'invalid feed#{feed_id} url={unquote(url)}: {ex}', exc_info=ex)
        return
    ctx.tell('harbor_rss.update_feed', dict(feed_id=feed_id, feed=feed))

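# Hedged sketch, not part of this module: compute_hash_base64 above is only used
# to decide whether the fetched body differs from the previously stored one. Its
# real implementation lives elsewhere in the project; assuming it is a plain
# digest-and-encode helper, a minimal equivalent could look like this (the choice
# of SHA-256 is an assumption, the actual algorithm may differ):
import base64
import hashlib


def compute_hash_base64(content: bytes) -> str:
    # hash the raw response body and return a base64 string that can be
    # compared with the stored content_hash_base64 value
    return base64.standard_b64encode(hashlib.sha256(content).digest()).decode('ascii')
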
def do_ping(ctx: ActorContext, message: T.str) -> T.dict(message=T.str):
    LOG.info(ctx.message)
    r = ctx.ask('registery.query')
    LOG.info(r)
    ctx.tell('worker.pong', dict(message=message))
    if message == 'error':
        raise ValueError(message)
    return dict(message=message)

def do_dns_service_refresh(ctx: ActorContext):
    DNS_SERVICE.refresh()
    records = {}
    for host, ip_set in DNS_SERVICE.records.items():
        records[host] = list(ip_set)
    msg = dict(records=records)
    expire_at = time.time() + 60 * 60
    for node in ctx.registery.remote_nodes:
        ctx.tell(
            'actor.dns_service_update', msg,
            dst_node=node.name, expire_at=expire_at)

def do_update_feed(
        ctx: ActorContext,
        feed_id: T.int,
        feed: FeedSchema,
        is_refresh: T.bool.default(False).desc('Deprecated'),
):
    with transaction.atomic():
        feed_dict = feed
        storys = feed_dict.pop('storys')
        feed = Feed.get_by_pk(feed_id)
        is_feed_url_changed = feed.url != feed_dict['url']
        if is_feed_url_changed:
            target_feed = Feed.get_first_by_url(feed_dict['url'])
            if target_feed:
                LOG.info(f'merge feed#{feed.id} url={feed.url} into '
                         f'feed#{target_feed.id} url={target_feed.url}')
                target_feed.merge(feed)
                return
        for k, v in feed_dict.items():
            if v != '' and v is not None:
                setattr(feed, k, v)
        now = timezone.now()
        now_sub_30d = now - timezone.timedelta(days=30)
        if not feed.dt_updated:
            feed.dt_updated = now
        feed.dt_checked = feed.dt_synced = now
        feed.status = FeedStatus.READY
        feed.save()
        for s in storys:
            if not s['dt_updated']:
                s['dt_updated'] = now
            if not s['dt_published']:
                # set dt_published to now - 30d to avoid these storys
                # take over mushroom page, i.e. Story.query_recent_by_user
                s['dt_published'] = now_sub_30d
        modified_storys = Story.bulk_save_by_feed(feed.id, storys)
        LOG.info(
            'feed#%s save storys total=%s num_modified=%s',
            feed.id, len(storys), len(modified_storys)
        )
        feed.refresh_from_db()
        if modified_storys:
            feed.unfreeze()
        need_fetch_story = _is_feed_need_fetch_storys(feed)
        for story in modified_storys:
            if not story.link:
                continue
            if need_fetch_story and (not is_fulltext_story(feed, story)):
                ctx.tell('worker_rss.fetch_story', dict(
                    url=story.link,
                    story_id=str(story.id)
                ))
            else:
                _detect_story_images(ctx, story)

def do_load_registery(ctx: ActorContext):
    registery_node = ctx.registery.registery_node.name
    LOG.info(f'load registery info for {registery_node}')
    registery = Registery.get(registery_node)
    if registery:
        ctx.registery.update(registery.node_specs)
        title = 'loaded'
    else:
        title = 'current'
    LOG.info(f'{title} registery info:\n' +
             pretty_format_json(ctx.registery.to_spec()))
    ctx.tell('scheduler.boardcast_registery')

def do_sync_feed(
        ctx: ActorContext,
        feed_id: T.int,
        url: T.url,
        use_proxy: T.bool.default(False),
        content_hash_base64: T.str.optional,
        etag: T.str.optional,
        last_modified: T.str.optional,
):
    params = dict(etag=etag, last_modified=last_modified, use_proxy=use_proxy)
    options = _get_proxy_options()
    options.update(allow_private_address=CONFIG.allow_private_address)
    with FeedReader(**options) as reader:
        response = reader.read(url, **params)
    LOG.info(
        f'read feed#{feed_id} url={unquote(url)} response.status={response.status}'
    )
    if response.status != 200 or not response.content:
        return
    new_hash = compute_hash_base64(response.content)
    if new_hash == content_hash_base64:
        LOG.info(
            f'feed#{feed_id} url={unquote(url)} not modified by compare content hash!'
        )
        return
    LOG.info(f'parse feed#{feed_id} url={unquote(url)}')
    try:
        raw_result = RawFeedParser().parse(response)
    except FeedParserError as ex:
        LOG.warning('failed parse feed#%s url=%r: %s', feed_id, unquote(url), ex)
        return
    if raw_result.warnings:
        warnings = '; '.join(raw_result.warnings)
        LOG.warning('warning parse feed#%s url=%r: %s', feed_id, unquote(url), warnings)
        return
    try:
        feed = _parse_found((response, raw_result))
    except (Invalid, FeedParserError) as ex:
        LOG.error('invalid feed#%s url=%r: %s', feed_id, unquote(url), ex, exc_info=ex)
        return
    ctx.tell('harbor_rss.update_feed', dict(feed_id=feed_id, feed=feed))

def do_sync_feed(
        ctx: ActorContext,
        feed_id: T.int,
        url: T.url,
        use_proxy: T.bool.default(False),
        checksum_data: T.bytes.maxlen(4096).optional,
        content_hash_base64: T.str.optional,
        etag: T.str.optional,
        last_modified: T.str.optional,
        is_refresh: T.bool.default(False),
):
    params = {}
    if not is_refresh:
        params = dict(etag=etag, last_modified=last_modified)
    options = _proxy_helper.get_proxy_options()
    if DNS_SERVICE.is_resolved_url(url):
        use_proxy = False
    switch_prob = 0.25  # the prob of switch from use proxy to not use proxy
    with FeedReader(**options) as reader:
        use_proxy = reader.has_proxy and use_proxy
        if use_proxy and random.random() < switch_prob:
            use_proxy = False
        response = reader.read(url, **params, use_proxy=use_proxy)
        LOG.info(
            f'read feed#{feed_id} url={unquote(url)} status={response.status}')
        need_proxy = FeedResponseStatus.is_need_proxy(response.status)
        if (not use_proxy) and reader.has_proxy and need_proxy:
            LOG.info(f'try use proxy read feed#{feed_id} url={unquote(url)}')
            proxy_response = reader.read(url, **params, use_proxy=True)
            LOG.info(
                f'proxy read feed#{feed_id} url={unquote(url)} status={proxy_response.status}'
            )
            if proxy_response.ok:
                response = proxy_response
    if (not response.ok) or (not response.content):
        status = FeedStatus.READY if response.status == 304 else FeedStatus.ERROR
        _update_feed_info(ctx, feed_id, status=status, response=response)
        return
    new_hash = compute_hash_base64(response.content)
    if (not is_refresh) and (new_hash == content_hash_base64):
        LOG.info(
            f'feed#{feed_id} url={unquote(url)} not modified by compare content hash!'
        )
        _update_feed_info(ctx, feed_id, response=response)
        return
    LOG.info(f'parse feed#{feed_id} url={unquote(url)}')
    try:
        raw_result = RawFeedParser().parse(response)
    except FeedParserError as ex:
        LOG.warning('failed parse feed#%s url=%r: %s', feed_id, unquote(url), ex)
        _update_feed_info(
            ctx, feed_id, status=FeedStatus.ERROR, response=response, warnings=str(ex))
        return
    if raw_result.warnings:
        warnings = '; '.join(raw_result.warnings)
        LOG.warning('warning parse feed#%s url=%r: %s', feed_id, unquote(url), warnings)
    try:
        feed = _parse_found(
            (response, raw_result),
            checksum_data=checksum_data, is_refresh=is_refresh)
    except (Invalid, FeedParserError) as ex:
        LOG.error('invalid feed#%s url=%r: %s', feed_id, unquote(url), ex, exc_info=ex)
        _update_feed_info(
            ctx, feed_id, status=FeedStatus.ERROR, response=response, warnings=str(ex))
        return
    ctx.tell('harbor_rss.update_feed', dict(
        feed_id=feed_id, feed=feed, is_refresh=is_refresh))

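# Hedged sketch, for illustration only: the etag / last_modified parameters passed
# to reader.read() and the 304 handling above follow standard HTTP conditional
# request semantics. FeedReader's internals are not shown here; with plain
# `requests` the equivalent conditional fetch would look roughly like this
# (conditional_get is a hypothetical name, not a project helper):
import requests


def conditional_get(url: str, etag: str = None, last_modified: str = None):
    headers = {}
    if etag:
        # server replies 304 Not Modified if the ETag still matches
        headers['If-None-Match'] = etag
    if last_modified:
        # server replies 304 Not Modified if unchanged since this date
        headers['If-Modified-Since'] = last_modified
    return requests.get(url, headers=headers, timeout=30)
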
def do_load_registery(ctx: ActorContext):
    registery_node = ctx.registery.registery_node.name
    registery_info = pretty_format_json(ctx.registery.to_spec())
    LOG.info(f'load registery info for {registery_node}:\n' + registery_info)
    ctx.tell('scheduler.boardcast_registery')

def do_save_registery(ctx: ActorContext):
    LOG.info('save registery info for {}'.format(
        ctx.registery.registery_node.name))
    ctx.tell('scheduler.boardcast_registery')

def do_update_feed(
        ctx: ActorContext,
        feed_id: T.int,
        feed: FeedSchema,
        is_refresh: T.bool.default(False),
):
    with transaction.atomic():
        feed_dict = feed
        storys = feed_dict.pop('storys')
        feed = Feed.get_by_pk(feed_id)
        is_feed_url_changed = feed.url != feed_dict['url']
        if is_feed_url_changed:
            target_feed = Feed.get_first_by_url(feed_dict['url'])
            # FIXME: feed merge can not correctly handle feed redirects.
            # For this case, keep the old feed for now and fix it properly later.
            # if target_feed:
            #     LOG.info(f'merge feed#{feed.id} url={feed.url} into '
            #              f'feed#{target_feed.id} url={target_feed.url}')
            #     target_feed.merge(feed)
            #     return
            if target_feed:
                LOG.warning(
                    f'FIXME: redirect feed#{feed.id} url={feed.url!r} into '
                    f'feed#{target_feed.id} url={target_feed.url!r}')
                feed_dict.pop('url')
        # only update dt_updated if has storys or feed fields updated
        is_feed_updated = bool(storys)
        for k, v in feed_dict.items():
            if k == 'dt_updated':
                continue
            if (v != '' and v is not None) or k in {'warnings'}:
                old_v = getattr(feed, k, None)
                if v != old_v:
                    is_feed_updated = True
                    setattr(feed, k, v)
        now = timezone.now()
        now_sub_30d = now - timezone.timedelta(days=30)
        if is_feed_updated:
            # set dt_updated to now, not trust rss date
            feed.dt_updated = now
        feed.dt_checked = feed.dt_synced = now
        feed.reverse_url = reverse_url(feed.url)
        feed.status = FeedStatus.READY
        feed.save()
    # save storys, bulk_save_by_feed has standalone transaction
    for s in storys:
        if not s['dt_updated']:
            s['dt_updated'] = now
        if not s['dt_published']:
            # set dt_published to now - 30d to avoid these storys
            # take over mushroom page, i.e. Story.query_recent_by_user
            s['dt_published'] = now_sub_30d
    modified_storys = STORY_SERVICE.bulk_save_by_feed(
        feed.id, storys, is_refresh=is_refresh)
    LOG.info('feed#%s save storys total=%s num_modified=%s',
             feed.id, len(storys), len(modified_storys))
    feed = Feed.get_by_pk(feed_id)
    is_freezed = feed.freeze_level is None or feed.freeze_level > 1
    if modified_storys and is_freezed:
        Feed.unfreeze_by_id(feed_id)
    need_fetch_story = _is_feed_need_fetch_storys(feed, modified_storys)
    for story in modified_storys:
        if not story.link:
            continue
        if need_fetch_story and (not _is_fulltext_story(story)):
            text = processor.story_html_to_text(story.content)
            num_sub_sentences = len(split_sentences(text))
            ctx.tell(
                'worker_rss.fetch_story',
                dict(
                    url=story.link,
                    use_proxy=feed.use_proxy,
                    feed_id=story.feed_id,
                    offset=story.offset,
                    num_sub_sentences=num_sub_sentences,
                ))

def do_update_feed(
        ctx: ActorContext,
        feed_id: T.int,
        feed: FeedSchema,
        is_refresh: T.bool.default(False),
):
    with transaction.atomic():
        feed_dict = feed
        storys = feed_dict.pop('storys')
        feed = Feed.get_by_pk(feed_id)
        is_feed_url_changed = feed.url != feed_dict['url']
        if is_feed_url_changed:
            target_feed = Feed.get_first_by_url(feed_dict['url'])
            if target_feed:
                LOG.info(f'merge feed#{feed.id} url={feed.url} into '
                         f'feed#{target_feed.id} url={target_feed.url}')
                target_feed.merge(feed)
                return
        # only update dt_updated if has storys or feed fields updated
        is_feed_updated = bool(storys)
        for k, v in feed_dict.items():
            if k == 'dt_updated':
                continue
            if v != '' and v is not None:
                old_v = getattr(feed, k, None)
                if v != old_v:
                    is_feed_updated = True
                    setattr(feed, k, v)
        now = timezone.now()
        now_sub_30d = now - timezone.timedelta(days=30)
        if is_feed_updated:
            # set dt_updated to now, not trust rss date
            feed.dt_updated = now
        feed.dt_checked = feed.dt_synced = now
        feed.status = FeedStatus.READY
        feed.save()
        for s in storys:
            if not s['dt_updated']:
                s['dt_updated'] = now
            if not s['dt_published']:
                # set dt_published to now - 30d to avoid these storys
                # take over mushroom page, i.e. Story.query_recent_by_user
                s['dt_published'] = now_sub_30d
        modified_storys = Story.bulk_save_by_feed(feed.id, storys, is_refresh=is_refresh)
        LOG.info(
            'feed#%s save storys total=%s num_modified=%s',
            feed.id, len(storys), len(modified_storys)
        )
        feed.refresh_from_db()
        if modified_storys:
            feed.unfreeze()
        need_fetch_story = _is_feed_need_fetch_storys(feed, modified_storys)
        for story in modified_storys:
            if not story.link:
                continue
            if need_fetch_story and (not is_fulltext_story(story)):
                text = processor.story_html_to_text(story.content)
                num_sub_sentences = len(split_sentences(text))
                ctx.tell('worker_rss.fetch_story', dict(
                    url=story.link,
                    use_proxy=feed.use_proxy,
                    story_id=str(story.id),
                    num_sub_sentences=num_sub_sentences,
                ))
            else:
                _detect_story_images(ctx, story)