def story_fetch_fulltext(
    request,
    feed_id: T.feed_unionid.object,
    offset: T.int.min(0),
) -> T.dict(
    feed_id=T.feed_unionid,
    offset=T.int.min(0),
    response_status=T.int,
    response_status_name=T.str,
    use_proxy=T.bool.optional,
    accept=T_ACCEPT.optional,
    story=StorySchema.optional,
):
    feed_unionid = feed_id
    check_unionid(request, feed_unionid)
    user_id, feed_id = feed_unionid
    content = dict(feed_id=feed_id, offset=offset)
    expire_at = int(time.time() + 60)
    use_proxy = None
    accept = None
    # ask the harbor worker to sync the story fulltext; treat a slow or
    # unreachable worker as a connection timeout instead of failing the view
    try:
        result = scheduler.ask('harbor_rss.sync_story_fulltext', content, expire_at=expire_at)
    except _TIMEOUT_ERRORS as ex:
        LOG.error(f'Ask harbor_rss.sync_story_fulltext timeout: {ex}')
        response_status = FeedResponseStatus.CONNECTION_TIMEOUT
    else:
        response_status = result['response_status']
        use_proxy = result['use_proxy']
        accept = result['accept']
    # return the story unless the worker explicitly rejected the fulltext result
    story = None
    if accept != FulltextAcceptStrategy.REJECT.value:
        story = UnionStory.get_by_feed_offset(feed_unionid, offset, detail=True)
        story = story.to_dict()
    response_status_name = FeedResponseStatus.name_of(response_status)
    return dict(
        feed_id=feed_unionid,
        offset=offset,
        response_status=response_status,
        response_status_name=response_status_name,
        use_proxy=use_proxy,
        accept=accept,
        story=story,
    )
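# A minimal sketch of the ask-with-deadline pattern used by the view above:
# ask the harbor worker synchronously, but degrade to a timeout status rather
# than letting a slow worker block the request. The scheduler object and the
# TIMEOUT_STATUS code are illustrative stand-ins, not the project's real API.
import time

TIMEOUT_STATUS = -1  # hypothetical internal code for "worker did not answer"


def ask_fulltext(scheduler, feed_id: int, offset: int) -> dict:
    content = dict(feed_id=feed_id, offset=offset)
    expire_at = int(time.time() + 60)  # let the worker drop expired requests
    try:
        return scheduler.ask('harbor_rss.sync_story_fulltext', content,
                             expire_at=expire_at)
    except TimeoutError:
        return dict(response_status=TIMEOUT_STATUS, use_proxy=None, accept=None)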
def do_sync_feed(
    ctx: ActorContext,
    feed_id: T.int,
    url: T.url,
    use_proxy: T.bool.default(False),
    checksum_data: T.bytes.maxlen(4096).optional,
    content_hash_base64: T.str.optional,
    etag: T.str.optional,
    last_modified: T.str.optional,
    is_refresh: T.bool.default(False),
):
    params = {}
    if not is_refresh:
        # only send conditional-request headers on a normal (non-forced) sync
        params = dict(etag=etag, last_modified=last_modified)
    options = _proxy_helper.get_proxy_options()
    if DNS_SERVICE.is_resolved_url(url):
        use_proxy = False
    # probability of switching from proxy back to a direct request
    switch_prob = 0.25
    with FeedReader(**options) as reader:
        use_proxy = reader.has_proxy and use_proxy
        if use_proxy and random.random() < switch_prob:
            use_proxy = False
        response = reader.read(url, **params, use_proxy=use_proxy)
        LOG.info(
            f'read feed#{feed_id} url={unquote(url)} status={response.status}')
        need_proxy = FeedResponseStatus.is_need_proxy(response.status)
        if (not use_proxy) and reader.has_proxy and need_proxy:
            # direct request failed with a status that suggests blocking; retry via proxy
            LOG.info(f'try use proxy read feed#{feed_id} url={unquote(url)}')
            proxy_response = reader.read(url, **params, use_proxy=True)
            LOG.info(
                f'proxy read feed#{feed_id} url={unquote(url)} status={proxy_response.status}'
            )
            if proxy_response.ok:
                response = proxy_response
    if (not response.ok) or (not response.content):
        status = FeedStatus.READY if response.status == 304 else FeedStatus.ERROR
        _update_feed_info(ctx, feed_id, status=status, response=response)
        return
    new_hash = compute_hash_base64(response.content)
    if (not is_refresh) and (new_hash == content_hash_base64):
        LOG.info(
            f'feed#{feed_id} url={unquote(url)} not modified by compare content hash!'
        )
        _update_feed_info(ctx, feed_id, response=response)
        return
    LOG.info(f'parse feed#{feed_id} url={unquote(url)}')
    try:
        raw_result = RawFeedParser().parse(response)
    except FeedParserError as ex:
        LOG.warning('failed parse feed#%s url=%r: %s', feed_id, unquote(url), ex)
        _update_feed_info(ctx, feed_id, status=FeedStatus.ERROR, response=response, warnings=str(ex))
        return
    if raw_result.warnings:
        warnings = '; '.join(raw_result.warnings)
        LOG.warning('warning parse feed#%s url=%r: %s', feed_id, unquote(url), warnings)
    try:
        feed = _parse_found((response, raw_result), checksum_data=checksum_data, is_refresh=is_refresh)
    except (Invalid, FeedParserError) as ex:
        LOG.error('invalid feed#%s url=%r: %s', feed_id, unquote(url), ex, exc_info=ex)
        _update_feed_info(ctx, feed_id, status=FeedStatus.ERROR, response=response, warnings=str(ex))
        return
    ctx.tell('harbor_rss.update_feed', dict(feed_id=feed_id, feed=feed, is_refresh=is_refresh))
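# A simplified, self-contained sketch of the proxy decision made in do_sync_feed
# above. It is illustrative only: the real code reads these inputs from
# FeedReader, DNS_SERVICE and FeedResponseStatus, which live elsewhere in the
# project.
import random


def choose_proxy(want_proxy: bool, has_proxy: bool, dns_resolved: bool,
                 switch_prob: float = 0.25) -> bool:
    """Decide whether the first read should go through the proxy."""
    if dns_resolved:
        # the URL already resolves via the local DNS service, so a direct
        # request is expected to work
        return False
    use_proxy = has_proxy and want_proxy
    if use_proxy and random.random() < switch_prob:
        # occasionally probe without the proxy so a feed can switch back to
        # direct access once it becomes reachable again
        use_proxy = False
    return use_proxy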
def response_status_name(self) -> str:
    if self.response_status is None:
        return None
    return FeedResponseStatus.name_of(self.response_status)
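# A minimal sketch of how a name_of helper like the one used above could look,
# based on a hypothetical IntEnum; the project's real FeedResponseStatus defines
# its own (larger) set of status codes.
import enum


class DemoResponseStatus(enum.IntEnum):
    OK = 200
    NOT_MODIFIED = 304
    CONNECTION_TIMEOUT = -1  # hypothetical negative code for an internal error

    @classmethod
    def name_of(cls, value: int) -> str:
        try:
            return cls(value).name
        except ValueError:
            # not one of the known codes, fall back to the raw number
            return f'STATUS_{value}'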