async def goto(
    self, url: str, wait: str = "load", *args: Any, **kwargs: Any
) -> NavigationResult:
    """Navigate the main frame to ``url`` and report the crawler's next action.

    The URL is remembered on the instance before navigation starts. On a
    successful navigation the timestamp is taken from the response, the
    frontier is told which page is now being crawled and the result is
    derived from the response.

    :param url: The URL of the page to navigate to
    :param wait: Wait condition forwarded to the frame's goto as ``waitUntil``
    :param args: Accepted for interface compatibility; not used by this method
    :param kwargs: Accepted for interface compatibility; not used by this method
    :return: A NavigationResult indicating the next action of the crawler
    """
    self._url = url
    logged_method = "goto"
    try:
        response = await self.frames.mainFrame.goto(
            url, waitUntil=wait, timeout=self._navigation_timeout
        )
        self.set_timestamp_from_response(response)
        # a navigation may complete without a response; log only what we have
        if response is None:
            info = Helper.json_string(url=url)
        else:
            info = Helper.json_string(
                url=url,
                responseURL=response.url,
                status=response.status,
                mime=response.mimeType,
            )
        self.logger.info(logged_method, f"we navigated to the page - {info}")
        self.frontier.crawling_new_page(self.main_frame.url)
        return self._determine_navigation_result(response)
    except NavigationError as nav_error:
        # losing the connection ends the crawl loop outright
        if nav_error.disconnected:
            self.logger.critical(
                logged_method,
                f"connection closed while navigating to {url}",
                exc_info=nav_error,
            )
            return NavigationResult.EXIT_CRAWL_LOOP
        # a timeout, or an error that still carries a response, is judged
        # by the same logic as a normal navigation
        if nav_error.timeout or nav_error.response is not None:
            return self._determine_navigation_result(nav_error.response)
        self.logger.exception(
            logged_method, f"navigation failed for {url}", exc_info=nav_error
        )
        return NavigationResult.SKIP_URL
    except Exception as error:
        self.logger.exception(
            logged_method,
            f"unknown error while navigating to {url}",
            exc_info=error,
        )
        return NavigationResult.EXIT_CRAWL_LOOP
def __init__(
    self,
    behavior_js: str,
    tab: Tab,
    next_action_expression: str,
    loop: Optional[AbstractEventLoop] = None,
    collect_outlinks: bool = False,
    post_run_actions: bool = False,
    frame: Optional[Union[Frame, Callable[[], Frame]]] = None,
) -> None:
    """Set up a new WRBehaviorRunner.

    :param behavior_js: The JavaScript source of the behavior
    :param tab: The tab in which the behavior's JavaScript is run
    :param next_action_expression: The JavaScript expression that initiates
        a behavior's action
    :param loop: The event loop used by the automation; normalised through
        Helper.ensure_loop
    :param collect_outlinks: Whether out links are collected after actions
    :param post_run_actions: Whether post-run actions are performed
        (described by the original documentation as taking a screenshot
        once the behavior is done)
    :param frame: Optional frame, or a callable returning the frame, that
        the behavior is to be run in
    """
    # configuration supplied by the caller
    self.behavior_js: str = behavior_js
    self.tab: Tab = tab
    self.next_action_expression: str = next_action_expression
    self.collect_outlinks: bool = collect_outlinks
    self.post_run_actions: bool = post_run_actions
    self.frame: Optional[Union[Frame, Callable[[], Frame]]] = frame

    # collaborators
    self.loop: AbstractEventLoop = Helper.ensure_loop(loop)
    self.logger: AutoLogger = create_autologger(
        "behaviorRunner", "WRBehaviorRunner"
    )

    # run-state bookkeeping, all starting from "nothing has happened yet"
    self._done: bool = False
    self._paused: bool = False
    self._did_init: bool = False
    self._running_task: Optional[Task] = None
    self._num_actions_performed: int = 0
def __init__(
    self,
    browser: Browser,
    tab_data: Dict[str, str],
    redis: Optional[Redis] = None,
    session: Optional[ClientSession] = None,
    *args: Any,
    **kwargs: Any,
) -> None:
    """Initialize the tab's state from the browser and its tab description.

    :param browser: The browser this tab belongs to; its ``loop`` is used
        (after Helper.ensure_loop) to initialise the parent class
    :param tab_data: The tab's description; must contain the ``"url"`` and
        ``"id"`` keys
    :param redis: Optional redis instance
    :param session: Optional HTTP client session
    :param args: Accepted for interface compatibility; not used here
    :param kwargs: Accepted for interface compatibility; not used here
    """
    super().__init__(loop=Helper.ensure_loop(browser.loop))
    self.browser: Browser = browser
    self.redis: Optional[Redis] = redis
    self.session: Optional[ClientSession] = session
    self.tab_data: Dict[str, str] = tab_data
    self.client: Optional[Client] = None
    self.logger: AutoLogger = create_autologger("tabs", self.__class__.__name__)
    self._url: str = self.tab_data["url"]
    self._id: str = self.tab_data["id"]
    # no timestamp is known until one is set later, hence Optional
    self._timestamp: Optional[str] = None
    self._behaviors_paused: bool = False
    self._connection_closed: bool = False
    self._running: bool = False
    self._reconnecting: bool = False
    self._graceful_shutdown: bool = False
    self._default_handling_of_dialogs: bool = True
    self._behavior_run_task: Optional[Task] = None
    self._reconnect_promise: Optional[Task] = None
    self._running_behavior: Optional[Behavior] = None
    self._close_reason: Optional[CloseReason] = None
    self._viewport: Optional[Dict] = None
async def _extract_href_from_remote_node(
    self, node: Dict, outlink_accum: List[str]
) -> None:
    """Resolve a DOM node to its runtime object and collect its href.

    The node is converted to a runtime object, ``self.href_fn`` is called on
    it, and the resulting value is appended to ``outlink_accum`` when it has
    a crawlable scheme, e.g. http(s).

    :param node: A node dict returned by DOM.getFlattenedDocument; its
        ``"nodeId"`` key is used to resolve the node
    :param outlink_accum: A list used to accumulate valid out links
    """
    # the supplied node dictionary represents the dom node as is
    # i.e. any attributes listed in that dictionary are not resolved
    # according to the browser's attribute resolution algorithm
    # hence the need to resolve (convert the node to a runtime DOM object)
    # and call the node's getter for the href attribute
    runtime_node = await self.client.DOM.resolveNode(nodeId=node["nodeId"])
    obj_id = runtime_node["object"]["objectId"]
    try:
        results = await self.client.Runtime.callFunctionOn(
            self.href_fn, objectId=obj_id
        )
    finally:
        # always release the remote object, even when the call above fails,
        # so a failing node does not leak a handle in the browser
        await self.client.Runtime.releaseObject(objectId=obj_id)
    # the url here is fully resolved against the origin it exists in
    # thus safe for usage in programmatic navigation
    url = results.get("result", {}).get("value")
    # a missing or empty value cannot be an out link; skip the scheme check
    if url and Helper.url_has_crawlable_scheme(url):
        outlink_accum.append(url)
def __init__(
    self, conf: AutomationConfig, loop: Optional[AbstractEventLoop] = None
) -> None:
    """Create a new driver

    :param conf: The automation configuration object
    :param loop: The event loop to be used; normalised through
        Helper.ensure_loop before being handed to any collaborator
    """
    self.conf: AutomationConfig = conf
    self.loop: AbstractEventLoop = Helper.ensure_loop(loop)
    self.did_init: bool = False
    self.shutdown_condition: ShutdownCondition = ShutdownCondition(loop=self.loop)
    # use the normalised loop (not the possibly-None argument) so the HTTP
    # session is built from the same loop as every other collaborator
    self.session: ClientSession = Helper.create_aio_http_client_session(self.loop)
    self.behavior_manager: RemoteBehaviorManager = RemoteBehaviorManager(
        conf=self.conf, session=self.session, loop=self.loop
    )
    # no redis connection exists at construction time
    self.redis: Optional[Redis] = None
    self.logger: AutoLogger = create_autologger("drivers", self.__class__.__name__)
    self._browser_exit_infos: List[BrowserExitInfo] = []
def __init__(self, loop: Optional[AbstractEventLoop] = None) -> None:
    """Set up a new ShutdownCondition and hook it to process signals.

    :param loop: The event loop used by the automation; normalised through
        Helper.ensure_loop
    """
    self.loop: AbstractEventLoop = Helper.ensure_loop(loop)
    # NOTE(review): if Event is asyncio.Event, its ``loop`` keyword was
    # removed in Python 3.10 - confirm the supported interpreter range
    self._shutdown_event: Event = Event(loop=self.loop)
    self._shutdown_from_signal: bool = False

    # SIGINT is handled for local debugging, SIGTERM alongside it; both are
    # routed to the same handler
    for shutdown_signal in (SIGINT, SIGTERM):
        self.loop.add_signal_handler(
            shutdown_signal, self._initiate_shutdown_signal
        )
async def close(self) -> None:
    """Stop the crawl loop, record the end of the crawl and close the tab.

    When the shutdown is not graceful the running behavior (if any) is
    ended and the crawl loop task is given 15 seconds to finish, with
    cancellation requested; otherwise the crawl loop task is simply awaited.
    Afterwards the end information is pushed to the ``auto_done`` redis key
    and the parent class's close is awaited.
    """
    logged_method = "close"
    self.logger.info(
        logged_method, f"closing {'gracefully' if self._graceful_shutdown else ''}"
    )
    hard_close = not self._graceful_shutdown

    if hard_close and self._running_behavior is not None:
        self.logger.info(logged_method, "ending the running behavior")
        self._running_behavior.end()

    if self._crawl_loop_running():
        if hard_close:
            self.logger.info(logged_method, "canceling the crawl loop task")
        else:
            self.logger.info(
                logged_method, "waiting for the crawl loop task to end gracefully"
            )
        try:
            if not hard_close:
                await self.crawl_loop_task
            else:
                await Helper.timed_future_completion(
                    self.crawl_loop_task,
                    timeout=15,
                    cancel=True,
                    loop=self.loop,
                )
        except Exception as error:
            # a failing crawl loop must not prevent the rest of the shutdown
            self.logger.exception(
                logged_method,
                "the crawl loop threw an unexpected exception while waiting for it to end",
                exc_info=error,
            )

    end_info = Helper.json_string(id=self.reqid, time=int(time.time()))
    self.logger.info(logged_method, f"crawl loop task ended - {end_info}")

    if self._graceful_shutdown:
        await self.frontier.remove_current_from_pending()

    await self.navigation_reset()
    self.crawl_loop_task = None

    # only an otherwise unexplained close with an exhausted frontier is
    # recorded as the crawl having ended
    frontier_exhausted = await self.frontier.exhausted()
    if self._close_reason is None and frontier_exhausted:
        self._close_reason = CloseReason.CRAWL_END

    await self.redis.lpush(self.config.redis_keys.auto_done, end_info)
    await super().close()
def __init__(
    self,
    conf: AutomationConfig,
    session: ClientSession,
    loop: Optional[AbstractEventLoop] = None,
) -> None:
    """Set up a new RemoteBehaviorManager.

    :param conf: The automation's config
    :param session: The HTTP session to use for making the behavior requests
    :param loop: The event loop for the automation; normalised through
        Helper.ensure_loop
    """
    self.conf: AutomationConfig = conf
    self.session: ClientSession = session
    self.loop: AbstractEventLoop = Helper.ensure_loop(loop)
    self.logger: AutoLogger = create_autologger(
        "remoteBehaviorManager", "RemoteBehaviorManager"
    )
async def _post_action(self) -> None:
    """Run the configured follow-up work after a behavior's action.

    Available post run actions:
      - Out link collection
    """
    logged_method = "post action"
    # the count is logged before it is incremented for the current action
    action_info = Helper.json_string(action_count=self._num_actions_performed)
    self.logger.debug(logged_method, action_info)
    self._num_actions_performed += 1

    if not self.collect_outlinks:
        return
    # Out links are only collected after every 10th initiated action so that
    # running a behavior does not degrade on pages with lots of out links
    # (10k+).
    # Note: the previous handling of out links was to collect them after
    # every action
    if self._num_actions_performed % 10 != 0:
        return

    self.logger.debug(logged_method, "collecting outlinks")
    await self.tab.collect_outlinks()
async def add(self, url: str, depth: int) -> bool:
    """Add a URL to the frontier unless one of the rejection rules applies.

    A URL is rejected when it is out of scope, when it is an inner page link
    (in which case it is recorded in the inner page links set instead), or
    when it has already been seen. The checks run in that order and the
    first one that matches wins.

    :param url: The URL to maybe add to the frontier
    :param depth: The depth the URL is to be crawled at
    :return: T/F indicating if the URL @ depth was added to the frontier
    """
    logged_method = "add"
    url_info = Helper.json_string(
        url=url, depth=depth, page=self.scope.current_page
    )

    rejection = None
    if not self.scope.in_scope(url):
        rejection = f"Not adding URL to the frontier, not in scope - {url_info}"
    elif self.scope.is_inner_page_link(url):
        await self.redis.sadd(self.keys.inner_page_links, url)
        rejection = f"Not adding URL to the frontier, inner page link - {url_info}"
    elif await self.redis.sadd(self.keys.seen, url) == 0:
        # sadd reporting 0 additions means the URL was already in the seen set
        rejection = f"Not adding URL to the frontier, seen - {url_info}"

    if rejection is not None:
        self.logger.info(logged_method, rejection)
        return False

    await self.redis.rpush(self.keys.queue, url_info)
    self.logger.info(logged_method, f"Added URL to the frontier - {url_info}")
    return True
def __init__(
    self,
    redis: Redis,
    config: AutomationConfig,
    loop: Optional[AbstractEventLoop] = None,
) -> None:
    """Initialize the new instance of RedisFrontier

    :param redis: The redis instance to be used
    :param config: The automation config; its ``redis_keys`` are used as
        this frontier's keys
    :param loop: The event loop used by the automation; normalised through
        Helper.ensure_loop
    """
    self.config: AutomationConfig = config
    # -1 until a real crawl depth is assigned
    self.crawl_depth: int = -1
    self.currently_crawling: Optional[Dict[str, Union[str, int]]] = None
    self.keys: RedisKeys = self.config.redis_keys
    self.logger: AutoLogger = create_autologger("frontier", "RedisFrontier")
    self.loop: AbstractEventLoop = Helper.ensure_loop(loop)
    self.redis: Redis = redis
    self.scope: RedisScope = RedisScope(self.redis, self.keys)
    self._did_wait: bool = False
def __init__(
    self,
    config: AutomationConfig,
    behavior_manager: BehaviorManager,
    session: Optional[ClientSession] = None,
    redis: Optional[Redis] = None,
    loop: Optional[AbstractEventLoop] = None,
) -> None:
    """Initialize the browser's state.

    :param config: The configuration of this automation
    :param behavior_manager: The behavior manager kept for use by this browser
    :param session: Optional HTTP client session to use
    :param redis: Optional instance of redis to use
    :param loop: Optional reference to the running event loop; normalised
        through Helper.ensure_loop before initialising the parent class
    """
    super().__init__(loop=Helper.ensure_loop(loop))
    # no tab descriptions are known at construction time, hence Optional
    self.tab_datas: Optional[List[Dict]] = None
    self.redis: Optional[Redis] = redis
    self.session: Optional[ClientSession] = session
    self.tabs: Dict[str, Tab] = {}
    self.tab_closed_reasons: Dict[str, TabClosedInfo] = {}
    self.running: bool = False
    self.logger: AutoLogger = create_autologger("chrome_browser", "Chrome")
    self._config: AutomationConfig = config
    self._behavior_manager: BehaviorManager = behavior_manager
async def evaluate_in_page(
    self, js_string: str, contextId: Optional[Any] = None
) -> Any:
    """Evaluate a string of JavaScript in the tab and return its value.

    :param js_string: The string of JavaScript to be evaluated
    :param contextId: Optional execution context id forwarded to
        Runtime.evaluate
    :return: The evaluation's value if any; ``{"done": True}`` when the
        evaluate call itself raised; ``{}`` when the evaluated JavaScript
        reported an exception
    """
    logged_method = "evaluate_in_page"
    self.logger.debug(logged_method, "evaluating js in page")

    evaluate_options = dict(
        contextId=contextId,
        userGesture=True,
        awaitPromise=True,
        includeCommandLineAPI=True,
        returnByValue=True,
    )
    try:
        results = await self.client.Runtime.evaluate(js_string, **evaluate_options)
    except Exception as error:
        # a cancellation is expected and not worth logging; either way the
        # caller is told that evaluation is over
        if not isinstance(error, CancelledError):
            self.logger.exception(
                logged_method,
                "evaluating js in page failed due to an python error",
                exc_info=error,
            )
        return {"done": True}

    js_exception = results.get("exceptionDetails")
    if not js_exception:
        return results.get("result", {}).get("value")

    jse_dets = Helper.getExceptionMessage(js_exception)
    self.logger.critical(
        logged_method,
        f"evaluating js in page failed due to an JS error - {jse_dets}",
    )
    return {}