def hostname_must_resolvable(cls, v):
    """Check that the hostname contained in the URL ``v`` can be resolved.

    The hostname is extracted with ``get_hostname_from_url`` and looked up
    with ``get_ip_address_by_hostname``.

    Returns:
        ``v`` unchanged when the lookup yields an IP address.

    Raises:
        ValueError: when the lookup returns ``None``.
    """
    host = cast(str, get_hostname_from_url(v))
    if get_ip_address_by_hostname(host) is None:
        raise ValueError(f"Cannot resolve hostname: {host}.")
    return v
async def take_snapshot(
    url: str,
    accept_language: Optional[str] = None,
    ignore_https_errors: bool = False,
    referer: Optional[str] = None,
    timeout: Optional[int] = None,
    user_agent: Optional[str] = None,
) -> SnapshotResult:
    """Take a snapshot of a website by playwright

    Arguments:
        url {str} -- A URL of a website

    Keyword Arguments:
        accept_language {Optional[str]} -- Accept-language header to use (default: {None})
        ignore_https_errors {bool} -- Whether to ignore HTTPS errors (default: {False})
        referer {Optional[str]} -- Referer header to use (default: {None})
        timeout {Optional[int]} -- Maximum time to wait for in milliseconds;
            a falsy value (None or 0) selects the 30 second default (default: {None})
        user_agent {Optional[str]} -- User-agent header to use (default: {None})

    Returns:
        SnapshotResult

    Raises:
        Any error raised while driving the browser propagates unchanged;
        the browser is closed before it does.
    """
    submitted_url: str = url

    async with async_playwright() as p:
        browser: playwright.browser.Browser = await launch_browser(p)
        try:
            page: Page = await browser.newPage(
                ignoreHTTPSErrors=ignore_https_errors, userAgent=user_agent
            )

            extra_headers = {}
            if accept_language is not None:
                extra_headers["Accept-Language"] = accept_language
            await page.setExtraHTTPHeaders(extra_headers)

            # intercept responses on page to get scripts
            scripts: List[Script] = []
            # Keep a reference to every handler task: the event loop only
            # holds weak references, so an unreferenced task may be
            # garbage-collected before it finishes.
            response_tasks: List[asyncio.Task] = []

            async def handle_response(response: Response) -> None:
                content_type: str = response.headers.get("content-type", "")
                if response.ok and is_js_content_type(content_type):
                    content = await response.text()
                    scripts.append(
                        Script(
                            url=response.url,
                            content=content,
                            sha256=calculate_sha256(content),
                        )
                    )

            page.on(
                "response",
                lambda response: response_tasks.append(
                    asyncio.create_task(handle_response(response))
                ),
            )

            # default timeout = 30 seconds (the value is in milliseconds)
            timeout = timeout or 30 * 1000
            res: Response = await page.goto(
                url,
                referer=referer,
                timeout=timeout,
                waitUntil=settings.BROWSER_WAIT_UNTIL,
            )

            # Record the effective settings the snapshot was taken with.
            request = {
                "accept_language": accept_language,
                "browser": browser.version,
                "ignore_https_errors": ignore_https_errors,
                "referer": referer,
                "timeout": timeout,
                "user_agent": await page.evaluate("() => navigator.userAgent"),
            }

            # page.url is the final URL after navigation, which may differ
            # from the submitted one.
            url = page.url
            status = res.status
            screenshot_data = await page.screenshot()
            body = await page.content()
            sha256 = calculate_sha256(body)
            headers = res.headers
        finally:
            # Close the browser on both the success and the failure path.
            await browser.close()

    server = headers.get("server")
    content_type = headers.get("content-type")
    content_length = headers.get("content-length")

    hostname = cast(str, get_hostname_from_url(url))
    certificate = Certificate.load_and_dump_from_url(url)
    ip_address = cast(str, get_ip_address_by_hostname(hostname))
    asn = get_asn_by_ip_address(ip_address) or ""
    whois = Whois.whois(hostname)

    snapshot = Snapshot(
        url=url,
        submitted_url=submitted_url,
        status=status,
        body=body,
        sha256=sha256,
        headers=headers,
        hostname=hostname,
        ip_address=ip_address,
        asn=asn,
        server=server,
        content_length=content_length,
        content_type=content_type,
        whois=whois,
        certificate=certificate,
        request=request,
    )

    screenshot = Screenshot()
    screenshot.data = base64.b64encode(screenshot_data).decode()

    return SnapshotResult(
        screenshot=screenshot,
        snapshot=snapshot,
        scripts=scripts,
    )
async def take_snapshot(
    url: str,
    accept_language: Optional[str] = None,
    host: Optional[str] = None,
    ignore_https_errors: bool = False,
    referer: Optional[str] = None,
    timeout: Optional[int] = None,
    user_agent: Optional[str] = None,
) -> SnapshotResult:
    """Take a snapshot of a website by httpx

    Arguments:
        url {str} -- A URL of a website

    Keyword Arguments:
        accept_language {Optional[str]} -- Accept-language header to use (default: {None})
        host {Optional[str]} -- Host header to use (default: {None})
        ignore_https_errors {bool} -- Whether to ignore HTTPS errors (default: {False})
        referer {Optional[str]} -- Referer header to use (default: {None})
        timeout {Optional[int]} -- Maximum time to wait for in milliseconds;
            converted to whole seconds for httpx (default: {None}, i.e. 30 seconds)
        user_agent {Optional[str]} -- User-agent header to use (default: {None})

    Returns:
        SnapshotResult -- the screenshot data is always an empty string
        because no browser is involved

    Raises:
        httpx.HTTPError (and any other error from the request) propagates
        unchanged.
    """
    submitted_url: str = url
    verify = not ignore_https_errors

    # default timeout = 30 seconds; the caller passes milliseconds
    timeout = int(timeout / 1000) if timeout is not None else 30

    request_headers = {
        "user-agent": user_agent or DEFAULT_UA,
        "accept-language": accept_language or DEFAULT_AL,
        "referer": referer or DEFAULT_REFERER,
    }
    if host is not None:
        request_headers["host"] = host

    # Use the client as a context manager so its connection pool is
    # released even when the request fails.
    async with httpx.AsyncClient(verify=verify) as client:
        res = await client.get(
            url,
            headers=request_headers,
            timeout=timeout,
            allow_redirects=True,
        )

        # Record the settings as given by the caller (not the defaults
        # substituted into the outgoing headers); timeout is in seconds here.
        request = {
            "accept_language": accept_language,
            "browser": "httpx",
            "host": host,
            "ignore_https_errors": ignore_https_errors,
            "referer": referer,
            "timeout": timeout,
            "user_agent": user_agent,
        }

        # res.url is the final URL after any redirects.
        url = str(res.url)
        status = res.status_code
        body = res.text
        sha256 = calculate_sha256(body)
        headers = {k.lower(): v for (k, v) in res.headers.items()}

    server = headers.get("server")
    content_type = headers.get("content-type")
    content_length = headers.get("content-length")

    hostname = cast(str, get_hostname_from_url(url))
    certificate = Certificate.load_and_dump_from_url(url)
    ip_address = cast(str, get_ip_address_by_hostname(hostname))
    asn = get_asn_by_ip_address(ip_address) or ""
    whois = Whois.whois(hostname)

    snapshot = Snapshot(
        url=url,
        submitted_url=submitted_url,
        status=status,
        body=body,
        sha256=sha256,
        headers=headers,
        hostname=hostname,
        ip_address=ip_address,
        asn=asn,
        server=server,
        content_length=content_length,
        content_type=content_type,
        whois=whois,
        certificate=certificate,
        request=request,
    )

    screenshot = Screenshot()
    screenshot.data = ""

    # get scripts
    scripts = cast(
        List[Script], await ScriptTask.process(snapshot, insert_to_db=False)
    )

    return SnapshotResult(screenshot=screenshot, snapshot=snapshot, scripts=scripts)
async def take_snapshot(
    url: str,
    user_agent: Optional[str] = None,
    timeout: Optional[int] = None,
    ignore_https_errors: bool = False,
) -> Snapshot:
    """Take a snapshot of a website by pyppeteer

    Arguments:
        url {str} -- A URL of a website

    Keyword Arguments:
        user_agent {Optional[str]} -- User agent to use (default: {None})
        timeout {Optional[int]} -- Maximum time to wait for in milliseconds (default: {None}, i.e. 30 seconds)
        ignore_https_errors {bool} -- Whether to ignore HTTPS errors (default: {False})

    Returns:
        Snapshot -- Snapshot ORM instance

    Raises:
        PyppeteerError (and any other error raised while driving the
        browser) propagates unchanged; the browser, if it was launched,
        is closed first.
    """
    submitted_url: str = url

    # Bind the name up front so the cleanup below is safe even when
    # launch() itself fails.
    browser = None
    try:
        browser = await launch(
            headless=True,
            ignoreHTTPSErrors=ignore_https_errors,
            args=["--no-sandbox"],
        )
        page = await browser.newPage()

        if user_agent is not None:
            await page.setUserAgent(user_agent)

        # default timeout = 30 seconds (the value is in milliseconds)
        timeout = timeout if timeout is not None else 30 * 1000
        res = await page.goto(url, timeout=timeout)

        # Record the effective settings the snapshot was taken with.
        request = {
            "browser": await browser.version(),
            "ignore_https_errors": ignore_https_errors,
            "timeout": timeout,
            "user_agent": user_agent or await browser.userAgent(),
        }

        # page.url is the final URL after navigation, which may differ
        # from the submitted one.
        url = page.url
        status = res.status
        screenshot = await page.screenshot(encoding="base64")
        body = await page.content()
        sha256 = calculate_sha256(body)
        headers = res.headers
    finally:
        # Close exactly once, on both the success and the failure path.
        if browser is not None:
            await browser.close()

    server = headers.get("server")
    content_type = headers.get("content-type")
    content_length = headers.get("content-length")

    hostname = cast(str, get_hostname_from_url(url))
    certificate = Certificate.load_and_dump_from_url(url)
    ip_address = cast(str, get_ip_address_by_hostname(hostname))
    asn = get_asn_by_ip_address(ip_address)
    whois = Whois.whois(hostname)

    snapshot = Snapshot(
        url=url,
        submitted_url=submitted_url,
        status=status,
        body=body,
        sha256=sha256,
        headers=headers,
        hostname=hostname,
        ip_address=ip_address,
        asn=asn,
        server=server,
        content_length=content_length,
        content_type=content_type,
        whois=whois,
        certificate=certificate,
        request=request,
        screenshot=screenshot,
    )

    return snapshot