async def read_page(browserurl, targeturl):
    """
    read a page and get the title

    originally
    https://github.com/HyperionGray/trio-chrome-devtools-protocol/blob/master/examples/get_title.py

    Parameters
    ----------
    browserurl: str
        address handed to ``open_cdp`` to reach the browser
    targeturl: str
        url of the page to navigate to

    Returns
    -------
    The result of ``dom.get_outer_html`` for the node matched by the
    ``title`` selector.
    """
    logger.info("Connecting to browser: %s", browserurl)
    async with open_cdp(browserurl) as conn:
        logger.info("Listing targets")
        targets = await conn.execute(target.get_targets())
        # The first listed target is used; no filtering by target type is done.
        target_id = targets[0].target_id

        logger.info("Attaching to target id=%s", target_id)
        # open_session is entered as an async context manager, the same way
        # save_pdf uses it on a connection obtained from open_cdp.
        async with conn.open_session(target_id) as session:
            logger.info("Navigating to %s", targeturl)
            await session.execute(page.enable())
            # Register interest in the load event before navigating so the
            # block only exits once the event has been seen.
            async with session.wait_for(page.LoadEventFired):
                await session.execute(page.navigate(targeturl))

            _, item = await session.execute(page.print_to_pdf())
            print(f"pdf is: {type(item)}")

            logger.info("Extracting page title")
            root_node = await session.execute(dom.get_document())
            title_node_id = await session.execute(
                dom.query_selector(root_node.node_id, "title")
            )
            html = await session.execute(dom.get_outer_html(title_node_id))
            # Hand the title markup back instead of discarding it.
            return html
async def main():
    """
    Navigate a browser page to a URL and print its title element.

    The browser address is read from ``sys.argv[1]`` and the URL to visit
    from ``sys.argv[2]``. The first target that is of type ``page``, is not a
    ``devtools://`` URL and is not already attached is used.

    Raises:
        RuntimeError: if none of the listed targets satisfies those
            conditions.
    """
    logger.info('Connecting to browser: %s', sys.argv[1])
    async with open_cdp_connection(sys.argv[1]) as conn:
        logger.info('Listing targets')
        targets = await conn.execute(target.get_targets())

        for t in targets:
            if (t.type == 'page'
                    and not t.url.startswith('devtools://')
                    and not t.attached):
                target_id = t.target_id
                break
        else:
            # Without this branch target_id would be unbound below and the
            # failure would surface as an unrelated-looking UnboundLocalError.
            raise RuntimeError(
                'No unattached, non-devtools page target is available')

        logger.info('Attaching to target id=%s', target_id)
        session = await conn.open_session(target_id)

        logger.info('Navigating to %s', sys.argv[2])
        await session.execute(page.enable())
        # Start waiting for the load event before issuing the navigation.
        async with session.wait_for(page.LoadEventFired):
            await session.execute(page.navigate(sys.argv[2]))

        logger.info('Extracting page title')
        root_node = await session.execute(dom.get_document())
        title_node_id = await session.execute(
            dom.query_selector(root_node.node_id, 'title'))
        html = await session.execute(dom.get_outer_html(title_node_id))
        print(html)
async def save_pdf(browserurl, targeturl, pdfpath, sleeptime):
    """
    make a pdf from a webpage

    originally
    https://github.com/HyperionGray/trio-chrome-devtools-protocol/blob/master/examples/screenshot.py

    Parameters
    ----------
    browserurl: str
        ws address for chrome developer protocol commands
    targeturl: str
        url of page to print to pdf
    pdfpath: str
        filename the pdf is written to (opened in binary write mode)
    sleeptime: float
        seconds to pause after the load event before the page is read
        and printed
    """
    logger.info("Connecting to browser: %s", browserurl)
    async with open_cdp(browserurl) as conn:
        logger.info("Listing targets")
        targets = await conn.execute(target.get_targets())
        target_id = targets[0].target_id

        logger.info("Attaching to target id=%s", target_id)
        async with conn.open_session(target_id) as session:
            logger.info("Setting device emulation")
            await session.execute(
                emulation.set_device_metrics_override(
                    width=1200, height=2000, device_scale_factor=1, mobile=False
                )
            )

            logger.info("Enabling page events")
            await session.execute(page.enable())

            logger.info("Navigating to %s", targeturl)
            async with session.wait_for(page.LoadEventFired):
                await session.execute(page.navigate(url=targeturl))

            # Pause via trio so other tasks keep running; time.sleep would
            # block the whole event loop for the duration.
            await trio.sleep(sleeptime)

            # The body markup is only fetched for debug logging.
            root_node = await session.execute(dom.get_document())
            body_node_id = await session.execute(
                dom.query_selector(root_node.node_id, "body")
            )
            body_html = await session.execute(dom.get_outer_html(body_node_id))
            logger.debug(body_html)

            logger.info("Saving a pdf")
            # TODO: make sure that javascript finishes rendering
            pdf_data, _ = await session.execute(page.print_to_pdf())

            # The pdf payload is base64 decoded before being written out.
            async with await trio.open_file(pdfpath, "wb") as pdf_file:
                await pdf_file.write(b64decode(pdf_data))
            logger.info("wrote %s", pdfpath)
async def convert_page(session, path, reduction_code):
    """
    Load a local file in the browser, run a script on it, and return ``#main``.

    Args:
        session: session used to execute the browser commands.
        path: filesystem path of the page; it is made absolute and prefixed
            with ``file://`` to form the URL that is navigated to.
        reduction_code: source passed to ``runtime.evaluate`` once the page
            has loaded.

    Returns:
        The result of ``dom.get_outer_html`` for the node matched by the
        ``#main`` selector.
    """
    urlpath = 'file://' + os.path.abspath(path)
    logger.info('Navigating to %s', urlpath)
    async with session.wait_for(page.LoadEventFired):
        await session.execute(page.navigate(url=urlpath))

    _, exc = await session.execute(runtime.evaluate(reduction_code))
    if exc is not None:
        # Surface a failed evaluation instead of dropping it silently; the
        # page is still processed so existing callers keep their result.
        logger.warning('Evaluating reduction code on %s reported: %s',
                       urlpath, exc)

    root_id = (await session.execute(dom.get_document())).node_id
    main_id = await session.execute(dom.query_selector(root_id, '#main'))
    return await session.execute(dom.get_outer_html(main_id))
async def merge_pages_in(ethox_doc, ws_addr, reduction_code):
    """
    Merge the ``#main`` section of every HTML page into the index and print it.

    Every ``*.html`` file found recursively under ``ethox_doc`` is run through
    ``convert_page``. ``index.html`` is then loaded, and for each collected
    fragment a copy of its ``#main`` node is placed in ``body`` before
    ``footer`` and its outer HTML is replaced by the fragment. The result is
    handed to ``print_page`` with the file name ``cargo_doc.pdf``.

    Args:
        ethox_doc: directory that is searched for HTML pages and that
            contains ``index.html``.
        ws_addr: address passed to ``open_cdp_connection``.
        reduction_code: script forwarded to ``convert_page`` for each page.
    """
    async with open_cdp_connection(ws_addr) as conn:
        logger.info('Listing targets')
        targets = await conn.execute(target.get_targets())
        target_id = targets[0].target_id

        logger.info('Attaching to target id=%s', target_id)
        session = await conn.open_session(target_id)

        logger.info('Setting device emulation')
        await session.execute(
            emulation.set_device_metrics_override(
                width=800, height=600, device_scale_factor=1, mobile=False))

        logger.info('Enabling page events')
        await session.execute(page.enable())

        logger.info('Starting to crawl documentation')
        # convert_page returns a single outer-HTML value, so the fragments
        # are collected as-is, in the order the pages are discovered.
        fragments = []
        pattern = os.path.join(ethox_doc, '**', '*.html')
        for doc_page in glob.iglob(pattern, recursive=True):
            fragments.append(
                await convert_page(session, doc_page, reduction_code))

        # Reload the index so the node ids below refer to the page that
        # receives the merged content.
        await convert_page(
            session, os.path.join(ethox_doc, 'index.html'), reduction_code)

        root_id = (await session.execute(dom.get_document())).node_id
        body_id = await session.execute(dom.query_selector(root_id, 'body'))
        footer_id = await session.execute(
            dom.query_selector(root_id, 'footer'))
        main_id = await session.execute(dom.query_selector(root_id, '#main'))

        for fragment in fragments:
            cloned_id = await session.execute(
                dom.copy_to(main_id, body_id, footer_id))
            await session.execute(dom.set_outer_html(cloned_id, fragment))

        await print_page(session, 'cargo_doc.pdf')
async def main():
    """
    Open a URL in the browser's first target and print its title element.

    The browser address comes from ``sys.argv[1]`` and the URL to open from
    ``sys.argv[2]``.
    """
    browser_address = sys.argv[1]
    logger.info('Connecting to browser: %s', browser_address)

    async with open_cdp_connection(browser_address) as conn:
        logger.info('Listing targets')
        available = await conn.execute(target.get_targets())
        first_target_id = available[0].target_id

        logger.info('Attaching to target id=%s', first_target_id)
        session = await conn.open_session(first_target_id)

        destination = sys.argv[2]
        logger.info('Navigating to %s', destination)
        await session.execute(page.enable())
        await session.execute(page.navigate(destination))
        # NOTE(review): the load event is only awaited after navigate has
        # returned; whether an event fired in between is still observed
        # depends on wait_for's semantics -- confirm against the library.
        await session.wait_for(page.LoadEventFired)

        logger.info('Extracting page title')
        document = await session.execute(dom.get_document())
        find_title = dom.query_selector(document.node_id, 'title')
        title_node_id = await session.execute(find_title)
        title_html = await session.execute(dom.get_outer_html(title_node_id))
        print(title_html)