Exemplo n.º 1
0
Arquivo: edmunds.py Projeto: qdbp/cars
def scrape_edmunds(
    start_page: int = 29,
    stop_page: int = 10000,
    max_click_attempts: int = 60,
) -> None:
    """Walk Edmunds used/CPO inventory result pages in headless Firefox.

    For each page number the search-results URL is loaded and the
    pagination link is clicked; the URL reached after the click is
    printed. The module-level ``interceptor`` / ``resp_interceptor``
    callbacks are attached to the driver before any page is loaded
    (presumably they capture the traffic of interest -- they are defined
    outside this function).

    Side effects: rebinds the module-level ``SESSION`` to a fresh
    ``Session()`` for the duration of the run and closes it afterwards.

    Args:
        start_page: First ``pagenumber`` to request (inclusive).
        stop_page: Page number at which to stop (exclusive).
        max_click_attempts: How many times, one second apart, to try to
            find and click the pagination link on a page before assuming
            there are no more pages and ending the run.
    """
    global SESSION
    SESSION = Session()

    try:
        opts = FirefoxOptions()
        opts.add_argument("--headless")

        drv = Firefox(options=opts)
        try:
            drv.request_interceptor = interceptor
            drv.response_interceptor = resp_interceptor

            # NOTE(review): the variable is named "next" but the label
            # matched is 'Pagination left' -- confirm this is the
            # intended link.
            next_xpath = ".//a[@aria-label='Pagination left']"
            for px in range(start_page, stop_page):
                drv.get(
                    f"https://www.edmunds.com/inventory/srp.html"
                    f"?inventorytype=used,cpo&pagenumber={px}"
                    f"&sort=mileage:asc&radius=500"
                )
                for _attempt in range(max_click_attempts):
                    try:
                        btn = drv.find_element(By.XPATH, next_xpath)
                        btn.click()
                        print(drv.current_url)
                        break
                    except Exception:
                        # Best effort: the link may not be rendered or
                        # clickable yet, so wait and try again.
                        time.sleep(1)
                else:
                    # The link never became clickable. Previously this
                    # looped forever; treat it as "past the last page"
                    # and stop paging.
                    break
        finally:
            # quit() (not close()) ends the whole browser session and
            # stops the driver service, even if scraping raised.
            drv.quit()
    finally:
        SESSION.close()
Exemplo n.º 2
0
    wd.header_overrides = {
        'Referer': 'https://twitter.com/freedom',
    }
    links = get_links(wd, p_type, args.tag)

    # TODO Filter out links already parsed and sent

    for link in links:
        get_html(wd, link)
        wd.delete_all_cookies()
        wd.execute_script('window.localStorage.clear()')
        wd.execute_script('window.sessionStorage.clear()')

    # TODO Create TOC

    wd.close()

    html_files = []
    for i, link in enumerate(links):

        with open('tmp/%i.html' % i, 'w') as tmp_html:
            tmp_html.write(link['html'])

        html_files.append('tmp/%i.html' % i)

    cmd_args = ['html2pdf/main.js']

    for f in html_files:
        cmd_args.append('file://%s' % os.path.abspath(f))

    category = ""