def test_process_link(testdb, caplog):
    """End-to-end check of scraper.process_link on a known live page.

    Loads the umsu.de papers page as a Source, locates the anchor whose
    href is 'options.pdf', wraps it in a Link, and runs process_link on
    it.  The paper's title and an opening phrase of its body text should
    then appear in the captured log output.

    Requires the `testdb` fixture (populated sources/links tables) and
    network access to http://umsu.de/papers/.
    """
    source = Source(url='http://umsu.de/papers/')
    source.load_from_db()
    browser = scraper.Browser(use_virtual_display=VDISPLAY)
    browser.goto(source.url)
    source.set_html(browser.page_source)
    link = 'options.pdf'
    el = browser.find_element_by_xpath("//a[@href='{}']".format(link))
    url = source.make_absolute(link)
    li = Link(url=url, source=source, element=el)
    li.load_from_db()
    # Raise verbosity only around the call under test so the assertions
    # below can find the extracted text in the log.
    debuglevel(2)
    scraper.process_link(li, force_reprocess=True, keep_tempfiles=True)
    debuglevel(5)
    # BUG FIX: pytest's caplog.text is a property, not a method;
    # calling it as caplog.text() raised
    # "TypeError: 'str' object is not callable".
    assert 'Options and Actions' in caplog.text
    assert 'But even if we know' in caplog.text
ap.add_argument('-d', '--debug_level', default=1, type=int)
ap.add_argument('-k', '--keep', action='store_true', help='keep temporary files')
ap.add_argument('-l', '--link', type=str, help='only process this link')
args = ap.parse_args()
debug.debuglevel(args.debug_level)

# Look up the first source whose URL contains the given fragment.
cur = db.dict_cursor()
cur.execute("SELECT * FROM sources WHERE url LIKE %s LIMIT 1",
            ('%' + args.url + '%',))
sources = cur.fetchall()
if not sources:
    raise Exception(args.url + ' not in sources table')
source = scraper.Source(**sources[0])

if not args.link:
    # No specific link requested: process the whole source page.
    scraper.scrape(source, keep_tempfiles=args.keep)
else:
    # A single link was requested: fetch the page, find the matching
    # anchor, and process just that one Link.
    browser = scraper.Browser(use_virtual_display=True)
    browser.goto(source.url)
    source.set_html(browser.page_source)
    xpath = "//a[contains(@href, '{}')]".format(args.link)
    try:
        el = browser.find_element_by_xpath(xpath)
    except Exception:
        # Best-effort lookup: any failure means the link isn't there.
        sys.exit('no link containing ' + args.link + ' on ' + source.url)
    li = scraper.Link(url=source.make_absolute(el.get_attribute('href')),
                      source=source,
                      element=el)
    li.load_from_db()
    scraper.process_link(li, force_reprocess=True, keep_tempfiles=args.keep)