def run(self):
    killer = GracefulKiller()
    restart_browser_interval = 900
    browser_starttime = time.time()
    while True:
        # process new blog posts:
        blogpostprocessor.run()
        # look for new papers:
        source = scraper.next_source()
        if source:
            scraper.scrape(source)
        # wait (longer if there was nothing to scrape):
        pause_secs = 10 if source else 60
        for sec in range(pause_secs):
            if killer.kill_now:
                self.stop()
                return
            time.sleep(1)
        # periodically restart the browser to free resources:
        if time.time() - browser_starttime > restart_browser_interval:
            browser.stop_browser()
            browser_starttime = time.time()
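# The loop above polls killer.kill_now to shut down cleanly. A minimal
# sketch of such a GracefulKiller helper, assuming the usual signal-based
# pattern (the repo's actual implementation may differ):

import signal

class GracefulKiller:
    """Trap SIGINT/SIGTERM and set a flag the main loop can poll."""

    def __init__(self):
        self.kill_now = False
        signal.signal(signal.SIGINT, self.exit_gracefully)
        signal.signal(signal.SIGTERM, self.exit_gracefully)

    def exit_gracefully(self, signum, frame):
        # merely record the request; the run() loop decides when to stop
        self.kill_now = True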
ap.add_argument('-d', '--debug_level', default=1, type=int)
ap.add_argument('-k', '--keep', action='store_true', help='keep temporary files')
ap.add_argument('-l', '--link', type=str, help='only process this link')
args = ap.parse_args()

debug.debuglevel(args.debug_level)

# look up the source page whose url matches the given fragment:
cur = db.dict_cursor()
query = "SELECT * FROM sources WHERE url LIKE %s LIMIT 1"
cur.execute(query, ('%'+args.url+'%',))
sources = cur.fetchall()
if not sources:
    raise Exception(args.url+' not in sources table')
source = scraper.Source(**sources[0])

if args.link:
    # process only the link on the source page that matches args.link:
    browser = scraper.Browser(use_virtual_display=True)
    browser.goto(source.url)
    source.set_html(browser.page_source)
    try:
        el = browser.find_element_by_xpath(
            "//a[contains(@href, '{}')]".format(args.link))
    except Exception:
        sys.exit('no link containing '+args.link+' on '+source.url)
    url = source.make_absolute(el.get_attribute('href'))
    li = scraper.Link(url=url, source=source, element=el)
    li.load_from_db()
    scraper.process_link(li, force_reprocess=True, keep_tempfiles=args.keep)
else:
    scraper.scrape(source, keep_tempfiles=args.keep)
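# db.dict_cursor() above presumably returns a cursor whose rows are dicts,
# which is what makes scraper.Source(**sources[0]) work. A minimal sketch,
# assuming pymysql; the driver, connection parameters, and database name
# 'opp' are guesses, not taken from this repo:

import pymysql

def dict_cursor():
    connection = pymysql.connect(host='localhost', db='opp',
                                 cursorclass=pymysql.cursors.DictCursor)
    return connection.cursor()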
def test_scrape(testdb):
    src = Source(url='http://umsu.de/papers/')
    src.load_from_db()
    scraper.scrape(src)
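# test_scrape takes a testdb argument, presumably a pytest fixture that
# provides a database seeded with the source row the test loads. A minimal
# sketch of such a fixture, assuming autocommit and a sources table keyed
# by url (column layout and cleanup strategy are assumptions; the repo's
# real fixture may differ):

import pytest
import db  # the repo's database helper module, as used in the script above

@pytest.fixture
def testdb():
    # seed the row that load_from_db() will fetch
    cur = db.dict_cursor()
    cur.execute("INSERT INTO sources (url) VALUES (%s)",
                ('http://umsu.de/papers/',))
    yield
    # remove the seeded row after the test
    cur.execute("DELETE FROM sources WHERE url = %s",
                ('http://umsu.de/papers/',))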