示例#1
0
 def run(self):
     """Run the main scraping loop until a shutdown is requested.

     Each iteration processes new blog posts, scrapes the next source
     returned by ``scraper.next_source()`` (if any) and then pauses:
     10 seconds after a source was scraped, 60 seconds when there was
     nothing to scrape. The pause is slept one second at a time so that
     ``killer.kill_now`` is checked about once per second; when it is
     set, ``self.stop()`` is called and the method returns.

     Once more than ``restart_browser_interval`` seconds have passed
     since the last browser stop, ``browser.stop_browser()`` is called.
     """
     killer = GracefulKiller()
     restart_browser_interval = 900  # seconds
     # Use the monotonic clock for interval measurement: unlike
     # time.time(), it is unaffected by system clock adjustments.
     browser_starttime = time.monotonic()
     while True:
         # process new blog posts:
         blogpostprocessor.run()
         # look for new papers:
         source = scraper.next_source()
         if source:
             scraper.scrape(source)
         # wait (shorter if there was work, so a backlog drains faster):
         pause_secs = 10 if source else 60
         for _ in range(pause_secs):
             if killer.kill_now:
                 self.stop()
                 return
             time.sleep(1)
         # restart browser?
         if time.monotonic() - browser_starttime > restart_browser_interval:
             # NOTE(review): only stop_browser() is called here; presumably
             # the browser is started again on demand -- confirm.
             browser.stop_browser()
             browser_starttime = time.monotonic()
示例#2
0
文件: scrape.py 项目: wo/opp-tools
# NOTE: `ap` (the argument parser) and the `url` argument read below as
# `args.url` are defined above this excerpt.
ap.add_argument('-d', '--debug_level', default=1, type=int)
ap.add_argument('-k', '--keep', action='store_true', help='keep temporary files')
ap.add_argument('-l', '--link', type=str, help='only process this link')
args = ap.parse_args()

debug.debuglevel(args.debug_level)

# Find the first row in the sources table whose url contains args.url.
cur = db.dict_cursor()
query = "SELECT * FROM sources WHERE url LIKE %s LIMIT 1"
cur.execute(query, ('%'+args.url+'%',))
sources = cur.fetchall()
if not sources:
    raise Exception(args.url+' not in sources table')
source = scraper.Source(**sources[0])

if args.link:
    # Process a single link: load the source page, locate the first anchor
    # whose href contains args.link, and force it to be reprocessed.
    browser = scraper.Browser(use_virtual_display=True)
    browser.goto(source.url)
    source.set_html(browser.page_source)
    try:
        el = browser.find_element_by_xpath("//a[contains(@href, '{}')]".format(args.link))
    except Exception:
        # The broad catch is kept: the specific exception class raised by
        # the browser for a missing element is not in scope in this excerpt.
        sys.exit('no link containing '+args.link+' on '+source.url)
    url = source.make_absolute(el.get_attribute('href'))
    li = scraper.Link(url=url, source=source, element=el)
    li.load_from_db()
    scraper.process_link(li, force_reprocess=True, keep_tempfiles=args.keep)
else:
    # No link given: scrape the whole source page.
    scraper.scrape(source, keep_tempfiles=args.keep)

示例#3
0
def test_scrape(testdb):
    """Smoke-test ``scraper.scrape`` on the umsu.de papers source.

    The test makes no assertions; it passes as long as loading the source
    from the database and scraping it raise no exception.

    ``testdb`` is not used in the body -- presumably a fixture that sets up
    the test database; confirm against the test configuration.
    """
    papers_source = Source(url='http://umsu.de/papers/')
    papers_source.load_from_db()
    scraper.scrape(papers_source)