class Crawler(object):
    """Breadth-first crawler driven by a Selenium driver and a Redis store.

    As used in this class, Redis DB 1 holds URLs that have been parsed
    (plus the ``'crawl'`` list of accepted links) and Redis DB 2 holds
    links that were rejected by one of the filters.
    """

    def __init__(self, driver, lock, first_url, db_int=1):
        """Configure the driver and seed the work queue.

        :param driver: Selenium-style web driver used to load pages.
        :param lock: object exposing ``acquire()`` / ``release()``; it is
            held while links are sorted and while the last URL is recorded.
        :param first_url: first URL placed in the queue.
        :param db_int: accepted for interface compatibility; not used here.
        """
        self.driver = driver
        self.driver.implicitly_wait(10)
        self.driver.set_page_load_timeout(30)
        self.r = RedisRecord()
        self.lock = lock
        self.queue = deque([first_url])
        self.dbs = [1, 2]

    def init(self):
        """Initialise the Redis databases and parse the first queued URL."""
        self.r.init(self.dbs)
        url = self.queue.popleft()
        # NOTE(review): parser() loads the page again when the URL is not
        # yet recorded in DB 1, so this load looks redundant -- kept to
        # preserve the existing driver behaviour.
        self.driver.get(url)
        self.parser(url)

    def parser(self, url):
        """Load ``url``, queue its links and record it in DB 1.

        Nothing is done when ``url`` is already present in DB 1.
        """
        self.r.switchDB(1)
        if self.r.get(url):
            return

        self.driver.get(url)
        anchors = self.driver.find_elements_by_tag_name('a')
        hrefs = [anchor.get_attribute("href") for anchor in anchors]

        # Release the lock even if sort() raises, otherwise every other
        # holder of this lock would block forever.
        self.lock.acquire()
        try:
            self.sort(hrefs, url)
        finally:
            self.lock.release()

        self.r.switchDB(1)
        self.r.put(url, url)

    def navigation(self):
        """Drain the queue, then quit the driver and record the last URL."""
        url = None  # stays None when the queue is already empty
        while self.queue:
            url = self.queue.popleft()
            try:
                self.parser(url)
            except URLError:
                print(url)
            except IOError as err:
                print("I/O error({0}): {1}".format(err.errno, err.strerror))
                # Leave DB 1 selected for the next iteration.
                self.r.switchDB(1)
            except Exception:
                # Any other failure on one page must not stop the crawl.
                continue

        try:
            self.driver.quit()
            print("Fin du crawling du site " + (url or ""))
        except URLError:
            self.driver = webdriver.Firefox()
            print('boum')

        if url is not None:
            self.lock.acquire()
            try:
                self.r.switchDB(1)
                self.r.put(url, url)
            finally:
                self.lock.release()

    def sort(self, elem_links, url):
        """Filter ``elem_links`` and queue the ones that pass.

        A link is skipped when it is empty, already present in DB 2 or
        already present in DB 1. Otherwise it is checked against the
        scheme, domain and extension filters: a rejected link is stored in
        DB 2, an accepted one is pushed on the ``'crawl'`` list in DB 1 and
        appended to the in-memory queue.

        :param elem_links: iterable of href values (entries may be empty).
        :param url: page the links were found on; not used by the filter.
        """
        fex = Faup()
        filters = Filters()
        filters.load()

        for new_url in elem_links:
            if not new_url:
                continue
            try:
                self.r.switchDB(2)
                if self.r.get(new_url):
                    continue
                self.r.switchDB(1)
                if self.r.get(new_url):
                    continue

                fex.decode(new_url)
                domain = fex.get_host()
                # The verdict is computed per link; every filter is
                # evaluated, as before.
                verdicts = (
                    filters.isfilteredscheme(fex.get_scheme()),
                    filters.isfiltereddomains(domain),
                    filters.isfilteredextention(fex.get_resource_path()),
                )
                if any(verdicts):
                    self.r.switchDB(2)
                    self.r.put(new_url, new_url)
                else:
                    self.r.switchDB(1)
                    self.r.rpush('crawl', new_url)
                    self.queue.append(new_url)
            except TypeError:
                # One malformed link must not discard the rest of the page.
                print("TypeError")