# -*- coding: utf-8 -*-
import logging

import lxml.html
from ghost import Ghost, TimeoutError


class WxGhost(object):
    def __init__(self):
        self.ghost = Ghost(log_level=logging.CRITICAL).start()
        self.ghost.download_images = False
        try:
            self.ghost.load_cookies("cookie.txt")
            print 'load cookie'
        except IOError:
            print 'load cookie error'
        self.ghost.show()

    def handle_frequency(self):
        # Sogou/WeChat throttling page: "your visits are too frequent"
        if u"您的访问过于频繁" in self.ghost.content:
            print 'frequency'
            self.ghost.show()
            self.ghost.capture_to("seccode.png", selector="#seccodeImage")
            # Wait for the user to enter the captcha; the page then shows
            # "the following content comes from WeChat official accounts".
            self.ghost.wait_for_text(u'以下内容来自微信公众号', timeout=1800)
            self.ghost.save_cookies("cookie.txt")

    def open(self, url):
        try:
            self.ghost.open(url)
            self.handle_frequency()
        except TimeoutError:
            print 'timeout when open'
            return False
        return True

    def evaluate(self, js, expect_loading=True):
        try:
            self.ghost.evaluate(js, expect_loading=expect_loading)
            self.handle_frequency()
        except TimeoutError:
            return False
        return True

    def sleep(self, value):
        self.ghost.sleep(value)

    def get_lxml(self):
        return lxml.html.fromstring(self.ghost.content)
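# A minimal usage sketch for WxGhost (illustrative only): the Sogou search URL
# and the XPath below are assumptions for demonstration, not part of the class.
def _wx_ghost_demo():
    wx = WxGhost()
    if wx.open('http://weixin.sogou.com/weixin?type=2&query=python'):
        wx.sleep(2)                    # let the rendered page settle
        doc = wx.get_lxml()            # parse the rendered HTML with lxml
        for link in doc.xpath('//h3/a/@href'):
            print link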
import urlparse

from bs4 import BeautifulSoup
from bs4.element import Tag


class Crawler:
    def __init__(self, location, cookie_file=None, mainwindow=None):
        self.mainwindow = mainwindow
        self.ghost = Ghost().start()
        self.ghost._confirm_expected = True
        # self.ghost.wait_timeout = page_timeout
        self.ghost.download_images = False
        if cookie_file:
            try:
                self.ghost.load_cookies(cookie_file)
            except IOError:
                self.display("cookie: IOError", '<font color=red>$</font>', 'url')
        self.max_depth = 0
        self.url_queue = []
        self.location = location.split('?')[0]  # drop the query string
        # dvwa_security(self.ghost, 'low')

    def go(self):
        self.display("...crawling", "<b>$</b>", 'url')
        times = 0
        while True:
            try:
                self.ghost.open(self.location)
                # follow redirects: ask the browser for the final URL
                current_url, resources = self.ghost.evaluate('window.location.href')
                self.location = str(current_url)
                r = urlparse.urlparse(self.location)
                self.host = r.netloc
                self.display(self.location, "<a href='$'>$</a>", 'url')
                self.url_queue.append(self.location)
                break
            except TimeoutError:
                times += 1
                if times == 5:
                    self.display("TimeoutError", '<font color=red>$</font>', 'url')
                    self.exit()

        self.crawler_page(self.location, 0)  # url, depth

        # Test every crawled URL
        for url in self.url_queue:
            t = Test(self.ghost, url, self.mainwindow)
            t.test()
        self.exit()

    def crawler_page(self, location, depth):
        if depth >= self.max_depth:
            return
        try:
            self.ghost.open(location)
            # follow redirects: use the final URL as the base for relative links
            current_url, resources = self.ghost.evaluate('window.location.href')
            location = str(current_url)
        except TimeoutError:
            return

        urls = []
        soup = BeautifulSoup(self.ghost.content.encode('utf-8'), from_encoding='utf-8')
        for a in soup.find_all('a'):
            url = self.convert_a(location, a)
            if url:
                host = urlparse.urlparse(url).netloc
                # stay on the same host and skip already-queued URLs
                if host == self.host and url not in self.url_queue:
                    self.display(url, "<a href='$'>$</a>", 'url')
                    self.url_queue.append(url)
                    urls.append(url)
        for url in urls:
            self.crawler_page(url, depth + 1)

    def display(self, content, format=None, widget=None):
        print content
        if self.mainwindow:
            self.mainwindow.display(content, format, widget)

    def convert_a(self, location, a):
        if isinstance(a, Tag):
            try:
                href = a['href']
            except KeyError:
                return None
        elif isinstance(a, basestring):
            href = a
        else:
            return None

        href = href.strip()
        # useless links
        if href.lower() in ['javascript:;', 'javascript:void(0)', 'javascript:void(0);',
                            'return false;', '/', 'http://www', '']:
            return None
        for s in ['mailto:', '#', 'javascript:']:
            if href.lower().startswith(s):
                return None
        # absolute URL
        if href.startswith('http://') or href.startswith('https://'):
            return href
        # protocol-relative, e.g. //www.baidu.com/s
        if href.startswith('//'):
            href = 'http:' + href
        # root-relative path
        elif href.startswith('/'):
            href = urlparse.urlparse(location).scheme + '://' + self.host + href
        # relative path: resolve against the current page's directory
        else:
            href = slash(location) + href
        return href

    def exit(self):
        self.ghost.hide()
        if self.mainwindow:
            self.mainwindow.go_button.setEnabled(True)
            self.mainwindow.finish()
        else:
            print "Finish"
            self.ghost.sleep(120)
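# Crawler relies on two project helpers that are not defined in this file:
# Test, which probes each crawled URL, and slash, which convert_a uses as the
# base when resolving relative hrefs. A plausible sketch of slash, assuming it
# should return the current page's directory with a trailing slash:
def slash(url):
    r = urlparse.urlparse(url)
    # keep the path up to (and including) the last '/', e.g.
    # http://host/a/b.html -> http://host/a/
    path = r.path if r.path.endswith('/') else r.path.rsplit('/', 1)[0] + '/'
    return r.scheme + '://' + r.netloc + path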