def run(self):
    """Starts the crawling process for the listed websites.

    The results queue will start filling up with image URLs.
    """
    self.__running = True

    # Open up all browser windows
    for i in range(self.__browser_instance_cnt):
        if not self.__running:
            break  # End prematurely

        browser = Driver(executable_path=paths.driver)
        # Set up the browser to be closable
        self.__browser_close_methods.append(browser.quit)
        # Set the page timeout
        browser.set_page_load_timeout(self.__load_timeout)
        self.__browser_pool.put(browser)

    crawl_threads = []

    # Crawls the page and returns the given browser to the pool when finished
    def crawl_and_return_to_pool(url, browser):
        progress_weight = (1 / len(self.__website_list)) * 100
        self._crawl_page(url, browser, progress_weight)
        self.__browser_pool.put(browser)

    # Start crawling each URL
    for url in self.__website_list:
        if not self.__running:
            break  # End prematurely

        # Wait for an unused browser instance
        browser = self.__browser_pool.get()
        # Start crawling
        thread = Thread(target=crawl_and_return_to_pool, args=(url, browser))
        thread.start()
        crawl_threads.append(thread)

    # Wait for crawling to finish
    for thread in crawl_threads:
        thread.join()

    self._close_browsers()
    self.__running = False
    self.__is_finished = True
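For reference, the pool handed around above is just a thread-safe queue of reusable browser handles. A minimal self-contained sketch of the same check-out/return pattern, with a hypothetical make_browser() factory standing in for Driver(executable_path=...):

import queue
from threading import Thread

def make_browser():
    # Hypothetical factory; stands in for Driver(executable_path=...)
    return object()

pool = queue.Queue()
for _ in range(3):
    pool.put(make_browser())

def crawl(url):
    browser = pool.get()        # blocks until a browser is free
    try:
        pass                    # ... drive `browser` against `url` here ...
    finally:
        pool.put(browser)       # always return the browser to the pool

threads = [Thread(target=crawl, args=(u,)) for u in ["https://example.com"]]
for t in threads:
    t.start()
for t in threads:
    t.join()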
def get_driver(self, ua, proxy=None):
    if not proxy:
        service_args = self.service_args
    else:
        service_args = self.service_args.copy()
        p = urlsplit(proxy)
        service_args += [
            "--proxy=" + (p.netloc or p.path),
            "--proxy-type=" + (p.scheme or "http"),
        ]
    dcap = self.dcap.copy()
    dcap["phantomjs.page.settings.userAgent"] = ua
    driver = PhantomJS(executable_path=self.executable_path,
                       service_args=service_args,
                       desired_capabilities=dcap)
    driver.set_page_load_timeout(self.page_load_timeout)
    driver.set_script_timeout(self.page_load_timeout)
    return driver
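The (p.netloc or p.path) fallback is there because urlsplit only fills netloc when the proxy string carries a scheme; a bare host:port value lands in path instead. A quick illustration (Python 3 spelling of the import):

from urllib.parse import urlsplit

print(urlsplit("http://127.0.0.1:3128"))
# SplitResult(scheme='http', netloc='127.0.0.1:3128', path='', ...)
print(urlsplit("127.0.0.1:3128"))
# SplitResult(scheme='', netloc='', path='127.0.0.1:3128', ...)
# -> "--proxy=" + (p.netloc or p.path) handles both spellings,
#    and (p.scheme or "http") defaults the proxy type.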
def get_driver(self, request):
    proxy = request.meta.get("proxy", None)
    if not proxy:
        service_args = self.service_args
    else:
        service_args = self.service_args.copy()
        p = urlparse(proxy)
        service_args += [
            "--proxy=" + (p.netloc or p.path),
            "--proxy-type=" + (p.scheme or "http"),
        ]
    # Scrapy header values are bytes, hence the bytes default and the decode below
    ua = request.headers.get(
        'User-Agent',
        b'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:46.0) Gecko/20100101 Firefox/59.0'
    )
    self.dcap["phantomjs.page.settings.userAgent"] = ua.decode()
    driver = PhantomJS(executable_path=self.executable_path,
                       service_args=service_args,
                       desired_capabilities=self.dcap)
    driver.set_page_load_timeout(self.page_load_timeout)
    driver.set_script_timeout(self.page_load_timeout)
    return driver
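One subtlety in this variant: Scrapy normalizes header values to bytes, which is why the fallback above is a bytes literal and the value is decode()d before being written into the PhantomJS capabilities. A minimal check, assuming scrapy is installed:

from scrapy.http import Request

req = Request("http://example.com",
              headers={"User-Agent": "Mozilla/5.0 (test)"})
ua = req.headers.get("User-Agent", b"fallback-ua")
print(type(ua), ua.decode())   # <class 'bytes'> Mozilla/5.0 (test)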
class RequestUtil:
    __browserAgent = 'Mozilla/5.0 (Windows NT 6.1; WOW64; rv:39.0) Gecko/20100101 Firefox/39.0'

    def __init__(self):
        self.cookies = ''
        self._lock = threading.RLock()

    def http_get_request(self, url, referer, timeout=''):
        self._lock.acquire()
        try:
            cookie = cookielib.CookieJar()
            opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie),
                                          SmartRedirectHandler())
            urllib2.install_opener(opener)
            headers = {
                'User-Agent': self.__browserAgent,
                'Referer': referer,
                'Cache-Control': 'max-age=0',
                'Accept': '*/*',
                'Connection': 'Keep-Alive',
                'Accept-encoding': 'gzip'
            }
            req = urllib2.Request(url=url, headers=headers)
            if timeout == '':
                resp = urllib2.urlopen(req)
            else:
                resp = urllib2.urlopen(req, timeout=timeout)
            # Serialize the jar once into a "name=value;name=value" string
            if self.cookies == '':
                for item in cookie:
                    self.cookies = self.cookies + item.name + '=' + item.value + ';'
                self.cookies = self.cookies[:-1]
            # If we were redirected, rebuild the request against the final URL
            if url != resp.url:
                req = urllib2.Request(url=resp.url, headers=headers)
        finally:
            # Release even when urlopen raises, so the lock cannot leak
            self._lock.release()
        return (resp, req)

    def http_post_request(self, url, datas, referer, timeout=''):
        self._lock.acquire()
        try:
            postdata = urllib.urlencode(datas)
            headers = {
                'User-Agent': self.__browserAgent,
                'Referer': referer,
                'Content-Type': 'application/x-www-form-urlencoded',
                'Cache-Control': 'no-cache',
                'Accept': '*/*',
                'Connection': 'Keep-Alive',
                'Accept-encoding': 'gzip',
                'Cookie': self.cookies
            }
            req = urllib2.Request(url=url, data=postdata, headers=headers)
            req.get_host()
            if timeout == '':
                resp = urllib2.urlopen(req)
            else:
                resp = urllib2.urlopen(req, timeout=timeout)
            if url != resp.url:
                req = urllib2.Request(url=resp.url, headers=headers)
        finally:
            self._lock.release()
        return (resp, req)

    def http_get(self, url, refer='https://www.baidu.com'):
        return self.http_get_request(url, refer, 60)

    def http_post(self, url, datas, refer='https://www.baidu.com'):
        return self.http_post_request(url, datas, refer, 60)

    def http_post_request2(self, url, datas, timeout=''):
        if timeout == '':
            resp = urllib2.urlopen(url, datas)
        else:
            resp = urllib2.urlopen(url, datas, timeout=timeout)
        data = resp.read()
        return data

    def http_post2(self, url, datas):
        return self.http_post_request2(url, datas, 300)

    def create_phandomjs(self, service_args, caps, timeout=30):
        self.driver = PhantomJS(desired_capabilities=caps,
                                service_args=service_args)
        self.driver.set_page_load_timeout(timeout)
        self.driver.set_script_timeout(timeout)
        self.driver.implicitly_wait(timeout)

    def close_phandomjs(self):
        try:
            self.driver.quit()
        except Exception:
            pass

    def http_get_phandomjs(self, url, refer='https://www.baidu.com', timeout=1000):
        caps = dict(DesiredCapabilities.PHANTOMJS)
        caps['browserName'] = 'chrome'
        caps["phantomjs.page.settings.resourceTimeout"] = timeout
        caps["phantomjs.page.settings.loadImages"] = False
        caps["phantomjs.page.settings.userAgent"] = self.__browserAgent
        caps["phantomjs.page.customHeaders.Referer"] = refer

        service_args = []
        service_args.append('--load-images=no')
        service_args.append('--disk-cache=yes')
        service_args.append('--cookies-file=')
        self.create_phandomjs(timeout=timeout, service_args=service_args,
                              caps=caps)
        self.driver.get(url)
        return self.driver.page_source
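A short usage sketch for the helper above (Python 2, matching its urllib2 internals); example.com and the form fields are placeholders, and SmartRedirectHandler is assumed importable from the same module:

util = RequestUtil()

# Plain GET with the default Baidu referer and a 60s timeout
resp, req = util.http_get('http://example.com')
print resp.getcode(), resp.url

# POST a form; cookies captured by the first GET are sent along
resp, req = util.http_post('http://example.com/login',
                           {'user': 'alice', 'pass': 'secret'})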
class PagesCrawler(BaseSpider):

    name = 'pages'
    link_extractor = RegexpLinkExtractor(canonicalize=False, deny_extensions=[])
    ignored_exts = set(['.' + e for e in IGNORED_EXTENSIONS])

    def __init__(self, **kw):
        args = DEFAULT_INPUT.copy()
        args.update(kw)
        self.args = args
        self.start_urls = to_list(args['start_urls'])
        self.maxdepth = int(args['maxdepth'])
        self.follow_prefixes = to_list(args['follow_prefixes'])
        self.nofollow_prefixes = to_list(args['nofollow_prefixes'])
        self.discover_prefixes = [
            url_to_lru_clean(
                "http%s://%s" % (https, u.replace('http://', '').replace('https://', '')))
            for u in to_list(args['discover_prefixes'])
            for https in ['', 's']
        ]
        self.resolved_links = {}
        self.user_agent = args['user_agent']
        self.phantom = 'phantom' in args and args['phantom'] and \
            args['phantom'].lower() != "false"
        if self.phantom:
            self.ph_timeout = int(args.get('phantom_timeout', PHANTOM['TIMEOUT']))
            self.ph_idle_timeout = int(args.get('phantom_idle_timeout', PHANTOM['IDLE_TIMEOUT']))
            self.ph_ajax_timeout = int(args.get('phantom_ajax_timeout', PHANTOM['AJAX_TIMEOUT']))
        self.errors = 0
        dispatcher.connect(self.closed, spider_closed)
        dispatcher.connect(self.crashed, spider_error)

    def start_requests(self):
        self.log("Starting crawl task - jobid: %s" % self.crawler.settings['JOBID'],
                 log.INFO)
        self.log("ARGUMENTS : " + str(self.args), log.INFO)
        if self.phantom:
            self.init_phantom()
        for url in self.start_urls:
            yield self._request(url)

    def init_phantom(self):
        self.prefixfiles = os.path.join(scrapyd_config().get('logs_dir'),
                                        HYPHE_PROJECT, self.name,
                                        self.crawler.settings['JOBID'])
        self.log("Using path %s for PhantomJS crawl" % self.prefixfiles, log.INFO)
        phantom_args = []
        if PROXY and not PROXY.startswith(':'):
            phantom_args.append('--proxy=%s' % PROXY)
        phantom_args.append('--cookies-file=%s-phantomjs-cookie.txt' % self.prefixfiles)
        phantom_args.append('--ignore-ssl-errors=true')
        phantom_args.append('--load-images=false')
        self.capabilities = dict(DesiredCapabilities.PHANTOMJS)
        self.capabilities['phantomjs.page.settings.userAgent'] = self.user_agent
        self.capabilities['takesScreenshot'] = False
        self.capabilities['phantomjs.page.settings.javascriptCanCloseWindows'] = False
        self.capabilities['phantomjs.page.settings.javascriptCanOpenWindows'] = False
        self.phantom = PhantomJS(executable_path=PHANTOM['PATH'],
                                 service_args=phantom_args,
                                 desired_capabilities=self.capabilities,
                                 service_log_path="%s-phantomjs.log" % self.prefixfiles)
        self.phantom.implicitly_wait(10)
        self.phantom.set_page_load_timeout(60)
        self.phantom.set_script_timeout(self.ph_timeout + 15)

    def crashed(self, spider):
        self.errors += 1
        self.closed("CRASH")

    def closed(self, reason):
        if self.errors:
            self.log("%s error%s encountered during the crawl." %
                     (self.errors, 's' if self.errors > 1 else ''), log.ERROR)
        if self.phantom:
            self.phantom.quit()
            if not self.errors:
                for f in ["phantomjs-cookie.txt", "phantomjs.log"]:
                    fi = "%s-%s" % (self.prefixfiles, f)
                    if os.path.exists(fi) and not self.errors:
                        os.remove(fi)

    def handle_response(self, response):
        lru = url_to_lru_clean(response.url)
        if self.phantom:
            self.phantom.get(response.url)

            # Collect whole DOM of the webpage including embedded iframes
            with open(os.path.join(PHANTOM["JS_PATH"], "get_iframes_content.js")) as js:
                get_bod_w_iframes = js.read()
            bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
            response._set_body(bod_w_iframes.encode('utf-8'))

            # Try to scroll and unfold page
            self.log("Start PhantomJS scrolling and unfolding", log.INFO)
            with open(os.path.join(PHANTOM["JS_PATH"], "scrolldown_and_unfold.js")) as js:
                try:
                    signal.signal(signal.SIGALRM, timeout_alarm)
                    signal.alarm(self.ph_timeout + 30)
                    timedout = self.phantom.execute_async_script(
                        js.read(), self.ph_timeout,
                        self.ph_idle_timeout, self.ph_ajax_timeout)
                    signal.alarm(0)
                    if timedout:
                        raise SeleniumTimeout
                    self.log("Scrolling/Unfolding finished", log.INFO)
                except SeleniumTimeout:
                    self.log("Scrolling/Unfolding timed-out (%ss)" % self.ph_timeout,
                             log.WARNING)
                    self.errors += 1
                except WebDriverException as e:
                    err = json.loads(e.msg)['errorMessage']
                    self.log("Scrolling/Unfolding crashed: %s" % err, log.ERROR)
                    self.errors += 1
                except Exception as e:
                    self.log("Scrolling/Unfolding crashed: %s %s" % (type(e), e),
                             log.ERROR)
                    self.errors += 1
                    return self._make_raw_page(response, lru)
            bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
            response._set_body(bod_w_iframes.encode('utf-8'))

        # Clean up pages with embedded base64 images that would make Scrapy
        # treat them as non-HTML responses
        if response.status == 200 and not isinstance(response, HtmlResponse):
            try:
                flags = response.flags
                if "partial" in flags:
                    flags.remove("partial")
                flags.append("cleaned")
                response = HtmlResponse(response.url, headers=response.headers,
                                        body=cleanupbase64images(response.body),
                                        flags=flags, request=response.request)
                self.log("WARNING: page with base64 embedded images was cleaned-up for links extraction")
            except Exception:
                pass
        if 300 < response.status < 400 or isinstance(response, HtmlResponse):
            return self.parse_html(response, lru)
        else:
            return self._make_raw_page(response, lru)

    def handle_error(self, failure, response=None):
        if response:
            p = self._make_raw_page(response, failure.request.url)
            p['error'] = error_name(failure.value)
            return p
        elif "://www" not in failure.request.url:
            return self._request(failure.request.url.replace('://', '://www.'))
        error = failure.getErrorMessage()
        self.log("ERROR : %s" % error, log.ERROR)
        # Retry without the proxy when it chokes on SSL
        if PROXY and not PROXY.startswith(':') and "OpenSSL.SSL.Error" in error:
            return self._request(failure.request.url, noproxy=True)
        self.errors += 1
        return

    def parse_html(self, response, lru):
        lrulinks = []
        # handle redirects
        realdepth = response.meta['depth']
        if 300 < response.status < 400:
            redir_url = response.headers['Location']
            if redir_url.startswith('/'):
                redir_url = "%s%s" % (lru_get_host_url(lru).strip('/'), redir_url)
            elif redir_url.startswith('./') or not redir_url.startswith('http'):
                redir_url = "%s%s" % (lru_get_path_url(lru).strip('/'), redir_url[1:])
            links = [{'url': redir_url}]
            response.meta['depth'] -= 1
        else:
            try:
                links = self.link_extractor.extract_links(response)
            except Exception as e:
                self.log("ERROR: links extractor crashed on %s: %s %s" %
                         (response, type(e), e), log.ERROR)
                links = []
                self.errors += 1
        for link in links:
            try:
                url = link.url
            except AttributeError:
                url = link['url']
            try:
                lrulink = url_to_lru_clean(url)
            except ValueError as e:
                self.log("Error converting URL %s to LRU: %s" % (url, e), log.ERROR)
                continue
            lrulinks.append((url, lrulink))
            if self._should_follow(response.meta['depth'], lru, lrulink) and \
                    not url_has_any_extension(url, self.ignored_exts):
                yield self._request(url)
        response.meta['depth'] = realdepth
        yield self._make_html_page(response, lru, lrulinks)
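The signal.alarm() bracket around execute_async_script above is a generic main-thread watchdog for a blocking call (Unix only). Stripped to its skeleton, with a Timeout exception standing in for the module's timeout_alarm handler, the pattern looks like this:

import signal

class Timeout(Exception):
    pass

def timeout_alarm(signum, frame):
    raise Timeout

def guarded(fn, seconds):
    signal.signal(signal.SIGALRM, timeout_alarm)
    signal.alarm(seconds)          # schedule the alarm
    try:
        return fn()
    finally:
        signal.alarm(0)            # always cancel it, success or failure

# guarded(lambda: phantom.execute_async_script(...), ph_timeout + 30)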
class CNStock(SentimentCrawler):

    def __init__(self):
        super().__init__(init=False)
        self.driver = PhantomJS()
        self.driver.maximize_window()
        self.wait = WebDriverWait(self.driver, 15)
        self.url = 'http://www.cnstock.com/'
        self.name = '中国证券网'

    def crawl_main_page(self, keyword):
        self.driver.set_page_load_timeout(10)
        try:
            self.driver.get(self.url)
        except TimeoutException:
            # Stop loading after the timeout and work with what has arrived
            self.driver.execute_script('window.stop();')
        try:
            self.wait.until(ec.presence_of_element_located((By.ID, 'nav_keywords')))
        except Exception:
            CustomLogging.log_to_file('failed to open 中国证券网', LogType.ERROR)
        self.driver.find_element_by_id('nav_keywords').clear()
        self.driver.find_element_by_id('nav_keywords').send_keys(keyword + Keys.ENTER)
        return self.crawl_search_results()

    def crawl_search_results(self):
        search_results = []
        self.driver.switch_to.window(self.driver.window_handles[-1])
        self.driver.maximize_window()
        exit_flag = 0
        while True:
            try:
                self.wait.until(ec.presence_of_element_located((By.CLASS_NAME, 'result-cont')))
            except TimeoutException:
                CustomLogging.log_to_file('search results page error on 中国证券网', LogType.ERROR)
                break
            try:
                result_articles = self.driver.find_elements_by_class_name('result-article')
                for each_article in result_articles:
                    item = Entity()
                    publish_date = each_article.find_element_by_class_name('g').text
                    item.publish_date = re.search(
                        re.compile(
                            r'[1-9]\d{3}-(0[1-9]|1[0-2])-(0[1-9]|[1-2][0-9]|3[0-1])\s+(20|21|22|23|[0-1]\d):[0-5]\d'
                        ), publish_date).group()
                    if not in_date_range(conv_pub_date(item.publish_date, 'cnstock'),
                                         self.year_range):
                        exit_flag = 1
                        # Break out of the for loop
                        break
                    item.short_description = each_article.find_element_by_class_name('des').text
                    item.title = each_article.find_element_by_tag_name('a').text
                    if self.keyword not in item.short_description and self.keyword not in item.title:
                        continue
                    if item.title in self.titles:
                        continue
                    else:
                        self.titles.append(item.title)
                    item.url = each_article.find_element_by_tag_name('a').get_attribute('href')
                    threading.Thread(target=super().download_and_save_item,
                                     args=(item,)).start()
                if exit_flag == 1:
                    break
            except NoSuchElementException:
                CustomLogging.log_to_file('no search results', LogType.INFO)
                break
            try:
                next_page = self.driver.find_element_by_xpath(
                    '//div[@class="pagination pagination-centered"]//a[contains(text(), "下一页")]')
                self.driver.get(next_page.get_attribute('href'))
                # next_page.click()
            except NoSuchElementException:
                break
        return search_results

    def parse_html(self, url, html):
        bs = BeautifulSoup(html, 'lxml')
        try:
            full_content = bs.find('div', attrs={'id': 'qmt_content_div'}).text
            return full_content
        except Exception:
            CustomLogging.log_to_file('page parse error: {0}|{1}'.format(self.name, url),
                                      LogType.ERROR)
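The wait.until(ec.presence_of_element_located(...)) calls above are Selenium's explicit-wait idiom: poll the DOM until the element exists or a TimeoutException fires. A minimal standalone helper showing the same idea; any WebDriver instance can replace PhantomJS here:

from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as ec

def wait_for(driver, locator, seconds=15):
    # Poll until the element is present in the DOM, or raise TimeoutException
    return WebDriverWait(driver, seconds).until(
        ec.presence_of_element_located(locator))

# box = wait_for(driver, (By.ID, 'nav_keywords'))
# box.send_keys('keyword')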
class InspectAddress(object):

    def __init__(self):
        dcap = dict(DesiredCapabilities.PHANTOMJS)
        # Set the userAgent
        dcap["phantomjs.page.settings.userAgent"] = (
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10.9; rv:25.0) Gecko/20100101 Firefox/25.0 "
        )
        self.driver = PhantomJS(
            executable_path=r'phantomjs-2.1.1-windows\bin\phantomjs.exe',
            desired_capabilities=dcap)

    def get_dev_cookie(self):
        logurl = 'https://www.bidinghuo.cn/api/backend/login.json'
        # jsondata_url = 'https://www.bidinghuo.cn/api/backend/platform/query.json'
        headers = {
            'Content-Type': 'application/json;charset=UTF-8',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'
        }
        data = {
            u'username': config.developers_account[0],
            u'password': config.developers_account[1]
        }
        value = ''
        try:
            res = requests.post(logurl, data=data, headers=headers)
            if res.status_code == 200:
                print u'Developer platform account login - success'
                value = res.cookies['laravel_session']
            else:
                print u'Developer platform account login - failed'
        except Exception:
            print u'Developer platform account login - failed'
        cookies = {
            u'domain': u'.bidinghuo.cn',
            u'secure': False,
            u'value': value,
            u'expiry': None,
            u'path': u'/',
            u'httpOnly': True,
            u'name': u'laravel_session'
        }
        return cookies

    def get_brand_cookie(self):
        logurl = 'https://pyf123.bidinghuo.cn/api/admin/login.json'
        headers = {
            'Content-Type': 'application/json;charset=UTF-8',
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/64.0.3282.186 Safari/537.36'
        }
        data = {
            u'username': config.brand_user[0],
            u'password': config.brand_user[1]
        }
        value = ''
        try:
            res = requests.post(logurl, data=data, headers=headers)
            if res.status_code == 200:
                print u'Brand account login - success'
                value = res.cookies['laravel_session']
            else:
                print u'Brand account login - failed'
        except Exception:
            print u'Brand account login - failed'
        cookies = {
            u'domain': u'.bidinghuo.cn',
            u'secure': False,
            u'value': value,
            u'expiry': None,
            u'path': u'/',
            u'httpOnly': True,
            u'name': u'laravel_session'
        }
        return cookies

    def developer_platform(self):
        '''Visit the brand developer management platform.'''
        url = config.developers_platform
        try:
            # Set the timeout before navigating, so it applies to the load
            self.driver.set_page_load_timeout(30)
            self.driver.add_cookie(self.get_dev_cookie())
            self.driver.get(url)
        except Exception:
            print u'Visit brand developer management platform - exception'
        try:
            page = self.driver.page_source
            page_soup = BeautifulSoup(page)
            username = page_soup.find_all(class_='user-name')[0]
            assert username.string == config.developers_account[0]
            print u'Brand developer management platform - access OK'
        except Exception:
            print u'Brand developer management platform - access error'

    def brand_platform(self):
        '''Visit the brand admin backend.'''
        url = config.brand_platform
        try:
            self.driver.set_page_load_timeout(30)
            self.driver.add_cookie(self.get_brand_cookie())
            self.driver.get(url)
            bdh_title = BeautifulSoup(
                self.driver.page_source).find_all(class_='ovh')[0].h2.string
            nsgj_title = BeautifulSoup(
                self.driver.page_source).find_all(class_='ovh')[1].h2.string
            assert bdh_title == u'必订火'
            assert nsgj_title == u'内审管家'
            print u'Visit brand backend - OK'
        except Exception:
            print u'Visit brand backend - exception'
        try:
            nsgj_href = self.driver.find_element_by_xpath(
                '//*[@id="app"]/div[2]/div/div[2]/div/div[2]/a').get_attribute('href')
            bdh_href = self.driver.find_element_by_xpath(
                '//*[@id="app"]/div[2]/div/div[1]/div/div[2]/a').get_attribute('href')
            assert requests.get(bdh_href).status_code == 200
            self.driver.get(bdh_href)
            dhh_title = BeautifulSoup(self.driver.page_source).find_all(
                class_='meeting-name text-overflow')[0].string
            assert dhh_title == u'测试订货会'
            print u'Visit brand ordering fair - OK'
        except Exception:
            print u'Visit brand ordering fair - exception'
        try:
            assert requests.get(nsgj_href).status_code == 200
            self.driver.get(nsgj_href)
            nsh_title = BeautifulSoup(self.driver.page_source).find_all(
                class_='meeting-name text-overflow')[0].string
            assert nsh_title == u'认同与人体'
            print u'Visit brand internal-audit manager - OK'
        except Exception:
            print u'Visit brand internal-audit manager - exception'
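One Selenium caveat in the class above: add_cookie applies to the domain of the page currently loaded, so a driver generally has to visit the site once before a cookie like laravel_session can be injected, and page-load timeouts only affect navigations started after they are set. A hedged sketch of the safer ordering (function and argument names are illustrative, not from the snippet):

def login_with_cookie(driver, cookie, url):
    driver.set_page_load_timeout(30)          # timeouts first
    driver.get('https://www.bidinghuo.cn/')   # land on the domain once
    driver.add_cookie(cookie)                 # now the session cookie sticks
    driver.get(url)                           # authenticated page load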
class WeixinPhantomjs(Base):
    all_uids = {docs['uid'] for docs in in_collection.find({}, {'uid': 1})
                if 'uid' in docs}

    def __init__(self):
        self.start_page = START_PAGE
        self.end_page = END_PAGE
        self.weixin_url = REFER_FIRST

        # self.driver = Firefox()
        if hasattr(config, 'PHANTOMJS_PATH'):
            self.driver = PhantomJS(executable_path=getattr(config, 'PHANTOMJS_PATH'))
        else:
            self.driver = PhantomJS()

    def open_weixin_browser(self, word):
        try:
            self.driver.get(self.weixin_url)
            self.driver.set_page_load_timeout(3)
            self.driver.find_element_by_id('upquery').send_keys(word)
            self.driver.find_element_by_class_name('swz').click()
            time.sleep(3)
            urls_uids = self.extract_urls_uids(word=word)
            Article(urls_uids=urls_uids, word=word).extract()
        except Exception as e:
            storage_word.append([word, 0])
            self.logger.info('Open weixin error: type <{}>, msg <{}>'.format(e.__class__, e))
            self.close_browser()
            return True
        return False

    def get_total_pages_to_word(self):
        pages = []
        page_id_css = 'pagebar_container'
        try:
            e = self.driver.find_element_by_id(page_id_css)
            for _p in e.text.split():
                _p = _p.strip()
                if not _p.isdigit():
                    # First non-digit token marks the end of the page list
                    return DEFAULT_PAGES if DEFAULT_PAGES <= pages[-1] else pages[-1]
                else:
                    pages.append(int(_p))
            return 1
        except (NoSuchElementException, NoSuchWindowException, TypeError, IndexError):
            pass
        return 1

    def extract_urls_uids(self, word):
        urls_uids = []
        timestamp = [_t.get_attribute('t')
                     for _t in self.driver.find_elements_by_css_selector('div.s-p')]
        urls_tits = [(t.get_attribute('href'), self.trim(t.text))
                     for t in self.driver.find_elements_by_css_selector('h4 a')]
        if len(urls_tits) != len(timestamp):
            return urls_uids

        for index, url_tit in enumerate(urls_tits):
            try:
                uid = self.md5(timestamp[index] + url_tit[1] + word)
                if uid not in self.__class__.all_uids:
                    self.__class__.all_uids.add(uid)
                    urls_uids.append({'url': url_tit[0], 'uid': uid})
            except (TypeError, IndexError):
                pass
        return urls_uids

    @property
    def is_forbidden(self):
        css_id = 'seccodeForm'
        try:
            if self.driver.find_element_by_id(css_id):
                return True
        except NoSuchElementException:
            pass
        return False

    def appear_element(self, by):
        try:
            # Wait up to 20s for the element, then `click` it
            tag = WebDriverWait(self.driver, 20).until(
                lambda driver: driver.find_element_by_id(by))
            tag.click()
            return True
        except (TimeoutException, NoSuchWindowException, NoSuchElementException):
            pass
        return False

    def crawl_single(self, word=None, go=0):
        is_go = True
        go_page = int(go)
        next_page_css = 'sogou_page_%s'

        is_break = self.open_weixin_browser(word)
        pages = self.get_total_pages_to_word()
        for page in range(self.start_page + 1, (pages or self.end_page) + 1):
            if is_go and page < go_page:
                continue
            else:
                is_go = False

            if not self.appear_element(by=next_page_css % page):
                is_break = True
                msg = '\tNext page element did not appear, will break'
            elif self.is_forbidden:
                is_break = True
                msg = '\tSpider was forbidden, crawling again after sleeping a moment!'
            if is_break:
                storage_word.append([word, page])
                self.logger.info(msg)
                break

            urls_uids = self.extract_urls_uids(word=word)
            Article(urls_uids=urls_uids, word=word).extract()

            # self.driver.find_element_by_id(next_page_css % page).click()
            # wt = randint(10, 40) if page % 5 == 0 else randint(5, 18)
            wt = randint(1, 5)
            self.logger.info('Word <{}>, Page <{}> Done, sleeping {}s!'.format(word, page, wt))
            # self.driver.implicitly_wait(wt)
            time.sleep(wt)

        self.close_browser()

    @classmethod
    def crawl_with_threads(cls):
        pool = ThreadPool(4)
        total_words = QueryWords().get_query_words()
        for bulk_words in total_words:
            try:
                pool.map(lambda w: cls().crawl_single(w), bulk_words)
            except Exception as e:
                cls.logger.info('Threads crawl error: type <{}>, msg <{}>'.format(e.__class__, e))
        pool.close()
        pool.join()
        in_client.close()

    def close_browser(self):
        try:
            self.driver.close()
        except (NoSuchWindowException,):
            pass
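The uid above is a content fingerprint (md5 of timestamp + title + word) used to skip already-seen articles across runs. The dedup scheme in isolation, assuming an md5 helper like the one the class inherits from Base:

import hashlib

def md5(text):
    return hashlib.md5(text.encode('utf-8')).hexdigest()

seen = set()

def is_new(timestamp, title, word):
    uid = md5(timestamp + title + word)
    if uid in seen:
        return False        # duplicate article, skip it
    seen.add(uid)
    return True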
class WeixinPhantomjs(Base):

    def __init__(self):
        self.start_page = START_PAGE
        self.end_page = END_PAGE
        self.weixin_url = REFER_FIRST

        # self.driver = Firefox()
        if hasattr(config, 'PHANTOMJS_PATH'):
            self.driver = PhantomJS(executable_path=getattr(config, 'PHANTOMJS_PATH'))
        else:
            self.driver = PhantomJS()

        self.client = MongoClient(HOST, PORT)
        self.collection = self.client[DB][COLLECTION]
        self.all_uids = self.uids

    def open_weixin_browser(self, word):
        try:
            self.driver.get(self.weixin_url)
            self.driver.set_page_load_timeout(3)
            self.driver.find_element_by_id('upquery').send_keys(word)
            self.driver.find_element_by_class_name('swz').click()
            time.sleep(3)
            urls_uids = self.extract_urls_uids(word=word)
            Article(urls_uids=urls_uids, word=word).extract()
        except Exception as e:
            storage_word.append([word, 0])
            self.logger.info('Open weixin error: type <{}>, msg <{}>'.format(e.__class__, e))
            self.close_browser()
            return True
        return False

    def get_total_pages_to_word(self):
        pages = []
        page_id_css = 'pagebar_container'
        try:
            e = self.driver.find_element_by_id(page_id_css)
            for _p in e.text.split():
                _p = _p.strip()
                if not _p.isdigit():
                    # First non-digit token marks the end of the page list
                    return DEFAULT_PAGES if DEFAULT_PAGES <= pages[-1] else pages[-1]
                else:
                    pages.append(int(_p))
            return 1
        except (NoSuchElementException, NoSuchWindowException, TypeError, IndexError):
            pass
        return 1

    def get_query_words(self, word):
        query_words = []
        for docs in self.collection.find({}, {'rel': 1, 'conp': 1}).sort([('_id', 1)]):
            w = docs['conp']
            if w not in query_words:
                query_words.append(w)
            for item in docs['rel']:
                if item not in query_words:
                    query_words.append(item)
        self.client.close()
        return self.query_index(query_words, word)

    @property
    def uids(self):
        return {docs['uid'] for docs in in_collection.find({}, {'uid': 1})
                if 'uid' in docs}

    def extract_urls_uids(self, word):
        urls_uids = []
        timestamp = [_t.get_attribute('t')
                     for _t in self.driver.find_elements_by_css_selector('div.s-p')]
        urls_tits = [(t.get_attribute('href'), self.trim(t.text))
                     for t in self.driver.find_elements_by_css_selector('h4 a')]
        if len(urls_tits) != len(timestamp):
            return urls_uids

        for index, url_tit in enumerate(urls_tits):
            try:
                uid = self.md5(timestamp[index] + url_tit[1] + word)
                if uid not in self.all_uids:
                    self.all_uids.add(uid)
                    urls_uids.append({'url': url_tit[0], 'uid': uid})
            except (TypeError, IndexError):
                pass
        return urls_uids

    @staticmethod
    def query_index(words, cut_word):
        temp_words = words[START_INDEX:END_INDEX]
        try:
            index = temp_words.index(cut_word)
            return temp_words[index:], index + START_INDEX
        except ValueError:
            pass
        return temp_words, START_INDEX

    @property
    def is_forbidden(self):
        css_id = 'seccodeForm'
        try:
            if self.driver.find_element_by_id(css_id):
                return True
        except NoSuchElementException:
            pass
        return False

    def appear_element(self, by):
        try:
            # Wait up to 20s for the element, then `click` it
            tag = WebDriverWait(self.driver, 20).until(
                lambda driver: driver.find_element_by_id(by))
            tag.click()
            return True
        except (TimeoutException, NoSuchWindowException, NoSuchElementException):
            pass
        return False

    def crawl(self, word=None, go=0):
        is_go = True
        is_break = False
        go_page = int(go)
        next_page_css = 'sogou_page_%s'
        query_words, ind = self.get_query_words(word)

        for index, word in enumerate(query_words, 1):
            next_ind = ind + index
            is_break = self.open_weixin_browser(word)
            pages = self.get_total_pages_to_word()
            for page in range(self.start_page + 1, (pages or self.end_page) + 1):
                if is_go and page < go_page:
                    continue
                else:
                    is_go = False

                if not self.appear_element(by=next_page_css % page):
                    is_break = True
                    msg = '\tNext page element did not appear, will break'
                elif self.is_forbidden:
                    is_break = True
                    msg = '\tSpider was forbidden, crawling again after sleeping a moment!'
                if is_break:
                    storage_word.append([word, page])
                    self.logger.info(msg)
                    break

                urls_uids = self.extract_urls_uids(word=word)
                Article(urls_uids=urls_uids, word=word).extract()

                # self.driver.find_element_by_id(next_page_css % page).click()
                wt = randint(10, 40) if page % 3 == 0 else randint(5, 18)
                self.logger.info('Index <{}>, Word <{}>, Page <{}> Done, sleeping {}s!'.format(
                    next_ind, word, page, wt))
                # self.driver.implicitly_wait(wt)
                time.sleep(wt)
            if is_break:
                break

        in_client.close()
        self.close_browser()

    def close_browser(self):
        try:
            self.driver.close()
        except (NoSuchWindowException,):
            pass
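Both variants append [word, page] to storage_word when they bail out, and the go argument replays that checkpoint on the next run. A toy sketch of the handshake (storage_word is a plain list here; the real one may be shared state):

storage_word = []

# ... a run is interrupted on page 7 of 'foo' ...
storage_word.append(['foo', 7])

# Next run: fast-forward pages below the checkpoint
for word, go_page in storage_word:
    for page in range(2, 11):
        if page < go_page:
            continue        # skip what was already crawled
        # crawl `word`, page `page` ...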
class PagesCrawler(Spider):

    name = 'pages'
    link_extractor = RegexpLinkExtractor(canonicalize=False, deny_extensions=[])
    ignored_exts = set(['.' + e for e in IGNORED_EXTENSIONS])

    def __init__(self, **kwargs):
        mongo = MongoClient(MONGO_HOST, MONGO_PORT)[MONGO_DB][MONGO_JOBS_COL]
        job = mongo.find_one({"_id": kwargs["job_id"]})
        args = job["crawl_arguments"]
        self.args = args
        self.start_urls = to_list(args['start_urls'])
        self.maxdepth = int(args['max_depth'])
        self.follow_prefixes = to_list(args['follow_prefixes'])
        self.nofollow_prefixes = to_list(args['nofollow_prefixes'])
        self.prefixes_trie = LRUTrie()
        for p in self.follow_prefixes:
            self.prefixes_trie.set_lru(p, True)
        for p in self.nofollow_prefixes:
            self.prefixes_trie.set_lru(p, False)
        self.discover_prefixes = [
            url_to_lru_clean(
                "http%s://%s" % (https, u.replace('http://', '').replace('https://', '')),
                TLDS_TREE)
            for u in to_list(args['discover_prefixes'])
            for https in ['', 's']
        ]
        self.resolved_links = {}
        self.user_agent = args['user_agent']
        self.phantom = 'phantom' in args and args['phantom'] and \
            args['phantom'].lower() != "false"
        self.cookies = None
        if 'cookies' in args and args["cookies"]:
            self.cookies = dict(cookie.split('=', 1)
                                for cookie in re.split(r'\s*;\s*', args['cookies'])
                                if '=' in cookie)
        if self.phantom:
            self.ph_timeout = int(args.get('phantom_timeout', PHANTOM['TIMEOUT']))
            self.ph_idle_timeout = int(args.get('phantom_idle_timeout', PHANTOM['IDLE_TIMEOUT']))
            self.ph_ajax_timeout = int(args.get('phantom_ajax_timeout', PHANTOM['AJAX_TIMEOUT']))
        self.errors = 0

    @classmethod
    def from_crawler(cls, crawler, *args, **kwargs):
        spider = super(PagesCrawler, cls).from_crawler(crawler, *args, **kwargs)
        crawler.signals.connect(spider.spider_closed, signal=spider_closed)
        crawler.signals.connect(spider.spider_crashed, signal=spider_error)
        return spider

    def start_requests(self):
        self.log("Starting crawl task - jobid: %s" % self.crawler.settings['JOBID'],
                 logging.INFO)
        self.log("ARGUMENTS : " + str(self.args), logging.INFO)
        if self.phantom:
            self.init_phantom()
        for url in self.start_urls:
            yield self._request(url)

    def init_phantom(self):
        self.prefixfiles = os.path.join(scrapyd_config().get('logs_dir'),
                                        HYPHE_PROJECT, self.name,
                                        self.crawler.settings['JOBID'])
        self.log("Using path %s for PhantomJS crawl" % self.prefixfiles, logging.INFO)
        phantom_args = []
        if PROXY and not PROXY.startswith(':'):
            phantom_args.append('--proxy=%s' % PROXY)
        phantom_args.append('--cookies-file=%s-phantomjs-cookie.txt' % self.prefixfiles)
        phantom_args.append('--ignore-ssl-errors=true')
        phantom_args.append('--load-images=false')
        self.capabilities = dict(DesiredCapabilities.PHANTOMJS)
        self.capabilities['phantomjs.page.settings.userAgent'] = self.user_agent
        self.capabilities['takesScreenshot'] = False
        self.capabilities['phantomjs.page.settings.javascriptCanCloseWindows'] = False
        self.capabilities['phantomjs.page.settings.javascriptCanOpenWindows'] = False
        self.phantom = PhantomJS(executable_path=PHANTOM['PATH'],
                                 service_args=phantom_args,
                                 desired_capabilities=self.capabilities,
                                 service_log_path="%s-phantomjs.log" % self.prefixfiles)
        self.phantom.implicitly_wait(10)
        self.phantom.set_page_load_timeout(60)
        self.phantom.set_script_timeout(self.ph_timeout + 15)

    def spider_crashed(self, spider):
        self.errors += 1
        self.spider_closed(spider, reason="CRASH")

    def spider_closed(self, spider, reason=""):
        if self.errors:
            self.log("%s error%s encountered during the crawl (%s)." %
                     (self.errors, 's' if self.errors > 1 else '', reason),
                     logging.ERROR)
        if self.phantom:
            self.phantom.quit()
            if not self.errors:
                for f in ["phantomjs-cookie.txt", "phantomjs.log"]:
                    fi = "%s-%s" % (self.prefixfiles, f)
                    if os.path.exists(fi) and not self.errors:
                        os.remove(fi)

    def handle_response(self, response):
        lru = url_to_lru_clean(response.url, TLDS_TREE)
        if self.phantom:
            self.phantom.get(response.url)

            # Collect whole DOM of the webpage including embedded iframes
            with open(os.path.join(PHANTOM["JS_PATH"], "get_iframes_content.js")) as js:
                get_bod_w_iframes = js.read()
            bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
            response._set_body(bod_w_iframes.encode('utf-8'))

            # Try to scroll and unfold page
            self.log("Start PhantomJS scrolling and unfolding", logging.INFO)
            with open(os.path.join(PHANTOM["JS_PATH"], "scrolldown_and_unfold.js")) as js:
                try:
                    signal.signal(signal.SIGALRM, timeout_alarm)
                    signal.alarm(self.ph_timeout + 30)
                    timedout = self.phantom.execute_async_script(
                        js.read(), self.ph_timeout,
                        self.ph_idle_timeout, self.ph_ajax_timeout)
                    signal.alarm(0)
                    if timedout:
                        raise SeleniumTimeout
                    self.log("Scrolling/Unfolding finished", logging.INFO)
                except SeleniumTimeout:
                    self.log("Scrolling/Unfolding timed-out (%ss)" % self.ph_timeout,
                             logging.WARNING)
                    self.errors += 1
                except WebDriverException as e:
                    err = json.loads(e.msg)['errorMessage']
                    self.log("Scrolling/Unfolding crashed: %s" % err, logging.ERROR)
                    self.errors += 1
                except Exception as e:
                    self.log("Scrolling/Unfolding crashed: %s %s" % (type(e), e),
                             logging.ERROR)
                    self.errors += 1
                    return self._make_raw_page(response, lru)
            bod_w_iframes = self.phantom.execute_script(get_bod_w_iframes)
            response._set_body(bod_w_iframes.encode('utf-8'))

        # Clean up pages with embedded base64 images that would make Scrapy
        # treat them as non-HTML responses
        if response.status == 200 and not isinstance(response, HtmlResponse):
            try:
                flags = response.flags
                if "partial" in flags:
                    flags.remove("partial")
                flags.append("cleaned")
                response = HtmlResponse(response.url, headers=response.headers,
                                        body=cleanupbase64images(response.body),
                                        flags=flags, request=response.request)
                self.log("WARNING: page with base64 embedded images was cleaned-up for links extraction")
            except Exception:
                pass
        if 300 < response.status < 400 or isinstance(response, HtmlResponse):
            return self.parse_html(response, lru)
        else:
            return self._make_raw_page(response, lru)

    def handle_error(self, failure, response=None):
        if response:
            p = self._make_raw_page(response, failure.request.url)
            p['error'] = error_name(failure.value)
            return p
        elif "://www" not in failure.request.url:
            return self._request(failure.request.url.replace('://', '://www.'))
        error = failure.getErrorMessage()
        self.log("ERROR : %s" % error, logging.ERROR)
        # Retry without the proxy when it chokes on SSL
        if PROXY and not PROXY.startswith(':') and "OpenSSL.SSL.Error" in error:
            return self._request(failure.request.url, noproxy=True)
        self.errors += 1
        return

    def parse_html(self, response, lru):
        lrulinks = []
        # handle redirects
        realdepth = response.meta['depth']
        if 300 < response.status < 400:
            redir_url = response.headers['Location']
            if redir_url.startswith('/'):
                redir_url = "%s%s" % (lru_get_host_url(lru).strip('/'), redir_url)
            elif redir_url.startswith('../'):
                lrustart = lru[:lru.rfind('|p:')]
                while redir_url.startswith('../'):
                    lrustart = lrustart[:lrustart.rfind('|p:')]
                    redir_url = redir_url[3:]
                redir_url = "%s/%s" % (lru_to_url(lrustart + '|'), redir_url)
            elif redir_url.startswith('./') or not redir_url.startswith('http'):
                redir_url = "%s%s" % (lru_get_path_url(lru).strip('/'), redir_url[1:])
            links = [{'url': redir_url}]
            response.meta['depth'] -= 1
        else:
            try:
                links = self.link_extractor.extract_links(response)
            except Exception as e:
                self.log("ERROR: links extractor crashed on %s: %s %s" %
                         (response, type(e), e), logging.ERROR)
                links = []
                self.errors += 1
        for link in links:
            try:
                url = link.url
            except AttributeError:
                url = link['url']
            try:
                lrulink = url_to_lru_clean(url, TLDS_TREE)
            except (ValueError, IndexError) as e:
                self.log("Error converting URL %s to LRU: %s" % (url, e), logging.ERROR)
                continue
            lrulinks.append((url, lrulink))
            if self._should_follow(response.meta['depth'], lrulink) and \
                    not url_has_any_extension(url, self.ignored_exts):
                yield self._request(url)
        response.meta['depth'] = realdepth
        yield self._make_html_page(response, lru, lrulinks)

    def _make_html_page(self, response, lru, lrulinks):
        p = self._make_raw_page(response, lru)
        if STORE_HTML:
            p['body'] = Binary(response.body.encode('zip'))
        p['lrulinks'] = lrulinks
        return p

    def _make_raw_page(self, response, lru):
        p = self._new_page(response.url, lru)
        p['status'] = response.status
        p['size'] = len(response.body)
        if isinstance(response, HtmlResponse):
            p['encoding'] = response.encoding
        if response.meta.get('depth'):
            p['depth'] = response.meta['depth']
        if response.headers.get('content-type'):
            p['content_type'] = response.headers.get('content-type').partition(';')[0]
        p['error'] = None
        return p

    def _new_page(self, url, lru=None):
        if lru is None:
            lru = url_to_lru_clean(url, TLDS_TREE)
        p = Page()
        p['url'] = url
        p['lru'] = lru
        p['depth'] = 0
        p['timestamp'] = int(time.time() * 1000)
        return p

    def _should_follow(self, depth, tolru):
        c1 = depth < self.maxdepth
        c2 = self.prefixes_trie.match_lru(tolru)
        return c1 and c2

    def _request(self, url, noproxy=False, **kw):
        kw['meta'] = {'handle_httpstatus_all': True, 'noproxy': noproxy}
        kw['callback'] = self.handle_response
        kw['errback'] = self.handle_error
        if self.cookies:
            kw['cookies'] = self.cookies
        if self.phantom:
            kw['method'] = 'HEAD'
        return Request(url, **kw)
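_request above centralizes how every crawl request is wired: all status codes are delivered to the callback, failures are routed to handle_error, cookies ride along, and a HEAD probe is used when PhantomJS will fetch the body itself anyway. A condensed sketch of the same wiring in plain Scrapy (the function name is hypothetical):

from scrapy import Request

def make_request(url, handle_response, handle_error, cookies=None, phantom=False):
    kw = {
        'meta': {'handle_httpstatus_all': True},  # deliver every status code
        'callback': handle_response,
        'errback': handle_error,
    }
    if cookies:
        kw['cookies'] = cookies
    if phantom:
        kw['method'] = 'HEAD'   # the headless browser re-fetches the body
    return Request(url, **kw)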