Example #1
 def process_request(self, request, spider):
     driver = None
     logging.info("*************PhontomJsMiddleware*************")
     jobTemplateList = CacheFactory.get("job_template_by_url", request.url)
     if jobTemplateList:
         jobTemplate = jobTemplateList[0]
     else:
         jobTemplate = spider.jobTemplate
     if jobTemplate:
         if hasattr(jobTemplate, 'needLogin') and jobTemplate.needLogin:
             userName = jobTemplate.userName
             password = jobTemplate.passWord
             unameId = jobTemplate.unameElementId
             passwordId = jobTemplate.pwdElementId
             submitId = jobTemplate.submitElementId
             return self.login(request=request, username=userName, password=password, username_id=unameId,
                               password_id=passwordId, submit_id=submitId)
         if jobTemplate.phantomjsFlag and mutex.acquire():
             try:
                 logging.info("*************PhontomJsMiddleware*************" + request.url)
                 driver = webdriver.PhantomJS(executable_path=ConfigUtils.getSpiderPorperties(
                     ConfigUtils.KEYMAP.PHANTOMJS))  # executable_path='D:\\developTools\\phantomjs-2.1.1-windows\\bin\\phantomjs.exe'
                 capabilities = self.get_desired_capabilities(spider)
                 driver.start_session(capabilities)
                 driver.set_page_load_timeout(30)
                 driver.set_script_timeout(30)
                 driver.set_window_size(1000, 10000)  # use a large window so lazily loaded content gets rendered
                 driver.get(request.url)
                 time.sleep(int(jobTemplate.sleepTime))
                 body = driver.page_source
                 logging.info("PhantomJS is visiting " + request.url)
                 htmlResponse = HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=request)
                 return htmlResponse
             except Exception as e:
                 urlListStatusId = request.meta.get("urlListStatusId")
                 if urlListStatusId:
                     UrlDao.updateUrlStatus(urlListStatusId, UrlStatus.FAIL, repr(e))
                 logging.exception("time out visiting==>%s,%s" % (request.url, str(e)))
             finally:
                 # The original snippet is truncated here; a plausible completion,
                 # assumed from the cleanup the author sketched in comments: always
                 # quit the driver and release the lock acquired above.
                 try:
                     if driver is not None:
                         driver.quit()
                 except Exception as e:
                     logging.error("driver cleanup failed: " + str(e))
                 mutex.release()
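
The get_desired_capabilities helper called above is not included in the snippet. A minimal sketch of what it plausibly returns, assuming Selenium's stock DesiredCapabilities plus an optional per-spider user agent (the attribute name and capability key are assumptions, not taken from this project):

from selenium.webdriver.common.desired_capabilities import DesiredCapabilities

def get_desired_capabilities(self, spider):
    # Hypothetical reconstruction, not the project's actual helper: start from
    # the PhantomJS defaults and inject the spider's user agent if it has one.
    caps = dict(DesiredCapabilities.PHANTOMJS)
    user_agent = getattr(spider, "user_agent", None)
    if user_agent:
        caps["phantomjs.page.settings.userAgent"] = user_agent
    return caps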
Example #2
 def process_exception(self, request, exception, spider):
     urlListStatusId = request.meta.get("urlListStatusId")
     if urlListStatusId:
         UrlDao.updateUrlStatus(urlListStatusId, UrlStatus.FAIL, repr(exception))
     if llen(ConfigUtils.getRedisPorperties(
             KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0 and spider.taskJob.status != TaskStatus.SUCCESS:
         TaskJobDao.updateTaskJobStatus(spider.taskJob.id, TaskStatus.SUCCESS)
         UrlDao.updateUrlStatusListByTaskJobHistoryId(spider.jobTemplate.taskJobHistoryId, status=UrlStatus.STOP,
                                                      desc="The task is over and no longer crawls on this URL")
     logger.info("process_exception ProxyMiddleware")
     return None
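
For context, Scrapy hands download failures to process_exception hooks like the one above only for middlewares activated in settings; a minimal sketch of that wiring (the dotted path and priority value are assumptions, not taken from this project):

# settings.py -- sketch; module path and priority are made up for illustration
DOWNLOADER_MIDDLEWARES = {
    # Scrapy calls process_exception on this class whenever downloading a
    # request raises, letting it mark the URL failed and close out the task.
    "myproject.middlewares.ProxyMiddleware": 543,
}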
Example #3
 def parse(self, response):
     if response.body:
         urlListStatusId = response.meta.get("urlListStatusId")
         if urlListStatusId:
             UrlDao.updateUrlStatus(urlListStatusId, UrlStatus.SUCCESS)
         htmlItem = HtmlItem()
         htmlItem["url"] = response.url
         htmlItem["html"] = response.body
         subUrls = []
         URLgroup = LinkExtractor(allow=()).extract_links(response)
         # depth_limit == 0 means "no limit"; otherwise stop expanding past it
         if self.depth_limit == 0 or self.cur_url_depth < self.depth_limit:
             for URL in URLgroup:
                 if self.is_domain_allowed(URL.url):
                     subUrls.append(URL.url)
         htmlItem["subUrls"] = subUrls
         # htmlItem["taskJob"]=self.taskJob
         # htmlItem["taskJobHistory"] = self.taskJobHistory
         htmlItem["curUrlDepth"] = self.cur_url_depth
         return htmlItem
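
is_domain_allowed is not shown in the snippet; a hypothetical sketch of the check it likely performs, matching how stock Scrapy spiders use allowed_domains:

from urlparse import urlparse  # Python 2, matching the snippet's vintage

def is_domain_allowed(self, url):
    # Hypothetical reconstruction: accept a URL only when its host matches one
    # of the spider's allowed_domains; allow everything when the list is unset.
    if not getattr(self, "allowed_domains", None):
        return True
    host = urlparse(url).netloc.lower()
    return any(host == d or host.endswith("." + d) for d in self.allowed_domains)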
Example #4
 def process_request(self, request, spider):
     driver = None
     logging.info("*************ChromeMiddleware*************")
     jobTemplateList = CacheFactory.get("job_template_by_url", request.url)
     if jobTemplateList:
         jobTemplate = jobTemplateList[0]
     else:
         jobTemplate = spider.jobTemplate
     if jobTemplate:
         if hasattr(jobTemplate, 'needLogin') and jobTemplate.needLogin:
             userName = jobTemplate.userName
             password = jobTemplate.passWord
             unameId = jobTemplate.unameElementId
             passwordId = jobTemplate.pwdElementId
             submitId = jobTemplate.submitElementId
             return self.login(request=request, username=userName, password=password, username_id=unameId,
                               password_id=passwordId, submit_id=submitId)
         if jobTemplate.chromeFlag and mutex.acquire():
             try:
                 driver = webdriver.Chrome(
                     executable_path=ConfigUtils.getSpiderPorperties(ConfigUtils.KEYMAP.CHROME))
                 capabilities = self.get_desired_capabilities(spider)
                 driver.start_session(capabilities)
                 driver.set_page_load_timeout(30)
                 driver.set_script_timeout(30)
                 driver.set_window_size(1000, 10000)  # use a large window so lazily loaded content gets rendered
                 driver.get(request.url)
                 time.sleep(int(jobTemplate.sleepTime))
                 body = driver.page_source
                 logging.info("PhantomJS is visiting " + request.url)
                 htmlResponse = HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=request)
                 driver.quit()
                 return htmlResponse
             except Exception as e:
                 urlListStatusId = request.meta.get("urlListStatusId")
                 if urlListStatusId:
                     UrlDao.updateUrlStatus(urlListStatusId, UrlStatus.FAIL, repr(e))
                 logging.exception("time out visiting==>%s,%s" % (request.url, str(e)))
             finally:
                 # The original snippet is truncated here; a plausible completion:
                 # make sure the driver is gone even on failure, then release the lock.
                 try:
                     if driver is not None:
                         driver.quit()
                 except Exception:
                     pass  # driver was already quit on the success path above
                 mutex.release()
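
Unlike PhantomJS, Chrome opens a visible window by default; a sketch of making the driver construction above headless with standard Selenium Chrome options (the options API is Selenium's, the surrounding config call is the project's):

from selenium import webdriver
from selenium.webdriver.chrome.options import Options

options = Options()
options.add_argument("--headless")                # render with no visible window
options.add_argument("--window-size=1000,10000")  # big viewport for lazy loading
driver = webdriver.Chrome(
    executable_path=ConfigUtils.getSpiderPorperties(ConfigUtils.KEYMAP.CHROME),
    chrome_options=options)  # older Selenium takes chrome_options; newer, options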
Example #5
class RedisCallbackSpider(RedisCrawlSpider):
    def beforeStartUrl(self, data):
        return data

    def next_requests(self):
        """Returns a request to be scheduled or none."""
        use_set = self.settings.getbool('REDIS_START_URLS_AS_SET',
                                        defaults.START_URLS_AS_SET)
        fetch_one = self.server.spop if use_set else self.server.lpop

        if not LicenseUtils.check_licence(LicenseUtils.get_mac_address()):
            reactor.stop()

        # XXX: Do we need to use a timeout here?
        found = 0
        while found < self.redis_batch_size:
            redis_key = fetch_one(self.redis_key)
            taskJobHistoryId = redis_key
            if taskJobHistoryId is not None:
                taskJobHistory = loadTaskJobHistoryById(taskJobHistoryId)
                if taskJobHistory:
                    taskJobId = taskJobHistory.taskJobId
                    taskJob = TaskJobDao.loadTaskById(taskJobId)
                    if taskJob and taskJob.status == TaskStatus.PAUSE:
                        RedisUtils.lpush(
                            ConfigUtils.getRedisPorperties(
                                KEYMAP.ASSIST_SPIDER_REDIS_KEY),
                            taskJobHistoryId)
                        break
                else:
                    break
            else:
                break
            if hashswitch:
                if str(localIP) != str(tjs.get_node(redis_key)):
                    RedisUtils.lpush(self.redis_key, redis_key)
                    return

            redis_key = self.redis_key + "_" + redis_key
            orginData = fetch_one(redis_key)
            data = None
            # data = fetch_one(self.redis_key)
            try:
                logging.info("orginData==" + orginData)
                orginData = json.loads(orginData)
                orginData["taskJobHistoryId"] = taskJobHistoryId
                data = self.beforeStartUrl(orginData)
            except Exception as e:
                logging.error("failed to parse queued data %r: %s" % (orginData, e))
                break
            if not data:
                # Queue empty.
                logging.warning('********dataUrl is null*************')
                break
            try:
                req = self.make_request_from_data(data)
                # req.replace(meta={"id":"123"})
                req.meta["id"] = orginData.get("id")
                req.meta["dataParentId"] = orginData.get("dataParentId")
                req.meta["taskJobHistoryId"] = orginData.get(
                    "taskJobHistoryId")
                req.meta["url"] = orginData.get("url")
                urlListStatusId = req.meta["urlListStatusId"] = orginData.get(
                    "urlListStatusId")
            except Exception as e:
                logging.error("make_request_from_data:e:" + str(e))
                break
            if req:
                if urlListStatusId:
                    UrlDao.updateUrlStatus(urlListStatusId, UrlStatus.RUNNING)
                else:
                    urlListStatus = UrlClazz(
                        url=orginData.get("url"),
                        status=UrlStatus.RUNNING,
                        parentUrl=orginData.get("parentUrl"),
                        taskJobId=orginData.get("id"),
                        taskJobHistoryId=taskJobHistoryId)
                    UrlDao.addUrl(urlListStatus)
                yield req
                found += 1
            else:
                if urlListStatusId:
                    UrlDao.updateUrlStatus(urlListStatusId, UrlStatus.FAIL,
                                           "Request not made from data")
                else:
                    urlListStatus = UrlClazz(
                        url=orginData.get("url"),
                        status=UrlStatus.FAIL,  # the request could not be built, so record the URL as failed
                        parentUrl=orginData.get("parentUrl"),
                        taskJobId=orginData.get("id"),
                        taskJobHistoryId=taskJobHistoryId)
                    UrlDao.addUrl(urlListStatus)
                self.logger.debug("Request not made from data: %r", data)
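
beforeStartUrl is a hook that subclasses can override to adjust the dequeued payload before the request is built; a minimal hypothetical override:

class MySpider(RedisCallbackSpider):
    name = "my_spider"  # hypothetical subclass, for illustration only

    def beforeStartUrl(self, data):
        # Normalize the queued URL before make_request_from_data sees it,
        # e.g. upgrade plain http links to https.
        url = data.get("url", "")
        if url.startswith("http://"):
            data["url"] = "https://" + url[len("http://"):]
        return data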