Example #1
0
 def loadNext(self, childJobTemplateList, item):
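     # Either finish the task job (no child templates left and the main spider queue
     # in Redis already drained) or enqueue a crawl for every child template.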
     if childJobTemplateList == None or len(childJobTemplateList) == 0:
         # pcInfo = Pcinfo()
         # pidList = pcInfo.getPidListByProcessName(ConfigUtils.getRedisPorperties(KEYMAP.PROCESS_SPIDER_NAME))
         # if pidList and len(pidList):
         #     RedisUtils.lpush(ConfigUtils.getRedisPorperties(KEYMAP.PROCESS_SPIDER_STATUS) + "_" + os.getpid(), 0)
         #     for pid in pidList:
         #         RedisUtils.lpush(ConfigUtils.getRedisPorperties(KEYMAP.PROCESS_SPIDER_STATUS) + "_" + pid, 0)
         # else:
         if llen(
                 ConfigUtils.getRedisPorperties(
                     KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0:
             if self.taskJob.status != TaskStatus.SUCCESS:
                 TaskJobDao.updateTaskJobStatus(self.taskJob.id,
                                                TaskStatus.SUCCESS)
                 UrlDao.updateUrlStatusListByTaskJobHistoryId(
                     self.taskJobHistoryId,
                     status=UrlStatus.STOP,
                     desc="The task is over and no longer crawls on this URL"
                 )
         return
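     # Queue a crawl for each child template, carrying the parent record id so the
     # child rows can be linked back through dataParentId.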
     for jobTemplate in childJobTemplateList:
         parentId = str(item.get("id"))
         taskJobParam = TaskJobParam(paramNameEn="dataParentId",
                                     paramValue=parentId)
         taskJobParamList = []
         taskJobParamList.append(taskJobParam)
         taskJobParamList.extend(self.taskJobParamList)
         CrawlerService.parseUrlAndInsertRedis(
             taskJob=self.taskJob,
             paramMap=item,
             taskJobParam=taskJobParamList,
             taskJobHistory=TaskJobHistory(id=self.taskJobHistoryId),
             jobTemplate=jobTemplate)
Example #2
0
 def process_request(self, request, spider):
     driver = None
     logging.info("*************PhontomJsMiddleware*************")
     jobTemplateList = CacheFactory.get("job_template_by_url", request.url)
     if jobTemplateList != None and len(jobTemplateList) > 0:
         jobTemplate = jobTemplateList[0]
     else:
         jobTemplate = spider.jobTemplate
     # jobTemplate = spider.jobTemplate
     if jobTemplate:
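         # Templates that require a login are handed to the login helper instead of
         # being fetched directly.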
         if hasattr(jobTemplate, 'needLogin') and jobTemplate.needLogin:
             userName = jobTemplate.userName
             password = jobTemplate.passWord
             unameId = jobTemplate.unameElementId
             passwordId = jobTemplate.pwdElementId
             submitId = jobTemplate.submitElementId
             return self.login(request=request, username=userName, password=password, username_id=unameId,
                               password_id=passwordId, submit_id=submitId)
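         # Render JavaScript-heavy pages with PhantomJS when the template asks for it;
         # the shared mutex keeps only one rendering session active at a time.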
         if jobTemplate.phantomjsFlag and mutex.acquire():
             try:
                 logging.info("*************PhontomJsMiddleware*************" + request.url)
                 driver = webdriver.PhantomJS(executable_path=ConfigUtils.getSpiderPorperties(
                     ConfigUtils.KEYMAP.PHANTOMJS))  # executable_path='D:\\developTools\\phantomjs-2.1.1-windows\\bin\\phantomjs.exe'
                 capabilities = self.get_desired_capabilities(spider)
                 driver.start_session(capabilities)
                 driver.set_page_load_timeout(30)
                 driver.set_script_timeout(30)
                 driver.set_window_size(1000, 10000)  # make the window large so lazy-loaded content on some sites is rendered
                 driver.get(request.url)
                 time.sleep(int(jobTemplate.sleepTime))
                 body = driver.page_source
                 logging.info("PhantomJS is visiting " + request.url)
                 htmlResponse = HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=request)
                 # driver.close()
                 # driver.service.process.send_signal(signal.SIGTERM)  # kill the specific phantomjs child proc
                 # driver.quit()
                 return htmlResponse
             except Exception, e:
                 urlListStatusId = request.meta.get("urlListStatusId")
                 if urlListStatusId:
                     UrlDao.updateUrlStatus(urlListStatusId, UrlStatus.FAIL, repr(e))
                 logging.exception("time out visiting==>%s,%s" % (request.url, str(e)))
                 # try:
                 #     if driver!=None:
                 #         logging.exception("time out visiting==>%s,%s"%(request.url,str(e)))
                 #         # driver.close()
                 #         driver.service.process.send_signal(signal.SIGTERM)  # kill the specific phantomjs child proc
                 #         driver.quit()
                 # except Exception,e:
                 #     logging.error("451e:" + str(e))
                 # return
             finally:
                 # try:
                 #     driver.close()
                 # except Exception, e:
                 #     logging.error("452e:" + str(e))
                 try:
                     # shut the headless browser down whether rendering succeeded or not
                     if driver != None:
                         driver.quit()
                 except Exception, e:
                     logging.error(str(e))
                 # release the render lock acquired above
                 mutex.release()
Example #3
0
 def process_exception(self, request, exception, spider):
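     # Record the failure for this URL and, once the main Redis queue is drained,
     # close out the whole task job.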
     urlListStatusId = request.meta.get("urlListStatusId")
     if urlListStatusId:
         UrlDao.updateUrlStatus(urlListStatusId, UrlStatus.FAIL, repr(exception))
     if llen(ConfigUtils.getRedisPorperties(
             KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0 and spider.taskJob.status != TaskStatus.SUCCESS:
         TaskJobDao.updateTaskJobStatus(spider.taskJob.id, TaskStatus.SUCCESS)
         UrlDao.updateUrlStatusListByTaskJobHistoryId(spider.jobTemplate.taskJobHistoryId, status=UrlStatus.STOP,
                                                      desc="The task is over and no longer crawls on this URL")
     logger.info("process_exception ProxyMiddleware")
     return None
Example #4
0
 def parse(self, response):
     if response.body:
         urlListStatusId = response.meta["urlListStatusId"]
         if urlListStatusId:
             UrlDao.updateUrlStatus(urlListStatusId, UrlStatus.SUCCESS)
         htmlItem = HtmlItem()
         htmlItem["url"] = response.url
         htmlItem["html"] = response.body
         subUrls = []
         URLgroup = LinkExtractor(allow=()).extract_links(response)
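         # Follow extracted links only while the configured depth limit has not been
         # reached (a depth_limit of 0 means unlimited depth).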
         if (self.cur_url_depth < self.depth_limit
                 and self.depth_limit != 0) or self.depth_limit == 0:
             for URL in URLgroup:
                 if self.is_domain_allowed(URL.url):
                     subUrls.append(URL.url)
         htmlItem["subUrls"] = subUrls
         # htmlItem["taskJob"]=self.taskJob
         # htmlItem["taskJobHistory"] = self.taskJobHistory
         htmlItem["curUrlDepth"] = self.cur_url_depth
         return htmlItem
Example #5
0
 def process_request(self, request, spider):
     driver = None
     logging.info("*************ChromeMiddleware*************")
     jobTemplateList = CacheFactory.get("job_template_by_url", request.url)
     if jobTemplateList != None and len(jobTemplateList) > 0:
         jobTemplate = jobTemplateList[0]
     else:
         jobTemplate = spider.jobTemplate
     if jobTemplate:
         if hasattr(jobTemplate, 'needLogin') and jobTemplate.needLogin:
             userName = jobTemplate.userName
             password = jobTemplate.passWord
             unameId = jobTemplate.unameElementId
             passwordId = jobTemplate.pwdElementId
             submitId = jobTemplate.submitElementId
             return self.login(request=request, username=userName, password=password, username_id=unameId,
                               password_id=passwordId, submit_id=submitId)
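         # Render the page with a real Chrome instance when the template requires
         # JavaScript execution; the shared mutex keeps launches sequential.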
         if jobTemplate.chromeFlag and mutex.acquire():
             try:
                 driver = webdriver.Chrome(
                     executable_path=ConfigUtils.getSpiderPorperties(ConfigUtils.KEYMAP.CHROME))
                 capabilities = self.get_desired_capabilities(spider)
                 driver.start_session(capabilities)
                 driver.set_page_load_timeout(30)
                 driver.set_script_timeout(30)
                 driver.set_window_size(1000, 10000)  # make the window large so lazy-loaded content on some sites is rendered
                 driver.get(request.url)
                 time.sleep(int(jobTemplate.sleepTime))
                 body = driver.page_source
                 logging.info("PhantomJS is visiting " + request.url)
                 htmlResponse = HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=request)
                 driver.quit()
                 return htmlResponse
             except Exception, e:
                 urlListStatusId = request.meta.get("urlListStatusId")
                 if urlListStatusId:
                     UrlDao.updateUrlStatus(urlListStatusId, UrlStatus.FAIL, repr(e))
                 logging.exception("time out visiting==>%s,%s" % (request.url, str(e)))
             finally:
                 try:
                     # defensive cleanup in case the driver was not quit in the try block
                     if driver != None:
                         driver.quit()
                 except Exception, e:
                     logging.error(str(e))
                 # release the render lock acquired above
                 mutex.release()
Example #6
0
 def process_item(self, item, spider):
     try:
         curUrl = item["url"]
         subUrls = item["subUrls"]
         taskJob = spider.taskJob
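         # Persist the raw page HTML to HDFS, keyed by the task job and its database id.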
         self.save_to_hdfs(taskJob.id,taskJob.databaseId,item["html"])
         taskJobHistory = spider.taskJobHistory
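         # Requeue every extracted sub-URL one depth level deeper; when nothing is left
         # and the main queue is empty, mark the task job finished.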
         if subUrls and len(subUrls)>0:
             parentUrlDepth = item["curUrlDepth"]
             for url in subUrls:
                 newTaskJob = ClassCopy.copyToNewInstances(taskJob,TaskJob)
                 newTaskJob.url=url
                 newTaskJob.curUrlDepth=parentUrlDepth+1
                 newTaskJob.parentUrl = curUrl
                 CrawlerService.parseUrlAndInsertRedis(newTaskJob, taskJobHistory=taskJobHistory)
         else:
             if llen(ConfigUtils.getRedisPorperties(KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0:
                 if taskJob.status != TaskStatus.SUCCESS:
                     TaskJobDao.updateTaskJobStatus(taskJob.id, TaskStatus.SUCCESS)
                     UrlDao.updateUrlStatusListByTaskJobHistoryId(spider.taskJobHistory.id, status=UrlStatus.STOP,
                                                                  desc="depth spider is over")
         return item
     except Exception,e:
         logger.exception("CacheHtmlPipeline:"+str(e))
Example #7
0
class RedisCallbackSpider(RedisCrawlSpider):
    def beforeStartUrl(self, data):
        return data

    def next_requests(self):
        """Returns a request to be scheduled or none."""
        use_set = self.settings.getbool('REDIS_START_URLS_AS_SET',
                                        defaults.START_URLS_AS_SET)
        fetch_one = self.server.spop if use_set else self.server.lpop

        if not LicenseUtils.check_licence(LicenseUtils.get_mac_address()):
            reactor.stop()

        # XXX: Do we need to use a timeout here?
        found = 0
        while found < self.redis_batch_size:
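            # Each entry in the main queue is a task-job-history id; the actual URL
            # payload lives in a per-history list named "<redis_key>_<historyId>".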
            redis_key = fetch_one(self.redis_key)
            taskJobHistoryId = redis_key
            if taskJobHistoryId != None:
                taskJobHistory = loadTaskJobHistoryById(taskJobHistoryId)
                if taskJobHistory:
                    taskJobId = taskJobHistory.taskJobId
                    taskJob = TaskJobDao.loadTaskById(taskJobId)
                    if taskJob and taskJob.status == TaskStatus.PAUSE:
                        RedisUtils.lpush(
                            ConfigUtils.getRedisPorperties(
                                KEYMAP.ASSIST_SPIDER_REDIS_KEY),
                            taskJobHistoryId)
                        break
                else:
                    break
            else:
                break
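            # With hash-based sharding enabled, keys owned by another node are pushed
            # back onto the queue for that node to pick up.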
            if hashswitch:
                if str(localIP) != str(tjs.get_node(redis_key)):
                    RedisUtils.lpush(self.redis_key, redis_key)
                    return

            redis_key = self.redis_key + "_" + redis_key
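            # Pop the JSON payload for this history id and let subclasses adjust it
            # via beforeStartUrl() before the request is built.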
            orginData = fetch_one(redis_key)
            data = None
            # data = fetch_one(self.redis_key)
            try:
                logging.info("orginData==" + orginData)
                orginData = json.loads(orginData)
                orginData["taskJobHistoryId"] = taskJobHistoryId
                data = self.beforeStartUrl(orginData)
            except Exception, e:
                logging.error("Error e:")
                logging.error(e)
                logging.error(orginData)
                break
            if not data:
                # Queue empty.
                logging.warning('********dataUrl is null*************')
                break
            try:
                req = self.make_request_from_data(data)
                # req.replace(meta={"id":"123"})
                req.meta["id"] = orginData.get("id")
                req.meta["dataParentId"] = orginData.get("dataParentId")
                req.meta["taskJobHistoryId"] = orginData.get(
                    "taskJobHistoryId")
                req.meta["url"] = orginData.get("url")
                urlListStatusId = req.meta["urlListStatusId"] = orginData.get(
                    "urlListStatusId")
            except Exception, e:
                logging.error("make_request_from_data:e:" + e)
                break
            if req:
                if urlListStatusId:
                    UrlDao.updateUrlStatus(urlListStatusId, UrlStatus.RUNNING)
                else:
                    urlListStatus = UrlClazz(
                        url=orginData.get("url"),
                        status=UrlStatus.RUNNING,
                        parentUrl=orginData.get("parentUrl"),
                        taskJobId=orginData.get("id"),
                        taskJobHistoryId=taskJobHistoryId)
                    UrlDao.addUrl(urlListStatus)
                yield req
                found += 1
            else:
                if urlListStatusId:
                    UrlDao.updateUrlStatus(urlListStatusId, UrlStatus.FAIL,
                                           "Request not made from data")
                else:
                    urlListStatus = UrlClazz(
                        url=orginData.get("url"),
                        status=UrlStatus.FAIL,
                        parentUrl=orginData.get("parentUrl"),
                        taskJobId=orginData.get("id"),
                        taskJobHistoryId=taskJobHistoryId)
                    UrlDao.addUrl(urlListStatus)
                self.logger.debug("Request not made from data: %r", data)
Example #8
0
    def _do_upinsert(self, item):
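        # Build and run one insert per extracted record, tagging each row with the
        # source URL and parent id, then recurse into any child templates.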
        now = str(datetime.now())
        data = item["data"]
        url = item["url"]
        jobTemplateFieldList = item["jobTemplateFieldList"]
        jobTemplate = item["jobTemplate"]
        self.dataParentId = jobTemplate.dataParentId if hasattr(
            jobTemplate, "dataParentId") else None
        extraData = jobTemplate.extraData
        self.taskJob = item["taskJob"]
        # searchTaskJob = item["searchTaskJob"]
        taskJobHistroy = item["taskJobHistroy"]
        self.taskJobHistoryId = jobTemplate.taskJobHistoryId
        taskJobHistroyId = str(taskJobHistroy.id)
        paramMap = {}
        self.taskJobParamList = []
        if taskJobHistroy != None:
            self.taskJobParamList.append(
                TaskJobParam(paramNameEn="task_job_id_sequence",
                             paramValue=taskJobHistroyId))
            paramMap["task_job_id_sequence"] = taskJobHistroyId
        # if searchTaskJob!=None:
        #     self.taskJobParamList.append(TaskJobParam(paramNameEn=searchTaskJob.name, paramValue=searchTaskJob.name))
        #     paramMap[searchTaskJob.name] = searchTaskJob.name
        # self.taskJobParamList = []
        # if self.taskJobHistoryId!=None:
        #     self.taskJobParamList=CacheFactory.get("task_job_param", self.taskJobHistoryId)
        # if self.taskJobParamList!=None:
        #     for taskJobParam in self.taskJobParamList:
        #         paramMap[taskJobParam.paramNameEn]=taskJobParam.paramValue
        tableName = jobTemplate.tableName
        jobTemplateId = jobTemplate.id
        databaseId = jobTemplate.databaseId if jobTemplate.databaseId != "-1" and jobTemplate.databaseId != None else self.taskJob.databaseId
        db = self.dbclient.getConnection(databaseId)

        if db == None:
            logging.warning('db is null,please check it with databaseid :%s' %
                            databaseId)
            if llen(
                    ConfigUtils.getRedisPorperties(
                        KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0:
                if self.taskJob.status != TaskStatus.SUCCESS:
                    TaskJobDao.updateTaskJobStatus(self.taskJob.id,
                                                   TaskStatus.SUCCESS)
                    UrlDao.updateUrlStatusListByTaskJobHistoryId(
                        self.taskJobHistoryId,
                        status=UrlStatus.STOP,
                        desc="no db")
            return
        sqlArray = []
        if data == None or len(data) == 0:
            logging.warning(
                'insert data not exist,please retry crawler or check template or check error'
            )
            if llen(
                    ConfigUtils.getRedisPorperties(
                        KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0:
                if self.taskJob.status != TaskStatus.SUCCESS:
                    TaskJobDao.updateTaskJobStatus(self.taskJob.id,
                                                   TaskStatus.SUCCESS)
                    UrlDao.updateUrlStatusListByTaskJobHistoryId(
                        self.taskJobHistoryId,
                        status=UrlStatus.STOP,
                        desc="no data")
            return
        logging.info('----pipelines insert data-----%s' % str(data))
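        # Each record gets a fresh UUID plus crawl metadata before it is written to
        # the configured backend (kafka, hdfs, mongodb or a relational database).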
        for d in data:
            d["task_job_url"] = url
            if self.dataParentId != None:
                d["parent_id"] = self.dataParentId
            d["id"] = str(uuid.uuid1())
            if self.dbclient.db_type == 'kafka':
                d['TemplateName'] = jobTemplate.name
                d['UrlStatus'] = 0
                d['Timestamps'] = int(time.time())
            if self.dbclient.db_type == 'hdfs' or self.dbclient.db_type == 'mongodb':
                sqlArray.append(
                    db.insert(jobTemplate.id, tableName, d, paramMap))
            else:
                sqlArray.append(db.insert(tableName, d, paramMap))
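            # Recurse into child templates of this template, passing the merged extra
            # data and the current row as the parent context.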
            if jobTemplateId != None:
                try:
                    childJobTemplateList = TemplateDao.queryJobTemplateListByParentId(
                        jobTemplateId)
                    self.loadNext(childJobTemplateList,
                                  dict(extraData.items() + d.items()))
                except Exception, e:
                    logging.error(e.message)
Example #9
0
def saveUrlListStatus(urlListStatus):
    if urlListStatus:
        # RedisUtils.lpush(ConfigUtils.getRedisPorperties(KEYMAP.URL_LIST_SPIDER_REDIS_KEY) + "_" + urlListStatus.taskJobId,
        #                  json.dumps(urlListStatus))
        UrlDao.addUrl(urlListStatus)