Example #1
 def loadNext(self, childJobTemplateList, item):
     # No child templates left: if the Redis crawl queue is also empty, close out the task.
     if childJobTemplateList is None or len(childJobTemplateList) == 0:
         # pcInfo = Pcinfo()
         # pidList = pcInfo.getPidListByProcessName(ConfigUtils.getRedisPorperties(KEYMAP.PROCESS_SPIDER_NAME))
         # if pidList and len(pidList):
         #     RedisUtils.lpush(ConfigUtils.getRedisPorperties(KEYMAP.PROCESS_SPIDER_STATUS) + "_" + os.getpid(), 0)
         #     for pid in pidList:
         #         RedisUtils.lpush(ConfigUtils.getRedisPorperties(KEYMAP.PROCESS_SPIDER_STATUS) + "_" + pid, 0)
         # else:
         if llen(
                 ConfigUtils.getRedisPorperties(
                     KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0:
             if self.taskJob.status != TaskStatus.SUCCESS:
                 TaskJobDao.updateTaskJobStatus(self.taskJob.id,
                                                TaskStatus.SUCCESS)
                 UrlDao.updateUrlStatusListByTaskJobHistoryId(
                     self.taskJobHistoryId,
                     status=UrlStatus.STOP,
                     desc="The task is over and no longer crawls on this URL"
                 )
         return
     for jobTemplate in childJobTemplateList:
         parentId = str(item.get("id"))
         taskJobParam = TaskJobParam(paramNameEn="dataParentId",
                                     paramValue=parentId)
         taskJobParamList = []
         taskJobParamList.append(taskJobParam)
         taskJobParamList.extend(self.taskJobParamList)
         CrawlerService.parseUrlAndInsertRedis(
             taskJob=self.taskJob,
             paramMap=item,
             taskJobParam=taskJobParamList,
             taskJobHistory=TaskJobHistory(id=self.taskJobHistoryId),
             jobTemplate=jobTemplate)
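
The check that recurs throughout these examples, llen(ConfigUtils.getRedisPorperties(KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0, asks whether the Redis list feeding the spider is empty before the task is marked finished. A minimal sketch of such an llen helper, assuming it simply wraps a redis-py client's LLEN on the configured key; the connection settings below are placeholders, not taken from the source:

    import redis

    # Assumed connection settings; the project presumably resolves them via ConfigUtils instead.
    _redis_client = redis.StrictRedis(host="127.0.0.1", port=6379, db=0)

    def llen(key):
        # Length of the Redis list stored at `key`; 0 means there are no pending crawl URLs.
        return _redis_client.llen(key)
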
Example #2
 def process_exception(self, request, exception, spider):
     # Record the failure for this URL, then finish the task if the crawl queue is drained.
     urlListStatusId = request.meta.get("urlListStatusId")
     if urlListStatusId:
         UrlDao.updateUrlStatus(urlListStatusId, UrlStatus.FAIL, repr(exception))
     if llen(ConfigUtils.getRedisPorperties(
             KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0 and spider.taskJob.status != TaskStatus.SUCCESS:
         TaskJobDao.updateTaskJobStatus(spider.taskJob.id, TaskStatus.SUCCESS)
         UrlDao.updateUrlStatusListByTaskJobHistoryId(spider.jobTemplate.taskJobHistoryId, status=UrlStatus.STOP,
                                                      desc="The task is over and no longer crawls on this URL")
     logger.info("process_exception ProxyMiddleware")
     return None
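
process_exception(self, request, exception, spider) is the standard Scrapy downloader-middleware hook, and the log line indicates the method belongs to a ProxyMiddleware class. A sketch of how such a middleware would be enabled in the project's Scrapy settings; the module path and priority are assumptions, only the class name appears in the source:

    DOWNLOADER_MIDDLEWARES = {
        # Hypothetical module path for the ProxyMiddleware shown above.
        "myproject.middlewares.ProxyMiddleware": 543,
    }
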
Example #3
 def process_item(self, item, spider):
     # Cache the raw HTML, then either fan out to the discovered sub-URLs or finish the task.
     try:
         curUrl = item["url"]
         subUrls = item["subUrls"]
         taskJob = spider.taskJob
         self.save_to_hdfs(taskJob.id, taskJob.databaseId, item["html"])
         taskJobHistory = spider.taskJobHistory
         if subUrls and len(subUrls) > 0:
             parentUrlDepth = item["curUrlDepth"]
             for url in subUrls:
                 newTaskJob = ClassCopy.copyToNewInstances(taskJob, TaskJob)
                 newTaskJob.url = url
                 newTaskJob.curUrlDepth = parentUrlDepth + 1
                 newTaskJob.parentUrl = curUrl
                 CrawlerService.parseUrlAndInsertRedis(newTaskJob, taskJobHistory=taskJobHistory)
         else:
             if llen(ConfigUtils.getRedisPorperties(KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0:
                 if taskJob.status != TaskStatus.SUCCESS:
                     TaskJobDao.updateTaskJobStatus(taskJob.id, TaskStatus.SUCCESS)
                     UrlDao.updateUrlStatusListByTaskJobHistoryId(spider.taskJobHistory.id, status=UrlStatus.STOP,
                                                                  desc="depth spider is over")
         return item
     except Exception as e:
         logger.exception("CacheHtmlPipeline: " + str(e))
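
This pipeline clones the current TaskJob for every discovered sub-URL with ClassCopy.copyToNewInstances(taskJob, TaskJob) and increases curUrlDepth by one before re-enqueuing it. The helper itself is not shown; a plausible sketch, assuming TaskJob can be constructed without arguments and a shallow attribute copy is sufficient:

    class ClassCopy(object):
        @staticmethod
        def copyToNewInstances(source, targetClass):
            # Shallow-copy every instance attribute of `source` onto a fresh `targetClass` instance.
            target = targetClass()
            for name, value in vars(source).items():
                setattr(target, name, value)
            return target
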
Example #4
    def _do_upinsert(self, item):
        # Persist one crawled record set into the configured backend and queue child-template crawls.
        now = str(datetime.now())
        data = item["data"]
        url = item["url"]
        jobTemplateFieldList = item["jobTemplateFieldList"]
        jobTemplate = item["jobTemplate"]
        self.dataParentId = jobTemplate.dataParentId if hasattr(
            jobTemplate, "dataParentId") else None
        extraData = jobTemplate.extraData
        self.taskJob = item["taskJob"]
        # searchTaskJob = item["searchTaskJob"]
        taskJobHistroy = item["taskJobHistroy"]
        self.taskJobHistoryId = jobTemplate.taskJobHistoryId
        taskJobHistroyId = str(taskJobHistroy.id)
        paramMap = {}
        self.taskJobParamList = []
        if taskJobHistroy is not None:
            self.taskJobParamList.append(
                TaskJobParam(paramNameEn="task_job_id_sequence",
                             paramValue=taskJobHistroyId))
            paramMap["task_job_id_sequence"] = taskJobHistroyId
        # if searchTaskJob!=None:
        #     self.taskJobParamList.append(TaskJobParam(paramNameEn=searchTaskJob.name, paramValue=searchTaskJob.name))
        #     paramMap[searchTaskJob.name] = searchTaskJob.name
        # self.taskJobParamList = []
        # if self.taskJobHistoryId!=None:
        #     self.taskJobParamList=CacheFactory.get("task_job_param", self.taskJobHistoryId)
        # if self.taskJobParamList!=None:
        #     for taskJobParam in self.taskJobParamList:
        #         paramMap[taskJobParam.paramNameEn]=taskJobParam.paramValue
        tableName = jobTemplate.tableName
        jobTemplateId = jobTemplate.id
        databaseId = (jobTemplate.databaseId
                      if jobTemplate.databaseId is not None and jobTemplate.databaseId != "-1"
                      else self.taskJob.databaseId)
        db = self.dbclient.getConnection(databaseId)

        if db is None:
            logging.warning('db is null, please check it with databaseId: %s' %
                            databaseId)
            if llen(
                    ConfigUtils.getRedisPorperties(
                        KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0:
                if self.taskJob.status != TaskStatus.SUCCESS:
                    TaskJobDao.updateTaskJobStatus(self.taskJob.id,
                                                   TaskStatus.SUCCESS)
                    UrlDao.updateUrlStatusListByTaskJobHistoryId(
                        self.taskJobHistoryId,
                        status=UrlStatus.STOP,
                        desc="no db")
            return
        sqlArray = []
        if data is None or len(data) == 0:
            logging.warning(
                'insert data does not exist, please retry the crawler or check the template for errors'
            )
            if llen(
                    ConfigUtils.getRedisPorperties(
                        KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0:
                if self.taskJob.status != TaskStatus.SUCCESS:
                    TaskJobDao.updateTaskJobStatus(self.taskJob.id,
                                                   TaskStatus.SUCCESS)
                    UrlDao.updateUrlStatusListByTaskJobHistoryId(
                        self.taskJobHistoryId,
                        status=UrlStatus.STOP,
                        desc="no data")
            return
        logging.info('----pipelines insert data-----%s' % str(data))
        for d in data:
            d["task_job_url"] = url
            if self.dataParentId is not None:
                d["parent_id"] = self.dataParentId
            d["id"] = str(uuid.uuid1())
            if self.dbclient.db_type == 'kafka':
                d['TemplateName'] = jobTemplate.name
                d['UrlStatus'] = 0
                d['Timestamps'] = int(time.time())
            if self.dbclient.db_type in ('hdfs', 'mongodb'):
                sqlArray.append(
                    db.insert(jobTemplate.id, tableName, d, paramMap))
            else:
                sqlArray.append(db.insert(tableName, d, paramMap))
            if jobTemplateId is not None:
                try:
                    childJobTemplateList = TemplateDao.queryJobTemplateListByParentId(
                        jobTemplateId)
                    self.loadNext(childJobTemplateList,
                                  dict(extraData.items() + d.items()))
                except Exception as e:
                    logging.error(str(e))
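
_do_upinsert reads url, data, jobTemplateFieldList, jobTemplate, taskJob and taskJobHistroy off the item, so it is presumably called from an item pipeline's process_item. A sketch of such a wrapper, assuming _do_upinsert above is a method of the same pipeline class; the class name and the log message are placeholders:

    import logging

    class PersistencePipeline(object):
        def process_item(self, item, spider):
            # Delegate persistence to _do_upinsert and keep the crawl alive on failure.
            try:
                self._do_upinsert(item)
            except Exception as e:
                logging.exception("upinsert failed: %s" % str(e))
            return item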