def loadNext(self, childJobTemplateList, item):
    # No child templates left: if the main redis queue is also drained, close out the task.
    if childJobTemplateList is None or len(childJobTemplateList) == 0:
        # pcInfo = Pcinfo()
        # pidList = pcInfo.getPidListByProcessName(ConfigUtils.getRedisPorperties(KEYMAP.PROCESS_SPIDER_NAME))
        # if pidList and len(pidList):
        #     RedisUtils.lpush(ConfigUtils.getRedisPorperties(KEYMAP.PROCESS_SPIDER_STATUS) + "_" + os.getpid(), 0)
        #     for pid in pidList:
        #         RedisUtils.lpush(ConfigUtils.getRedisPorperties(KEYMAP.PROCESS_SPIDER_STATUS) + "_" + pid, 0)
        # else:
        if llen(ConfigUtils.getRedisPorperties(KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0:
            if self.taskJob.status != TaskStatus.SUCCESS:
                TaskJobDao.updateTaskJobStatus(self.taskJob.id, TaskStatus.SUCCESS)
                UrlDao.updateUrlStatusListByTaskJobHistoryId(
                    self.taskJobHistoryId,
                    status=UrlStatus.STOP,
                    desc="The task is over and no longer crawls on this URL")
        return
    # Otherwise queue one follow-up crawl per child template, passing the parent row id along.
    for jobTemplate in childJobTemplateList:
        parentId = str(item.get("id"))
        taskJobParam = TaskJobParam(paramNameEn="dataParentId", paramValue=parentId)
        taskJobParamList = [taskJobParam]
        taskJobParamList.extend(self.taskJobParamList)
        CrawlerService.parseUrlAndInsertRedis(
            taskJob=self.taskJob,
            paramMap=item,
            taskJobParam=taskJobParamList,
            taskJobHistory=TaskJobHistory(id=self.taskJobHistoryId),
            jobTemplate=jobTemplate)
def process_exception(self, request, exception, spider):
    urlListStatusId = request.meta.get("urlListStatusId")
    if urlListStatusId:
        UrlDao.updateUrlStatus(urlListStatusId, UrlStatus.FAIL, repr(exception))
    if llen(ConfigUtils.getRedisPorperties(KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0 \
            and spider.taskJob.status != TaskStatus.SUCCESS:
        TaskJobDao.updateTaskJobStatus(spider.taskJob.id, TaskStatus.SUCCESS)
        UrlDao.updateUrlStatusListByTaskJobHistoryId(
            spider.jobTemplate.taskJobHistoryId,
            status=UrlStatus.STOP,
            desc="The task is over and no longer crawls on this URL")
    logger.info("process_exception ProxyMiddleware")
    return None
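# Note: process_exception above matches Scrapy's downloader-middleware hook signature,
# and the log line suggests it lives in a ProxyMiddleware class. A minimal sketch of how
# such a middleware is usually enabled in settings.py; the module path is a placeholder,
# not the project's actual layout:
#
# DOWNLOADER_MIDDLEWARES = {
#     "myproject.middlewares.ProxyMiddleware": 543,
# }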
def process_item(self, item, spider):
    try:
        curUrl = item["url"]
        subUrls = item["subUrls"]
        taskJob = spider.taskJob
        # Cache the raw page before following any of its sub URLs.
        self.save_to_hdfs(taskJob.id, taskJob.databaseId, item["html"])
        taskJobHistory = spider.taskJobHistory
        if subUrls and len(subUrls) > 0:
            # Re-queue every sub URL one level deeper than the current page.
            parentUrlDepth = item["curUrlDepth"]
            for url in subUrls:
                newTaskJob = ClassCopy.copyToNewInstances(taskJob, TaskJob)
                newTaskJob.url = url
                newTaskJob.curUrlDepth = parentUrlDepth + 1
                newTaskJob.parentUrl = curUrl
                CrawlerService.parseUrlAndInsertRedis(newTaskJob, taskJobHistory=taskJobHistory)
        else:
            # No sub URLs left: close out the task once the main redis queue is drained.
            if llen(ConfigUtils.getRedisPorperties(KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0:
                if taskJob.status != TaskStatus.SUCCESS:
                    TaskJobDao.updateTaskJobStatus(taskJob.id, TaskStatus.SUCCESS)
                    UrlDao.updateUrlStatusListByTaskJobHistoryId(
                        spider.taskJobHistory.id,
                        status=UrlStatus.STOP,
                        desc="depth spider is over")
        return item
    except Exception as e:
        logger.exception("CacheHtmlPipeline:" + str(e))
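# Note: process_item above follows Scrapy's item-pipeline contract, and the log line names
# the class CacheHtmlPipeline. A sketch of the usual registration in settings.py, again
# with a hypothetical module path:
#
# ITEM_PIPELINES = {
#     "myproject.pipelines.CacheHtmlPipeline": 300,
# }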
def _do_upinsert(self, item):
    now = str(datetime.now())
    data = item["data"]
    url = item["url"]
    jobTemplateFieldList = item["jobTemplateFieldList"]
    jobTemplate = item["jobTemplate"]
    self.dataParentId = jobTemplate.dataParentId if hasattr(jobTemplate, "dataParentId") else None
    extraData = jobTemplate.extraData
    self.taskJob = item["taskJob"]
    # searchTaskJob = item["searchTaskJob"]
    taskJobHistroy = item["taskJobHistroy"]
    self.taskJobHistoryId = jobTemplate.taskJobHistoryId
    taskJobHistroyId = str(taskJobHistroy.id)
    paramMap = {}
    self.taskJobParamList = []
    if taskJobHistroy is not None:
        self.taskJobParamList.append(
            TaskJobParam(paramNameEn="task_job_id_sequence", paramValue=taskJobHistroyId))
        paramMap["task_job_id_sequence"] = taskJobHistroyId
    # if searchTaskJob != None:
    #     self.taskJobParamList.append(TaskJobParam(paramNameEn=searchTaskJob.name, paramValue=searchTaskJob.name))
    #     paramMap[searchTaskJob.name] = searchTaskJob.name
    # self.taskJobParamList = []
    # if self.taskJobHistoryId != None:
    #     self.taskJobParamList = CacheFactory.get("task_job_param", self.taskJobHistoryId)
    # if self.taskJobParamList != None:
    #     for taskJobParam in self.taskJobParamList:
    #         paramMap[taskJobParam.paramNameEn] = taskJobParam.paramValue
    tableName = jobTemplate.tableName
    jobTemplateId = jobTemplate.id
    # Fall back to the task job's database when the template does not carry its own.
    databaseId = jobTemplate.databaseId \
        if jobTemplate.databaseId != "-1" and jobTemplate.databaseId is not None \
        else self.taskJob.databaseId
    db = self.dbclient.getConnection(databaseId)
    if db is None:
        logging.warning('db is null, please check it with databaseId: %s' % databaseId)
        if llen(ConfigUtils.getRedisPorperties(KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0:
            if self.taskJob.status != TaskStatus.SUCCESS:
                TaskJobDao.updateTaskJobStatus(self.taskJob.id, TaskStatus.SUCCESS)
                UrlDao.updateUrlStatusListByTaskJobHistoryId(
                    self.taskJobHistoryId, status=UrlStatus.STOP, desc="no db")
        return
    sqlArray = []
    if data is None or len(data) == 0:
        logging.warning('insert data not exist, please retry crawler or check template or check error')
        if llen(ConfigUtils.getRedisPorperties(KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0:
            if self.taskJob.status != TaskStatus.SUCCESS:
                TaskJobDao.updateTaskJobStatus(self.taskJob.id, TaskStatus.SUCCESS)
                UrlDao.updateUrlStatusListByTaskJobHistoryId(
                    self.taskJobHistoryId, status=UrlStatus.STOP, desc="no data")
        return
    logging.info('----pipelines insert data-----%s' % str(data))
    for d in data:
        d["task_job_url"] = url
        if self.dataParentId is not None:
            d["parent_id"] = self.dataParentId
        d["id"] = str(uuid.uuid1())
        if self.dbclient.db_type == 'kafka':
            d['TemplateName'] = jobTemplate.name
            d['UrlStatus'] = 0
            d['Timestamps'] = int(time.time())
        if self.dbclient.db_type == 'hdfs' or self.dbclient.db_type == 'mongodb':
            sqlArray.append(db.insert(jobTemplate.id, tableName, d, paramMap))
        else:
            sqlArray.append(db.insert(tableName, d, paramMap))
        # Recurse into child templates, handing them the merged template extra data + row.
        if jobTemplateId is not None:
            try:
                childJobTemplateList = TemplateDao.queryJobTemplateListByParentId(jobTemplateId)
                self.loadNext(childJobTemplateList, dict(extraData.items() + d.items()))
            except Exception as e:
                logging.error(e.message)
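# For reference, a sketch of the item dict _do_upinsert expects, based only on the keys it
# reads above; the example values are illustrative assumptions, not actual project data:
#
# item = {
#     "data": [{"title": "..."}],         # rows extracted by the template
#     "url": "http://example.com/page",   # page the rows came from
#     "jobTemplateFieldList": [...],
#     "jobTemplate": jobTemplate,         # carries tableName, databaseId, extraData, ...
#     "taskJob": taskJob,
#     "taskJobHistroy": taskJobHistroy,   # key spelling as used by the caller
# }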