Exemplo n.º 1
0
 def getScrapBaseItem(self, taskJobId):
     """Return the ScrapBaseItem for a task job, building and caching it on a miss.

     The item is cached in the "task_job" cache under the
     "taskJobHistoryId" value read from ``self.params`` (an empty string
     when that parameter is missing or falsy). On a cache miss the item is
     assembled from the task job, its template, template fields and
     parameters, and the task job history (itself cached in the
     "task_job_history" cache).

     :param taskJobId: identifier of the task job to load.
     :return: the cached item, or a new empty ``ScrapBaseItem`` when the
         cache yields nothing truthy.
     """
     taskJobHistoryId = self.params.get("taskJobHistoryId") or ""
     if CacheFactory.get("task_job", taskJobHistoryId) is None:
         taskJob = querTaskJob(taskJobId)
         scrapBaseItem = ScrapBaseItem()

         # Prefer fields bound to the task job; fall back to the fields of
         # the job template when none are bound.
         jobTemplateFieldList = queryFieldByTaskJobId(taskJobId)
         if jobTemplateFieldList is None or len(jobTemplateFieldList) == 0:
             jobTemplateFieldList = TemplateDao.queryJobTemplateFieldByJobTemplateId(
                 taskJob.jobTemplateId)
         jobTemplate = queryJobTemplate(taskJob.jobTemplateId)
         jobTemplateParamList = queryJobParam(taskJobId)
         taskJobParamList = TaskJobDao.queryTaskJobParam(taskJobId)

         # Attach the history id to the task job so it travels with the item.
         taskJob.taskJobHistoryId = taskJobHistoryId

         scrapBaseItem["jobTemplateFieldList"] = jobTemplateFieldList
         scrapBaseItem["jobTemplate"] = jobTemplate
         scrapBaseItem["taskJobId"] = taskJobId
         scrapBaseItem["taskJob"] = taskJob
         scrapBaseItem["jobTemplateParamList"] = jobTemplateParamList
         scrapBaseItem["taskJobParamList"] = taskJobParamList

         # Load the history record only when there is an id and it is not
         # cached yet. taskJobHistoryId is never None here (see the
         # ``or ""`` above), so a truthiness check is sufficient.
         if taskJobHistoryId and CacheFactory.get(
                 "task_job_history", taskJobHistoryId) is None:
             CacheFactory.cache(
                 "task_job_history", taskJobHistoryId,
                 TaskJobDao.loadTaskJobHistoryById(taskJobHistoryId))

         # NOTE: the key spelling "taskJobHistroy" is kept as-is; it is a
         # runtime key that other code may read.
         scrapBaseItem["taskJobHistroy"] = CacheFactory.get(
             "task_job_history", taskJobHistoryId)

         # Cache only once the item is fully populated, so a cache that
         # copies or serializes its values never stores an item that lacks
         # the history entry.
         CacheFactory.cache("task_job", taskJobHistoryId, scrapBaseItem)
     return CacheFactory.get("task_job",
                             taskJobHistoryId) or ScrapBaseItem()
Exemplo n.º 2
0
 def beforeStartUrl(self, dataDict):
     """Prepare crawl state from a start-URL record and return the URL.

     Reads "id", "taskJobHistoryId", "url", "curUrlDepth" and "depthLimit"
     from ``dataDict`` and stores the derived values on ``self``
     (``taskJob``, ``taskJobHistory``, ``url``, ``allowed_domain``,
     ``cur_url_depth``, ``depth_limit``).

     :param dataDict: mapping describing the URL to start from, or None.
     :return: the URL to crawl, or None when ``dataDict`` is None or has
         no "id".
     """
     if dataDict is None:
         return None
     urlId = dataDict.get("id")
     if urlId is None:
         return None

     # The returned status is not used in this method; the call is kept in
     # case the lookup has effects callers rely on -- TODO confirm and
     # remove if it is a pure read.
     RedisUtils.hgetUrlRedisStatus(RedisUtils.prefix + urlId)

     taskJobHistoryId = dataDict.get("taskJobHistoryId")
     if taskJobHistoryId:
         taskJobHistory = TaskJobDao.loadTaskJobHistoryById(
             taskJobHistoryId)
         if taskJobHistory:
             self.taskJob = TaskJobDao.loadTaskById(taskJobHistory.taskJobId)
             self.taskJobHistory = taskJobHistory

     # dict.get with a default replaces dict.has_key, which does not exist
     # on Python 3; the default applies only when the key is absent.
     url = dataDict.get("url", "http://www.baidu.com")
     self.url = url
     if self.allowed_domain is None:
         self.allowed_domain = self.get_first_domain(self.get_domain(url))
     self.cur_url_depth = dataDict.get("curUrlDepth")
     self.depth_limit = dataDict.get("depthLimit", 3)
     return url