def loadNext(self, childJobTemplateList, item):
    if childJobTemplateList == None or len(childJobTemplateList) == 0:
        # pcInfo = Pcinfo()
        # pidList = pcInfo.getPidListByProcessName(ConfigUtils.getRedisPorperties(KEYMAP.PROCESS_SPIDER_NAME))
        # if pidList and len(pidList):
        #     RedisUtils.lpush(ConfigUtils.getRedisPorperties(KEYMAP.PROCESS_SPIDER_STATUS) + "_" + os.getpid(), 0)
        #     for pid in pidList:
        #         RedisUtils.lpush(ConfigUtils.getRedisPorperties(KEYMAP.PROCESS_SPIDER_STATUS) + "_" + pid, 0)
        # else:
        if llen(ConfigUtils.getRedisPorperties(KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0:
            if self.taskJob.status != TaskStatus.SUCCESS:
                TaskJobDao.updateTaskJobStatus(self.taskJob.id, TaskStatus.SUCCESS)
                UrlDao.updateUrlStatusListByTaskJobHistoryId(
                    self.taskJobHistoryId,
                    status=UrlStatus.STOP,
                    desc="The task is over and no longer crawls on this URL")
        return
    for jobTemplate in childJobTemplateList:
        parentId = str(item.get("id"))
        taskJobParam = TaskJobParam(paramNameEn="dataParentId", paramValue=parentId)
        taskJobParamList = []
        taskJobParamList.append(taskJobParam)
        taskJobParamList.extend(self.taskJobParamList)
        CrawlerService.parseUrlAndInsertRedis(
            taskJob=self.taskJob,
            paramMap=item,
            taskJobParam=taskJobParamList,
            taskJobHistory=TaskJobHistory(id=self.taskJobHistoryId),
            jobTemplate=jobTemplate)
def process_request(self, request, spider): driver = None logging.info("*************PhontomJsMiddleware*************") jobTemplateList = CacheFactory.get("job_template_by_url", request.url) if jobTemplateList != None and len(jobTemplateList) > 0: jobTemplate = jobTemplateList[0] else: jobTemplate = spider.jobTemplate # jobTemplate = spider.jobTemplate if jobTemplate: if hasattr(jobTemplate, 'needLogin') and jobTemplate.needLogin: userName = jobTemplate.userName password = jobTemplate.passWord unameId = jobTemplate.unameElementId passwordId = jobTemplate.pwdElementId submitId = jobTemplate.submitElementId return self.login(request=request, username=userName, password=password, username_id=unameId, password_id=passwordId, submit_id=submitId) if jobTemplate.phantomjsFlag and mutex.acquire(): try: logging.info("*************PhontomJsMiddleware*************" + request.url) driver = webdriver.PhantomJS(executable_path=ConfigUtils.getSpiderPorperties( ConfigUtils.KEYMAP.PHANTOMJS)) # executable_path='D:\\developTools\\phantomjs-2.1.1-windows\\bin\\phantomjs.exe' capabilities = self.get_desired_capabilities(spider) driver.start_session(capabilities) driver.set_page_load_timeout(30) driver.set_script_timeout(30) driver.set_window_size(1000, 10000) # 尽量将窗口设置大一些,以应对某些网站使用懒加载 driver.get(request.url) time.sleep(int(jobTemplate.sleepTime)) body = driver.page_source logging.info("PhantomJS is visiting " + request.url) htmlResponse = HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=request) # driver.close() # driver.service.process.send_signal(signal.SIGTERM) # kill the specific phantomjs child proc # driver.quit() return htmlResponse except Exception, e: urlListStatusId = request.meta.get("urlListStatusId") if urlListStatusId: UrlDao.updateUrlStatus(urlListStatusId, UrlStatus.FAIL, repr(Exception)) logging.exception("time out visiting==>%s,%s" % (request.url, str(e))) # try: # if driver!=None: # logging.exception("time out visiting==>%s,%s"%(request.url,str(e))) # # driver.close() # driver.service.process.send_signal(signal.SIGTERM) # kill the specific phantomjs child proc # driver.quit() # except Exception,e: # logging.error("451e:" + str(e)) # return finally: # try: # driver.close() # except Exception, e: # logging.error("452e:" + str(e)) try:
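# A minimal sketch (illustrative only, not part of the runtime) of the job template
# attributes process_request() above relies on; the attribute names are taken from the
# code, the values are hypothetical:
#
#   jobTemplate.needLogin       = False        # when True, self.login() handles the request
#   jobTemplate.userName        = "user"       # credentials passed to login()
#   jobTemplate.passWord        = "secret"
#   jobTemplate.unameElementId  = "username"   # DOM ids of the login form fields
#   jobTemplate.pwdElementId    = "password"
#   jobTemplate.submitElementId = "submit"
#   jobTemplate.phantomjsFlag   = True         # render the page with PhantomJS
#   jobTemplate.sleepTime       = "3"          # seconds to wait after driver.get()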
def process_exception(self, request, exception, spider):
    urlListStatusId = request.meta.get("urlListStatusId")
    if urlListStatusId:
        UrlDao.updateUrlStatus(urlListStatusId, UrlStatus.FAIL, repr(exception))
    if llen(ConfigUtils.getRedisPorperties(KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0 \
            and spider.taskJob.status != TaskStatus.SUCCESS:
        TaskJobDao.updateTaskJobStatus(spider.taskJob.id, TaskStatus.SUCCESS)
        UrlDao.updateUrlStatusListByTaskJobHistoryId(
            spider.jobTemplate.taskJobHistoryId,
            status=UrlStatus.STOP,
            desc="The task is over and no longer crawls on this URL")
    logger.info("process_exception ProxyMiddleware")
    return None
def parse(self, response):
    if response.body:
        urlListStatusId = response.meta["urlListStatusId"]
        if urlListStatusId:
            UrlDao.updateUrlStatus(urlListStatusId, UrlStatus.SUCCESS)
        htmlItem = HtmlItem()
        htmlItem["url"] = response.url
        htmlItem["html"] = response.body
        subUrls = []
        URLgroup = LinkExtractor(allow=()).extract_links(response)
        if (self.cur_url_depth < self.depth_limit and self.depth_limit != 0) or self.depth_limit == 0:
            for URL in URLgroup:
                if self.is_domain_allowed(URL.url):
                    subUrls.append(URL.url)
        htmlItem["subUrls"] = subUrls
        # htmlItem["taskJob"] = self.taskJob
        # htmlItem["taskJobHistory"] = self.taskJobHistory
        htmlItem["curUrlDepth"] = self.cur_url_depth
        return htmlItem
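# Illustrative only: parse() above fills an HtmlItem with exactly these fields, which the
# depth-crawl pipeline (process_item further below) reads back; the values here are
# hypothetical:
#
#   {
#       "url": "http://example.com/list",
#       "html": "<html>...</html>",
#       "subUrls": ["http://example.com/detail/1"],
#       "curUrlDepth": 1,
#   }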
def process_request(self, request, spider): driver = None logging.info("*************ChromeMiddleware*************") jobTemplateList = CacheFactory.get("job_template_by_url", request.url) if jobTemplateList != None and len(jobTemplateList) > 0: jobTemplate = jobTemplateList[0] else: jobTemplate = spider.jobTemplate if jobTemplate: if hasattr(jobTemplate, 'needLogin') and jobTemplate.needLogin: userName = jobTemplate.userName password = jobTemplate.passWord unameId = jobTemplate.unameElementId passwordId = jobTemplate.pwdElementId submitId = jobTemplate.submitElementId return self.login(request=request, username=userName, password=password, username_id=unameId, password_id=passwordId, submit_id=submitId) if jobTemplate.chromeFlag and mutex.acquire(): try: driver = webdriver.Chrome( executable_path=ConfigUtils.getSpiderPorperties(ConfigUtils.KEYMAP.CHROME)) capabilities = self.get_desired_capabilities(spider) driver.start_session(capabilities) driver.set_page_load_timeout(30) driver.set_script_timeout(30) driver.set_window_size(1000, 10000) # 尽量将窗口设置大一些,以应对某些网站使用懒加载 driver.get(request.url) time.sleep(int(jobTemplate.sleepTime)) body = driver.page_source logging.info("PhantomJS is visiting " + request.url) htmlResponse = HtmlResponse(driver.current_url, body=body, encoding='utf-8', request=request) driver.quit() return htmlResponse except Exception, e: urlListStatusId = request.meta.get("urlListStatusId") if urlListStatusId: UrlDao.updateUrlStatus(urlListStatusId, UrlStatus.FAIL, repr(Exception)) logging.exception("time out visiting==>%s,%s" % (request.url, str(e))) finally: try:
def process_item(self, item, spider):
    try:
        curUrl = item["url"]
        subUrls = item["subUrls"]
        taskJob = spider.taskJob
        self.save_to_hdfs(taskJob.id, taskJob.databaseId, item["html"])
        taskJobHistory = spider.taskJobHistory
        if subUrls and len(subUrls) > 0:
            parentUrlDepth = item["curUrlDepth"]
            for url in subUrls:
                newTaskJob = ClassCopy.copyToNewInstances(taskJob, TaskJob)
                newTaskJob.url = url
                newTaskJob.curUrlDepth = parentUrlDepth + 1
                newTaskJob.parentUrl = curUrl
                CrawlerService.parseUrlAndInsertRedis(newTaskJob, taskJobHistory=taskJobHistory)
        else:
            if llen(ConfigUtils.getRedisPorperties(KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0:
                if taskJob.status != TaskStatus.SUCCESS:
                    TaskJobDao.updateTaskJobStatus(taskJob.id, TaskStatus.SUCCESS)
                    UrlDao.updateUrlStatusListByTaskJobHistoryId(
                        spider.taskJobHistory.id,
                        status=UrlStatus.STOP,
                        desc="depth spider is over")
        return item
    except Exception, e:
        logger.exception("CacheHtmlPipeline:" + str(e))
class RedisCallbackSpider(RedisCrawlSpider):
    def beforeStartUrl(self, data):
        return data

    def next_requests(self):
        """Returns a request to be scheduled or none."""
        use_set = self.settings.getbool('REDIS_START_URLS_AS_SET', defaults.START_URLS_AS_SET)
        fetch_one = self.server.spop if use_set else self.server.lpop
        if not LicenseUtils.check_licence(LicenseUtils.get_mac_address()):
            reactor.stop()
        # XXX: Do we need to use a timeout here?
        found = 0
        while found < self.redis_batch_size:
            redis_key = fetch_one(self.redis_key)
            taskJobHistoryId = redis_key
            if taskJobHistoryId != None:
                taskJobHistory = loadTaskJobHistoryById(taskJobHistoryId)
                if taskJobHistory:
                    taskJobId = taskJobHistory.taskJobId
                    taskJob = TaskJobDao.loadTaskById(taskJobId)
                    if taskJob and taskJob.status == TaskStatus.PAUSE:
                        RedisUtils.lpush(
                            ConfigUtils.getRedisPorperties(KEYMAP.ASSIST_SPIDER_REDIS_KEY),
                            taskJobHistoryId)
                        break
                else:
                    break
            else:
                break
            if hashswitch:
                if str(localIP) != str(tjs.get_node(redis_key)):
                    RedisUtils.lpush(self.redis_key, redis_key)
                    return
            redis_key = self.redis_key + "_" + redis_key
            orginData = fetch_one(redis_key)
            data = None
            # data = fetch_one(self.redis_key)
            try:
                logging.info("orginData==" + orginData)
                orginData = json.loads(orginData)
                orginData["taskJobHistoryId"] = taskJobHistoryId
                data = self.beforeStartUrl(orginData)
            except Exception, e:
                logging.error("Error e:")
                logging.error(e)
                logging.error(orginData)
                break
            if not data:
                # Queue empty.
                logging.warning('********dataUrl is null*************')
                break
            try:
                req = self.make_request_from_data(data)
                # req.replace(meta={"id": "123"})
                req.meta["id"] = orginData.get("id")
                req.meta["dataParentId"] = orginData.get("dataParentId")
                req.meta["taskJobHistoryId"] = orginData.get("taskJobHistoryId")
                req.meta["url"] = orginData.get("url")
                urlListStatusId = req.meta["urlListStatusId"] = orginData.get("urlListStatusId")
            except Exception, e:
                logging.error("make_request_from_data:e:" + str(e))
                break
            if req:
                if urlListStatusId:
                    UrlDao.updateUrlStatus(urlListStatusId, UrlStatus.RUNNING)
                else:
                    urlListStatus = UrlClazz(
                        url=orginData.get("url"),
                        status=UrlStatus.RUNNING,
                        parentUrl=orginData.get("parentUrl"),
                        taskJobId=orginData.get("id"),
                        taskJobHistoryId=taskJobHistoryId)
                    UrlDao.addUrl(urlListStatus)
                yield req
                found += 1
            else:
                if urlListStatusId:
                    UrlDao.updateUrlStatus(urlListStatusId, UrlStatus.FAIL, "Request not made from data")
                else:
                    urlListStatus = UrlClazz(
                        url=orginData.get("url"),
                        status=UrlStatus.RUNNING,
                        parentUrl=orginData.get("parentUrl"),
                        taskJobId=orginData.get("id"),
                        taskJobHistoryId=taskJobHistoryId)
                    UrlDao.addUrl(urlListStatus)
                self.logger.debug("Request not made from data: %r", data)
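# Illustrative only: each entry popped from "<redis_key>_<taskJobHistoryId>" in
# next_requests() above is expected to be a JSON object; the keys listed here are the
# ones the method actually reads, the values are hypothetical:
#
#   {
#       "id": "7f9c-...",              # task job id
#       "url": "http://example.com/",
#       "dataParentId": null,
#       "urlListStatusId": null,       # when absent, a new UrlClazz row is created
#       "parentUrl": null
#   }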
def _do_upinsert(self, item):
    now = str(datetime.now())
    data = item["data"]
    url = item["url"]
    jobTemplateFieldList = item["jobTemplateFieldList"]
    jobTemplate = item["jobTemplate"]
    self.dataParentId = jobTemplate.dataParentId if hasattr(jobTemplate, "dataParentId") else None
    extraData = jobTemplate.extraData
    self.taskJob = item["taskJob"]
    # searchTaskJob = item["searchTaskJob"]
    taskJobHistroy = item["taskJobHistroy"]
    self.taskJobHistoryId = jobTemplate.taskJobHistoryId
    taskJobHistroyId = str(taskJobHistroy.id)
    paramMap = {}
    self.taskJobParamList = []
    if taskJobHistroy != None:
        self.taskJobParamList.append(
            TaskJobParam(paramNameEn="task_job_id_sequence", paramValue=taskJobHistroyId))
        paramMap["task_job_id_sequence"] = taskJobHistroyId
    # if searchTaskJob != None:
    #     self.taskJobParamList.append(TaskJobParam(paramNameEn=searchTaskJob.name, paramValue=searchTaskJob.name))
    #     paramMap[searchTaskJob.name] = searchTaskJob.name
    # self.taskJobParamList = []
    # if self.taskJobHistoryId != None:
    #     self.taskJobParamList = CacheFactory.get("task_job_param", self.taskJobHistoryId)
    # if self.taskJobParamList != None:
    #     for taskJobParam in self.taskJobParamList:
    #         paramMap[taskJobParam.paramNameEn] = taskJobParam.paramValue
    tableName = jobTemplate.tableName
    jobTemplateId = jobTemplate.id
    databaseId = (jobTemplate.databaseId
                  if jobTemplate.databaseId != "-1" and jobTemplate.databaseId != None
                  else self.taskJob.databaseId)
    db = self.dbclient.getConnection(databaseId)
    if db == None:
        logging.warning('db is null,please check it with databaseid :%s' % databaseId)
        if llen(ConfigUtils.getRedisPorperties(KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0:
            if self.taskJob.status != TaskStatus.SUCCESS:
                TaskJobDao.updateTaskJobStatus(self.taskJob.id, TaskStatus.SUCCESS)
                UrlDao.updateUrlStatusListByTaskJobHistoryId(
                    self.taskJobHistoryId, status=UrlStatus.STOP, desc="no db")
        return
    sqlArray = []
    if data == None or len(data) == 0:
        logging.warning('insert data not exist,please retry crawler or check template or check error')
        if llen(ConfigUtils.getRedisPorperties(KEYMAP.MAIN_SPIDER_REDIS_KEY)) == 0:
            if self.taskJob.status != TaskStatus.SUCCESS:
                TaskJobDao.updateTaskJobStatus(self.taskJob.id, TaskStatus.SUCCESS)
                UrlDao.updateUrlStatusListByTaskJobHistoryId(
                    self.taskJobHistoryId, status=UrlStatus.STOP, desc="no data")
        return
    logging.info('----pipelines insert data-----%s' % str(data))
    for d in data:
        d["task_job_url"] = url
        if self.dataParentId != None:
            d["parent_id"] = self.dataParentId
        d["id"] = str(uuid.uuid1())
        if self.dbclient.db_type == 'kafka':
            d['TemplateName'] = jobTemplate.name
            d['UrlStatus'] = 0
            d['Timestamps'] = int(time.time())
        if self.dbclient.db_type == 'hdfs' or self.dbclient.db_type == 'mongodb':
            sqlArray.append(db.insert(jobTemplate.id, tableName, d, paramMap))
        else:
            sqlArray.append(db.insert(tableName, d, paramMap))
        if jobTemplateId != None:
            try:
                childJobTemplateList = TemplateDao.queryJobTemplateListByParentId(jobTemplateId)
                self.loadNext(childJobTemplateList, dict(extraData.items() + d.items()))
            except Exception, e:
                logging.error(e.message)
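# Illustrative only: _do_upinsert() above expects the pipeline item to carry these keys
# (names taken from the code, values hypothetical):
#
#   item = {
#       "url": "http://example.com/detail",   # page the data was extracted from
#       "data": [{"title": "..."}],           # list of field dicts to insert
#       "jobTemplateFieldList": [],
#       "jobTemplate": jobTemplate,           # provides tableName, databaseId, extraData, ...
#       "taskJob": taskJob,
#       "taskJobHistroy": taskJobHistroy,     # spelling follows the original key name
#   }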
def saveUrlListStatus(urlListStatus):
    if urlListStatus:
        # RedisUtils.lpush(ConfigUtils.getRedisPorperties(KEYMAP.URL_LIST_SPIDER_REDIS_KEY) + "_" + urlListStatus.taskJobId,
        #                  json.dumps(urlListStatus))
        UrlDao.addUrl(urlListStatus)