def stopService(self):
    """Stop this start-page spider service and detach it from its parent.

    The steps run in a fixed order: log the shutdown, mark this service as
    stopped, ask the parent spider service to remove itself if appropriate,
    stop the crawler, and finally unregister this service from the parent
    if it is still listed there.
    """
    log.msg(
        format='startPageSpiderService->stopService stop startPageSpiderService serviceName=(%(serviceName)s)',
        serviceName=self.name,
    )
    # Stop ourselves before consulting the parent.
    # NOTE(review): presumably so the parent no longer sees this child as
    # running when it decides whether to tear itself down -- confirm.
    service.Service.stopService(self)

    parent = self.spiderService
    parent.removeSpiderService()
    self._crawler.stop()

    # The parent may already have dropped us; only remove when still listed.
    if self.name in parent.namedServices:
        parent.removeService(self)
def parse(self, response): print 'startPageSpider==========================>',response.url # log.msg(format='%(iPid)s, %(url)s, %(project)s ', iPid = self.taskId, url = response.url, project=self.project) listQuqueCount = self.redis.llen('scrapy:startPageSpider:listQuque:%s' % self.taskId) if listQuqueCount == 1: self._crawler.signals.send_catch_log('writeListQuque') elif listQuqueCount == 0: self._crawler.signals.send_catch_log('emptyListQuque') print 'startPageSpider---------send_catch_log->emptyListQuque' if response.url not in self.hasCrawlSet: pattern = re.compile(r'%s' % self.project['szStartUrlReg']) self.hasCrawlSet.add(response.url) if pattern.match(response.url) and response.url not in self.hasInsertSet: title = "|".join(response.xpath('/html/head/title/text()').extract()) insertSql = 'INSERT INTO project_start_page(iPid, szUrl, szTitle,dtLastScrapyTime) VALUES(%d, "%s", "%s", "%s")' % (self.taskId, response.url, title, time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))) self.dbUtils.insert(insertSql) self.hasInsertSet.add(response.url) self.redis.lpush('scrapy:startPageSpider:listQuque:%s' % self.taskId, response.url) #self.redis.sadd('scrapy:startPageSpider:startPage:2', response.url) log.msg(format='spider=startPageSpider iPid=%(i)s, title=%(t)s url=%(u)s', i = self.taskId, t=title, u=response.url) _allow = ( _allow for _allow in self.project['szStartUrlReg'].split('~')) self.linkExtractor = LinkExtractor(allow_domains=self.domain, allow=_allow) links = [ link for link in self.linkExtractor.extract_links(response) if link.url not in self.hasCrawlSet ] for link in links: yield self.make_requests_from_url(link.url)
def addTask(data, taskId):
    """Register a new task service for ``taskId`` unless one already exists.

    ``self`` is not a parameter here; it is taken from the enclosing scope
    (presumably this function is defined inside a method and used as a
    callback -- confirm against the caller).

    :param data: settings passed to the new ``taskService`` as ``setting``.
    :param taskId: identifier of the task; also used to derive its name.
    :returns: ``True`` when a new service was added, ``False`` when a
        service with the derived name is already registered.
    """
    taskName = self.getTaskName(taskId)

    if taskName in self.namedServices:
        # The template must be passed as ``format=``; given positionally,
        # log.msg emits the text as-is and the %(...)s placeholders are
        # never filled in.
        log.msg(
            format='add task error task is existed taskId=%(taskId)s taskName=%(taskName)s',
            taskId=taskId,
            taskName=taskName,
        )
        return False

    _taskService = taskService(taskId=taskId, taskName=taskName, setting=data)
    self.addService(_taskService)
    log.msg(
        format='add task success taskId=%(taskId)s taskName=%(taskName)s',
        taskId=taskId,
        taskName=taskName,
    )
    return True
def removeSpiderService(self):
    """Remove this spider service from its task service once idle.

    Nothing happens while any child in ``namedServices`` reports
    ``running == 1``. Otherwise the removal is logged and this service is
    unregistered from ``taskService`` if it is still listed there.
    """
    if any(child.running == 1 for child in self.namedServices.values()):
        return

    log.msg(
        format='spiderService->removeSpiderService stop spiderService serviceName=(%(serviceName)s)',
        serviceName=self.name,
    )
    owner = self.taskService
    if self.name in owner.namedServices:
        owner.removeService(self)
def startTask(self, taskId):
    """Create and register the spider service for ``taskId``.

    :param taskId: task identifier; the child service is named
        ``spiderService_<taskId>``.
    :returns: ``True`` when a new spider service was added, ``False`` when
        one with that name is already registered.
    """
    serviceName = 'spiderService_%s' % taskId

    if serviceName in self.namedServices:
        log.msg(format='TaskService->startTask(%(taskId)s) serviceName=%(serviceName)s Exists',taskId=taskId, serviceName=serviceName)
        return False

    child = spiderService(taskId=taskId, taskService=self)
    child.setName(serviceName)
    self.addService(child)
    log.msg(format='TaskService->startTask(%(taskId)s)',taskId=taskId)
    return True
def parse(self, response):
    """Process one response for the list-page spider.

    For a URL not seen before, marks it crawled, extracts the links allowed
    by the project's ``szUrlReg`` patterns (split on ``~``), and inserts
    each link that has not been stored yet into ``project_list_page``.
    Returns nothing.

    NOTE(review): the INSERT statement is built with ``%`` interpolation
    from link URLs and link text taken from the page, so a double quote in
    any of them breaks or alters the statement. Use a parameterised insert
    if ``dbUtils`` offers one.
    """
    source_url = response.url
    if source_url in self.hasCrawlSet:
        return
    self.hasCrawlSet.add(source_url)

    allowed_patterns = self.project['szUrlReg'].split('~')
    self.linkExtractor = LinkExtractor(allow_domains=self.domain, allow=allowed_patterns)
    candidates = [
        link
        for link in self.linkExtractor.extract_links(response)
        if link.url not in self.hasInsertSet
    ]

    for link in candidates:
        # Re-check inside the loop: an earlier iteration may have stored
        # the same URL.
        if link.url not in self.hasInsertSet:
            crawled_at = time.strftime('%Y-%m-%d %H:%M:%S')
            insertSql = (
                'INSERT INTO project_list_page(iPid, szUrl, szTitle, szSourceUrl,dtLastScrapyTime) VALUES(%d, "%s", "%s", "%s", "%s")'
                % (self.taskId, link.url, link.text, source_url, crawled_at)
            )
            self.dbUtils.insert(insertSql)
            self.hasInsertSet.add(link.url)
            log.msg(format='spider=listPageSpider iPid=%(i)s, title=%(t)s url=%(u)s', i = self.taskId, t=link.text, u=link.url)
# startService: start the task service and its page services.
#
# Visible steps:
#   * look up the start-page spider settings in self.setting under
#     SPIDER_TYPE_START_PAGE and, when present, copy the task-level
#     'szStartUrl' and 'szRegStartUrl' values into them;
#   * create a listPageService and a contentPageService for this task and
#     add both as children (the start-page and extra-page services appear
#     only as commented-out code);
#   * start the children via service.MultiService.startService and log
#     'taskService->startService'.
#
# NOTE(review): the line breaks of this method were lost, so it cannot be
# told from the text whether `self._crawlerProcess.start()` is live code or
# the tail of the commented-out `_startStartPageSpider` call, nor whether
# it belongs inside the `is not None` branch. Restore the layout from
# version control before changing this method.
def startService(self): _spider_start_page_setting = self.setting.get(SPIDER_TYPE_START_PAGE) if _spider_start_page_setting is not None: _spider_start_page_setting['szStartUrl']=self.setting.get('szStartUrl') _spider_start_page_setting['szRegStartUrl']=self.setting.get('szRegStartUrl') # self._startStartPageSpider(_spider_start_page_setting) self._crawlerProcess.start() # sService = startPageService(self) lSeevice = listPageService(self) cService = contentPageService(self) # eService = extraPageService(self) # self.addService(sService) self.addService(lSeevice) self.addService(cService) # self.addService(eService) service.MultiService.startService(self) log.msg('taskService->startService')
def removeTask(self, taskId):
    """Unregister the task service that belongs to ``taskId``.

    :param taskId: identifier of the task to remove.
    :returns: ``None`` when ``taskId`` is ``None``, ``False`` when no
        service is registered under the derived task name, ``True`` after
        the service has been removed.
    """
    if taskId is None:
        log.msg('remove task error , taskid is None')
        return None

    taskName = self.getTaskName(taskId)
    if taskName not in self.namedServices:
        log.msg('remove task error , taskid is not existed')
        return False

    target = self.getServiceNamed(taskName)
    self.removeService(target)
    log.msg(
        format='remove task success taskId=%(taskId)s taskName=%(taskName)s',
        taskId=taskId,
        taskName=taskName,
    )
    return True
def configErr(msg, taskId):
    """Log a configuration-fetch failure for ``taskId``.

    Emits three log entries in order: a header naming the task, the literal
    ``'error: '`` marker, and ``msg`` itself.

    :param msg: the error value to log (passed to ``log.msg`` unchanged).
    :param taskId: identifier of the task whose configuration failed.
    """
    header = 'get config error taskid= %s ' % taskId
    for entry in (header, 'error: ', msg):
        log.msg(entry)
def startService(self):
    """Start the root service.

    Task ``1`` is registered first (the id is hard-coded), then the
    MultiService start runs and the start is logged.
    """
    # Register the task before starting, so it is part of the
    # MultiService start below.
    self.startTask(1)
    service.MultiService.startService(self)
    log.msg("taskRootService->startService")
def startService(self):
    """Start the content-page service and log the owning task's id."""
    service.Service.startService(self)
    owning_task_id = self.taskService.taskId
    log.msg('contentPageService->%s' % owning_task_id)
def stopService(self):
    """Stop the task service via MultiService and log the stop.

    NOTE(review): whatever ``MultiService.stopService`` returns is
    discarded and this method returns ``None`` -- confirm no caller needs
    to wait on the children shutting down.
    """
    service.MultiService.stopService(self)
    log.msg('taskService->stopService')
def stopService(self):
    """Stop this spider service and detach it from the task service.

    Logs the stop, runs the MultiService stop, then calls
    ``removeSpiderService`` to unregister this service. Returns ``None``.
    """
    log.msg(
        format='spiderService->stopService stop spiderService serviceName=(%(serviceName)s)',
        serviceName=self.name,
    )
    service.MultiService.stopService(self)
    self.removeSpiderService()
def startService(self):
    """Start the child spider services and wire up the queue signals.

    After the MultiService start:

    * ``writeListQuque`` from the start-page crawler calls the list-page
      service's ``startCrawl``;
    * ``emptyListQuque`` from either the start-page crawler or the
      list-page crawler calls the list-page service's ``pausedCrawl``.
    """
    service.MultiService.startService(self)
    log.msg(
        format='start spiderService serviceName=(%(serviceName)s)',
        serviceName=self.name,
    )

    start_page_signals = self._startPageSpiderService._crawler.signals
    list_page = self._listPageSpiderService

    start_page_signals.connect(list_page.startCrawl, 'writeListQuque')
    start_page_signals.connect(list_page.pausedCrawl, 'emptyListQuque')
    list_page._crawler.signals.connect(list_page.pausedCrawl, 'emptyListQuque')
def startService(self):
    """Run the base ``Service`` start for this service, then log it."""
    service.Service.startService(self)
    log.msg('TaskService->startService')
def stopTask(self,taskId):
    """Stop the spider service registered for ``taskId``, if any.

    :param taskId: task identifier; the service looked up is
        ``spiderService_<taskId>``.
    :returns: ``True``.

    NOTE(review): ``True`` is returned whether or not a matching service
    was found -- confirm callers do not expect a falsy result for unknown
    tasks.
    """
    serviceName = 'spiderService_%s' % taskId
    log.msg(
        format='TaskService->stopTask(%(taskId)s) serviceName=%(serviceName)s ',
        taskId=taskId,
        serviceName=serviceName,
    )

    registered = self.namedServices
    if serviceName in registered:
        registered[serviceName].stopService()
    return True