Пример #1
0
 def stopService(self):
     """Stop this service, stop its crawler and detach it from the parent.

     The steps run in a fixed order: log the stop, mark the Twisted service
     as stopped, ask the parent spider service to remove itself if possible,
     stop the crawler, and finally unregister this service from the parent
     when it is still listed there.
     """
     log.msg(
         format='startPageSpiderService->stopService stop startPageSpiderService serviceName=(%(serviceName)s)',
         serviceName=self.name,
     )
     service.Service.stopService(self)
     self.spiderService.removeSpiderService()
     self._crawler.stop()
     # Nothing left to do when the parent no longer tracks this service.
     if self.name not in self.spiderService.namedServices:
         return
     self.spiderService.removeService(self)
Пример #2
0
    def parse(self, response):
        """Process one crawled page and yield requests for further links.

        Steps performed for every response:

        1. Read the length of the per-task Redis list and send the
           'writeListQuque' signal when it is 1 or 'emptyListQuque' when it
           is 0.
        2. If the URL was not seen before, remember it; when it matches the
           project's start-URL regex and has not been stored yet, insert a
           row into ``project_start_page`` and push the URL onto the Redis
           list.
        3. Extract links from the page and yield a request for each link
           whose URL has not been crawled yet.

        NOTE(review): the INSERT statement is assembled by string
        formatting from the page URL and title; a title containing a double
        quote will produce malformed SQL. Consider a parameterized query if
        ``dbUtils`` supports one.
        """
        page_url = response.url
        print('startPageSpider==========================> %s' % page_url)

        queue_key = 'scrapy:startPageSpider:listQuque:%s' % self.taskId
        pending = self.redis.llen(queue_key)
        if pending == 1:
            self._crawler.signals.send_catch_log('writeListQuque')
        elif pending == 0:
            self._crawler.signals.send_catch_log('emptyListQuque')
            print('startPageSpider---------send_catch_log->emptyListQuque')

        # Pages that were already handled produce no further work.
        if page_url in self.hasCrawlSet:
            return

        start_regex = self.project['szStartUrlReg']
        matcher = re.compile(r'%s' % start_regex)
        self.hasCrawlSet.add(page_url)

        if matcher.match(page_url) and page_url not in self.hasInsertSet:
            title = "|".join(response.xpath('/html/head/title/text()').extract())
            scraped_at = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
            insertSql = 'INSERT INTO project_start_page(iPid, szUrl, szTitle,dtLastScrapyTime) VALUES(%d, "%s", "%s", "%s")' % (
                self.taskId, page_url, title, scraped_at)
            self.dbUtils.insert(insertSql)
            self.hasInsertSet.add(page_url)
            self.redis.lpush(queue_key, page_url)
            log.msg(format='spider=startPageSpider iPid=%(i)s, title=%(t)s url=%(u)s',
                    i=self.taskId, t=title, u=page_url)

        # The regex setting holds several patterns separated by '~'.
        allowed = (expr for expr in start_regex.split('~'))
        self.linkExtractor = LinkExtractor(allow_domains=self.domain, allow=allowed)

        # Build the filtered list before yielding anything, so the filter is
        # evaluated against hasCrawlSet as it is right now.
        fresh_links = [
            candidate
            for candidate in self.linkExtractor.extract_links(response)
            if candidate.url not in self.hasCrawlSet
        ]
        for candidate in fresh_links:
            yield self.make_requests_from_url(candidate.url)
Пример #3
0
 def addTask(data, taskId):
     """Register a task service for ``taskId`` unless one already exists.

     ``self`` is not a parameter here; presumably this function is nested
     inside a method and captures ``self`` from the enclosing scope --
     TODO confirm against the surrounding code.

     :param data: settings passed to the new ``taskService`` as ``setting``.
     :param taskId: identifier used to derive the service name.
     :returns: True when a new service was added, False when a service
         with the same name is already registered.
     """
     taskName = self.getTaskName(taskId)
     if taskName in self.namedServices:
         # The template must be passed as ``format=``; as a positional
         # message the %(...)s placeholders are logged uninterpolated.
         log.msg(format='add task error task is existed taskId=%(taskId)s taskName=%(taskName)s', taskId=taskId, taskName=taskName)
         return False

     _taskService = taskService(taskId=taskId, taskName=taskName, setting=data)
     self.addService(_taskService)
     log.msg(format='add task success taskId=%(taskId)s taskName=%(taskName)s', taskId=taskId, taskName=taskName)
     return True
Пример #4
0
    def removeSpiderService(self):
        """Detach this spider service from its task service once idle.

        Does nothing while any child service reports ``running == 1``.
        Otherwise logs the removal and, if this service is still registered
        under its name in the task service, removes it from there.
        """
        has_running_child = any(
            self.namedServices[child_name].running == 1
            for child_name in self.namedServices
        )
        if has_running_child:
            return

        log.msg(format='spiderService->removeSpiderService stop spiderService serviceName=(%(serviceName)s)',
                serviceName=self.name)
        if self.name in self.taskService.namedServices:
            self.taskService.removeService(self)
Пример #5
0
 def startTask(self, taskId):
     """Create and register the spider service for ``taskId``.

     The service is named ``spiderService_<taskId>``.

     :returns: True when a new service was added, False when a service
         with that name is already registered.
     """
     serviceName = 'spiderService_%s' % taskId
     if serviceName in self.namedServices:
         log.msg(format='TaskService->startTask(%(taskId)s) serviceName=%(serviceName)s Exists',taskId=taskId, serviceName=serviceName)
         return False

     new_service = spiderService(taskId=taskId, taskService=self)
     new_service.setName(serviceName)
     self.addService(new_service)
     log.msg(format='TaskService->startTask(%(taskId)s)',taskId=taskId)
     return True
Пример #6
0
 def parse(self, response):
     """Record the list-page links found on a page that was not seen before.

     For a new response URL, links matching the project's ``szUrlReg``
     patterns (separated by '~') are extracted, and each link whose URL has
     not been stored yet is inserted into ``project_list_page`` and logged.

     NOTE(review): the INSERT statement is assembled by string formatting
     from link text and URLs; a double quote in the link text will produce
     malformed SQL. Consider a parameterized query if ``dbUtils`` supports
     one.
     """
     source_url = response.url
     if source_url in self.hasCrawlSet:
         return
     self.hasCrawlSet.add(source_url)

     allowed = (expr for expr in self.project['szUrlReg'].split('~'))
     self.linkExtractor = LinkExtractor(allow_domains=self.domain, allow=allowed)
     candidates = [
         found
         for found in self.linkExtractor.extract_links(response)
         if found.url not in self.hasInsertSet
     ]

     for found in candidates:
         # Re-check: an earlier iteration may have stored the same URL.
         if found.url in self.hasInsertSet:
             continue
         scraped_at = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime())
         insertSql = 'INSERT INTO project_list_page(iPid, szUrl, szTitle, szSourceUrl,dtLastScrapyTime) VALUES(%d, "%s", "%s", "%s", "%s")' % (
             self.taskId, found.url, found.text, source_url, scraped_at)
         self.dbUtils.insert(insertSql)
         self.hasInsertSet.add(found.url)
         log.msg(format='spider=listPageSpider iPid=%(i)s, title=%(t)s url=%(u)s',
                 i=self.taskId, t=found.text, u=found.url)
Пример #7
0
    def startService(self):
        """Start the crawler process and the list/content page services.

        When a start-page spider setting is present, the task-level
        'szStartUrl' and 'szRegStartUrl' values are copied into it. The
        start-page and extra-page services are currently disabled and are
        not created here.

        NOTE(review): the key copied here is 'szRegStartUrl' -- confirm it
        is the key the consumers of this setting actually read.
        """
        start_page_cfg = self.setting.get(SPIDER_TYPE_START_PAGE)
        if start_page_cfg is not None:
            for key in ('szStartUrl', 'szRegStartUrl'):
                start_page_cfg[key] = self.setting.get(key)

        self._crawlerProcess.start()

        # Construct both children first, then register them in order.
        list_service = listPageService(self)
        content_service = contentPageService(self)
        for child in (list_service, content_service):
            self.addService(child)

        service.MultiService.startService(self)
        log.msg('taskService->startService')
Пример #8
0
 def removeTask(self, taskId):
     """Remove the service registered for ``taskId``.

     :returns: None when ``taskId`` is None, False when no service with
         the derived task name is registered, True after a successful
         removal.
     """
     if taskId is None:
         log.msg('remove task error , taskid is None')
         return None

     taskName = self.getTaskName(taskId)
     if taskName not in self.namedServices:
         log.msg('remove task error , taskid is not existed')
         return False

     target = self.getServiceNamed(taskName)
     self.removeService(target)
     log.msg(format='remove task success taskId=%(taskId)s taskName=%(taskName)s', taskId=taskId, taskName=taskName)
     return True
Пример #9
0
 def configErr(msg, taskId):
     """Log a configuration-retrieval failure for ``taskId``.

     Emits three log entries in order: a header naming the task id, an
     'error: ' marker, and ``msg`` itself.
     """
     header = 'get config error taskid= %s ' % taskId
     for entry in (header, 'error: ', msg):
         log.msg(entry)
Пример #10
0
 def startService(self):
     """Start task 1, then start this multi-service and log the start."""
     first_task_id = 1  # the task id is fixed in this implementation
     self.startTask(first_task_id)
     service.MultiService.startService(self)
     log.msg('taskRootService->startService')
Пример #11
0
 def startService(self):
     """Mark the service as started and log the owning task's id."""
     service.Service.startService(self)
     owner_task_id = self.taskService.taskId
     log.msg('contentPageService->%s' % owner_task_id)
Пример #12
0
 def stopService(self):
     """Stop this multi-service via the base class, then log the stop."""
     service.MultiService.stopService(self)
     stop_message = 'taskService->stopService'
     log.msg(stop_message)
Пример #13
0
 def stopService(self):
     """Log the stop, stop this multi-service, then try to detach it.

     Detaching is delegated to ``removeSpiderService``.
     """
     log.msg(
         format='spiderService->stopService stop spiderService serviceName=(%(serviceName)s)',
         serviceName=self.name,
     )
     service.MultiService.stopService(self)
     self.removeSpiderService()
Пример #14
0
 def startService(self):
     """Start this multi-service and wire the crawler signals together.

     The start-page crawler's 'writeListQuque' signal is connected to the
     list-page service's ``startCrawl``; 'emptyListQuque' from both the
     start-page and the list-page crawler is connected to the list-page
     service's ``pausedCrawl``.
     """
     service.MultiService.startService(self)
     log.msg(format='start spiderService serviceName=(%(serviceName)s)',serviceName=self.name)

     list_service = self._listPageSpiderService
     start_signals = self._startPageSpiderService._crawler.signals

     start_signals.connect(list_service.startCrawl, 'writeListQuque')
     start_signals.connect(list_service.pausedCrawl, 'emptyListQuque')
     list_service._crawler.signals.connect(list_service.pausedCrawl, 'emptyListQuque')
Пример #15
0
 def startService(self):
     """Mark the service as started via the base class and log it."""
     service.Service.startService(self)
     start_message = 'TaskService->startService'
     log.msg(start_message)
Пример #16
0
 def stopTask(self,taskId):
     """Stop the spider service registered for ``taskId``, if there is one.

     The service is looked up under the name ``spiderService_<taskId>``.

     :returns: always True, whether or not a matching service existed.
     """
     serviceName = 'spiderService_%s' % taskId
     log.msg(format='TaskService->stopTask(%(taskId)s) serviceName=%(serviceName)s ',taskId=taskId, serviceName=serviceName)
     registered = self.namedServices
     if serviceName in registered:
         registered[serviceName].stopService()
     return True