Example #1
class JrjSpider(scrapy.Spider):
    name = 'jrj'
    allowed_domains = ['stock.jrj.com.cn']
    start_urls = ['http://*****:*****']

    def parse(self, response):
        temp0 = './/div[@class="titmain"]//h1//text()'
        temp = './/div[@class="texttit_m1"]//p//text()'
        #        print(response)
        item = CrawlerItem()
        item['link'] = response.url
        # Pull the title and body paragraphs with the site-specific XPaths above.
        ans0 = response.xpath(temp0).getall()
        ans1 = response.xpath(temp).getall()
        item['title'] = ans0
        item['content'] = ans1
        # Attach any pre-crawl metadata supplied with this URL.
        preInfo = None
        if self.preInfoUrlDict is not None:
            preInfo = self.preInfoUrlDict[item['link']]
        elif len(self.preInfoList) == 1:
            preInfo = self.preInfoList[0]
        ansFinal = {
            'type': 'crawlerResult',
            'content': {
                'link': item['link'],
                'title': ans0,
                'content': ans1,
                'preInfo': preInfo
            }
        }
        # Serialize the result and push it onto the agent queue for downstream consumers.
        ansJson = json.dumps(ansFinal)
        self.cacheAgent.push(ansJson)
        #        self.cache.close()
        #        self.cacheAgent.close()
        yield item
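Example #1 shows only the parse callback; the attributes it uses (cacheAgent, preInfoUrlDict, preInfoList) are set up in an initializer like the one in Example #2. A minimal, hypothetical way to run such a spider in-process could look as follows; the keyword arguments are an assumption borrowed from Example #2's constructor, and the cache paths are placeholders:

# Hypothetical launcher sketch; assumes JrjSpider takes the same keyword
# arguments as GeneralSpider in Example #2.
from scrapy.crawler import CrawlerProcess

process = CrawlerProcess(settings={'LOG_LEVEL': 'INFO'})
process.crawl(JrjSpider,
              cacheCrawlerPath='/tmp/cache_crawler',  # placeholder paths
              cacheKey='0',
              cacheAgentPath='/tmp/cache_agent')
process.start()  # blocks until the crawl finishes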
Example #2
class GeneralSpider(scrapy.Spider):
    name = 'general'
    allowed_domains = []
    start_urls = ['http://finance.jrj.com.cn/2020/04/24012529362098.shtml']

    def __init__(self,
                 cacheCrawlerPath='',
                 cacheKey='',
                 cacheAgentPath='',
                 *args,
                 **kwargs):
        super().__init__(*args, **kwargs)
        self.cacheKey = cacheKey
        self.cacheCrawlerPath = cacheCrawlerPath
        print('aa', cacheCrawlerPath, 'bb', cacheAgentPath, cacheKey)
        self.cache = Cache(cacheCrawlerPath)     # job queue this spider reads from
        self.cacheAgent = Cache(cacheAgentPath)  # result queue this spider writes to
        self.oContentExtract = CContentExtract('boilerpipe')
        #        jsonStr = self.cache[int(cacheKey)]
        _, jsonStr = self.cache.pull()
        print('cc', jsonStr)
        # No queued job: fall back to a default start URL and empty job metadata.
        if jsonStr is None:
            oUrlList = [
                'http://finance.jrj.com.cn/2020/04/24012529362098.shtml'
            ]
            self.start_urls = oUrlList
            # Without these defaults parse() would fail on the attributes below.
            self.preInfoList = []
            self.preInfoUrlDict = None
        else:
            oUrlList = json.loads(jsonStr)
            self.start_urls = oUrlList['urlList']
            self.logInfo = oUrlList['logInfo']
            self.preInfoList: list = oUrlList['preInfo']
            self.preInfoUrlDict = None
            # Map each start URL to its preInfo entry when the two lists line up.
            if len(self.preInfoList) == len(self.start_urls):
                self.preInfoUrlDict = dict()
                for idx, url in enumerate(self.start_urls):
                    self.preInfoUrlDict[url] = self.preInfoList[idx]

            logInfo = {'type': 'logInfo', 'content': {'data': self.logInfo}}
            logInfoStr = json.dumps(logInfo)
            self.cacheAgent.push(logInfoStr)

    def parse(self, response):
        temp0 = './/div[@class="titmain"]//h1//text()'
        temp = './/div[@class="texttit_m1"]//p//text()'
        #        print(response)
        item = CrawlerItem()
        item['link'] = response.url
        html = response.text
        #        print(html)
        #        ans0 = response.xpath(temp0).getall()
        #        ans1 = response.xpath(temp).getall()
        # Extract title and body with the boilerpipe-based extractor instead of
        # the site-specific XPaths above.
        ans0, ans1 = self.oContentExtract.boilerpipe(html)
        item['title'] = ans0
        item['content'] = ans1
        print(ans0, ans1)
        preInfo = None
        if self.preInfoUrlDict is not None:
            preInfo = self.preInfoUrlDict[item['link']]
        elif len(self.preInfoList) == 1:
            preInfo = self.preInfoList[0]
        ansFinal = {
            'type': 'crawlerResult',
            'content': {
                'data': {
                    'link': item['link'],
                    'title': ans0,
                    'content': ans1
                },
                'preInfo': preInfo
            }
        }
        ansJson = json.dumps(ansFinal)
        self.cacheAgent.push(ansJson)
        #        self.cache.close()
        #        self.cacheAgent.close()
        yield item
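Example #2's spider pulls its job from the crawler cache and pushes results to the agent cache. Assuming Cache here is diskcache.Cache (whose push/pull calls match the usage above), the producer side of that handshake could be sketched as follows; the paths and payload values are placeholders, while the urlList/logInfo/preInfo keys come from the spider's __init__:

import json
from diskcache import Cache  # assumption: the Cache used above is diskcache.Cache

crawlerCache = Cache('/tmp/cache_crawler')  # placeholder path
job = {
    'urlList': ['http://finance.jrj.com.cn/2020/04/24012529362098.shtml'],
    'logInfo': {'jobId': 0},   # placeholder log payload
    'preInfo': [None],         # one entry per URL
}
key = crawlerCache.push(json.dumps(job))  # the spider retrieves it via cache.pull()
print('queued job under key', key)

The spider is then launched with scrapy crawl general -a cacheCrawlerPath=... -a cacheKey=... -a cacheAgentPath=..., which is exactly what _newProcess in Example #3 does.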
Example #3
class CCrawlerManager:
    def __init__(self, name, workDirectory: str, oLog: CLog, cachePath: str,
                 cacheAgentPath: str):
        self.workDirectory = workDirectory
        self.jobsList = None
        self.oLog = oLog
        self.outputFolder = workDirectory
        self.name = name + '_crawler'
        self.jobCnt = 0
        self._cachePathCrawler = cachePath
        self._cachePathAgent = cacheAgentPath
        # Job queue shared with the spiders launched by _newProcess().
        self.cache = Cache(cachePath)

    def _newProcess(self, crawlerName, oUrlCacheKey: str):
        # Only used by the commented-out '-o' invocations below.
        outFilePath = 'file:///' + self.outputFolder + self.name + '.json'
        #        print(outFilePath,urlsFilePath)
        #        process = subprocess.Popen(['scrapy','crawl',crawlerName,'-o',outFilePath,'-a',
        #                                    'cacheCrawlerPath='+ self._cachePathCrawler,'-a',
        #                                    'cacheKey='+oUrlCacheKey,'-a',
        #                                    'cacheAgentPath=' + self._cachePathAgent],
        #                                   shell=True,
        #                                   cwd=self.workDirectory)
        #        print('scrapy','crawl',crawlerName,'-o',outFilePath,'-a',
        #                                    'cacheCrawlerPath='+ self._cachePathCrawler,'-a',
        #                                    'cacheKey='+oUrlCacheKey,'-a',
        #                                    'cacheAgentPath=' + self._cachePathAgent)
        # NOTE: an argument list combined with shell=True is only interpreted as
        # intended on Windows; on POSIX either drop shell=True or pass a single
        # command string.
        process = subprocess.Popen([
            'scrapy', 'crawl', crawlerName, '-a',
            'cacheCrawlerPath=' + self._cachePathCrawler, '-a',
            'cacheKey=' + oUrlCacheKey, '-a',
            'cacheAgentPath=' + self._cachePathAgent
        ],
                                   shell=True,
                                   cwd=self.workDirectory)
        #        print('scrapy','crawl',crawlerName,'-a',
        #                                    'cacheCrawlerPath='+ self._cachePathCrawler,'-a',
        #                                    'cacheKey='+oUrlCacheKey,'-a',
        #                                    'cacheAgentPath=' + self._cachePathAgent)

        return process

    def engineStart(self, jobsList: list):
        for oUrlList in jobsList:
            oUrlList.index = self.jobCnt
            tempKey = self._prepareJob(oUrlList.exportJson())
            self.oLog.safeRecordTime(str(oUrlList.index) + "start")
            temp = self._newProcess('general', tempKey)
            #            temp.wait()
            self.oLog.safeRecordTime(str(oUrlList.index) + "end")
            # Note: returns after launching the first job, so only one scrapy
            # process is started per call.
            return temp

    def _prepareJob(self, content: str):
        #        key = str(self.jobCnt)
        #        if(self.cache.get(key)==False):
        #            raise ValueError("this key exists in the cache")
        #            return None
        #        else:
        #            self.cache[key] = content
        #            self.jobCnt+=1
        #            return key
        self.jobCnt += 1
        #        print(self.cache.directory)
        # Push the serialized job onto the crawler cache; the spider receives the
        # returned key via its cacheKey argument.
        key = self.cache.push(content)
        return str(key)

    def closeCache(self):
        self.cache.close()
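A rough driver for CCrawlerManager, assuming only what engineStart itself requires of a job object (a writable index attribute and an exportJson() method); SimpleJob and all paths below are placeholders:

import json

class SimpleJob:
    # Hypothetical stand-in for the project's real oUrlList objects.
    def __init__(self, urlList, preInfo):
        self.index = 0
        self._payload = {'urlList': urlList, 'logInfo': {}, 'preInfo': preInfo}

    def exportJson(self):
        return json.dumps(self._payload)

manager = CCrawlerManager('demo',
                          workDirectory='/tmp/crawler/',   # placeholder
                          oLog=oLog,                       # an existing CLog instance
                          cachePath='/tmp/cache_crawler',
                          cacheAgentPath='/tmp/cache_agent')
job = SimpleJob(['http://finance.jrj.com.cn/2020/04/24012529362098.shtml'], [None])
proc = manager.engineStart([job])
proc.wait()        # engineStart returns the Popen handle of the launched crawl
manager.closeCache()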