class JrjSpider(scrapy.Spider): name = 'jrj' allowed_domains = ['stock.jrj.com.cn'] start_urls = ['http://*****:*****@class="titmain"]//h1//text()' temp = './/div[@class="texttit_m1"]//p//text()' # print(response) item = CrawlerItem() item['link'] = response.url ans0 = response.xpath(temp0).getall() ans1 = response.xpath(temp).getall() item['title'] = ans0 item['content'] = ans1 preInfo = None if (self.preInfoUrlDict != None): preInfo = self.preInfoUrlDict[item['link']] elif (len(self.preInfoList) == 1): preInfo = self.preInfoList[0] ansFinal = { 'type': 'crawlerResult', 'content': { 'link': item['link'], 'title': ans0, 'content': ans1, 'preInfo': preInfo } } ansJson = json.dumps(ansFinal) self.cacheAgent.push(ansJson) # self.cache.close() # self.cacheAgent.close() yield item
class GeneralSpider(scrapy.Spider):
    """Spider that downloads queued article URLs and extracts title/content.

    The job description is pulled from the crawler cache as a JSON document
    with the keys ``urlList``, ``logInfo`` and ``preInfo``.  Each parsed page
    is pushed to the agent cache as a JSON ``crawlerResult`` message.

    NOTE(review): ``Cache`` is assumed to be a diskcache-style queue whose
    ``pull()`` returns ``(key, value)`` and ``(None, None)`` when empty --
    confirm against the project's import.
    """

    name = 'general'
    allowed_domains = []
    start_urls = ['http://finance.jrj.com.cn/2020/04/24012529362098.shtml']

    def __init__(self,
                 cacheCrawlerPath='',
                 cacheKey='',
                 cacheAgentPath='',
                 *args,
                 **kwargs):
        """Open both caches and load the job description, if there is one.

        Args:
            cacheCrawlerPath: Directory of the cache the job is pulled from.
            cacheKey: Stored on the instance; not otherwise used here.
            cacheAgentPath: Directory of the cache results are pushed to.
            *args, **kwargs: Forwarded to ``scrapy.Spider.__init__``.
        """
        super().__init__(*args, **kwargs)
        self.cacheKey = cacheKey
        self.cacheCrawlerPath = cacheCrawlerPath
        print('aa', cacheCrawlerPath, 'bb', cacheAgentPath, cacheKey)
        self.cache = Cache(cacheCrawlerPath)
        self.cacheAgent = Cache(cacheAgentPath)
        self.oContentExtract = CContentExtract('boilerpipe')

        # Safe defaults so that parse() also works when no job was queued;
        # previously these attributes only existed after a successful pull.
        self.logInfo = None
        self.preInfoList: list = []
        self.preInfoUrlDict = None

        _, jsonStr = self.cache.pull()
        print('cc', jsonStr)
        if jsonStr is None:
            # Nothing queued: fall back to the built-in sample article.
            self.start_urls = [
                'http://finance.jrj.com.cn/2020/04/24012529362098.shtml'
            ]
            return

        oUrlList = json.loads(jsonStr)
        self.start_urls = oUrlList['urlList']
        self.logInfo = oUrlList['logInfo']
        self.preInfoList = oUrlList['preInfo']
        # One preInfo entry per URL -> look it up by URL later on.
        if len(self.preInfoList) == len(self.start_urls):
            self.preInfoUrlDict = dict(zip(self.start_urls, self.preInfoList))

        # Tell the agent which job this crawl belongs to.
        logInfo = {'type': 'logInfo', 'content': {'data': self.logInfo}}
        self.cacheAgent.push(json.dumps(logInfo))

    def parse(self, response):
        """Extract title and content from *response* and publish the result.

        The extracted data is pushed to the agent cache as JSON and the
        populated ``CrawlerItem`` is yielded to the Scrapy pipeline.
        """
        item = CrawlerItem()
        item['link'] = response.url
        # boilerpipe() is unpacked as a (title, content) pair.
        ans0, ans1 = self.oContentExtract.boilerpipe(response.text)
        item['title'] = ans0
        item['content'] = ans1
        print(ans0, ans1)

        preInfo = None
        if self.preInfoUrlDict is not None:
            # response.url can differ from the queued URL (e.g. after a
            # redirect); fall back to None instead of raising KeyError.
            preInfo = self.preInfoUrlDict.get(item['link'])
        elif len(self.preInfoList) == 1:
            # A single preInfo entry is shared by every URL of the job.
            preInfo = self.preInfoList[0]

        ansFinal = {
            'type': 'crawlerResult',
            'content': {
                'data': {
                    'link': item['link'],
                    'title': ans0,
                    'content': ans1
                },
                'preInfo': preInfo
            }
        }
        self.cacheAgent.push(json.dumps(ansFinal))
        yield item
class CAgent:
    """Coordinates the crawler, the knowledge-manager client and the storage.

    Crawl results arrive as JSON strings through ``cacheAgent``; jobs for the
    crawler are handed over through ``cacheCrawler``.

    NOTE(review): both caches are assumed to be diskcache-style queues whose
    ``pull()`` returns ``(key, value)`` and ``(None, None)`` when empty --
    confirm against the project's ``Cache`` import.
    """

    def __init__(self,
                 name,
                 oDir: 'CDirectoryConfig',
                 oConfigByYaml: 'CConfigByYaml',
                 connectKnowlegeServer=False):
        """Create the agent; managers stay ``None`` until ``configAll()``.

        Args:
            name: Agent name, also used as prefix of the log file name.
            oDir: Directory configuration; the keys ``Log``,
                ``cacheAgentFolder``, ``cacheCrawlerFolder`` and
                ``crawlerCWD`` are read from it.
            oConfigByYaml: Configuration with the sections ``Storage`` and
                ``KnowledgeManager``.
            connectKnowlegeServer: When true, ``configAll()`` also connects
                to the knowledge server.
        """
        self.name = name
        self.crawlerManager: CCrawlerManager = None
        self.storageManager: CStorage = None
        self.knowledgeManagerClient: CKnowledgeClient = None
        self.oDir: CDirectoryConfig = oDir
        self.oConf = oConfigByYaml
        self.oLog = CLog(oDir['Log'], self.name + '_log')
        self.dbWeb = ''
        self.cacheAgent = Cache(oDir['cacheAgentFolder'])
        self.cacheCrawler = Cache(oDir['cacheCrawlerFolder'])
        self.flagConnectKnowlegeServer = connectKnowlegeServer
        fKeyboardInterruptRegistrar(self._callbackKeyboardInterrupt)
        self.flagUserClose = False

    def _configStorage(self, mode='mongoDB'):
        """Create the storage manager described by the ``Storage`` section.

        A ``mode`` entry in the configuration overrides the *mode* argument.
        Only ``'mongoDB'`` is handled; for any other mode ``storageManager``
        is left untouched.
        """
        oSubConfig = self.oConf['Storage']
        self.dbWeb = oSubConfig['dbWeb']
        configuredMode = oSubConfig.get('mode')
        if configuredMode is not None:
            mode = configuredMode
        if mode == 'mongoDB':
            self.storageManager = CStorageMongoDB(self.name, self.dbWeb)

    def _configCrawler(self):
        """Create the crawler manager bound to this agent's cache folders."""
        self.crawlerManager = CCrawlerManager(self.name,
                                              self.oDir['crawlerCWD'],
                                              self.oLog,
                                              self.oDir['cacheCrawlerFolder'],
                                              self.oDir['cacheAgentFolder'])

    def _configKnowledgeManager(self):
        """Create the knowledge-manager client and optionally connect it.

        Raises:
            ValueError: If connecting was requested and ``connect()``
                returned ``False``.
        """
        oSubConfig = self.oConf['KnowledgeManager']
        addressTuple = (oSubConfig['address'], oSubConfig['port'])
        # The password is a credential: it must never be printed or logged.
        key = bytes(oSubConfig['password'], 'utf-8')
        self.knowledgeManagerClient = CKnowledgeClient(addressTuple, key,
                                                       self.oLog)
        if self.flagConnectKnowlegeServer:
            connected = self.knowledgeManagerClient.connect()
            # Compared by equality rather than truthiness so that a ``None``
            # return value is not treated as a failure.
            if connected == False:  # noqa: E712
                raise ValueError("KnowledgeManager connection failed")

    def configAll(self):
        """Configure crawler, knowledge manager and storage, in that order."""
        self._configCrawler()
        self.oLog.safeRecordTime('CrawlerManager conf finished')
        self._configKnowledgeManager()
        self.oLog.safeRecordTime('KnowledgeManager conf finished')
        self._configStorage()
        self.oLog.safeRecordTime('StorageManager conf finished')

    def startCrawling(self, jobsList: list):
        """Start the crawler engine for *jobsList* and return its result."""
        return self.crawlerManager.engineStart(jobsList)

    def _storeResult(self, handler, rawJson):
        """Decode one queued message and store every record it yields."""
        message = json.loads(rawJson)
        for record in handler(message['type'], message['content']):
            self.storageManager.storeData(record[0], record[1], record[2])

    def fetchResult(self,
                    handler,
                    subProcHandle,
                    timeWaitStep=1,
                    maxWaitTimes=5):
        """Move queued crawl results into the storage until the crawl ends.

        Args:
            handler: Callable ``handler(type, content)`` returning an
                iterable of 3-item records; each record is passed to
                ``storageManager.storeData``.
            subProcHandle: Object with a ``poll()`` method that returns
                ``None`` while the crawler subprocess is still running.
            timeWaitStep: Seconds to sleep after finding the queue empty.
            maxWaitTimes: Number of consecutive empty polls tolerated; the
                longest continuous wait is ``timeWaitStep * maxWaitTimes``.

        Returns:
            ``False`` when the queue stayed empty for *maxWaitTimes* polls,
            otherwise the value of ``subProcHandle.poll()`` once the
            subprocess has finished.  Note that an exit code of ``0`` is
            falsy as well, so compare with ``is False`` to detect a timeout.

        Raises:
            ValueError: If the queue is empty and
                ``timeWaitStep * maxWaitTimes`` is not positive.
        """
        global WRITE_TO_STORAGE_FLAG
        WRITE_TO_STORAGE_FLAG = True
        idleCount = 0
        try:
            while True:
                _, rawJson = self.cacheAgent.pull()
                if rawJson is not None:
                    self._storeResult(handler, rawJson)
                    idleCount = 0  # the wait counter only counts idle polls
                    continue

                if timeWaitStep * maxWaitTimes <= 0:
                    raise ValueError(
                        "timeWaitStep * maxWaitTimes should be bigger than 0")
                if idleCount >= maxWaitTimes:
                    return False

                exitCode = subProcHandle.poll()
                if exitCode is not None:
                    # The subprocess may have pushed results between the
                    # empty pull above and its exit; drain them so that the
                    # last items are not lost.
                    _, rawJson = self.cacheAgent.pull()
                    while rawJson is not None:
                        self._storeResult(handler, rawJson)
                        _, rawJson = self.cacheAgent.pull()
                    return exitCode

                time.sleep(timeWaitStep)
                idleCount += 1
        finally:
            # Always release the flag, also when the handler or the storage
            # raises; otherwise the interrupt callback reports "busy" forever.
            WRITE_TO_STORAGE_FLAG = False

    def clearCache(self):
        """Remove every entry from the agent cache and the crawler cache."""
        self.cacheAgent.clear()
        self.cacheCrawler.clear()

    def closeCache(self):
        """Close both caches and, if configured, the crawler's caches."""
        self.cacheAgent.close()
        self.cacheCrawler.close()
        # crawlerManager only exists after configAll().
        if self.crawlerManager is not None:
            self.crawlerManager.closeCache()

    def _callbackKeyboardInterrupt(self, *args, **kwargs):
        """Report whether the agent may be shut down right now.

        Returns:
            ``(True, '')`` when no results are being written, otherwise
            ``(False, message)`` with the number of items still queued.
        """
        global WRITE_TO_STORAGE_FLAG
        self.flagUserClose = True
        if WRITE_TO_STORAGE_FLAG is True:
            numRemainedMsg = len(self.cacheAgent)
            MSG = ("Agent is fetching the result to the Storage,"
                   f" number of remained items: {numRemainedMsg}"
                   ", will close later.")
            return False, MSG
        return True, ''

    def test(self):
        """Hold the write flag for about ten seconds to try out Ctrl+C."""
        global WRITE_TO_STORAGE_FLAG
        WRITE_TO_STORAGE_FLAG = True
        for _ in range(1000):
            time.sleep(0.01)
        WRITE_TO_STORAGE_FLAG = False

    def close(self):
        """Close the knowledge-manager client (if any) and all caches."""
        try:
            # The client only exists after configAll().
            if self.knowledgeManagerClient is not None:
                self.knowledgeManagerClient.close()
        finally:
            # Release the caches even if closing the client failed.
            self.closeCache()