def getPdfFile(self, pdfRealUrl, filePath):
    '''
    get pdf file by pdf real url

    @param pdfRealUrl: pdf file's real url; a falsy value means the real url
        could not be resolved, and we only sleep to simulate download time
    @param filePath: local disk path which will save pdf file
    @return: True when the file was downloaded and written, False otherwise
    '''
    bReturn = False
    try:
        if pdfRealUrl:
            # stream the response in 1 KiB chunks so the whole pdf is never
            # held in memory at once
            response = requests.get(pdfRealUrl, stream=True)
            # bug fix: the original opened the file and never closed it
            # (handle leak on every call); `with` guarantees the close
            with open(filePath, "wb") as f:
                for chunk in response.iter_content(chunk_size=1024):
                    if chunk:  # filter out keep-alive chunks
                        f.write(chunk)
            appLogger.info(
                'success get file from the internet. the file size is %s bytes'
                % os.path.getsize(filePath))
            bReturn = True
        else:
            # no real pdf url: sleep a random 5-20 s to simulate the time a
            # real download would have taken
            sleepTime = random.randrange(5, 20, 1)
            time.sleep(sleepTime)
    except Exception as err:
        appLogger.error(err)
    # bug fix: the original had no return, so callers always received None
    # despite the documented boolean contract
    return bReturn
def execute(self):
    '''
    main function to execute this app

    Builds the multi-process stream line described by self.streamLineTemplate,
    starts every stage, then consumes StreamLogger/StopSignal boxes from the
    shared output queue until every product has been accounted for.
    '''
    if self.streamLineTemplate:
        pTotalCount = 1  # the total number of process (main process included)
        processes = []
        tmpInfo = ''
        # first pass: collect per-stage process counts and thread specs,
        # building a human-readable summary as we go
        for index, process in enumerate(self.streamLineTemplate):
            pCount = process.get('pCount')  # process number of this stage
            pTotalCount += pCount
            thread = process.get('Thread')  # thread spec(s) of this stage
            processes.append([pCount, thread])
        tmpInfo += 'starts %s process(including main process)\n' % pTotalCount
        for i, p in enumerate(processes):
            tmpInfo += 'process stage %d starts %d process:\n' % (i + 1, p[0])
            tmpInfo += 'every bizprocessor thread in the process is the following:\n'
            if i == 0:
                # NOTE(review): stage 0 apparently holds a single thread
                # triple, later stages a list of them — confirm against the
                # template format
                thread = p[1]
                tmpInfo += '%40s.%40s * %s\n' % (thread[0], thread[1], thread[2])
            else:
                for thread in p[1]:
                    tmpInfo += '%40s.%40s * %d\n' % (thread[0], thread[1], thread[2])
        appLogger.info(tmpInfo)
        processList, outputQueue = self.CreateSteamLine()
        stat = Statistics()
        for process in processList:
            process.start()  # start processor
        process_record = 0   # how many products have been fully processed
        productCount = -1    # total product count, learned from StopSignal
        while True:
            streamBox = outputQueue.get()
            if streamBox:
                if isinstance(streamBox, StreamLogger):
                    stat.addProcessorLog(streamBox)
                    process_record += 1  # the product which has been processed adds 1
                    if process_record % 3 == 0:
                        appLogger.info('\n%s' % stat.getStatisticInfo())  # print log
                if isinstance(streamBox, StopSignal):
                    productCount = streamBox.productCount  # get product count
                # NOTE(review): reconstructed placement — this check is
                # assumed to run for every box (not only StopSignal ones) so
                # the loop can end after late StreamLogger boxes; confirm
                if productCount == process_record:  # if product count equals processed product count then stop app
                    break
                del (streamBox)
            else:
                time.sleep(0.01)
        # shut every worker process down once all products are accounted for
        for process in processList:
            process.terminate()
            process.join()
        appLogger.info('\n%s' % stat.getStatisticInfo())
        appLogger.info('%s thread stop' % self.__class__.__name__)
    # NOTE(review): assumed to log even when no template is configured
    appLogger.info('the app stop')
def getResultData(self):
    """Yield every query result for each configured keyword.

    Keywords come from the app configuration; each one is queried through
    the API spider, the hit count is logged, and non-empty result sets are
    yielded item by item.
    """
    for term in getKeywords(self.appConfig):
        hits = self.apiSpider.queryData(term)
        if hits:
            appLogger.info('key word: %s results number is %d' % (term, len(hits)))
            for hit in hits:
                yield hit
        else:
            appLogger.info('key word: %s results number is 0' % term)
def printEndMessage(self, message):
    '''
    Log a 'Finishing:' banner for *message* with the spent/max/min/avg
    durations, and record the end timestamp.

    @param message: the label previously passed to printStartMessage; a
        falsy message only logs the bare banner
    '''
    ser1Lock.acquire()
    # robustness fix: release the lock even if a statistics call raises
    # (the original would leave ser1Lock held forever on any exception)
    try:
        sReturn = 'Finishing: '
        if message:
            timeArray = self.timeMap.get(message)
            if not timeArray:
                timeArray = []
                # bug fix: persist the fresh list, mirroring
                # printStartMessage; the original dropped it, so the
                # timestamp appended below was silently lost
                self.timeMap[message] = timeArray
            timeArray.append(self.getCurTime())
            # keep original evaluation order: getPeriod first (it may update
            # periodMap), then a single hoisted lookup instead of three
            spent = self.getPeriod(message)
            periods = self.periodMap.get(message)
            sReturn = 'Finishing: %-50s spending: %6d ms, max: %6d ms, min: %6d ms, avg: %6d ms' % (
                message, spent, max(periods), min(periods), self.getAvg(periods))
        appLogger.info(sReturn)
    finally:
        ser1Lock.release()
def printStartMessage(self, message):
    """Log a 'Beginning:' banner for *message* and record its start timestamp.

    Serialized through the module-level ser1Lock so concurrent threads do
    not interleave their timing bookkeeping.
    """
    ser1Lock.acquire()
    if not message:
        # nothing to track: just emit the bare banner
        appLogger.info('Beginning: ')
        ser1Lock.release()
        return
    stamps = self.timeMap.get(message)
    if not stamps:
        stamps = []
        self.timeMap[message] = stamps
    stamps.append(self.getCurTime())
    appLogger.info('Beginning: %-s...' % message)
    ser1Lock.release()
def execute(self):
    '''
    the main function to execute software

    Builds the stream line, starts every BaseProcessor stage, then keeps the
    main thread alive forever.
    '''
    stages = self.CreateSteamLine()
    # launch every bizprocessor stage
    for stage in stages:
        if isinstance(stage, BaseProcessor):
            stage.start()
    # park the main thread; this loop never exits on its own
    while True:
        time.sleep(0.1)
    appLogger.info('%s thread stop' % self.__class__.__name__)  # unreachable
def getRealPdfUrl(self, pdfUrl):
    '''
    get real pdf url by pdfUrl

    @param pdfUrl: the pdfUrl which is gotten from ieee xplore api
    @return: real pdf url, or None when it cannot be resolved
    '''
    sReturn = None
    try:
        if pdfUrl:
            # load the saved cookie jar so the request looks like an
            # established browser session
            cookie = cookielib.MozillaCookieJar()
            cookie.load(self.COOKIE_PATH, ignore_discard=True,
                        ignore_expires=True)
            cookieHandler = urllib2.HTTPCookieProcessor(cookie)
            opener = urllib2.build_opener(cookieHandler)
            urllib2.install_opener(opener)
            loop = 0
            while True:
                if loop == 1:
                    # give up after a single attempt
                    break
                loop += 1
                requestHeaders = self.getRandomRequestHeaders()
                request = urllib2.Request(pdfUrl, headers=requestHeaders)
                response = urllib2.urlopen(request)
                # random 1-2 s pause to avoid hammering the server
                sleepTime = random.randrange(1, 3, 1)
                time.sleep(sleepTime)
                # bug fix: the original called queueLock.release() here while
                # the matching acquire() was commented out; releasing an
                # unheld lock raises, was swallowed by the broad except, and
                # made every call return None
                soup = BeautifulSoup(response, features='lxml')
                appLogger.info(pdfUrl)
                if soup.iframe:
                    # the page embeds the real pdf in an <iframe src=...>
                    sReturn = soup.iframe.attrs.get('src')
                    break
                else:
                    print(requestHeaders)
                    time.sleep(10)
    except Exception as err:
        appLogger.error(err)
    # bug fix: the original never returned the value its docstring promises
    return sReturn
def getTotalStatistics(self):
    """Summarize average durations per final stage name.

    Period-map keys containing ' - ' are grouped by the segment after the
    last separator; each group's per-key averages are averaged again, and
    the whole summary is logged and returned.
    """
    summary = 'The whole procedure:'
    grouped = {}
    for label, periods in self.periodMap.items():
        if ' - ' not in label:
            continue
        stage = label.split(' - ')[-1]
        grouped.setdefault(stage, []).append(self.getAvg(periods))
    for stage, averages in grouped.items():
        summary += ' %s : %d ;' % (stage, self.getAvg(averages))
    appLogger.info(summary)
    return summary
def queryData(self, keyWords=''):
    '''
    query ieee xplore database to get results

    @param keyWords: key words
    @return: a list which contains query results, and every result is the
        json formate. The max results count is 40000
    '''
    lReturn = []
    try:
        begin = 1  # query start record number (1-based paging cursor)
        query = XPLORE(self.API_KEY)
        query.maximumResults(self.QUERY_RETURN_MAX_RESULTS)
        query.queryText(keyWords)
        query.resultsSorting('publication_year', 'desc')
        query.resultsFilter('content_type', 'Journals')  # only query journals
        query.resultsFilter(
            'open_access', 'True')  # only query the articles which is open access
        if self.QUERY_BEGIN_YEAR:
            query.resultsFilter('start_year', self.QUERY_BEGIN_YEAR)
        if self.QUERY_END_YEAR:
            query.resultsFilter('end_year', self.QUERY_END_YEAR)
        appLogger.info(self.getQueryInfo(keyWords))
        # page through the API until a short page, an empty page, or the
        # per-run query budget is exhausted
        while True:
            query.startingResult(begin)
            results = query.callAPI(debugModeOff=True)
            print(results)
            self.CUR_QUERY_COUNT += 1
            articles = self.getArticles(results)  # get articles list
            if articles:
                lReturn.extend(articles)  # add articles to result list
                size = len(articles)  # number of articles in this page
                # a full page suggests more articles remain; keep paging
                # while the query-count budget allows
                if size == self.QUERY_RETURN_MAX_RESULTS and self.CUR_QUERY_COUNT < self.MAX_QUERY_COUNT_LIMIT:
                    begin = len(lReturn) + 1
                else:
                    break
            else:
                break
    except Exception as err:
        appLogger.error(err)
    # bug fix: the original never returned lReturn, so callers (which do
    # len(results) on the value) always received None
    return lReturn
def run(self):
    # Producer loop: prime the output queue, then keep calling self.process()
    # and forwarding its products until a StopSignal flips the class-level
    # isServer flag.
    self.outputQueue.put(object(), block=True)  # prime/warm the queue
    appLogger.info('init queue...')
    # time.sleep(20)
    while self.__class__.isServer:
        beginTime = time.time()
        processObj = self.process()
        endTime = time.time()
        # if isinstance(processObj,StopSignal):
        #     self.__class__.isServer=False
        #     processObj=None
        #     appLogger.info('%s thread stop' % self.__class__.__name__)
        if isinstance(processObj, StreamLogger):
            # stamp this stage's name and timing onto the log box
            processObj.setProcessorLog(self.__class__.__name__, beginTime, endTime)
        if processObj and self.outputQueue:
            if isinstance(processObj, StreamBox):
                # class-level counter: shared across all instances of this stage
                self.__class__.productCount = self.__class__.productCount + 1
            if isinstance(processObj, StopSignal):
                # hand the final product count downstream and stop serving
                processObj.productCount = self.__class__.productCount
                self.__class__.isServer = False
            self.outputQueue.put(processObj, block=True)
            # print 'producer put a box in the queue'
        # NOTE(review): reconstructed placement — assumed to be a per-iteration
        # throttle at loop-body level; confirm against the original layout
        time.sleep(0.01)
# Threaded variant of the crawler entry script: read config, query every
# keyword through the API spider, queue all results, then fan work out to
# NormalThread workers. (configFilePath and pt are defined above this chunk.)
cf = ConfigParser()
cf.read(configFilePath)
keyWords = getKeywords(cf)
pt.printStartMessage('initiate')
apiSpider = getApiSpider(cf)
webPageSpider = getWebPageSipder(cf)
mongoDBDAO = getDatabase(cf)
threadCount = getTHreadCount(cf)
# pt.printEndMessage('initiate')
pt.printStartMessage('processes')
taskQueue = Queue()
for keyWord in keyWords:
    appLogger.info(
        '------------------------------------------------------------')
    pt.printStartMessage('query articles by keywords:' + keyWord)
    results = apiSpider.queryData(keyWord)
    pt.printEndMessage('query articles by keywords:' + keyWord)
    if not results or len(results) == 0:
        # NOTE(review): an empty result set aborts ALL remaining keywords
        # (break, not continue) — confirm that is intended
        print 'Results number is 0'
        break
    else:
        print 'Results number is %d' % len(results)
        for result in results:
            taskQueue.put(result)
# NOTE(review): reconstructed placement — worker threads are assumed to be
# started once, after every keyword's results are queued; confirm
pt.printStartMessage('processes result set')
threadArray = []
for i in range(threadCount):
    # NOTE(review): this statement is truncated at the chunk boundary; the
    # remaining constructor arguments continue past this view
    nt = NormalThread(taskQueue, apiSpider, webPageSpider, mongoDBDAO,
#initialize app configFilePath = '../config.conf' cf = ConfigParser() cf.read(configFilePath) keyWords = getKeywords(cf) pt.printStartMessage('initiate') apiSpider = getApiSpider(cf) webPageSpider = getWebPageSipder(cf) mongoDBDAO = getDatabase(cf) # pt.printEndMessage('initiate') pt.printStartMessage('processes') for keyWord in keyWords: appLogger.info( '------------------------------------------------------------') pt.printStartMessage('query articles by keywords:' + keyWord) results = apiSpider.queryData(keyWord) pt.printEndMessage('query articles by keywords:' + keyWord) if not results or len(results) == 0: print 'Results number is 0' break else: print 'Results number is %d' % len(results) pt.printStartMessage('processes result set') resultNum = 0 for result in results: appLogger.info( '----------------------------%d--------------------------------' % resultNum) resultNum += 1
def run(self):
    """Worker thread: drain the shared task queue, and for each article
    result fetch its pdf, store the file and the article in MongoDB.

    NOTE(review): lines 14-16 of this chunk contained overlapping duplicated
    text (the tail of the method appeared twice); this is the deduplicated
    reconstruction of the single method plus the module __main__ guard.
    """
    appLogger.info('Thread %s start' % self.getName())
    if not self.taskQueue:
        return
    while True:
        try:
            # serialize empty-check + get so two workers cannot race past
            # the empty() test
            queueLock.acquire()
            if self.taskQueue.empty():
                queueLock.release()
                break
            result = self.taskQueue.get(block=True)
            queueLock.release()
            self.printTool.printStartMessage(
                '%s processes result' % threading.Thread.getName(self))
            self.printTool.printStartMessage(
                '%s gets pdf url' % threading.Thread.getName(self))
            pdfUrl = self.apiSpider.getPdfUrl(result)
            pdfRealUrl = self.webPageSpider.getRealPdfUrl(pdfUrl)
            self.printTool.printEndMessage(
                '%s gets pdf url' % threading.Thread.getName(self))
            self.printTool.printStartMessage(
                '%s gets pdf file' % threading.Thread.getName(self))
            if pdfRealUrl:
                fileName = result.get('article_number') + '.pdf'
            else:
                # if real file not exist then use simulated file
                fileName = 'simulated file.pdf'
            fileTempPath = self.webPageSpider.generateTempFilePath(fileName)
            fileId = ''
            flag = self.webPageSpider.getPdfFile(pdfRealUrl, fileTempPath)
            self.printTool.printEndMessage(
                '%s gets pdf file' % threading.Thread.getName(self))
            self.printTool.printStartMessage(
                '%s inserts pdf file into the database' %
                threading.Thread.getName(self))
            if flag:
                # download succeeded: store and delete the temp file
                fileId = self.mongoDBDAO.insertFile(fileTempPath, fileName,
                                                    isDelFile=True)
            else:
                fileId = self.mongoDBDAO.insertFile(fileTempPath, fileName,
                                                    isDelFile=False)
            self.printTool.printEndMessage(
                '%s inserts pdf file into the database' %
                threading.Thread.getName(self))
            self.printTool.printStartMessage(
                '%s inserts articles into the database' %
                threading.Thread.getName(self))
            result['fileId'] = fileId  # set fileId in the result
            self.mongoDBDAO.insertOneData(
                **result)  # save a result into the database
            self.printTool.printEndMessage(
                '%s inserts articles into the database' %
                threading.Thread.getName(self))
            self.printTool.printEndMessage(
                '%s processes result' % threading.Thread.getName(self))
        except Exception as err:
            appLogger.error(err)
        finally:
            # best-effort release in case an exception left the lock held;
            # releasing an already-released lock raises and is ignored
            try:
                queueLock.release()
            except Exception:
                pass
    appLogger.info('Thread %s end' % self.getName())


if __name__ == '__main__':
    pass