import time
import random

# Config, SourceParser and _Reptile are assumed to come from the
# project's own modules; their import paths are not shown in this file.


class Reptile(_Reptile):
    ''' main reptile '''

    def __init__(self):
        print '.. init Reptile'
        _config = Config()
        # The base class takes the maximum number of pages to crawl.
        _Reptile.__init__(self, _config.getint('reptile', 'page_num'))
        self.curPageUrl = ''
        # Seed the crawl queue with the configured start pages.
        startpages = _config.get('reptile', 'startpage').split()
        _netlocs = []
        for url in startpages:
            self._queue.put(url)
        print '.. init startpages: ', startpages
        self._sourceparser = SourceParser(startpages)

    def matchUrl(self, url):
        print 'match url:', url
        return self._sourceparser.matchUrl(url)

    def run(self):
        print '.. run'
        while not self._queue.empty():
            # Randomized delay between requests to avoid hammering the server.
            time.sleep(random.randint(5, 20))
            print '.. while not run'
            url = self._queue.get()
            self._sourceparser.setCurPageUrl(url)
            #if not self.outPageRange():
            #if True:
            print '.. post: ', url
            _source = self.requestSource(url)
            if not _source:
                continue
            print '.. get: source length ', len(_source)
            self._sourceparser.setSource(_source)
            self._sourceparser.saveSource(self.downloadedPageNum)
            # Extract absolute URLs from the fetched page and enqueue
            # any that pass the crawler's checks.
            _absurls = self._sourceparser.getAbsUrls()
            for url in _absurls:
                self.inQueue(url)
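
# A minimal usage sketch, not part of the original file: assuming the
# config file read by Config() provides the 'reptile' section consumed
# in __init__ above ('page_num' and a whitespace-separated 'startpage'
# list), the crawler would be started like this.
if __name__ == '__main__':
    reptile = Reptile()
    reptile.run()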