def read_project_cache(self):
    """ Try to read the project cache file """

    # Read the cache from the project cache directory
    info('Reading Project Cache...')
    cachereader = utils.HarvestManCacheReaderWriter(self.get_proj_cache_directory())
    obj, found = cachereader.read_project_cache()
    self._cfg.cachefound = found
    self.cache = obj

    if not found:
        # Fresh cache - create structure...
        self.cache.create('url', 'last_modified', 'etag', 'updated',
                          'location', 'checksum', 'content_length',
                          'data', 'headers')
        # Create an index on URL
        self.cache.create_index('url')
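# ---------------------------------------------------------------------------
# Illustrative sketch (not part of HarvestMan): the cache created above is a
# table of per-URL records indexed on the 'url' field, which is what makes the
# later freshness checks (etag / last-modified comparisons) cheap. A minimal
# stand-alone equivalent using plain dicts, with the same field names as the
# create() call above, might look like this; the helper names below are made
# up for the example only.

def _make_cache_record(url, headers, data=None):
    """ Build one cache record using the same fields passed to create() above """
    return {'url': url,
            'last_modified': headers.get('last-modified'),
            'etag': headers.get('etag'),
            'updated': False,
            'location': '',
            'checksum': None,
            'content_length': headers.get('content-length'),
            'data': data,
            'headers': headers}

def _index_by_url(records):
    """ Build the equivalent of create_index('url'): full URL -> record """
    return dict((rec['url'], rec) for rec in records)
# ---------------------------------------------------------------------------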
def dump_headers(self):
    """ Dump the headers of the web pages downloaded, into a DBM file """

    extrainfo("Writing url headers database")

    headersdict = {}
    for node in self.collections.preorder():
        coll = node.get()

        for urlobjidx in coll.getAllURLs():
            urlobj = self.get_url(urlobjidx)

            if urlobj:
                url = urlobj.get_full_url()
                # Get headers
                headers = urlobj.get_url_content_info()
                if headers:
                    headersdict[url] = str(headers)

    cache = utils.HarvestManCacheReaderWriter(self.get_proj_cache_directory())
    return cache.write_url_headers(headersdict)
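# ---------------------------------------------------------------------------
# Illustrative sketch (not part of HarvestMan): write_url_headers() persists
# the {url: headers-string} mapping built above into a DBM file. A minimal
# stand-alone equivalent using the standard library's anydbm module (this
# code base targets Python 2) could look like the following; the default
# filename is an assumption made for the example only.

def _write_headers_dbm(headersdict, filename='url-headers.dbm'):
    """ Write a url -> headers-string mapping to a DBM file """
    import anydbm

    db = anydbm.open(filename, 'c')
    try:
        for url, hdrs in headersdict.items():
            # DBM keys and values must be strings
            db[str(url)] = str(hdrs)
    finally:
        db.close()
# ---------------------------------------------------------------------------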
def post_download_setup(self):
    """ Actions to perform after the project is complete """

    # Walk the URL db and retry those URLs which were downloaded
    # but did not succeed. Links which were not modified on the
    # server side (HTTP 304) and hence were skipped are not retried.

    failed = []
    # Broken links (404)
    nbroken = 0

    for node in self._urldb.preorder():
        urlobj = node.get()

        if urlobj.status == 404:
            nbroken += 1
        elif urlobj.qstatus == urlparser.URL_DONE_DOWNLOAD and \
                 urlobj.status != 0 and urlobj.status != 304:
            failed.append(urlobj)

    self._numfailed = len(failed)

    if self._cfg.retryfailed:
        info(' ')

        # Try downloading the failed links again
        if self._numfailed:
            info('Redownloading failed links...')
            self._redownload = True

            for urlobj in failed:
                if urlobj.fatal or urlobj.starturl:
                    continue
                extrainfo('Re-downloading', urlobj.get_full_url())
                self._numretried += 1
                self.thread_download(urlobj)

            # Wait for the downloads to complete...
            if self._numretried:
                extrainfo("Waiting for the re-downloads to complete...")
                self._urlThreadPool.wait(10.0, self._cfg.timeout)

            # Let us calculate the failed rate again...
            worked = 0
            for urlobj in failed:
                if urlobj.status == 0:
                    # Download was done
                    worked += 1

            self._numfailed2 = self._numfailed - worked

    # Stop the url thread pool worker threads
    self._urlThreadPool.stop_all_threads()

    # bugfix: Moved the time calculation code here.
    t2 = time.time()
    self._cfg.endtime = t2

    # Write cache file
    if self._cfg.pagecache and self._cfg.writecache:
        cachewriter = utils.HarvestManCacheReaderWriter(self.get_proj_cache_directory())
        self.add_headers_to_cache()
        cachewriter.write_project_cache(self.cache)

    # If url header dump is enabled, dump it
    if self._cfg.urlheaders:
        self.dump_headers()

    if self._cfg.localise:
        self.localise_links()

    # Write archive file...
    if self._cfg.archive:
        self.archive_project()

    # Dump url tree (dependency tree) to a file
    if self._cfg.urltreefile:
        self.dump_urltree()

    if not self._cfg.project:
        return

    nlinks = self._urldb.size

    # Print stats of the project
    nservers, ndirs, nfiltered = objects.rulesmgr.get_stats()
    nfailed = self._numfailed
    numstillfailed = self._numfailed2

    numfiles = self.savedfiles
    numfilesinrepos = self.reposfiles
    numfilesincache = self.cachefiles

    numretried = self._numretried
    fetchtime = self._cfg.endtime - self._cfg.starttime

    statsd = {'links': nlinks,
              'filtered': nfiltered,
              'processed': nlinks - nfiltered,
              'broken': nbroken,
              'extservers': nservers,
              'extdirs': ndirs,
              'failed': nfailed,
              'fatal': numstillfailed,
              'files': numfiles,
              'filesinrepos': numfilesinrepos,
              'filesincache': numfilesincache,
              'retries': numretried,
              'bytes': self.bytes,
              'fetchtime': fetchtime}

    self.print_project_info(statsd)

    objects.eventmgr.raise_event('postdownload', None)
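# ---------------------------------------------------------------------------
# Illustrative sketch (not part of HarvestMan): the retry pass above only
# re-queues URLs that finished the download phase (qstatus equal to
# urlparser.URL_DONE_DOWNLOAD) with a status that is neither 0 (success) nor
# 304 (not modified on the server), while plain 404s are only counted as
# broken links. A stand-alone version of that filter, assuming objects with
# the same attribute names, would be:

def _select_retry_candidates(urlobjs, done_download_status):
    """ Return (retry_candidates, broken_count) from a list of url objects """
    failed, nbroken = [], 0
    for u in urlobjs:
        if u.status == 404:
            nbroken += 1
        elif u.qstatus == done_download_status and u.status not in (0, 304):
            failed.append(u)
    return failed, nbroken
# ---------------------------------------------------------------------------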