Example #1
    def read_project_cache(self):
        """ Try to read the project cache file """

        # Create a cache reader for the project cache directory
        info('Reading Project Cache...')
        cachereader = utils.HarvestManCacheReaderWriter(
            self.get_proj_cache_directory())
        obj, found = cachereader.read_project_cache()
        self._cfg.cachefound = found
        self.cache = obj
        if not found:
            # Fresh cache - create structure...
            self.cache.create('url', 'last_modified', 'etag', 'updated',
                              'location', 'checksum', 'content_length', 'data',
                              'headers')

            # Create an index on URL
            self.cache.create_index('url')
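The cache object above acts as a small record store: create() defines the field layout and create_index() builds a fast lookup keyed on the URL. As a rough, hypothetical illustration of that idea (the SimpleCache class below is a dict-backed stand-in, not HarvestMan's actual cache implementation), a sketch might look like this:

    class SimpleCache:
        """ Minimal stand-in mimicking the create()/create_index() calls above """

        def __init__(self):
            self.fields = ()
            self.records = []
            self.indexes = {}

        def create(self, *fields):
            # Define the record structure, e.g. 'url', 'etag', 'checksum', ...
            self.fields = fields

        def create_index(self, field):
            # Build a lookup table keyed on one field, e.g. 'url'
            self.indexes[field] = {}

        def add(self, **values):
            record = dict((f, values.get(f)) for f in self.fields)
            self.records.append(record)
            for field, index in self.indexes.items():
                index[record[field]] = record

    cache = SimpleCache()
    cache.create('url', 'last_modified', 'etag', 'updated', 'location',
                 'checksum', 'content_length', 'data', 'headers')
    cache.create_index('url')
    cache.add(url='http://example.com/', etag='"abc123"')
    print(cache.indexes['url']['http://example.com/']['etag'])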
Example #2
    def dump_headers(self):
        """ Dump the headers of the web pages
        downloaded, into a DBM file """

        # print dbmfile
        extrainfo("Writing url headers database")

        headersdict = {}
        for node in self.collections.preorder():
            coll = node.get()

            for urlobjidx in coll.getAllURLs():
                urlobj = self.get_url(urlobjidx)

                if urlobj:
                    url = urlobj.get_full_url()
                    # Get headers
                    headers = urlobj.get_url_content_info()
                    if headers:
                        headersdict[url] = str(headers)

        cache = utils.HarvestManCacheReaderWriter(
            self.get_proj_cache_directory())
        return cache.write_url_headers(headersdict)
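write_url_headers() presumably persists the url-to-headers mapping built above. A minimal sketch of the same idea using only Python's standard-library dbm module (this is an assumption about the storage format, not HarvestMan's actual writer) could be:

    import dbm

    def write_url_headers_sketch(headersdict, path='url-headers.db'):
        # Store each URL's header string under the URL key; dbm stores bytes.
        with dbm.open(path, 'c') as db:
            for url, headers in headersdict.items():
                db[url.encode('utf-8')] = headers.encode('utf-8')

    write_url_headers_sketch(
        {'http://example.com/': "{'content-type': 'text/html'}"})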
Example #3
    def post_download_setup(self):
        """ Actions to perform after project is complete """

        # Loop through URL db, one by one and then for those
        # URLs which were downloaded but did not succeed, try again.
        # But make sure we don't download links which were not-modified
        # on server-side (HTTP 304) and hence were skipped.
        failed = []
        # Broken links (404)
        nbroken = 0

        for node in self._urldb.preorder():
            urlobj = node.get()
            # print 'URL=>',urlobj.get_full_url()

            if urlobj.status == 404:
                # print 'BROKEN', urlobj.get_full_url()
                nbroken += 1
            elif urlobj.qstatus == urlparser.URL_DONE_DOWNLOAD and \
                   urlobj.status != 0 and urlobj.status != 304:
                failed.append(urlobj)

        self._numfailed = len(failed)
        # print 'BROKEN=>', nbroken

        if self._cfg.retryfailed:
            info(' ')

            # try downloading again
            if self._numfailed:
                info('Redownloading failed links...')
                self._redownload = True

                for urlobj in failed:
                    if urlobj.fatal or urlobj.starturl: continue
                    extrainfo('Re-downloading', urlobj.get_full_url())
                    self._numretried += 1
                    self.thread_download(urlobj)

                # Wait for the downloads to complete...
                if self._numretried:
                    extrainfo("Waiting for the re-downloads to complete...")
                    self._urlThreadPool.wait(10.0, self._cfg.timeout)

                worked = 0
                # Let us calculate the failed rate again...
                for urlobj in failed:
                    if urlobj.status == 0:
                        # Download was done
                        worked += 1

                self._numfailed2 = self._numfailed - worked

        # Stop the url thread pool
        # Stop worker threads
        self._urlThreadPool.stop_all_threads()

        # bugfix: Moved the time calculation code here.
        t2 = time.time()

        self._cfg.endtime = t2

        # Write cache file
        if self._cfg.pagecache and self._cfg.writecache:
            cachewriter = utils.HarvestManCacheReaderWriter(
                self.get_proj_cache_directory())
            self.add_headers_to_cache()
            cachewriter.write_project_cache(self.cache)

        # If url header dump is enabled, dump it
        if self._cfg.urlheaders:
            self.dump_headers()

        if self._cfg.localise:
            self.localise_links()

        # Write archive file...
        if self._cfg.archive:
            self.archive_project()

        # dump url tree (dependency tree) to a file
        if self._cfg.urltreefile:
            self.dump_urltree()

        if not self._cfg.project: return

        nlinks = self._urldb.size
        # print stats of the project
        nservers, ndirs, nfiltered = objects.rulesmgr.get_stats()
        nfailed = self._numfailed
        numstillfailed = self._numfailed2

        numfiles = self.savedfiles
        numfilesinrepos = self.reposfiles
        numfilesincache = self.cachefiles

        numretried = self._numretried

        fetchtime = self._cfg.endtime - self._cfg.starttime

        statsd = {
            'links': nlinks,
            'filtered': nfiltered,
            'processed': nlinks - nfiltered,
            'broken': nbroken,
            'extservers': nservers,
            'extdirs': ndirs,
            'failed': nfailed,
            'fatal': numstillfailed,
            'files': numfiles,
            'filesinrepos': numfilesinrepos,
            'filesincache': numfilesincache,
            'retries': numretried,
            'bytes': self.bytes,
            'fetchtime': fetchtime,
        }

        self.print_project_info(statsd)

        objects.eventmgr.raise_event('postdownload', None)
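The retry logic above follows a common collect/re-queue/recount pattern: gather the URLs that failed (skipping HTTP 304 not-modified responses), push them back to the download threads, wait, then count how many are still failing. A stripped-down, framework-free sketch of that pattern (the function and attribute names below are hypothetical, not HarvestMan APIs):

    def retry_failed(urlobjs, download, max_rounds=1):
        # urlobjs: objects with a .status attribute, where 0 means success
        # download: callable that re-attempts a download and updates .status
        failed = [u for u in urlobjs if u.status not in (0, 304)]
        for _ in range(max_rounds):
            if not failed:
                break
            for u in failed:
                download(u)
            failed = [u for u in failed if u.status != 0]
        # Number of URLs that still failed after retrying
        return len(failed)

    class _Fake:
        def __init__(self, status): self.status = status

    urls = [_Fake(0), _Fake(500), _Fake(304)]
    print(retry_failed(urls, download=lambda u: setattr(u, 'status', 0)))  # -> 0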