    def SanitizeURI(self, uri):
        """Normalizes uri, defaulting its protocol to HTTP when none is given."""

        # A bare host name (no '/') is a host request and passes through as-is.
        if uri != '' and string.find(uri, '/') >= 0:
            # Reassemble the URL, assuming HTTP when no protocol is present.
            uri = pywrapurl.URL(uri, 'http').Assemble()
        return uri
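    # Illustrative usage of SanitizeURI -- a minimal sketch; the exact output
    # depends on pywrapurl.URL.Assemble(), so these results are assumptions:
    #
    #   self.SanitizeURI('')                    => ''  (empty input passes through)
    #   self.SanitizeURI('myhost')              => 'myhost'  (no '/': host request)
    #   self.SanitizeURI('myhost/docs/a.html')  => 'http://myhost/docs/a.html'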
    def get(self,
            collection,
            uriAt,
            sort,
            view,
            page,
            partial_match,
            flatList,
            logging_me=True,
            debugging_me=False):
        """Returns the content diagnostics"""

        if logging_me:
            logging.info("[diagnose_handler:get] Sitemap flatList = %s",
                         flatList)

        servers = (self.cfg.globalParams.GetServerManager().Set(
            'urltracker_server').Servers())
        uriAt = self.SanitizeURI(uriAt)
        for server in servers:
            client = urltracker_client.URLTrackerClient(
                server.host(), int(server.port()))
            contents = client.Get(string.strip(collection),
                                  string.strip(uriAt), string.strip(sort),
                                  string.strip(view), self.GetIntValue(page),
                                  self.GetIntValue(partial_match),
                                  self.GetIntValue(flatList))
            if contents is None: continue
            #
            # Note:  This last-minute `pagerank' update is the sole difference
            #        between this procedure (viz., `get()') and `export()' below.
            #
            # Note:  The call to `exportDiagnostics()' in "AdminCaller.java" actually
            #        calls _this_ (viz., `get()'), __not__ `export()' below.
            #
            # Note:  The `execute()' code in "ExportDiagnosticsHandler.java" ignores
            #        this meticulously calculated `pagerank' data altogether.  Hmmm.
            #
            for content in contents[:-1]:
                if content.get('type') == 'FileContentData':
                    content['pagerank'] = self.GetPageRank(
                        long(pywrapurl.URL(content['uri']).Fingerprint()))
                if debugging_me:
                    if self.GetIntValue(flatList) == 0:
                        prefix = '[tree=%s] ' % flatList
                    else:
                        prefix = '[flat=%s] ' % flatList
                    content['name'] = prefix + content['name']

            return 'response = %s\n' % repr(contents)

        return 'response = []\n'
    def getFile(self, uriAt):
        """Returns the content status for the URI uriAt"""

        collection_names = ent_collection.ListCollections(
            self.cfg.globalParams)
        collection_fingerprint_map = {}
        uriAt = self.SanitizeURI(uriAt)
        for name in collection_names:
            collection_fingerprint_map[pywraphash.Fingerprint(name)] = name
        urltracker_servers = (self.cfg.globalParams.GetServerManager().Set(
            'urltracker_server').Servers())
        DocID = long(pywrapurl.URL(uriAt).Fingerprint())

        for urltracker_server in urltracker_servers:
            urltracker_client_ = urltracker_client.URLTrackerClient(
                urltracker_server.host(), int(urltracker_server.port()))
            (response, last_successful_crawl_timestamp,
             auth_method) = urltracker_client_.GetFile(
                 string.strip(uriAt), collection_fingerprint_map)
            if response is None: continue
            pagerank = self.GetPageRank(DocID)
            cached = self.IsDocCached(DocID)
            forwardLinks = self.GetLinkCount(DocID, 1)
            backwardLinks = self.GetLinkCount(DocID, 0)
            date = self.GetDate(DocID)
            lastmodifieddate = self.GetLastModifiedDate(DocID)

            status = {
                'pagerank': pagerank,
                'cached': cached,
                'date': date,
                'lastmodifieddate': lastmodifieddate,
                'forwardLinks': forwardLinks,
                'backwardLinks': backwardLinks
            }

            # This timestamp and the CRAWLED_NEW state may appear in the
            # history list of states in @response, but once they age enough
            # they are dropped from that list. We therefore store the
            # timestamp separately here and pass it along with, when
            # applicable, its auth_method.
            if last_successful_crawl_timestamp:
                status['lastSuccessfulCrawlTimestamp'] = long(
                    last_successful_crawl_timestamp)
                if auth_method:
                    status['authMethod'] = int(auth_method)

            response.append(status)
            return 'response = %s\n' % repr(response)

        return 'response = []\n'
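    # For illustration, the shape of a successful getFile() reply; every value
    # here is hypothetical, and 'authMethod' only appears when a
    # last_successful_crawl_timestamp was returned:
    #
    #   response = [..., {'pagerank': 6, 'cached': 1, 'date': ...,
    #                     'lastmodifieddate': ..., 'forwardLinks': 12,
    #                     'backwardLinks': 3,
    #                     'lastSuccessfulCrawlTimestamp': 1199145600L,
    #                     'authMethod': 0}]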
    def handleDone(self):
        """All servers have responded, now process responses."""

        perHostSorterList = self.perHostUrlSorters.values()
        for urlSorter in perHostSorterList:
            urlSorter.setDone()
        self.futureUrlInfoSorter.setDone()

        perHostSorterUrlCounts = [s.size() for s in perHostSorterList]
        totalCount = self.sum_(perHostSorterUrlCounts)

        if self.maxRequested < totalCount:
            # There are more past-due URLs than requested: trim the per-host
            # queues so each host gets an equal share, and ignore the
            # not-yet-due URLs entirely.
            avgSize = self.maxRequested / len(self.perHostUrlSorters)
            counts = []
            for urlSorter in perHostSorterList:
                counts.append(min(urlSorter.size(), avgSize))

            totalCount = self.sum_(counts)
            # Hand the remaining slots out round-robin to hosts that still
            # have past-due URLs beyond their equal share.
            idx = 0
            while totalCount < self.maxRequested:
                if perHostSorterUrlCounts[idx] > counts[idx]:
                    counts[idx] += 1
                    totalCount += 1
                idx = (idx + 1) % len(counts)
            # now trimming the perHostUrlSorters
            for idx in range(len(counts)):
                perHostSorterList[idx].setSize(counts[idx])
        else:
            # All past-due URLs fit and there is still room, so distribute
            # not-yet-due URLs into the per-host queues.
            futureUrls = self.futureUrlInfoSorter.getUrls()
            idx = 0
            while totalCount < self.maxRequested and idx < len(futureUrls):
                urlInfo = futureUrls[idx]
                host = pywrapurl.URL(urlInfo.path()).host()
                if host not in self.perHostUrlSorters:
                    self.perHostUrlSorters[host] = UrlInfoSorter()
                    self.perHostUrlSorters[host].setDone()
                self.perHostUrlSorters[host].append(urlInfo)
                totalCount += 1
                idx += 1
        self.futureUrlInfoSorter = None
        self.size_ = totalCount
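# The fair-share trimming in handleDone() is self-contained enough to sketch
# in isolation. A minimal standalone version, with plain queue sizes standing
# in for the UrlInfoSorter objects; like the branch above, it assumes
# sum(queue_sizes) >= max_requested, otherwise the top-up loop would not
# terminate:


def fair_share_counts(queue_sizes, max_requested):
    """Gives each host an equal share, then hands the remaining slots out
    round-robin to hosts that still have URLs beyond their share."""
    avg = max_requested // len(queue_sizes)
    counts = [min(size, avg) for size in queue_sizes]
    total = sum(counts)
    idx = 0
    while total < max_requested:
        if queue_sizes[idx] > counts[idx]:
            counts[idx] += 1
            total += 1
        idx = (idx + 1) % len(counts)
    return counts


# Three hosts with 10, 2, and 5 past-due URLs and 9 slots requested:
# avg = 3, initial counts [3, 2, 3] (total 8), and the round-robin pass
# tops up the first host, giving [4, 2, 3].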
    def CommandHandler(self, request, handler):
        """Calls handler with an argument of type pywrapurl.URL."""
        request.SetContentTypeTEXT()
        request.output().WriteString(handler(pywrapurl.URL(
            request.req_path())))
        request.Reply()
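    # A hypothetical handler matching the contract above -- EchoURL and its
    # body are illustrative, using only Assemble(), which appears elsewhere
    # in this file:
    #
    #   def EchoURL(url):
    #       # `url' is a pywrapurl.URL built from the request path.
    #       return url.Assemble() + '\n'
    #
    #   self.CommandHandler(request, EchoURL)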
    def makeDirectoryName(self, site):
        """Generate fingerprint (directory name) for a collection."""

        # Use the version that takes a string and not char*. Fixes http://b/1001790
        return '%x' % pywrapurl.URL(site, '').Fingerprint()
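    # Hypothetical usage -- the hex digits depend entirely on pywrapurl's
    # Fingerprint(), so the value shown is an assumption:
    #
    #   name = self.makeDirectoryName('http://intranet.example.com/')
    #   # e.g. name == 'a3f91c0de4b25f17', usable directly as a directory name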