def SanitizeURI(self, uri):
  """Normalize a URI for downstream URL-tracker lookups.

  A bare host name (no '/' anywhere in the string) and the empty string are
  returned untouched; anything that looks like a path/URL is run through
  pywrapurl.URL with 'http' as the default protocol, so scheme-less input
  comes back as a fully assembled http:// URL.

  Args:
    uri: the raw URI string from the request.
  Returns:
    The normalized URI string.
  """
  # If it's not a bare host request (i.e. the string contains a '/'),
  # canonicalize it.  `'/' in uri` replaces the deprecated Py2-only
  # string.find(uri, '/') >= 0 with identical semantics.
  if uri != '' and '/' in uri:
    # If there is no protocol, assume it's HTTP.
    uri = pywrapurl.URL(uri, 'http').Assemble()
  return uri
def get(self, collection, uriAt, sort, view, page, partial_match, flatList,
        logging_me=True, debugging_me=False):
  """Returns the content diagnostics.

  Queries each configured urltracker server in turn and returns the first
  non-None response, after augmenting FileContentData entries with their
  pagerank.

  Args:
    collection: collection name to query (whitespace is stripped).
    uriAt: URI to diagnose; normalized via SanitizeURI first.
    sort, view: urltracker query modifiers (whitespace is stripped).
    page, partial_match, flatList: string-valued ints, converted via
      GetIntValue before being passed to the client.
    logging_me: if true, log the incoming flatList value.
    debugging_me: if true, tag each entry's 'name' with its tree/flat mode.
  Returns:
    A string of the form 'response = <repr of contents>\n', or
    'response = []\n' if no server answered.
  """
  if logging_me:
    logging.info("[diagnose_handler:get] Sitemap flatList = " + flatList)
  servers = (self.cfg.globalParams.GetServerManager().Set(
      'urltracker_server').Servers())
  uriAt = self.SanitizeURI(uriAt)
  for server in servers:
    client = urltracker_client.URLTrackerClient(server.host(),
                                                int(server.port()))
    contents = client.Get(string.strip(collection), string.strip(uriAt),
                          string.strip(sort), string.strip(view),
                          self.GetIntValue(page),
                          self.GetIntValue(partial_match),
                          self.GetIntValue(flatList))
    # A server that has no answer returns None; fall through to the next one.
    if contents == None:
      continue
    #
    # Note: This last-minute `pagerank' update is the sole difference btwn
    # this procedure (viz., `get()') and `export()' below.
    #
    # Note: The call to `exportDiagnostics()' in "AdminCaller.java" actually
    # calls _this_ (viz., `get()'), __not__ `export()' below.
    #
    # Note: The `execute()' code in "ExportDiagnosticsHandler.java" ignores
    # this meticulously calculated `pagerank' data altogether. Hmmm.
    #
    # NOTE(review): contents[:-1] deliberately skips the last element —
    # presumably a trailer/summary record rather than content data; confirm
    # against the urltracker client protocol.
    for content in contents[:-1]:
      if content.get('type') == 'FileContentData':
        content['pagerank'] = self.GetPageRank(
            long(pywrapurl.URL(content['uri']).Fingerprint()))
      if debugging_me:
        # Prefix the display name with the list mode so debug output shows
        # whether a tree or flat listing was requested.
        if self.GetIntValue(flatList) == 0:
          content['name'] = '[tree=' + flatList + '] ' + content['name']
        else:
          content['name'] = '[flat=' + flatList + '] ' + content['name']
    # First successful server wins; remaining servers are not consulted.
    return 'response = %s\n' % repr(contents)
  return 'response = []\n'
def getFile(self, uriAt):
  """Returns the content status for the URI uriAt.

  Asks each configured urltracker server for the file's tracked state and,
  on the first hit, appends a summary dict of per-document diagnostics
  (pagerank, cache status, link counts, dates, and — when available — the
  last successful crawl timestamp and auth method).

  Args:
    uriAt: URI to look up; normalized via SanitizeURI first.
  Returns:
    A string of the form 'response = <repr of response list>\n', or
    'response = []\n' if no server answered.
  """
  collection_names = ent_collection.ListCollections(self.cfg.globalParams)
  collection_fingerprint_map = {}
  uriAt = self.SanitizeURI(uriAt)
  # Servers key collections by fingerprint; build the reverse map so the
  # client can translate fingerprints back to collection names.
  for name in collection_names:
    collection_fingerprint_map[pywraphash.Fingerprint(name)] = name
  urltracker_servers = (self.cfg.globalParams.GetServerManager().Set(
      'urltracker_server').Servers())
  DocID = long(pywrapurl.URL(uriAt).Fingerprint())
  for urltracker_server in urltracker_servers:
    urltracker_client_ = urltracker_client.URLTrackerClient(
        urltracker_server.host(), int(urltracker_server.port()))
    # uriAt.strip() replaces the deprecated Py2-only string.strip(uriAt)
    # with identical semantics.
    (response, last_successful_crawl_timestamp,
     auth_method) = urltracker_client_.GetFile(uriAt.strip(),
                                               collection_fingerprint_map)
    # `is None` instead of `== None`: identity test is the correct idiom.
    if response is None:
      continue
    pagerank = self.GetPageRank(DocID)
    cached = self.IsDocCached(DocID)
    forwardLinks = self.GetLinkCount(DocID, 1)
    backwardLinks = self.GetLinkCount(DocID, 0)
    date = self.GetDate(DocID)
    lastmodifieddate = self.GetLastModifiedDate(DocID)
    # Renamed from `dict` to avoid shadowing the builtin.
    doc_info = {
        'pagerank': pagerank,
        'cached': cached,
        'date': date,
        'lastmodifieddate': lastmodifieddate,
        'forwardLinks': forwardLinks,
        'backwardLinks': backwardLinks
    }
    # Note that this timestamp and the CRAWLED_NEW state may appear in
    # history list of states in @response. However, if it ages enough,
    # it will be removed from the history list. Therefore we need to
    # store away and pass around along with, if applicable, its auth_method.
    if last_successful_crawl_timestamp:
      doc_info['lastSuccessfulCrawlTimestamp'] = long(
          last_successful_crawl_timestamp)
    if auth_method:
      doc_info['authMethod'] = int(auth_method)
    response.append(doc_info)
    # First successful server wins; remaining servers are not consulted.
    return 'response = %s\n' % repr(response)
  return 'response = []\n'
def handleDone(self):
  """All servers have responded, now process responses.

  Finalizes every per-host URL sorter plus the future-URL sorter, then
  balances the total number of queued URLs against self.maxRequested:
  either trims the per-host queues down to a fair share, or tops them up
  with not-yet-due ("future") URLs.  Finally records the resulting total
  in self.size_ and drops the future sorter.
  """
  perHostSorterList = self.perHostUrlSorters.values()
  for urlSorter in perHostSorterList:
    urlSorter.setDone()
  self.futureUrlInfoSorter.setDone()
  # Py2-only `lambda (x):` tuple-parameter syntax; equivalent to
  # lambda x: x.size().  Snapshot of each host's queue size before trimming.
  perHostSorterUrlCounts = map(lambda (x): x.size(), perHostSorterList)
  totalCount = self.sum_(perHostSorterUrlCounts)
  if self.maxRequested < totalCount:
    # past due urls are more than enough for request, trimming down
    # these queue to give equal share for each hosts and ignore undue urls
    avgSize = self.maxRequested / len(self.perHostUrlSorters)
    counts = []
    for urlSorter in perHostSorterList:
      counts.append(min(urlSorter.size(), avgSize))
    totalCount = self.sum_(counts)
    idx = 0
    # Round-robin: hand leftover slots one at a time to hosts that still
    # have more queued URLs than their current allotment, until the cap
    # is reached.  Terminates because sum of the uncapped counts exceeds
    # maxRequested, so some host can always absorb another slot.
    while totalCount < self.maxRequested:
      if perHostSorterUrlCounts[idx] > counts[idx]:
        counts[idx] += 1
        totalCount += 1
      idx = (idx + 1) % len(counts)
    # now trimming the perHostUrlSorters
    for idx in range(len(counts)):
      perHostSorterList[idx].setSize(counts[idx])
  else:
    # take all past due urls, and still has room for undue urls
    # distribute undue urls into per-host queues
    futureUrls = self.futureUrlInfoSorter.getUrls()
    idx = 0
    while totalCount < self.maxRequested and idx < len(futureUrls):
      urlInfo = futureUrls[idx]
      host = pywrapurl.URL(urlInfo.path()).host()
      # Lazily create (and immediately finalize) a sorter for hosts we
      # haven't seen among the past-due URLs.
      if not self.perHostUrlSorters.has_key(host):
        self.perHostUrlSorters[host] = UrlInfoSorter()
        self.perHostUrlSorters[host].setDone()
      self.perHostUrlSorters[host].append(urlInfo)
      totalCount += 1
      idx += 1
  # The future sorter has been either consumed or discarded at this point.
  self.futureUrlInfoSorter = None
  self.size_ = totalCount
def CommandHandler(self, request, handler):
  """Invoke `handler` on the request's path and reply with its output.

  `handler` is called with a single argument of type pywrapurl.URL built
  from the request path; its return value is written back to the client
  as a plain-text response body.
  """
  request.SetContentTypeTEXT()
  url = pywrapurl.URL(request.req_path())
  body = handler(url)
  request.output().WriteString(body)
  request.Reply()
def makeDirectoryName(self, site):
  """Generate fingerprint (directory name) for a collection.

  The explicit '' second argument selects the URL constructor that takes
  a string rather than a char*.  Fixes http://b/1001790.
  """
  fingerprint = pywrapurl.URL(site, '').Fingerprint()
  return '%x' % fingerprint