class QuarterMaster(object):
    """Cluster master: tracks live servers through ZooKeeper and hands
    out a per-job Distributor for mapping URIs to worksets.
    """
    def __init__(self):
        zkhosts = hqconfig.get('zkhosts', None)
        logging.warn('zkhosts=%s', zkhosts)
        # Coordinator is optional: without zkhosts configured we run
        # standalone and self.coord stays None (properties below guard).
        self.coord = (Coordinator(zkhosts, alivenode='master')
                      if zkhosts else None)
        self.mongo = pymongo.Connection(host=hqconfig.get('mongo'))
        self.jobconfigs = JobConfigs(self.mongo.crawl)
        # crawlinfo is historically named 'wide' but not really wide crawl
        # specific.
        #self.crawlinfo = CrawlInfo('wide')
        self.worksetmapper = WorksetMapper(hqconfig.NWORKSETS_BITS)
        # distributor for each job
        self.distributors = {}

    def shutdown(self):
        # BUG FIX: self.coord is None when zkhosts is unconfigured;
        # guard to avoid AttributeError during shutdown.
        if self.coord:
            self.coord.shutdown()
        #self.crawlinfo.shutdown()

    @property
    def servers(self):
        # None when running without a Coordinator.
        return self.coord and self.coord.get_servers()

    @property
    def servers_status(self):
        # None when running without a Coordinator.
        return self.coord and self.coord.get_servers_status()

    def get_distributor(self, job):
        """Return the Distributor for *job*, creating it on first use."""
        if job not in self.distributors:
            self.distributors[job] = Distributor(
                self.jobconfigs.get_job(job), self.worksetmapper)
        return self.distributors[job]
class Headquarters(object):
    '''now just a collection of CrawlJobs'''
    def __init__(self):
        # Active CrawlJob instances, keyed by job name.
        self.jobs = {}
        self.jobslock = threading.RLock()
        # single shared CrawlInfo database
        # named 'wide' for historical reasons.
        self.crawlinfo = CrawlInfo('wide')
        self.mongo = pymongo.Connection(hqconfig.get('mongo'))
        self.configdb = self.mongo.crawl
        self.domaininfo = DomainInfo(self.configdb)
        self.jobconfigs = MongoJobConfigs(self.configdb)
        self.coordinator = Coordinator(hqconfig.get('zkhosts'))

    def shutdown(self):
        """Shut down every job, then release database resources."""
        for job in self.jobs.values():
            job.shutdown()
        self.domaininfo.shutdown()
        self.configdb = None
        self.mongo.disconnect()

    def get_job(self, jobname, nocreate=False):
        """Return the CrawlJob named *jobname*, creating it on demand.

        :param nocreate: when True, require the job to exist in config.
        :raises ValueError: nocreate is True and the job is unknown.
        """
        # BUG FIX: removed unreachable 'self.schedulers = {}' and
        # 'self.incomingqueues = {}' statements that followed the return.
        with self.jobslock:
            job = self.jobs.get(jobname)
            if job is None:
                if nocreate and not self.jobconfigs.job_exists(jobname):
                    raise ValueError('unknown job %s' % jobname)
                job = self.jobs[jobname] = CrawlJob(
                    self.jobconfigs, jobname, self.crawlinfo,
                    self.domaininfo)
                self.coordinator.publish_job(job)
            return job

    def get_workset_status(self, job):
        # Tag the response with this HQ instance's identity.
        r = self.get_job(job).get_workset_status()
        r['hq'] = id(self)
        return r

    # Parameters exposed for runtime tuning.
    PARAMS = [('loglevel', int)]

    @property
    def loglevel(self):
        # Exposes the root logger's effective level as a tunable param.
        return logging.getLogger().getEffectiveLevel()

    @loglevel.setter
    def loglevel(self, level):
        logging.getLogger().setLevel(level)

    def reload_domaininfo(self):
        self.domaininfo.load()
def __init__(self, jobname):
    # Name of the crawl job this instance operates on.
    self.jobname = jobname
    # Per-job directory under datadir holding diverted queues ('div').
    self.divbase = os.path.join(hqconfig.get('datadir'), jobname, 'div')
    #self.coord = hqconfig.factory.coordinator
    # Read-only coordinator connection; used for cluster-layout lookups.
    self.coord = Coordinator(hqconfig.get('zkhosts'), readonly=1)
    # NOTE(review): must run after self.coord is set — presumably it
    # queries the coordinator for the job's server list; confirm.
    self._get_servers()
    # This node's hostname (os.uname()[1]).
    self.nodename = os.uname()[1]
class Shuffle(object):
    """Ships diverted workset queues to the server responsible for them."""

    def __init__(self, jobname):
        self.jobname = jobname
        # Per-job directory under datadir holding diverted queues ('div').
        self.divbase = os.path.join(hqconfig.get('datadir'), jobname, 'div')
        #self.coord = hqconfig.factory.coordinator
        self.coord = Coordinator(hqconfig.get('zkhosts'), readonly=1)
        self._get_servers()
        # This node's hostname; used to refuse shuffling to ourselves.
        self.nodename = os.uname()[1]

    def _get_servers(self):
        # TODO: read server info from coordinator
        # TODO: servers list can change in the middle of operation.
        self.id2host = self.coord.get_job_servers(self.jobname)
        self.servers = len(self.id2host)
        # currently fixed - TODO
        self.clients = 25

    def ws2id(self, wsid):
        """Map workset id *wsid* to the id of the server handling it."""
        # BUG FIX: explicit floor division ('//') keeps the historical
        # Python 2 integer-division behavior when run under Python 3.
        return (wsid % self.clients) // self.servers

    def shuffle_divert(self, wsid):
        """Submit every URI diverted for workset *wsid* to its server.

        :raises ValueError: server unknown, or target is this node.
        :raises IOError: the target server is not alive.
        """
        divdir = os.path.join(self.divbase, str(wsid))
        deque = FileDequeue(divdir)
        serverid = self.ws2id(wsid)
        # BUG FIX: replaced Python-2-only "raise E, msg" statements with
        # the call form, valid in both Python 2 and 3 (same behavior).
        if serverid not in self.id2host:
            raise ValueError('server for ws %d is unknown' % wsid)
        server = self.id2host[serverid]
        if server == self.nodename:
            raise ValueError('refusing to shuffle to myself')
        if not self.coord.is_server_alive(server):
            raise IOError('server %s is not alive' % server)
        client = DiscoveredClient(server, self.jobname)
        def dequewrapper(q):
            # Drain q until a 0.1s timeout yields None (queue empty),
            # reporting progress on stderr as we go.
            count = 0
            while 1:
                curi = q.get(timeout=0.1)
                if curi is None:
                    break
                count += 1
                sys.stderr.write('\r%s/%s: submitting %d to %s' % (
                    self.jobname, wsid, count, server))
                yield curi
            sys.stderr.write('\n')
        client.batch_submit_discovered(dequewrapper(deque))
def __init__(self):
    # Active CrawlJob instances, keyed by job name.
    self.jobs = {}
    # Guards lazy creation of entries in self.jobs.
    self.jobslock = threading.RLock()
    # single shared CrawlInfo database
    # named 'wide' for historical reasons.
    self.crawlinfo = CrawlInfo('wide')
    self.mongo = pymongo.Connection(hqconfig.get('mongo'))
    # 'crawl' database holds job/domain configuration collections.
    self.configdb = self.mongo.crawl
    self.domaininfo = DomainInfo(self.configdb)
    self.jobconfigs = MongoJobConfigs(self.configdb)
    self.coordinator = Coordinator(hqconfig.get('zkhosts'))
def __init__(self):
    zkhosts = hqconfig.get('zkhosts', None)
    logging.warn('zkhosts=%s', zkhosts)
    # Coordinator is optional: when zkhosts is unconfigured self.coord
    # stays None — NOTE(review): callers must guard for None.
    self.coord = Coordinator(zkhosts, alivenode='master') if zkhosts else None
    self.mongo = pymongo.Connection(host=hqconfig.get('mongo'))
    self.jobconfigs = JobConfigs(self.mongo.crawl)
    # crawlinfo is historically named 'wide' but not really wide crawl
    # specific.
    #self.crawlinfo = CrawlInfo('wide')
    self.worksetmapper = WorksetMapper(hqconfig.NWORKSETS_BITS)
    # distributor for each job
    self.distributors = {}
def __init__(self):
    # Active CrawlJob instances, keyed by job name.
    self.jobs = {}
    # Guards lazy creation of entries in self.jobs.
    self.jobslock = threading.RLock()
    mongoserver = hqconfig.get('mongo')
    logging.warn('using MongoDB: %s', mongoserver)
    self.mongo = pymongo.Connection(mongoserver)
    # 'crawl' database holds job/domain configuration collections.
    self.configdb = self.mongo.crawl
    # single shared CrawlInfo database
    # named 'wide' for historical reasons.
    #self.crawlinfo = CrawlInfo(self.configdb, 'wide')
    self.crawlinfo = None # disabled for performance reasons
    # lazy initialization (FIXME: there must be better abstraction)
    self.domaininfo = None
    #self.domaininfo = DomainInfo(self.configdb)
    self.jobconfigs = JobConfigs(self.configdb)
    self.coordinator = Coordinator(hqconfig.get('zkhosts'))
class Headquarters(object):
    '''now just a collection of CrawlJobs'''
    def __init__(self):
        # Active CrawlJob instances, keyed by job name.
        self.jobs = {}
        self.jobslock = threading.RLock()
        mongoserver = hqconfig.get('mongo')
        logging.warn('using MongoDB: %s', mongoserver)
        self.mongo = pymongo.Connection(mongoserver)
        self.configdb = self.mongo.crawl
        # single shared CrawlInfo database
        # named 'wide' for historical reasons.
        #self.crawlinfo = CrawlInfo(self.configdb, 'wide')
        self.crawlinfo = None # disabled for performance reasons
        # lazy initialization (FIXME: there must be better abstraction)
        self.domaininfo = None
        #self.domaininfo = DomainInfo(self.configdb)
        self.jobconfigs = JobConfigs(self.configdb)
        self.coordinator = Coordinator(hqconfig.get('zkhosts'))

    def shutdown(self):
        """Shut down jobs and lazily-created resources, then disconnect."""
        for job in self.jobs.values():
            logging.info("shutting down job %s", job)
            job.shutdown()
        if self.domaininfo:
            logging.info("shutting down domaininfo")
            self.domaininfo.shutdown()
        if self.crawlinfo:
            logging.info("shutting down crawlinfo")
            self.crawlinfo.shutdown()
        self.configdb = None
        self.mongo.disconnect()

    def get_domaininfo(self):
        """Return the DomainInfo, creating it on first use (lazy)."""
        if self.domaininfo is None:
            self.domaininfo = DomainInfo(self.configdb)
        return self.domaininfo

    def get_job(self, jobname, nocreate=False):
        """Return the CrawlJob named *jobname*, creating it on demand.

        :param nocreate: when True, require the job to exist in config.
        :raises UnknownJobError: nocreate is True and the job is unknown.
        """
        # BUG FIX: removed unreachable 'self.schedulers = {}' and
        # 'self.incomingqueues = {}' statements that followed the return.
        with self.jobslock:
            job = self.jobs.get(jobname)
            if job is None:
                if nocreate and not self.jobconfigs.job_exists(jobname):
                    raise UnknownJobError('unknown job %s' % jobname)
                job = self.jobs[jobname] = CrawlJob(self, jobname)
                self.coordinator.publish_job(job)
            return job

    def get_workset_status(self, job):
        # Tag the response with this HQ instance's identity.
        r = self.get_job(job).get_workset_status()
        r['hq'] = id(self)
        return r

    # Parameters exposed for runtime tuning.
    PARAMS = [('loglevel', int)]

    @property
    def loglevel(self):
        # Exposes the root logger's effective level as a tunable param.
        return logging.getLogger().getEffectiveLevel()

    @loglevel.setter
    def loglevel(self, level):
        logging.getLogger().setLevel(level)

    def reload_domaininfo(self):
        # No-op unless domaininfo has been lazily created.
        if self.domaininfo:
            self.domaininfo.load()