def __init__(self, jobname):
    self.jobname = jobname
    self.divbase = os.path.join(hqconfig.get('datadir'), jobname, 'div')
    #self.coord = hqconfig.factory.coordinator
    self.coord = Coordinator(hqconfig.get('zkhosts'), readonly=1)
    self._get_servers()
    self.nodename = os.uname()[1]
def __init__(self):
    self.jobs = {}
    self.jobslock = threading.RLock()
    self.mongo = pymongo.Connection(hqconfig.get('mongo'))
    self.configdb = self.mongo.crawl
    self.jobconfigs = JobConfigs(self.configdb)
    #self.coordinator = Coordinator(hqconfig.get('zkhosts'))
    self.maxinqueuesize = hqconfig.get(('inq', 'maxqueuesize'), 4)
def __init__(self):
    self.jobs = {}
    self.jobslock = threading.RLock()
    # single shared CrawlInfo database
    # named 'wide' for historical reasons.
    self.crawlinfo = CrawlInfo('wide')
    self.mongo = pymongo.Connection(hqconfig.get('mongo'))
    self.configdb = self.mongo.crawl
    self.domaininfo = DomainInfo(self.configdb)
    self.jobconfigs = MongoJobConfigs(self.configdb)
    self.coordinator = Coordinator(hqconfig.get('zkhosts'))
def __init__(self):
    zkhosts = hqconfig.get('zkhosts', None)
    logging.warn('zkhosts=%s', zkhosts)
    self.coord = Coordinator(zkhosts, alivenode='master') if zkhosts else None
    self.mongo = pymongo.Connection(host=hqconfig.get('mongo'))
    self.jobconfigs = JobConfigs(self.mongo.crawl)
    # crawlinfo is historically named 'wide' but not really wide crawl
    # specific.
    #self.crawlinfo = CrawlInfo('wide')
    self.worksetmapper = WorksetMapper(hqconfig.NWORKSETS_BITS)
    # distributor for each job
    self.distributors = {}
def __init__(self, jobname, bufsize=20):
    self.qdir = os.path.join(hqconfig.get('datadir'), jobname, 'ex')
    if not os.path.isdir(self.qdir):
        os.makedirs(self.qdir)
    FileEnqueue.recover(self.qdir)
    self.enq = FileEnqueue(self.qdir, buffer=bufsize, suffix='ex')
    self.queuedcount = 0
def __init__(self):
    self.jobs = {}
    self.jobslock = threading.RLock()
    mongoserver = hqconfig.get('mongo')
    logging.warn('using MongoDB: %s', mongoserver)
    self.mongo = pymongo.Connection(mongoserver)
    self.configdb = self.mongo.crawl
    # single shared CrawlInfo database
    # named 'wide' for historical reasons.
    #self.crawlinfo = CrawlInfo(self.configdb, 'wide')
    self.crawlinfo = None  # disabled for performance reasons
    # lazy initialization (FIXME: there must be better abstraction)
    self.domaininfo = None
    #self.domaininfo = DomainInfo(self.configdb)
    self.jobconfigs = JobConfigs(self.configdb)
    self.coordinator = Coordinator(hqconfig.get('zkhosts'))
def configdb():
    #return mongo().crawl
    import mongowrapper
    mongoserver = hqconfig.get('mongo')
    connection_params = dict(host=mongoserver)
    logging.info('using MongoDB: %s', mongoserver)
    return mongowrapper.MongoDatabaseWrapper(connection_params, 'crawl')
def setuplogging(level=logging.INFO, filename='hq.log'):
    logsdir = os.path.join(hqconfig.get('datadir'), 'logs')
    if not os.path.isdir(logsdir):
        os.makedirs(logsdir)
    logging.basicConfig(
        filename=os.path.join(logsdir, filename),
        level=level,
        format='%(asctime)s %(levelname)s %(name)s %(message)s',
        datefmt='%F %T')
def init_seen(self):
    if not self.seen:
        try:
            cachesize = hqconfig.get('seencache')
            if cachesize: cachesize = int(cachesize)*(1024**2)
        except:
            cachesize = None
        self.seen = Seen(dbdir=hqconfig.seendir(self.jobname),
                         block_cache_size=cachesize)
def dispatcher_leveldb(domaininfo, job, *args, **kwargs):
    # refuse to run if MergeDispatcher files exist
    mseendir = os.path.join(hqconfig.get('datadir'), job, 'mseen')
    if os.path.isdir(mseendir):
        raise Exception('found directory %r, which suggests "merge"'
                        ' dispatcher is in use. remove it if that is'
                        ' no longer the case' % mseendir)
    from dispatcher import Dispatcher
    return Dispatcher(domaininfo, job, *args, **kwargs)
def dispatcher_merge(domaininfo, job, *args, **kwargs):
    # refuse to run if MergeDispatcher directory does not exist
    mseendir = os.path.join(hqconfig.get('datadir'), job, 'mseen')
    if not os.path.isdir(mseendir):
        raise Exception('directory %r does not exist. create it and'
                        ' put SEEN file with initial seen list.' % mseendir)
    from mergedispatcher import MergeDispatcher
    return MergeDispatcher(domaininfo, job, *args, **kwargs)
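# A minimal selection sketch, assuming a dispatcher is picked by the
# per-job 'dispatcher' config key used in the job __init__ methods
# below. make_dispatcher, the mode names 'leveldb'/'merge', and the
# default mode are illustrative assumptions, not part of the original code.
def make_dispatcher(domaininfo, job, *args, **kwargs):
    mode = hqconfig.get(('jobs', job, 'dispatcher'), 'leveldb')
    factories = dict(leveldb=dispatcher_leveldb, merge=dispatcher_merge)
    factory = factories.get(mode)
    if factory is None:
        raise ValueError('unknown dispatcher mode %r for job %r' % (mode, job))
    return factory(domaininfo, job, *args, **kwargs)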
def __call__(self, job):
    # seen cache parameter is in MB
    seen = self.job_seen.get(job)
    if not seen:
        cachesize = hqconfig.get('seencache')
        if cachesize: cachesize = int(cachesize)*(1024**2)
        seen = Seen(dbdir=hqconfig.seendir(job),
                    block_cache_size=cachesize)
        self.job_seen[job] = seen
    return seen
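# A usage sketch for the per-job seen-db factory __call__ above. The
# class name SeenFactory and its constructor (which is assumed to
# initialize the job_seen cache dict) are assumptions for illustration;
# the caching behavior is what the method above implements.
factory = SeenFactory()          # assumed to set self.job_seen = {}
seen = factory('wide')           # first call opens the seen db for job 'wide'
assert factory('wide') is seen   # later calls return the cached instance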
def __init__(self, job, mapper):
    self.jobname = job
    self.mapper = mapper
    self.basedir = os.path.join(hqconfig.get('datadir'), self.jobname, 'div')
    if not os.path.isdir(self.basedir):
        os.makedirs(self.basedir)
    self.queues = {}
    for fn in os.listdir(self.basedir):
        self.queues[fn] = DivertQueue(self.basedir, fn)
def processinq(self, maxn):
    '''process incoming queue. maxn parameter advises an upper limit on
    the number of URIs processed in this single call. the actual number
    of URIs processed may exceed it if the incoming queue stores URIs
    in chunks.'''
    # lazy initialization of seen db
    if not self.seen:
        try:
            cachesize = hqconfig.get('seencache')
            if cachesize: cachesize = int(cachesize)*(1024**2)
        except:
            cachesize = None
        self.seen = Seen(dbdir=hqconfig.seendir(self.jobname),
                         block_cache_size=cachesize)
    result = dict(processed=0, scheduled=0, excluded=0, saved=0,
                  td=0.0, ts=0.0)
    for count in xrange(maxn):
        t0 = time.time()
        furi = self.inq.get(0.01)
        result['td'] += (time.time() - t0)
        if furi is None: break
        result['processed'] += 1
        ws = self.mapper.workset(furi)
        if self.is_workset_active(ws):
            # no need to call self.workset_activating(). it's already
            # done by Scheduler.
            di = self.get_domaininfo(furi['u'])
            if di and di['exclude']:
                result['excluded'] += 1
                continue
            t0 = time.time()
            suri = self.seen.already_seen(furi)
            if suri['e'] < int(time.time()):
                if 'w' in furi:
                    a = furi['w']
                else:
                    a = dict()
                    for k in ('p','v','x'):
                        m = furi.get(k)
                        if m is not None: a[k] = m
                curi = dict(u=furi['u'], id=suri['_id'], a=a)
                self.scheduler.schedule(curi, ws)
                result['scheduled'] += 1
            result['ts'] += (time.time() - t0)
        else:
            if self.workset_state[ws]:
                self.workset_deactivating(ws)
            # client is not active
            self.diverter.divert(str(ws), furi)
            result['saved'] += 1
    return result
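# A hypothetical driver loop for processinq() above (a sketch, not part
# of the original code): drain the incoming queue in batches and log the
# counter dict it returns. The batch size and back-off interval are
# made-up values.
def processinq_loop(dispatcher, batchsize=500):
    import logging, time
    while True:
        result = dispatcher.processinq(batchsize)
        logging.info('processed=%(processed)d scheduled=%(scheduled)d '
                     'excluded=%(excluded)d saved=%(saved)d '
                     'td=%(td).3f ts=%(ts).3f', result)
        if result['processed'] < batchsize:
            # queue is (nearly) empty; back off briefly
            time.sleep(1.0)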
def setuplogging(level=logging.INFO, filename='hq.log'):
    logconfig = dict(
        level=level,
        format='%(asctime)s %(levelname)s %(name)s %(message)s',
        datefmt='%F %T'
        )
    logsdir = os.path.join(hqconfig.get('datadir'), 'logs')
    try:
        if not os.path.isdir(logsdir):
            os.makedirs(logsdir)
        logconfig['filename'] = os.path.join(logsdir, filename)
    except OSError as ex:
        print >>sys.stderr, "failed to create logging directory {} ({})".format(
            logsdir, ex)
    logging.basicConfig(**logconfig)
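# Hypothetical startup usage of setuplogging() above; the log level and
# file name are arbitrary illustration values, not from the original code.
if __name__ == '__main__':
    setuplogging(level=logging.DEBUG, filename='processinq.log')
    logging.info('logging initialized')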
def __init__(self, hq, jobname):
    self.hq = hq
    self.jobconfigs = self.hq.jobconfigs
    self.jobname = jobname
    self.mapper = CrawlMapper(self, hqconfig.NWORKSETS_BITS)
    self.scheduler = Scheduler(hqconfig.worksetdir(self.jobname),
                               self.mapper)
    readsorted = hqconfig.getint('inq.sort', 1)
    inqdir = hqconfig.inqdir(self.jobname)
    def enqfactory(qdir, **kwargs):
        return PriorityEnqueue(qdir, **kwargs)
    def deqfactory(qdir, **kwargs):
        if readsorted:
            kwargs.update(reader=FPSortingQueueFileReader)
        return PriorityDequeue(qdir, **kwargs)
    self.inq = IncomingQueue(inqdir, enq=enqfactory, deq=deqfactory,
                             buffsize=1000)
    # self.eninq = PriorityEnqueue(
    #     qdir=hqconfig.inqdir(self.jobname),
    #     buffer=1000)
    # deinqargs = {}
    # if readsorted:
    #     deinqargs['reader'] = FPSortingQueueFileReader
    # self.deinq = PriorityDequeue(qdir=hqconfig.inqdir(self.jobname),
    #                              **deinqargs)
    self._dispatcher_mode = hqconfig.get(
        ('jobs', self.jobname, 'dispatcher'), 'internal')
    self.dispatcher = None
    #self.init_dispatcher()
    # currently disabled by default - too slow
    self.use_crawlinfo = False
    self.save_crawlinfo = False
    self.last_inq_count = 0
    self.addedcount = 0
    self.processedcount = 0
def __init__(self, hq, jobname):
    self.hq = hq
    self.jobconfigs = self.hq.jobconfigs
    self.jobname = jobname
    self.mapper = CrawlMapper(self, hqconfig.NWORKSETS_BITS)
    self.scheduler = Scheduler(hqconfig.worksetdir(self.jobname),
                               self.mapper)
    self.inq = PooledIncomingQueue(
        qdir=hqconfig.inqdir(self.jobname),
        buffsize=1000)
    self._dispatcher_mode = hqconfig.get(
        ('jobs', self.jobname, 'dispatcher'), 'internal')
    self.dispatcher = None
    #self.init_dispatcher()
    # currently disabled by default - too slow
    self.use_crawlinfo = False
    self.save_crawlinfo = False
    self.last_inq_count = 0
def __init__(self, domaininfo, job, mapper, scheduler, inq,
             maxsize=int(2e9)):
    """Dispatcher that performs seen check by merging sorted cURL records
    against fixed-size records of URL-IDs. This version can resume
    processing previously terminated by system crash etc. without
    double scheduling.
    :param domaininfo: domain info object (used for per-domain exclusion)
    :param job: crawl job name
    :type job: str
    :param mapper: workset mapper
    :param scheduler: workset scheduler
    :param inq: incoming queue
    :param maxsize: max size of input for a batch
    """
    # TODO: currently Dispatcher.__init__() initializes seenfactory,
    # which is not necessary for MergeDispatcher.
    #super(MergeDispatcher, self).__init__(domaininfo, job, mapper,
    #                                      scheduler, inq)
    # begin dup with Dispatcher.__init__()
    self.domaininfo = domaininfo
    self.jobname = job
    self.mapper = mapper
    self.scheduler = scheduler
    self.inq = inq
    self.diverter = Diverter(self.jobname, self.mapper)
    self.excludedlist = ExcludedList(self.jobname)
    self.processedcount = 0
    # end
    self.seendir = os.path.join(hqconfig.get('datadir'),
                                self.jobname, 'mseen')
    self.maxsize = maxsize
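# A hypothetical helper (an assumption, not in the original code):
# prepare the mseen directory that MergeDispatcher and the
# dispatcher_merge() factory above insist on. It seeds an empty SEEN
# file, assuming an empty file is a valid representation of an empty
# initial seen list.
def init_mseen(job):
    import os
    mseendir = os.path.join(hqconfig.get('datadir'), job, 'mseen')
    if not os.path.isdir(mseendir):
        os.makedirs(mseendir)
        # empty SEEN file == empty initial seen list (assumption)
        open(os.path.join(mseendir, 'SEEN'), 'wb').close()
    return mseendir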
def mongo():
    import pymongo
    mongoserver = hqconfig.get('mongo')
    logging.warn('using MongoDB: %s', mongoserver)
    return pymongo.Connection(mongoserver)
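# A usage sketch for the mongo() factory above: obtain a connection and
# disconnect on interpreter exit. The atexit registration is an
# assumption; Connection.disconnect() is the old pymongo API this code
# base targets.
import atexit
conn = mongo()
atexit.register(conn.disconnect)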
def __init__(self):
    self.jobs = {}
    self.jobslock = threading.RLock()
    self.mongo = pymongo.Connection(hqconfig.get('mongo'))
    self.configdb = self.mongo.crawl
    self.jobconfigs = JobConfigs(self.configdb)
def GET(self):
    if setup_problems:
        web.header("content-type", "text/html")
        return self.render("error_setup", setup_problems)
    if db is None:
        web.header("content-type", "text/html")
        return ("MongoDB connection is not available."
                " Make sure mongodb is running at %s" % hqconfig.get("mongo"))
    errors = None
    try:
        jobs = jobconfigs.get_alljobs()
    except Exception, ex:
        errors = [str(ex)]
        jobs = []
except ImportError, ex:
    setup_problems.append(ex)

import hqconfig
try:
    import urihash
    from mongocrawlinfo import CrawlInfo
except ImportError, ex:
    setup_problems.append(ex)
from mongojobconfigs import JobConfigs
from weblib import BaseApp
from zkcoord import Coordinator

try:
    mongo = pymongo.Connection(host=hqconfig.get("mongo"))
    db = mongo.crawl
except:
    mongo = None
    db = None
coord = Coordinator(hqconfig.get("zkhosts"))
jobconfigs = JobConfigs(db)

urls = ("/?", "Status",
        "/q/(.*)", "Query")
app = web.application(urls, globals())

class Status(BaseApp):
    """implements control web user interface for crawl headquarters"""
def coordinator():
    from zkcoord import Coordinator
    return Coordinator(hqconfig.get('zkhosts'))
import pymongo
import json
import time
import re
from urlparse import urlsplit, urlunsplit
import atexit
import logging

import urihash
from weblib import BaseApp
from mongocrawlinfo import CrawlInfo
from zkcoord import Coordinator
import hqconfig

try:
    mongo = pymongo.Connection(hqconfig.get('mongo'))
    db = mongo.crawl
except:
    mongo = None
    db = None

urls = (
    '/?', 'Status',
    '/q/(.*)', 'Query'
    )
app = web.application(urls, globals())

class Status(BaseApp):
    '''implements control web user interface for crawl headquarters'''
    def GET(self):
        if db is None:
        try:
            jobconfigs.add_job_server(job, server)
            r = dict(p, success=1)
        except Exception, ex:
            r = dict(p, success=0, error=str(ex))
        return json.dumps(r)

class Static:
    """fallback static files handler. so as to make status page work
    even without static files serving configuration at container level.
    """
    STATICDIR = os.path.abspath(os.path.join(os.path.dirname(__file__),
                                             '../static'))
    def GET(self, path):
        apath = os.path.join(self.STATICDIR, path)
        if not os.path.isfile(apath):
            raise web.notfound(path)
        return open(apath)

if __name__ == '__main__':
    logging.basicConfig(filename='/tmp/status.log', level=logging.INFO)
    try:
        app.run()
    except Exception as ex:
        logging.critical('app.run() terminated with error', exc_info=1)
else:
    # for debugging
    web.config.debug = hqconfig.get('web')['debug']
    application = app.wsgifunc()