def __init__(self, jobconfigs, jobname, crawlinfo, domaininfo):
    """Set up per-job crawl state: mapper, scheduler, incoming queue, diverter.

    :param jobconfigs: shared job-configuration store for all jobs
    :param jobname: name of the crawl job this object manages
    :param crawlinfo: crawl-info database, stored as ``self.crawlinfodb``
    :param domaininfo: per-domain information store
    """
    self.jobconfigs = jobconfigs
    self.jobname = jobname
    # NOTE(review): NWORKSETS_BITS is a class attribute defined outside this
    # chunk; the mapper partitions URLs into 2**NWORKSETS_BITS worksets.
    self.mapper = CrawlMapper(self, self.NWORKSETS_BITS)
    # one state slot per workset managed by the mapper
    self.workset_state = [0 for i in range(self.mapper.nworksets)]
    # seen-db initialization is delayed until it's actually needed
    self.seen = None
    #self.seen = Seen(dbdir=os.path.join(HQ_HOME, 'seen', self.jobname))
    self.crawlinfodb = crawlinfo
    self.domaininfo = domaininfo
    self.scheduler = Scheduler(hqconfig.worksetdir(self.jobname), self.mapper)
    # previously a hash-splitting queue was used; kept for reference
    # self.inq = HashSplitIncomingQueue(
    #     qdir=hqconfig.inqdir(self.jobname),
    #     buffsize=500)
    self.inq = PooledIncomingQueue(
        qdir=hqconfig.inqdir(self.jobname),
        buffsize=1000)
    self.diverter = Diverter(self.jobname, self.mapper)
    #self.discovered_executor = ThreadPoolExecutor(poolsize=1)
    # currently disabled by default - too slow
    self.use_crawlinfo = False
    self.save_crawlinfo = False
def __init__(self, job, dispatcher_type, maxn):
    """Set up an incoming-queue processor for one job.

    Builds per-job mapper/scheduler/queue objects from the hqconfig
    factory and, on Linux, attaches an InqueueWatcher (shared across
    instances via the ``InqueueProcessor.inqwatcher`` class attribute)
    to the job's incoming-queue directory.

    :param job: name of the job to process
    :param dispatcher_type: key passed to build_dispatcher to select the
        dispatcher implementation
    :param maxn: stored as ``self.maxn``; upper bound used elsewhere
        (exact semantics not visible in this chunk)
    """
    self.job = job
    self.maxn = maxn
    self.domaininfo = hqconfig.factory.domaininfo()
    self.jobconfigs = hqconfig.factory.jobconfigs()
    self.coordinator = hqconfig.factory.coordinator()
    # per-job objects
    # TODO: process multiple jobs in one process
    self.mapper = CrawlMapper(CrawlJob(self.job, self.jobconfigs),
                              hqconfig.NWORKSETS_BITS)
    self.scheduler = Scheduler(hqconfig.worksetdir(self.job), self.mapper,
                               reading=False)
    self.inqueue = IncomingQueue(hqconfig.inqdir(self.job),
                                 deq=PriorityDequeue)
    self.dispatcher = build_dispatcher(dispatcher_type, self.domaininfo,
                                       self.job, mapper=self.mapper,
                                       scheduler=self.scheduler,
                                       inqueue=self.inqueue)
    # queue-directory watching is only set up on Linux (presumably the
    # watcher is inotify-based — confirm against InqueueWatcher)
    if os.uname()[0] == 'Linux':
        if InqueueProcessor.inqwatcher is None:
            # lazily create the single watcher shared by all instances
            iqw = InqueueProcessor.inqwatcher = InqueueWatcher()
            iqw.start()
        self.watch = InqueueProcessor.inqwatcher.addwatch(self.inqueue.qdir)
def __init__(self, hq, jobname):
    """Set up per-job state: mapper, scheduler and a priority incoming queue.

    :param hq: parent HQ object; its ``jobconfigs`` is shared by this job
    :param jobname: name of the crawl job this object manages
    """
    self.hq = hq
    self.jobconfigs = self.hq.jobconfigs
    self.jobname = jobname
    self.mapper = CrawlMapper(self, hqconfig.NWORKSETS_BITS)
    self.scheduler = Scheduler(hqconfig.worksetdir(self.jobname), self.mapper)
    # 'inq.sort' config flag (default on) selects a reader that returns
    # queue-file entries sorted (FPSortingQueueFileReader) on dequeue
    readsorted = hqconfig.getint('inq.sort', 1)
    inqdir = hqconfig.inqdir(self.jobname)
    # factory callables handed to IncomingQueue so it can create its
    # enqueue/dequeue sides; deqfactory closes over readsorted above
    def enqfactory(qdir, **kwargs):
        return PriorityEnqueue(qdir, **kwargs)
    def deqfactory(qdir, **kwargs):
        if readsorted:
            kwargs.update(reader=FPSortingQueueFileReader)
        return PriorityDequeue(qdir, **kwargs)
    self.inq = IncomingQueue(inqdir, enq=enqfactory, deq=deqfactory,
                             buffsize=1000)
    # older direct construction of the two queue sides, kept for reference
    # self.eninq = PriorityEnqueue(
    #     qdir=hqconfig.inqdir(self.jobname),
    #     buffer=1000)
    # deinqargs = {}
    # if readsorted:
    #     deinqargs['reader'] = FPSortingQueueFileReader
    # self.deinq = PriorityDequeue(qdir=hqconfig.inqdir(self.jobname),
    #                              **deinqargs)
    self._dispatcher_mode = hqconfig.get(
        ('jobs', self.jobname, 'dispatcher'), 'internal')
    # dispatcher is created lazily elsewhere; see init_dispatcher
    self.dispatcher = None
    #self.init_dispatcher()
    # currently disabled by default - too slow
    self.use_crawlinfo = False
    self.save_crawlinfo = False
    # simple counters updated elsewhere in the class
    self.last_inq_count = 0
    self.addedcount = 0
    self.processedcount = 0
def __init__(self, hq, jobname):
    """Set up per-job state: mapper, scheduler and pooled incoming queue.

    :param hq: parent HQ object; its ``jobconfigs`` is shared by this job
    :param jobname: name of the crawl job this object manages
    """
    self.hq = hq
    self.jobconfigs = self.hq.jobconfigs
    self.jobname = jobname
    self.mapper = CrawlMapper(self, hqconfig.NWORKSETS_BITS)
    self.scheduler = Scheduler(hqconfig.worksetdir(self.jobname), self.mapper)
    self.inq = PooledIncomingQueue(
        qdir=hqconfig.inqdir(self.jobname),
        buffsize=1000)
    self._dispatcher_mode = hqconfig.get(
        ('jobs', self.jobname, 'dispatcher'), 'internal')
    # dispatcher is created lazily elsewhere; see init_dispatcher
    self.dispatcher = None
    #self.init_dispatcher()
    # currently disabled by default - too slow
    self.use_crawlinfo = False
    self.save_crawlinfo = False
    self.last_inq_count = 0
def wsdir(self, job, wsid=None):
    """Return the workset directory for *job*.

    :param job: job name whose workset directory is wanted
    :param wsid: optional workset id; when given, the per-workset
        subdirectory (named by the decimal id) is returned instead
    """
    base = hqconfig.worksetdir(job)
    if wsid is not None:
        return os.path.join(base, '%d' % wsid)
    return base
def __init__(self, job, mapper):
    """Create one WorksetWriter per workset of *job*.

    :param job: job name, stored as ``self.jobname``; determines the
        workset directory via hqconfig
    :param mapper: mapper whose ``nworksets`` gives the workset count
    """
    self.jobname = job
    self.mapper = mapper
    base = hqconfig.worksetdir(self.jobname)
    self.worksets = []
    for wsid in xrange(self.mapper.nworksets):
        self.worksets.append(WorksetWriter(base, wsid))