Example #1 (score: 0) — file: hq.py, project: travisfw/hq
    def __init__(self, jobconfigs, jobname, crawlinfo, domaininfo):
        """Initialize per-job crawl state: mapper, scheduler, incoming queue, diverter.

        :param jobconfigs: collection of job configurations (stored as-is).
        :param jobname: name of this crawl job; used to derive on-disk directories.
        :param crawlinfo: crawl-info database handle (kept as self.crawlinfodb).
        :param domaininfo: domain-info lookup object (kept as-is).
        """
        self.jobconfigs = jobconfigs
        self.jobname = jobname
        # NWORKSETS_BITS is presumably a class-level constant defined outside
        # this view; it determines how many worksets the mapper partitions into.
        self.mapper = CrawlMapper(self, self.NWORKSETS_BITS)
        # one state slot per workset, all initially 0
        self.workset_state = [0 for i in range(self.mapper.nworksets)]

        # seen-db initialization is delayed until it's actually needed
        self.seen = None
        #self.seen = Seen(dbdir=os.path.join(HQ_HOME, 'seen', self.jobname))
        self.crawlinfodb = crawlinfo
        self.domaininfo = domaininfo
        # scheduler works off the per-job workset directory
        self.scheduler = Scheduler(hqconfig.worksetdir(self.jobname),
                                   self.mapper)
        # self.inq = HashSplitIncomingQueue(
        #     qdir=hqconfig.inqdir(self.jobname),
        #     buffsize=500)
        self.inq = PooledIncomingQueue(
            qdir=hqconfig.inqdir(self.jobname),
            buffsize=1000)

        self.diverter = Diverter(self.jobname, self.mapper)

        #self.discovered_executor = ThreadPoolExecutor(poolsize=1)

        # currently disabled by default - too slow
        self.use_crawlinfo = False
        self.save_crawlinfo = False
Example #2 (score: 0)
    def __init__(self, job, dispatcher_type, maxn):
        """Set up processing of a job's incoming queue through a dispatcher.

        :param job: name of the crawl job to process.
        :param dispatcher_type: type key forwarded to build_dispatcher().
        :param maxn: stored as self.maxn; used elsewhere (not visible in this view).
        """
        self.job = job
        self.maxn = maxn

        # shared objects obtained from the configuration factory
        self.domaininfo = hqconfig.factory.domaininfo()
        self.jobconfigs = hqconfig.factory.jobconfigs()
        self.coordinator = hqconfig.factory.coordinator()

        # per-job objects
        # TODO: process multiple jobs in one process
        self.mapper = CrawlMapper(CrawlJob(self.job, self.jobconfigs),
                                  hqconfig.NWORKSETS_BITS)
        # reading=False: this side only writes to worksets
        self.scheduler = Scheduler(hqconfig.worksetdir(self.job),
                                   self.mapper, reading=False)
        self.inqueue = IncomingQueue(hqconfig.inqdir(self.job),
                                     deq=PriorityDequeue)
        self.dispatcher = build_dispatcher(dispatcher_type,
                                           self.domaininfo, self.job,
                                           mapper=self.mapper,
                                           scheduler=self.scheduler,
                                           inqueue=self.inqueue)

        # os.uname() exists only on Unix; the inotify-style queue watcher is
        # set up on Linux only, as a class-level singleton shared by instances.
        if os.uname()[0] == 'Linux':
            if InqueueProcessor.inqwatcher is None:
                iqw = InqueueProcessor.inqwatcher = InqueueWatcher()
                iqw.start()
            self.watch = InqueueProcessor.inqwatcher.addwatch(self.inqueue.qdir)
Example #3 (score: 0) — file: hq.py, project: kngenie/crawlhq
    def __init__(self, hq, jobname):
        """Per-job handle: wires up mapper, scheduler and the incoming queue.

        :param hq: parent HQ object providing shared jobconfigs.
        :param jobname: name of this crawl job; used to derive on-disk dirs.
        """
        self.hq = hq
        self.jobconfigs = self.hq.jobconfigs
        self.jobname = jobname
        self.mapper = CrawlMapper(self, hqconfig.NWORKSETS_BITS)
        self.scheduler = Scheduler(hqconfig.worksetdir(self.jobname),
                                   self.mapper)

        # when inq.sort is on (the default), dequeue through a
        # fingerprint-sorting file reader
        readsorted = hqconfig.getint('inq.sort', 1)

        def make_enq(qdir, **kwargs):
            # enqueue side: plain priority enqueue
            return PriorityEnqueue(qdir, **kwargs)

        def make_deq(qdir, **kwargs):
            # dequeue side: optionally plug in the sorting reader
            if readsorted:
                kwargs['reader'] = FPSortingQueueFileReader
            return PriorityDequeue(qdir, **kwargs)

        self.inq = IncomingQueue(hqconfig.inqdir(self.jobname),
                                 enq=make_enq, deq=make_deq,
                                 buffsize=1000)

        # dispatcher mode is configurable per job; defaults to 'internal'
        self._dispatcher_mode = hqconfig.get(
            ('jobs', self.jobname, 'dispatcher'), 'internal')
        # dispatcher is created lazily, not here
        self.dispatcher = None

        # currently disabled by default - too slow
        self.use_crawlinfo = False
        self.save_crawlinfo = False

        self.last_inq_count = 0

        self.addedcount = 0
        self.processedcount = 0
Example #4 (score: 0) — file: hq.py, project: travisfw/crawlhq
    def __init__(self, hq, jobname):
        """Initialize per-job state: mapper, scheduler and pooled incoming queue.

        :param hq: parent HQ object providing shared jobconfigs.
        :param jobname: name of this crawl job; used to derive on-disk directories.
        """
        self.hq = hq
        self.jobconfigs = self.hq.jobconfigs
        self.jobname = jobname
        self.mapper = CrawlMapper(self, hqconfig.NWORKSETS_BITS)
        self.scheduler = Scheduler(hqconfig.worksetdir(self.jobname),
                                   self.mapper)

        self.inq = PooledIncomingQueue(
            qdir=hqconfig.inqdir(self.jobname),
            buffsize=1000)

        # dispatcher mode is configurable per job; defaults to 'internal'
        self._dispatcher_mode = hqconfig.get(
            ('jobs', self.jobname, 'dispatcher'), 'internal')

        # dispatcher is created lazily (init_dispatcher is not called here)
        self.dispatcher = None
        #self.init_dispatcher()

        # currently disabled by default - too slow
        self.use_crawlinfo = False
        self.save_crawlinfo = False

        self.last_inq_count = 0
Example #5 (score: 0) — file: __init__.py, project: kngenie/crawlhq
 def wsdir(self, job, wsid=None):
     """Return the workset directory for *job*.

     :param job: job name whose workset directory is wanted.
     :param wsid: optional workset id; when given, return that workset's
         numbered subdirectory instead of the job's base directory.
     """
     base = hqconfig.worksetdir(job)
     return base if wsid is None else os.path.join(base, '%d' % wsid)
Example #6 (score: 0) — file: dispatcher.py, project: travisfw/hq
 def __init__(self, job, mapper):
     """Create one WorksetWriter per workset of *job*.

     :param job: job name; stored as self.jobname and used to locate the
         on-disk workset directory.
     :param mapper: mapper object providing nworksets, the number of
         worksets to create writers for.
     """
     self.jobname = job
     self.mapper = mapper
     wsdir = hqconfig.worksetdir(self.jobname)
     # range instead of xrange: xrange was removed in Python 3, and range
     # iterates identically here (the comprehension consumes it once).
     self.worksets = [WorksetWriter(wsdir, wsid)
                      for wsid in range(self.mapper.nworksets)]