Example #1
    def configure(self):
        """ Configure this class with this config object """

        # Import before the try block so the except clause below can
        # name HarvestManUrlParserError even if the import fails.
        import urlparser

        try:
            # urlparser.HarvestManUrlParser.reset_IDX()

            self._baseUrlObj = urlparser.HarvestManUrlParser(
                self._configobj.url, 'normal', 0, self._configobj.url,
                self._configobj.projdir)
            # Register the start URL object in the global registry
            SetUrlObject(self._baseUrlObj)
        except urlparser.HarvestManUrlParserError:
            return False

        self._baseUrlObj.starturl = True

        if self._configobj.fastmode:
            self._basetracker = crawler.HarvestManUrlFetcher(
                0, self._baseUrlObj, True)
        else:
            # Disable usethreads
            self._configobj.usethreads = False
            # Disable blocking
            self._configobj.blocking = False
            self._basetracker = crawler.HarvestManUrlDownloader(
                0, self._baseUrlObj, False)

        self._trackers.append(self._basetracker)
        return True
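
The configure() method above follows a fail-fast idiom: build the start-URL object, register it, and return False rather than raising if the URL cannot be parsed, so the caller can abort the project cleanly. A minimal self-contained sketch of that idiom; UrlParseError, make_url_object and Spider are hypothetical stand-ins, not HarvestMan's real API:

class UrlParseError(Exception):
    pass

def make_url_object(url):
    # Reject anything that is not an absolute URL
    if '://' not in url:
        raise UrlParseError('not an absolute URL: %r' % url)
    return {'url': url, 'starturl': False}

class Spider:
    def __init__(self, config):
        self._configobj = config
        self._baseUrlObj = None

    def configure(self):
        try:
            self._baseUrlObj = make_url_object(self._configobj['url'])
        except UrlParseError:
            # Report failure to the caller instead of propagating
            return False
        self._baseUrlObj['starturl'] = True
        return True

assert Spider({'url': 'http://example.com/'}).configure()
assert not Spider({'url': 'no-scheme'}).configure()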
Example #2
    def configure(self):
        """ Configure this class with this config object """

        # Import before the try block so the except clause below can
        # name HarvestManUrlParserError even if the import fails.
        import urlparser

        try:

            self._baseUrlObj = urlparser.HarvestManUrlParser(
                self._configobj.url, urltypes.TYPE_ANY, 0, self._configobj.url,
                self._configobj.projdir)
            # Register the start URL with the global data manager
            dmgr = GetObject('datamanager')
            dmgr.add_url(self._baseUrlObj)

        except urlparser.HarvestManUrlParserError:
            return False

        self._baseUrlObj.starturl = True

        if self._configobj.fastmode:
            self._basetracker = crawler.HarvestManUrlFetcher(
                0, self._baseUrlObj, True)
        else:
            # Disable usethreads
            self._configobj.usethreads = False
            # Disable blocking
            self._configobj.blocking = False
            self._basetracker = crawler.HarvestManUrlDownloader(
                0, self._baseUrlObj, False)

        self._trackers.append(self._basetracker)
        return True
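
Example #2 differs from Example #1 mainly in how the start URL is published: instead of SetUrlObject, it hands the object to a data manager looked up through GetObject. A rough sketch of such a name-to-singleton registry, assuming a module-level dictionary; HarvestMan's real implementation may differ:

_objects = {}

def SetObject(name, obj):
    _objects[name] = obj

def GetObject(name):
    return _objects[name]

class DataManager:
    # Illustrative stand-in for HarvestMan's data manager
    def __init__(self):
        self.urls = []

    def add_url(self, urlobj):
        self.urls.append(urlobj)

SetObject('datamanager', DataManager())
GetObject('datamanager').add_url('http://example.com/')
assert GetObject('datamanager').urls == ['http://example.com/']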
Example #3
    def set_state(self, state):
        """ Set state to a previous saved state """

        # Get base url object
        self._baseUrlObj = state.get('_baseUrlObj')
        # If base url object is None, we cannot proceed
        # so return -1
        if self._baseUrlObj is None:
            return -1

        # Set state for simple data-members
        self._pushes = state.get('_pushes', 0)
        self._lockedinst = state.get('_lockedinst', 0)
        self._lasttimestamp = state.get('_lasttimestamp', time.time())
        self._requests = state.get('_requests', 0)
        self._lastblockedtime = state.get('_lastblockedtime', 0)
        self.buffer = state.get('buffer', [])

        # Set state for queues
        self.url_q.queue = state.get('url_q', MyDeque())
        self.data_q.queue = state.get('data_q', MyDeque())

        # If both queues are empty, we don't have anything to do
        if len(self.url_q.queue) == 0 and len(self.data_q.queue) == 0:
            moreinfo('Data/url queues are empty, nothing to re-run')
            return -1

        cfg = GetObject('config')
        self._configobj = cfg

        if cfg.fastmode:
            # Create threads and set their state
            for idx, tdict in state.get('threadinfo', {}).items():
                role = tdict.get('role')
                t = None

                if role == 'fetcher':
                    t = crawler.HarvestManUrlFetcher(idx, None)
                    self._numfetchers += 1
                elif role == 'crawler':
                    t = crawler.HarvestManUrlCrawler(idx, None)
                    t.links = tdict.get('links')
                    self._numcrawlers += 1

                if t:
                    t._status = tdict.get('_status')
                    t._loops = tdict.get('_loops')
                    t._url = tdict.get('_url')
                    t._urlobject = tdict.get('_urlobject')
                    t.buffer = tdict.get('buffer')
                    if t._urlobject:
                        t._resuming = True

                    self.add_tracker(t)
                    t.setDaemon(True)

            # Set base tracker, guarding against an empty tracker list
            if self._trackers:
                self._basetracker = self._trackers[0]
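
set_state() above restores a crawler from a checkpoint dictionary, using dict.get with defaults so that a checkpoint missing a key still loads. A compact sketch of the matching save/restore round trip; Tracker and its fields are illustrative, not HarvestMan's:

import time

class Tracker:
    # Illustrative stand-in for a checkpointable crawler object
    def __init__(self):
        self._pushes = 0
        self._lasttimestamp = time.time()
        self.buffer = []

    def get_state(self):
        # Snapshot the simple data members into a plain dict
        return {'_pushes': self._pushes,
                '_lasttimestamp': self._lasttimestamp,
                'buffer': list(self.buffer)}

    def set_state(self, state):
        # dict.get with a default keeps restore tolerant of
        # checkpoints written by older versions
        self._pushes = state.get('_pushes', 0)
        self._lasttimestamp = state.get('_lasttimestamp', time.time())
        self.buffer = state.get('buffer', [])

old = Tracker()
old._pushes = 5
snapshot = old.get_state()

new = Tracker()
new.set_state(snapshot)
assert new._pushes == 5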
Example #4
    def dead_thread_callback(self, t):
        """ Call back function called by a thread if it
        dies with an exception. This class then creates
        a fresh thread, migrates the data of the dead
        thread to it """

        # Acquire before the try block; otherwise a failed acquire()
        # would make the finally clause release an unheld lock.
        self._cond.acquire()
        try:
            # First find out the dead thread's role
            role = t.get_role()
            new_t = None

            if role == 'fetcher':
                new_t = crawler.HarvestManUrlFetcher(t.get_index(), None)
            elif role == 'crawler':
                new_t = crawler.HarvestManUrlCrawler(t.get_index(), None)

            # Migrate data and start thread
            if new_t:
                new_t._url = t._url
                new_t._urlobject = t._urlobject
                new_t.buffer = copy.deepcopy(t.buffer)
                # If this is a crawler get links also
                if role == 'crawler':
                    new_t.links = t.links[:]

                # Replace dead thread in the list
                idx = self._trackers.index(t)
                self._trackers[idx] = new_t
                new_t._resuming = True
                new_t.start()
                # Give the replacement thread a moment to come up
                time.sleep(2.0)

                return 0
            else:
                # Could not make new thread, so decrement
                # count of threads.
                # Remove from tracker list
                self._trackers.remove(t)

                if role == 'fetcher':
                    self._numfetchers -= 1
                elif role == 'crawler':
                    self._numcrawlers -= 1

                return -1
        finally:
            self._cond.release()
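
dead_thread_callback() swaps a crashed tracker for a fresh one under a lock and migrates its data across. A self-contained sketch of the same replace-on-death idea built on plain threading primitives; Worker and Manager are illustrative, not HarvestMan classes:

import copy
import threading

class Worker(threading.Thread):
    def __init__(self, index, manager):
        super().__init__(daemon=True)
        self.index = index
        self.manager = manager
        self.buffer = []

    def run(self):
        try:
            self.work()
        except Exception:
            # Report our own death so the manager can replace us
            self.manager.dead_thread_callback(self)

    def work(self):
        raise RuntimeError('simulated crash')

class Manager:
    def __init__(self):
        self._lock = threading.Lock()
        self.workers = []

    def dead_thread_callback(self, t):
        with self._lock:
            new_t = Worker(t.index, self)
            # Migrate the dead thread's data to its replacement
            new_t.buffer = copy.deepcopy(t.buffer)
            self.workers[self.workers.index(t)] = new_t
            # A real replacement would also start new_t; omitted here
            # so the simulated crash does not replace itself forever

m = Manager()
w = Worker(0, m)
m.workers.append(w)
w.start()
w.join()
assert m.workers[0] is not w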
Example #5
    def crawl(self):
        """ Starts crawling for this project """

        # Reset flag
        self._flag = 0

        # Clear the event flag
        # self.exitobj.clear()

        if os.name == 'nt':
            # time.clock() was the high-resolution wall clock on
            # Windows in the Python 2 era
            t1 = time.clock()
        else:
            t1 = time.time()

        # Set start time on config object
        self._configobj.starttime = t1

        if not self._configobj.urlserver:
            self.push(self._baseUrlObj, 'crawler')
        else:
            try:
                # Flush url server of any previous urls by
                # sending a flush command.
                send_url("flush", self._configobj.urlhost,
                         self._configobj.urlport)
                send_url(str(self._baseUrlObj.index), self._configobj.urlhost,
                         self._configobj.urlport)
            except Exception:
                # The URL server may not be up yet; ignore and proceed
                pass

        if self._configobj.fastmode:

            # Start harvestman controller thread
            import datamgr

            self._controller = datamgr.harvestManController()
            self._controller.start()

            # Pre-launch the number of tracker threads specified
            # in the config file.

            # Start the base tracker first and wait for it to come up
            self._basetracker.setDaemon(True)
            self._basetracker.start()

            while self._basetracker.get_status() != 0:
                time.sleep(0.1)

            for x in range(1, self._configobj.maxtrackers):

                # Alternate roles by index parity: even indexes
                # fetch pages, odd indexes crawl parsed links
                if x % 2 == 0:
                    t = crawler.HarvestManUrlFetcher(x, None)
                else:
                    t = crawler.HarvestManUrlCrawler(x, None)

                self.add_tracker(t)
                t.setDaemon(True)
                t.start()

            for t in self._trackers:

                if t.get_role() == 'fetcher':
                    self._numfetchers += 1
                elif t.get_role() == 'crawler':
                    self._numcrawlers += 1

            # bug: give the threads some time to start,
            # otherwise we exit immediately sometimes.
            time.sleep(2.0)

            self.mainloop()

            # Set flag to 1 to denote that downloading is finished.
            self._flag = 1

            self.stop_threads(noexit=True)
        else:
            self._basetracker.action()
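
In fastmode, crawl() starts the base tracker, polls its status until it reports running, then pre-launches the remaining trackers with roles alternating by index parity. A stripped-down sketch of that launch sequence; Tracker, its status codes and the role names are assumptions, not HarvestMan's API:

import threading
import time

class Tracker(threading.Thread):
    # Assumed status convention: -1 not started, 0 running
    def __init__(self, index, role):
        super().__init__(daemon=True)
        self.index = index
        self.role = role
        self._status = -1

    def run(self):
        self._status = 0
        time.sleep(0.5)          # stand-in for real crawl work

    def get_status(self):
        return self._status

maxtrackers = 4
trackers = [Tracker(0, 'fetcher')]        # base tracker
trackers[0].start()
while trackers[0].get_status() != 0:      # wait until it is running
    time.sleep(0.1)

for x in range(1, maxtrackers):
    # Even indexes fetch pages, odd indexes crawl parsed links
    role = 'fetcher' if x % 2 == 0 else 'crawler'
    t = Tracker(x, role)
    trackers.append(t)
    t.start()

time.sleep(2.0)   # let the threads settle before entering the main loop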
Example #6
    def crawl(self):
        """ Starts crawling for this project """

        # Reset flag
        self._flag = 0

        t1 = time.time()

        # Set start time on config object
        self._configobj.starttime = t1

        self.push(self._baseUrlObj, 'crawler')

        if self._configobj.fastmode:

            # Start harvestman controller thread
            import datamgr

            self._controller = datamgr.HarvestManController()
            self._controller.start()

            # Pre-launch the number of tracker threads specified
            # in the config file.

            # Start the base tracker first and wait for it to come up
            self._basetracker.setDaemon(True)
            self._basetracker.start()

            while self._basetracker.get_status() != 0:
                time.sleep(0.1)

            for x in range(1, self._configobj.maxtrackers):

                # Alternate roles by index parity: even indexes
                # fetch pages, odd indexes crawl parsed links
                if x % 2 == 0:
                    t = crawler.HarvestManUrlFetcher(x, None)
                else:
                    t = crawler.HarvestManUrlCrawler(x, None)

                self.add_tracker(t)
                t.setDaemon(True)
                t.start()

            for t in self._trackers:

                if t.get_role() == 'fetcher':
                    self._numfetchers += 1
                elif t.get_role() == 'crawler':
                    self._numcrawlers += 1

            # bug: give the threads some time to start,
            # otherwise we exit immediately sometimes.
            time.sleep(2.0)

            self.mainloop()

            # Set flag to 1 to denote that downloading is finished.
            self._flag = 1

            self.stop_threads(noexit=True)
        else:
            self._basetracker.action()
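
Taken together, the examples imply a simple lifecycle: call configure() first, and crawl() only if it succeeded. A hedged sketch of such a driver; run_project is hypothetical, not HarvestMan's real entry point:

def run_project(spider):
    # Examples #1/#2: configure() returns False on a bad start URL
    if not spider.configure():
        raise SystemExit('configure() failed: could not parse start URL')
    # Examples #5/#6: crawl() blocks until the main loop finishes
    spider.crawl()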