예제 #1
0
    def restart(self):
        """Start the controller and tracker threads from a restored state.

        Alternate start-up path used when resuming from a previously
        restored state: a new controller is created and started, the base
        tracker is started, and then every tracker in ``self._trackers``
        except the first one is started.
        """

        # Imported at call time, not at module level (kept as in the
        # original code).
        import datamgr

        # Start harvestman controller thread
        self._controller = datamgr.harvestManController()
        self._controller.start()

        # Start the base tracker and pause before starting the others.
        self._basetracker.start()
        time.sleep(2.0)

        # The first entry of self._trackers is skipped -- presumably it is
        # the base tracker started above (TODO confirm).
        for t in self._trackers[1:]:
            try:
                t.start()
            except AssertionError as e:
                # A tracker that refuses to start is logged and skipped so
                # the remaining trackers still get started.
                logconsole(e)
예제 #2
0
    def restart(self):
        """Bring the crawler threads back up from a restored state.

        Alternate method to start from a previous restored state. Creates
        and starts a fresh controller, starts the base tracker, waits two
        seconds, then starts each tracker in ``self._trackers`` after the
        first.
        """

        # Function-scope import, preserved from the original code.
        import datamgr

        # Start harvestman controller thread
        self._controller = datamgr.harvestManController()
        self._controller.start()

        # Start base tracker, then pause before launching the rest.
        self._basetracker.start()
        time.sleep(2.0)

        # self._trackers[0] is not started here -- presumably because it is
        # the base tracker already started above (TODO confirm).
        for t in self._trackers[1:]:
            try:
                t.start()
            except AssertionError as e:
                # Log the failure and carry on with the other trackers.
                logconsole(e)
예제 #3
0
    def crawl(self):
        """Start crawling for this project.

        Resets the completion flag, records the start time on the config
        object and seeds the crawl with the base URL: it is pushed locally
        when no url server is configured, otherwise its index is sent to
        the url server after a "flush" command.

        In fast mode the controller and tracker threads are started,
        ``mainloop()`` is run, the flag is set to 1 and the threads are
        stopped. Otherwise the base tracker's ``action()`` is called
        directly.
        """

        # Reset flag
        self._flag = 0

        # Clear the event flag
        # self.exitobj.clear()

        # NOTE(review): time.clock() was removed in Python 3.8. It is left
        # untouched here because code outside this method may compute the
        # elapsed time with the same clock -- confirm before changing.
        if os.name == 'nt':
            t1 = time.clock()
        else:
            t1 = time.time()

        # Set start time on config object
        self._configobj.starttime = t1

        if not self._configobj.urlserver:
            self.push(self._baseUrlObj, 'crawler')
        else:
            try:
                # Flush url server of any previous urls by
                # sending a flush command.
                send_url("flush", self._configobj.urlhost,
                         self._configobj.urlport)
                send_url(str(self._baseUrlObj.index), self._configobj.urlhost,
                         self._configobj.urlport)
            except Exception:
                # Best effort: a failure to reach the url server is
                # ignored, but KeyboardInterrupt/SystemExit are no longer
                # swallowed as they were by the former bare ``except``.
                pass

        if self._configobj.fastmode:

            # Start harvestman controller thread
            import datamgr

            self._controller = datamgr.harvestManController()
            self._controller.start()

            # Start the base tracker as a daemon and poll until its status
            # becomes 0 before launching the remaining trackers.
            self._basetracker.setDaemon(True)
            self._basetracker.start()

            while self._basetracker.get_status() != 0:
                time.sleep(0.1)

            # Pre-launch the remaining trackers up to the number specified
            # in the config (index 0 is not created here).
            for x in range(1, self._configobj.maxtrackers):

                # Back to equality among threads: even indices become
                # fetchers, odd indices become crawlers.
                if x % 2 == 0:
                    t = crawler.HarvestManUrlFetcher(x, None)
                else:
                    t = crawler.HarvestManUrlCrawler(x, None)

                self.add_tracker(t)
                t.setDaemon(True)
                t.start()

            # Tally the trackers by role.
            for t in self._trackers:
                role = t.get_role()
                if role == 'fetcher':
                    self._numfetchers += 1
                elif role == 'crawler':
                    self._numcrawlers += 1

            # bug: give the threads some time to start,
            # otherwise we exit immediately sometimes.
            time.sleep(2.0)

            self.mainloop()

            # Set flag to 1 to denote that downloading is finished.
            self._flag = 1

            self.stop_threads(noexit=True)
        else:
            self._basetracker.action()
예제 #4
0
    def crawl(self):
        """Start crawling for this project.

        Clears the completion flag, stores the start time on the config
        object and hands the base URL to the crawl: pushed locally when no
        url server is configured, otherwise sent (by index) to the url
        server after a "flush" command.

        With ``fastmode`` set, the controller and the tracker threads are
        launched, ``mainloop()`` runs, the flag is set to 1 and the threads
        are stopped. Without it, the base tracker's ``action()`` is called
        directly.
        """

        # Reset flag
        self._flag = 0

        # Clear the event flag
        # self.exitobj.clear()

        # NOTE(review): time.clock() no longer exists on Python 3.8+. Kept
        # as-is because other code may measure elapsed time against the
        # same clock -- confirm before replacing it.
        if os.name == 'nt':
            t1 = time.clock()
        else:
            t1 = time.time()

        # Set start time on config object
        self._configobj.starttime = t1

        if not self._configobj.urlserver:
            self.push(self._baseUrlObj, 'crawler')
        else:
            try:
                # Flush url server of any previous urls by
                # sending a flush command.
                send_url("flush", self._configobj.urlhost,
                         self._configobj.urlport)
                send_url(str(self._baseUrlObj.index),
                         self._configobj.urlhost,
                         self._configobj.urlport)
            except Exception:
                # Deliberately best effort: url server errors are ignored.
                # Unlike the former bare ``except``, this lets
                # KeyboardInterrupt and SystemExit propagate.
                pass

        if self._configobj.fastmode:

            # Start harvestman controller thread
            import datamgr

            self._controller = datamgr.harvestManController()
            self._controller.start()

            # Launch the base tracker as a daemon and wait until its
            # status reads 0 before creating the other trackers.
            self._basetracker.setDaemon(True)
            self._basetracker.start()

            while self._basetracker.get_status() != 0:
                time.sleep(0.1)

            # Pre-launch the remaining trackers, up to the count given in
            # the config (index 0 is not created in this loop).
            for x in range(1, self._configobj.maxtrackers):

                # Back to equality among threads: fetchers on even
                # indices, crawlers on odd ones.
                if x % 2 == 0:
                    t = crawler.HarvestManUrlFetcher(x, None)
                else:
                    t = crawler.HarvestManUrlCrawler(x, None)

                self.add_tracker(t)
                t.setDaemon(True)
                t.start()

            # Count how many trackers hold each role.
            for t in self._trackers:
                role = t.get_role()
                if role == 'fetcher':
                    self._numfetchers += 1
                elif role == 'crawler':
                    self._numcrawlers += 1

            # bug: give the threads some time to start,
            # otherwise we exit immediately sometimes.
            time.sleep(2.0)

            self.mainloop()

            # Set flag to 1 to denote that downloading is finished.
            self._flag = 1

            self.stop_threads(noexit=True)
        else:
            self._basetracker.action()