def restart(self):
    """ Alternate method to start from a previous restored state.

    Starts the harvestman controller thread, then the base tracker,
    waits briefly for it to come up, and finally starts every
    remaining tracker thread. """

    # Start harvestman controller thread
    import datamgr

    self._controller = datamgr.harvestManController()
    self._controller.start()

    # Start base tracker
    self._basetracker.start()
    # Give the base tracker a head start before launching the rest
    # (mirrors the startup delay used in crawl()).
    time.sleep(2.0)

    for t in self._trackers[1:]:
        try:
            t.start()
        except AssertionError as e:
            # threading.Thread.start() raises AssertionError (RuntimeError
            # on newer Pythons) if the thread was already started; log it
            # and continue with the remaining trackers.
            logconsole(e)
def crawl(self):
    """ Starts crawling for this project.

    Records the start time on the config object, seeds the crawl with
    the base URL (directly or via the url server), and in fastmode
    launches the controller plus the configured number of tracker
    threads before entering the main loop. """

    # Reset flag
    self._flag = 0
    # Clear the event flag
    # self.exitobj.clear()

    if os.name == 'nt':
        t1 = time.clock()
    else:
        t1 = time.time()

    # Set start time on config object
    self._configobj.starttime = t1

    if not self._configobj.urlserver:
        self.push(self._baseUrlObj, 'crawler')
    else:
        try:
            # Flush url server of any previous urls by
            # sending a flush command, then seed it with the base url.
            send_url("flush", self._configobj.urlhost, self._configobj.urlport)
            send_url(str(self._baseUrlObj.index),
                     self._configobj.urlhost,
                     self._configobj.urlport)
        except Exception:
            # Best-effort: failure to reach the url server is not fatal
            # here. Narrowed from a bare except so SystemExit and
            # KeyboardInterrupt are not swallowed.
            pass

    if self._configobj.fastmode:
        # Start harvestman controller thread
        import datamgr
        self._controller = datamgr.harvestManController()
        self._controller.start()

        # Pre-launch the number of tracker threads specified
        # in the config file.
        self._basetracker.setDaemon(True)
        self._basetracker.start()

        # Wait for the base tracker to report that it has started
        while self._basetracker.get_status() != 0:
            time.sleep(0.1)

        for x in range(1, self._configobj.maxtrackers):
            # Back to equality among threads: alternate fetchers
            # and crawlers.
            if x % 2 == 0:
                t = crawler.HarvestManUrlFetcher(x, None)
            else:
                t = crawler.HarvestManUrlCrawler(x, None)

            self.add_tracker(t)
            t.setDaemon(True)
            t.start()

        for t in self._trackers:
            if t.get_role() == 'fetcher':
                self._numfetchers += 1
            elif t.get_role() == 'crawler':
                self._numcrawlers += 1

        # bug workaround: give the threads some time to start,
        # otherwise we exit immediately sometimes.
        time.sleep(2.0)

        self.mainloop()

        # Set flag to 1 to denote that downloading is finished.
        self._flag = 1

        self.stop_threads(noexit=True)
    else:
        self._basetracker.action()
def crawl(self):
    """ Starts crawling for this project.

    Records the start time on the config object, seeds the crawl with
    the base URL (directly or via the url server), and in fastmode
    launches the controller plus the configured number of tracker
    threads before entering the main loop. """

    # NOTE(review): this is a near-identical duplicate of an earlier
    # crawl() definition in this file; being defined later, this one
    # shadows it and the earlier one is dead code — consolidate.

    # Reset flag
    self._flag = 0
    # Clear the event flag
    # self.exitobj.clear()

    if os.name == 'nt':
        t1 = time.clock()
    else:
        t1 = time.time()

    # Set start time on config object
    self._configobj.starttime = t1

    if not self._configobj.urlserver:
        self.push(self._baseUrlObj, 'crawler')
    else:
        try:
            # Flush url server of any previous urls by
            # sending a flush command, then seed it with the base url.
            send_url("flush", self._configobj.urlhost, self._configobj.urlport)
            send_url(str(self._baseUrlObj.index),
                     self._configobj.urlhost,
                     self._configobj.urlport)
        except Exception:
            # Best-effort: failure to reach the url server is not fatal
            # here. Narrowed from a bare except so SystemExit and
            # KeyboardInterrupt are not swallowed.
            pass

    if self._configobj.fastmode:
        # Start harvestman controller thread
        import datamgr
        self._controller = datamgr.harvestManController()
        self._controller.start()

        # Pre-launch the number of tracker threads specified
        # in the config file.
        self._basetracker.setDaemon(True)
        self._basetracker.start()

        # Wait for the base tracker to report that it has started
        while self._basetracker.get_status() != 0:
            time.sleep(0.1)

        for x in range(1, self._configobj.maxtrackers):
            # Back to equality among threads: alternate fetchers
            # and crawlers.
            if x % 2 == 0:
                t = crawler.HarvestManUrlFetcher(x, None)
            else:
                t = crawler.HarvestManUrlCrawler(x, None)

            self.add_tracker(t)
            t.setDaemon(True)
            t.start()

        for t in self._trackers:
            if t.get_role() == 'fetcher':
                self._numfetchers += 1
            elif t.get_role() == 'crawler':
                self._numcrawlers += 1

        # bug workaround: give the threads some time to start,
        # otherwise we exit immediately sometimes.
        time.sleep(2.0)

        self.mainloop()

        # Set flag to 1 to denote that downloading is finished.
        self._flag = 1

        self.stop_threads(noexit=True)
    else:
        self._basetracker.action()