def _startBulba(self):
    """Run the Taras Bulba process, which owns the worker processes.

    A Taras Bulba spawns and kills worker processes on demand. The reason
    for killing workers is to work around potential memory leaks. Since a
    Bulba is forked from the main process early on, it has a very low RAM
    footprint, so re-forking the workers off a Bulba every so often makes
    sure their RAM usage is reset.

    Each pass of the outer loop:

    1. creates one compute worker per entry of ``self._cores`` and one I/O
       worker per entry of ``self._io_queues``, and starts them all;
    2. sets ``self._workers_started_event``;
    3. polls ``self._taras_restart_event`` in 5 second slices, checking for
       dead workers between slices (a dead worker triggers a shutdown);
    4. terminates the workers if ``self._termination_event`` is set, then
       joins every worker.

    The outer loop ends once ``self._taras_exit_event`` is set.

    Returns:
        None. Any exception is logged together with its traceback, after
        which the started/termination/exit events are all set.
    """
    try:
        Exceptions.disable_pdb_on_error()
        MyLogger.subprocess_id = "TB"
        # loop until the completion event is raised
        # at this stage the workers are dead (or not started)
        while not self._taras_exit_event.is_set():
            if self.verbose:
                print("(re)creating worker processes", file=log)
            # create the workers
            self._compute_workers = []
            self._io_workers = []
            for i, core in enumerate(self._cores):
                proc_id = "comp%02d" % i
                self._compute_workers.append(
                    multiprocessing.Process(
                        name=proc_id, target=self._start_worker,
                        args=(self, proc_id, [core], self._compute_queue,
                              self.pause_on_start)))
            for i, queue in enumerate(self._io_queues):
                proc_id = "io%02d" % i
                self._io_workers.append(
                    multiprocessing.Process(
                        name=proc_id, target=self._start_worker,
                        args=(self, proc_id, None, queue,
                              self.pause_on_start)))

            # start the workers
            if self.verbose:
                print("starting worker processes", file=log)
            worker_map = {}
            for proc in self._compute_workers + self._io_workers:
                proc.start()
                worker_map[proc.pid] = proc
            dead_workers = {}

            # set event to indicate workers are started
            self._workers_started_event.set()

            # go to sleep until we're told to do the whole thing again
            while not self._taras_restart_event.is_set():
                if self.verbose:
                    print("waiting for restart signal", file=log)
                try:
                    self._taras_restart_event.wait(5)
                    if self.verbose:
                        print("wait done", file=log)
                except KeyboardInterrupt:
                    # NOTE(review): the restart event is not set here, so
                    # this loop keeps polling until a restart is signalled
                    # or a dead worker is detected below -- confirm that
                    # this is the intended Ctrl+C behaviour.
                    print(ModColor.Str("Ctrl+C caught, exiting"), file=log)
                    self._termination_event.set()
                    self._taras_exit_event.set()
                # check for dead children, unless workers_started event has
                # been cleared by restartWorkers() (in which case they're
                # already going to be exiting)
                if self._workers_started_event.is_set():
                    for proc in worker_map.values():
                        if proc.is_alive():
                            continue
                        proc.join()
                        dead_workers[proc.pid] = proc
                        if proc.exitcode < 0:
                            # A negative exit code is the negated number of
                            # the signal that killed the process. Fall back
                            # to the raw number when the lookup table
                            # (assumed to be a dict) has no name for it, so
                            # that reporting never raises KeyError.
                            signum = -proc.exitcode
                            print(ModColor.Str(
                                "worker '%s' killed by signal %s" % (
                                    proc.name,
                                    SIGNALS_TO_NAMES_DICT.get(signum, signum))),
                                file=log)
                        else:
                            print(ModColor.Str(
                                "worker '%s' died with exit code %d" % (
                                    proc.name, proc.exitcode)),
                                file=log)
                    # if workers have died, initiate bailout
                    if dead_workers:
                        print(ModColor.Str(
                            "%d worker(s) have died. Initiating shutdown."
                            % len(dead_workers)), file=log)
                        self._taras_restart_event.set()  # to break out of loop
                        self._termination_event.set()
                        self._taras_exit_event.set()

            self._taras_restart_event.clear()
            if self._termination_event.is_set():
                if self.verbose:
                    print("terminating workers, since termination event is set",
                          file=log)
                for proc in worker_map.values():
                    if proc.is_alive():
                        proc.terminate()

            if self.verbose:
                print("reaping workers", file=log)
            # join processes
            # NOTE: join() is called without a timeout, so this waits for as
            # long as each worker takes to exit.
            for pid, proc in worker_map.items():
                if self.verbose:
                    print("reaping worker %d" % pid, file=log)
                proc.join()
                if self.verbose:
                    print("worker %d's immortal soul has been put to rest" % pid,
                          file=log)
            if self.verbose:
                print("all workers have been reaped", file=log)
        if self.verbose:
            print("exiting", file=log)
    except BaseException:
        # Deliberate catch-all (same reach as a bare ``except:``): whatever
        # went wrong, report it and raise all three events -- presumably so
        # that nothing stays blocked waiting on this process.
        print(ModColor.Str(
            "exception raised in Taras Bulba process, see below. This is a bug!"),
            file=log)
        print(traceback.format_exc(), file=log)
        self._workers_started_event.set()
        self._termination_event.set()
        self._taras_exit_event.set()
def _check_system_configuration(log):
    """Write advisory diagnostics about the host configuration to ``log``.

    Three settings are inspected: the size of ``/dev/shm`` relative to
    physical RAM, the ``vm.max_map_count`` sysctl, and the memory-lock
    resource limit. Nothing here is fatal: a setting that cannot be
    queried is reported as a warning and skipped.

    Args:
        log: file-like destination accepted by ``print(..., file=log)``.
    """
    print("Checking system configuration:", file=log)

    # check for SHM size
    try:
        ram_size = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES')
        shm_stats = os.statvfs('/dev/shm')
        shm_size = shm_stats.f_bsize * shm_stats.f_blocks
        shm_relsize = shm_size / float(ram_size)
        shm_avail = shm_stats.f_bsize * shm_stats.f_bavail / float(ram_size)
    except (OSError, ValueError, ZeroDivisionError):
        # an advisory check must not abort the run
        shm_relsize = None
        print(ModColor.Str(
            "WARNING: could not query RAM or /dev/shm size. "
            "Unable to check this setting."), file=log)
    if shm_relsize is not None:
        if shm_relsize < 0.6:
            shm_warning = (
                'WARNING: max shared memory size is only {:.0%} of total RAM size. '
                'This can cause problems for large imaging jobs. '
                'A setting of 90% is recommended for DDFacet and killMS. '
                'If your processes keep failing with SIGBUS or "bus error" messages, '
                'it is most likely for this reason. '
                'You can change the memory size by running '
                '$ sudo mount -o remount,size=90% /dev/shm '
                'To make the change permanent, edit /etc/default/tmpfs, '
                'and add a line saying "SHM_SIZE=90%". \n'
            ).format(shm_relsize)
            print(ModColor.Str(shm_warning), file=log)
        else:
            print(" Max shared memory size is {:.0%} of total RAM size; {:.0%} currently available".format(
                shm_relsize, shm_avail), file=log)

    # check vm.max_map_count; both running sysctl and parsing its output
    # are covered by the same guard
    try:
        output = subprocess.check_output(["/sbin/sysctl", "vm.max_map_count"],
                                         universal_newlines=True)
        max_map_count = int(output.strip().rsplit(" ", 1)[-1])
    except Exception:
        print(ModColor.Str(
            """WARNING: /sbin/sysctl vm.max_map_count failed. Unable to check this setting."""),
            file=log)
        max_map_count = None
    if max_map_count is not None:
        if max_map_count < 500000:
            map_warning = (
                'WARNING: sysctl vm.max_map_count = {}. '
                'This may be too little for large DDFacet and killMS jobs. '
                'If you get strange "file exists" errors on /dev/shm, '
                'them try to bribe, beg or threaten your friendly local sysadmin '
                'into setting vm.max_map_count=1000000 in /etc/sysctl.conf. '
            ).format(max_map_count)
            print(ModColor.Str(map_warning), file=log)
        else:
            print(" sysctl vm.max_map_count = {}".format(max_map_count), file=log)

    # check for memory lock limits
    import resource
    msoft, mhard = resource.getrlimit(resource.RLIMIT_MEMLOCK)
    # compare against RLIM_INFINITY rather than testing the sign: the
    # constant is not guaranteed to be negative on every platform
    if msoft != resource.RLIM_INFINITY or mhard != resource.RLIM_INFINITY:
        print(ModColor.Str(
            'WARNING: your system has a limit on memory locks configured. '
            'This may possibly slow down DDFacet performance. '
            'You can try removing the limit by running '
            '$ ulimit -l unlimited '
            'If this gives an "operation not permitted" error, '
            'you can try to bribe, beg or threaten your friendly local sysadmin into doing '
            '# echo "* - memlock unlimited" >> /etc/security/limits.conf '
        ), file=log)


def main(OP=None, messages=None):
    """Configure the run from the parsed options and execute the requested mode.

    Sets up the output directory, file logging, system-configuration
    checks, pdb-on-error handling, the random seed and the CPU count, then
    builds a ``ClassImagerDeconv`` and calls the entry point(s) selected by
    ``DicoConfig["Output"]["Mode"]``.

    Args:
        OP: options object exposing ``DicoConfig``, ``Print`` and
            ``ToParset``. When None it is loaded from ``SaveFile`` with
            ``MyPickle.Load``.
        messages: optional iterable of messages written to the log once
            logging is set up. None or empty means no messages (and no logo).

    Raises:
        Exceptions.UserInputError: if the output name or the MS is not set.
    """
    if OP is None:
        OP = MyPickle.Load(SaveFile)
        print("Using settings from %s, then command line."%SaveFile)

    DicoConfig = OP.DicoConfig

    ImageName = DicoConfig["Output"]["Name"]
    if not ImageName:
        raise Exceptions.UserInputError("--Output-Name not specified, can't continue.")
    if not DicoConfig["Data"]["MS"]:
        raise Exceptions.UserInputError("--Data-MS not specified, can't continue.")

    # create the output directory if it does not exist yet (including any
    # missing parents)
    dirname = os.path.dirname(ImageName)
    if dirname and not os.path.exists(dirname):
        try:
            os.makedirs(dirname)
        except OSError:
            # tolerate another process having created it in the meantime
            if not os.path.isdir(dirname):
                raise

    # setup logging
    logger.logToFile(ImageName + ".log", append=DicoConfig["Log"]["Append"])
    global log
    log = logger.getLogger("DDFacet")

    # disable colors and progressbars if requested
    ModColor.silent = SkyModel.Other.ModColor.silent = \
        progressbar.ProgressBar.silent = \
        DicoConfig["Log"]["Boring"]

    if messages:
        if not DicoConfig["Log"]["Boring"]:
            logo.print_logo()
        for msg in messages:
            print(msg, file=log)

    _check_system_configuration(log)

    if DicoConfig["Debug"]["Pdb"] == "always":
        print("--Debug-Pdb=always: unexpected errors will be dropped into pdb", file=log)
        Exceptions.enable_pdb_on_error(ModColor.Str(
            "DDFacet has encountered an unexpected error. Dropping you into pdb for a post-mortem.\n" +
            "(This is because you're running with --Debug-Pdb set to 'always'.)"))
    elif DicoConfig["Debug"]["Pdb"] == "auto" and not DicoConfig["Log"]["Boring"]:
        print("--Debug-Pdb=auto and not --Log-Boring: unexpected errors will be dropped into pdb", file=log)
        Exceptions.enable_pdb_on_error(ModColor.Str(
            "DDFacet has encountered an unexpected error. Dropping you into pdb for a post-mortem.\n" +
            "(This is because you're running with --Debug-Pdb set to 'auto' and --Log-Boring is off.)"))

    # print current options
    OP.Print(dest=log)

    # enable memory logging
    logger.enableMemoryLogging(DicoConfig["Log"]["Memory"])

    # get rid of old shm arrays from previous runs
    Multiprocessing.cleanupStaleShm()

    # initialize random seed from config if set, or else from system time
    if DicoConfig["Misc"]["RandomSeed"] is not None:
        DicoConfig["Misc"]["RandomSeed"] = int(DicoConfig["Misc"]["RandomSeed"])
        print("random seed=%d (explicit)" % DicoConfig["Misc"]["RandomSeed"], file=log)
    else:
        DicoConfig["Misc"]["RandomSeed"] = int(time.time())
        print("random seed=%d (automatic)" % DicoConfig["Misc"]["RandomSeed"], file=log)
    np.random.seed(DicoConfig["Misc"]["RandomSeed"])

    # init NCPU for different bits of parallelism
    ncpu = int(DicoConfig["Parallel"]["NCPU"] or psutil.cpu_count())
    DicoConfig["Parallel"]["NCPU"] = ncpu
    _pyArrays.pySetOMPNumThreads(ncpu)
    NpParallel.NCPU_global = ModFFTW.NCPU_global = ncpu
    numexpr.set_num_threads(ncpu)
    print("using up to %d CPUs for parallelism" % ncpu, file=log)

    # write parset
    OP.ToParset("%s.parset"%ImageName)

    Mode = DicoConfig["Output"]["Mode"]

    # init semaphores, as they're needed for weight calculation too
    ClassFacetMachine.ClassFacetMachine.setup_semaphores(DicoConfig)

    # data machine initialized for all cases except PSF-only mode
    # psf machine initialized for all cases except Predict-only mode
    Imager = ClassDeconvMachine.ClassImagerDeconv(
        GD=DicoConfig,
        BaseName=ImageName,
        predict_only=(Mode == "Predict" or Mode == "Subtract"),
        data=(Mode != "PSF"),
        psf=(Mode != "Predict" and Mode != "Dirty" and Mode != "Subtract"),
        readcol=(Mode != "Predict" and Mode != "PSF"),
        deconvolve=("Clean" in Mode))

    Imager.Init()

    if "Predict" in Mode or "Subtract" in Mode:
        Imager.GivePredict()

    if "Clean" in Mode:
        Imager.main()
    elif "Dirty" in Mode or "PSF" in Mode:
        # the sparsification option may be given as a list; only its first
        # entry is used for a dirty image or a PSF
        sparsify = DicoConfig["Comp"]["Sparsification"]
        if sparsify and isinstance(sparsify, list):
            sparsify = sparsify[0]
        if "Dirty" in Mode:
            Imager.GiveDirty(psf="PSF" in Mode, sparsify=sparsify)
        else:
            Imager.MakePSF(sparsify=sparsify)
    elif "RestoreAndShift" == Mode:
        Imager.RestoreAndShift()
def main(OP=None, messages=None):
    """Configure the run from the parsed options and execute the requested mode.

    Sets up the output directory, file logging, pdb-on-error handling, the
    random seed and the CPU count, then builds a ``ClassImagerDeconv`` and
    calls the entry point(s) selected by ``DicoConfig["Output"]["Mode"]``.

    Args:
        OP: options object exposing ``DicoConfig``, ``Print`` and
            ``ToParset``. When None it is loaded from ``SaveFile`` with
            ``MyPickle.Load``.
        messages: optional iterable of messages written to the log once
            logging is set up. None or empty means no messages (and no logo).

    Raises:
        Exceptions.UserInputError: if the output name or the MS is not set.
    """
    if OP is None:
        OP = MyPickle.Load(SaveFile)
        print("Using settings from %s, then command line." % SaveFile)

    DicoConfig = OP.DicoConfig

    ImageName = DicoConfig["Output"]["Name"]
    if not ImageName:
        raise Exceptions.UserInputError(
            "--Output-Name not specified, can't continue.")
    if not DicoConfig["Data"]["MS"]:
        raise Exceptions.UserInputError(
            "--Data-MS not specified, can't continue.")

    # create the output directory if it does not exist yet (including any
    # missing parents)
    dirname = os.path.dirname(ImageName)
    if dirname and not os.path.exists(dirname):
        try:
            os.makedirs(dirname)
        except OSError:
            # tolerate another process having created it in the meantime
            if not os.path.isdir(dirname):
                raise

    # setup logging
    MyLogger.logToFile(ImageName + ".log", append=DicoConfig["Log"]["Append"])
    global log
    log = MyLogger.getLogger("DDFacet")

    # disable colors and progressbars if requested
    ModColor.silent = SkyModel.Other.ModColor.silent = \
        progressbar.ProgressBar.silent = \
        DicoConfig["Log"]["Boring"]

    if messages:
        if not DicoConfig["Log"]["Boring"]:
            logo.print_logo()
        for msg in messages:
            print(msg, file=log)

    if DicoConfig["Debug"]["Pdb"] == "always":
        print("--Debug-Pdb=always: unexpected errors will be dropped into pdb",
              file=log)
        Exceptions.enable_pdb_on_error(ModColor.Str(
            "DDFacet has encountered an unexpected error. Dropping you into pdb for a post-mortem.\n" +
            "(This is because you're running with --Debug-Pdb set to 'always'.)"))
    elif DicoConfig["Debug"]["Pdb"] == "auto" and not DicoConfig["Log"]["Boring"]:
        print("--Debug-Pdb=auto and not --Log-Boring: unexpected errors will be dropped into pdb",
              file=log)
        Exceptions.enable_pdb_on_error(ModColor.Str(
            "DDFacet has encountered an unexpected error. Dropping you into pdb for a post-mortem.\n" +
            "(This is because you're running with --Debug-Pdb set to 'auto' and --Log-Boring is off.)"))

    # print current options
    OP.Print(dest=log)

    # enable memory logging
    MyLogger.enableMemoryLogging(DicoConfig["Log"]["Memory"])

    # get rid of old shm arrays from previous runs
    Multiprocessing.cleanupStaleShm()

    # initialize random seed from config if set, or else from system time;
    # an explicit seed is coerced to int so that a string-valued option
    # does not break the "%d" formatting or the seeding below
    if DicoConfig["Misc"]["RandomSeed"] is not None:
        DicoConfig["Misc"]["RandomSeed"] = int(DicoConfig["Misc"]["RandomSeed"])
        print("random seed=%d (explicit)" % DicoConfig["Misc"]["RandomSeed"],
              file=log)
    else:
        DicoConfig["Misc"]["RandomSeed"] = int(time.time())
        print("random seed=%d (automatic)" % DicoConfig["Misc"]["RandomSeed"],
              file=log)
    np.random.seed(DicoConfig["Misc"]["RandomSeed"])

    # init NCPU for different bits of parallelism (coerced to int for the
    # same reason as the seed)
    ncpu = int(DicoConfig["Parallel"]["NCPU"] or psutil.cpu_count())
    DicoConfig["Parallel"]["NCPU"] = ncpu
    _pyArrays.pySetOMPNumThreads(ncpu)
    NpParallel.NCPU_global = ModFFTW.NCPU_global = ncpu
    numexpr.set_num_threads(ncpu)
    print("using up to %d CPUs for parallelism" % ncpu, file=log)

    # write parset
    OP.ToParset("%s.parset" % ImageName)

    Mode = DicoConfig["Output"]["Mode"]

    # init semaphores, as they're needed for weight calculation too
    ClassFacetMachine.ClassFacetMachine.setup_semaphores(DicoConfig)

    # data machine initialized for all cases except PSF-only mode
    # psf machine initialized for all cases except Predict-only mode
    Imager = ClassDeconvMachine.ClassImagerDeconv(
        GD=DicoConfig,
        BaseName=ImageName,
        predict_only=(Mode == "Predict" or Mode == "Subtract"),
        data=(Mode != "PSF"),
        psf=(Mode != "Predict" and Mode != "Dirty" and Mode != "Subtract"),
        readcol=(Mode != "Predict" and Mode != "PSF"),
        deconvolve=("Clean" in Mode))

    Imager.Init()

    if "Predict" in Mode or "Subtract" in Mode:
        Imager.GivePredict()

    if "Clean" in Mode:
        Imager.main()
    elif "Dirty" in Mode or "PSF" in Mode:
        # the sparsification option may be given as a list; only its first
        # entry is used for a dirty image or a PSF
        sparsify = DicoConfig["Comp"]["Sparsification"]
        if sparsify and isinstance(sparsify, list):
            sparsify = sparsify[0]
        if "Dirty" in Mode:
            Imager.GiveDirty(psf="PSF" in Mode, sparsify=sparsify)
        else:
            Imager.MakePSF(sparsify=sparsify)
    elif "RestoreAndShift" == Mode:
        Imager.RestoreAndShift()
except KeyboardInterrupt: print(traceback.format_exc(), file=log) print(ModColor.Str("DDFacet interrupted by Ctrl+C", col="red"), file=log) APP.terminate() retcode = 1 #Should at least give the command line an indication of failure except Exceptions.UserInputError: print(ModColor.Str(sys.exc_info()[1], col="red"), file=log) print(ModColor.Str("There was a problem with some user input. See messages above for an indication."), file=log) APP.terminate() retcode = 1 # Should at least give the command line an indication of failure except WorkerProcessError: print(ModColor.Str("A worker process has died on us unexpectedly. This probably indicates a bug:"), file=log) print(ModColor.Str(" the original underlying error may be reported in the log [possibly far] above."), file=log) report_error = True except: if sys.exc_info()[0] is not WorkerProcessError and Exceptions.is_pdb_enabled(): APP.terminate() raise else: print(traceback.format_exc(), file=log) report_error = True if report_error: logfileName = logger.getLogFilename() logfileName = logfileName if logfileName is not None else "[file logging is not enabled]" print("", file=log) print(ModColor.Str( "There was a problem after %s; if you think this is a bug please open an issue, "% T.timehms(), col = "red"), file=log) print(ModColor.Str(" quote your version of DDFacet and attach your logfile.", col="red"), file=log) print(ModColor.Str(
print >> log, ModColor.Str( "There was a problem with some user input. See messages above for an indication." ) APP.terminate() retcode = 1 # Should at least give the command line an indication of failure except WorkerProcessError: print >> log, ModColor.Str( "A worker process has died on us unexpectedly. This probably indicates a bug:" ) print >> log, ModColor.Str( " the original underlying error may be reported in the log [possibly far] above." ) report_error = True except: if sys.exc_info( )[0] is not WorkerProcessError and Exceptions.is_pdb_enabled(): APP.terminate() raise else: print >> log, traceback.format_exc() report_error = True if report_error: logfileName = MyLogger.getLogFilename() logfileName = logfileName if logfileName is not None else "[file logging is not enabled]" print >> log, "" print >> log, ModColor.Str( "There was a problem after %s; if you think this is a bug please open an issue, " % T.timehms(), col="red") print >> log, ModColor.Str(