Example #1
    def _startBulba(self):
        """This runs the Taras Bulba process. A Taras Bulba spawns and kills worker processes on demand.
        The reason for killing workers is to work around potential memory leaks. Since a Bulba is forked
        from the main process early on, it has a very low RAM footprint, so re-forking the workers off
        a Bulba every so often makes sure their RAM usage is reset."""
        try:
            Exceptions.disable_pdb_on_error()
            MyLogger.subprocess_id = "TB"

            # loop until the completion event is raised
            # at this stage the workers are dead (or not started)
            while not self._taras_exit_event.is_set():
                if self.verbose:
                    print >> log, "(re)creating worker processes"
                # create the workers
                self._compute_workers = []
                self._io_workers = []
                for i, core in enumerate(self._cores):
                    proc_id = "comp%02d" % i
                    self._compute_workers.append(
                        multiprocessing.Process(name=proc_id,
                                                target=self._start_worker,
                                                args=(self, proc_id, [core],
                                                      self._compute_queue,
                                                      self.pause_on_start)))
                for i, queue in enumerate(self._io_queues):
                    proc_id = "io%02d" % i
                    self._io_workers.append(
                        multiprocessing.Process(name=proc_id,
                                                target=self._start_worker,
                                                args=(self, proc_id, None,
                                                      queue,
                                                      self.pause_on_start)))

                # start the workers
                if self.verbose:
                    print >> log, "starting  worker processes"
                worker_map = {}
                for proc in self._compute_workers + self._io_workers:
                    proc.start()
                    worker_map[proc.pid] = proc
                dead_workers = {}

                # set event to indicate workers are started
                self._workers_started_event.set()

                # go to sleep until we're told to do the whole thing again
                while not self._taras_restart_event.is_set():
                    if self.verbose:
                        print >> log, "waiting for restart signal"
                    try:
                        self._taras_restart_event.wait(5)
                        if self.verbose:
                            print >> log, "wait done"
                    except KeyboardInterrupt:
                        print >> log, ModColor.Str("Ctrl+C caught, exiting")
                        self._termination_event.set()
                        self._taras_exit_event.set()
                    # check for dead children, unless workers_started event has been cleared by restartWorkers()
                    # (in which case they're already going to be exiting)
                    if self._workers_started_event.is_set():
                        for pid, proc in worker_map.iteritems():
                            if not proc.is_alive():
                                proc.join()
                                dead_workers[proc.pid] = proc
                                if proc.exitcode < 0:
                                    print >> log, ModColor.Str(
                                        "worker '%s' killed by signal %s" %
                                        (proc.name,
                                         SIGNALS_TO_NAMES_DICT[-proc.exitcode])
                                    )
                                else:
                                    print >> log, ModColor.Str(
                                        "worker '%s' died with exit code %d" %
                                        (proc.name, proc.exitcode))
                        # if workers have died, initiate bailout
                        if dead_workers:
                            print >> log, ModColor.Str(
                                "%d worker(s) have died. Initiating shutdown."
                                % len(dead_workers))
                            self._taras_restart_event.set()  # to break out of loop
                            self._termination_event.set()
                            self._taras_exit_event.set()
                self._taras_restart_event.clear()
                if self._termination_event.is_set():
                    if self.verbose:
                        print >> log, "terminating workers, since termination event is set"
                    for proc in worker_map.itervalues():
                        if proc.is_alive():
                            proc.terminate()
                if self.verbose:
                    print >> log, "reaping workers"
                # join processes
                for pid, proc in worker_map.iteritems():
                    if self.verbose:
                        print >> log, "reaping worker %d" % pid
                    proc.join()
                    if self.verbose:
                        print >> log, "worker %d's immortal soul has been put to rest" % pid

                # for pid, proc in worker_map.iteritems():
                #     if self.verbose:
                #         print>> log, "joining worker '%s' (%d) %s %s"%(proc.name, pid, proc.is_alive(), proc.exitcode)
                #     proc.join(5)
                #     if proc.is_alive():
                #         print>> log, ModColor.Str("worker '%s' clinging on to life after 5s, killing it"%proc.name)
                #         proc.terminate()
                #         proc.join(5)
                if self.verbose:
                    print >> log, "all workers have been reaped"
            if self.verbose:
                print >> log, "exiting"
        except:
            print >> log, ModColor.Str(
                "exception raised in Taras Bulba process, see below. This is a bug!"
            )
            print >> log, traceback.format_exc()
            self._workers_started_event.set()
            self._termination_event.set()
            self._taras_exit_event.set()
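
The docstring above captures the idea behind this supervisor: fork a lightweight "Bulba" process early, while the parent is still small, and have it repeatedly re-fork the real workers so that any memory they leak is reclaimed when the workers are reaped. The sketch below is a minimal, self-contained illustration of that restart loop using only the standard multiprocessing module; the names (run_worker, supervisor, restart_event, exit_event) are hypothetical and are not part of DDFacet's actual API.

# Minimal sketch of the "fork early, re-spawn often" pattern described in the
# docstring of _startBulba above. All names here are hypothetical illustrations,
# not DDFacet's actual API.
import multiprocessing
import os
import time


def run_worker(proc_id):
    # stand-in worker: any memory it leaks disappears when the process is reaped
    print("worker %s running in pid %d" % (proc_id, os.getpid()))
    time.sleep(1)


def supervisor(num_workers, restart_event, exit_event):
    # forked early, so its own footprint stays small; re-forks workers each cycle
    while not exit_event.is_set():
        workers = [multiprocessing.Process(target=run_worker, args=("w%02d" % i,))
                   for i in range(num_workers)]
        for proc in workers:
            proc.start()
        restart_event.wait()       # sleep until told to recycle or shut down
        restart_event.clear()
        for proc in workers:
            proc.terminate()       # harmless if the worker already exited
            proc.join()            # reap the worker, releasing its memory


if __name__ == "__main__":
    restart_event = multiprocessing.Event()
    exit_event = multiprocessing.Event()
    bulba = multiprocessing.Process(target=supervisor,
                                    args=(2, restart_event, exit_event))
    bulba.start()
    time.sleep(2)
    exit_event.set()      # ask the supervisor to stop...
    restart_event.set()   # ...and wake it up so it notices
    bulba.join()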
Example #2
def main(OP=None, messages=[]):
    if OP is None:
        OP = MyPickle.Load(SaveFile)
        print("Using settings from %s, then command line."%SaveFile)

    DicoConfig = OP.DicoConfig

    ImageName = DicoConfig["Output"]["Name"]
    if not ImageName:
        raise Exceptions.UserInputError("--Output-Name not specified, can't continue.")
    if not DicoConfig["Data"]["MS"]:
        raise Exceptions.UserInputError("--Data-MS not specified, can't continue.")

    # create directory if it doesn't exist
    dirname = os.path.dirname(ImageName)
    if not os.path.exists(dirname) and not dirname == "":
        os.mkdir(dirname)

    # setup logging
    logger.logToFile(ImageName + ".log", append=DicoConfig["Log"]["Append"])
    global log
    log = logger.getLogger("DDFacet")

    # disable colors and progressbars if requested
    ModColor.silent = SkyModel.Other.ModColor.silent = \
                      progressbar.ProgressBar.silent = \
                      DicoConfig["Log"]["Boring"]

    if messages:
        if not DicoConfig["Log"]["Boring"]:
            #os.system('clear')
            logo.print_logo()
        for msg in messages:
            print(msg, file=log)

    print("Checking system configuration:", file=log)
    # check for SHM size
    ram_size = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES')
    shm_stats = os.statvfs('/dev/shm')
    shm_size = shm_stats.f_bsize * shm_stats.f_blocks
    shm_relsize = shm_size / float(ram_size)
    shm_avail = shm_stats.f_bsize * shm_stats.f_bavail / float(ram_size)

    if shm_relsize < 0.6:
        print(ModColor.Str("""WARNING: max shared memory size is only {:.0%} of total RAM size.
            This can cause problems for large imaging jobs. A setting of 90% is recommended for 
            DDFacet and killMS. If your processes keep failing with SIGBUS or "bus error" messages,
            it is most likely for this reason. You can change the memory size by running
                $ sudo mount -o remount,size=90% /dev/shm
            To make the change permanent, edit /etc/default/tmpfs, and add a line saying "SHM_SIZE=90%".
            """.format(shm_relsize)), file=log)
    else:
        print("  Max shared memory size is {:.0%} of total RAM size; {:.0%} currently available".format(shm_relsize, shm_avail), file=log)

    try:
        output = subprocess.check_output(["/sbin/sysctl", "vm.max_map_count"], universal_newlines=True)
    except Exception:
        print(ModColor.Str("""WARNING: /sbin/sysctl vm.max_map_count failed. Unable to check this setting."""), file=log)
        max_map_count = None
    else:
        max_map_count = int(output.strip().rsplit(" ", 1)[-1])

    if max_map_count is not None:
        if max_map_count < 500000:
            print(ModColor.Str("""WARNING: sysctl vm.max_map_count = {}. 
            This may be too little for large DDFacet and killMS jobs. If you get strange "file exists" 
            errors on /dev/shm, then try to bribe, beg or threaten your friendly local sysadmin into 
            setting vm.max_map_count=1000000 in /etc/sysctl.conf.
                """.format(max_map_count)), file=log)
        else:
            print("  sysctl vm.max_map_count = {}".format(max_map_count), file=log)

    # check for memory lock limits
    import resource
    msoft, mhard = resource.getrlimit(resource.RLIMIT_MEMLOCK)
    if msoft >= 0 or mhard >= 0:
        print(ModColor.Str("""WARNING: your system has a limit on memory locks configured.
            This may possibly slow down DDFacet performance. You can try removing the limit by running
                $ ulimit -l unlimited
            If this gives an "operation not permitted" error, you can try to bribe, beg or threaten 
            your friendly local sysadmin into doing
                # echo "*        -   memlock     unlimited" >> /etc/security/limits.conf
        """), file=log)


    if DicoConfig["Debug"]["Pdb"] == "always":
        print("--Debug-Pdb=always: unexpected errors will be dropped into pdb", file=log)
        Exceptions.enable_pdb_on_error(ModColor.Str("DDFacet has encountered an unexpected error. Dropping you into pdb for a post-mortem.\n" +
                                           "(This is because you're running with --Debug-Pdb set to 'always'.)"))
    elif DicoConfig["Debug"]["Pdb"] == "auto" and not DicoConfig["Log"]["Boring"]:
        print("--Debug-Pdb=auto and not --Log-Boring: unexpected errors will be dropped into pdb", file=log)
        Exceptions.enable_pdb_on_error(ModColor.Str("DDFacet has encountered an unexpected error. Dropping you into pdb for a post-mortem.\n" +
            "(This is because you're running with --Debug-Pdb set to 'auto' and --Log-Boring is off.)"))

    # print current options
    OP.Print(dest=log)

    # enable memory logging
    logger.enableMemoryLogging(DicoConfig["Log"]["Memory"])

    # get rid of old shm arrays from previous runs
    Multiprocessing.cleanupStaleShm()

    # initialize random seed from config if set, or else from system time
    if DicoConfig["Misc"]["RandomSeed"] is not None:
        DicoConfig["Misc"]["RandomSeed"]=int(DicoConfig["Misc"]["RandomSeed"])
        print("random seed=%d (explicit)" % DicoConfig["Misc"]["RandomSeed"], file=log)
    else:
        DicoConfig["Misc"]["RandomSeed"] = int(time.time())
        print("random seed=%d (automatic)" % DicoConfig["Misc"]["RandomSeed"], file=log)
    np.random.seed(DicoConfig["Misc"]["RandomSeed"])

    # init NCPU for different bits of parallelism
    ncpu = int(DicoConfig["Parallel"]["NCPU"] or psutil.cpu_count())
    DicoConfig["Parallel"]["NCPU"]=ncpu
    _pyArrays.pySetOMPNumThreads(ncpu)
    NpParallel.NCPU_global = ModFFTW.NCPU_global = ncpu
    numexpr.set_num_threads(ncpu)
    print("using up to %d CPUs for parallelism" % ncpu, file=log)

    # write parset
    OP.ToParset("%s.parset"%ImageName)

    Mode = DicoConfig["Output"]["Mode"]

    # init semaphores, as they're needed for weight calculation too
    ClassFacetMachine.ClassFacetMachine.setup_semaphores(DicoConfig)

    # data machine initialized for all cases except PSF-only mode
    # psf machine initialized for all cases except Predict-only mode
    Imager = ClassDeconvMachine.ClassImagerDeconv(GD=DicoConfig,
                                                  BaseName=ImageName,
                                                  predict_only=(Mode == "Predict" or Mode == "Subtract"),
                                                  data=(Mode != "PSF"),
                                                  psf=(Mode != "Predict" and Mode != "Dirty" and Mode != "Subtract"),
                                                  readcol=(Mode != "Predict" and Mode != "PSF"),
                                                  deconvolve=("Clean" in Mode))

    Imager.Init()

    # Imager.testDegrid()
    # stop
    if "Predict" in Mode or "Subtract" in Mode:
        Imager.GivePredict()
    if "Clean" in Mode:
        Imager.main()
    elif "Dirty" in Mode:
        sparsify = DicoConfig["Comp"]["Sparsification"]
        if sparsify and isinstance(sparsify, list):
            sparsify = sparsify[0]
        Imager.GiveDirty(psf="PSF" in Mode, sparsify=sparsify)
    elif "PSF" in Mode:
        sparsify = DicoConfig["Comp"]["Sparsification"]
        if sparsify and isinstance(sparsify, list):
            sparsify = sparsify[0]
        Imager.MakePSF(sparsify=sparsify)
    elif "RestoreAndShift" == Mode:
        Imager.RestoreAndShift()
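
The system-configuration check at the top of main() above (comparing the size of the /dev/shm tmpfs against physical RAM) is useful on its own when chasing SIGBUS / "bus error" failures. Below is a minimal standalone restatement of that check, using the same os.sysconf and os.statvfs calls as the example; the 60% threshold mirrors the one used in main().

# Standalone restatement of the /dev/shm size check performed in main() above.
import os

ram_size = os.sysconf('SC_PAGE_SIZE') * os.sysconf('SC_PHYS_PAGES')
shm_stats = os.statvfs('/dev/shm')
shm_relsize = shm_stats.f_bsize * shm_stats.f_blocks / float(ram_size)

if shm_relsize < 0.6:
    print("WARNING: /dev/shm is only {:.0%} of RAM; large shared-memory jobs may "
          "fail with SIGBUS. Consider: sudo mount -o remount,size=90% /dev/shm"
          .format(shm_relsize))
else:
    print("/dev/shm is {:.0%} of RAM".format(shm_relsize))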
Example #3
def main(OP=None, messages=[]):
    if OP is None:
        OP = MyPickle.Load(SaveFile)
        print "Using settings from %s, then command line." % SaveFile

    DicoConfig = OP.DicoConfig

    ImageName = DicoConfig["Output"]["Name"]
    if not ImageName:
        raise Exceptions.UserInputError(
            "--Output-Name not specified, can't continue.")
    if not DicoConfig["Data"]["MS"]:
        raise Exceptions.UserInputError(
            "--Data-MS not specified, can't continue.")

    # create directory if it doesn't exist
    dirname = os.path.dirname(ImageName)
    if not os.path.exists(dirname) and not dirname == "":
        os.mkdir(dirname)

    # setup logging
    MyLogger.logToFile(ImageName + ".log", append=DicoConfig["Log"]["Append"])
    global log
    log = MyLogger.getLogger("DDFacet")

    # disable colors and progressbars if requested
    ModColor.silent = SkyModel.Other.ModColor.silent = \
                      progressbar.ProgressBar.silent = \
                      DicoConfig["Log"]["Boring"]

    if messages:
        if not DicoConfig["Log"]["Boring"]:
            #os.system('clear')
            logo.print_logo()
        for msg in messages:
            print >> log, msg

    if DicoConfig["Debug"]["Pdb"] == "always":
        print >> log, "--Debug-Pdb=always: unexpected errors will be dropped into pdb"
        Exceptions.enable_pdb_on_error(
            ModColor.Str(
                "DDFacet has encountered an unexpected error. Dropping you into pdb for a post-mortem.\n"
                "(This is because you're running with --Debug-Pdb set to 'always'.)"))
    elif DicoConfig["Debug"][
            "Pdb"] == "auto" and not DicoConfig["Log"]["Boring"]:
        print >> log, "--Debug-Pdb=auto and not --Log-Boring: unexpected errors will be dropped into pdb"
        Exceptions.enable_pdb_on_error(
            ModColor.Str(
                "DDFacet has encountered an unexpected error. Dropping you into pdb for a post-mortem.\n"
                "(This is because you're running with --Debug-Pdb set to 'auto' and --Log-Boring is off.)"))

    # print current options
    OP.Print(dest=log)

    # enable memory logging
    MyLogger.enableMemoryLogging(DicoConfig["Log"]["Memory"])

    # get rid of old shm arrays from previous runs
    Multiprocessing.cleanupStaleShm()

    # initialize random seed from config if set, or else from system time
    if DicoConfig["Misc"]["RandomSeed"] is not None:
        print >> log, "random seed=%d (explicit)" % DicoConfig["Misc"][
            "RandomSeed"]
    else:
        DicoConfig["Misc"]["RandomSeed"] = int(time.time())
        print >> log, "random seed=%d (automatic)" % DicoConfig["Misc"][
            "RandomSeed"]
    np.random.seed(DicoConfig["Misc"]["RandomSeed"])

    # init NCPU for different bits of parallelism
    ncpu = DicoConfig["Parallel"]["NCPU"] or psutil.cpu_count()
    DicoConfig["Parallel"]["NCPU"] = ncpu
    _pyArrays.pySetOMPNumThreads(ncpu)
    NpParallel.NCPU_global = ModFFTW.NCPU_global = ncpu
    numexpr.set_num_threads(ncpu)
    print >> log, "using up to %d CPUs for parallelism" % ncpu

    # write parset
    OP.ToParset("%s.parset" % ImageName)

    Mode = DicoConfig["Output"]["Mode"]

    # init semaphores, as they're needed for weight calculation too
    ClassFacetMachine.ClassFacetMachine.setup_semaphores(DicoConfig)

    # data machine initialized for all cases except PSF-only mode
    # psf machine initialized for all cases except Predict-only mode
    Imager = ClassDeconvMachine.ClassImagerDeconv(
        GD=DicoConfig,
        BaseName=ImageName,
        predict_only=(Mode == "Predict" or Mode == "Subtract"),
        data=(Mode != "PSF"),
        psf=(Mode != "Predict" and Mode != "Dirty" and Mode != "Subtract"),
        readcol=(Mode != "Predict" and Mode != "PSF"),
        deconvolve=("Clean" in Mode))

    Imager.Init()

    # Imager.testDegrid()
    # stop
    if "Predict" in Mode or "Subtract" in Mode:
        Imager.GivePredict()
    if "Clean" in Mode:
        Imager.main()
    elif "Dirty" in Mode:
        sparsify = DicoConfig["Comp"]["Sparsification"]
        if sparsify and isinstance(sparsify, list):
            sparsify = sparsify[0]
        Imager.GiveDirty(psf="PSF" in Mode, sparsify=sparsify)
    elif "PSF" in Mode:
        sparsify = DicoConfig["Comp"]["Sparsification"]
        if sparsify and isinstance(sparsify, list):
            sparsify = sparsify[0]
        Imager.MakePSF(sparsify=sparsify)
    elif "RestoreAndShift" == Mode:
        Imager.RestoreAndShift()
Example #4
    except KeyboardInterrupt:
        print(traceback.format_exc(), file=log)
        print(ModColor.Str("DDFacet interrupted by Ctrl+C", col="red"), file=log)
        APP.terminate()
        retcode = 1  # Should at least give the command line an indication of failure
    except Exceptions.UserInputError:
        print(ModColor.Str(sys.exc_info()[1], col="red"), file=log)
        print(ModColor.Str("There was a problem with some user input. See messages above for an indication."), file=log)
        APP.terminate()
        retcode = 1  # Should at least give the command line an indication of failure
    except WorkerProcessError:
        print(ModColor.Str("A worker process has died on us unexpectedly. This probably indicates a bug:"), file=log)
        print(ModColor.Str("  the original underlying error may be reported in the log [possibly far] above."), file=log)
        report_error = True
    except:
        if sys.exc_info()[0] is not WorkerProcessError and Exceptions.is_pdb_enabled():
            APP.terminate()
            raise
        else:
            print(traceback.format_exc(), file=log)
        report_error = True

    if report_error:
        logfileName = logger.getLogFilename()
        logfileName = logfileName if logfileName is not None else "[file logging is not enabled]"
        print("", file=log)
        print(ModColor.Str(
            "There was a problem after %s; if you think this is a bug please open an issue, " %
            T.timehms(), col="red"), file=log)
        print(ModColor.Str("  quote your version of DDFacet and attach your logfile.", col="red"), file=log)
        print(ModColor.Str(
Example #5
        print >> log, ModColor.Str(
            "There was a problem with some user input. See messages above for an indication."
        )
        APP.terminate()
        retcode = 1  # Should at least give the command line an indication of failure
    except WorkerProcessError:
        print >> log, ModColor.Str(
            "A worker process has died on us unexpectedly. This probably indicates a bug:"
        )
        print >> log, ModColor.Str(
            "  the original underlying error may be reported in the log [possibly far] above."
        )
        report_error = True
    except:
        if sys.exc_info()[0] is not WorkerProcessError and Exceptions.is_pdb_enabled():
            APP.terminate()
            raise
        else:
            print >> log, traceback.format_exc()
        report_error = True

    if report_error:
        logfileName = MyLogger.getLogFilename()
        logfileName = logfileName if logfileName is not None else "[file logging is not enabled]"
        print >> log, ""
        print >> log, ModColor.Str(
            "There was a problem after %s; if you think this is a bug please open an issue, "
            % T.timehms(),
            col="red")
        print >> log, ModColor.Str(