Exemplo n.º 1
0
    def test_reap_process_group(self):
        """
        Spin up a process that can't be killed by SIGTERM and make sure
        it gets killed anyway.

        A helper process is started and given two shared integers plus a
        semaphore. Once the semaphore is released the test checks that both
        recorded PIDs exist, calls ``process_utils.reap_process_group`` on the
        parent PID with a one second timeout, and then checks that neither PID
        exists any more.
        """
        parent_setup_done = multiprocessing.Semaphore(0)
        # Both PID holders start at 0; presumably the helper fills them in
        # before releasing the semaphore -- verify against the helper.
        parent_pid = multiprocessing.Value('i', 0)
        child_pid = multiprocessing.Value('i', 0)
        args = [parent_pid, child_pid, parent_setup_done]
        parent = multiprocessing.Process(target=TestReapProcessGroup._parent_of_ignores_sigterm, args=args)
        try:
            parent.start()
            self.assertTrue(parent_setup_done.acquire(timeout=5.0))
            self.assertTrue(psutil.pid_exists(parent_pid.value))
            self.assertTrue(psutil.pid_exists(child_pid.value))

            process_utils.reap_process_group(parent_pid.value, logging.getLogger(), timeout=1)

            self.assertFalse(psutil.pid_exists(parent_pid.value))
            self.assertFalse(psutil.pid_exists(child_pid.value))
        finally:
            # Best-effort cleanup. Each PID is handled on its own so that a
            # parent which is already gone (the expected outcome) does not
            # stop the child from being killed as well.
            for leftover_pid in (parent_pid.value, child_pid.value):
                if leftover_pid <= 0:
                    # Still at its initial value: kill(0, ...) would signal
                    # every process in our own process group, i.e. the test
                    # runner itself.
                    continue
                try:
                    os.kill(leftover_pid, signal.SIGKILL)  # terminate doesnt work here
                except OSError:
                    # Already dead, which is exactly what the test wants.
                    pass
Exemplo n.º 2
0
    def _heartbeat_manager(self):
        """
        Heartbeat the DAG file processor manager and restart it if we are not done.

        Two situations lead to a restart via ``self.start()``:

        * the manager process is set, is no longer alive, and ``self.done``
          is false;
        * ``self.done`` is false and the time since
          ``self._last_parsing_stat_received_at`` exceeds
          ``self._processor_timeout``; the manager's process group is reaped
          first.

        :raises ValueError: if ``self._parent_signal_conn`` is not set.
        """
        if not self._parent_signal_conn:
            raise ValueError("Process not started.")

        manager = self._process
        if manager and not manager.is_alive():
            # Collect the dead process without blocking.
            manager.join(timeout=0)
            if not self.done:
                self.log.warning(
                    "DagFileProcessorManager (PID=%d) exited with exit code %d - re-launching",
                    manager.pid,
                    manager.exitcode,
                )
                self.start()

        if self.done:
            return

        # Seconds elapsed (monotonic clock) since the last parsing stat arrived.
        silence = time.monotonic() - self._last_parsing_stat_received_at
        allowed_silence = self._processor_timeout.total_seconds()
        if silence > allowed_silence:
            Stats.incr('dag_processing.manager_stalls')
            self.log.error(
                "DagFileProcessorManager (PID=%d) last sent a heartbeat %.2f seconds ago! Restarting it",
                self._process.pid,
                silence,
            )
            reap_process_group(self._process.pid, logger=self.log)
            self.start()
Exemplo n.º 3
0
 def end(self):
     """
     Shut down the manager process that was launched, if any.

     When no manager process is set, a warning is logged and nothing else
     happens. Otherwise the manager's PID is handed to
     ``reap_process_group`` and ``self._parent_signal_conn`` is closed.

     :return: None
     """
     manager = self._process
     if manager:
         reap_process_group(manager.pid, logger=self.log)
         self._parent_signal_conn.close()
         return
     self.log.warning('Ending without manager process.')
Exemplo n.º 4
0
 def end(self):
     """
     Shut down the manager process that was launched, if any.

     When no manager process is set, a warning is logged and nothing else
     happens. Otherwise the manager is joined for at most one second, its
     PID is handed to ``reap_process_group``, and
     ``self._parent_signal_conn`` is closed.

     :return: None
     """
     manager = self._process
     if manager:
         # Short grace period for a clean shutdown; finishing sooner beats
         # waiting on (non-critical) work the manager may still be doing.
         manager.join(timeout=1.0)
         reap_process_group(manager.pid, logger=self.log)
         self._parent_signal_conn.close()
         return
     self.log.warning('Ending without manager process.')
Exemplo n.º 5
0
    def terminate(self):
        """
        Stop the tracked process and record a return code for it.

        Does nothing when ``self.process`` is ``None``. If the process is
        still running, its process group is reaped and the return code
        reported for its PID is stored in ``self._rc``. ``self.process`` is
        then cleared, and ``self._rc`` falls back to ``-9`` when no return
        code is known.
        """
        proc = self.process
        if proc is None:
            return

        if proc.is_running():
            exit_codes = reap_process_group(proc.pid, self.log)
            self._rc = exit_codes.get(proc.pid)

        self.process = None

        # No return code means something else reaped the process before we
        # had a chance, so "guess" at an error code.
        if self._rc is None:
            self._rc = -9
Exemplo n.º 6
0
    def terminate(self):
        """
        Stop the tracked process and record a return code for it.

        Does nothing when ``self.process`` is ``None``. Otherwise
        ``self.return_code(timeout=0)`` is called first, since the child may
        already be finished. If the process is still set and running
        afterwards, its process group is reaped and the return code reported
        for its PID is stored in ``self._rc``. ``self.process`` is then
        cleared, and ``self._rc`` falls back to ``-9`` when no return code is
        known.
        """
        if self.process is None:
            return

        # Reap the child without waiting - it may already be finished.
        self.return_code(timeout=0)

        proc = self.process
        if proc and proc.is_running():
            exit_codes = reap_process_group(proc.pid, self.log)
            self._rc = exit_codes.get(proc.pid)

        self.process = None

        # No return code means something else reaped the process before we
        # had a chance, so "guess" at an error code.
        if self._rc is None:
            self._rc = -9
Exemplo n.º 7
0
    def terminate(self):
        """
        Stop the tracked process, record a return code, and log likely OOM kills.

        Does nothing when ``self.process`` is ``None``. Otherwise
        ``self.return_code(timeout=0)`` is called first, since the child may
        already be finished. If the process is still set and running
        afterwards, its process group is reaped and the return code reported
        for its PID is stored in ``self._rc``. ``self.process`` is then
        cleared, ``self._rc`` falls back to ``-9`` when no return code is
        known, and an error is logged whenever the final code is ``-9``.
        """
        if self.process is None:
            return

        # Reap the child without waiting - it may already be finished.
        self.return_code(timeout=0)

        proc = self.process
        if proc and proc.is_running():
            exit_codes = reap_process_group(proc.pid, self.log)
            self._rc = exit_codes.get(proc.pid)

        self.process = None

        # No return code means something else reaped the process before we
        # had a chance, so "guess" at an error code.
        if self._rc is None:
            self._rc = -9

        if self._rc != -9:
            return

        # A -9, whether guessed above or reported by the reaper, likely
        # means an OOM happened.
        self.log.error(
            'Job %s was killed before it finished (likely due to running out of memory)',
            self._task_instance.job_id,
        )
Exemplo n.º 8
0
 def terminate(self):
     """
     Reap the tracked process's group if the process still exists.

     Does nothing when ``self.process`` is unset or when
     ``psutil.pid_exists`` reports that its PID is gone.
     """
     proc = self.process
     if not proc:
         return
     if psutil.pid_exists(proc.pid):
         reap_process_group(proc.pid, self.log)