def _get_result(self) -> QueueOutput | None:
    """Return the next item from the worker output queue, or None once all
    worker threads have finished and the queue is fully drained."""
    use_faulthandler = (self.ns.timeout is not None)
    timeout = PROGRESS_UPDATE

    # bpo-46205: check the status of workers every iteration to avoid
    # waiting forever on an empty queue.
    while any(worker.is_alive() for worker in self.workers):
        if use_faulthandler:
            # Re-arm the main-process watchdog each pass: if the main process
            # itself stops making progress, dump tracebacks and exit.
            faulthandler.dump_traceback_later(MAIN_PROCESS_TIMEOUT, exit=True)

        # wait for a thread
        try:
            return self.output.get(timeout=timeout)
        except queue.Empty:
            pass

        # display progress (suppressed during PGO runs to keep output clean)
        running = get_running(self.workers)
        if running and not self.ns.pgo:
            self.log('running: %s' % ', '.join(running))

    # all worker threads are done: consume pending results
    try:
        return self.output.get(timeout=0)
    except queue.Empty:
        return None
def main() -> None:
    """Entry point for a training worker subprocess.

    Expects exactly one CLI argument: the path of a serialized
    WorkerProcessContext. Connects back to the parent's ZMQ broadcast
    server, builds a controller from the received workload stream, and runs
    it until completion.
    """
    if len(sys.argv) != 2:
        print(
            "worker_process_env_path must be provided as a commandline argument",
            file=sys.stderr)
        sys.exit(1)

    # Load the worker process env.
    worker_process_env_path = pathlib.Path(sys.argv[1])
    worker_process_env = layers.WorkerProcessContext.from_file(
        worker_process_env_path)

    config_logging(worker_process_env)

    if worker_process_env.env.experiment_config.debug_enabled():
        # In debug mode, periodically dump all thread stacks (every 30s)
        # to help diagnose hangs.
        faulthandler.dump_traceback_later(30, repeat=True)

    # Establish the connection to the ZMQBroadcastServer in this container.
    pub_url = f"tcp://localhost:{worker_process_env.broadcast_pub_port}"
    sub_url = f"tcp://localhost:{worker_process_env.broadcast_pull_port}"
    with ipc.ZMQBroadcastClient(pub_url, sub_url) as broadcast_client:
        # Wrap the communication layer in a workload.Stream.
        subrec = layers.SubprocessReceiver(broadcast_client)
        controller = load.prepare_controller(
            worker_process_env.env,
            iter(subrec),
            worker_process_env.load_path,
            worker_process_env.rendezvous_info,
            worker_process_env.hvd_config,
        )
        controller.run()
def _get_result(self) -> QueueOutput | None:
    """Return the next unit of worker output, or None once every worker
    thread has finished and the queue is drained.

    bpo-46205: the worker-liveness check must run on *every* loop
    iteration, not just once up front. With the previous `while True`
    shape, if all workers died while we were blocked on an empty queue,
    this method would loop forever waiting for output that can never come.
    """
    use_faulthandler = (self.ns.timeout is not None)
    timeout = PROGRESS_UPDATE

    # Re-check the status of workers every iteration to avoid waiting
    # forever on an empty queue after the last worker has exited.
    while any(worker.is_alive() for worker in self.workers):
        if use_faulthandler:
            # Watchdog for the main process itself: dump tracebacks and
            # exit if it stops making progress.
            faulthandler.dump_traceback_later(MAIN_PROCESS_TIMEOUT, exit=True)

        # wait for a thread
        try:
            return self.output.get(timeout=timeout)
        except queue.Empty:
            pass

        # display progress (suppressed during PGO runs)
        running = get_running(self.workers)
        if running and not self.ns.pgo:
            self.log('running: %s' % ', '.join(running))

    # all worker threads are done: consume pending results
    try:
        return self.output.get(timeout=0)
    except queue.Empty:
        return None
def setUpClass(cls):
    """Install a no-op SIGALRM handler and start a repeating real-time timer
    so signals arrive continuously during the tests of this class."""
    # Save the previous handler so it can be restored after the tests
    # (presumably in tearDownClass — not visible here).
    cls.orig_handler = signal.signal(signal.SIGALRM, lambda *args: None)
    # Periodic SIGALRM delivery: first after signal_delay, then every
    # signal_period seconds.
    signal.setitimer(signal.ITIMER_REAL, cls.signal_delay, cls.signal_period)
    # Issue #25277: Use faulthandler to try to debug a hang on FreeBSD
    faulthandler.dump_traceback_later(10 * 60, exit=True)
def main():
    """Stress-test concurrent lazy importing of pandas from many threads.

    A faulthandler watchdog dumps all thread stacks and aborts the process
    if an import deadlock keeps the pool from finishing within `timeout`.
    """
    # In case of import deadlock, crash after a finite timeout
    faulthandler.dump_traceback_later(timeout, exit=True)
    with ThreadPoolExecutor(num_threads) as pool:
        # pandas is imported lazily — it must not be loaded yet.
        assert "pandas" not in sys.modules
        # Drain the map iterator so every worker actually runs.
        for _ in pool.map(thread_func, range(num_threads)):
            pass
        assert "pandas" in sys.modules
def main(self, tests=None, **kwargs):
    """Top-level regrtest entry point: parse options, set up a temporary
    working directory, run the tests, and guard process exit with a
    faulthandler timeout."""
    self.parse_args(kwargs)

    self.set_temp_dir()

    if self.ns.cleanup:
        self.cleanup()
        sys.exit(0)

    test_cwd = self.create_temp_dir()

    try:
        # Run the tests in a context manager that temporarily changes the CWD
        # to a temporary and writable directory. If it's not possible to
        # create or change the CWD, the original CWD will be used.
        # The original CWD is available from os_helper.SAVEDCWD.
        with os_helper.temp_cwd(test_cwd, quiet=True):
            # When using multiprocessing, worker processes will use test_cwd
            # as their parent temporary directory. So when the main process
            # exit, it removes also subdirectories of worker processes.
            self.ns.tempdir = test_cwd

            self._main(tests, kwargs)
    except SystemExit as exc:
        # bpo-38203: Python can hang at exit in Py_Finalize(), especially
        # on threading._shutdown() call: put a timeout
        faulthandler.dump_traceback_later(EXIT_TIMEOUT, exit=True)

        sys.exit(exc.code)
def build_and_run_training_pipeline(env: det.EnvContext) -> None:
    """Assemble the full training pipeline (socket, storage, tensorboard,
    workload manager, controller) and run it to completion."""

    # Create the socket manager. The socket manager will connect to the master and read messages
    # until it receives the rendezvous_info.
    #
    # TODO(ryan): Pull profiler hooks out of SocketManager and into their own layer.
    with layers.SocketManager(env) as socket_mgr:

        # Create the storage manager. This is used to download the initial checkpoint here in
        # build_training_pipeline and also used by the workload manager to create and store
        # checkpoints during training.
        storage_mgr = storage.build(env.experiment_config["checkpoint_storage"])

        [tensorboard_mgr, tensorboard_writer] = load.prepare_tensorboard(env)

        # Create the workload manager. The workload manager will receive workloads from the
        # socket_mgr, and augment them with some additional arguments. Additionally, the
        # workload manager is responsible for some generic workload hooks for things like timing
        # workloads, preparing checkpoints, and uploading completed checkpoints. Finally, the
        # workload manager does some sanity checks on response messages that originate from the
        # trial.
        #
        # TODO(ryan): Refactor WorkloadManager into separate layers that do each separate task.
        workload_mgr = layers.build_workload_manager(
            env,
            iter(socket_mgr),
            socket_mgr.get_rendezvous_info(),
            storage_mgr,
            tensorboard_mgr,
            tensorboard_writer,
        )

        hvd_config = horovod.HorovodContext.from_configs(
            env.experiment_config, socket_mgr.get_rendezvous_info(), env.hparams
        )
        logging.info(f"Horovod config: {hvd_config.__dict__}.")

        # Load the checkpoint, if necessary. Any possible sinks to this pipeline will need access
        # to this checkpoint.
        with maybe_load_checkpoint(storage_mgr, env.latest_checkpoint) as load_path:

            # Horovod distributed training is done inside subprocesses.
            if hvd_config.use:
                subproc = layers.SubprocessLauncher(
                    env, iter(workload_mgr), load_path, socket_mgr.get_rendezvous_info(), hvd_config
                )
                subproc.run()
            else:
                if env.experiment_config.debug_enabled():
                    # Periodically dump all thread stacks to diagnose hangs.
                    faulthandler.dump_traceback_later(30, repeat=True)

                controller = load.prepare_controller(
                    env,
                    iter(workload_mgr),
                    load_path,
                    socket_mgr.get_rendezvous_info(),
                    hvd_config,
                )
                controller.run()
def maybe_periodic_stacktraces(debug_enabled: bool) -> Iterator[None]:
    """Context body: while active, dump every thread's stack every 30
    seconds — but only when debug_enabled; otherwise do nothing.

    The periodic dump is always cancelled when the block exits, even on
    exception.
    """
    if not debug_enabled:
        yield
        return

    faulthandler.dump_traceback_later(30, repeat=True)
    try:
        yield
    finally:
        faulthandler.cancel_dump_traceback_later()
def pytest_unconfigure(config):
    """Arm a last-resort watchdog once pytest has completed.

    Anything that runs after the session — e.g. atexit finalizers — can
    still deadlock; if so, dump every thread's traceback and exit after 60
    seconds. pytest's stdout/stderr capture is already disabled at this
    point, so the dump will be visible. The per-test callback configured
    via the pytest-timeout extension uses a shorter timeout.
    """
    import faulthandler
    faulthandler.dump_traceback_later(60, exit=True)
def _runtest(ns, test_name):
    """Run one test with faulthandler timeout, optional stdout+stderr
    capture, XML serialization, and timing; return a TestResult."""
    # Handle faulthandler timeout, capture stdout+stderr, XML serialization
    # and measure time.

    output_on_failure = ns.verbose3

    use_timeout = (ns.timeout is not None)
    if use_timeout:
        # Dump tracebacks and exit if the test runs longer than ns.timeout.
        faulthandler.dump_traceback_later(ns.timeout, exit=True)

    start_time = time.perf_counter()
    try:
        support.set_match_tests(ns.match_tests)
        support.junit_xml_list = xml_list = [] if ns.xmlpath else None
        if ns.failfast:
            support.failfast = True

        if output_on_failure:
            support.verbose = True

            # Capture stdout+stderr; replay the capture only on failure.
            stream = io.StringIO()
            orig_stdout = sys.stdout
            orig_stderr = sys.stderr
            try:
                sys.stdout = stream
                sys.stderr = stream
                result = _runtest_inner(ns, test_name,
                                        display_failure=False)
                if result != PASSED:
                    output = stream.getvalue()
                    orig_stderr.write(output)
                    orig_stderr.flush()
            finally:
                # Always restore the real streams, even on exception.
                sys.stdout = orig_stdout
                sys.stderr = orig_stderr
        else:
            # Tell tests to be moderately quiet
            support.verbose = ns.verbose
            result = _runtest_inner(ns, test_name,
                                    display_failure=not ns.verbose)

        if xml_list:
            import xml.etree.ElementTree as ET
            xml_data = [ET.tostring(x).decode('us-ascii') for x in xml_list]
        else:
            xml_data = None

        test_time = time.perf_counter() - start_time
        return TestResult(test_name, result, test_time, xml_data)
    finally:
        if use_timeout:
            faulthandler.cancel_dump_traceback_later()
        support.junit_xml_list = None
def setUp(self):
    """Install the test's SIGALRM handler and start a repeating real-time
    timer so signals are delivered throughout the test."""
    # Counter incremented by self.sighandler (defined elsewhere).
    self.signals = 0
    # Save the previous handler so tearDown can restore it (presumably).
    self.orig_handler = signal.signal(signal.SIGALRM, self.sighandler)
    signal.setitimer(signal.ITIMER_REAL, self.signal_delay,
                     self.signal_period)

    # Use faulthandler as watchdog to debug when a test hangs
    # (timeout of 10 minutes)
    faulthandler.dump_traceback_later(10 * 60, exit=True,
                                      file=sys.__stderr__)
def runtest(ns, test):
    """Run a single test.

    ns -- regrtest namespace of options
    test -- the name of the test

    Returns the tuple (result, test_time), where result is one of the
    constants:

        INTERRUPTED      KeyboardInterrupt when run under -j
        RESOURCE_DENIED  test skipped because resource denied
        SKIPPED          test skipped for some other reason
        ENV_CHANGED      test failed because it changed the execution environment
        FAILED           test failed
        PASSED           test passed
    """
    output_on_failure = ns.verbose3

    use_timeout = ns.timeout is not None
    if use_timeout:
        # Dump tracebacks and exit if the test exceeds the timeout.
        faulthandler.dump_traceback_later(ns.timeout, exit=True)
    try:
        support.match_tests = ns.match_tests
        if ns.failfast:
            support.failfast = True
        if output_on_failure:
            support.verbose = True

            # Reuse a single StringIO across calls: some tests keep a
            # reference to sys.stdout/sys.stderr, so a fresh stream per
            # call would be leaked (see the sibling 3.3-era runtest).
            if runtest.stringio is None:
                stream = io.StringIO()
                runtest.stringio = stream
            else:
                stream = runtest.stringio
                stream.seek(0)
                stream.truncate()

            orig_stdout = sys.stdout
            orig_stderr = sys.stderr
            try:
                sys.stdout = stream
                sys.stderr = stream
                result = runtest_inner(ns, test, display_failure=False)
                if result[0] != PASSED:
                    # Replay the captured output only when the test failed.
                    output = stream.getvalue()
                    orig_stderr.write(output)
                    orig_stderr.flush()
            finally:
                # Always restore the real streams.
                sys.stdout = orig_stdout
                sys.stderr = orig_stderr
        else:
            support.verbose = ns.verbose  # Tell tests to be moderately quiet
            result = runtest_inner(ns, test, display_failure=not ns.verbose)
        return result
    finally:
        if use_timeout:
            faulthandler.cancel_dump_traceback_later()
        cleanup_test_droppings(test, ns.verbose)
def exit_on_deadlock():
    """Context-manager body: dump every thread's traceback to stderr and
    hard-exit if the wrapped block runs longer than TIMEOUT seconds.

    Fixes two defects of the original:
    - the cancel call was not in a ``finally``, so an exception raised by
      the wrapped block left the exit timer armed, able to kill the process
      later even though the block had already finished;
    - an ImportError raised *by the wrapped block* fell into the handler
      and caused a second ``yield``, which breaks the contextmanager
      generator protocol.
    """
    TIMEOUT = 5
    try:
        from faulthandler import dump_traceback_later
        from faulthandler import cancel_dump_traceback_later
    except ImportError:
        # faulthandler unavailable (very old interpreters): no watchdog.
        yield
        return

    from sys import stderr
    dump_traceback_later(timeout=TIMEOUT, exit=True, file=stderr)
    try:
        yield
    finally:
        cancel_dump_traceback_later()
def setUp(self):
    """Install the test's SIGALRM handler and start a repeating real-time
    timer so signals are delivered throughout the test."""
    # Counter incremented by self.sighandler (defined elsewhere).
    self.signals = 0
    # Save the previous handler so tearDown can restore it (presumably).
    self.orig_handler = signal.signal(signal.SIGALRM, self.sighandler)
    signal.setitimer(signal.ITIMER_REAL, self.signal_delay,
                     self.signal_period)

    # Use faulthandler as watchdog to debug when a test hangs
    # (timeout of 10 minutes)
    # Guarded: dump_traceback_later is missing on builds without threads.
    if hasattr(faulthandler, 'dump_traceback_later'):
        faulthandler.dump_traceback_later(10 * 60, exit=True,
                                          file=sys.__stderr__)
def test_stderr_None(self):
    # Issue #21497: provide a helpful error if sys.stderr is None,
    # instead of just an attribute error: "None has no attribute fileno".
    # Drive every faulthandler entry point through the same check.
    checks = [
        faulthandler.enable,
        faulthandler.dump_traceback,
        lambda: faulthandler.dump_traceback_later(1e-3),
    ]
    if hasattr(faulthandler, "register"):
        checks.append(lambda: faulthandler.register(signal.SIGUSR1))
    for check in checks:
        with self.check_stderr_none():
            check()
def pytest_runtest_protocol(item):
    """Hook wrapper: while the test runs, keep a faulthandler timer armed
    that dumps all thread tracebacks if the test exceeds the configured
    ``faulthandler_timeout``; always disarm it afterwards."""
    timeout = float(item.config.getini("faulthandler_timeout") or 0.0)
    if timeout <= 0:
        # Watchdog disabled: just run the test.
        yield
        return

    import faulthandler
    stderr = item.config.fault_handler_stderr
    faulthandler.dump_traceback_later(timeout, file=stderr)
    try:
        yield
    finally:
        faulthandler.cancel_dump_traceback_later()
def test_stderr_None(self):
    # Issue #21497: provide a helpful error if sys.stderr is None,
    # instead of just an attribute error: "None has no attribute fileno".
    # Drive every available faulthandler entry point through the check;
    # dump_traceback_later and register may be absent on some builds.
    checks = [
        faulthandler.enable,
        faulthandler.dump_traceback,
    ]
    if hasattr(faulthandler, 'dump_traceback_later'):
        checks.append(lambda: faulthandler.dump_traceback_later(1e-3))
    if hasattr(faulthandler, "register"):
        checks.append(lambda: faulthandler.register(signal.SIGUSR1))
    for check in checks:
        with self.check_stderr_none():
            check()
def pytest_runtest_protocol(item: Item) -> Generator[None, None, None]:
    """Hook wrapper: arm a per-test faulthandler watchdog when a positive
    timeout is configured and a dump target stream is available; always
    disarm it when the test finishes."""
    timeout = get_timeout_config_value(item.config)
    stderr = item.config._store[fault_handler_stderr_key]
    if timeout <= 0 or stderr is None:
        # Watchdog disabled or no capture file: just run the test.
        yield
        return

    import faulthandler

    faulthandler.dump_traceback_later(timeout, file=stderr)
    try:
        yield
    finally:
        faulthandler.cancel_dump_traceback_later()
def pytest_runtest_protocol(self, item):
    """Hook wrapper: arm the faulthandler watchdog for this test when a
    positive timeout is configured and a capture stream is available;
    always disarm it when the test finishes."""
    timeout = self.get_timeout_config_value(item.config)
    stderr = item.config.fault_handler_stderr
    if timeout <= 0 or stderr is None:
        # Watchdog disabled or nowhere to write: just run the test.
        yield
        return

    import faulthandler
    faulthandler.dump_traceback_later(timeout, file=stderr)
    try:
        yield
    finally:
        faulthandler.cancel_dump_traceback_later()
def pytest_runtest_protocol(item):
    """Hook wrapper: optionally guard the test with a faulthandler dump
    timer — only when the plugin is enabled, a positive timeout is set, and
    the platform supports dump_traceback_later."""
    enabled = item.config.getoption('fault_handler')
    timeout = item.config.getoption('fault_handler_timeout')
    timeout_supported = timeout_support_available()
    if not (enabled and timeout > 0 and timeout_supported):
        yield
        return

    import faulthandler
    stderr = item.config.fault_handler_stderr
    faulthandler.dump_traceback_later(timeout, file=stderr)
    try:
        yield
    finally:
        faulthandler.cancel_dump_traceback_later()
def main() -> None:
    """Entry point for a training worker subprocess.

    Expects exactly one CLI argument: the path of a serialized
    WorkerProcessContext. Sets up credentials and TLS certs, connects back
    to the parent's ZMQ broadcast server, and runs the trial controller,
    forwarding any fatal exception to the parent before re-raising.
    """
    if len(sys.argv) != 2:
        print("worker_process_env_path must be provided as a commandline argument",
              file=sys.stderr)
        sys.exit(1)

    # Load the worker process env.
    worker_process_env_path = pathlib.Path(sys.argv[1])
    worker_process_env = layers.WorkerProcessContext.from_file(worker_process_env_path)

    config_logging(worker_process_env)

    # API code expects credential to be available as an environment variable
    os.environ["DET_TASK_TOKEN"] = worker_process_env.env.det_task_token

    # TODO: refactor websocket, data_layer, and profiling to not use the cli_cert.
    master_url = (
        f"http{'s' if worker_process_env.env.use_tls else ''}://"
        f"{worker_process_env.env.master_addr}:{worker_process_env.env.master_port}"
    )
    certs.cli_cert = certs.default_load(master_url=master_url)

    if worker_process_env.env.experiment_config.debug_enabled():
        # In debug mode, periodically dump all thread stacks (every 30s).
        faulthandler.dump_traceback_later(30, repeat=True)

    # Establish the connection to the ZMQBroadcastServer in this container.
    pub_url = f"tcp://localhost:{worker_process_env.broadcast_pub_port}"
    sub_url = f"tcp://localhost:{worker_process_env.broadcast_pull_port}"
    with ipc.ZMQBroadcastClient(pub_url, sub_url) as broadcast_client:
        # Wrap the communication layer in a workload.Stream.
        subrec = layers.SubprocessReceiver(broadcast_client)
        workloads = iter(subrec)
        with det._catch_sys_exit():
            # NOTE(review): the original whitespace was lost; this nesting —
            # controller construction inside _catch_init_invalid_hp, run()
            # outside it — is reconstructed and should be confirmed.
            with det._catch_init_invalid_hp(workloads):
                controller = load.prepare_controller(
                    worker_process_env.env,
                    workloads,
                    worker_process_env.load_path,
                    worker_process_env.rendezvous_info,
                    worker_process_env.hvd_config,
                )
            try:
                controller.run()
            except Exception as e:
                # Tell the parent process we died before propagating.
                broadcast_client.send_exception_message()
                raise e
def _get_result(self): if not any(worker.is_alive() for worker in self.workers): # all worker threads are done: consume pending results try: return self.output.get(timeout=0) except queue.Empty: return None while True: if self.main_timeout is not None: faulthandler.dump_traceback_later(self.main_timeout, exit=True) # wait for a thread timeout = max(PROGRESS_UPDATE, PROGRESS_MIN_TIME) try: return self.output.get(timeout=timeout) except queue.Empty: pass # display progress running = get_running(self.workers) if running and not self.ns.pgo: print('running: %s' % ', '.join(running), flush=True)
def _check_dump_traceback_later(self, repeat, cancel, filename):
    """
    Check how many times the traceback is written in timeout x 2.5 seconds,
    or timeout x 3.5 seconds if cancel is True: 1, 2 or 3 times depending
    on repeat and cancel options.

    Raise an error if the output doesn't match the expect format.
    """
    timeout_str = str(datetime.timedelta(seconds=TIMEOUT))
    # Template of the script run in a child interpreter by get_output().
    # NOTE(review): the template's internal indentation was lost when this
    # file's whitespace was mangled; it is reconstructed flush-left here
    # (the child exec requires valid indentation) — confirm against the
    # original, since expected_traceback()'s line numbers depend on it.
    code = """
import faulthandler
import time

def func(repeat, cancel, timeout):
    if cancel:
        faulthandler.cancel_dump_traceback_later()
    for loop in range(2):
        time.sleep(timeout * 1.25)
    faulthandler.cancel_dump_traceback_later()

timeout = %s
repeat = %s
cancel = %s
if %s:
    file = open(%s, "wb")
else:
    file = None
faulthandler.dump_traceback_later(timeout, repeat=repeat, file=file)
func(repeat, cancel, timeout)
if file is not None:
    file.close()
""".strip()
    code = code % (TIMEOUT, repeat, cancel, bool(filename), repr(filename))
    trace, exitcode = self.get_output(code, filename)
    trace = '\n'.join(trace)

    if not cancel:
        # Without cancel, the timer fires once (or twice with repeat=True).
        if repeat:
            count = 2
        else:
            count = 1
        header = r'Timeout \(%s\)!\nCurrent thread XXX:\n' % timeout_str
        regex = expected_traceback(8, 20, header, count=count)
        self.assertRegex(trace, regex)
    else:
        # Cancelled before it could fire: no output at all.
        self.assertEqual(trace, '')
    self.assertEqual(exitcode, 0)
def runtest(ns, test):
    """Run a single test.

    ns -- regrtest namespace of options
    test -- the name of the test

    Returns the tuple (result, test_time, xml_data), where result is one
    of the constants:

        INTERRUPTED      KeyboardInterrupt when run under -j
        RESOURCE_DENIED  test skipped because resource denied
        SKIPPED          test skipped for some other reason
        ENV_CHANGED      test failed because it changed the execution environment
        FAILED           test failed
        PASSED           test passed
        EMPTY_TEST_SUITE test ran no subtests.

    If ns.xmlpath is not None, xml_data is a list containing each
    generated testsuite element.
    """
    output_on_failure = ns.verbose3

    use_timeout = (ns.timeout is not None)
    if use_timeout:
        # Dump tracebacks and exit if the test exceeds the timeout.
        faulthandler.dump_traceback_later(ns.timeout, exit=True)
    try:
        support.set_match_tests(ns.match_tests)
        # reset the environment_altered flag to detect if a test altered
        # the environment
        support.environment_altered = False
        support.junit_xml_list = xml_list = [] if ns.xmlpath else None
        if ns.failfast:
            support.failfast = True
        if output_on_failure:
            support.verbose = True

            # Capture stdout+stderr; replay only when the test fails.
            stream = io.StringIO()
            orig_stdout = sys.stdout
            orig_stderr = sys.stderr
            try:
                sys.stdout = stream
                sys.stderr = stream
                result = runtest_inner(ns, test, display_failure=False)
                if result[0] != PASSED:
                    output = stream.getvalue()
                    orig_stderr.write(output)
                    orig_stderr.flush()
            finally:
                # Always restore the real streams.
                sys.stdout = orig_stdout
                sys.stderr = orig_stderr
        else:
            support.verbose = ns.verbose  # Tell tests to be moderately quiet
            result = runtest_inner(ns, test, display_failure=not ns.verbose)

        if xml_list:
            import xml.etree.ElementTree as ET
            xml_data = [ET.tostring(x).decode('us-ascii') for x in xml_list]
        else:
            xml_data = None

        return result + (xml_data,)
    finally:
        if use_timeout:
            faulthandler.cancel_dump_traceback_later()
        cleanup_test_droppings(test, ns.verbose)
        support.junit_xml_list = None
def runtest(test, verbose, quiet, huntrleaks=False, use_resources=None,
            output_on_failure=False, failfast=False, match_tests=None,
            timeout=None):
    """Run a single test.

    test -- the name of the test
    verbose -- if true, print more messages
    quiet -- if true, don't print 'skipped' messages (probably redundant)
    huntrleaks -- run multiple times to test for leaks; requires a debug
                  build; a triple corresponding to -R's three arguments
    use_resources -- list of extra resources to use
    output_on_failure -- if true, display test output on failure
    timeout -- dump the traceback and exit if a test takes more than
               timeout seconds
    failfast, match_tests -- See regrtest command-line flags for these.

    Returns the tuple result, test_time, where result is one of the
    constants:

        INTERRUPTED      KeyboardInterrupt when run under -j
        RESOURCE_DENIED  test skipped because resource denied
        SKIPPED          test skipped for some other reason
        ENV_CHANGED      test failed because it changed the execution environment
        FAILED           test failed
        PASSED           test passed
    """
    if use_resources is not None:
        support.use_resources = use_resources
    use_timeout = (timeout is not None)
    if use_timeout:
        # Watchdog: dump tracebacks and exit on a hung test.
        faulthandler.dump_traceback_later(timeout, exit=True)
    try:
        support.match_tests = match_tests
        if failfast:
            support.failfast = True
        if output_on_failure:
            support.verbose = True

            # Reuse the same instance to all calls to runtest(). Some
            # tests keep a reference to sys.stdout or sys.stderr
            # (eg. test_argparse).
            if runtest.stringio is None:
                stream = io.StringIO()
                runtest.stringio = stream
            else:
                stream = runtest.stringio
                stream.seek(0)
                stream.truncate()

            orig_stdout = sys.stdout
            orig_stderr = sys.stderr
            try:
                sys.stdout = stream
                sys.stderr = stream
                result = runtest_inner(test, verbose, quiet, huntrleaks,
                                       display_failure=False)
                if result[0] == FAILED:
                    # Replay captured output only on failure.
                    output = stream.getvalue()
                    orig_stderr.write(output)
                    orig_stderr.flush()
            finally:
                # Always restore the real streams.
                sys.stdout = orig_stdout
                sys.stderr = orig_stderr
        else:
            support.verbose = verbose  # Tell tests to be moderately quiet
            result = runtest_inner(test, verbose, quiet, huntrleaks,
                                   display_failure=not verbose)
        return result
    finally:
        if use_timeout:
            faulthandler.cancel_dump_traceback_later()
        cleanup_test_droppings(test, verbose)
def _runtest(ns: Namespace, test_name: str) -> TestResult:
    """Run one test with faulthandler timeout, optional stdout+stderr
    capture, XML serialization, and timing; return a TestResult."""
    # Handle faulthandler timeout, capture stdout+stderr, XML serialization
    # and measure time.

    output_on_failure = ns.verbose3

    use_timeout = (ns.timeout is not None)
    if use_timeout:
        # Dump tracebacks and exit if the test runs longer than ns.timeout.
        faulthandler.dump_traceback_later(ns.timeout, exit=True)

    start_time = time.perf_counter()
    try:
        support.set_match_tests(ns.match_tests, ns.ignore_tests)
        support.junit_xml_list = xml_list = [] if ns.xmlpath else None
        if ns.failfast:
            support.failfast = True

        if output_on_failure:
            support.verbose = True

            stream = io.StringIO()
            orig_stdout = sys.stdout
            orig_stderr = sys.stderr
            print_warning = support.print_warning
            orig_print_warnings_stderr = print_warning.orig_stderr

            output = None
            try:
                sys.stdout = stream
                sys.stderr = stream
                # print_warning() writes into the temporary stream to preserve
                # messages order. If support.environment_altered becomes true,
                # warnings will be written to sys.stderr below.
                print_warning.orig_stderr = stream

                result = _runtest_inner(ns, test_name,
                                        display_failure=False)
                if not isinstance(result, Passed):
                    output = stream.getvalue()
            finally:
                # Always restore the real streams and warning target.
                sys.stdout = orig_stdout
                sys.stderr = orig_stderr
                print_warning.orig_stderr = orig_print_warnings_stderr

            if output is not None:
                sys.stderr.write(output)
                sys.stderr.flush()
        else:
            # Tell tests to be moderately quiet
            support.verbose = ns.verbose
            result = _runtest_inner(ns, test_name,
                                    display_failure=not ns.verbose)

        if xml_list:
            import xml.etree.ElementTree as ET
            result.xml_data = [
                ET.tostring(x).decode('us-ascii')
                for x in xml_list
            ]

        result.duration_sec = time.perf_counter() - start_time
        return result
    finally:
        if use_timeout:
            faulthandler.cancel_dump_traceback_later()
        support.junit_xml_list = None
def run_tests_multiprocess(regrtest):
    """Run regrtest's test list on ns.use_mp worker threads (each driving a
    child process), aggregating results and printing progress until every
    worker reports completion."""
    output = queue.Queue()
    pending = MultiprocessIterator(regrtest.tests)
    test_timeout = regrtest.ns.timeout
    use_timeout = (test_timeout is not None)

    workers = [MultiprocessThread(pending, output, regrtest.ns)
               for i in range(regrtest.ns.use_mp)]
    print("Run tests in parallel using %s child processes"
          % len(workers))
    for worker in workers:
        worker.start()

    def get_running(workers):
        # Names of tests that have been running for at least
        # PROGRESS_MIN_TIME, with their elapsed durations.
        running = []
        for worker in workers:
            current_test = worker.current_test
            if not current_test:
                continue
            dt = time.monotonic() - worker.start_time
            if dt >= PROGRESS_MIN_TIME:
                text = '%s (%s)' % (current_test, format_duration(dt))
                running.append(text)
        return running

    finished = 0
    test_index = 1
    get_timeout = max(PROGRESS_UPDATE, PROGRESS_MIN_TIME)
    try:
        # Each worker sends a final (None, ...) item; stop once all did.
        while finished < regrtest.ns.use_mp:
            if use_timeout:
                # Re-arm the hung-test watchdog on every pass.
                faulthandler.dump_traceback_later(test_timeout, exit=True)

            try:
                item = output.get(timeout=get_timeout)
            except queue.Empty:
                # No result yet: print which tests are still running.
                running = get_running(workers)
                if running and not regrtest.ns.pgo:
                    print('running: %s' % ', '.join(running), flush=True)
                continue

            test, stdout, stderr, result = item
            if test is None:
                # Sentinel: one worker has finished.
                finished += 1
                continue
            regrtest.accumulate_result(test, result)

            # Display progress
            ok, test_time = result
            text = format_test_result(test, ok)
            if (ok not in (CHILD_ERROR, INTERRUPTED)
                    and test_time >= PROGRESS_MIN_TIME
                    and not regrtest.ns.pgo):
                text += ' (%.0f sec)' % test_time
            elif ok == CHILD_ERROR:
                # For a crashed child, test_time holds the error text.
                text = '%s (%s)' % (text, test_time)
            running = get_running(workers)
            if running and not regrtest.ns.pgo:
                text += ' -- running: %s' % ', '.join(running)
            regrtest.display_progress(test_index, text)

            # Copy stdout and stderr from the child process
            if stdout:
                print(stdout, flush=True)
            if stderr and not regrtest.ns.pgo:
                print(stderr, file=sys.stderr, flush=True)

            if result[0] == INTERRUPTED:
                raise KeyboardInterrupt
            test_index += 1
    except KeyboardInterrupt:
        regrtest.interrupted = True
        pending.interrupted = True
        print()
    finally:
        if use_timeout:
            faulthandler.cancel_dump_traceback_later()

    # If tests are interrupted, wait until tests complete
    wait_start = time.monotonic()
    while True:
        running = [worker.current_test for worker in workers]
        running = list(filter(bool, running))
        if not running:
            break

        dt = time.monotonic() - wait_start
        line = "Waiting for %s (%s tests)" % (', '.join(running),
                                              len(running))
        if dt >= WAIT_PROGRESS:
            line = "%s since %.0f sec" % (line, dt)
        print(line, flush=True)
        for worker in workers:
            worker.join(WAIT_PROGRESS)
pythonDict["Variables"], pythonDict["Expression"] = sympy.cse(code) for i, expr in enumerate(pythonDict["Variables"]): pythonDict["Variables"][i] = { "name": str(expr[0]), "expr": str(expressionToCode(expr[1], language)) } pythonDict["Expression"] = expressionToCode(pythonDict["Expression"][0], language) return pythonDict # Begin Parsing faulthandler.enable() try: # Timeout after 10 seconds if it doesn't return faulthandler.dump_traceback_later(10) command = sys.argv[1] language = sys.argv[2] expression = sympy.sympify(sys.argv[3]) assert expression is not None, "SymPy Error: Cannot evaluate expression!" result = None if (command == 'eval'): print(json.dumps(convertSymPyToDict(expression, language), indent=4)) sys.stdout.flush() finally: faulthandler.cancel_dump_traceback_later()
def exit_on_deadlock():
    """Context-manager body: dump every thread's traceback and hard-exit
    if the wrapped block is still running after 30 seconds.

    The cancel call is in a ``finally`` (the original skipped it when the
    wrapped block raised, leaving the exit timer armed and able to kill
    the process later, long after the block had finished).
    """
    dump_traceback_later(timeout=30, exit=True)
    try:
        yield
    finally:
        cancel_dump_traceback_later()
def main():
    """Top-level simulation driver: load params, construct client/engine/
    events from config, pump the event loop to completion, then clean up
    and report success or failure to Slack."""
    global g_get_sim_time
    global g_instance_name
    global g_asked_to_pause

    def incomingAsyncEvent(packet):
        # CAUTION: Called asynchronously from the ZeroMQ rx thread
        logging.info("incoming async event " + str(packet))
        if "action" in packet:
            if packet["action"] == "event":
                if packet["headers"]["Instancename"] == g_instance_name:
                    logging.info("Matches this instance name, so dispatching")
                    # Hand off to the engine thread instead of acting here.
                    engine.register_event_in(0, device_factory.external_event,
                                             packet, None)
            elif packet["action"] == "announce":
                logging.log(packet["severity"],
                            "[broadcast message] " + packet["message"])
            elif packet["action"] == "command":
                if packet["headers"]["Instancename"] == g_instance_name:
                    argv = packet["argv"]
                    logging.info("Received async command " + str(argv))
                    if len(argv) > 0:
                        client.async_command(argv)

    def event_count_callback():
        return events.event_count

    logging.getLogger().setLevel(logging.INFO)
    os.makedirs("../synth_logs", exist_ok=True)
    os.makedirs("../synth_accounts", exist_ok=True)
    params = get_params()
    assert g_instance_name is not None, "Instance name has not been defined, but this is required for logfile naming"
    init_logging(params)
    logging.info("*** Synth starting at real time " + str(datetime.now()) + " ***")
    logging.info("Parameters:\n" + json.dumps(params, sort_keys=True, indent=4,
                                              separators=(',', ': ')))
    post_to_slack("Started")
    install_signal_catcher()

    Tstart = time.time()  # Human time
    Tstart_process = time.process_time()  # Time CPU usage

    random.seed(12345)  # Ensure reproduceability

    if not "client" in params:
        logging.error("No client defined to receive simulation results")
        return
    client = importer.get_class('client', params['client']['type'])(
        g_instance_name, params, params['client'])

    if not "engine" in params:
        logging.error("No simulation engine defined")
        return
    engine = importer.get_class('engine', params['engine']['type'])(
        params['engine'], client.enter_interactive, event_count_callback)
    g_get_sim_time = engine.get_now_no_lock

    if not "events" in params:
        logging.warning("No events defined")
    events = Events(client, engine, g_instance_name, params, params["events"])

    zeromq_rx.init(incomingAsyncEvent, emit_logging=True)
    zeromq_tx.init(emit_logging=True)

    logging.info("Simulation starts")
    faulthandler.dump_traceback_later(
        600, repeat=True
    )  # TEMP - every 10 minutes emit a stack trace for every thread - to diagnose hanging issue
    err_str = ""
    try:
        while engine.events_to_come():
            engine.next_event()
            client.tick()
            if g_asked_to_pause:
                g_asked_to_pause = False
                logging.info("Paused")
                signal.pause()  # Suspend this process. Receiving any signal will then cause us to resume
                logging.info("Resuming")
        # NOTE(review): original whitespace was lost; close() is placed
        # after the loop (inside the try) — confirm it was not intended to
        # run once per iteration.
        device_factory.close()
    except:
        err_str = traceback.format_exc()  # Report any exception, but continue to clean-up anyway
        logging.error("Error at real time " + str(datetime.now()) + " (local)")
        logging.error(err_str)

    logging.info("Simulation ends")

    logging.info("Ending device logging ("
                 + str(len(device_factory.g_devices)) + " devices were emulated)")
    events.flush()
    client.close()

    logging.info("Elapsed real time: " + str(int(time.time() - Tstart)) + " seconds.")
    logging.info("CPU time used: " + str(int(time.process_time() - Tstart_process)) + " seconds.")

    if err_str == "":
        post_to_slack("Finished OK")
        exit(0)
    post_to_slack(err_str)
    exit(-1)
def run_tests_multiprocess(regrtest):
    """Run regrtest's test list on ns.use_mp worker threads (each driving a
    child process), aggregating results and printing progress until every
    worker reports completion."""
    output = queue.Queue()
    pending = MultiprocessIterator(regrtest.tests)
    test_timeout = regrtest.ns.timeout
    use_timeout = (test_timeout is not None)

    workers = [MultiprocessThread(pending, output, regrtest.ns)
               for i in range(regrtest.ns.use_mp)]
    print("Run tests in parallel using %s child processes"
          % len(workers))
    for worker in workers:
        worker.start()

    def get_running(workers):
        # Names of tests that have been running for at least
        # PROGRESS_MIN_TIME, with their elapsed durations.
        running = []
        for worker in workers:
            current_test = worker.current_test
            if not current_test:
                continue
            dt = time.monotonic() - worker.start_time
            if dt >= PROGRESS_MIN_TIME:
                text = '%s (%s)' % (current_test, format_duration(dt))
                running.append(text)
        return running

    finished = 0
    test_index = 1
    get_timeout = max(PROGRESS_UPDATE, PROGRESS_MIN_TIME)
    try:
        # Each worker sends a final (None, ...) item; stop once all did.
        while finished < regrtest.ns.use_mp:
            if use_timeout:
                # Re-arm the hung-test watchdog on every pass.
                faulthandler.dump_traceback_later(test_timeout, exit=True)

            try:
                item = output.get(timeout=get_timeout)
            except queue.Empty:
                # No result yet: print which tests are still running.
                running = get_running(workers)
                if running and not regrtest.ns.pgo:
                    print('running: %s' % ', '.join(running), flush=True)
                continue

            test, stdout, stderr, result = item
            if test is None:
                # Sentinel: one worker has finished.
                finished += 1
                continue
            regrtest.accumulate_result(test, result)

            # Display progress
            ok, test_time = result
            text = format_test_result(test, ok)
            if (ok not in (CHILD_ERROR, INTERRUPTED)
                    and test_time >= PROGRESS_MIN_TIME
                    and not regrtest.ns.pgo):
                text += ' (%s)' % format_duration(test_time)
            elif ok == CHILD_ERROR:
                # For a crashed child, test_time holds the error text.
                text = '%s (%s)' % (text, test_time)
            running = get_running(workers)
            if running and not regrtest.ns.pgo:
                text += ' -- running: %s' % ', '.join(running)
            regrtest.display_progress(test_index, text)

            # Copy stdout and stderr from the child process
            if stdout:
                print(stdout, flush=True)
            if stderr and not regrtest.ns.pgo:
                print(stderr, file=sys.stderr, flush=True)

            if result[0] == INTERRUPTED:
                raise KeyboardInterrupt
            test_index += 1
    except KeyboardInterrupt:
        regrtest.interrupted = True
        pending.interrupted = True
        print()
    finally:
        if use_timeout:
            faulthandler.cancel_dump_traceback_later()

    # If tests are interrupted, wait until tests complete
    wait_start = time.monotonic()
    while True:
        running = [worker.current_test for worker in workers]
        running = list(filter(bool, running))
        if not running:
            break

        dt = time.monotonic() - wait_start
        line = "Waiting for %s (%s tests)" % (', '.join(running),
                                              len(running))
        if dt >= WAIT_PROGRESS:
            line = "%s since %.0f sec" % (line, dt)
        print(line, flush=True)
        for worker in workers:
            worker.join(WAIT_PROGRESS)
def main(args):
    """Train a SQuAD question-answering model.

    args -- parsed command-line namespace; reads paths (save_dir,
    *_record_file, *_emb_file, load_path), hyperparameters (lr, l2_wd,
    ema_decay, batch_size, num_epochs, eval_steps, max_grad_norm,
    max_ans_len, seed) and bookkeeping options (name, metric_name,
    maximize_metric, max_checkpoints, num_visuals, gpu_ids).

    Side effects: writes logs, TensorBoard events and checkpoints under
    the resolved save directory.
    """
    # Set up faulthandler
    faulthandler.enable()

    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    # Scale the per-step batch size by the number of GPUs in use.
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)
    char_vectors = util.torch_from_json(args.char_emb_file)

    # Get model
    log.info('Building model...')
    model_params = {
        'word_vectors': word_vectors,
        'char_vectors': char_vectors,
        'args': args
    }
    model = get_model(args.model, model_params)
    # Parameter memory only (no buffers/optimizer state).
    print('Model size: {:f} MB'.format(
        sum(p.nelement() * p.element_size() for p in model.parameters())
        / (1024 * 1024)))
    # model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    # Exponential moving average of weights, used only for evaluation.
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(), args.lr,
                               weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    # 'step' counts examples, so this recovers the epoch when resuming.
    epoch = step // len(train_dataset)
    # NOTE(review): '!=' loops forever if a resumed epoch already exceeds
    # num_epochs; '<' would be safer — confirm intended resume behavior.
    while epoch != args.num_epochs:
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                progress_bar.set_description(
                    'Batch data_loading finished'.ljust(30))
                progress_bar.refresh()
                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                cc_idxs = cc_idxs.to(device)
                qc_idxs = qc_idxs.to(device)
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()
                progress_bar.set_description(
                    'Batch initialization finished'.ljust(30))
                progress_bar.refresh()

                # Forward
                # NOTE(review): a 3-second traceback watchdog around the
                # forward/backward passes looks like a debugging aid for
                # hangs — confirm it should remain in production runs.
                faulthandler.dump_traceback_later(timeout=3)
                log_p1, log_p2 = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs)
                faulthandler.cancel_dump_traceback_later()
                progress_bar.set_description(
                    'Batch forward finished'.ljust(30))
                progress_bar.refresh()
                y1, y2 = y1.to(device), y2.to(device)
                # Sum of NLL over start and end position predictions.
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                faulthandler.dump_traceback_later(timeout=3)
                loss.backward()
                faulthandler.cancel_dump_traceback_later()
                progress_bar.set_description(
                    'Batch backward finished'.ljust(30))
                progress_bar.refresh()
                nn.utils.clip_grad_norm_(model.parameters(),
                                         args.max_grad_norm)
                optimizer.step()
                progress_bar.set_description('Optimization finished'.ljust(30))
                progress_bar.refresh()
                scheduler.step()
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR',
                               optimizer.param_groups[0]['lr'],
                               step)

                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info(f'Evaluating at step {step}...')
                    # Swap in EMA weights for evaluation, restore after.
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2)
                    progress_bar.set_description(
                        'Evaluation finished'.ljust(30))
                    progress_bar.refresh()
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join(f'{k}: {v:05.2f}'
                                            for k, v in results.items())
                    log.info(f'Dev {results_str}')

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar(f'dev/{k}', v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
def runtest(ns, test):
    """Run a single test.

    ns -- regrtest namespace of options
    test -- the name of the test

    Returns the tuple (result, test_time, xml_data), where result is one
    of the constants:

        INTERRUPTED      KeyboardInterrupt when run under -j
        RESOURCE_DENIED  test skipped because resource denied
        SKIPPED          test skipped for some other reason
        ENV_CHANGED      test failed because it changed the execution environment
        FAILED           test failed
        PASSED           test passed
        EMPTY_TEST_SUITE test ran no subtests.

    If ns.xmlpath is not None, xml_data is a list containing each
    generated testsuite element.
    """
    capture = ns.verbose3
    timeout = ns.timeout
    if timeout is not None:
        # Watchdog: dump all tracebacks and abort if the test hangs.
        faulthandler.dump_traceback_later(timeout, exit=True)
    try:
        support.set_match_tests(ns.match_tests)
        # Clear the flag so only changes made by THIS test are detected.
        support.environment_altered = False
        xml_list = [] if ns.xmlpath else None
        support.junit_xml_list = xml_list
        if ns.failfast:
            support.failfast = True

        if capture:
            support.verbose = True
            # Capture the test's output; echo it only when it fails.
            buffer = io.StringIO()
            saved_stdout = sys.stdout
            saved_stderr = sys.stderr
            sys.stdout = buffer
            sys.stderr = buffer
            try:
                result = runtest_inner(ns, test, display_failure=False)
                if result[0] != PASSED:
                    saved_stderr.write(buffer.getvalue())
                    saved_stderr.flush()
            finally:
                sys.stdout = saved_stdout
                sys.stderr = saved_stderr
        else:
            support.verbose = ns.verbose  # Tell tests to be moderately quiet
            result = runtest_inner(ns, test, display_failure=not ns.verbose)

        if xml_list:
            import xml.etree.ElementTree as ET
            xml_data = [ET.tostring(x).decode('us-ascii') for x in xml_list]
        else:
            xml_data = None

        return result + (xml_data, )
    finally:
        if timeout is not None:
            faulthandler.cancel_dump_traceback_later()
        cleanup_test_droppings(test, ns.verbose)
        support.junit_xml_list = None
def runtest(ns, test):
    """Run a single test.

    ns -- regrtest namespace of options
    test -- the name of the test

    Returns the tuple (result, test_time), where result is one of the
    constants:

        INTERRUPTED      KeyboardInterrupt when run under -j
        RESOURCE_DENIED  test skipped because resource denied
        SKIPPED          test skipped for some other reason
        ENV_CHANGED      test failed because it changed the execution environment
        FAILED           test failed
        PASSED           test passed
    """
    capture = ns.verbose3
    timeout = ns.timeout
    if timeout is not None:
        # Watchdog: dump all tracebacks and abort if the test hangs.
        faulthandler.dump_traceback_later(timeout, exit=True)
    try:
        support.match_tests = ns.match_tests
        # Clear the flag so only changes made by THIS test are detected.
        support.environment_altered = False
        if ns.failfast:
            support.failfast = True

        if capture:
            support.verbose = True

            # Reuse one StringIO across all runtest() calls: some tests
            # (eg. test_argparse) keep a reference to sys.stdout or
            # sys.stderr, so a fresh object each time would break them.
            stream = runtest.stringio
            if stream is None:
                stream = io.StringIO()
                runtest.stringio = stream
            else:
                stream.seek(0)
                stream.truncate()

            saved_stdout = sys.stdout
            saved_stderr = sys.stderr
            sys.stdout = stream
            sys.stderr = stream
            try:
                result = runtest_inner(ns, test, display_failure=False)
                if result[0] != PASSED:
                    saved_stderr.write(stream.getvalue())
                    saved_stderr.flush()
            finally:
                sys.stdout = saved_stdout
                sys.stderr = saved_stderr
        else:
            support.verbose = ns.verbose  # Tell tests to be moderately quiet
            result = runtest_inner(ns, test, display_failure=not ns.verbose)
        return result
    finally:
        if timeout is not None:
            faulthandler.cancel_dump_traceback_later()
        cleanup_test_droppings(test, ns.verbose)
def runtest(ns, test):
    """Run a single test.

    ns -- regrtest namespace of options.  Fields read here:
        verbose            -- if true, print more messages
        quiet              -- if true, don't print 'skipped' messages
        huntrleaks         -- run multiple times to test for leaks;
                              requires a debug build; a triple
                              corresponding to -R's three arguments
        verbose3           -- if true, display test output on failure
        timeout            -- dump the traceback and exit if a test takes
                              more than timeout seconds
        failfast, match_tests -- See regrtest command-line flags for these
        pgo                -- if true, suppress any info irrelevant to
                              generating a PGO build
    test -- the name of the test

    Returns the tuple result, test_time, where result is one of the
    constants:

        INTERRUPTED      KeyboardInterrupt when run under -j
        RESOURCE_DENIED  test skipped because resource denied
        SKIPPED          test skipped for some other reason
        ENV_CHANGED      test failed because it changed the execution environment
        FAILED           test failed
        PASSED           test passed
    """
    verbose = ns.verbose
    quiet = ns.quiet
    huntrleaks = ns.huntrleaks
    output_on_failure = ns.verbose3
    failfast = ns.failfast
    match_tests = ns.match_tests
    timeout = ns.timeout
    pgo = ns.pgo

    use_timeout = (timeout is not None)
    if use_timeout:
        # Watchdog: dump all tracebacks and abort if the test hangs.
        faulthandler.dump_traceback_later(timeout, exit=True)
    try:
        support.match_tests = match_tests
        if failfast:
            support.failfast = True
        if output_on_failure:
            support.verbose = True

            # Reuse the same instance to all calls to runtest(). Some
            # tests keep a reference to sys.stdout or sys.stderr
            # (eg. test_argparse).
            if runtest.stringio is None:
                stream = io.StringIO()
                runtest.stringio = stream
            else:
                stream = runtest.stringio
                stream.seek(0)
                stream.truncate()

            orig_stdout = sys.stdout
            orig_stderr = sys.stderr
            try:
                sys.stdout = stream
                sys.stderr = stream
                result = runtest_inner(ns, test, verbose, quiet, huntrleaks,
                                       display_failure=False, pgo=pgo)
                # Only FAILED results echo the captured output; output of
                # ENV_CHANGED or skipped tests is discarded.
                if result[0] == FAILED:
                    output = stream.getvalue()
                    orig_stderr.write(output)
                    orig_stderr.flush()
            finally:
                sys.stdout = orig_stdout
                sys.stderr = orig_stderr
        else:
            support.verbose = verbose  # Tell tests to be moderately quiet
            result = runtest_inner(ns, test, verbose, quiet, huntrleaks,
                                   display_failure=not verbose, pgo=pgo)
        return result
    finally:
        if use_timeout:
            faulthandler.cancel_dump_traceback_later()
        cleanup_test_droppings(test, verbose)
import logging
import unittest
from unittest import SkipTest

from test.test_environment import env, db_user, CLIENT_PEM

try:
    # Enable the fault handler to dump the traceback of each running
    # thread after a segfault.
    import faulthandler
    faulthandler.enable()

    # Dump the tracebacks of all threads after 25 minutes, as a hang
    # detector for the whole test run.
    if hasattr(faulthandler, 'dump_traceback_later'):
        faulthandler.dump_traceback_later(25 * 60)
except ImportError:
    # faulthandler is optional (absent on very old interpreters).
    pass


def suppress_tornado_warnings():
    # Silence tornado's noisy request/general loggers during tests;
    # only ERROR and above will be emitted.
    for name in ['tornado.general', 'tornado.access']:
        logger = logging.getLogger(name)
        logger.setLevel(logging.ERROR)


class SkippedModule(object):
    # NOTE(review): ``runTest`` is defined inside __init__ but never
    # bound to anything, so as written it is discarded immediately and
    # ``name`` is unused.  Presumably this class stands in for a test
    # module that could not be imported and should raise SkipTest when
    # run — confirm whether the nested function was meant to be attached
    # to the instance, or whether the class body is truncated here.
    def __init__(self, name, reason):
        def runTest(self):
            raise SkipTest(str(reason))