Пример #1
0
    def _get_result(self) -> QueueOutput | None:
        """Return the next item from the output queue, or None if none is left.

        While at least one worker is alive, the queue is polled every
        PROGRESS_UPDATE seconds and the tests reported by get_running() are
        logged between polls (unless ns.pgo is set). Once no worker is alive,
        one last non-blocking read picks up a result that is still queued.
        """
        watchdog_enabled = self.ns.timeout is not None
        poll_interval = PROGRESS_UPDATE

        # bpo-46205: worker liveness is re-evaluated before every poll so an
        # empty queue cannot keep this method waiting forever.
        while True:
            if not any(thread.is_alive() for thread in self.workers):
                break

            if watchdog_enabled:
                # Arm the faulthandler watchdog before blocking on the queue.
                faulthandler.dump_traceback_later(MAIN_PROCESS_TIMEOUT,
                                                  exit=True)

            # Block until a worker publishes something or the poll expires.
            try:
                return self.output.get(timeout=poll_interval)
            except queue.Empty:
                pass

            # Nothing arrived during this poll: report what is still running.
            active = get_running(self.workers)
            if active and not self.ns.pgo:
                self.log('running: %s' % ', '.join(active))

        # Every worker has stopped: pick up a result that may still be queued.
        try:
            pending = self.output.get(timeout=0)
        except queue.Empty:
            pending = None
        return pending
Пример #2
0
def main() -> None:
    """Run a worker process described by the env file given on the command line.

    Exactly one command-line argument is expected: the path of the serialized
    worker process context. Without it, an error is printed to stderr and the
    process exits with status 1.
    """
    cli_args = sys.argv[1:]
    if len(cli_args) != 1:
        print(
            "worker_process_env_path must be provided as a commandline argument",
            file=sys.stderr)
        sys.exit(1)

    # Load the worker process env from the path passed by the launcher.
    env_path = pathlib.Path(cli_args[0])
    process_ctx = layers.WorkerProcessContext.from_file(env_path)

    config_logging(process_ctx)

    # In debug mode, dump the traceback of all threads every 30 seconds.
    if process_ctx.env.experiment_config.debug_enabled():
        faulthandler.dump_traceback_later(30, repeat=True)

    # Connect to the ZMQBroadcastServer running in this container.
    pub_url = f"tcp://localhost:{process_ctx.broadcast_pub_port}"
    sub_url = f"tcp://localhost:{process_ctx.broadcast_pull_port}"
    with ipc.ZMQBroadcastClient(pub_url, sub_url) as broadcast_client:

        # Expose the communication layer as an iterable stream of workloads.
        receiver = layers.SubprocessReceiver(broadcast_client)

        load.prepare_controller(
            process_ctx.env,
            iter(receiver),
            process_ctx.load_path,
            process_ctx.rendezvous_info,
            process_ctx.hvd_config,
        ).run()
Пример #3
0
    def _get_result(self) -> QueueOutput | None:
        """Return the next item from the output queue, or None if none is left.

        While at least one worker is alive, the queue is polled every
        PROGRESS_UPDATE seconds and the running tests are logged between
        polls (unless ns.pgo is set). Once no worker is alive, a final
        non-blocking read returns a pending result, or None if the queue is
        empty.
        """
        use_faulthandler = (self.ns.timeout is not None)
        timeout = PROGRESS_UPDATE

        # bpo-46205: check the status of workers on every iteration. The
        # previous ``while True`` loop only checked once on entry, so workers
        # exiting without queuing a result left it waiting forever on an
        # empty queue.
        while any(worker.is_alive() for worker in self.workers):
            if use_faulthandler:
                faulthandler.dump_traceback_later(MAIN_PROCESS_TIMEOUT,
                                                  exit=True)

            # wait for a thread
            try:
                return self.output.get(timeout=timeout)
            except queue.Empty:
                pass

            # display progress
            running = get_running(self.workers)
            if running and not self.ns.pgo:
                self.log('running: %s' % ', '.join(running))

        # all worker threads are done: consume pending results
        try:
            return self.output.get(timeout=0)
        except queue.Empty:
            return None
Пример #4
0
    def setUpClass(cls):
        """Install a no-op SIGALRM handler, start the interval timer and arm a watchdog."""
        def discard_alarm(*args):
            return None

        # Remember the previous handler on the class (presumably restored by
        # the matching teardown, which is not visible here).
        cls.orig_handler = signal.signal(signal.SIGALRM, discard_alarm)
        signal.setitimer(signal.ITIMER_REAL,
                         cls.signal_delay,
                         cls.signal_period)

        # Issue #25277: Use faulthandler to try to debug a hang on FreeBSD
        watchdog_seconds = 10 * 60
        faulthandler.dump_traceback_later(watchdog_seconds, exit=True)
Пример #5
0
def main():
    """Run thread_func on num_threads threads and check that pandas gets imported."""
    # Watchdog: in case of an import deadlock, crash after a finite timeout.
    faulthandler.dump_traceback_later(timeout, exit=True)
    with ThreadPoolExecutor(num_threads) as executor:
        # pandas is imported lazily, so it must not be loaded yet.
        assert "pandas" not in sys.modules
        # Drain the result iterator so every call completes (and any
        # exception raised by a worker is re-raised here).
        for _ in executor.map(thread_func, range(num_threads)):
            pass
        assert "pandas" in sys.modules
Пример #6
0
    def main(self, tests=None, **kwargs):
        """Parse options, prepare the temporary directory and run the tests.

        With ns.cleanup set, only the cleanup is performed and the process
        exits with status 0. Otherwise the tests run inside a temporary
        working directory; a SystemExit raised while running is re-raised
        with the same code after arming an exit watchdog.
        """
        self.parse_args(kwargs)

        self.set_temp_dir()

        if self.ns.cleanup:
            self.cleanup()
            sys.exit(0)

        work_dir = self.create_temp_dir()

        try:
            # Temporarily switch the CWD to a writable temporary directory.
            # If it cannot be created or entered, the original CWD is used
            # (quiet=True). The original CWD stays available as
            # os_helper.SAVEDCWD.
            with os_helper.temp_cwd(work_dir, quiet=True):
                # With multiprocessing, worker processes use work_dir as
                # their parent temporary directory, so removing it when the
                # main process exits also removes the workers'
                # subdirectories.
                self.ns.tempdir = work_dir

                self._main(tests, kwargs)
        except SystemExit as exit_request:
            # bpo-38203: Python can hang at exit in Py_Finalize(), especially
            # on threading._shutdown() call: put a timeout
            faulthandler.dump_traceback_later(EXIT_TIMEOUT, exit=True)

            sys.exit(exit_request.code)
Пример #7
0
def build_and_run_training_pipeline(env: det.EnvContext) -> None:
    """Assemble the layers of the training pipeline for ``env`` and run it.

    The pipeline is socket manager -> workload manager -> either a
    subprocess launcher (when the Horovod config is in use) or an in-process
    controller.
    """
    # The socket manager connects to the master and reads messages until it
    # receives the rendezvous_info.
    #
    # TODO(ryan): Pull profiler hooks out of SocketManager and into their own layer.
    with layers.SocketManager(env) as socket_mgr:

        # The storage manager downloads the initial checkpoint below and is
        # also handed to the workload manager, which creates and stores
        # checkpoints during training.
        checkpoint_storage = storage.build(env.experiment_config["checkpoint_storage"])

        tensorboard_mgr, tensorboard_writer = load.prepare_tensorboard(env)

        # The workload manager receives workloads from the socket manager and
        # augments them with additional arguments. It also hosts generic
        # workload hooks (timing workloads, preparing checkpoints, uploading
        # completed checkpoints) and sanity-checks response messages that
        # originate from the trial.
        #
        # TODO(ryan): Refactor WorkloadManager into separate layers that do each separate task.
        workload_mgr = layers.build_workload_manager(
            env,
            iter(socket_mgr),
            socket_mgr.get_rendezvous_info(),
            checkpoint_storage,
            tensorboard_mgr,
            tensorboard_writer,
        )

        hvd_config = horovod.HorovodContext.from_configs(
            env.experiment_config, socket_mgr.get_rendezvous_info(), env.hparams
        )
        logging.info(f"Horovod config: {hvd_config.__dict__}.")

        # Load the checkpoint, if necessary. Every possible sink of this
        # pipeline needs access to it.
        with maybe_load_checkpoint(checkpoint_storage, env.latest_checkpoint) as load_path:

            # Horovod distributed training is done inside subprocesses.
            if hvd_config.use:
                layers.SubprocessLauncher(
                    env, iter(workload_mgr), load_path, socket_mgr.get_rendezvous_info(), hvd_config
                ).run()
                return

            # In-process training: in debug mode, dump the traceback of all
            # threads every 30 seconds.
            if env.experiment_config.debug_enabled():
                faulthandler.dump_traceback_later(30, repeat=True)

            load.prepare_controller(
                env,
                iter(workload_mgr),
                load_path,
                socket_mgr.get_rendezvous_info(),
                hvd_config,
            ).run()
Пример #8
0
def maybe_periodic_stacktraces(debug_enabled: bool) -> Iterator[None]:
    """Yield once, dumping tracebacks every 30 seconds meanwhile if requested.

    When ``debug_enabled`` is true, the periodic faulthandler dump is armed
    before yielding and cancelled afterwards, even if the code running at the
    yield point raises. When it is false, this only yields.
    """
    if not debug_enabled:
        yield
        return

    faulthandler.dump_traceback_later(30, repeat=True)
    try:
        yield
    finally:
        faulthandler.cancel_dump_traceback_later()
Пример #9
0
def pytest_unconfigure(config):
    """Arm a 60-second faulthandler watchdog that dumps tracebacks and exits."""
    # The watchdog is global and meant to debug deadlocks that happen once
    # pytest has completed, for instance in atexit finalizers. At this point
    # the stdout/stderr capture of pytest should be disabled.
    #
    # Note that a shorter timeout is also used for the per-test callback
    # configured via the pytest-timeout extension.
    from faulthandler import dump_traceback_later as arm_watchdog

    arm_watchdog(timeout=60, exit=True)
Пример #10
0
def _runtest(ns, test_name):
    """Run one test and return its outcome as a TestResult.

    Handles the faulthandler timeout (ns.timeout), the optional capture of
    stdout+stderr (ns.verbose3), the XML serialization of the collected JUnit
    elements (ns.xmlpath) and the measurement of the elapsed time.
    """
    capture_output = ns.verbose3

    timeout_enabled = ns.timeout is not None
    if timeout_enabled:
        faulthandler.dump_traceback_later(ns.timeout, exit=True)

    started = time.perf_counter()
    try:
        support.set_match_tests(ns.match_tests)
        junit_elements = [] if ns.xmlpath else None
        support.junit_xml_list = junit_elements
        if ns.failfast:
            support.failfast = True

        if not capture_output:
            # Tell tests to be moderately quiet
            support.verbose = ns.verbose

            result = _runtest_inner(ns,
                                    test_name,
                                    display_failure=not ns.verbose)
        else:
            support.verbose = True

            buffer = io.StringIO()
            saved_stdout, saved_stderr = sys.stdout, sys.stderr
            try:
                sys.stdout = sys.stderr = buffer
                result = _runtest_inner(ns, test_name, display_failure=False)
                if result != PASSED:
                    # Replay the captured output on the original stderr.
                    saved_stderr.write(buffer.getvalue())
                    saved_stderr.flush()
            finally:
                sys.stdout, sys.stderr = saved_stdout, saved_stderr

        xml_data = None
        if junit_elements:
            import xml.etree.ElementTree as ET
            xml_data = [ET.tostring(element).decode('us-ascii')
                        for element in junit_elements]

        elapsed = time.perf_counter() - started

        return TestResult(test_name, result, elapsed, xml_data)
    finally:
        # Always disarm the watchdog and drop the shared XML list.
        if timeout_enabled:
            faulthandler.cancel_dump_traceback_later()
        support.junit_xml_list = None
Пример #11
0
    def setUp(self):
        """Reset the signal counter, start the SIGALRM interval timer and arm a watchdog."""
        self.signals = 0

        previous_handler = signal.signal(signal.SIGALRM, self.sighandler)
        self.orig_handler = previous_handler
        signal.setitimer(signal.ITIMER_REAL,
                         self.signal_delay,
                         self.signal_period)

        # Watchdog to debug a hanging test: after 10 minutes faulthandler
        # dumps the tracebacks to the original stderr and exits.
        watchdog_seconds = 10 * 60
        faulthandler.dump_traceback_later(watchdog_seconds,
                                          file=sys.__stderr__,
                                          exit=True)
Пример #12
0
def runtest(ns, test):
    """Run a single test and return what runtest_inner() produced for it.

    ns -- regrtest namespace of options
    test -- the name of the test

    The returned value is the tuple (result, test_time); result is one of:

        INTERRUPTED      KeyboardInterrupt when run under -j
        RESOURCE_DENIED  test skipped because resource denied
        SKIPPED          test skipped for some other reason
        ENV_CHANGED      test failed because it changed the execution environment
        FAILED           test failed
        PASSED           test passed
    """
    capture_output = ns.verbose3
    timeout_enabled = ns.timeout is not None
    if timeout_enabled:
        faulthandler.dump_traceback_later(ns.timeout, exit=True)
    try:
        support.match_tests = ns.match_tests
        if ns.failfast:
            support.failfast = True

        if not capture_output:
            support.verbose = ns.verbose
            return runtest_inner(ns, test, display_failure=not ns.verbose)

        support.verbose = True
        # A single StringIO is shared by all calls (kept on the function
        # object); it is emptied before being reused.
        buffer = runtest.stringio
        if buffer is None:
            buffer = runtest.stringio = io.StringIO()
        else:
            buffer.seek(0)
            buffer.truncate()

        saved_stdout, saved_stderr = sys.stdout, sys.stderr
        try:
            sys.stdout = sys.stderr = buffer
            result = runtest_inner(ns, test, display_failure=False)
            if result[0] != PASSED:
                # Replay the captured output on the original stderr.
                saved_stderr.write(buffer.getvalue())
                saved_stderr.flush()
        finally:
            sys.stdout, sys.stderr = saved_stdout, saved_stderr
        return result
    finally:
        if timeout_enabled:
            faulthandler.cancel_dump_traceback_later()
        cleanup_test_droppings(test, ns.verbose)
Пример #13
0
def exit_on_deadlock():
    """Yield once while a 5-second faulthandler watchdog guards the caller.

    If the guarded code does not finish within the timeout, faulthandler dumps
    the tracebacks to stderr and exits the process. The watchdog is cancelled
    as soon as the guarded code finishes, whether it returns or raises. When
    faulthandler cannot be imported, the guarded code runs unprotected.

    This is a generator yielding exactly once (presumably wrapped with
    contextlib.contextmanager by a decorator outside this block).
    """
    TIMEOUT = 5
    # Keep only the imports inside the ImportError guard: an ImportError
    # raised by the guarded code itself must propagate instead of being
    # swallowed here and triggering a second yield.
    try:
        from faulthandler import dump_traceback_later
        from faulthandler import cancel_dump_traceback_later
    except ImportError:
        yield
        return

    from sys import stderr
    dump_traceback_later(timeout=TIMEOUT, exit=True, file=stderr)
    try:
        yield
    finally:
        # Always disarm: otherwise an exception in the guarded code would
        # leave the watchdog armed and it would kill the process later.
        cancel_dump_traceback_later()
Пример #14
0
    def setUp(self):
        """Reset the signal counter, start the SIGALRM interval timer and arm a watchdog."""
        self.signals = 0

        previous_handler = signal.signal(signal.SIGALRM, self.sighandler)
        self.orig_handler = previous_handler
        signal.setitimer(signal.ITIMER_REAL,
                         self.signal_delay,
                         self.signal_period)

        # Watchdog to debug a hanging test (timeout of 10 minutes); only
        # armed when this faulthandler build provides dump_traceback_later().
        arm_watchdog = getattr(faulthandler, 'dump_traceback_later', None)
        if arm_watchdog is not None:
            arm_watchdog(10 * 60, exit=True, file=sys.__stderr__)
Пример #15
0
 def test_stderr_None(self):
     """Run each faulthandler entry point inside check_stderr_none()."""
     # Issue #21497: provide a helpful error if sys.stderr is None,
     # instead of just an attribute error: "None has no attribute fileno".
     attempts = [
         lambda: faulthandler.enable(),
         lambda: faulthandler.dump_traceback(),
         lambda: faulthandler.dump_traceback_later(1e-3),
     ]
     # register() is only probed when this build provides it.
     if hasattr(faulthandler, "register"):
         attempts.append(lambda: faulthandler.register(signal.SIGUSR1))

     for attempt in attempts:
         with self.check_stderr_none():
             attempt()
Пример #16
0
def pytest_runtest_protocol(item):
    """Wrap the run of ``item`` with a faulthandler traceback-dump timer.

    The timeout comes from the ``faulthandler_timeout`` ini value (missing or
    empty counts as 0). When it is positive, a dump to
    ``item.config.fault_handler_stderr`` is scheduled before yielding and
    cancelled afterwards; otherwise this only yields.
    """
    configured = item.config.getini("faulthandler_timeout")
    timeout = float(configured or 0.0)
    if not (timeout > 0):
        yield
        return

    import faulthandler

    dump_target = item.config.fault_handler_stderr
    faulthandler.dump_traceback_later(timeout, file=dump_target)
    try:
        yield
    finally:
        faulthandler.cancel_dump_traceback_later()
Пример #17
0
 def test_stderr_None(self):
     """Run each available faulthandler entry point inside check_stderr_none()."""
     # Issue #21497: provide a helpful error if sys.stderr is None,
     # instead of just an attribute error: "None has no attribute fileno".
     attempts = [
         lambda: faulthandler.enable(),
         lambda: faulthandler.dump_traceback(),
     ]
     # Optional entry points are only probed when this build provides them.
     if hasattr(faulthandler, 'dump_traceback_later'):
         attempts.append(lambda: faulthandler.dump_traceback_later(1e-3))
     if hasattr(faulthandler, "register"):
         attempts.append(lambda: faulthandler.register(signal.SIGUSR1))

     for attempt in attempts:
         with self.check_stderr_none():
             attempt()
Пример #18
0
def pytest_runtest_protocol(item: Item) -> Generator[None, None, None]:
    """Wrap the run of ``item`` with a faulthandler traceback-dump timer.

    The timer is only scheduled when the configured timeout is positive and a
    stderr file was stored on the config; it is cancelled once the wrapped
    run is over. Otherwise this only yields.
    """
    timeout = get_timeout_config_value(item.config)
    dump_target = item.config._store[fault_handler_stderr_key]
    if not (timeout > 0 and dump_target is not None):
        yield
        return

    import faulthandler

    faulthandler.dump_traceback_later(timeout, file=dump_target)
    try:
        yield
    finally:
        faulthandler.cancel_dump_traceback_later()
Пример #19
0
    def pytest_runtest_protocol(self, item):
        """Wrap the run of ``item`` with a faulthandler traceback-dump timer.

        The timer is only scheduled when the configured timeout is positive
        and ``item.config.fault_handler_stderr`` is not None; it is cancelled
        once the wrapped run is over. Otherwise this only yields.
        """
        timeout = self.get_timeout_config_value(item.config)
        dump_target = item.config.fault_handler_stderr
        if not (timeout > 0 and dump_target is not None):
            yield
            return

        import faulthandler

        faulthandler.dump_traceback_later(timeout, file=dump_target)
        try:
            yield
        finally:
            faulthandler.cancel_dump_traceback_later()
def pytest_runtest_protocol(item):
    """Wrap the run of ``item`` with a faulthandler traceback-dump timer.

    The timer is scheduled only when the ``fault_handler`` option is enabled,
    ``fault_handler_timeout`` is positive and timeout_support_available()
    reports support; it is cancelled once the wrapped run is over. Otherwise
    this only yields.
    """
    enabled = item.config.getoption('fault_handler')
    timeout = item.config.getoption('fault_handler_timeout')
    timeout_supported = timeout_support_available()
    if not (enabled and timeout > 0 and timeout_supported):
        yield
        return

    import faulthandler
    dump_target = item.config.fault_handler_stderr
    faulthandler.dump_traceback_later(timeout, file=dump_target)
    try:
        yield
    finally:
        faulthandler.cancel_dump_traceback_later()
def pytest_runtest_protocol(item):
    """Wrap the run of ``item`` with a faulthandler traceback-dump timer.

    Three conditions must hold for the timer to be scheduled: the
    ``fault_handler`` option is enabled, ``fault_handler_timeout`` is
    positive, and timeout_support_available() reports support. The timer is
    cancelled once the wrapped run is over. Otherwise this only yields.
    """
    enabled = item.config.getoption('fault_handler')
    timeout = item.config.getoption('fault_handler_timeout')
    timeout_supported = timeout_support_available()
    timer_wanted = enabled and timeout > 0 and timeout_supported
    if not timer_wanted:
        yield
        return

    import faulthandler
    dump_target = item.config.fault_handler_stderr
    faulthandler.dump_traceback_later(timeout, file=dump_target)
    try:
        yield
    finally:
        faulthandler.cancel_dump_traceback_later()
Пример #22
0
def main() -> None:
    """Run a worker process described by the env file given on the command line.

    Exactly one command-line argument is expected: the path of the serialized
    worker process context. Without it, an error is printed to stderr and the
    process exits with status 1.

    An exception raised by ``controller.run()`` is reported through the
    broadcast client and then re-raised.
    """
    if len(sys.argv) != 2:
        print("worker_process_env_path must be provided as a commandline argument", file=sys.stderr)
        sys.exit(1)

    # Load the worker process env.
    worker_process_env_path = pathlib.Path(sys.argv[1])
    worker_process_env = layers.WorkerProcessContext.from_file(worker_process_env_path)

    config_logging(worker_process_env)

    # API code expects credential to be available as an environment variable
    os.environ["DET_TASK_TOKEN"] = worker_process_env.env.det_task_token

    # TODO: refactor websocket, data_layer, and profiling to not use the cli_cert.
    master_url = (
        f"http{'s' if worker_process_env.env.use_tls else ''}://"
        f"{worker_process_env.env.master_addr}:{worker_process_env.env.master_port}"
    )
    certs.cli_cert = certs.default_load(master_url=master_url)

    # In debug mode, dump the traceback of all threads every 30 seconds.
    if worker_process_env.env.experiment_config.debug_enabled():
        faulthandler.dump_traceback_later(30, repeat=True)

    # Establish the connection to the ZMQBroadcastServer in this container.
    pub_url = f"tcp://localhost:{worker_process_env.broadcast_pub_port}"
    sub_url = f"tcp://localhost:{worker_process_env.broadcast_pull_port}"
    with ipc.ZMQBroadcastClient(pub_url, sub_url) as broadcast_client:

        # Wrap the communication layer in a workload.Stream.
        subrec = layers.SubprocessReceiver(broadcast_client)
        workloads = iter(subrec)

        with det._catch_sys_exit():
            with det._catch_init_invalid_hp(workloads):
                controller = load.prepare_controller(
                    worker_process_env.env,
                    workloads,
                    worker_process_env.load_path,
                    worker_process_env.rendezvous_info,
                    worker_process_env.hvd_config,
                )

            try:
                controller.run()

            except Exception:
                # Report the failure over the broadcast channel, then
                # re-raise the active exception unchanged.
                broadcast_client.send_exception_message()
                raise
Пример #23
0
    def _get_result(self):
        """Return the next item from the output queue, or None if none is left.

        While at least one worker is alive, the queue is polled and the
        running tests are printed between polls (unless ns.pgo is set). Once
        no worker is alive, a final non-blocking read returns a pending
        result, or None if the queue is empty.
        """
        # The poll interval does not change between iterations.
        timeout = max(PROGRESS_UPDATE, PROGRESS_MIN_TIME)

        # bpo-46205: check the status of workers on every iteration. The
        # previous ``while True`` loop only checked once on entry, so workers
        # exiting without queuing a result left it waiting forever on an
        # empty queue.
        while any(worker.is_alive() for worker in self.workers):
            if self.main_timeout is not None:
                faulthandler.dump_traceback_later(self.main_timeout, exit=True)

            # wait for a thread
            try:
                return self.output.get(timeout=timeout)
            except queue.Empty:
                pass

            # display progress
            running = get_running(self.workers)
            if running and not self.ns.pgo:
                print('running: %s' % ', '.join(running), flush=True)

        # all worker threads are done: consume pending results
        try:
            return self.output.get(timeout=0)
        except queue.Empty:
            return None
Пример #24
0
    def _check_dump_traceback_later(self, repeat, cancel, filename):
        """
        Run a script that arms dump_traceback_later() and check what it dumps.

        The script arms the timer with TIMEOUT seconds, then sleeps twice for
        timeout x 1.25 seconds (timeout x 2.5 seconds in total) before
        cancelling the timer. If cancel is true, the timer is cancelled
        before sleeping and no output is expected. Otherwise the traceback
        is expected once, or twice if repeat is true.

        repeat -- value passed as repeat= to dump_traceback_later()
        cancel -- if true, the script cancels the timer right away
        filename -- if true, the script dumps into this file instead of the
                    default output; it is also passed to self.get_output()

        Raise an error if the output doesn't match the expected format or if
        the exit code is not 0.
        """
        # Text of the "Timeout (...)!" header expected in the dump.
        timeout_str = str(datetime.timedelta(seconds=TIMEOUT))
        # Script template; the %s placeholders are filled in below with
        # timeout, repeat, cancel, "use a file?" and the repr of the filename.
        code = """
import faulthandler
import time

def func(repeat, cancel, timeout):
    if cancel:
        faulthandler.cancel_dump_traceback_later()
    for loop in range(2):
        time.sleep(timeout * 1.25)
    faulthandler.cancel_dump_traceback_later()

timeout = %s
repeat = %s
cancel = %s
if %s:
    file = open(%s, "wb")
else:
    file = None
faulthandler.dump_traceback_later(timeout,
    repeat=repeat, file=file)
func(repeat, cancel, timeout)
if file is not None:
    file.close()
""".strip()
        code = code % (TIMEOUT, repeat, cancel,
                       bool(filename), repr(filename))
        trace, exitcode = self.get_output(code, filename)
        trace = '\n'.join(trace)

        if not cancel:
            if repeat:
                count = 2
            else:
                count = 1
            header = r'Timeout \(%s\)!\nCurrent thread XXX:\n' % timeout_str
            # 8 and 20 are the lines of the stripped script holding the
            # time.sleep() call and the func() call (presumably the frames
            # expected_traceback() matches -- verify against that helper).
            regex = expected_traceback(8, 20, header, count=count)
            self.assertRegex(trace, regex)
        else:
            # The timer was cancelled before it could fire.
            self.assertEqual(trace, '')
        self.assertEqual(exitcode, 0)
Пример #25
0
def runtest(ns, test):
    """Run a single test and return its outcome extended with the XML data.

    ns -- regrtest namespace of options
    test -- the name of the test

    The returned tuple is (result, test_time, xml_data); result is one of
    the constants:

        INTERRUPTED      KeyboardInterrupt when run under -j
        RESOURCE_DENIED  test skipped because resource denied
        SKIPPED          test skipped for some other reason
        ENV_CHANGED      test failed because it changed the execution environment
        FAILED           test failed
        PASSED           test passed
        EMPTY_TEST_SUITE test ran no subtests.

    xml_data is the list of serialized testsuite elements when ns.xmlpath is
    set and elements were generated, otherwise None.
    """

    capture_output = ns.verbose3

    timeout_enabled = ns.timeout is not None
    if timeout_enabled:
        faulthandler.dump_traceback_later(ns.timeout, exit=True)
    try:
        support.set_match_tests(ns.match_tests)
        # Clear the environment_altered flag so that an alteration made by
        # this test can be detected.
        support.environment_altered = False
        junit_elements = [] if ns.xmlpath else None
        support.junit_xml_list = junit_elements
        if ns.failfast:
            support.failfast = True

        if not capture_output:
            support.verbose = ns.verbose  # Tell tests to be moderately quiet
            result = runtest_inner(ns, test, display_failure=not ns.verbose)
        else:
            support.verbose = True

            buffer = io.StringIO()
            saved_stdout, saved_stderr = sys.stdout, sys.stderr
            try:
                sys.stdout = sys.stderr = buffer
                result = runtest_inner(ns, test, display_failure=False)
                if result[0] != PASSED:
                    # Replay the captured output on the original stderr.
                    saved_stderr.write(buffer.getvalue())
                    saved_stderr.flush()
            finally:
                sys.stdout, sys.stderr = saved_stdout, saved_stderr

        xml_data = None
        if junit_elements:
            import xml.etree.ElementTree as ET
            xml_data = [ET.tostring(element).decode('us-ascii')
                        for element in junit_elements]
        return result + (xml_data,)
    finally:
        if timeout_enabled:
            faulthandler.cancel_dump_traceback_later()
        cleanup_test_droppings(test, ns.verbose)
        support.junit_xml_list = None
Пример #26
0
def runtest(test, verbose, quiet,
            huntrleaks=False, use_resources=None,
            output_on_failure=False, failfast=False, match_tests=None,
            timeout=None):
    """Run a single test and return what runtest_inner() produced for it.

    test -- the name of the test
    verbose -- if true, print more messages
    quiet -- if true, don't print 'skipped' messages (probably redundant)
    huntrleaks -- run multiple times to test for leaks; requires a debug
                  build; a triple corresponding to -R's three arguments
    use_resources -- list of extra resources to use
    output_on_failure -- if true, display test output on failure
    timeout -- dump the traceback and exit if a test takes more than
               timeout seconds
    failfast, match_tests -- See regrtest command-line flags for these.

    The returned tuple is (result, test_time); result is one of the constants:
        INTERRUPTED      KeyboardInterrupt when run under -j
        RESOURCE_DENIED  test skipped because resource denied
        SKIPPED          test skipped for some other reason
        ENV_CHANGED      test failed because it changed the execution environment
        FAILED           test failed
        PASSED           test passed
    """

    if use_resources is not None:
        support.use_resources = use_resources
    timeout_enabled = timeout is not None
    if timeout_enabled:
        faulthandler.dump_traceback_later(timeout, exit=True)
    try:
        support.match_tests = match_tests
        if failfast:
            support.failfast = True

        if not output_on_failure:
            support.verbose = verbose  # Tell tests to be moderately quiet
            return runtest_inner(test, verbose, quiet, huntrleaks,
                                 display_failure=not verbose)

        support.verbose = True

        # One StringIO instance serves all calls to runtest(): some tests
        # keep a reference to sys.stdout or sys.stderr (eg. test_argparse).
        buffer = runtest.stringio
        if buffer is None:
            buffer = runtest.stringio = io.StringIO()
        else:
            buffer.seek(0)
            buffer.truncate()

        saved_stdout, saved_stderr = sys.stdout, sys.stderr
        try:
            sys.stdout = sys.stderr = buffer
            result = runtest_inner(test, verbose, quiet, huntrleaks,
                                   display_failure=False)
            if result[0] == FAILED:
                # Replay the captured output on the original stderr.
                saved_stderr.write(buffer.getvalue())
                saved_stderr.flush()
        finally:
            sys.stdout, sys.stderr = saved_stdout, saved_stderr
        return result
    finally:
        if timeout_enabled:
            faulthandler.cancel_dump_traceback_later()
        cleanup_test_droppings(test, verbose)
Пример #27
0
def _runtest(ns: Namespace, test_name: str) -> TestResult:
    """Run one test and return the result object produced by _runtest_inner().

    Handles the faulthandler timeout (ns.timeout), the optional capture of
    stdout+stderr (ns.verbose3), the XML serialization of the collected JUnit
    elements (ns.xmlpath) and the measurement of the elapsed time, which is
    stored on the result as duration_sec.
    """

    capture_output = ns.verbose3

    timeout_enabled = ns.timeout is not None
    if timeout_enabled:
        faulthandler.dump_traceback_later(ns.timeout, exit=True)

    started = time.perf_counter()
    try:
        support.set_match_tests(ns.match_tests, ns.ignore_tests)
        junit_elements = [] if ns.xmlpath else None
        support.junit_xml_list = junit_elements
        if ns.failfast:
            support.failfast = True

        if not capture_output:
            # Tell tests to be moderately quiet
            support.verbose = ns.verbose

            result = _runtest_inner(ns, test_name,
                                    display_failure=not ns.verbose)
        else:
            support.verbose = True

            buffer = io.StringIO()
            saved_stdout, saved_stderr = sys.stdout, sys.stderr
            print_warning = support.print_warning
            saved_warning_stderr = print_warning.orig_stderr

            captured = None
            try:
                sys.stdout = sys.stderr = buffer
                # print_warning() also writes into the temporary stream so
                # that the order of messages is preserved. If
                # support.environment_altered becomes true, warnings will be
                # written to sys.stderr below.
                print_warning.orig_stderr = buffer

                result = _runtest_inner(ns, test_name,
                                        display_failure=False)
                if not isinstance(result, Passed):
                    captured = buffer.getvalue()
            finally:
                sys.stdout, sys.stderr = saved_stdout, saved_stderr
                print_warning.orig_stderr = saved_warning_stderr

            # Replay the captured output once the streams are restored.
            if captured is not None:
                sys.stderr.write(captured)
                sys.stderr.flush()

        if junit_elements:
            import xml.etree.ElementTree as ET
            result.xml_data = [
                ET.tostring(element).decode('us-ascii')
                for element in junit_elements
            ]

        result.duration_sec = time.perf_counter() - started
        return result
    finally:
        # Always disarm the watchdog and drop the shared XML list.
        if timeout_enabled:
            faulthandler.cancel_dump_traceback_later()
        support.junit_xml_list = None
Пример #28
0
def run_tests_multiprocess(regrtest):
    """Run regrtest.tests on worker threads and report their results.

    Starts regrtest.ns.use_mp MultiprocessThread workers sharing one iterator
    of pending tests and one output queue, then consumes the queue: each
    result is accumulated on regrtest, a progress line is displayed and the
    captured stdout/stderr of the child is copied. A KeyboardInterrupt (or an
    INTERRUPTED result) marks the run as interrupted. Finally, waits until
    no worker reports a current test any more.
    """
    output = queue.Queue()
    pending = MultiprocessIterator(regrtest.tests)
    test_timeout = regrtest.ns.timeout
    use_timeout = (test_timeout is not None)

    workers = [
        MultiprocessThread(pending, output, regrtest.ns)
        for i in range(regrtest.ns.use_mp)
    ]
    print("Run tests in parallel using %s child processes" % len(workers))
    for worker in workers:
        worker.start()

    def get_running(workers):
        """Return 'test (duration)' strings for tests running for at least PROGRESS_MIN_TIME."""
        running = []
        for worker in workers:
            current_test = worker.current_test
            if not current_test:
                continue
            dt = time.monotonic() - worker.start_time
            if dt >= PROGRESS_MIN_TIME:
                text = '%s (%s)' % (current_test, format_duration(dt))
                running.append(text)
        return running

    # Number of workers that reported completion so far.
    finished = 0
    test_index = 1
    get_timeout = max(PROGRESS_UPDATE, PROGRESS_MIN_TIME)
    try:
        while finished < regrtest.ns.use_mp:
            # Arm the faulthandler watchdog before each wait on the queue.
            if use_timeout:
                faulthandler.dump_traceback_later(test_timeout, exit=True)

            try:
                item = output.get(timeout=get_timeout)
            except queue.Empty:
                # Nothing arrived in time: show what is still running.
                running = get_running(workers)
                if running and not regrtest.ns.pgo:
                    print('running: %s' % ', '.join(running), flush=True)
                continue

            test, stdout, stderr, result = item
            # An item without a test name is counted as one finished worker
            # (presumably the sentinel a worker queues when it stops).
            if test is None:
                finished += 1
                continue
            regrtest.accumulate_result(test, result)

            # Display progress
            ok, test_time = result
            text = format_test_result(test, ok)
            if (ok not in (CHILD_ERROR, INTERRUPTED)
                    and test_time >= PROGRESS_MIN_TIME
                    and not regrtest.ns.pgo):
                text += ' (%.0f sec)' % test_time
            elif ok == CHILD_ERROR:
                text = '%s (%s)' % (text, test_time)
            running = get_running(workers)
            if running and not regrtest.ns.pgo:
                text += ' -- running: %s' % ', '.join(running)
            regrtest.display_progress(test_index, text)

            # Copy stdout and stderr from the child process
            if stdout:
                print(stdout, flush=True)
            if stderr and not regrtest.ns.pgo:
                print(stderr, file=sys.stderr, flush=True)

            # Handle an interrupted child like Ctrl-C in this process.
            if result[0] == INTERRUPTED:
                raise KeyboardInterrupt
            test_index += 1
    except KeyboardInterrupt:
        regrtest.interrupted = True
        pending.interrupted = True
        print()
    finally:
        if use_timeout:
            faulthandler.cancel_dump_traceback_later()

    # If tests are interrupted, wait until tests complete
    wait_start = time.monotonic()
    while True:
        running = [worker.current_test for worker in workers]
        running = list(filter(bool, running))
        if not running:
            break

        dt = time.monotonic() - wait_start
        line = "Waiting for %s (%s tests)" % (', '.join(running), len(running))
        if dt >= WAIT_PROGRESS:
            line = "%s since %.0f sec" % (line, dt)
        print(line, flush=True)
        # Give each worker up to WAIT_PROGRESS seconds before re-checking.
        for worker in workers:
            worker.join(WAIT_PROGRESS)
Пример #29
0
    pythonDict["Variables"], pythonDict["Expression"] = sympy.cse(code)
    for i, expr in enumerate(pythonDict["Variables"]):
        pythonDict["Variables"][i] = {
            "name": str(expr[0]),
            "expr": str(expressionToCode(expr[1], language))
        }
    pythonDict["Expression"] = expressionToCode(pythonDict["Expression"][0],
                                                language)
    return pythonDict


# Begin Parsing
faulthandler.enable()

try:
    # Watchdog: if parsing has not finished after 10 seconds, faulthandler
    # dumps the traceback of all threads. exit=True is not passed, so the
    # script itself keeps running.
    faulthandler.dump_traceback_later(10)

    command, language = sys.argv[1], sys.argv[2]
    expression = sympy.sympify(sys.argv[3])
    assert expression is not None, "SymPy Error: Cannot evaluate expression!"

    result = None
    # 'eval' is the only command handled: print the converted expression
    # as indented JSON.
    if command == 'eval':
        print(json.dumps(convertSymPyToDict(expression, language), indent=4))
    sys.stdout.flush()

finally:
    # Disarm the watchdog whether parsing succeeded or raised.
    faulthandler.cancel_dump_traceback_later()
Пример #30
0
def exit_on_deadlock():
    """Arm a 30-second faulthandler watchdog around the caller's work.

    This is a single-``yield`` generator: the watchdog is armed before the
    yield and disarmed when the generator is resumed, closed, or has an
    exception thrown into it.  If the code running while the generator is
    suspended takes longer than 30 seconds, faulthandler dumps the
    traceback of every thread and terminates the process (``exit=True``).

    NOTE(review): presumably wrapped by ``contextlib.contextmanager`` or
    used as a fixture; the decorator is not visible here -- confirm.
    """
    dump_traceback_later(timeout=30, exit=True)
    try:
        yield
    finally:
        # Must run on the error path too: otherwise an exception raised by
        # the guarded code leaves the watchdog armed and the process is
        # killed 30 seconds later, even though nothing is deadlocked.
        cancel_dump_traceback_later()
Пример #31
0
def main():
    """Run one Synth simulation from start to finish, then exit the process.

    Steps, in order: read the parameters, set up logging, build the client,
    the engine and the event source, start the ZeroMQ receive/transmit
    layers, run the engine's event loop until no events remain, then flush,
    close and report the outcome to Slack.

    Returns None early (after logging an error) when the parameters define
    no "client" or no "engine".  Otherwise the function does not return: it
    calls ``exit(0)`` when the run raised nothing and ``exit(-1)`` after
    posting the traceback to Slack when it did.
    """
    global g_get_sim_time
    global g_instance_name
    global g_asked_to_pause

    # Handler for packets received over ZeroMQ.  "event" and "command"
    # packets are acted on only when their Instancename header matches this
    # instance; "announce" packets are always logged.  It reads the `client`
    # and `engine` locals that are bound further down in main().
    def incomingAsyncEvent(
            packet
    ):  # CAUTION: Called asynchronously from the ZeroMQ rx thread
        logging.info("incoming async event " + str(packet))
        if "action" in packet:
            if packet["action"] == "event":
                if packet["headers"]["Instancename"] == g_instance_name:
                    logging.info("Matches this instance name, so dispatching")
                    engine.register_event_in(0, device_factory.external_event,
                                             packet, None)
            elif packet["action"] == "announce":
                logging.log(packet["severity"],
                            "[broadcast message] " + packet["message"])
            elif packet["action"] == "command":
                if packet["headers"]["Instancename"] == g_instance_name:
                    argv = packet["argv"]
                    logging.info("Received async command " + str(argv))
                    if len(argv) > 0:
                        client.async_command(argv)

    # Passed to the engine constructor; reads the `events` local, which is
    # only bound later in main().
    def event_count_callback():
        return events.event_count

    logging.getLogger().setLevel(logging.INFO)
    # Output directories are created relative to the current working directory.
    os.makedirs("../synth_logs", exist_ok=True)
    os.makedirs("../synth_accounts", exist_ok=True)

    params = get_params()
    # presumably get_params() sets g_instance_name as a side effect -- TODO confirm
    assert g_instance_name is not None, "Instance name has not been defined, but this is required for logfile naming"
    init_logging(params)
    logging.info("*** Synth starting at real time " + str(datetime.now()) +
                 " ***")
    logging.info(
        "Parameters:\n" +
        json.dumps(params, sort_keys=True, indent=4, separators=(',', ': ')))
    post_to_slack("Started")

    install_signal_catcher()

    Tstart = time.time()  # Human time
    Tstart_process = time.process_time()  # Time CPU usage
    random.seed(12345)  # Ensure reproduceability

    # The client class is looked up by the type named in params['client'].
    if not "client" in params:
        logging.error("No client defined to receive simulation results")
        return
    client = importer.get_class('client',
                                params['client']['type'])(g_instance_name,
                                                          params,
                                                          params['client'])

    # The engine class is looked up the same way from params['engine'].
    if not "engine" in params:
        logging.error("No simulation engine defined")
        return
    engine = importer.get_class('engine', params['engine']['type'])(
        params['engine'], client.enter_interactive, event_count_callback)
    # Publish the engine's clock accessor through a module-level global.
    g_get_sim_time = engine.get_now_no_lock

    # NOTE(review): a missing "events" key only logs a warning here, but the
    # params["events"] lookup on the next line then raises KeyError -- confirm
    # whether a run without events is meant to be supported.
    if not "events" in params:
        logging.warning("No events defined")
    events = Events(client, engine, g_instance_name, params, params["events"])

    zeromq_rx.init(incomingAsyncEvent, emit_logging=True)
    zeromq_tx.init(emit_logging=True)

    logging.info("Simulation starts")

    faulthandler.dump_traceback_later(
        600, repeat=True
    )  # TEMP - every 10 minutes emit a stack trace for every thread - to diagnose hanging issue
    # err_str stays empty unless the event loop below raises.
    err_str = ""
    try:
        while engine.events_to_come():
            engine.next_event()
            client.tick()
            # presumably g_asked_to_pause is set by the handler that
            # install_signal_catcher() registers -- TODO confirm
            if g_asked_to_pause:
                g_asked_to_pause = False
                logging.info("Paused")
                signal.pause(
                )  # Suspend this process. Receiving any signal will then cause us to resume
                logging.info("Resuming")
        # Only reached when the loop finishes without raising.
        device_factory.close()
    except:
        # Bare except: this also catches KeyboardInterrupt and SystemExit.
        err_str = traceback.format_exc(
        )  # Report any exception, but continue to clean-up anyway
        logging.error("Error at real time " + str(datetime.now()) + " (local)")
        logging.error(err_str)

    logging.info("Simulation ends")
    logging.info("Ending device logging (" +
                 str(len(device_factory.g_devices)) +
                 " devices were emulated)")
    events.flush()
    client.close()

    logging.info("Elapsed real time: " + str(int(time.time() - Tstart)) +
                 " seconds.")
    logging.info("CPU time used: " +
                 str(int(time.process_time() - Tstart_process)) + " seconds.")

    # The process exit status reflects whether the event loop raised.
    if err_str == "":
        post_to_slack("Finished OK")
        exit(0)
    post_to_slack(err_str)
    exit(-1)
Пример #32
0
def run_tests_multiprocess(regrtest):
    """Run the tests of *regrtest* through a pool of worker threads.

    ``regrtest.ns.use_mp`` MultiprocessThread workers are started.  They
    share a single MultiprocessIterator over ``regrtest.tests`` and post
    ``(test, stdout, stderr, result)`` items on one queue; an item whose
    test is None counts one worker as finished.  Each real result is
    accumulated on *regrtest*, reported with display_progress(), and the
    child's stdout/stderr are echoed.  An INTERRUPTED result or a
    KeyboardInterrupt marks both *regrtest* and the iterator as
    interrupted.  Before returning, the function waits until no worker
    reports a current test.
    """
    results_queue = queue.Queue()
    pending = MultiprocessIterator(regrtest.tests)
    ns = regrtest.ns
    test_timeout = ns.timeout
    use_timeout = test_timeout is not None

    workers = [MultiprocessThread(pending, results_queue, ns)
               for _ in range(ns.use_mp)]
    print("Run tests in parallel using %s child processes"
          % len(workers))
    for thread in workers:
        thread.start()

    def long_running(threads):
        # Describe the tests that have been running for at least
        # PROGRESS_MIN_TIME seconds.
        descriptions = []
        for thread in threads:
            name = thread.current_test
            if name:
                elapsed = time.monotonic() - thread.start_time
                if elapsed >= PROGRESS_MIN_TIME:
                    descriptions.append(
                        '%s (%s)' % (name, format_duration(elapsed)))
        return descriptions

    done_workers = 0
    test_index = 1
    poll_interval = max(PROGRESS_UPDATE, PROGRESS_MIN_TIME)
    try:
        while done_workers < ns.use_mp:
            # Re-arm the watchdog on every iteration of the main loop.
            if use_timeout:
                faulthandler.dump_traceback_later(test_timeout, exit=True)

            try:
                item = results_queue.get(timeout=poll_interval)
            except queue.Empty:
                # Nothing finished in time: report what is still running.
                active = long_running(workers)
                if active and not ns.pgo:
                    print('running: %s' % ', '.join(active), flush=True)
                continue

            test, stdout, stderr, result = item
            if test is None:
                # Sentinel: one worker has no more tests to run.
                done_workers += 1
                continue
            regrtest.accumulate_result(test, result)

            # Build the progress line for this result.
            ok, test_time = result
            text = format_test_result(test, ok)
            if ok == CHILD_ERROR:
                text = '%s (%s)' % (text, test_time)
            elif (ok != INTERRUPTED
                  and test_time >= PROGRESS_MIN_TIME
                  and not ns.pgo):
                text += ' (%s)' % format_duration(test_time)
            active = long_running(workers)
            if active and not ns.pgo:
                text += ' -- running: %s' % ', '.join(active)
            regrtest.display_progress(test_index, text)

            # Echo the output captured from the child process.
            if stdout:
                print(stdout, flush=True)
            if stderr and not ns.pgo:
                print(stderr, file=sys.stderr, flush=True)

            if ok == INTERRUPTED:
                raise KeyboardInterrupt
            test_index += 1
    except KeyboardInterrupt:
        regrtest.interrupted = True
        pending.interrupted = True
        print()
    finally:
        if use_timeout:
            faulthandler.cancel_dump_traceback_later()

    # After an interruption some tests may still be running: wait for them.
    wait_start = time.monotonic()
    while True:
        still_running = [name
                         for name in (w.current_test for w in workers)
                         if name]
        if not still_running:
            break

        waited = time.monotonic() - wait_start
        line = "Waiting for %s (%s tests)" % (', '.join(still_running),
                                              len(still_running))
        if waited >= WAIT_PROGRESS:
            line = "%s since %.0f sec" % (line, waited)
        print(line, flush=True)
        for thread in workers:
            thread.join(WAIT_PROGRESS)
Пример #33
0
def main(args):
    """Train the model selected by ``args.model`` on SQuAD.

    Sets up logging, TensorBoard and devices, seeds the RNGs, loads the
    word/character embeddings, builds (or restores) the model, then runs
    the training loop until ``args.num_epochs`` is reached.  Whenever
    ``args.eval_steps`` examples have been processed, the EMA weights are
    evaluated on the dev set, a checkpoint is saved and the metrics are
    logged to the console and TensorBoard.

    *args* is mutated: ``save_dir``, ``gpu_ids`` and ``batch_size`` are
    overwritten.  The visible code has no return statement.
    """
    # Set up faulthandler
    faulthandler.enable()
    # Set up logging and devices
    args.save_dir = util.get_save_dir(args.save_dir, args.name, training=True)
    log = util.get_logger(args.save_dir, args.name)
    tbx = SummaryWriter(args.save_dir)
    device, args.gpu_ids = util.get_available_devices()
    log.info(f'Args: {dumps(vars(args), indent=4, sort_keys=True)}')
    # Scale the batch size by the number of GPU ids (at least 1).
    args.batch_size *= max(1, len(args.gpu_ids))

    # Set random seed
    log.info(f'Using random seed {args.seed}...')
    random.seed(args.seed)
    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    # Get embeddings
    log.info('Loading embeddings...')
    word_vectors = util.torch_from_json(args.word_emb_file)
    char_vectors = util.torch_from_json(args.char_emb_file)

    # Get model
    log.info('Building model...')
    model_params = {
        'word_vectors': word_vectors,
        'char_vectors': char_vectors,
        'args': args
    }
    model = get_model(args.model, model_params)
    # Size of the parameters in MiB: element count times bytes per element.
    print('Model size: {:f} MB'.format(
        sum(p.nelement() * p.element_size()
            for p in model.parameters()) / (1024 * 1024)))
    # model = nn.DataParallel(model, args.gpu_ids)
    if args.load_path:
        log.info(f'Loading checkpoint from {args.load_path}...')
        model, step = util.load_model(model, args.load_path, args.gpu_ids)
    else:
        step = 0
    model = model.to(device)
    model.train()
    ema = util.EMA(model, args.ema_decay)

    # Get saver
    saver = util.CheckpointSaver(args.save_dir,
                                 max_checkpoints=args.max_checkpoints,
                                 metric_name=args.metric_name,
                                 maximize_metric=args.maximize_metric,
                                 log=log)

    # Get optimizer and scheduler
    optimizer = optim.Adadelta(model.parameters(),
                               args.lr,
                               weight_decay=args.l2_wd)
    scheduler = sched.LambdaLR(optimizer, lambda s: 1.)  # Constant LR

    # Get data loader
    log.info('Building dataset...')
    train_dataset = SQuAD(args.train_record_file, args.use_squad_v2)
    train_loader = data.DataLoader(train_dataset,
                                   batch_size=args.batch_size,
                                   shuffle=True,
                                   num_workers=args.num_workers,
                                   collate_fn=collate_fn)
    dev_dataset = SQuAD(args.dev_record_file, args.use_squad_v2)
    dev_loader = data.DataLoader(dev_dataset,
                                 batch_size=args.batch_size,
                                 shuffle=False,
                                 num_workers=args.num_workers,
                                 collate_fn=collate_fn)

    # Train
    log.info('Training...')
    steps_till_eval = args.eval_steps
    # `step` is advanced by the batch size below, so dividing it by the
    # dataset length gives the epoch to resume from.
    epoch = step // len(train_dataset)
    # NOTE(review): the `!=` test never becomes false if the resumed epoch
    # is already greater than args.num_epochs -- confirm this cannot happen.
    while epoch != args.num_epochs:
        epoch += 1
        log.info(f'Starting epoch {epoch}...')
        with torch.enable_grad(), \
                tqdm(total=len(train_loader.dataset)) as progress_bar:
            for cw_idxs, cc_idxs, qw_idxs, qc_idxs, y1, y2, ids in train_loader:
                progress_bar.set_description(
                    'Batch data_loading finished'.ljust(30))
                progress_bar.refresh()

                # Setup for forward
                cw_idxs = cw_idxs.to(device)
                qw_idxs = qw_idxs.to(device)
                cc_idxs = cc_idxs.to(device)
                qc_idxs = qc_idxs.to(device)
                # presumably dimension 0 is the batch dimension -- TODO confirm
                batch_size = cw_idxs.size(0)
                optimizer.zero_grad()
                progress_bar.set_description(
                    'Batch initialization finished'.ljust(30))
                progress_bar.refresh()

                # Forward
                # Watchdog: dump all thread tracebacks if the forward pass
                # takes more than 3 seconds.  No exit=True is passed, so
                # training continues afterwards.  The cancel call is skipped
                # if the model call raises.
                faulthandler.dump_traceback_later(timeout=3)
                log_p1, log_p2 = model(cw_idxs, cc_idxs, qw_idxs, qc_idxs)
                faulthandler.cancel_dump_traceback_later()
                progress_bar.set_description(
                    'Batch forward finished'.ljust(30))
                progress_bar.refresh()
                y1, y2 = y1.to(device), y2.to(device)
                # The loss is the sum of the NLL losses for the two outputs.
                loss = F.nll_loss(log_p1, y1) + F.nll_loss(log_p2, y2)
                loss_val = loss.item()

                # Backward
                # Same 3-second traceback watchdog around the backward pass.
                faulthandler.dump_traceback_later(timeout=3)
                loss.backward()
                faulthandler.cancel_dump_traceback_later()
                progress_bar.set_description(
                    'Batch backward finished'.ljust(30))
                progress_bar.refresh()
                nn.utils.clip_grad_norm_(model.parameters(),
                                         args.max_grad_norm)
                optimizer.step()
                progress_bar.set_description('Optimization finished'.ljust(30))
                progress_bar.refresh()
                scheduler.step()
                # presumably the second argument is the number of updates so
                # far -- verify against util.EMA
                ema(model, step // batch_size)

                # Log info
                step += batch_size
                progress_bar.update(batch_size)
                progress_bar.set_postfix(epoch=epoch, NLL=loss_val)
                tbx.add_scalar('train/NLL', loss_val, step)
                tbx.add_scalar('train/LR', optimizer.param_groups[0]['lr'],
                               step)

                # Count down to the next evaluation by the batch size.
                steps_till_eval -= batch_size
                if steps_till_eval <= 0:
                    steps_till_eval = args.eval_steps

                    # Evaluate and save checkpoint
                    log.info(f'Evaluating at step {step}...')
                    # ema.assign() runs before evaluation and saving;
                    # ema.resume() runs afterwards.
                    ema.assign(model)
                    results, pred_dict = evaluate(model, dev_loader, device,
                                                  args.dev_eval_file,
                                                  args.max_ans_len,
                                                  args.use_squad_v2)
                    progress_bar.set_description(
                        'Evaluation finished'.ljust(30))
                    progress_bar.refresh()
                    saver.save(step, model, results[args.metric_name], device)
                    ema.resume(model)

                    # Log to console
                    results_str = ', '.join(f'{k}: {v:05.2f}'
                                            for k, v in results.items())
                    log.info(f'Dev {results_str}')

                    # Log to TensorBoard
                    log.info('Visualizing in TensorBoard...')
                    for k, v in results.items():
                        tbx.add_scalar(f'dev/{k}', v, step)
                    util.visualize(tbx,
                                   pred_dict=pred_dict,
                                   eval_path=args.dev_eval_file,
                                   step=step,
                                   split='dev',
                                   num_visuals=args.num_visuals)
def runtest(ns, test):
    """Run one test and return its outcome.

    ns -- regrtest namespace of options
    test -- the name of the test

    The return value is the tuple (result, test_time, xml_data).  result
    is one of these constants:

        INTERRUPTED      KeyboardInterrupt when run under -j
        RESOURCE_DENIED  test skipped because resource denied
        SKIPPED          test skipped for some other reason
        ENV_CHANGED      test failed because it changed the execution environment
        FAILED           test failed
        PASSED           test passed
        EMPTY_TEST_SUITE test ran no subtests.

    xml_data is None unless ns.xmlpath is set and testsuite elements were
    generated; in that case it is the list of those elements, each
    serialized to a string.
    """

    capture_output = ns.verbose3
    timeout_enabled = ns.timeout is not None

    if timeout_enabled:
        # Watchdog: dump tracebacks and exit if the test exceeds ns.timeout.
        faulthandler.dump_traceback_later(ns.timeout, exit=True)
    try:
        support.set_match_tests(ns.match_tests)
        # Clear the flag so that a test altering the environment is detected.
        support.environment_altered = False
        xml_list = [] if ns.xmlpath else None
        support.junit_xml_list = xml_list
        if ns.failfast:
            support.failfast = True

        if not capture_output:
            support.verbose = ns.verbose  # Tell tests to be moderately quiet
            result = runtest_inner(ns, test, display_failure=not ns.verbose)
        else:
            support.verbose = True

            # Buffer everything the test prints; replay it on the real
            # stderr only when the test did not pass.
            buffer = io.StringIO()
            saved_stdout, saved_stderr = sys.stdout, sys.stderr
            try:
                sys.stdout = sys.stderr = buffer
                result = runtest_inner(ns, test, display_failure=False)
                if result[0] != PASSED:
                    saved_stderr.write(buffer.getvalue())
                    saved_stderr.flush()
            finally:
                sys.stdout, sys.stderr = saved_stdout, saved_stderr

        xml_data = None
        if xml_list:
            import xml.etree.ElementTree as ET
            xml_data = [ET.tostring(element).decode('us-ascii')
                        for element in xml_list]
        return result + (xml_data, )
    finally:
        if timeout_enabled:
            faulthandler.cancel_dump_traceback_later()
        cleanup_test_droppings(test, ns.verbose)
        support.junit_xml_list = None
Пример #35
0
def runtest(ns, test):
    """Run one test and return its outcome.

    ns -- regrtest namespace of options
    test -- the name of the test

    The return value is the tuple (result, test_time), where result is one
    of these constants:

        INTERRUPTED      KeyboardInterrupt when run under -j
        RESOURCE_DENIED  test skipped because resource denied
        SKIPPED          test skipped for some other reason
        ENV_CHANGED      test failed because it changed the execution environment
        FAILED           test failed
        PASSED           test passed
    """

    capture_output = ns.verbose3
    timeout_enabled = ns.timeout is not None

    if timeout_enabled:
        # Watchdog: dump tracebacks and exit if the test exceeds ns.timeout.
        faulthandler.dump_traceback_later(ns.timeout, exit=True)
    try:
        support.match_tests = ns.match_tests
        # Clear the flag so that a test altering the environment is detected.
        support.environment_altered = False
        if ns.failfast:
            support.failfast = True

        if not capture_output:
            support.verbose = ns.verbose  # Tell tests to be moderately quiet
            return runtest_inner(ns, test, display_failure=not ns.verbose)

        support.verbose = True

        # One StringIO is shared by every runtest() call because some tests
        # keep a reference to sys.stdout or sys.stderr (eg. test_argparse).
        # It is emptied before being reused.
        buffer = runtest.stringio
        if buffer is None:
            buffer = runtest.stringio = io.StringIO()
        else:
            buffer.seek(0)
            buffer.truncate()

        saved_stdout, saved_stderr = sys.stdout, sys.stderr
        try:
            sys.stdout = sys.stderr = buffer
            result = runtest_inner(ns, test, display_failure=False)
            if result[0] != PASSED:
                # Replay the captured output on the real stderr.
                saved_stderr.write(buffer.getvalue())
                saved_stderr.flush()
        finally:
            sys.stdout, sys.stderr = saved_stdout, saved_stderr
        return result
    finally:
        if timeout_enabled:
            faulthandler.cancel_dump_traceback_later()
        cleanup_test_droppings(test, ns.verbose)
Пример #36
0
def runtest(ns, test):
    """Run one test and return its outcome.

    ns -- regrtest namespace of options; the attributes read here are
          verbose, quiet, huntrleaks, verbose3 (display test output on
          failure), failfast, match_tests, timeout (dump the traceback and
          exit if the test takes more than that many seconds) and pgo
    test -- the name of the test

    Returns the tuple result, test_time, where result is one of the constants:
        INTERRUPTED      KeyboardInterrupt when run under -j
        RESOURCE_DENIED  test skipped because resource denied
        SKIPPED          test skipped for some other reason
        ENV_CHANGED      test failed because it changed the execution environment
        FAILED           test failed
        PASSED           test passed
    """

    # Read every option up front, before the watchdog is armed.
    verbose = ns.verbose
    quiet = ns.quiet
    huntrleaks = ns.huntrleaks
    capture_output = ns.verbose3
    failfast = ns.failfast
    match_tests = ns.match_tests
    timeout = ns.timeout
    pgo = ns.pgo

    def run_inner(display_failure):
        # Shared call into runtest_inner(); only display_failure varies.
        return runtest_inner(ns, test, verbose, quiet, huntrleaks,
                             display_failure=display_failure, pgo=pgo)

    timeout_enabled = timeout is not None
    if timeout_enabled:
        faulthandler.dump_traceback_later(timeout, exit=True)
    try:
        support.match_tests = match_tests
        if failfast:
            support.failfast = True

        if not capture_output:
            support.verbose = verbose  # Tell tests to be moderately quiet
            return run_inner(not verbose)

        support.verbose = True

        # One StringIO is shared by every runtest() call because some tests
        # keep a reference to sys.stdout or sys.stderr (eg. test_argparse).
        # It is emptied before being reused.
        buffer = runtest.stringio
        if buffer is None:
            buffer = runtest.stringio = io.StringIO()
        else:
            buffer.seek(0)
            buffer.truncate()

        saved_stdout, saved_stderr = sys.stdout, sys.stderr
        try:
            sys.stdout = sys.stderr = buffer
            result = run_inner(False)
            if result[0] == FAILED:
                # Replay the captured output on the real stderr.
                saved_stderr.write(buffer.getvalue())
                saved_stderr.flush()
        finally:
            sys.stdout, sys.stderr = saved_stdout, saved_stderr
        return result
    finally:
        if timeout_enabled:
            faulthandler.cancel_dump_traceback_later()
        cleanup_test_droppings(test, verbose)
Пример #37
0
import logging
import unittest
from unittest import SkipTest

from test.test_environment import env, db_user, CLIENT_PEM

# Optional diagnostics: skipped entirely when faulthandler cannot be imported.
try:
    import faulthandler
except ImportError:
    pass
else:
    # Dump the traceback of each running thread after a segfault.
    faulthandler.enable()
    # Dump the tracebacks of all threads after 25 minutes, when this
    # faulthandler provides dump_traceback_later().
    _dump_later = getattr(faulthandler, 'dump_traceback_later', None)
    if _dump_later is not None:
        _dump_later(25 * 60)


def suppress_tornado_warnings():
    """Set the 'tornado.general' and 'tornado.access' loggers to ERROR."""
    noisy_loggers = ('tornado.general', 'tornado.access')
    for tornado_logger in map(logging.getLogger, noisy_loggers):
        tornado_logger.setLevel(logging.ERROR)


class SkippedModule(object):
    def __init__(self, name, reason):
        def runTest(self):
            raise SkipTest(str(reason))