Пример #1
0
    def run_py(self):
        """Run the RL-Scope python unit tests via pytest.

        Runs ``python -m pytest`` from the install root; on a non-zero
        pytest return code, logs an error and exits the process with
        that code.
        """
        # TODO: run pytest with appropriate cmdline options.
        # Q: record output?
        args = self.args
        with with_chdir(py_config.INSTALL_ROOT):

            # 'python'
            cmd = [sys.executable]
            if args.Werror:
                cmd.append('-Werror')
            # '-Wignore:::_pytest.assertion.rewrite' Suppresses deprecation warnings
            # in pytest (up to at least version 6.1.1)
            #
            # https://github.com/pytest-dev/pytest/issues/1403#issuecomment-443533232
            cmd.extend(
                ['-Wignore:::_pytest.assertion.rewrite', '-m', 'pytest'])
            if args.debug:
                # BUG FIX: was cmd.append(['--pdb', '-s']), which appends the
                # list itself as a single (invalid) argv element; extend adds
                # the two flags individually.
                cmd.extend(['--pdb', '-s'])

            print_cmd(cmd)
            proc = subprocess.run(cmd)
            if proc.returncode != 0:
                logger.error("RL-Scope python unit tests failed")
                sys.exit(proc.returncode)
        logger.info("RL-Scope python unit tests PASSED")
Пример #2
0
def run_nvidia_smi(cmd_opts=None, debug=False):
    """Run ``nvidia-smi`` with extra options and return its output lines.

    Arguments
    ---------
    cmd_opts: List[str]
        Extra command-line options appended after the nvidia-smi executable.
    debug: bool
        If True, print the command before running it.

    Returns
    -------
    List[str]
        The output lines produced by run_cmd.
    """
    # BUG FIX: the default was a mutable list ([]), which is shared across
    # calls; use None as the sentinel instead.
    if cmd_opts is None:
        cmd_opts = []
    cmd = [
        NVIDIA_SMI_EXEC,
    ] + cmd_opts
    if debug:
        print_cmd(cmd)
    lines = run_cmd(cmd)
    return lines
Пример #3
0
 def run_cpp(self):
     """Run the RL-Scope C++ unit-test binary, optionally under gdb.

     Exits the process with status 1 if the binary is not on PATH, or
     with the binary's return code when the tests fail.
     """
     args = self.args
     test_bin = py_config.CPP_UNIT_TEST_CMD
     if shutil.which(test_bin) is None:
         logger.error(
             "Didn't find C++ test binary ({bin}) on PATH; have you run build_rlscope yet?"
             .format(bin=test_bin, ))
         sys.exit(1)
     run_argv = [test_bin]
     if args.debug:
         # In debug mode, wrap the test binary in an interactive gdb session.
         run_argv = ['gdb', '--args'] + run_argv
     print_cmd(run_argv)
     proc = subprocess.run(run_argv)
     if proc.returncode != 0:
         logger.error("RL-Scope C++ unit tests failed")
         sys.exit(proc.returncode)
     logger.info("RL-Scope C++ unit tests PASSED")
Пример #4
0
def execve_rlscope_binary(binary):
    """Replace the current process with the RL-Scope C++ *binary*.

    Forwards ``sys.argv[1:]`` and the current environment to the binary
    found under CPP_BIN. Does not return on success; exits with status 1
    if the binary does not exist.
    """
    target = _j(CPP_BIN, binary)
    if not os.path.exists(target):
        logger.error("Couldn't find {bin} binary @ {path}".format(
            bin=binary,
            path=target,
        ))
        sys.exit(1)
    argv = [target] + sys.argv[1:]
    if DEBUG:
        print_cmd(argv)
    environ = dict(os.environ)

    # Flush stdio before the process image is replaced, so buffered output
    # is not lost.
    sys.stdout.flush()
    sys.stderr.flush()
    os.execve(target, argv, environ)
    # os.execve replaces the process image; reaching this line is a bug.
    assert False
Пример #5
0
def _parse_entity(cmd_opts, csv_fields, post_process_entity=None, debug=False):
    """Query nvidia-smi and parse its CSV output into a list of dicts.

    Arguments
    ---------
    cmd_opts: List[str]
        nvidia-smi query options (e.g. a --query-gpu=... flag).
    csv_fields: List[str]
        Field names, in the same order as the CSV columns nvidia-smi emits.
    post_process_entity: Optional[Callable[[dict], None]]
        If given, applied in-place to each parsed entity dict.
    debug: bool
        If True, print the command and log any column-count mismatch.

    Returns
    -------
    List[dict]
        One dict per non-blank output line, mapping field name to the
        value produced by parse_value.
    """
    fmt = ','.join([
        'csv',
        'noheader',
        'nounits',
    ])
    cmd = [
        NVIDIA_SMI_EXEC,
        '--format={opts}'.format(opts=fmt),
    ] + cmd_opts

    if debug:
        print_cmd(cmd)
    lines = run_cmd(cmd)
    entities = []
    for line in lines:
        # Skip blank lines.
        if re.search(r'^\s*$', line):
            continue
        fields = line.split(', ')
        if len(fields) != len(csv_fields):
            # Dump the mismatch before asserting, to aid debugging.
            if debug:
                logger.info(
                    pprint_msg({
                        'fields': fields,
                        'csv_fields': csv_fields,
                    }))
            assert len(fields) == len(csv_fields)
        entity = {
            name: parse_value(name, raw)
            for name, raw in zip(csv_fields, fields)
        }
        entities.append(entity)
    if post_process_entity is not None:
        for entity in entities:
            post_process_entity(entity)
    return entities
Пример #6
0
def main():
    """Entry point for the ``rls-prof`` profiler wrapper.

    Parses rls-prof's own arguments (everything before the training
    command), then either delegates to ``rls-calibrate run`` (--calibrate,
    the default) or exec's the training command with RLSCOPE_* environment
    variables and an LD_PRELOAD of librlscope.so configured per --config.
    """

    try:
        check_host.check_config()
    except RLScopeConfigurationError as e:
        logger.error(e)
        sys.exit(1)

    # Split argv into rls-prof's own arguments and the command to profile.
    rlscope_prof_argv, cmd_argv = gather_argv(sys.argv[1:])

    parser = argparse.ArgumentParser(
        description=
        "RL-Scope cross-stack profiler for reinforcement learning workloads.",
        formatter_class=argparse.RawTextHelpFormatter)
    # NOTE: these arguments must precede the executable (python some/script.py), otherwise they will be sent
    # to the training script, and not handled by this script (rls-prof).
    parser.add_argument('--debug', action='store_true')
    parser.add_argument("--verbosity",
                        choices=['progress', 'commands', 'output'],
                        default='progress',
                        help=textwrap.dedent("""\
                            Output information about running commands.
                            --verbosity progress (Default)
                                Only show high-level progress bar information.
                              
                            --verbosity commands
                                Show the command-line of commands that are being run.
                                
                            --verbosity output
                                Show the output of each analysis (not configuration) command on sys.stdout.
                                NOTE: This may cause interleaving of lines.
                            """))
    parser.add_argument('--line-numbers',
                        action='store_true',
                        help=textwrap.dedent("""\
    Show line numbers and timestamps in RL-Scope logging messages.
    """))
    parser.add_argument('--rlscope-debug', action='store_true')
    parser.add_argument('--rlscope-rm-traces-from',
                        help=textwrap.dedent("""\
    Delete traces rooted at this --rlscope-directory. 
    Useful if your training script has multiple training scripts, and you need to use --rlscope-skip-rm-traces 
    when launching the other scripts.
    """))
    # parser.add_argument('--rlscope-disable', action='store_true', help=textwrap.dedent("""\
    #     RL-Scope: Skip any profiling. Used for uninstrumented runs.
    #     Useful for ensuring minimal libcupti registration when we run --cuda-api-calls during config_uninstrumented.
    #
    #     Effect: sets "export RLSCOPE_DISABLE=1" for librlscope.so.
    # """))

    add_bool_arg(parser,
                 '--cuda-api-calls',
                 help=textwrap.dedent("""\
                        Trace CUDA API runtime/driver calls.
                        
                        i.e. total number of calls, and total time (usec) spent in a given API call.
                        
                        Effect: sets "export RLSCOPE_CUDA_API_CALLS=1" for librlscope.so.
                        """))
    add_bool_arg(parser,
                 '--cuda-activities',
                 help=textwrap.dedent("""\
                        Trace CUDA activities (i.e. GPU kernel runtimes, memcpy's).
                        
                        Effect: sets "export RLSCOPE_CUDA_ACTIVITIES=yes" for librlscope.so.
                        """))
    add_bool_arg(parser,
                 '--cuda-api-events',
                 help=textwrap.dedent("""\
                        Trace all the start/end timestamps of CUDA API calls.
                        Needed during instrumented runs so we know when to subtract profiling overheads.
                        
                        Effect: sets "export RLSCOPE_CUDA_API_EVENTS=yes" for librlscope.so.
                        """))
    add_bool_arg(parser,
                 '--gpu-hw',
                 help=textwrap.dedent("""\
                        Collect GPU hardware counters.
                        
                        Effect: sets "export RLSCOPE_GPU_HW=yes" for librlscope.so.
                        """))

    # parser.add_argument('--fuzz-cuda-api', action='store_true',
    #                     help=textwrap.dedent("""\
    #                     Use libcupti to trace ALL CUDA runtime API calls (# of calls, and total time spent in them).
    #                     This is useful for determining which CUDA API's we need to "calibrate subtractions" for.
    #                     NOTE: this SHOULDN'T be used for finding profiling book-keeping "subtractions", since it
    #                     adds a LOT of overhead to add start/end callbacks to all CUDA API functions.
    #
    #                     Effect: sets "export RLSCOPE_FUZZ_CUDA_API=yes" for librlscope.so.
    #                     """))

    parser.add_argument('--pc-sampling',
                        action='store_true',
                        help=textwrap.dedent("""\
                        Perform sample-profiling using CUDA's "PC Sampling" API.
                        
                        Currently, we're just going to record GPUSamplingState.is_gpu_active.
                        
                        Effect: sets "export RLSCOPE_PC_SAMPLING=1" for librlscope.so.
                        """))
    parser.add_argument('--trace-at-start',
                        action='store_true',
                        help=textwrap.dedent("""\
                        Start tracing right at application startup.
                        
                        Effect: sets "export RLSCOPE_TRACE_AT_START=yes" for librlscope.so.
                        """))
    # parser.add_argument('--stream-sampling', action='store_true',
    #                     help=textwrap.dedent("""\
    #                     Poll cudaStreamQuery() to see if the GPU is being used.
    #
    #                     Effect: sets "export RLSCOPE_STREAM_SAMPLING=yes" for librlscope.so.
    #                     """))

    calibrate_help = textwrap.dedent("""\
    Perform multiple runs in order to calibrate for profiling overhead 
    specific to the workload being run.
    """).rstrip()
    parser.add_argument("--calibrate",
                        dest='calibrate',
                        action='store_true',
                        default=True,
                        help=calibrate_help)
    parser.add_argument("--no-calibrate",
                        dest='calibrate',
                        action='store_false',
                        help=calibrate_help)

    parser.add_argument("--re-calibrate",
                        action='store_true',
                        help=textwrap.dedent("""\
                            Remove existing profiling overhead calibration files, and recompute them.
                            """))
    parser.add_argument("--re-plot",
                        action='store_true',
                        help=textwrap.dedent("""\
                            Remove existing plots and remake them (NOTE: doesn't recompute analysis; see --re-calibrate).
                            """))

    parallel_runs_help = textwrap.dedent("""\
                            Parallelize running configurations across GPUs on this machine (assume no CPU interference). 
                            See --gpus.
                            """)
    parser.add_argument("--parallel-runs",
                        dest='parallel_runs',
                        action='store_true',
                        default=True,
                        help=parallel_runs_help)
    parser.add_argument("--no-parallel-runs",
                        dest='parallel_runs',
                        action='store_false',
                        help=parallel_runs_help)

    parser.add_argument("--retry",
                        type=int,
                        help=textwrap.dedent("""\
                            If a command fails, retry it up to --retry times.
                            Default: don't retry.
                            """))
    parser.add_argument("--dry-run",
                        action='store_true',
                        help=textwrap.dedent("""\
                            Dry run
                            """))
    # parser.add_argument("--gpus",
    #                     action='store_true',
    #                     help=textwrap.dedent("""\
    #                         Parallelize running configurations across GPUs on this machine (assume no CPU inteference). See --rlscope-gpus
    #                         """))
    parser.add_argument("--gpus",
                        help=textwrap.dedent("""\
                        # Run on the first GPU only
                        --gpus 0
                        # Run on the first 2 GPUs
                        --gpus 0,1
                        # Run on all available GPUs
                        --gpus all
                        # Don't allow running with any GPUs (CUDA_VISIBLE_DEVICES="")
                        --gpus none
                        """))
    parser.add_argument(
        '--config',
        choices=[
            'interception',
            'no-interception',
            'gpu-activities',
            'gpu-activities-api-time',
            'no-gpu-activities',
            'full',
            'time-breakdown',
            'gpu-hw',
            'uninstrumented',
        ],
        # Detect if user provides --config or not.
        # By default, run with full RL-Scope instrumentation.
        # default=DEFAULT_CONFIG,
        help=textwrap.dedent("""\
                        For measuring LD_PRELOAD CUDA API interception overhead:
                            interception:
                                Enable LD_PRELOAD CUDA API interception.
                                $ rls-prof --debug --cuda-api-calls --cuda-api-events --rlscope-disable
                            no-interception:
                                Disable LD_PRELOAD CUDA API interception.
                                $ rls-prof --debug --rlscope-disable
                                
                        For measuring CUPTI GPU activity gathering overhead on a per CUDA API call basis.
                            gpu-activities:
                                Enable CUPTI GPU activity recording.
                                $ rls-prof --debug --cuda-api-calls --cuda-activities --rlscope-disable
                            no-gpu-activities:
                                Disable CUPTI GPU activity recording.
                                $ rls-prof --debug --cuda-api-calls --rlscope-disable
                                
                        Expect (for the above configurations):
                        You should run train.py with these arguments set
                        
                            # Since we are comparing total training time, 
                            # run each configuration with the same number of training loop steps.
                            --rlscope-max-passes $N
                            
                            # Disable any pyprof or old tfprof tracing code.
                            --rlscope-disable
                                
                        For collecting full RL-Scope traces for using with rls-run / rlscope-drill:
                            full:
                                Enable all of tfprof and pyprof collection.
                                $ rls-prof --cuda-api-calls --cuda-api-events --cuda-activities --rlscope-disable
                                NOTE: we still use --rlscope-disable to prevent "old" tfprof collection.
                                
                        gpu-hw:
                          ONLY collect GPU hardware counters
                        """))
    args = parser.parse_args(rlscope_prof_argv)

    is_debug = args.debug or args.rlscope_debug or is_env_true('RLSCOPE_DEBUG')
    rlscope_logging.setup_logger(
        debug=is_debug,
        line_numbers=is_debug or args.line_numbers
        or py_config.is_development_mode(),
    )

    if args.rlscope_rm_traces_from is not None:
        logger.info(
            "rls-prof: Delete trace-files rooted at --rlscope-directory = {dir}"
            .format(dir=args.rlscope_rm_traces_from))
        return

    # Locate librlscope.so and build the environment additions (LD_PRELOAD)
    # that the exec'd training command will inherit.
    rlscope_api.find_librlscope()
    so_path = rlscope_api.RLSCOPE_CLIB
    assert so_path is not None
    env = dict(os.environ)
    add_env = dict()
    add_env['LD_PRELOAD'] = "{ld}:{so_path}".format(ld=env.get(
        'LD_PRELOAD', ''),
                                                    so_path=so_path)
    # Q: I just want LD_LIBRARY_PATH to get printed...
    if 'LD_LIBRARY_PATH' in env:
        add_env['LD_LIBRARY_PATH'] = env['LD_LIBRARY_PATH']
    # if 'LD_LIBRARY_PATH' in env:
    #     add_env['LD_LIBRARY_PATH'] = env['LD_LIBRARY_PATH']

    # Set args.<attr> only if the user did not provide it on the command line.
    def _set_if_none(attr, value):
        if getattr(args, attr) is None:
            setattr(args, attr, value)

    # Remove x from xs if present (argv flags already consumed by rls-prof).
    def maybe_remove(xs, x):
        if x in xs:
            xs.remove(x)

    if args.calibrate:
        if args.config is not None:
            logger.error(
                "Only --calibrate or --config should be provided for rls-prof."
            )
            parser.exit(1)
        # Run calibrate.py
        cmd = ['rls-calibrate', 'run']

        if args.gpu_hw:
            cmd.extend(['--gpu-hw'])
            maybe_remove(rlscope_prof_argv, '--gpu-hw')

        cmd.extend(['--verbosity', args.verbosity])

        if args.parallel_runs:
            cmd.extend(['--parallel-runs'])
            maybe_remove(rlscope_prof_argv, '--parallel-runs')
        else:
            cmd.extend(['--no-parallel-runs'])
            maybe_remove(rlscope_prof_argv, '--no-parallel-runs')

        if args.retry is not None:
            cmd.extend(['--retry', str(args.retry)])

        # Q: Can't we just pass this through?
        # if args.re_calibrate:
        #     cmd.extend(['--re-calibrate'])
        #     rlscope_prof_argv.remove('--re-calibrate')

        # if args.gpus is not None:
        #     cmd.extend(['--gpus', args.gpus])
        # Forward remaining rls-prof flags plus the training command itself.
        maybe_remove(rlscope_prof_argv, '--calibrate')
        cmd.extend(rlscope_prof_argv)
        cmd.extend(cmd_argv)
        # cmd.remove('--calibrate')
        print_cmd(cmd)
        try:
            proc = subprocess.run(cmd, check=False)
            sys.exit(proc.returncode)
        except KeyboardInterrupt:
            logger.info(
                "Saw Ctrl-C during calibration; aborting remaining runs.")
            sys.exit(1)

    if args.config is None:
        args.config = DEFAULT_CONFIG

    # Map --config to the set of tracing features; the bare string literals
    # below are documentation of the equivalent explicit command lines.
    add_env['RLSCOPE_CONFIG'] = args.config
    if args.config == 'interception':
        "rls-prof --debug --cuda-api-calls --cuda-api-events"
        _set_if_none('cuda_api_calls', True)
        _set_if_none('cuda_api_events', True)
    elif args.config in ['no-interception', 'uninstrumented']:
        "rls-prof --debug"
        pass
    elif args.config == 'gpu-hw':
        "$ rls-prof --debug --gpu-hw"
        _set_if_none('cuda_api_calls', False)
        _set_if_none('cuda_api_events', False)
        _set_if_none('cuda_activities', False)
        _set_if_none('gpu_hw', True)
    elif args.config == 'no-gpu-activities':
        "$ rls-prof --debug --cuda-api-calls"
        _set_if_none('cuda_api_calls', True)
        _set_if_none('gpu_hw', False)
    elif args.config == 'gpu-activities':
        "$ rls-prof --debug --cuda-api-calls --cuda-activities"
        _set_if_none('cuda_api_calls', True)
        _set_if_none('cuda_activities', True)
        _set_if_none('gpu_hw', False)
    elif args.config == 'gpu-activities-api-time':
        "$ rls-prof --debug --cuda-api-calls --cuda-api-events --cuda-activities"
        _set_if_none('cuda_api_calls', True)
        _set_if_none('cuda_api_events', True)
        _set_if_none('cuda_activities', True)
        _set_if_none('gpu_hw', False)
    # NOTE(review): args.config cannot be None here (defaulted to
    # DEFAULT_CONFIG above), so the "is None" test is unreachable.
    elif args.config is None or args.config in {'full', 'time-breakdown'}:
        "$ rls-prof --cuda-api-calls --cuda-api-events --cuda-activities"
        _set_if_none('cuda_api_calls', True)
        _set_if_none('cuda_api_events', True)
        _set_if_none('cuda_activities', True)
        _set_if_none('gpu_hw', False)
    else:
        raise NotImplementedError()

    # if args.fuzz_cuda_api and args.cuda_api_calls:
    #     parser.error("Can only run rls-prof with --fuzz-cuda-api or --cuda-api-calls, not both")

    if args.debug or args.rlscope_debug or is_env_true('RLSCOPE_DEBUG'):
        logger.info(
            "Detected debug mode; enabling C++ logging statements (export RLSCOPE_CPP_MIN_VLOG_LEVEL=1)"
        )
        # int value is fine: every env value is str()-converted before execve.
        add_env['RLSCOPE_CPP_MIN_VLOG_LEVEL'] = 1

    # if args.rlscope_disable:
    #     add_env['RLSCOPE_DISABLE'] = 'yes'

    # Translate a boolean args attribute into a 'yes'/'no' env var.
    def set_yes_no(attr, env_var):
        if getattr(args, attr):
            add_env[env_var] = 'yes'
        else:
            add_env[env_var] = 'no'

    set_yes_no('cuda_api_calls', 'RLSCOPE_CUDA_API_CALLS')

    set_yes_no('cuda_activities', 'RLSCOPE_CUDA_ACTIVITIES')

    set_yes_no('gpu_hw', 'RLSCOPE_GPU_HW')

    set_yes_no('pc_sampling', 'RLSCOPE_PC_SAMPLING')

    # set_yes_no('fuzz_cuda_api', 'RLSCOPE_FUZZ_CUDA_API')

    set_yes_no('cuda_api_events', 'RLSCOPE_CUDA_API_EVENTS')

    # NOTE(review): duplicate of the set_yes_no('gpu_hw', 'RLSCOPE_GPU_HW')
    # call above — harmless (idempotent) but redundant.
    set_yes_no('gpu_hw', 'RLSCOPE_GPU_HW')

    set_yes_no('trace_at_start', 'RLSCOPE_TRACE_AT_START')

    # set_yes_no('stream_sampling', 'RLSCOPE_STREAM_SAMPLING')

    if len(cmd_argv) == 0:
        parser.print_usage()
        logger.error(
            "You must provide a command to execute after \"rls-prof\"")
        sys.exit(1)

    exe_path = shutil.which(cmd_argv[0])
    if exe_path is None:
        print(
            "RL-Scope ERROR: couldn't locate {exe} on $PATH; try giving a full path to {exe} perhaps?"
            .format(exe=cmd_argv[0], ))
        sys.exit(1)
    # cmd = argv
    cmd = [exe_path] + cmd_argv[1:]
    print_cmd(cmd, env=add_env)

    # os.execve requires every env value to be a string.
    env.update(add_env)
    for k in list(env.keys()):
        env[k] = str(env[k])

    # Flush stdio before the process image is replaced.
    sys.stdout.flush()
    sys.stderr.flush()
    os.execve(exe_path, cmd, env)
    # os.execve shouldn't return.
    assert False
Пример #7
0
def main():
    """Entry point for the ``rls-util-sampler`` CPU/GPU utilization sampler.

    Either attaches to an existing process (--rlscope-root-pid), or launches
    the given command and samples utilization until it exits, returning the
    sampler's exit status; --kill instead terminates any other running
    rls-util-sampler processes.
    """

    try:
        check_host.check_config()
    except RLScopeConfigurationError as e:
        logger.error(e)
        sys.exit(1)

    rlscope_util_argv, cmd_argv = split_argv_on(sys.argv[1:])
    parser = get_util_sampler_parser(add_rlscope_root_pid=len(cmd_argv) == 0)
    args = parser.parse_args(rlscope_util_argv)

    # To make it easy to launch utilization sampler manually in certain code bases,
    # allow ignoring all the --rlscope-* arguments:
    #
    # e.g. in minigo's loop_main.sh shell script we do
    #   python3 -m scripts.utilization_sampler "$@" --rlscope-directory $BASE_DIR &
    # where $@ contains all the --rlscope-* args.
    # NOTE(review): this re-parses the full sys.argv and OVERWRITES the `args`
    # parsed above from rlscope_util_argv — presumably deliberate per the
    # comment above; confirm before relying on rlscope_util_argv-only parsing.
    args, extra_argv = parser.parse_known_args()
    # args = parser.parse_args()

    # NOTE: During profiling, we depend on this being called from the root training script.
    if not args.skip_smi_check:
        nvidia_gpu_query.check_nvidia_smi()

    # --kill: find and terminate every OTHER rls-util-sampler process.
    if args.kill:
        for proc in psutil.process_iter():
            # if proc.name() == sys.argv[0]:
            # pinfo = proc.as_dict(attrs=['pid', 'name', 'username'])
            pinfo = proc.as_dict(attrs=['pid', 'username', 'cmdline'])
            pprint.pprint({'pinfo': pinfo})
            # cmdline = proc.cmdline()
            try:
                logger.info(pinfo['cmdline'])
                # Match on the command line, excluding this process itself.
                if re.search(r'rls-util-sampler', ' '.join(
                        pinfo['cmdline'])) and pinfo['pid'] != os.getpid():
                    logger.info(
                        "> Kill rls-util-sampler: {proc}".format(proc=proc))
                    proc.kill()
            except psutil.NoSuchProcess:
                # Process exited between iteration and kill; ignore.
                pass
        sys.exit(0)

    if args.rlscope_directory is None:
        logger.info(
            "--rlscope-directory is required: directory where trace-files are saved"
        )
        parser.print_help()
        sys.exit(1)

    os.makedirs(args.rlscope_directory, exist_ok=True)

    # if args.measure_samples_per_sec:
    #     measure_samples_per_sec()
    #     return

    if args.rlscope_util_sample_frequency_sec < MIN_UTIL_SAMPLE_FREQUENCY_SEC:
        parser.error(
            "Need --rlscope-util-sample-frequency-sec={val} to be larger than minimum sample frequency ({min} sec)"
            .format(
                val=args.rlscope_util_sample_frequency_sec,
                min=MIN_UTIL_SAMPLE_FREQUENCY_SEC,
            ))

    # Determine the root PID to sample: either the command we launch here,
    # or an already-running process supplied via --rlscope-root-pid.
    rlscope_root_pid = None
    cmd_proc = None
    if len(cmd_argv) != 0:
        exe_path = shutil.which(cmd_argv[0])
        if exe_path is None:
            print(
                "RL-Scope ERROR: couldn't locate {exe} on $PATH; try giving a full path to {exe} perhaps?"
                .format(exe=cmd_argv[0], ))
            sys.exit(1)
        cmd = [exe_path] + cmd_argv[1:]
        print_cmd(cmd)

        sys.stdout.flush()
        sys.stderr.flush()
        cmd_proc = subprocess.Popen(cmd)
        rlscope_root_pid = cmd_proc.pid
    else:
        rlscope_root_pid = args.rlscope_root_pid

    # NOTE: usually, we have rls-prof program signal us to terminate.
    # However if they provide a cmd, we would like to terminate sampler when cmd finishes, and return cmd's exit status.
    util_sampler = UtilizationSampler(
        directory=args.rlscope_directory,
        pid=rlscope_root_pid,
        async_process=cmd_proc,
        util_dump_frequency_sec=args.rlscope_util_dump_frequency_sec,
        util_sample_frequency_sec=args.rlscope_util_sample_frequency_sec,
        debug=args.rlscope_debug,
        debug_single_thread=args.rlscope_debug_single_thread,
    )
    util_sampler.run()
    sys.exit(util_sampler.exit_status)
Пример #8
0
def tee(cmd, to_file,
        cwd=None,
        append=False,
        makedirs=True,
        check=True,
        dry_run=False,
        tee_output=True,
        tee_cmd=None,
        tee_prefix=None,
        only_show_env=None,
        **kwargs):
    """
    Run shell *cmd*, outputting its results to *to_file*, and echo-ing the output to ``sys.stdout``.

    Arguments
    ---------
    cmd: List[str]
        Shell command.
    to_file: str
        Save *cmd* output to this file.
    cwd: str
        Directory to run command from.
    append: bool
        Append to *to_file*.
    makedirs: bool
        Create parent directories of *to_file* if needed.
    check: bool
        Raise subprocess.CalledProcessError on a non-zero exit code.
    dry_run: bool
        Only (optionally) print the command; don't run it. Returns None.
    tee_output: bool
        Echo each output line to sys.stdout (prefixed with *tee_prefix*).
    tee_cmd: bool
        Print the command before running it; defaults to *tee_output*.
    tee_prefix: str
        Prefix written before each echoed line.
    only_show_env: Set[str]
        Restrict which environment variables print_cmd displays.

    Returns
    -------
    subprocess.Popen
        The finished process object (None when *dry_run* is True).
    """

    # In case there are int's or float's in cmd.
    cmd = [str(opt) for opt in cmd]

    if tee_cmd is None:
        tee_cmd = tee_output

    if dry_run:
        if tee_cmd:
            with in_directory(cwd):
                print_cmd(cmd, files=[sys.stdout], env=kwargs.get('env', None), dry_run=dry_run, only_show_env=only_show_env)
        return

    with ScopedLogFile(to_file, append=append, makedirs=makedirs) as f:
        with in_directory(cwd):
            if tee_cmd:
                files = [sys.stdout, f]
            else:
                files = [f]
            print_cmd(cmd, files=files, env=kwargs.get('env', None), only_show_env=only_show_env)

            # NOTE: Regarding the bug mentioned below, using p.communicate() STILL hangs.
            #
            # p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.PIPE, **kwargs)
            # with p:
            #     outs, errs = p.communicate()
            #     sys.stdout.write(outs)
            #     sys.stdout.write(errs)
            #     sys.stdout.flush()
            #
            #     f.write(outs)
            #     f.write(errs)
            #     f.flush()

            debug = False

            # Merge stderr into stdout so one readline() loop sees everything.
            p = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT, **kwargs)
            with p:
                # NOTE: if this blocks it may be because there's a zombie utilization_sampler.py still running
                # (that was created by the training script) that hasn't been terminated!
                # for line in p.stdout:

                while True:

                    if debug:
                        logger.info("RUN [05]: p.poll()")
                    rc = p.poll()
                    if rc is not None:
                        break

                    # BUG: SOMETIMES (I don't know WHY), this line will HANG even after
                    # train_stable_baselines.sh is terminated (it shows up as a Zombie process in htop/top).
                    # Sadly, this only happens occasionally, and I have yet to understand WHY it happens.
                    if debug:
                        logger.info("RUN [06]: line = p.stdout.readline()")
                    line = p.stdout.readline()

                    # b'\n'-separated lines
                    if debug:
                        logger.info("RUN [07]: line.decode")
                    line = line.decode("utf-8")

                    if debug:
                        logger.info("RUN [08]: line (\"{line}\") == '' (result={result})".format(
                            result=(line == ''),
                            line=line))
                    # Empty string (not "\n") means EOF on the pipe.
                    if line == '':
                        break

                    if re.search(r'> train\.py has exited', line):
                        pass
                        # logger.info("> ENABLE TEE DEBUGGING")
                        # debug = True

                    if debug:
                        logger.info("RUN [01]: sys.stdout.write(line)")
                    if tee_output:
                        if tee_prefix is not None:
                            sys.stdout.write(tee_prefix)
                        sys.stdout.write(line)
                    if debug:
                        logger.info("RUN [02]: sys.stdout.flush()")
                    if tee_output:
                        sys.stdout.flush()

                    if debug:
                        logger.info("RUN [03]: f.write(line)")
                    f.write(line)
                    if debug:
                        logger.info("RUN [04]: f.flush()")
                    f.flush()
            if tee_output:
                sys.stdout.flush()
            f.flush()

            if check and p.returncode != 0:
                raise subprocess.CalledProcessError(p.returncode, p.args)
            return p