def check_nvidia_smi(exit_if_fail=False, debug=False):
    """
    Make sure nvidia-smi runs fast enough to perform GPU utilization sampling.
    :return:
    """
    start_t = time.time()
    # $ nvidia-smi
    smi_output = run_nvidia_smi(debug=debug)
    end_t = time.time()
    nvidia_smi_sec = end_t - start_t
    if nvidia_smi_sec > MAX_NVIDIA_SMI_TIME_SEC:
        # $ sudo service nvidia-persistenced start
        errmsg = textwrap.dedent("""
        RL-Scope WARNING: nvidia-smi takes a long time to run on your system.
        In particular, it took {sec} sec to run nvidia-smi (we would prefer < {limit_sec}).
        This will interfere with sampling GPU utilization.
        You can fix this by running the following command:

          # Start systemd nvidia-persistenced service (if it's not already running).
          $ sudo nvidia-persistenced --persistence-mode

        For more details see:
        https://devtalk.nvidia.com/default/topic/1011192/nvidia-smi-is-slow-on-ubuntu-16-04-/
        """).format(
            sec=nvidia_smi_sec,
            limit_sec=MAX_NVIDIA_SMI_TIME_SEC,
        )
        if exit_if_fail:
            logger.error(errmsg)
            sys.exit(1)
        else:
            logger.warning(errmsg)
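# run_nvidia_smi() and MAX_NVIDIA_SMI_TIME_SEC are defined elsewhere in this module.
# The sketch below only illustrates what such a helper is assumed to do (shell out to
# nvidia-smi once and return its stdout); the "_example" name and the subprocess-based
# approach are assumptions, not the project's actual implementation.
def run_nvidia_smi_example(debug=False):
    import subprocess
    if debug:
        print("$ nvidia-smi")
    # Run nvidia-smi once; raises CalledProcessError if it exits non-zero.
    proc = subprocess.run(['nvidia-smi'], capture_output=True, text=True, check=True)
    return proc.stdout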
def run_py(self):
    # TODO: run pytest with appropriate cmdline options.
    # Q: record output?
    args = self.args
    with with_chdir(py_config.INSTALL_ROOT):
        # 'python'
        cmd = [sys.executable]
        if args.Werror:
            cmd.append('-Werror')
        # '-Wignore:::_pytest.assertion.rewrite' suppresses deprecation warnings
        # in pytest (up to at least version 6.1.1).
        #
        # https://github.com/pytest-dev/pytest/issues/1403#issuecomment-443533232
        cmd.extend(['-Wignore:::_pytest.assertion.rewrite', '-m', 'pytest'])
        if args.debug:
            cmd.extend(['--pdb', '-s'])
        print_cmd(cmd)
        proc = subprocess.run(cmd)
        if proc.returncode != 0:
            logger.error("RL-Scope python unit tests failed")
            sys.exit(proc.returncode)
        logger.info("RL-Scope python unit tests PASSED")
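# with_chdir(...) above is assumed to be a small context manager that temporarily
# switches the working directory (so pytest discovers tests from INSTALL_ROOT).
# A minimal sketch of such a helper, under that assumption:
import contextlib
import os

@contextlib.contextmanager
def with_chdir_example(directory):
    """Temporarily chdir into `directory`, restoring the previous cwd on exit."""
    old_cwd = os.getcwd()
    try:
        os.chdir(directory)
        yield
    finally:
        os.chdir(old_cwd)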
def _gpu_worker(self, gpu, *args, **kwargs):
    try:
        self.gpu_worker(gpu, *args, **kwargs)
    except KeyboardInterrupt:
        logger.debug(f"GPU[{gpu}] worker saw Ctrl-C; exiting early")
        return
    except Exception as e:
        logger.error(textwrap.dedent("""\
        BUG: GPU[{gpu}] worker failed with unhandled exception:
        {error}
        """).format(
            gpu=gpu,
            error=textwrap.indent(traceback.format_exc(), prefix='  '),
        ).rstrip())
        sys.exit(1)
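# _gpu_worker() is written to run inside a child process (note the sys.exit(1) on an
# unhandled exception, which mode_run_sh() later observes via worker.exitcode).
# A hedged sketch of how such workers might be spawned, one per GPU, using
# multiprocessing; the function name and signature here are illustrative only:
import multiprocessing

def start_gpu_workers_example(gpu_worker_fn, gpus):
    """Spawn one worker process per GPU and return them keyed by GPU id."""
    gpu_workers = dict()
    for gpu in gpus:
        worker = multiprocessing.Process(target=gpu_worker_fn, args=(gpu,))
        worker.start()
        gpu_workers[gpu] = worker
    return gpu_workers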
def run_cpp(self):
    args = self.args
    if shutil.which(py_config.CPP_UNIT_TEST_CMD) is None:
        logger.error("Didn't find C++ test binary ({bin}) on PATH; have you run build_rlscope yet?".format(
            bin=py_config.CPP_UNIT_TEST_CMD,
        ))
        sys.exit(1)
    cmd = [py_config.CPP_UNIT_TEST_CMD]
    if args.debug:
        cmd = ['gdb', '--args'] + cmd
    print_cmd(cmd)
    proc = subprocess.run(cmd)
    if proc.returncode != 0:
        logger.error("RL-Scope C++ unit tests failed")
        sys.exit(proc.returncode)
    logger.info("RL-Scope C++ unit tests PASSED")
def execve_rlscope_binary(binary):
    exe_path = _j(CPP_BIN, binary)
    if not os.path.exists(exe_path):
        logger.error("Couldn't find {bin} binary @ {path}".format(
            bin=binary,
            path=exe_path,
        ))
        sys.exit(1)
    cmd = [exe_path] + sys.argv[1:]
    if DEBUG:
        print_cmd(cmd)
    env = dict(os.environ)
    sys.stdout.flush()
    sys.stderr.flush()
    os.execve(exe_path, cmd, env)
    # Shouldn't return from os.execve
    assert False
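# execve_rlscope_binary() is intended to back thin console-script wrappers: os.execve
# replaces the Python process with the C++ binary, so signals and the exit code pass
# through unchanged. A hypothetical wrapper (the "rls-analyze" binary name is only an
# example, not necessarily a real entry point in this project):
def rls_analyze_main_example():
    execve_rlscope_binary('rls-analyze')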
def main():
    parser = argparse.ArgumentParser(
        description=textwrap.dedent(__doc__.lstrip().rstrip()),
        formatter_class=argparse.RawTextHelpFormatter)
    # TODO: add --pdb to break on failed python tests, and gdb on failed C++ tests.
    parser.add_argument("--debug",
                        action='store_true',
                        help=textwrap.dedent("""\
                        Debug unit tests.
                        """))
    parser.add_argument("--Werror",
                        action='store_true',
                        help=textwrap.dedent("""\
                        Treat warnings as errors (pytest)
                        """))
    parser.add_argument("--tests",
                        choices=['py', 'cpp', 'all'],
                        default='all',
                        help=textwrap.dedent("""\
                        Which unit tests to run:
                          py:
                            Just python unit tests.
                          cpp:
                            Just C++ unit tests (rls-test).
                          all:
                            Both python and C++ unit tests.
                        """))

    try:
        import pytest
    except ModuleNotFoundError as e:
        logger.error(textwrap.dedent("""
        To run rls-unit-tests, you must install pytest:
          $ pip install "pytest >= 4.4.1"
        """).rstrip())
        sys.exit(1)
        # raise

    args = parser.parse_args()
    unit_tests = RLSUnitTests(args)
    unit_tests.run()
def main():
    try:
        check_host.check_config()
    except RLScopeConfigurationError as e:
        logger.error(e)
        sys.exit(1)

    parser = argparse.ArgumentParser(
        description=textwrap.dedent(__doc__.lstrip().rstrip()),
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument("--run",
                        action='store_true',
                        help=textwrap.dedent("""\
                        Run the command as-is.
                        """))
    parser.add_argument("--append",
                        action='store_true',
                        help=textwrap.dedent("""\
                        Append the command to --sh
                        """))
    parser.add_argument("--sh",
                        help=textwrap.dedent("""\
                        Shell file to append commands to (see --append).
                        """))
    parser.add_argument('--run-sh',
                        action='store_true',
                        help=textwrap.dedent("""\
                        Run all the commands in --sh on the available --gpus
                        """))
    parser.add_argument('--rlscope-directory',
                        help=textwrap.dedent("""\
                        The output directory of the command being run.
                        This is where logfile.out will be output.
                        """))
    parser.add_argument("--verbosity",
                        choices=['progress', 'commands', 'output'],
                        default='progress',
                        help=textwrap.dedent("""\
                        Output information about running commands.
                        --verbosity progress (Default)
                          Only show high-level progress bar information.
                        --verbosity commands
                          Show the command-line of commands that are being run.
                        --verbosity output
                          Show the output of each analysis (not configuration) command on sys.stdout.
                          NOTE: This may cause interleaving of lines.
                        """))
    parser.add_argument('--line-numbers',
                        action='store_true',
                        help=textwrap.dedent("""\
                        Show line numbers and timestamps in RL-Scope logging messages.
                        """))
    parser.add_argument('--debug',
                        action='store_true',
                        help=textwrap.dedent("""\
                        Debug
                        """))
    parser.add_argument('--skip-final-error-message',
                        action='store_true',
                        help=textwrap.dedent("""\
                        Skip error message printed at the end if at least one command fails.
                        """))
    parser.add_argument("--retry",
                        type=int,
                        help=textwrap.dedent("""\
                        If a command fails, retry it up to --retry times.
                        Default: don't retry.
                        """))
    parser.add_argument("--tee",
                        action='store_true',
                        help=textwrap.dedent("""\
                        (debug) tee output of parallel processes to stdout (prefix output with worker name)
                        """))
    parser.add_argument("--pdb",
                        action='store_true',
                        help=textwrap.dedent("""\
                        Debug
                        """))
    parser.add_argument('--dry-run',
                        action='store_true',
                        help=textwrap.dedent("""\
                        Dry run
                        """))
    parser.add_argument('--skip-errors',
                        action='store_true',
                        help=textwrap.dedent("""\
                        If a command fails, ignore the failure and continue running other commands.
                        """))
    parser.add_argument("--gpus",
                        default='all',
                        help=textwrap.dedent("""\
                        # Run on the first GPU only
                        --gpus 0
                        # Run on the first 2 GPUs
                        --gpus 0,1
                        # Run on all available GPUs
                        --gpus all
                        # Don't allow running with any GPUs (CUDA_VISIBLE_DEVICES="")
                        --gpus none
                        """))

    all_args, _ = parser.parse_known_args(sys.argv)
    ignore_opts = set()
    if all_args.sh is not None:
        ignore_opts.add(all_args.sh)
    run_expr_argv, cmd = gather_argv(
        sys.argv[1:],
        ignore_opts=ignore_opts)
    args = parser.parse_args(run_expr_argv)

    if args.debug:
        logger.debug({
            'run_expr_argv': run_expr_argv,
            'cmd': cmd,
        })

    rlscope_logging.setup_logger(
        debug=args.debug,
        line_numbers=args.debug or args.line_numbers or py_config.is_development_mode(),
    )

    if args.sh is None and (args.run_sh or args.append):
        error("--sh is required when either --run-sh or --append are given", parser=parser)

    if args.run_sh and (args.append or args.run):
        error("When --run-sh is given, you cannot provide either --append or --run", parser=parser)

    available_gpus = get_available_gpus()
    if args.gpus == 'all':
        gpus = sorted([gpu['device_number'] for gpu in available_gpus])
    elif args.gpus.lower() == 'none':
        gpus = [None]
    else:
        try:
            gpus = sorted([int(gpu) for gpu in re.split(r',', args.gpus)])
        except ValueError:
            error("Failed to parse --gpus={gpus}".format(gpus=args.gpus), parser=parser)
    assert len(gpus) >= 1

    if args.run or args.append:
        if len(cmd) == 0:
            error("Expected cmd to run in arguments, but none was provided", parser=parser)

        if shutil.which(cmd[0]) is None:
            error("Couldn't find {exec} on PATH".format(exec=cmd[0]), parser=parser)

        if all_args.rlscope_directory is None:
            # No --rlscope-directory argument; just use current directory?
            args.rlscope_directory = os.getcwd()
        else:
            args.rlscope_directory = all_args.rlscope_directory
            # # error("\n  {cmd}".format(cmd=' '.join(cmd)))
            # error(textwrap.dedent("""\
            # --rlscope-directory must be provided so we know where to output logfile.out for cmd:
            #   > CMD:
            #     $ {cmd}
            # """).format(
            #     cmd=' '.join(cmd),
            # ).rstrip())
            # # "Copy" --rlscope-directory argument from cmd.
            # args.rlscope_directory = all_args.rlscope_directory

    args_dict = dict(vars(args))
    args_dict.pop('gpus')
    args_dict.pop('pdb')
    obj = RunExpr(
        cmd=cmd,
        gpus=gpus,
        **args_dict,
    )

    def _run():
        obj.run_program()

    run_with_pdb(args, _run)
def mode_run_sh(self):
    # Fill queue with commands to run.
    run_commands = self.run_commands()
    for run_cmd in run_commands:
        logger.debug(f"Put: {run_cmd}")
        self.cmd_queue.put(run_cmd)

    self.start_gpu_workers()

    bar = None
    if self.should_show_progress:
        bar = progressbar.ProgressBar(max_value=len(run_commands))
    last_completed = None

    # Wait for workers to terminate
    try:
        while True:
            if self.should_show_progress:
                completed = len(run_commands) - self.cmd_queue.qsize()
                if last_completed is None or completed > last_completed:
                    bar.update(completed)
                last_completed = completed

            if self.worker_failed.is_set():
                self.stop_workers()
                if not self.skip_final_error_message:
                    # Use --skip-errors to ignore failed commands.
                    logger.error("At least one command failed with non-zero exit status")
                if self.should_show_progress:
                    bar.finish()
                sys.exit(1)

            alive_workers = 0
            failed_workers = 0
            for gpu, worker in self.gpu_workers.items():
                if worker.is_alive():
                    alive_workers += 1
                    continue
                if worker.exitcode < 0:
                    logger.error("GPU[{gpu}] worker failed with exitcode={ret} (unhandled exception)".format(
                        gpu=gpu,
                        ret=worker.exitcode,
                    ))
                    self.worker_failed.set()
                    failed_workers += 1

            if failed_workers > 0:
                self.stop_workers()
                if self.should_show_progress:
                    bar.finish()
                sys.exit(1)

            if alive_workers == 0:
                if self.cmd_queue.qsize() > 0:
                    logger.warning("GPU workers have finished with {len} remaining commands unfinished".format(
                        len=self.cmd_queue.qsize(),
                    ))
                    sys.exit(1)
                logger.debug("GPU workers have finished successfully")
                if self.should_show_progress:
                    bar.finish()
                sys.exit(0)

            time.sleep(2)
    except KeyboardInterrupt:
        logger.info("Saw Ctrl-C; waiting for workers to terminate")
        self.stop_workers()
        logger.warning("{len} remaining commands went unprocessed".format(len=self.cmd_queue.qsize()))
        if self.should_show_progress:
            bar.finish()
        sys.exit(1)
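# stop_workers() above is assumed to tear down the per-GPU worker processes once a
# failure or Ctrl-C is observed. A hedged sketch of what that might look like for
# multiprocessing.Process workers (names and timeout are illustrative only):
def stop_workers_example(gpu_workers, timeout_sec=10):
    """Terminate any live workers, then wait for them to exit."""
    for worker in gpu_workers.values():
        if worker.is_alive():
            worker.terminate()
    for worker in gpu_workers.values():
        worker.join(timeout=timeout_sec)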
def main():
    try:
        check_host.check_config()
    except RLScopeConfigurationError as e:
        logger.error(e)
        sys.exit(1)

    parser = argparse.ArgumentParser(
        description=textwrap.dedent(__doc__.lstrip().rstrip()),
        formatter_class=argparse.RawTextHelpFormatter)
    parser.add_argument('--rlscope-directory',
                        required=True,
                        help=textwrap.dedent("""\
                        Look for *.venn_js.json rooted at this directory.
                        The output file will be <directory>/rlscope_plot_index_data.py.
                        All the venn_js_path's in the index will be relative to --directory.
                        """))
    parser.add_argument('--out-dir',
                        help=textwrap.dedent("""\
                        The output file will be <out-dir>/rlscope_plot_index_data.py.
                        Default: --directory
                        """))
    parser.add_argument('--debug',
                        action='store_true',
                        help=textwrap.dedent("""\
                        Debug
                        """))
    parser.add_argument('--dry-run',
                        action='store_true',
                        help=textwrap.dedent("""\
                        Don't write file.
                        """))
    parser.add_argument('--basename',
                        default='rlscope_plot_index_data.py',
                        help=textwrap.dedent("""\
                        Name of python file to generate.
                        """))
    parser.add_argument('--replace',
                        action='store_true',
                        help=textwrap.dedent("""\
                        Replace if exists.
                        """))
    parser.add_argument('--pdb',
                        action='store_true',
                        help=textwrap.dedent("""\
                        Python debugger on unhandled exception.
                        """))
    args = parser.parse_args()

    if args.out_dir is None:
        args.out_dir = args.rlscope_directory

    try:
        obj = GeneratePlotIndex(
            directory=args.rlscope_directory,
            out_dir=args.out_dir,
            basename=args.basename,
            debug=args.debug,
            replace=args.replace,
            dry_run=args.dry_run,
        )
        obj.run()
    except Exception as e:
        if not args.pdb:
            raise
        print("> RL-Scope: Detected exception:")
        print(e)
        print("> Entering pdb:")
        import pdb
        pdb.post_mortem()
        raise
def main():
    try:
        check_host.check_config()
    except RLScopeConfigurationError as e:
        logger.error(e)
        sys.exit(1)

    rlscope_prof_argv, cmd_argv = gather_argv(sys.argv[1:])

    parser = argparse.ArgumentParser(
        description="RL-Scope cross-stack profiler for reinforcement learning workloads.",
        formatter_class=argparse.RawTextHelpFormatter)
    # NOTE: these arguments must precede the executable (python some/script.py), otherwise they will be sent
    # to the training script, and not handled by this script (rls-prof).
    parser.add_argument('--debug', action='store_true')
    parser.add_argument("--verbosity",
                        choices=['progress', 'commands', 'output'],
                        default='progress',
                        help=textwrap.dedent("""\
                        Output information about running commands.
                        --verbosity progress (Default)
                          Only show high-level progress bar information.
                        --verbosity commands
                          Show the command-line of commands that are being run.
                        --verbosity output
                          Show the output of each analysis (not configuration) command on sys.stdout.
                          NOTE: This may cause interleaving of lines.
                        """))
    parser.add_argument('--line-numbers', action='store_true',
                        help=textwrap.dedent("""\
                        Show line numbers and timestamps in RL-Scope logging messages.
                        """))
    parser.add_argument('--rlscope-debug', action='store_true')
    parser.add_argument('--rlscope-rm-traces-from',
                        help=textwrap.dedent("""\
                        Delete traces rooted at this --rlscope-directory.
                        Useful if your training script has multiple training scripts, and you need to use
                        --rlscope-skip-rm-traces when launching the other scripts.
                        """))
    # parser.add_argument('--rlscope-disable', action='store_true',
    #                     help=textwrap.dedent("""\
    #                     RL-Scope: Skip any profiling. Used for uninstrumented runs.
    #                     Useful for ensuring minimal libcupti registration when we run --cuda-api-calls during config_uninstrumented.
    #
    #                     Effect: sets "export RLSCOPE_DISABLE=1" for librlscope.so.
    #                     """))
    add_bool_arg(parser, '--cuda-api-calls',
                 help=textwrap.dedent("""\
                 Trace CUDA API runtime/driver calls.
                 i.e. total number of calls, and total time (usec) spent in a given API call.

                 Effect: sets "export RLSCOPE_CUDA_API_CALLS=1" for librlscope.so.
                 """))
    add_bool_arg(parser, '--cuda-activities',
                 help=textwrap.dedent("""\
                 Trace CUDA activities (i.e. GPU kernel runtimes, memcpy's).

                 Effect: sets "export RLSCOPE_CUDA_ACTIVITIES=yes" for librlscope.so.
                 """))
    add_bool_arg(parser, '--cuda-api-events',
                 help=textwrap.dedent("""\
                 Trace all the start/end timestamps of CUDA API calls.
                 Needed during instrumented runs so we know when to subtract profiling overheads.

                 Effect: sets "export RLSCOPE_CUDA_API_EVENTS=yes" for librlscope.so.
                 """))
    add_bool_arg(parser, '--gpu-hw',
                 help=textwrap.dedent("""\
                 Collect GPU hardware counters.

                 Effect: sets "export RLSCOPE_GPU_HW=yes" for librlscope.so.
                 """))
    # parser.add_argument('--fuzz-cuda-api', action='store_true',
    #                     help=textwrap.dedent("""\
    #                     Use libcupti to trace ALL CUDA runtime API calls (# of calls, and total time spent in them).
    #                     This is useful for determining which CUDA API's we need to "calibrate subtractions" for.
    #                     NOTE: this SHOULDN'T be used for finding profiling book-keeping "subtractions", since it
    #                     adds a LOT of overhead to add start/end callbacks to all CUDA API functions.
    #
    #                     Effect: sets "export RLSCOPE_FUZZ_CUDA_API=yes" for librlscope.so.
    #                     """))
    parser.add_argument('--pc-sampling', action='store_true',
                        help=textwrap.dedent("""\
                        Perform sample-profiling using CUDA's "PC Sampling" API.

                        Currently, we're just going to record GPUSamplingState.is_gpu_active.

                        Effect: sets "export RLSCOPE_PC_SAMPLING=1" for librlscope.so.
                        """))
    parser.add_argument('--trace-at-start', action='store_true',
                        help=textwrap.dedent("""\
                        Start tracing right at application startup.

                        Effect: sets "export RLSCOPE_TRACE_AT_START=yes" for librlscope.so.
                        """))
    # parser.add_argument('--stream-sampling', action='store_true',
    #                     help=textwrap.dedent("""\
    #                     Poll cudaStreamQuery() to see if the GPU is being used.
    #
    #                     Effect: sets "export RLSCOPE_STREAM_SAMPLING=yes" for librlscope.so.
    #                     """))

    calibrate_help = textwrap.dedent("""\
    Perform multiple runs in order to calibrate for profiling overhead
    specific to the workload being run.
    """).rstrip()
    parser.add_argument("--calibrate",
                        dest='calibrate',
                        action='store_true',
                        default=True,
                        help=calibrate_help)
    parser.add_argument("--no-calibrate",
                        dest='calibrate',
                        action='store_false',
                        help=calibrate_help)
    parser.add_argument("--re-calibrate",
                        action='store_true',
                        help=textwrap.dedent("""\
                        Remove existing profiling overhead calibration files, and recompute them.
                        """))
    parser.add_argument("--re-plot",
                        action='store_true',
                        help=textwrap.dedent("""\
                        Remove existing plots and remake them (NOTE: doesn't recompute analysis; see --re-calibrate).
                        """))

    parallel_runs_help = textwrap.dedent("""\
    Parallelize running configurations across GPUs on this machine (assume no CPU interference).
    See --gpus.
    """)
    parser.add_argument("--parallel-runs",
                        dest='parallel_runs',
                        action='store_true',
                        default=True,
                        help=parallel_runs_help)
    parser.add_argument("--no-parallel-runs",
                        dest='parallel_runs',
                        action='store_false',
                        help=parallel_runs_help)

    parser.add_argument("--retry",
                        type=int,
                        help=textwrap.dedent("""\
                        If a command fails, retry it up to --retry times.
                        Default: don't retry.
                        """))
    parser.add_argument("--dry-run",
                        action='store_true',
                        help=textwrap.dedent("""\
                        Dry run
                        """))
    # parser.add_argument("--gpus",
    #                     action='store_true',
    #                     help=textwrap.dedent("""\
    #                     Parallelize running configurations across GPUs on this machine (assume no CPU interference). See --rlscope-gpus
    #                     """))
    parser.add_argument("--gpus",
                        help=textwrap.dedent("""\
                        # Run on the first GPU only
                        --gpus 0
                        # Run on the first 2 GPUs
                        --gpus 0,1
                        # Run on all available GPUs
                        --gpus all
                        # Don't allow running with any GPUs (CUDA_VISIBLE_DEVICES="")
                        --gpus none
                        """))
    parser.add_argument('--config',
                        choices=[
                            'interception',
                            'no-interception',
                            'gpu-activities',
                            'gpu-activities-api-time',
                            'no-gpu-activities',
                            'full',
                            'time-breakdown',
                            'gpu-hw',
                            'uninstrumented',
                        ],
                        # Detect if user provides --config or not.
                        # By default, run with full RL-Scope instrumentation.
                        # default=DEFAULT_CONFIG,
                        help=textwrap.dedent("""\
                        For measuring LD_PRELOAD CUDA API interception overhead:
                            interception:
                                Enable LD_PRELOAD CUDA API interception.
                                $ rls-prof --debug --cuda-api-calls --cuda-api-events --rlscope-disable
                            no-interception:
                                Disable LD_PRELOAD CUDA API interception.
                                $ rls-prof --debug --rlscope-disable

                        For measuring CUPTI GPU activity gathering overhead on a per CUDA API call basis.
                            gpu-activities:
                                Enable CUPTI GPU activity recording.
                                $ rls-prof --debug --cuda-api-calls --cuda-activities --rlscope-disable
                            no-gpu-activities:
                                Disable CUPTI GPU activity recording.
                                $ rls-prof --debug --cuda-api-calls --rlscope-disable

                        Expect (for the above configurations):
                        You should run train.py with these arguments set

                            # Since we are comparing total training time,
                            # run each configuration with the same number of training loop steps.
                            --rlscope-max-passes $N

                            # Disable any pyprof or old tfprof tracing code.
                            --rlscope-disable

                        For collecting full RL-Scope traces for using with rls-run / rlscope-drill:
                            full:
                                Enable all of tfprof and pyprof collection.
                                $ rls-prof --cuda-api-calls --cuda-api-events --cuda-activities --rlscope-disable
                                NOTE: we still use --rlscope-disable to prevent "old" tfprof collection.
                            gpu-hw:
                                ONLY collect GPU hardware counters
                        """))

    args = parser.parse_args(rlscope_prof_argv)

    is_debug = args.debug or args.rlscope_debug or is_env_true('RLSCOPE_DEBUG')
    rlscope_logging.setup_logger(
        debug=is_debug,
        line_numbers=is_debug or args.line_numbers or py_config.is_development_mode(),
    )

    if args.rlscope_rm_traces_from is not None:
        logger.info("rls-prof: Delete trace-files rooted at --rlscope-directory = {dir}".format(
            dir=args.rlscope_rm_traces_from))
        return

    rlscope_api.find_librlscope()
    so_path = rlscope_api.RLSCOPE_CLIB
    assert so_path is not None
    env = dict(os.environ)
    add_env = dict()
    add_env['LD_PRELOAD'] = "{ld}:{so_path}".format(
        ld=env.get('LD_PRELOAD', ''),
        so_path=so_path)
    # Q: I just want LD_LIBRARY_PATH to get printed...
    if 'LD_LIBRARY_PATH' in env:
        add_env['LD_LIBRARY_PATH'] = env['LD_LIBRARY_PATH']

    def _set_if_none(attr, value):
        if getattr(args, attr) is None:
            setattr(args, attr, value)

    def maybe_remove(xs, x):
        if x in xs:
            xs.remove(x)

    if args.calibrate:
        if args.config is not None:
            logger.error("Only --calibrate or --config should be provided for rls-prof.")
            parser.exit(1)
        # Run calibrate.py
        cmd = ['rls-calibrate', 'run']
        if args.gpu_hw:
            cmd.extend(['--gpu-hw'])
            maybe_remove(rlscope_prof_argv, '--gpu-hw')
        cmd.extend(['--verbosity', args.verbosity])
        if args.parallel_runs:
            cmd.extend(['--parallel-runs'])
            maybe_remove(rlscope_prof_argv, '--parallel-runs')
        else:
            cmd.extend(['--no-parallel-runs'])
            maybe_remove(rlscope_prof_argv, '--no-parallel-runs')
        if args.retry is not None:
            cmd.extend(['--retry', str(args.retry)])
        # Q: Can't we just pass this through?
        # if args.re_calibrate:
        #     cmd.extend(['--re-calibrate'])
        #     rlscope_prof_argv.remove('--re-calibrate')
        # if args.gpus is not None:
        #     cmd.extend(['--gpus', args.gpus])
        maybe_remove(rlscope_prof_argv, '--calibrate')
        cmd.extend(rlscope_prof_argv)
        cmd.extend(cmd_argv)
        # cmd.remove('--calibrate')
        print_cmd(cmd)
        try:
            proc = subprocess.run(cmd, check=False)
            sys.exit(proc.returncode)
        except KeyboardInterrupt:
            logger.info("Saw Ctrl-C during calibration; aborting remaining runs.")
            sys.exit(1)

    if args.config is None:
        args.config = DEFAULT_CONFIG

    add_env['RLSCOPE_CONFIG'] = args.config
    if args.config == 'interception':
        # rls-prof --debug --cuda-api-calls --cuda-api-events
        _set_if_none('cuda_api_calls', True)
        _set_if_none('cuda_api_events', True)
    elif args.config in ['no-interception', 'uninstrumented']:
        # rls-prof --debug
        pass
    elif args.config == 'gpu-hw':
        # $ rls-prof --debug --gpu-hw
        _set_if_none('cuda_api_calls', False)
        _set_if_none('cuda_api_events', False)
        _set_if_none('cuda_activities', False)
        _set_if_none('gpu_hw', True)
    elif args.config == 'no-gpu-activities':
        # $ rls-prof --debug --cuda-api-calls
        _set_if_none('cuda_api_calls', True)
        _set_if_none('gpu_hw', False)
    elif args.config == 'gpu-activities':
        # $ rls-prof --debug --cuda-api-calls --cuda-activities
        _set_if_none('cuda_api_calls', True)
        _set_if_none('cuda_activities', True)
        _set_if_none('gpu_hw', False)
    elif args.config == 'gpu-activities-api-time':
        # $ rls-prof --debug --cuda-api-calls --cuda-api-events --cuda-activities
        _set_if_none('cuda_api_calls', True)
        _set_if_none('cuda_api_events', True)
        _set_if_none('cuda_activities', True)
        _set_if_none('gpu_hw', False)
    elif args.config is None or args.config in {'full', 'time-breakdown'}:
        # $ rls-prof --cuda-api-calls --cuda-api-events --cuda-activities
        _set_if_none('cuda_api_calls', True)
        _set_if_none('cuda_api_events', True)
        _set_if_none('cuda_activities', True)
        _set_if_none('gpu_hw', False)
    else:
        raise NotImplementedError()

    # if args.fuzz_cuda_api and args.cuda_api_calls:
    #     parser.error("Can only run rls-prof with --fuzz-cuda-api or --cuda-api-calls, not both")

    if args.debug or args.rlscope_debug or is_env_true('RLSCOPE_DEBUG'):
        logger.info("Detected debug mode; enabling C++ logging statements (export RLSCOPE_CPP_MIN_VLOG_LEVEL=1)")
        add_env['RLSCOPE_CPP_MIN_VLOG_LEVEL'] = 1

    # if args.rlscope_disable:
    #     add_env['RLSCOPE_DISABLE'] = 'yes'

    def set_yes_no(attr, env_var):
        if getattr(args, attr):
            add_env[env_var] = 'yes'
        else:
            add_env[env_var] = 'no'

    set_yes_no('cuda_api_calls', 'RLSCOPE_CUDA_API_CALLS')
    set_yes_no('cuda_activities', 'RLSCOPE_CUDA_ACTIVITIES')
    set_yes_no('gpu_hw', 'RLSCOPE_GPU_HW')
    set_yes_no('pc_sampling', 'RLSCOPE_PC_SAMPLING')
    # set_yes_no('fuzz_cuda_api', 'RLSCOPE_FUZZ_CUDA_API')
    set_yes_no('cuda_api_events', 'RLSCOPE_CUDA_API_EVENTS')
    set_yes_no('trace_at_start', 'RLSCOPE_TRACE_AT_START')
    # set_yes_no('stream_sampling', 'RLSCOPE_STREAM_SAMPLING')

    if len(cmd_argv) == 0:
        parser.print_usage()
        logger.error("You must provide a command to execute after \"rls-prof\"")
        sys.exit(1)

    exe_path = shutil.which(cmd_argv[0])
    if exe_path is None:
        print("RL-Scope ERROR: couldn't locate {exe} on $PATH; try giving a full path to {exe} perhaps?".format(
            exe=cmd_argv[0]))
        sys.exit(1)
    # cmd = argv
    cmd = [exe_path] + cmd_argv[1:]
    print_cmd(cmd, env=add_env)

    env.update(add_env)
    for k in list(env.keys()):
        env[k] = str(env[k])

    sys.stdout.flush()
    sys.stderr.flush()
    os.execve(exe_path, cmd, env)
    # os.execve shouldn't return.
    assert False
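# add_bool_arg(...) above is assumed to register a --flag / --no-flag pair sharing a
# single destination, defaulting to None so that _set_if_none() can later fill in a
# per-configuration default chosen from --config. A minimal sketch under that
# assumption (the "_example" name is illustrative, not the project's helper):
def add_bool_arg_example(parser, opt, dest=None, default=None, help=None):
    """Register paired boolean flags, e.g. --cuda-api-calls / --no-cuda-api-calls."""
    if dest is None:
        dest = opt.lstrip('-').replace('-', '_')
    parser.add_argument(opt, dest=dest, action='store_true', default=default, help=help)
    parser.add_argument('--no-' + opt.lstrip('-'), dest=dest, action='store_false', help=help)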
def find_librlscope():
    global RLSCOPE_CLIB
    if RLSCOPE_CLIB is not None:
        return

    # Older versions of python (<=3.6) need 'LIBRARY_PATH' to be defined for find_library to work.
    # assert 'LIBRARY_PATH' not in ENV or ENV['LIBRARY_PATH'] == ENV['LD_LIBRARY_PATH']

    # First, try to find librlscope.so using our current LD_LIBRARY_PATH.
    #
    # NOTE: This will succeed in development mode (i.e., "python setup.py develop")
    # since we set LD_LIBRARY_PATH in source_me.sh.
    # In "pip install rlscope" distribution mode, this will fail, since
    # librlscope.so is packaged inside:
    #   rlscope/cpp/lib/librlscope.so
    ENV['LIBRARY_PATH'] = ENV.get('LD_LIBRARY_PATH', '')
    RLSCOPE_CLIB = ctypes.util.find_library(RLSCOPE_LIBNAME)

    if RLSCOPE_CLIB is None:
        orig_LD_LIBRARY_PATH = ENV.get('LD_LIBRARY_PATH', '')
        # Locations to search for librlscope.so
        # Currently, we just search for rlscope/cpp/lib/librlscope.so
        rlscope_lib_dirs = [py_config.CPP_LIB]
        for path in rlscope_lib_dirs:
            if not os.path.isdir(path):
                continue
            ENV['LD_LIBRARY_PATH'] = "{LD_LIBRARY_PATH}:{path}".format(
                path=path,
                LD_LIBRARY_PATH=ENV.get('LD_LIBRARY_PATH', ''),
            )
            ENV['LIBRARY_PATH'] = ENV['LD_LIBRARY_PATH']
            RLSCOPE_CLIB = ctypes.util.find_library(RLSCOPE_LIBNAME)
            if RLSCOPE_CLIB is not None:
                break
            ENV['LD_LIBRARY_PATH'] = orig_LD_LIBRARY_PATH
            ENV['LIBRARY_PATH'] = ENV['LD_LIBRARY_PATH']

    if RLSCOPE_CLIB is None:
        if py_config.is_development_mode():
            # RL-Scope has been installed using "python setup.py develop", and is being run from
            # a github repo checkout. Provide instructions on how to build librlscope.so from
            # scratch, and add it to the user's LD_LIBRARY_PATH so we can find it.
            logger.error(textwrap.dedent("""\
            RL-Scope ERROR: couldn't find RL-Scope library (lib{name}.so); to build it, do:
              $ cd {root}
              $ bash ./setup.sh

              # To modify your LD_LIBRARY_PATH to include lib{name}.so, run:
              $ source source_me.sh
            """.format(
                name=RLSCOPE_LIBNAME,
                root=py_config.ROOT,
            )).rstrip())
        else:
            # RL-Scope has been installed using "pip install rlscope".
            # librlscope.so SHOULD be bundled with the installed python package;
            # if it isn't then this is a BUG.
            logger.error(textwrap.dedent("""\
            RL-Scope ERROR: couldn't find RL-Scope library (lib{name}.so) inside {lib_dir}.
            This looks like a BUG in RL-Scope; please report it at:
              https://github.com/UofT-EcoSystem/rlscope/issues
            """.format(
                lib_dir=py_config.CPP_LIB,
                name=RLSCOPE_LIBNAME,
            )).rstrip())
        sys.exit(1)
def main():
    try:
        check_host.check_config()
    except RLScopeConfigurationError as e:
        logger.error(e)
        sys.exit(1)

    rlscope_util_argv, cmd_argv = split_argv_on(sys.argv[1:])
    parser = get_util_sampler_parser(add_rlscope_root_pid=len(cmd_argv) == 0)
    args = parser.parse_args(rlscope_util_argv)

    # To make it easy to launch utilization sampler manually in certain code bases,
    # allow ignoring all the --rlscope-* arguments:
    #
    # e.g. in minigo's loop_main.sh shell script we do
    #   python3 -m scripts.utilization_sampler "$@" --rlscope-directory $BASE_DIR &
    # where $@ contains all the --rlscope-* args.
    args, extra_argv = parser.parse_known_args()
    # args = parser.parse_args()

    # NOTE: During profiling, we depend on this being called from the root training script.
    if not args.skip_smi_check:
        nvidia_gpu_query.check_nvidia_smi()

    if args.kill:
        for proc in psutil.process_iter():
            # if proc.name() == sys.argv[0]:
            # pinfo = proc.as_dict(attrs=['pid', 'name', 'username'])
            pinfo = proc.as_dict(attrs=['pid', 'username', 'cmdline'])
            pprint.pprint({'pinfo': pinfo})
            # cmdline = proc.cmdline()
            try:
                logger.info(pinfo['cmdline'])
                if re.search(r'rls-util-sampler', ' '.join(pinfo['cmdline'])) and pinfo['pid'] != os.getpid():
                    logger.info("> Kill rls-util-sampler: {proc}".format(proc=proc))
                    proc.kill()
            except psutil.NoSuchProcess:
                pass
        sys.exit(0)

    if args.rlscope_directory is None:
        logger.info("--rlscope-directory is required: directory where trace-files are saved")
        parser.print_help()
        sys.exit(1)

    os.makedirs(args.rlscope_directory, exist_ok=True)

    # if args.measure_samples_per_sec:
    #     measure_samples_per_sec()
    #     return

    if args.rlscope_util_sample_frequency_sec < MIN_UTIL_SAMPLE_FREQUENCY_SEC:
        parser.error("Need --rlscope-util-sample-frequency-sec={val} to be larger than minimum sample frequency ({min} sec)".format(
            val=args.rlscope_util_sample_frequency_sec,
            min=MIN_UTIL_SAMPLE_FREQUENCY_SEC,
        ))

    rlscope_root_pid = None
    cmd_proc = None
    if len(cmd_argv) != 0:
        exe_path = shutil.which(cmd_argv[0])
        if exe_path is None:
            print("RL-Scope ERROR: couldn't locate {exe} on $PATH; try giving a full path to {exe} perhaps?".format(
                exe=cmd_argv[0]))
            sys.exit(1)
        cmd = [exe_path] + cmd_argv[1:]
        print_cmd(cmd)

        sys.stdout.flush()
        sys.stderr.flush()
        cmd_proc = subprocess.Popen(cmd)
        rlscope_root_pid = cmd_proc.pid
    else:
        rlscope_root_pid = args.rlscope_root_pid

    # NOTE: usually, we have the rls-prof program signal us to terminate.
    # However if they provide a cmd, we would like to terminate the sampler when cmd finishes,
    # and return cmd's exit status.
    util_sampler = UtilizationSampler(
        directory=args.rlscope_directory,
        pid=rlscope_root_pid,
        async_process=cmd_proc,
        util_dump_frequency_sec=args.rlscope_util_dump_frequency_sec,
        util_sample_frequency_sec=args.rlscope_util_sample_frequency_sec,
        debug=args.rlscope_debug,
        debug_single_thread=args.rlscope_debug_single_thread,
    )
    util_sampler.run()

    sys.exit(util_sampler.exit_status)
def expr_run_cmd(cmd, to_file,
                 cwd=None,
                 env=None,
                 replace=False,
                 dry_run=False,
                 skip_error=False,
                 tee_output=True,
                 tee_cmd=None,
                 tee_prefix=None,
                 # extra_argv=[],
                 only_show_env=None,
                 debug=False,
                 raise_exception=False,
                 exception_class=None,
                 log_errors=True,
                 log_func=None):
    """
    Run an experiment, if it hasn't been run already.

    We check whether an experiment has already been run by looking for a log file,
    and whether that logfile has a success-line in it (we search for "IML BENCH DONE").

    :param cmd:
    :param to_file:
    :param env:
    :param replace:
    :param debug:
    :return:
    """
    if log_func is None:
        log_func = logger.error

    if env is None:
        # Make sure rls-run gets RLSCOPE_POSTGRES_HOST
        env = dict(os.environ)

    proc = None
    failed = False
    if replace or not expr_already_ran(to_file, debug=debug):
        try:
            tee_kwargs = dict()
            if skip_error:
                tee_kwargs['check'] = False
            proc = tee(
                cmd=cmd,
                to_file=to_file,
                cwd=cwd,
                env=env,
                dry_run=dry_run,
                tee_output=tee_output,
                tee_cmd=tee_cmd,
                tee_prefix=tee_prefix,
                only_show_env=only_show_env,
                **tee_kwargs,
            )
            if not dry_run and skip_error and proc.returncode != 0:
                if log_errors:
                    log_func("Command failed; see {path}; continuing".format(
                        path=to_file,
                    ))
                failed = True
        except subprocess.CalledProcessError as e:
            err_msg = textwrap.dedent("""\
            Command failed: see {path} for command and output.
            """).format(
                path=to_file,
            ).rstrip()
            if log_errors:
                logger.error(err_msg)
            if raise_exception:
                if exception_class is None:
                    raise
                raise exception_class(err_msg)
            ret = 1
            if debug:
                logger.error("Exiting with ret={ret}\n{stack}".format(
                    ret=ret,
                    stack=get_stacktrace(),
                ))
            sys.exit(ret)

        if not failed:
            if not dry_run and proc.returncode != 0:
                logger.error("BUG: saw returncode = {ret}, expected 0".format(
                    ret=proc.returncode))
                assert proc.returncode == 0
            if not dry_run:
                with open(to_file, 'a') as f:
                    f.write("{success_line}\n".format(success_line=EXPERIMENT_SUCCESS_LINE))
            if not dry_run:
                assert expr_already_ran(to_file, debug=debug)

    return proc
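# A hedged usage sketch for expr_run_cmd(): run one experiment configuration with its
# output tee'd to logfile.out, skipping the run if that logfile already ends with the
# success line. The command and paths below are hypothetical:
def run_one_config_example():
    return expr_run_cmd(
        cmd=['rls-prof', 'python', 'train.py'],
        to_file='output/config_full/logfile.out',
        replace=False,     # re-use a previous successful run if its logfile exists
        skip_error=True,   # log failures and keep going instead of raising
    )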
def main():
    try:
        check_host.check_config()
    except RLScopeConfigurationError as e:
        logger.error(e)
        sys.exit(1)

    parser = argparse.ArgumentParser(
        description=textwrap.dedent("""\
        Process trace-files collected from running an ML script with the RL-Scope profiler.

        For task-specific help, provide the task name and --help, e.g.:
          $ rls-run --task OverlapStackedBarTask --help

        NOTE:
        - This script is a thin usage/debugging wrapper around a "luigi" DAG execution script.
          It just forwards arguments to it.
        - Any unparsed args are forwarded to the luigi script.
        """),
        formatter_class=argparse.RawTextHelpFormatter,
        add_help=False,
    )
    parser.add_argument('--pdb', action='store_true',
                        help="Break into pdb when an exception occurs")
    parser.add_argument('--task',
                        choices=[klass.__name__ for klass in tasks.RLSCOPE_TASKS],
                        help="Name of a runnable IMLTask defined in rlscope.parser.tasks")
    parser.add_argument('--workers',
                        type=int,
                        # DISABLE --workers for now to prevent opening too many postgres connections by accident;
                        # we parallelize internally instead
                        # e.g. ResourceOverlap with 32 worker threads, each of which opens a SQL connection.
                        # default=multiprocessing.cpu_count(),
                        default=1,
                        help="Maximum number of parallel tasks to run (luigi parameter)")
    parser.add_argument('--help', '-h',
                        action='store_true')
    args, luigi_argv = parser.parse_known_args(sys.argv)

    if args.help and not args.task:
        # Print available tasks.
        parser.print_help()
        sys.exit(0)

    if args.task is None and not args.help:
        # If they just run this:
        #   $ rls-run --rlscope-directory <dir>
        # Then run all the targets.
        args.task = 'All'

    extra_argv = [
        '--module', 'rlscope.parser.tasks',
        '--local-scheduler',
        # Default log-level from luigi is DEBUG which is too noisy.
        # Make the default level INFO instead.
        '--log-level', 'INFO',
    ]
    luigi_argv.extend(extra_argv)
    if args.task:
        # Task needs to be the first argument after rls-run.
        luigi_argv.insert(1, args.task)

    if args.help:
        luigi_argv.extend(['--help'])

    if args.workers > 1:
        logger.warning("Each overlap plot uses all the cores; forcing --workers=1")
        args.workers = 1

    if args.pdb:
        logger.debug("Registering pdb breakpoint (--pdb)")
        register_pdb_breakpoint()
        # Debugger is useless when multithreaded.
        args.workers = 1

    luigi_argv.extend(['--workers', str(args.workers)])

    # logger.debug("Luigi arguments:\n{msg}".format(msg=textwrap.indent(pprint.pformat({
    #     'luigi_argv': luigi_argv,
    #     'sys.argv': sys.argv,
    # }), prefix='  ')))

    with warnings.catch_warnings():
        # I don't really take much advantage of luigi's DFS scheduler and instead run things manually.
        # Oh well.
        warnings.filterwarnings('ignore', category=UserWarning,
                                message=r'.*without outputs has no custom complete',
                                module=r'luigi')
        warnings.filterwarnings('ignore', category=UserWarning,
                                message=r'Parameter.*with value "None" is not of type string',
                                module=r'luigi')
        tasks.main(argv=luigi_argv[1:], should_exit=False)
def pdb_breakpoint(task, ex):
    logger.error("> Detected unhandled exception {ex} in {task}; entering pdb".format(
        ex=ex.__class__.__name__,
        task=task.__class__.__name__,
    ))
    pdb.post_mortem()
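# register_pdb_breakpoint() (used by main() above) is assumed to wire unhandled task
# failures into pdb_breakpoint(); one way to do that is with luigi's event-handler
# API, though this sketch is an assumption rather than the project's implementation:
import luigi

def register_pdb_breakpoint_example():
    @luigi.Task.event_handler(luigi.Event.FAILURE)
    def _on_task_failure(task, exception):
        pdb_breakpoint(task, exception)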